1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MIPS version of rescaling functions
11 //
12 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
13 
14 #include "./dsp.h"
15 
16 #if defined(WEBP_USE_MIPS_DSP_R2)
17 
18 #include <assert.h>
19 #include "../utils/rescaler.h"
20 
// ROUNDER is half of WEBP_RESCALER_ONE. MULT_FIX(x, y) forms the 64-bit
// product of x and y, adds ROUNDER, and shifts right by WEBP_RESCALER_RFIX.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
23 
//------------------------------------------------------------------------------
// Row export
26 
// Row export for the non-expanding case (asserts !wrk->y_expand).
// Handles x_out_max = dst_width * num_channels samples, reading the
// accumulator rows wrk->irow / wrk->frow and writing one byte per sample to
// wrk->dst:
//   - yscale != 0: frac    = MULT_FIX(frow[x], yscale)
//                  dst[x]  = MULT_FIX(irow[x] - frac, fxy_scale)
//                  irow[x] = frac   (new fractional start)
//   - yscale == 0: dst[x]  = MULT_FIX(irow[x], fxy_scale)
//                  irow[x] = 0
// where yscale = fy_scale * (-y_accum).
// Samples are processed four at a time by the inline assembly (one DSP
// accumulator $ac0..$ac3 per sample); the remaining (x_out_max & 3) samples
// go through the scalar loops.
// NOTE(review): the assembly takes the upper accumulator word (mfhi) as the
// result, which matches MULT_FIX only if WEBP_RESCALER_RFIX is 32 -- confirm
// against rescaler.h.
static void ExportRowShrink(WebPRescaler* const wrk) {
  int i;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  uint8_t* dst = wrk->dst;
  rescaler_t* irow = wrk->irow;
  const rescaler_t* frow = wrk->frow;
  const int yscale = wrk->fy_scale * (-wrk->y_accum);
  int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
  const int temp7 = (int)wrk->fxy_scale;
  // Byte length of the part of the row handled by the assembly:
  // (number of samples rounded down to a multiple of 4) * 4 bytes per sample.
  const int temp6 = (x_out_max & ~0x3) << 2;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(!wrk->y_expand);
  assert(wrk->fxy_scale != 0);
  if (yscale) {
    // The assembly loop tests its exit condition at the bottom, so it must
    // only be entered when at least one group of four samples exists.
    if (x_out_max >= 4) {
      int temp8, temp9, temp10, temp11;
      __asm__ volatile (
        // temp3 * temp4 = 0x10000 * 0x8000 = 0x80000000: each 'mult' below
        // preloads an accumulator with this rounding term before 'maddu'.
        "li       %[temp3],    0x10000                    \n\t"
        "li       %[temp4],    0x8000                     \n\t"
        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
      "1:                                                 \n\t"
        "lw       %[temp0],    0(%[frow])                 \n\t"
        "lw       %[temp1],    4(%[frow])                 \n\t"
        "lw       %[temp2],    8(%[frow])                 \n\t"
        "lw       %[temp5],    12(%[frow])                \n\t"
        // frac = upper word of (rounder + frow[x] * yscale), four samples.
        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac0,        %[temp0],    %[yscale]     \n\t"
        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac1,        %[temp1],    %[yscale]     \n\t"
        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac2,        %[temp2],    %[yscale]     \n\t"
        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac3,        %[temp5],    %[yscale]     \n\t"
        "addiu    %[frow],     %[frow],     16            \n\t"
        "mfhi     %[temp0],    $ac0                       \n\t"
        "mfhi     %[temp1],    $ac1                       \n\t"
        "mfhi     %[temp2],    $ac2                       \n\t"
        "mfhi     %[temp5],    $ac3                       \n\t"
        "lw       %[temp8],    0(%[irow])                 \n\t"
        "lw       %[temp9],    4(%[irow])                 \n\t"
        "lw       %[temp10],   8(%[irow])                 \n\t"
        "lw       %[temp11],   12(%[irow])                \n\t"
        // dst and irow are advanced early; the stores below therefore use
        // negative offsets.
        "addiu    %[dst],      %[dst],      4             \n\t"
        "addiu    %[irow],     %[irow],     16            \n\t"
        // irow[x] - frac
        "subu     %[temp8],    %[temp8],    %[temp0]      \n\t"
        "subu     %[temp9],    %[temp9],    %[temp1]      \n\t"
        "subu     %[temp10],   %[temp10],   %[temp2]      \n\t"
        "subu     %[temp11],   %[temp11],   %[temp5]      \n\t"
        // v = upper word of (rounder + (irow[x] - frac) * fxy_scale)
        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac0,        %[temp8],    %[temp7]      \n\t"
        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac1,        %[temp9],    %[temp7]      \n\t"
        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac2,        %[temp10],   %[temp7]      \n\t"
        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac3,        %[temp11],   %[temp7]      \n\t"
        "mfhi     %[temp8],    $ac0                       \n\t"
        "mfhi     %[temp9],    $ac1                       \n\t"
        "mfhi     %[temp10],   $ac2                       \n\t"
        "mfhi     %[temp11],   $ac3                       \n\t"
        // irow[x] = frac
        "sw       %[temp0],    -16(%[irow])               \n\t"
        "sw       %[temp1],    -12(%[irow])               \n\t"
        "sw       %[temp2],    -8(%[irow])                \n\t"
        "sw       %[temp5],    -4(%[irow])                \n\t"
        // dst[x] = low byte of v
        "sb       %[temp8],    -4(%[dst])                 \n\t"
        "sb       %[temp9],    -3(%[dst])                 \n\t"
        "sb       %[temp10],   -2(%[dst])                 \n\t"
        "sb       %[temp11],   -1(%[dst])                 \n\t"
        "bne      %[frow],     %[loop_end], 1b            \n\t"
        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
          [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
          [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
          [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
        : [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
      );
    }
    // Scalar tail: frow/irow/dst were advanced by the assembly above.
    for (i = 0; i < (x_out_max & 0x3); ++i) {
      const uint32_t frac = (uint32_t)MULT_FIX(*frow++, yscale);
      const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
      assert(v >= 0 && v <= 255);
      *dst++ = v;
      *irow++ = frac;   // new fractional start
    }
  } else {
    if (x_out_max >= 4) {
      __asm__ volatile (
        // Same rounding-term preload as in the branch above.
        "li       %[temp3],    0x10000                    \n\t"
        "li       %[temp4],    0x8000                     \n\t"
        "addu     %[loop_end], %[irow],     %[temp6]      \n\t"
      "1:                                                 \n\t"
        "lw       %[temp0],    0(%[irow])                 \n\t"
        "lw       %[temp1],    4(%[irow])                 \n\t"
        "lw       %[temp2],    8(%[irow])                 \n\t"
        "lw       %[temp5],    12(%[irow])                \n\t"
        "addiu    %[dst],      %[dst],      4             \n\t"
        "addiu    %[irow],     %[irow],     16            \n\t"
        // v = upper word of (rounder + irow[x] * fxy_scale)
        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
        "maddu    $ac3,        %[temp5],    %[temp7]      \n\t"
        "mfhi     %[temp0],    $ac0                       \n\t"
        "mfhi     %[temp1],    $ac1                       \n\t"
        "mfhi     %[temp2],    $ac2                       \n\t"
        "mfhi     %[temp5],    $ac3                       \n\t"
        // irow[x] = 0
        "sw       $zero,       -16(%[irow])               \n\t"
        "sw       $zero,       -12(%[irow])               \n\t"
        "sw       $zero,       -8(%[irow])                \n\t"
        "sw       $zero,       -4(%[irow])                \n\t"
        "sb       %[temp0],    -4(%[dst])                 \n\t"
        "sb       %[temp1],    -3(%[dst])                 \n\t"
        "sb       %[temp2],    -2(%[dst])                 \n\t"
        "sb       %[temp5],    -1(%[dst])                 \n\t"
        "bne      %[irow],     %[loop_end], 1b            \n\t"
        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
          [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
        : [temp7]"r"(temp7), [temp6]"r"(temp6)
        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
      );
    }
    // Scalar tail for the last (x_out_max & 3) samples.
    for (i = 0; i < (x_out_max & 0x3); ++i) {
      const int v = (int)MULT_FIX(*irow, wrk->fxy_scale);
      assert(v >= 0 && v <= 255);
      *dst++ = v;
      *irow++ = 0;
    }
  }
}
164 
// Row export for the expanding case (asserts wrk->y_expand and
// wrk->y_sub != 0).
// Handles x_out_max = dst_width * num_channels samples and writes one byte per
// sample to wrk->dst; wrk->irow and wrk->frow are only read:
//   - y_accum == 0: dst[x] = MULT_FIX(frow[x], fy_scale)
//   - otherwise:    B = WEBP_RESCALER_FRAC(-y_accum, y_sub)
//                   A = WEBP_RESCALER_ONE - B
//                   J = (A * frow[x] + B * irow[x] + ROUNDER)
//                         >> WEBP_RESCALER_RFIX
//                   dst[x] = MULT_FIX(J, fy_scale)
// Samples are processed four at a time by the inline assembly (one DSP
// accumulator $ac0..$ac3 per sample); the remaining (x_out_max & 3) samples
// go through the scalar loops.
// NOTE(review): the assembly takes the upper accumulator word (mfhi) as the
// result, which matches the scalar code only if WEBP_RESCALER_RFIX is 32 --
// confirm against rescaler.h.
static void ExportRowExpand(WebPRescaler* const wrk) {
  int i;
  uint8_t* dst = wrk->dst;
  rescaler_t* irow = wrk->irow;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* frow = wrk->frow;
  int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
  // Byte length of the part of the row handled by the assembly:
  // (number of samples rounded down to a multiple of 4) * 4 bytes per sample.
  const int temp6 = (x_out_max & ~0x3) << 2;
  const int temp7 = (int)wrk->fy_scale;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);
  if (wrk->y_accum == 0) {
    // The assembly loop tests its exit condition at the bottom, so it must
    // only be entered when at least one group of four samples exists.
    if (x_out_max >= 4) {
      __asm__ volatile (
        // temp4 * temp5 = 0x10000 * 0x8000 = 0x80000000: each 'mult' below
        // preloads an accumulator with this rounding term before 'maddu'.
        "li       %[temp4],    0x10000                    \n\t"
        "li       %[temp5],    0x8000                     \n\t"
        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
      "1:                                                 \n\t"
        "lw       %[temp0],    0(%[frow])                 \n\t"
        "lw       %[temp1],    4(%[frow])                 \n\t"
        "lw       %[temp2],    8(%[frow])                 \n\t"
        "lw       %[temp3],    12(%[frow])                \n\t"
        // dst and frow are advanced early; the byte stores below therefore
        // use negative offsets.
        "addiu    %[dst],      %[dst],      4             \n\t"
        "addiu    %[frow],     %[frow],     16            \n\t"
        // v = upper word of (rounder + frow[x] * fy_scale)
        "mult     $ac0,        %[temp4],    %[temp5]      \n\t"
        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
        "mult     $ac1,        %[temp4],    %[temp5]      \n\t"
        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
        "mult     $ac2,        %[temp4],    %[temp5]      \n\t"
        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
        "mult     $ac3,        %[temp4],    %[temp5]      \n\t"
        "maddu    $ac3,        %[temp3],    %[temp7]      \n\t"
        "mfhi     %[temp0],    $ac0                       \n\t"
        "mfhi     %[temp1],    $ac1                       \n\t"
        "mfhi     %[temp2],    $ac2                       \n\t"
        "mfhi     %[temp3],    $ac3                       \n\t"
        "sb       %[temp0],    -4(%[dst])                 \n\t"
        "sb       %[temp1],    -3(%[dst])                 \n\t"
        "sb       %[temp2],    -2(%[dst])                 \n\t"
        "sb       %[temp3],    -1(%[dst])                 \n\t"
        "bne      %[frow],     %[loop_end], 1b            \n\t"
        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
          [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
        : [temp7]"r"(temp7), [temp6]"r"(temp6)
        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
      );
    }
    // Scalar tail for the last (x_out_max & 3) samples.
    for (i = 0; i < (x_out_max & 0x3); ++i) {
      const uint32_t J = *frow++;
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      assert(v >= 0 && v <= 255);
      *dst++ = v;
    }
  } else {
    // Blend weights: B for irow, A = WEBP_RESCALER_ONE - B for frow.
    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
    if (x_out_max >= 4) {
      int temp8, temp9, temp10, temp11;
      __asm__ volatile (
        // temp8 * temp9 = 0x80000000 is the rounding term preloaded by each
        // 'mult' below.
        "li       %[temp8],    0x10000                    \n\t"
        "li       %[temp9],    0x8000                     \n\t"
        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
      "1:                                                 \n\t"
        "lw       %[temp0],    0(%[frow])                 \n\t"
        "lw       %[temp1],    4(%[frow])                 \n\t"
        "lw       %[temp2],    8(%[frow])                 \n\t"
        "lw       %[temp3],    12(%[frow])                \n\t"
        "lw       %[temp4],    0(%[irow])                 \n\t"
        "lw       %[temp5],    4(%[irow])                 \n\t"
        "lw       %[temp10],   8(%[irow])                 \n\t"
        "lw       %[temp11],   12(%[irow])                \n\t"
        "addiu    %[dst],      %[dst],      4             \n\t"
        // J = upper word of (rounder + A * frow[x] + B * irow[x])
        "mult     $ac0,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac0,        %[A],        %[temp0]      \n\t"
        "maddu    $ac0,        %[B],        %[temp4]      \n\t"
        "mult     $ac1,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac1,        %[A],        %[temp1]      \n\t"
        "maddu    $ac1,        %[B],        %[temp5]      \n\t"
        "mult     $ac2,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac2,        %[A],        %[temp2]      \n\t"
        "maddu    $ac2,        %[B],        %[temp10]     \n\t"
        "mult     $ac3,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac3,        %[A],        %[temp3]      \n\t"
        "maddu    $ac3,        %[B],        %[temp11]     \n\t"
        "addiu    %[frow],     %[frow],     16            \n\t"
        "addiu    %[irow],     %[irow],     16            \n\t"
        "mfhi     %[temp0],    $ac0                       \n\t"
        "mfhi     %[temp1],    $ac1                       \n\t"
        "mfhi     %[temp2],    $ac2                       \n\t"
        "mfhi     %[temp3],    $ac3                       \n\t"
        // v = upper word of (rounder + J * fy_scale)
        "mult     $ac0,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
        "mult     $ac1,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
        "mult     $ac2,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
        "mult     $ac3,        %[temp8],    %[temp9]      \n\t"
        "maddu    $ac3,        %[temp3],    %[temp7]      \n\t"
        "mfhi     %[temp0],    $ac0                       \n\t"
        "mfhi     %[temp1],    $ac1                       \n\t"
        "mfhi     %[temp2],    $ac2                       \n\t"
        "mfhi     %[temp3],    $ac3                       \n\t"
        "sb       %[temp0],    -4(%[dst])                 \n\t"
        "sb       %[temp1],    -3(%[dst])                 \n\t"
        "sb       %[temp2],    -2(%[dst])                 \n\t"
        "sb       %[temp3],    -1(%[dst])                 \n\t"
        "bne      %[frow],     %[loop_end], 1b            \n\t"
        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
          [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
          [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
          [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
        : [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
      );
    }
    // Scalar tail for the last (x_out_max & 3) samples.
    for (i = 0; i < (x_out_max & 0x3); ++i) {
      const uint64_t I = (uint64_t)A * *frow++
                       + (uint64_t)B * *irow++;
      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      assert(v >= 0 && v <= 255);
      *dst++ = v;
    }
  }
}
296 
297 #undef MULT_FIX
298 #undef ROUNDER
299 
//------------------------------------------------------------------------------
// Entry point
302 
303 extern void WebPRescalerDspInitMIPSdspR2(void);
304 
WebPRescalerDspInitMIPSdspR2(void)305 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
306   WebPRescalerExportRowExpand = ExportRowExpand;
307   WebPRescalerExportRowShrink = ExportRowShrink;
308 }
309 
310 #else  // !WEBP_USE_MIPS_DSP_R2
311 
312 WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
313 
314 #endif  // WEBP_USE_MIPS_DSP_R2
315