/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/inv_txfm_msa.h"

idct32x8_row_transpose_store(const int16_t * input,int16_t * tmp_buf)13 static void idct32x8_row_transpose_store(const int16_t *input,
14                                          int16_t *tmp_buf) {
15   v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
16 
17   /* 1st & 2nd 8x8 */
18   LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
19   LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
20   TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
21                      n3);
22   TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
23                      n7);
24   ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
25   ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
26   ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
27 
28   /* 3rd & 4th 8x8 */
29   LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
30   LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
31   TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
32                      n3);
33   TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
34                      n7);
35   ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
36   ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
37   ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
38   ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
39 }
40 
idct32x8_row_even_process_store(int16_t * tmp_buf,int16_t * tmp_eve_buf)41 static void idct32x8_row_even_process_store(int16_t *tmp_buf,
42                                             int16_t *tmp_eve_buf) {
43   v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
44   v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
45   v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
46 
47   /* Even stage 1 */
48   LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
49 
50   DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
51   DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
52   BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
53   DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
54 
55   loc1 = vec3;
56   loc0 = vec1;
57 
58   DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
59   DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
60   BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
61   BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
62   BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
63 
64   /* Even stage 2 */
65   LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
66   DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
67   DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
68   DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
69   DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
70 
71   vec0 = reg0 + reg4;
72   reg0 = reg0 - reg4;
73   reg4 = reg6 + reg2;
74   reg6 = reg6 - reg2;
75   reg2 = reg1 + reg5;
76   reg1 = reg1 - reg5;
77   reg5 = reg7 + reg3;
78   reg7 = reg7 - reg3;
79   reg3 = vec0;
80 
81   vec1 = reg2;
82   reg2 = reg3 + reg4;
83   reg3 = reg3 - reg4;
84   reg4 = reg5 - vec1;
85   reg5 = reg5 + vec1;
86 
87   DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
88   DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
89 
90   vec0 = reg0 - reg6;
91   reg0 = reg0 + reg6;
92   vec1 = reg7 - reg1;
93   reg7 = reg7 + reg1;
94 
95   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
96   DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
97 
98   /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
99   BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
100   ST_SH(loc0, (tmp_eve_buf + 15 * 8));
101   ST_SH(loc1, (tmp_eve_buf));
102   ST_SH(loc2, (tmp_eve_buf + 14 * 8));
103   ST_SH(loc3, (tmp_eve_buf + 8));
104 
105   BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
106   ST_SH(loc0, (tmp_eve_buf + 13 * 8));
107   ST_SH(loc1, (tmp_eve_buf + 2 * 8));
108   ST_SH(loc2, (tmp_eve_buf + 12 * 8));
109   ST_SH(loc3, (tmp_eve_buf + 3 * 8));
110 
111   /* Store 8 */
112   BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
113   ST_SH(loc0, (tmp_eve_buf + 11 * 8));
114   ST_SH(loc1, (tmp_eve_buf + 4 * 8));
115   ST_SH(loc2, (tmp_eve_buf + 10 * 8));
116   ST_SH(loc3, (tmp_eve_buf + 5 * 8));
117 
118   BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
119   ST_SH(loc0, (tmp_eve_buf + 9 * 8));
120   ST_SH(loc1, (tmp_eve_buf + 6 * 8));
121   ST_SH(loc2, (tmp_eve_buf + 8 * 8));
122   ST_SH(loc3, (tmp_eve_buf + 7 * 8));
123 }
124 
idct32x8_row_odd_process_store(int16_t * tmp_buf,int16_t * tmp_odd_buf)125 static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
126                                            int16_t *tmp_odd_buf) {
127   v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
128   v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
129 
130   /* Odd stage 1 */
131   reg0 = LD_SH(tmp_buf + 8);
132   reg1 = LD_SH(tmp_buf + 7 * 8);
133   reg2 = LD_SH(tmp_buf + 9 * 8);
134   reg3 = LD_SH(tmp_buf + 15 * 8);
135   reg4 = LD_SH(tmp_buf + 17 * 8);
136   reg5 = LD_SH(tmp_buf + 23 * 8);
137   reg6 = LD_SH(tmp_buf + 25 * 8);
138   reg7 = LD_SH(tmp_buf + 31 * 8);
139 
140   DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
141   DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
142   DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
143   DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
144 
145   vec0 = reg0 + reg3;
146   reg0 = reg0 - reg3;
147   reg3 = reg7 + reg4;
148   reg7 = reg7 - reg4;
149   reg4 = reg1 + reg2;
150   reg1 = reg1 - reg2;
151   reg2 = reg6 + reg5;
152   reg6 = reg6 - reg5;
153   reg5 = vec0;
154 
155   /* 4 Stores */
156   ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
157   ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
158 
159   SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
160   DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
161   ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
162 
163   /* 4 Stores */
164   DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
165   DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
166   BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
167   ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
168 
169   DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
170   ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
171 
172   /* Odd stage 2 */
173   /* 8 loads */
174   reg0 = LD_SH(tmp_buf + 3 * 8);
175   reg1 = LD_SH(tmp_buf + 5 * 8);
176   reg2 = LD_SH(tmp_buf + 11 * 8);
177   reg3 = LD_SH(tmp_buf + 13 * 8);
178   reg4 = LD_SH(tmp_buf + 19 * 8);
179   reg5 = LD_SH(tmp_buf + 21 * 8);
180   reg6 = LD_SH(tmp_buf + 27 * 8);
181   reg7 = LD_SH(tmp_buf + 29 * 8);
182 
183   DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
184   DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
185   DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
186   DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
187 
188   /* 4 Stores */
189   SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
190   DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
191   DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
192 
193   BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
194   ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
195 
196   DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
197   ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
198 
199   /* 4 Stores */
200   ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
201   BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
202   ST_SH(reg0, (tmp_odd_buf + 13 * 8));
203   ST_SH(reg1, (tmp_odd_buf + 14 * 8));
204 
205   DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
206   ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
207 
208   /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
209 
210   /* Load 8 & Store 8 */
211   LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
212   LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
213 
214   ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
215   ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
216 
217   SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
218   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
219 
220   SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
221   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
222   ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
223 
224   /* Load 8 & Store 8 */
225   LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
226   LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
227 
228   ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
229   ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
230 
231   SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
232   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
233 
234   SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
235   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
236   ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
237 }
238 
idct_butterfly_transpose_store(int16_t * tmp_buf,int16_t * tmp_eve_buf,int16_t * tmp_odd_buf,int16_t * dst)239 static void idct_butterfly_transpose_store(int16_t *tmp_buf,
240                                            int16_t *tmp_eve_buf,
241                                            int16_t *tmp_odd_buf, int16_t *dst) {
242   v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
243   v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
244 
245   /* FINAL BUTTERFLY : Dependency on Even & Odd */
246   vec0 = LD_SH(tmp_odd_buf);
247   vec1 = LD_SH(tmp_odd_buf + 9 * 8);
248   vec2 = LD_SH(tmp_odd_buf + 14 * 8);
249   vec3 = LD_SH(tmp_odd_buf + 6 * 8);
250   loc0 = LD_SH(tmp_eve_buf);
251   loc1 = LD_SH(tmp_eve_buf + 8 * 8);
252   loc2 = LD_SH(tmp_eve_buf + 4 * 8);
253   loc3 = LD_SH(tmp_eve_buf + 12 * 8);
254 
255   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
256 
257   ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
258   ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
259   ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
260   ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
261 
262   /* Load 8 & Store 8 */
263   vec0 = LD_SH(tmp_odd_buf + 4 * 8);
264   vec1 = LD_SH(tmp_odd_buf + 13 * 8);
265   vec2 = LD_SH(tmp_odd_buf + 10 * 8);
266   vec3 = LD_SH(tmp_odd_buf + 3 * 8);
267   loc0 = LD_SH(tmp_eve_buf + 2 * 8);
268   loc1 = LD_SH(tmp_eve_buf + 10 * 8);
269   loc2 = LD_SH(tmp_eve_buf + 6 * 8);
270   loc3 = LD_SH(tmp_eve_buf + 14 * 8);
271 
272   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
273 
274   ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
275   ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
276   ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
277   ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
278 
279   /* Load 8 & Store 8 */
280   vec0 = LD_SH(tmp_odd_buf + 2 * 8);
281   vec1 = LD_SH(tmp_odd_buf + 11 * 8);
282   vec2 = LD_SH(tmp_odd_buf + 12 * 8);
283   vec3 = LD_SH(tmp_odd_buf + 7 * 8);
284   loc0 = LD_SH(tmp_eve_buf + 1 * 8);
285   loc1 = LD_SH(tmp_eve_buf + 9 * 8);
286   loc2 = LD_SH(tmp_eve_buf + 5 * 8);
287   loc3 = LD_SH(tmp_eve_buf + 13 * 8);
288 
289   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
290 
291   ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
292   ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
293   ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
294   ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
295 
296   /* Load 8 & Store 8 */
297   vec0 = LD_SH(tmp_odd_buf + 5 * 8);
298   vec1 = LD_SH(tmp_odd_buf + 15 * 8);
299   vec2 = LD_SH(tmp_odd_buf + 8 * 8);
300   vec3 = LD_SH(tmp_odd_buf + 1 * 8);
301   loc0 = LD_SH(tmp_eve_buf + 3 * 8);
302   loc1 = LD_SH(tmp_eve_buf + 11 * 8);
303   loc2 = LD_SH(tmp_eve_buf + 7 * 8);
304   loc3 = LD_SH(tmp_eve_buf + 15 * 8);
305 
306   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
307 
308   ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
309   ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
310   ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
311   ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
312 
313   /* Transpose : 16 vectors */
314   /* 1st & 2nd 8x8 */
315   TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
316                      n3);
317   ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
318   ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
319 
320   TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
321                      n7);
322   ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
323   ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
324 
325   /* 3rd & 4th 8x8 */
326   LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
327   LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
328   TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
329                      n3);
330   ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
331   ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
332 
333   TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
334                      n7);
335   ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
336   ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
337 }
338 
idct32x8_1d_rows_msa(const int16_t * input,int16_t * output)339 static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
340   DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
341   DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
342   DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
343 
344   idct32x8_row_transpose_store(input, &tmp_buf[0]);
345   idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
346   idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
347   idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
348                                  output);
349 }
350 
idct8x32_column_even_process_store(int16_t * tmp_buf,int16_t * tmp_eve_buf)351 static void idct8x32_column_even_process_store(int16_t *tmp_buf,
352                                                int16_t *tmp_eve_buf) {
353   v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
354   v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
355   v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
356 
357   /* Even stage 1 */
358   LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
359   tmp_buf += (2 * 32);
360 
361   DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
362   DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
363   BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
364   DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
365 
366   loc1 = vec3;
367   loc0 = vec1;
368 
369   DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
370   DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
371   BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
372   BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
373   BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
374 
375   /* Even stage 2 */
376   /* Load 8 */
377   LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
378 
379   DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
380   DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
381   DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
382   DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
383 
384   vec0 = reg0 + reg4;
385   reg0 = reg0 - reg4;
386   reg4 = reg6 + reg2;
387   reg6 = reg6 - reg2;
388   reg2 = reg1 + reg5;
389   reg1 = reg1 - reg5;
390   reg5 = reg7 + reg3;
391   reg7 = reg7 - reg3;
392   reg3 = vec0;
393 
394   vec1 = reg2;
395   reg2 = reg3 + reg4;
396   reg3 = reg3 - reg4;
397   reg4 = reg5 - vec1;
398   reg5 = reg5 + vec1;
399 
400   DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
401   DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
402 
403   vec0 = reg0 - reg6;
404   reg0 = reg0 + reg6;
405   vec1 = reg7 - reg1;
406   reg7 = reg7 + reg1;
407 
408   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
409   DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
410 
411   /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
412   /* Store 8 */
413   BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
414   ST_SH2(loc1, loc3, tmp_eve_buf, 8);
415   ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
416 
417   BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
418   ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
419   ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
420 
421   /* Store 8 */
422   BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
423   ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
424   ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
425 
426   BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
427   ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
428   ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
429 }
430 
idct8x32_column_odd_process_store(int16_t * tmp_buf,int16_t * tmp_odd_buf)431 static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
432                                               int16_t *tmp_odd_buf) {
433   v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
434   v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
435 
436   /* Odd stage 1 */
437   reg0 = LD_SH(tmp_buf + 32);
438   reg1 = LD_SH(tmp_buf + 7 * 32);
439   reg2 = LD_SH(tmp_buf + 9 * 32);
440   reg3 = LD_SH(tmp_buf + 15 * 32);
441   reg4 = LD_SH(tmp_buf + 17 * 32);
442   reg5 = LD_SH(tmp_buf + 23 * 32);
443   reg6 = LD_SH(tmp_buf + 25 * 32);
444   reg7 = LD_SH(tmp_buf + 31 * 32);
445 
446   DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
447   DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
448   DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
449   DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
450 
451   vec0 = reg0 + reg3;
452   reg0 = reg0 - reg3;
453   reg3 = reg7 + reg4;
454   reg7 = reg7 - reg4;
455   reg4 = reg1 + reg2;
456   reg1 = reg1 - reg2;
457   reg2 = reg6 + reg5;
458   reg6 = reg6 - reg5;
459   reg5 = vec0;
460 
461   /* 4 Stores */
462   ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
463   ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
464   SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
465   DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
466   ST_SH2(vec0, vec1, tmp_odd_buf, 8);
467 
468   /* 4 Stores */
469   DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
470   DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
471   BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
472   ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
473   DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
474   ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
475 
476   /* Odd stage 2 */
477   /* 8 loads */
478   reg0 = LD_SH(tmp_buf + 3 * 32);
479   reg1 = LD_SH(tmp_buf + 5 * 32);
480   reg2 = LD_SH(tmp_buf + 11 * 32);
481   reg3 = LD_SH(tmp_buf + 13 * 32);
482   reg4 = LD_SH(tmp_buf + 19 * 32);
483   reg5 = LD_SH(tmp_buf + 21 * 32);
484   reg6 = LD_SH(tmp_buf + 27 * 32);
485   reg7 = LD_SH(tmp_buf + 29 * 32);
486 
487   DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
488   DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
489   DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
490   DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
491 
492   /* 4 Stores */
493   SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
494   DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
495   DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
496   BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
497   ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
498   DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
499   ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
500 
501   /* 4 Stores */
502   ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
503   BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
504   ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
505   DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
506   ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
507 
508   /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
509   /* Load 8 & Store 8 */
510   LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
511   LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
512 
513   ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
514   ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
515 
516   SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
517   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
518 
519   SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
520   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
521   ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
522 
523   /* Load 8 & Store 8 */
524   LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
525   LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
526 
527   ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
528   ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
529 
530   SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
531   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
532 
533   SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
534   DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
535   ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
536 }
537 
idct8x32_column_butterfly_addblk(int16_t * tmp_eve_buf,int16_t * tmp_odd_buf,uint8_t * dst,int32_t dst_stride)538 static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
539                                              int16_t *tmp_odd_buf, uint8_t *dst,
540                                              int32_t dst_stride) {
541   v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
542   v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
543 
544   /* FINAL BUTTERFLY : Dependency on Even & Odd */
545   vec0 = LD_SH(tmp_odd_buf);
546   vec1 = LD_SH(tmp_odd_buf + 9 * 8);
547   vec2 = LD_SH(tmp_odd_buf + 14 * 8);
548   vec3 = LD_SH(tmp_odd_buf + 6 * 8);
549   loc0 = LD_SH(tmp_eve_buf);
550   loc1 = LD_SH(tmp_eve_buf + 8 * 8);
551   loc2 = LD_SH(tmp_eve_buf + 4 * 8);
552   loc3 = LD_SH(tmp_eve_buf + 12 * 8);
553 
554   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
555   SRARI_H4_SH(m0, m2, m4, m6, 6);
556   VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
557 
558   SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
559   SRARI_H4_SH(m0, m2, m4, m6, 6);
560   VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
561                       m6);
562 
563   /* Load 8 & Store 8 */
564   vec0 = LD_SH(tmp_odd_buf + 4 * 8);
565   vec1 = LD_SH(tmp_odd_buf + 13 * 8);
566   vec2 = LD_SH(tmp_odd_buf + 10 * 8);
567   vec3 = LD_SH(tmp_odd_buf + 3 * 8);
568   loc0 = LD_SH(tmp_eve_buf + 2 * 8);
569   loc1 = LD_SH(tmp_eve_buf + 10 * 8);
570   loc2 = LD_SH(tmp_eve_buf + 6 * 8);
571   loc3 = LD_SH(tmp_eve_buf + 14 * 8);
572 
573   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
574   SRARI_H4_SH(m1, m3, m5, m7, 6);
575   VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
576 
577   SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
578   SRARI_H4_SH(m1, m3, m5, m7, 6);
579   VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
580                       m7);
581 
582   /* Load 8 & Store 8 */
583   vec0 = LD_SH(tmp_odd_buf + 2 * 8);
584   vec1 = LD_SH(tmp_odd_buf + 11 * 8);
585   vec2 = LD_SH(tmp_odd_buf + 12 * 8);
586   vec3 = LD_SH(tmp_odd_buf + 7 * 8);
587   loc0 = LD_SH(tmp_eve_buf + 1 * 8);
588   loc1 = LD_SH(tmp_eve_buf + 9 * 8);
589   loc2 = LD_SH(tmp_eve_buf + 5 * 8);
590   loc3 = LD_SH(tmp_eve_buf + 13 * 8);
591 
592   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
593   SRARI_H4_SH(n0, n2, n4, n6, 6);
594   VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
595 
596   SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
597   SRARI_H4_SH(n0, n2, n4, n6, 6);
598   VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
599                       n6);
600 
601   /* Load 8 & Store 8 */
602   vec0 = LD_SH(tmp_odd_buf + 5 * 8);
603   vec1 = LD_SH(tmp_odd_buf + 15 * 8);
604   vec2 = LD_SH(tmp_odd_buf + 8 * 8);
605   vec3 = LD_SH(tmp_odd_buf + 1 * 8);
606   loc0 = LD_SH(tmp_eve_buf + 3 * 8);
607   loc1 = LD_SH(tmp_eve_buf + 11 * 8);
608   loc2 = LD_SH(tmp_eve_buf + 7 * 8);
609   loc3 = LD_SH(tmp_eve_buf + 15 * 8);
610 
611   ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
612   SRARI_H4_SH(n1, n3, n5, n7, 6);
613   VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
614 
615   SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
616   SRARI_H4_SH(n1, n3, n5, n7, 6);
617   VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
618                       n7);
619 }
620 
idct8x32_1d_columns_addblk_msa(int16_t * input,uint8_t * dst,int32_t dst_stride)621 static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
622                                            int32_t dst_stride) {
623   DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
624   DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
625 
626   idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
627   idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
628   idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
629                                    dst_stride);
630 }
631 
vpx_idct32x32_1024_add_msa(const int16_t * input,uint8_t * dst,int32_t dst_stride)632 void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
633                                 int32_t dst_stride) {
634   int32_t i;
635   DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
636   int16_t *out_ptr = out_arr;
637 
638   /* transform rows */
639   for (i = 0; i < 4; ++i) {
640     /* process 32 * 8 block */
641     idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
642   }
643 
644   /* transform columns */
645   for (i = 0; i < 4; ++i) {
646     /* process 8 * 32 block */
647     idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
648                                    dst_stride);
649   }
650 }
651 
vpx_idct32x32_34_add_msa(const int16_t * input,uint8_t * dst,int32_t dst_stride)652 void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
653                               int32_t dst_stride) {
654   int32_t i;
655   DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
656   int16_t *out_ptr = out_arr;
657 
658   for (i = 32; i--;) {
659     __asm__ __volatile__(
660         "sw     $zero,      0(%[out_ptr])     \n\t"
661         "sw     $zero,      4(%[out_ptr])     \n\t"
662         "sw     $zero,      8(%[out_ptr])     \n\t"
663         "sw     $zero,     12(%[out_ptr])     \n\t"
664         "sw     $zero,     16(%[out_ptr])     \n\t"
665         "sw     $zero,     20(%[out_ptr])     \n\t"
666         "sw     $zero,     24(%[out_ptr])     \n\t"
667         "sw     $zero,     28(%[out_ptr])     \n\t"
668         "sw     $zero,     32(%[out_ptr])     \n\t"
669         "sw     $zero,     36(%[out_ptr])     \n\t"
670         "sw     $zero,     40(%[out_ptr])     \n\t"
671         "sw     $zero,     44(%[out_ptr])     \n\t"
672         "sw     $zero,     48(%[out_ptr])     \n\t"
673         "sw     $zero,     52(%[out_ptr])     \n\t"
674         "sw     $zero,     56(%[out_ptr])     \n\t"
675         "sw     $zero,     60(%[out_ptr])     \n\t"
676 
677         :
678         : [out_ptr] "r"(out_ptr));
679 
680     out_ptr += 32;
681   }
682 
683   out_ptr = out_arr;
684 
685   /* rows: only upper-left 8x8 has non-zero coeff */
686   idct32x8_1d_rows_msa(input, out_ptr);
687 
688   /* transform columns */
689   for (i = 0; i < 4; ++i) {
690     /* process 8 * 32 block */
691     idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
692                                    dst_stride);
693   }
694 }
695 
vpx_idct32x32_1_add_msa(const int16_t * input,uint8_t * dst,int32_t dst_stride)696 void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
697                              int32_t dst_stride) {
698   int32_t i;
699   int16_t out;
700   v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
701   v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
702 
703   out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
704   out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
705   out = ROUND_POWER_OF_TWO(out, 6);
706 
707   vec = __msa_fill_h(out);
708 
709   for (i = 16; i--;) {
710     LD_UB2(dst, 16, dst0, dst1);
711     LD_UB2(dst + dst_stride, 16, dst2, dst3);
712 
713     UNPCK_UB_SH(dst0, res0, res4);
714     UNPCK_UB_SH(dst1, res1, res5);
715     UNPCK_UB_SH(dst2, res2, res6);
716     UNPCK_UB_SH(dst3, res3, res7);
717     ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
718     ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
719     CLIP_SH4_0_255(res0, res1, res2, res3);
720     CLIP_SH4_0_255(res4, res5, res6, res7);
721     PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
722                 tmp2, tmp3);
723 
724     ST_UB2(tmp0, tmp1, dst, 16);
725     dst += dst_stride;
726     ST_UB2(tmp2, tmp3, dst, 16);
727     dst += dst_stride;
728   }
729 }
730