/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/fwd_txfm_msa.h"
12
/* Column pass, stage 1: load one 8-column strip of the 32-row input,
 * pre-scale every sample by 4 (<< 2) and apply the first fdct32 butterfly,
 * pairing input row r with row (31 - r).  The 16 sums land in rows 0..15
 * of temp_buff and the 16 differences in rows 16..31 (each scratch row is
 * 8 int16_t wide).
 *
 * input      - top of the 8-column input strip
 * src_stride - input row stride, in int16_t units
 * temp_buff  - 32x8 scratch buffer receiving the butterfly results
 */
static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
                                              int32_t src_stride,
                                              int16_t *temp_buff) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 step0, step1, step2, step3;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
  v8i16 step0_1, step1_1, step2_1, step3_1;

  /* 1st and 2nd set: rows 0..3 with 28..31, rows 4..7 with 24..27 */
  LD_SH4(input, src_stride, in0, in1, in2, in3);
  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);

  /* 3rd and 4th set: rows 8..11 with 20..23, rows 12..15 with 16..19 */
  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
  /* Offset normalized to (16 * 8); was written as (15 * 8) + 8 (same value,
   * inconsistent with every other row offset in this function). */
  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (16 * 8), 8);
}
57
/* Column pass, even half of fdct32: consumes the 16 butterfly-sum rows at
 * 'input' (8 int16_t per row, as laid out by
 * fdct8x32_1d_column_load_butterfly) and emits the 16 even-numbered
 * transform coefficients.  Output coefficient k is stored at temp + k * 32,
 * i.e. 'temp' is a 32x32 buffer with a row stride of 32 int16_t.
 * FDCT32_POSTPROC_2V_POS_H applies the column-pass rounding before each
 * store.
 */
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 temp0, temp1;

  /* fdct even: stage-2 butterflies pairing rows r and (15 - r) */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
              vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
              in8, in9, in10, in11);

  /* Stage 3: 8-point even/even sub-transform on vec0..vec7 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp);       /* coefficient 0 */
  ST_SH(temp1, temp + 512); /* coefficient 16 */

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 256); /* coefficient 8 */
  ST_SH(temp1, temp + 768); /* coefficient 24 */

  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 128); /* coefficient 4 */
  ST_SH(temp1, temp + 896); /* coefficient 28 */

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 640); /* coefficient 20 */
  ST_SH(temp1, temp + 384); /* coefficient 12 */

  /* Even/odd sub-transform on in8..in15 (stage-2 differences) */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 64);  /* coefficient 2 */
  ST_SH(temp1, temp + 960); /* coefficient 30 */

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 576); /* coefficient 18 */
  ST_SH(temp1, temp + 448); /* coefficient 14 */

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 320); /* coefficient 10 */
  ST_SH(temp1, temp + 704); /* coefficient 22 */

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 192); /* coefficient 6 */
  ST_SH(temp1, temp + 832); /* coefficient 26 */
}
131
/* Column pass, odd half of fdct32: consumes the 16 butterfly-difference
 * rows at 'input' (the upper half of the 32x8 scratch buffer) and emits
 * the 16 odd-numbered transform coefficients.  temp_ptr points at output
 * row 1 of the 32x32 destination (row stride 32 int16_t), so a store at
 * temp_ptr + 32 * m (m even) produces coefficient m + 1.
 *
 * NOTE: this routine reuses 'input' as scratch — intermediate differences
 * are stored back into it and reloaded below, so the caller's buffer is
 * clobbered.
 */
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(input + 32);
  in21 = LD_SH(input + 40);
  in26 = LD_SH(input + 80);
  in27 = LD_SH(input + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(input + 16);
  in19 = LD_SH(input + 24);
  in28 = LD_SH(input + 96);
  in29 = LD_SH(input + 104);

  /* Stash stage-3 differences back into the scratch buffer for the
   * second half of this function. */
  vec4 = in19 - in20;
  ST_SH(vec4, input + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, input + 40);
  vec4 = in29 - in26;
  ST_SH(vec4, input + 80);
  vec4 = in28 - in27;
  ST_SH(vec4, input + 88);

  in21 = in18 + in21;
  in20 = in19 + in20;
  in27 = in28 + in27;
  in26 = in29 + in26;

  LD_SH4(input + 48, 8, in22, in23, in24, in25);
  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(input);
  in17 = LD_SH(input + 8);
  in30 = LD_SH(input + 112);
  in31 = LD_SH(input + 120);

  vec4 = in17 - in22;
  ST_SH(vec4, input + 16);
  vec4 = in16 - in23;
  ST_SH(vec4, input + 24);
  vec4 = in31 - in24;
  ST_SH(vec4, input + 96);
  vec4 = in30 - in25;
  ST_SH(vec4, input + 104);

  /* First group: coefficients 1, 31, 15, 17, 9, 23, 7, 25 */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr);       /* coefficient 1 */
  ST_SH(vec4, temp_ptr + 960); /* coefficient 31 */

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 448); /* coefficient 15 */
  ST_SH(vec4, temp_ptr + 512); /* coefficient 17 */

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 704); /* coefficient 23 */
  ST_SH(vec5, temp_ptr + 256); /* coefficient 9 */

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 192); /* coefficient 7 */
  ST_SH(vec5, temp_ptr + 768); /* coefficient 25 */

  /* Second group (from the stashed differences):
   * coefficients 27, 5, 11, 21, 19, 13, 3, 29 */
  LD_SH4(input + 16, 8, in22, in23, in20, in21);
  LD_SH4(input + 80, 8, in26, in27, in24, in25);
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 832); /* coefficient 27 */
  ST_SH(vec4, temp_ptr + 128); /* coefficient 5 */

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 320); /* coefficient 11 */
  ST_SH(vec4, temp_ptr + 640); /* coefficient 21 */
  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 576); /* coefficient 19 */
  ST_SH(vec4, temp_ptr + 384); /* coefficient 13 */

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 64);  /* coefficient 3 */
  ST_SH(vec4, temp_ptr + 896); /* coefficient 29 */
}
245
/* One 8-column strip of the 32x32 column transform: load + first butterfly
 * into tmp_buf (32x8), then the even half (rows 0..15 of tmp_buf) and the
 * odd half (rows 16..31) store all 32 coefficient rows into tmp_buf_big
 * (row stride 32; the odd half starts at output row 1).  Note that the
 * odd-store step also clobbers its half of tmp_buf as scratch. */
static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
}
252
/* Row pass, stage 1: read an 8x32 strip of the column-transformed buffer
 * (row stride 32), transpose it into 32 vectors of 8, and apply the first
 * fdct32 butterfly pairing column c with column (31 - c).  Sums go to
 * rows 0..15 of 'output', differences to rows 16..31 (8 int16_t per row).
 */
static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
                                           int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;

  /* 1st set: columns 0..7 paired with columns 24..31 */
  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);

  /* 2nd set: columns 8..15 paired with columns 16..23 */
  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
         (output + 8 * 8), 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
}
285
/* Row pass, even half — higher-precision variant used for the first 8-row
 * strip.  The stage-2 butterfly outputs are spilled to interm_ptr so they
 * can be reloaded after the vector registers are reused; the first four
 * output vectors are computed with 32-bit intermediates (UNPCK_SH_SW +
 * DOTP_CONST_PAIR_W + FDCT32_POSTPROC_NEG_W, then packed back to 16 bits),
 * while the remaining even coefficients follow the same 16-bit path as
 * fdct8x32_1d_row_even.  Output rows are 8 int16_t apart in 'out'.
 */
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* Spill stage-2 results; reloaded below after the registers are reused. */
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3: widen to 32 bits for the first four outputs */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
       tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
       vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  /* Remaining outputs: 16-bit path on the spilled stage-2 sums */
  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  /* 16-bit path on the spilled stage-2 differences */
  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}
390
/* Row pass, even half of fdct32 (16-bit path): consumes the 16
 * butterfly-sum rows in 'temp' (8 int16_t per row) and stores the 16
 * even-half outputs at out + 8 * slot, in the scrambled slot order that
 * fdct8x32_1d_row_transpose_store expects.  FDCT_POSTPROC_2V_NEG_H
 * applies the row-pass rounding before each store.
 */
static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);

  /* Stage 3: 8-point sub-transform on the sums vec0..vec7 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  /* Sub-transform on the differences in8..in15 */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  /* Semicolon added: this SUB4 call was the only macro invocation in the
   * file without a statement terminator (it compiled only because the
   * macro expands to a braced block). */
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}
462
/* Row pass, odd half of fdct32 (16-bit path): consumes the 16
 * butterfly-difference rows in 'temp' and stores the 16 odd-half outputs
 * at out + 8 * slot, in the scrambled slot order that
 * fdct8x32_1d_row_transpose_store expects.  'interm_ptr' is scratch:
 * intermediate differences are stored there at fixed offsets and reloaded
 * for the second half of the computation.
 */
static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
                                int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  /* Stash stage-3 differences in scratch; reloaded further down. */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  /* First output group */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);

  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);

  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* Second output group, from the stashed differences */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);

  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);

  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
594
/* Row pass, final step: gather the 32 coefficient vectors from their
 * scrambled slots in 'temp' (written by fdct8x32_1d_row_even/_odd), put
 * them in natural coefficient order via the load offsets below, transpose
 * each 8x8 tile and store 8 output rows of 32 coefficients (output row
 * stride 32 int16_t). */
static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;

  /* 1st set: coefficients 0..7 (offsets pick the matching slots) */
  in0 = LD_SH(temp);
  in4 = LD_SH(temp + 32);
  in2 = LD_SH(temp + 64);
  in6 = LD_SH(temp + 96);
  in1 = LD_SH(temp + 128);
  in7 = LD_SH(temp + 152);
  in3 = LD_SH(temp + 192);
  in5 = LD_SH(temp + 216);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);

  /* 2nd set: coefficients 8..15 */
  in0_1 = LD_SH(temp + 16);
  in1_1 = LD_SH(temp + 232);
  in2_1 = LD_SH(temp + 80);
  in3_1 = LD_SH(temp + 168);
  in4_1 = LD_SH(temp + 48);
  in5_1 = LD_SH(temp + 176);
  in6_1 = LD_SH(temp + 112);
  in7_1 = LD_SH(temp + 240);

  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);

  /* 3rd set: coefficients 16..23 */
  in0 = LD_SH(temp + 8);
  in1 = LD_SH(temp + 136);
  in2 = LD_SH(temp + 72);
  in3 = LD_SH(temp + 200);
  in4 = LD_SH(temp + 40);
  in5 = LD_SH(temp + 208);
  in6 = LD_SH(temp + 104);
  in7 = LD_SH(temp + 144);

  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
         32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);

  /* 4th set: coefficients 24..31 */
  in0_1 = LD_SH(temp + 24);
  in1_1 = LD_SH(temp + 224);
  in2_1 = LD_SH(temp + 88);
  in3_1 = LD_SH(temp + 160);
  in4_1 = LD_SH(temp + 56);
  in5_1 = LD_SH(temp + 184);
  in6_1 = LD_SH(temp + 120);
  in7_1 = LD_SH(temp + 248);

  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
         32);
}
657
/* Full row transform of one 8-row strip (16-bit path): butterfly into
 * temp_buf, even/odd halves computed in place in temp_buf (the odd half
 * uses 'temp' as scratch, clobbering it), then transpose-gather into
 * 'output'. */
static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
  fdct8x32_1d_row_even(temp_buf, temp_buf);
  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
  fdct8x32_1d_row_transpose_store(temp_buf, output);
}
664
/* Full row transform of one 8-row strip using the higher-precision even
 * half (32-bit intermediates for the first outputs).  tmp_buf_big is used
 * as scratch by both the even and odd halves, so its contents are
 * clobbered. */
static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
                               int16_t *output) {
  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
  fdct8x32_1d_row_transpose_store(tmp_buf, output);
}
672
/* 32x32 forward DCT, MSA-accelerated.  The column transform processes the
 * input in four 8-column strips into tmp_buf_big (32x32, row stride 32);
 * the row transform then processes four 8-row strips of that buffer into
 * 'output'.  The first row strip uses the _4x variant, which computes its
 * leading outputs with 32-bit intermediates; the remaining strips use the
 * pure 16-bit path. */
void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);

  /* column transform */
  for (i = 0; i < 4; ++i) {
    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
                       tmp_buf_big + (8 * i));
  }

  /* row transform: first 8-row strip (higher-precision path) */
  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);

  /* row transform: remaining three 8-row strips */
  for (i = 1; i < 4; ++i) {
    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
  }
}
693
/* Row pass, even half — rounding variant: unlike fdct8x32_1d_row_even,
 * FDCT_POSTPROC_2V_NEG_H is applied once to all stage-2 butterfly outputs
 * up front, and the final stores are written without per-output
 * post-processing.  Output layout matches fdct8x32_1d_row_even
 * (out + 8 * slot).
 * NOTE(review): presumably this serves the vpx_fdct32x32_rd path — its
 * caller is outside this chunk; confirm against the rest of the file. */
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* Rounding applied here, once, for the whole even half. */
  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);

  temp0 = in0 + in3;
  in0 = in0 - in3;
  in3 = in1 + in2;
  in1 = in1 - in2;

  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  /* Sub-transform on the differences in8..in15 */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}
770
/* Odd-indexed half (outputs 1, 3, 5, ..., 31) of the 1-D 32-point forward
 * DCT for one strip of 8 rows, rounding ("_rd") variant.
 *
 * temp       - input: 16 v8i16 vectors at an 8-element stride
 *              (offsets 0, 8, ..., 120).
 * interm_ptr - scratch buffer: butterfly difference terms are parked at
 *              offsets 32..88 and reloaded after the sum path completes,
 *              freeing vector registers in between.
 * out        - output: 16 v8i16 vectors at an 8-element stride
 *              (offsets 0, 8, ..., 120).
 *
 * NOTE(review): FDCT_POSTPROC_2V_NEG_H is defined in fwd_txfm_msa.h;
 * presumably it applies the rounding/scale step that distinguishes this
 * _rd path from the non-rd row pass - confirm against that header.
 */
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
                                   int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
  v8i16 vec4, vec5;

  /* Middle quad: rotate by (cospi_16_64, cospi_16_64), then post-process. */
  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  FDCT_POSTPROC_2V_NEG_H(in20, in21);
  FDCT_POSTPROC_2V_NEG_H(in26, in27);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  FDCT_POSTPROC_2V_NEG_H(in18, in19);
  FDCT_POSTPROC_2V_NEG_H(in28, in29);

  /* Park the butterfly differences in interm_ptr for the second pass. */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);

  /* Butterfly sums stay in registers. */
  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  /* Inner quad: same rotate + post-process treatment. */
  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
  FDCT_POSTPROC_2V_NEG_H(in22, in23);
  FDCT_POSTPROC_2V_NEG_H(in24, in25);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  FDCT_POSTPROC_2V_NEG_H(in16, in17);
  FDCT_POSTPROC_2V_NEG_H(in30, in31);

  /* Park the outer-quad differences as well. */
  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  /* Sum path: emits output vectors at offsets 0..24 and 96..120. */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  /* Negated first operand folds a subtraction into the rotation. */
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* Difference path: reload the parked terms and emit offsets 32..88. */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  in16 = in28 + in29;
  in19 = in31 + in30;
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
901
/* 1-D 32-point forward DCT (rounding variant) over one strip of 8 rows.
 * Loads and butterflies the 32x8 strip from tmp_buf_big into tmp_buf,
 * processes the even half in tmp_buf[0..127] and the odd half in
 * tmp_buf[128..255] (reusing tmp_buf_big as the odd pass's scratch area),
 * then transposes the combined result into output. */
static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
                               int16_t *output) {
  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
  fdct8x32_1d_row_transpose_store(tmp_buf, output);
}
909
vpx_fdct32x32_rd_msa(const int16_t * input,int16_t * out,int32_t src_stride)910 void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
911 int32_t src_stride) {
912 int32_t i;
913 DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
914 DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
915
916 /* column transform */
917 for (i = 0; i < 4; ++i) {
918 fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
919 &tmp_buf_big[0] + (8 * i));
920 }
921
922 /* row transform */
923 for (i = 0; i < 4; ++i) {
924 fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
925 out + (8 * i * 32));
926 }
927 }
928
/* DC-only 32x32 forward transform: writes only out[0], which is the sum of
 * all 1024 input samples shifted right by 3. Rows are stride int16
 * elements apart. */
void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  int sum, i;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v4i32 vec_w = { 0 };  /* four 32-bit running partial sums */

  /* Each iteration consumes two full rows (2 x 32 samples). */
  for (i = 0; i < 16; ++i) {
    LD_SH4(input, 8, in0, in1, in2, in3);  /* one row: 4 vectors of 8 */
    input += stride;
    LD_SH4(input, 8, in4, in5, in6, in7);  /* next row */
    input += stride;
    /* Pairwise 16-bit reduction: 8 vectors down to 2 (in0 and in4). */
    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
    ADD2(in0, in2, in4, in6, in0, in4);
    /* Horizontally add adjacent 16-bit pairs into 32-bit lanes and
     * accumulate, keeping the running total in 32-bit precision. */
    vec_w += __msa_hadd_s_w(in0, in0);
    vec_w += __msa_hadd_s_w(in4, in4);
  }

  sum = HADD_SW_S32(vec_w);
  out[0] = (int16_t)(sum >> 3);
}
948