/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "vp9/common/vp9_enums.h"
#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
#include "vpx_dsp/mips/fwd_txfm_msa.h"

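/* First pass of the 16-point forward ADST over 8 columns (stages 1 and 2):
 * loads all 16 rows at the given stride, pre-scales the input by 2 bits,
 * and stores the butterfly intermediates to int_buf for the second step. */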
static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
                                   const int32_t *const0, int16_t *int_buf) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
  v4i32 k0, k1, k2, k3;

  /* load input data */
  r0 = LD_SH(input);
  r15 = LD_SH(input + 15 * stride);
  r7 = LD_SH(input + 7 * stride);
  r8 = LD_SH(input + 8 * stride);
  SLLI_4V(r0, r15, r7, r8, 2);

  /* stage 1 */
  LD_SW2(const0, 4, k0, k1);
  LD_SW2(const0 + 8, 4, k2, k3);
  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);

  r3 = LD_SH(input + 3 * stride);
  r4 = LD_SH(input + 4 * stride);
  r11 = LD_SH(input + 11 * stride);
  r12 = LD_SH(input + 12 * stride);
  SLLI_4V(r3, r4, r11, r12, 2);

  LD_SW2(const0 + 4 * 4, 4, k0, k1);
  LD_SW2(const0 + 4 * 6, 4, k2, k3);
  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);

  /* stage 2 */
  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
  ST_SH2(tp0, tp2, int_buf, 8);
  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);

  LD_SW2(const0 + 4 * 8, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 10);
  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);

  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
  ST_SH2(h3, h2, int_buf + 12 * 8, 8);

  r9 = LD_SH(input + 9 * stride);
  r6 = LD_SH(input + 6 * stride);
  r1 = LD_SH(input + stride);
  r14 = LD_SH(input + 14 * stride);
  SLLI_4V(r9, r6, r1, r14, 2);

  LD_SW2(const0 + 4 * 11, 4, k0, k1);
  LD_SW2(const0 + 4 * 13, 4, k2, k3);
  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);

  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);

  r13 = LD_SH(input + 13 * stride);
  r2 = LD_SH(input + 2 * stride);
  r5 = LD_SH(input + 5 * stride);
  r10 = LD_SH(input + 10 * stride);
  SLLI_4V(r13, r2, r5, r10, 2);

  LD_SW2(const0 + 4 * 15, 4, k0, k1);
  LD_SW2(const0 + 4 * 17, 4, k2, k3);
  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);

  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);

  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
}

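/* Second pass of the 16-point forward ADST over columns (stages 3 and 4):
 * consumes the intermediates in int_buf and writes the transformed rows
 * into the 16x16 output block (first half at out, second at out + 128). */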
static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
                                   int16_t *out) {
  int16_t *out_ptr = out + 128;
  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
  v4i32 k0, k1, k2, k3;

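  /* stage 3 */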
  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
  LD_SW2(const0 + 4 * 19, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 21);
  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);

  tp0 = LD_SH(int_buf + 4 * 8);
  tp1 = LD_SH(int_buf + 5 * 8);
  tp3 = LD_SH(int_buf + 10 * 8);
  tp2 = LD_SH(int_buf + 14 * 8);
  LD_SW2(const0 + 4 * 22, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 24);
  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
  out4 = -out4;
  ST_SH(out4, (out + 3 * 16));
  ST_SH(out5, (out_ptr + 4 * 16));

  h1 = LD_SH(int_buf + 9 * 8);
  h3 = LD_SH(int_buf + 12 * 8);
  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
  out13 = -out13;
  ST_SH(out12, (out + 2 * 16));
  ST_SH(out13, (out_ptr + 5 * 16));

  tp0 = LD_SH(int_buf);
  tp1 = LD_SH(int_buf + 8);
  tp2 = LD_SH(int_buf + 2 * 8);
  tp3 = LD_SH(int_buf + 6 * 8);

  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
  out1 = -out1;
  ST_SH(out0, (out));
  ST_SH(out1, (out_ptr + 7 * 16));

  h0 = LD_SH(int_buf + 8 * 8);
  h2 = LD_SH(int_buf + 13 * 8);

  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
  out8 = -out8;
  ST_SH(out8, (out + 16));
  ST_SH(out9, (out_ptr + 6 * 16));

  /* stage 4 */
  LD_SW2(const0 + 4 * 25, 4, k0, k1);
  LD_SW2(const0 + 4 * 27, 4, k2, k3);
  MADD_SHORT(h10, h11, k1, k2, out2, out3);
  ST_SH(out2, (out + 7 * 16));
  ST_SH(out3, (out_ptr));

  MADD_SHORT(out6, out7, k0, k3, out6, out7);
  ST_SH(out6, (out + 4 * 16));
  ST_SH(out7, (out_ptr + 3 * 16));

  MADD_SHORT(out10, out11, k0, k3, out10, out11);
  ST_SH(out10, (out + 6 * 16));
  ST_SH(out11, (out_ptr + 16));

  MADD_SHORT(out14, out15, k1, k2, out14, out15);
  ST_SH(out14, (out + 5 * 16));
  ST_SH(out15, (out_ptr + 2 * 16));
}

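/* Transposes the 16x16 intermediate block as four 8x8 tiles and applies
 * FDCT_POSTPROC_2V_NEG_H rounding to every vector before storing the
 * result contiguously (stride 8) to out. */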
static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

  /* load input data */
  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
                     r0, r1, r2, r3, r4, r5, r6, r7);
  FDCT_POSTPROC_2V_NEG_H(r0, r1);
  FDCT_POSTPROC_2V_NEG_H(r2, r3);
  FDCT_POSTPROC_2V_NEG_H(r4, r5);
  FDCT_POSTPROC_2V_NEG_H(r6, r7);
  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
  out += 64;

  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
                     r8, r9, r10, r11, r12, r13, r14, r15);
  FDCT_POSTPROC_2V_NEG_H(r8, r9);
  FDCT_POSTPROC_2V_NEG_H(r10, r11);
  FDCT_POSTPROC_2V_NEG_H(r12, r13);
  FDCT_POSTPROC_2V_NEG_H(r14, r15);
  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
  out += 64;

  /* load input data */
  input += 128;
  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
                     r0, r1, r2, r3, r4, r5, r6, r7);
  FDCT_POSTPROC_2V_NEG_H(r0, r1);
  FDCT_POSTPROC_2V_NEG_H(r2, r3);
  FDCT_POSTPROC_2V_NEG_H(r4, r5);
  FDCT_POSTPROC_2V_NEG_H(r6, r7);
  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
  out += 64;

  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
                     r8, r9, r10, r11, r12, r13, r14, r15);
  FDCT_POSTPROC_2V_NEG_H(r8, r9);
  FDCT_POSTPROC_2V_NEG_H(r10, r11);
  FDCT_POSTPROC_2V_NEG_H(r12, r13);
  FDCT_POSTPROC_2V_NEG_H(r14, r15);
  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
}

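/* Row-transform counterpart of fadst16_cols_step1_msa (stages 1 and 2):
 * the input has already been transposed and post-processed, so rows are
 * read at a fixed stride of 8 and no pre-scaling shift is applied. */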
static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
                                   int16_t *int_buf) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
  v4i32 k0, k1, k2, k3;

  /* load input data */
  r0 = LD_SH(input);
  r7 = LD_SH(input + 7 * 8);
  r8 = LD_SH(input + 8 * 8);
  r15 = LD_SH(input + 15 * 8);

  /* stage 1 */
  LD_SW2(const0, 4, k0, k1);
  LD_SW2(const0 + 4 * 2, 4, k2, k3);
  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);

  r3 = LD_SH(input + 3 * 8);
  r4 = LD_SH(input + 4 * 8);
  r11 = LD_SH(input + 11 * 8);
  r12 = LD_SH(input + 12 * 8);

  LD_SW2(const0 + 4 * 4, 4, k0, k1);
  LD_SW2(const0 + 4 * 6, 4, k2, k3);
  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);

  /* stage 2 */
  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
  ST_SH2(tp0, tp1, int_buf, 4 * 8);
  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);

  LD_SW2(const0 + 4 * 8, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 10);
  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);

  r1 = LD_SH(input + 8);
  r6 = LD_SH(input + 6 * 8);
  r9 = LD_SH(input + 9 * 8);
  r14 = LD_SH(input + 14 * 8);

  LD_SW2(const0 + 4 * 11, 4, k0, k1);
  LD_SW2(const0 + 4 * 13, 4, k2, k3);
  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);

  r2 = LD_SH(input + 2 * 8);
  r5 = LD_SH(input + 5 * 8);
  r10 = LD_SH(input + 10 * 8);
  r13 = LD_SH(input + 13 * 8);

  LD_SW2(const0 + 4 * 15, 4, k0, k1);
  LD_SW2(const0 + 4 * 17, 4, k2, k3);
  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
}

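/* Row-transform counterpart of fadst16_cols_step2_msa (stages 3 and 4):
 * the two 8-column halves of each row are adjacent in memory, so the
 * second half starts at out + 8 rather than out + 128. */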
static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
                                   int16_t *out) {
  int16_t *out_ptr = out + 8;
  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
  v4i32 k0, k1, k2, k3;

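  /* stage 3 */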
  g13 = LD_SH(int_buf + 3 * 8);
  g15 = LD_SH(int_buf + 7 * 8);
  g5 = LD_SH(int_buf + 11 * 8);
  g7 = LD_SH(int_buf + 15 * 8);

  LD_SW2(const0 + 4 * 19, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 21);
  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);

  tp0 = LD_SH(int_buf + 4 * 8);
  tp1 = LD_SH(int_buf + 5 * 8);
  tp3 = LD_SH(int_buf + 10 * 8);
  tp2 = LD_SH(int_buf + 14 * 8);

  LD_SW2(const0 + 4 * 22, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 24);
  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
  out4 = -out4;
  ST_SH(out4, (out + 3 * 16));
  ST_SH(out5, (out_ptr + 4 * 16));

  h1 = LD_SH(int_buf + 9 * 8);
  h3 = LD_SH(int_buf + 12 * 8);
  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
  out13 = -out13;
  ST_SH(out12, (out + 2 * 16));
  ST_SH(out13, (out_ptr + 5 * 16));

  tp0 = LD_SH(int_buf);
  tp1 = LD_SH(int_buf + 8);
  tp2 = LD_SH(int_buf + 2 * 8);
  tp3 = LD_SH(int_buf + 6 * 8);

  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
  out1 = -out1;
  ST_SH(out0, (out));
  ST_SH(out1, (out_ptr + 7 * 16));

  h0 = LD_SH(int_buf + 8 * 8);
  h2 = LD_SH(int_buf + 13 * 8);
  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
  out8 = -out8;
  ST_SH(out8, (out + 16));
  ST_SH(out9, (out_ptr + 6 * 16));

  /* stage 4 */
  LD_SW2(const0 + 4 * 25, 4, k0, k1);
  LD_SW2(const0 + 4 * 27, 4, k2, k3);
  MADD_SHORT(h10, h11, k1, k2, out2, out3);
  ST_SH(out2, (out + 7 * 16));
  ST_SH(out3, (out_ptr));

  MADD_SHORT(out6, out7, k0, k3, out6, out7);
  ST_SH(out6, (out + 4 * 16));
  ST_SH(out7, (out_ptr + 3 * 16));

  MADD_SHORT(out10, out11, k0, k3, out10, out11);
  ST_SH(out10, (out + 6 * 16));
  ST_SH(out11, (out_ptr + 16));

  MADD_SHORT(out14, out15, k1, k2, out14, out15);
  ST_SH(out14, (out + 5 * 16));
  ST_SH(out15, (out_ptr + 2 * 16));
}

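/* Plain 16x16 transpose (no rounding), done as four 8x8 tiles; the
 * interleaved load/store order follows the half-row layout written by
 * fadst16_rows_step2_msa. */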
static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

  /* load input data */
  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
          l4, l12, l5, l13, l6, l14, l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
                     r0, r1, r2, r3, r4, r5, r6, r7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
                     r8, r9, r10, r11, r12, r13, r14, r15);
  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
  out += 16 * 8;

  /* load input data */
  input += 128;
  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
          l4, l12, l5, l13, l6, l14, l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
                     r0, r1, r2, r3, r4, r5, r6, r7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
                     r8, r9, r10, r11, r12, r13, r14, r15);
  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
}

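/* Row DCT used after an ADST column pass: transposes a 16x8 slice, applies
 * the rounding post-processing, then runs the even and odd halves of the
 * forward DCT and transposes the result back into output. */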
static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
  int16_t *temp = intermediate;
  int16_t *out = output;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
  v8i16 in12, in13, in14, in15;

  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  temp = intermediate + 8;
  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT_POSTPROC_2V_NEG_H(in0, in1);
  FDCT_POSTPROC_2V_NEG_H(in2, in3);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in6, in7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  temp = intermediate;
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  temp = intermediate;
  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
               in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
  out = output + 8;
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
}

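/* 16x16 forward hybrid transform: tx_type selects DCT or ADST for the
 * column and row passes independently. */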
void vp9_fht16x16_msa(const int16_t *input, int16_t *output,
                      int32_t stride, int32_t tx_type) {
  DECLARE_ALIGNED(32, int16_t, tmp[256]);
  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
  int32_t i;
  int16_t *ptmpbuf = &tmp_buf[0];
  int16_t *trans = &trans_buf[0];
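  /* ADST coefficient table: each 32-bit entry packs two 16-bit cosine
   * constants for the paired multiply-adds in MADD_BF/MADD_SHORT and is
   * replicated across all four lanes of a v4i32 vector. */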
  const int32_t const_arr[29 * 4] = {
    52707308, 52707308, 52707308, 52707308,
    -1072430300, -1072430300, -1072430300, -1072430300,
    795618043, 795618043, 795618043, 795618043,
    -721080468, -721080468, -721080468, -721080468,
    459094491, 459094491, 459094491, 459094491,
    -970646691, -970646691, -970646691, -970646691,
    1010963856, 1010963856, 1010963856, 1010963856,
    -361743294, -361743294, -361743294, -361743294,
    209469125, 209469125, 209469125, 209469125,
    -1053094788, -1053094788, -1053094788, -1053094788,
    1053160324, 1053160324, 1053160324, 1053160324,
    639644520, 639644520, 639644520, 639644520,
    -862444000, -862444000, -862444000, -862444000,
    1062144356, 1062144356, 1062144356, 1062144356,
    -157532337, -157532337, -157532337, -157532337,
    260914709, 260914709, 260914709, 260914709,
    -1041559667, -1041559667, -1041559667, -1041559667,
    920985831, 920985831, 920985831, 920985831,
    -551995675, -551995675, -551995675, -551995675,
    596522295, 596522295, 596522295, 596522295,
    892853362, 892853362, 892853362, 892853362,
    -892787826, -892787826, -892787826, -892787826,
    410925857, 410925857, 410925857, 410925857,
    -992012162, -992012162, -992012162, -992012162,
    992077698, 992077698, 992077698, 992077698,
    759246145, 759246145, 759246145, 759246145,
    -759180609, -759180609, -759180609, -759180609,
    -759222975, -759222975, -759222975, -759222975,
    759288511, 759288511, 759288511, 759288511 };

  switch (tx_type) {
    case DCT_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case ADST_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case DCT_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    case ADST_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    default:
      assert(0);
      break;
  }
}