/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_
12 #define VPX_DSP_MIPS_INV_TXFM_MSA_H_
13 
14 #include "vpx_dsp/mips/macros_msa.h"
15 #include "vpx_dsp/mips/txfm_macros_msa.h"
16 #include "vpx_dsp/txfm_common.h"
17 
/* VP9_ADST8 - eight-input / eight-output transform step on v8i16 vectors
 * (by its name and this header's purpose, presumably the 8-point inverse
 * ADST kernel - confirm against the callers).
 *
 *   in0..in7    v8i16 lvalues holding the inputs.  All eight are also used
 *               as scratch: the DOT_ADD_SUB_SRARI_PCK and BUTTERFLY_4 steps
 *               store intermediate values back into them, so the caller's
 *               input values are destroyed.
 *   out0..out7  v8i16 lvalues receiving the results.  out0/out7 and
 *               out1/out6 are assigned while some inputs (in2, in5, ...) are
 *               still waiting to be read, so an output must not overlap a
 *               different, still-live input.
 *
 * The macro expands to a brace-enclosed block (not do { } while (0)), so use
 * it as a stand-alone statement.  It declares locals with the `_m` suffix
 * (cnst*_m, vec*_m, s*_m, coeff*_m); arguments must not use those names.
 *
 * coeff0_m holds cospi_{2,6,10,14,18,22,26,30}_64 and coeff1_m holds
 * +/-cospi_8_64, +/-cospi_16_64, +/-cospi_24_64 (padded with zeros); the
 * SPLATI_* / ILVEV_* helpers pick lanes of these tables to build the
 * constant vectors handed to the dot-product helpers.  out7, out1, out3 and
 * out5 are negated before the macro finishes.
 */
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,               \
                  out0, out1, out2, out3, out4, out5, out6, out7) {     \
  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                    \
  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                     \
  v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,  \
    cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };               \
  v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
    -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                    \
                                                                        \
  /* constants from coeff0_m lanes 0/7 and 4/3, applied to the */       \
  /* (in0, in7) and (in4, in3) pairs; results overwrite them   */       \
  SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                       \
  cnst2_m = -cnst0_m;                                                   \
  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
  SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                       \
  cnst4_m = -cnst2_m;                                                   \
  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
                                                                        \
  ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                \
  ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                \
  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
                        cnst1_m, cnst2_m, cnst3_m, in7, in0,            \
                        in4, in3);                                      \
                                                                        \
  /* same pattern with coeff0_m lanes 2/5 and 6/1 for the */            \
  /* (in2, in5) and (in6, in1) pairs                      */            \
  SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                       \
  cnst2_m = -cnst0_m;                                                   \
  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
  SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                       \
  cnst4_m = -cnst2_m;                                                   \
  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
                                                                        \
  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
                                                                        \
  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
                        cnst1_m, cnst2_m, cnst3_m, in5, in2,            \
                        in6, in1);                                      \
  BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                \
  out7 = -s0_m;                                                         \
  out0 = s1_m;                                                          \
                                                                        \
  /* second layer: constants now come from coeff1_m */                  \
  SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
               cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
                                                                        \
  ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);    \
  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
  cnst1_m = cnst0_m;                                                    \
                                                                        \
  ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                \
  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
                        cnst2_m, cnst3_m, cnst1_m, out1, out6,          \
                        s0_m, s1_m);                                    \
                                                                        \
  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                       \
  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
                                                                        \
  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
  ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                              \
  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                \
  out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                \
  out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                \
  out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                \
                                                                        \
  /* sign fix-up of the odd-numbered outputs (out7 was negated above) */\
  out1 = -out1;                                                         \
  out3 = -out3;                                                         \
  out5 = -out5;                                                         \
}
84 
/* VP9_SET_COSPI_PAIR - build a v8i16 constant from two scalar halfwords.
 *
 *   c0_h, c1_h  scalar values (callers in this header pass +/-cospi_*_64).
 *
 * Each value is replicated into a full vector with __msa_fill_h() and the
 * two vectors are merged with __msa_ilvev_h(r1_m, r0_m); per the MSA
 * interleave-even semantics the result alternates c0_h (even lanes) and
 * c1_h (odd lanes).
 *
 * Implemented as a GNU statement expression, so it can be used on the
 * right-hand side of an assignment.  Each argument is expanded once.
 */
#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({  \
  v8i16 out0_m, r0_m, r1_m;                \
                                           \
  r0_m = __msa_fill_h(c0_h);               \
  r1_m = __msa_fill_h(c1_h);               \
  out0_m = __msa_ilvev_h(r1_m, r0_m);      \
                                           \
  out0_m;                                  \
})
94 
/* VP9_ADDBLK_ST8x4_UB - add four v8i16 rows to the pixels at dst and store
 * the clipped result back to dst.
 *
 *   dst         destination pointer; converted once to uint8_t *.
 *   dst_stride  row stride handed to the load and store helpers.  It is
 *               expanded twice, so it should be free of side effects.
 *   in0..in3    v8i16 values added to the four loaded rows (read only).
 *
 * Steps: load four rows (LD_UB4), interleave them with a zero vector into
 * halfword vectors (ILVR_B4_SH - presumably a zero-extension of the low
 * eight bytes of each row), add in0..in3, clip with CLIP_SH4_0_255, pack to
 * bytes and write back with ST8x4_UB.
 *
 * Expands to a brace-enclosed block; use it as a stand-alone statement.
 */
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) {  \
  uint8_t *dst_m = (uint8_t *) (dst);                               \
  v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                             \
  v16i8 tmp0_m, tmp1_m;                                             \
  v16i8 zero_m = { 0 };                                             \
  v8i16 res0_m, res1_m, res2_m, res3_m;                             \
                                                                    \
  LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);        \
  ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,        \
             zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);       \
  ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,          \
       res0_m, res1_m, res2_m, res3_m);                             \
  CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                   \
  PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);      \
  ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                      \
}
111 
/* VP9_IDCT4x4 - four-input / four-output transform step on v8i16 vectors
 * (by name, the 4-point inverse DCT used for 4x4 blocks).
 *
 *   in0..in3    v8i16 inputs; only read, and only through __msa_ilvr_h(),
 *               i.e. the right-half elements of each vector are consumed.
 *   out0..out3  v8i16 lvalues written by the final BUTTERFLY_4.
 *
 * (in0, in2) is multiplied against the constant pairs
 * (cospi_16_64, cospi_16_64) and (cospi_16_64, -cospi_16_64); (in1, in3)
 * against (cospi_24_64, -cospi_8_64) and (cospi_8_64, cospi_24_64).  The
 * four 32-bit products are rounded/shifted by DCT_CONST_BITS, packed back to
 * halfwords and combined with BUTTERFLY_4.
 *
 * Expands to a brace-enclosed block; use it as a stand-alone statement.
 */
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) {   \
  v8i16 c0_m, c1_m, c2_m, c3_m;                                     \
  v8i16 step0_m, step1_m;                                           \
  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
                                                                    \
  c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
  c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
  step0_m = __msa_ilvr_h(in2, in0);                                 \
  DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);        \
                                                                    \
  c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
  c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
  step1_m = __msa_ilvr_h(in3, in1);                                 \
  DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);        \
  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);      \
                                                                    \
  PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);      \
  SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                  \
  BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m,                         \
              (v8i16)tmp2_m, (v8i16)tmp3_m,                         \
              out0, out1, out2, out3);                              \
}
134 
/* VP9_IADST4x4 - four-input / four-output transform step on v8i16 vectors
 * (by name, the 4-point inverse ADST used for 4x4 blocks).
 *
 *   in0..in3    v8i16 inputs; only read, always through right-half
 *               interleaves (ILVR_H2_SH / __msa_ilvr_h).
 *   out0..out3  v8i16 lvalues receiving the packed results.
 *
 * mask_m holds sinpi_{1,2,3,4}_9 in lanes 0-3 and their negations in lanes
 * 4-7.  Four 32-bit accumulators (int0_m..int3_m) are formed as sums of
 * halfword dot products of interleaved inputs with constant pairs taken from
 * mask_m, rounded/shifted by DCT_CONST_BITS, and then each accumulator is
 * packed with itself into one output vector.
 *
 * tmp2_m computed for int0_m is deliberately reused as the first term of
 * int3_m before it is overwritten, so the statement order matters.
 *
 * Expands to a brace-enclosed block; use it as a stand-alone statement.
 */
#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v8i16 res0_m, res1_m, c0_m, c1_m;                                 \
  v8i16 k1_m, k2_m, k3_m, k4_m;                                     \
  v8i16 zero_m = { 0 };                                             \
  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
  v4i32 int0_m, int1_m, int2_m, int3_m;                             \
  v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
    sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                  \
    -sinpi_4_9 };                                                   \
                                                                    \
  SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);         \
  ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                  \
  ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);          \
  int0_m = tmp2_m + tmp1_m;                                         \
                                                                    \
  SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                           \
  ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                  \
  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
  int1_m = tmp0_m + tmp1_m;                                         \
                                                                    \
  c0_m = __msa_splati_h(mask_m, 6);                                 \
  ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                 \
  /* NOTE(review): same interleave as above; res0_m/res1_m do not */\
  /* appear to have been modified in between - confirm            */\
  ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
  int2_m = tmp0_m + tmp1_m;                                         \
                                                                    \
  c0_m = __msa_splati_h(mask_m, 6);                                 \
  c0_m = __msa_ilvev_h(c0_m, k1_m);                                 \
                                                                    \
  res0_m = __msa_ilvr_h((in1), (in3));                              \
  tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                            \
  /* tmp2_m here is still the first dot product from int0_m */      \
  int3_m = tmp2_m + tmp0_m;                                         \
                                                                    \
  res0_m = __msa_ilvr_h((in2), (in3));                              \
  c1_m = __msa_ilvev_h(k4_m, k3_m);                                 \
                                                                    \
  tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                            \
  res1_m = __msa_ilvr_h((in0), (in2));                              \
  c1_m = __msa_ilvev_h(k1_m, zero_m);                               \
                                                                    \
  tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                            \
  int3_m += tmp2_m;                                                 \
  int3_m += tmp3_m;                                                 \
                                                                    \
  SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS);      \
  PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);          \
  PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);          \
}
184 
/* VP9_SET_CONST_PAIR - build a v8i16 constant from two lanes of a table.
 *
 *   mask_h          v8i16 table of constants.
 *   idx1_h, idx2_h  lane indices handed to SPLATI_H2_SH (presumably they
 *                   must be compile-time constants, as for the splat
 *                   intrinsics - confirm).
 *
 * SPLATI_H2_SH places the two selected lanes in c0_m and c1_m (presumably
 * broadcast across the vector); __msa_ilvev_h(c1_m, c0_m) then merges them
 * so that even lanes come from lane idx1_h and odd lanes from lane idx2_h.
 *
 * Implemented as a GNU statement expression.  It declares locals named c0_m
 * and c1_m, so mask_h must not be a variable with either of those names.
 */
#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({  \
  v8i16 c0_m, c1_m;                                    \
                                                       \
  SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m);    \
  c0_m = __msa_ilvev_h(c1_m, c0_m);                    \
                                                       \
  c0_m;                                                \
})
193 
/* multiply and add macro
 *
 * VP9_MADD - dot-product two interleaved input pairs with four constant
 * vectors and pack the rounded results.
 *
 *   inp0, inp1  first input pair  (v8i16); used for out0 (with cst0) and
 *               out1 (with cst1).
 *   inp2, inp3  second input pair (v8i16); used for out2 (with cst2) and
 *               out3 (with cst3).
 *   cst0..cst3  v8i16 constant vectors (callers in this header build them
 *               with VP9_SET_CONST_PAIR / VP9_SET_COSPI_PAIR).
 *   out0..out3  v8i16 lvalues receiving the results.
 *
 * Both input pairs are interleaved into locals before any output is
 * written, so the outputs may be the same variables as the inputs (this
 * header uses the macro that way).  The 32-bit products are rounded/shifted
 * by DCT_CONST_BITS before being packed to halfwords.
 *
 * Expands to a brace-enclosed block; use it as a stand-alone statement.
 */
#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,        \
                 out0, out1, out2, out3) {                              \
  v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
                                                                        \
  ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                        \
  ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                        \
  DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m,               \
              cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);          \
  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);              \
  DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m,               \
              cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);          \
  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3);              \
}
211 
/* idct 8x8 macro
 *
 * VP9_IDCT8x8_1D - one 1-D pass of the 8-point inverse DCT on eight v8i16
 * vectors.
 *
 *   in0..in7    v8i16 lvalues holding the inputs.  All eight are overwritten
 *               (the two VP9_MADD calls store their results back into them),
 *               so the caller's input values are destroyed.
 *   out0..out7  v8i16 lvalues written by the final BUTTERFLY_8.
 *
 * Flow: the odd-indexed inputs (in1, in7, in3, in5) go through VP9_MADD with
 * constants picked from mask_m, then a difference / dot-product stage that
 * yields tp4_m..tp7_m.  The even-indexed inputs (in0, in4, in2, in6) go
 * through VP9_MADD and BUTTERFLY_4 to give tp0_m..tp3_m.  BUTTERFLY_8
 * combines the two halves into the outputs.
 *
 * k0_m and k1_m are re-assigned midway and then reused by the second
 * VP9_MADD, so the statement order matters.
 *
 * Expands to a brace-enclosed block; use it as a stand-alone statement.
 */
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,               \
                       out0, out1, out2, out3, out4, out5, out6, out7) {     \
  v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;              \
  v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;              \
  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
  v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
    cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };                  \
                                                                             \
  /* odd part: in1, in7, in3, in5 */                                         \
  k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
  k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
  k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
  k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
  VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
  SUB2(in1, in3, in7, in5, res0_m, res1_m);                                  \
  k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
  k1_m = __msa_splati_h(mask_m, 4);                                          \
                                                                             \
  ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                               \
  DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,        \
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);                               \
  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);               \
  tp4_m = in1 + in3;                                                         \
  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                 \
  tp7_m = in7 + in5;                                                         \
  /* even part: in0, in4, in2, in6 (k1_m, k0_m carried over from above) */   \
  k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
  k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
  VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
           in0, in4, in2, in6);                                              \
  BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);               \
  BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,        \
              out0, out1, out2, out3, out4, out5, out6, out7);               \
}
245 
/* VP9_IADST8x8_1D - eight-input / eight-output transform pass on v8i16
 * vectors (by name, one 1-D pass of the 8-point inverse ADST).
 *
 *   in0..in7    v8i16 lvalues holding the inputs.  in1, in2, in3, in4, in5
 *               and in7 are reused as scratch and overwritten; in0 and in6
 *               are only read.
 *   out0..out7  v8i16 lvalues receiving the results.  out0, out6, out4 and
 *               out2 are written directly by the butterfly / pack steps;
 *               out1, out3, out5 and out7 are the negations of the scratch
 *               values left in in1, in3, in5 and in7.
 *
 * Constant pairs are taken from three lane tables (mask1_m, mask2_m,
 * mask3_m) with VP9_SET_CONST_PAIR.  Intermediate sums and differences are
 * kept in 32-bit vectors and rounded/shifted by DCT_CONST_BITS before being
 * packed to halfwords.
 *
 * Besides the `_m`-suffixed locals this macro declares in_s0 and in_s1
 * (no suffix); arguments must not use any of those names.
 *
 * Expands to a brace-enclosed block; use it as a stand-alone statement.
 */
#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,            \
                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                    \
  v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                \
  v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;          \
  v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64,                  \
    cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };    \
  v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64,                \
    cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 };      \
  v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64,                 \
    -cospi_16_64, 0, 0, 0, 0 };                                            \
                                                                           \
  /* (in0, in1) and (in4, in5): sums -> res0/res1, differences -> t0/t1 */ \
  k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                \
  k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                \
  ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                     \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r0_m, r1_m, r2_m, r3_m);                                     \
  k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                \
  k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                \
  ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                     \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r4_m, r5_m, r6_m, r7_m);                                     \
  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                     \
  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                         \
  /* (in2, in3) and (in6, in7): sums -> res2/res3, differences -> r2/r3 */ \
  k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                \
  k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                \
  ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                     \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r0_m, r1_m, r2_m, r3_m);                                     \
  k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                \
  k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                \
  ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                     \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r4_m, r5_m, r6_m, r7_m);                                     \
  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                     \
  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                         \
  ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                     \
  /* from here on in7, in4, in3 (and later in1, in2, in5) are scratch */   \
  BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);        \
  k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                \
  k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                \
  ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                   \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r0_m, r1_m, r2_m, r3_m);                                     \
  k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                \
  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
              r4_m, r5_m, r6_m, r7_m);                                     \
  ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                          \
  SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                           \
  /* final stage uses the +/-cospi_16_64 lanes of mask3_m */               \
  k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                \
  k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                \
  ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                     \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              m0_m, m1_m, m2_m, m3_m);                                     \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                          \
  ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                       \
  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
              m0_m, m1_m, m2_m, m3_m);                                     \
  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                          \
                                                                           \
  /* odd-numbered outputs are the negated scratch values */                \
  out1 = -in1;                                                             \
  out3 = -in3;                                                             \
  out5 = -in5;                                                             \
  out7 = -in7;                                                             \
}
330 
/* VP9_IADST8x16_1D - sixteen-input / sixteen-output transform pass on v8i16
 * vectors (by name, one 1-D pass of the 16-point inverse ADST over vectors
 * of eight lanes).
 *
 *   r0..r15      v8i16 inputs; they appear only as leading arguments of
 *                MADD_BF in stage 1 (presumably read-only there - confirm
 *                against MADD_BF).
 *   out0..out15  v8i16 lvalues receiving the results.  Several of them
 *                (out6, out7, out10, out11, out14, out15) are written in an
 *                earlier stage and then read and rewritten in stage 4, so
 *                the outputs must be distinct variables.
 *
 * The work is organised in four stages, marked below.  Stages 1-3 use
 * MADD_BF with constant pairs built by VP9_SET_COSPI_PAIR plus BUTTERFLY_4 /
 * BUTTERFLY_8; stage 4 applies MADD_SHORT with +/-cospi_16_64 pairs.
 * h0_m/h2_m/h4_m/h6_m are consumed by BUTTERFLY_4 in stage 2 and then
 * overwritten by BUTTERFLY_8, so the statement order matters.
 *
 * Expands to a brace-enclosed block; use it as a stand-alone statement.
 */
#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8,        \
                         r9, r10, r11, r12, r13, r14, r15,          \
                         out0, out1, out2, out3, out4, out5,        \
                         out6, out7, out8, out9, out10, out11,      \
                         out12, out13, out14, out15) {              \
  v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;             \
  v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;       \
  v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;             \
  v8i16 h8_m, h9_m, h10_m, h11_m;                                   \
  v8i16 k0_m, k1_m, k2_m, k3_m;                                     \
                                                                    \
  /* stage 1 */                                                     \
  k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);               \
  k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);              \
  k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);              \
  k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);             \
  MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m,                  \
          g0_m, g1_m, g2_m, g3_m);                                  \
  k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);               \
  k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);              \
  k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);              \
  k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);             \
  MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m,                 \
          g4_m, g5_m, g6_m, g7_m);                                  \
  k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);               \
  k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);              \
  k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);               \
  k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);              \
  MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m,                 \
          g8_m, g9_m, g10_m, g11_m);                                \
  k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);              \
  k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);             \
  k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);               \
  k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);              \
  MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m,                  \
          g12_m, g13_m, g14_m, g15_m);                              \
                                                                    \
  /* stage 2 */                                                     \
  k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);               \
  k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);              \
  k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);              \
  MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m,          \
          h0_m, h1_m, h2_m, h3_m);                                  \
  k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);              \
  k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);             \
  k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);             \
  MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m,         \
          h4_m, h5_m, h6_m, h7_m);                                  \
  BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);    \
  /* h6_m, h4_m, h2_m, h0_m are overwritten here */                 \
  BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m,    \
              h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);    \
                                                                    \
  /* stage 3 */                                                     \
  BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);  \
  k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
  k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
  k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);              \
  MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m,           \
          out4, out6, out5, out7);                                  \
  MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m,           \
          out12, out14, out13, out15);                              \
                                                                    \
  /* stage 4 */                                                     \
  k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
  k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);            \
  k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
  k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);             \
  MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);                 \
  MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);                   \
  MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);               \
  MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);               \
}
403 
/* 16-point 1-D transform entry points shared between MSA source files.
 * They are only declared here; the definitions live elsewhere.
 *
 * By their names, the *_columns_addblk_* variants presumably transform
 * columns of `input` and add the result to the pixels at `dst` (rows
 * `dst_stride` apart), while the *_rows_* variants write transformed rows to
 * `output` - confirm against the definitions.  Note that the column
 * variants take a non-const `input`.
 */
void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                      int32_t dst_stride);
void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                       int32_t dst_stride);
void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
410 #endif  // VPX_DSP_MIPS_INV_TXFM_MSA_H_
411