1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/arm/highbd_idct_neon.h"
15 #include "vpx_dsp/arm/idct_neon.h"
16 #include "vpx_dsp/inv_txfm.h"
17 
dct_const_round_shift_high_4(const int64x2x2_t in)18 static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) {
19   int32x2x2_t t32;
20 
21   t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS);
22   t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS);
23   return vcombine_s32(t32.val[0], t32.val[1]);
24 }
25 
dct_const_round_shift_high_4_dual(const int64x2x2_t * const in,int32x4_t * const d0,int32x4_t * const d1)26 static INLINE void dct_const_round_shift_high_4_dual(
27     const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) {
28   *d0 = dct_const_round_shift_high_4(in[0]);
29   *d1 = dct_const_round_shift_high_4(in[1]);
30 }
31 
32 static INLINE int32x4x2_t
dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t * const in)33 dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) {
34   int32x4x2_t out;
35   out.val[0] = dct_const_round_shift_high_4(in[0]);
36   out.val[1] = dct_const_round_shift_high_4(in[1]);
37   return out;
38 }
39 
dct_const_round_shift_high_4x2x2(const int64x2x2_t * const in,int32x4x2_t * const d0,int32x4x2_t * const d1)40 static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in,
41                                                     int32x4x2_t *const d0,
42                                                     int32x4x2_t *const d1) {
43   *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0);
44   *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2);
45 }
46 
highbd_idct_cospi_2_30(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_2_30_10_22,int32x4x2_t * const d0,int32x4x2_t * const d1)47 static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
48                                           const int32x4x2_t s1,
49                                           const int32x4_t cospi_2_30_10_22,
50                                           int32x4x2_t *const d0,
51                                           int32x4x2_t *const d1) {
52   int64x2x2_t t[4];
53 
54   t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
55                                vget_low_s32(cospi_2_30_10_22), 1);
56   t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
57                                vget_low_s32(cospi_2_30_10_22), 1);
58   t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
59                                vget_low_s32(cospi_2_30_10_22), 1);
60   t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
61                                vget_low_s32(cospi_2_30_10_22), 1);
62   t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
63                                vget_low_s32(cospi_2_30_10_22), 1);
64   t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
65                                vget_low_s32(cospi_2_30_10_22), 1);
66   t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
67                                vget_low_s32(cospi_2_30_10_22), 1);
68   t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
69                                vget_low_s32(cospi_2_30_10_22), 1);
70   t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
71                                vget_low_s32(cospi_2_30_10_22), 0);
72   t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
73                                vget_low_s32(cospi_2_30_10_22), 0);
74   t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
75                                vget_low_s32(cospi_2_30_10_22), 0);
76   t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
77                                vget_low_s32(cospi_2_30_10_22), 0);
78   t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
79                                vget_low_s32(cospi_2_30_10_22), 0);
80   t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
81                                vget_low_s32(cospi_2_30_10_22), 0);
82   t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
83                                vget_low_s32(cospi_2_30_10_22), 0);
84   t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
85                                vget_low_s32(cospi_2_30_10_22), 0);
86   dct_const_round_shift_high_4x2x2(t, d0, d1);
87 }
88 
highbd_idct_cospi_4_28(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_4_12_20N_28,int32x4x2_t * const d0,int32x4x2_t * const d1)89 static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0,
90                                           const int32x4x2_t s1,
91                                           const int32x4_t cospi_4_12_20N_28,
92                                           int32x4x2_t *const d0,
93                                           int32x4x2_t *const d1) {
94   int64x2x2_t t[4];
95 
96   t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
97                                vget_high_s32(cospi_4_12_20N_28), 1);
98   t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
99                                vget_high_s32(cospi_4_12_20N_28), 1);
100   t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
101                                vget_high_s32(cospi_4_12_20N_28), 1);
102   t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
103                                vget_high_s32(cospi_4_12_20N_28), 1);
104   t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
105                                vget_high_s32(cospi_4_12_20N_28), 1);
106   t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
107                                vget_high_s32(cospi_4_12_20N_28), 1);
108   t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
109                                vget_high_s32(cospi_4_12_20N_28), 1);
110   t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
111                                vget_high_s32(cospi_4_12_20N_28), 1);
112   t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
113                                vget_low_s32(cospi_4_12_20N_28), 0);
114   t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
115                                vget_low_s32(cospi_4_12_20N_28), 0);
116   t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
117                                vget_low_s32(cospi_4_12_20N_28), 0);
118   t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
119                                vget_low_s32(cospi_4_12_20N_28), 0);
120   t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
121                                vget_low_s32(cospi_4_12_20N_28), 0);
122   t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
123                                vget_low_s32(cospi_4_12_20N_28), 0);
124   t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
125                                vget_low_s32(cospi_4_12_20N_28), 0);
126   t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
127                                vget_low_s32(cospi_4_12_20N_28), 0);
128   dct_const_round_shift_high_4x2x2(t, d0, d1);
129 }
130 
highbd_idct_cospi_6_26(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_6_26N_14_18N,int32x4x2_t * const d0,int32x4x2_t * const d1)131 static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0,
132                                           const int32x4x2_t s1,
133                                           const int32x4_t cospi_6_26N_14_18N,
134                                           int32x4x2_t *const d0,
135                                           int32x4x2_t *const d1) {
136   int64x2x2_t t[4];
137 
138   t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
139                                vget_low_s32(cospi_6_26N_14_18N), 0);
140   t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
141                                vget_low_s32(cospi_6_26N_14_18N), 0);
142   t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
143                                vget_low_s32(cospi_6_26N_14_18N), 0);
144   t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
145                                vget_low_s32(cospi_6_26N_14_18N), 0);
146   t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
147                                vget_low_s32(cospi_6_26N_14_18N), 0);
148   t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
149                                vget_low_s32(cospi_6_26N_14_18N), 0);
150   t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
151                                vget_low_s32(cospi_6_26N_14_18N), 0);
152   t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
153                                vget_low_s32(cospi_6_26N_14_18N), 0);
154   t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
155                                vget_low_s32(cospi_6_26N_14_18N), 1);
156   t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
157                                vget_low_s32(cospi_6_26N_14_18N), 1);
158   t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
159                                vget_low_s32(cospi_6_26N_14_18N), 1);
160   t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
161                                vget_low_s32(cospi_6_26N_14_18N), 1);
162   t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
163                                vget_low_s32(cospi_6_26N_14_18N), 1);
164   t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
165                                vget_low_s32(cospi_6_26N_14_18N), 1);
166   t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
167                                vget_low_s32(cospi_6_26N_14_18N), 1);
168   t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
169                                vget_low_s32(cospi_6_26N_14_18N), 1);
170   dct_const_round_shift_high_4x2x2(t, d0, d1);
171 }
172 
highbd_idct_cospi_10_22(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_2_30_10_22,int32x4x2_t * const d0,int32x4x2_t * const d1)173 static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0,
174                                            const int32x4x2_t s1,
175                                            const int32x4_t cospi_2_30_10_22,
176                                            int32x4x2_t *const d0,
177                                            int32x4x2_t *const d1) {
178   int64x2x2_t t[4];
179 
180   t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
181                                vget_high_s32(cospi_2_30_10_22), 1);
182   t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
183                                vget_high_s32(cospi_2_30_10_22), 1);
184   t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
185                                vget_high_s32(cospi_2_30_10_22), 1);
186   t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
187                                vget_high_s32(cospi_2_30_10_22), 1);
188   t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
189                                vget_high_s32(cospi_2_30_10_22), 1);
190   t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
191                                vget_high_s32(cospi_2_30_10_22), 1);
192   t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
193                                vget_high_s32(cospi_2_30_10_22), 1);
194   t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
195                                vget_high_s32(cospi_2_30_10_22), 1);
196   t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
197                                vget_high_s32(cospi_2_30_10_22), 0);
198   t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
199                                vget_high_s32(cospi_2_30_10_22), 0);
200   t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
201                                vget_high_s32(cospi_2_30_10_22), 0);
202   t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
203                                vget_high_s32(cospi_2_30_10_22), 0);
204   t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
205                                vget_high_s32(cospi_2_30_10_22), 0);
206   t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
207                                vget_high_s32(cospi_2_30_10_22), 0);
208   t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
209                                vget_high_s32(cospi_2_30_10_22), 0);
210   t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
211                                vget_high_s32(cospi_2_30_10_22), 0);
212   dct_const_round_shift_high_4x2x2(t, d0, d1);
213 }
214 
highbd_idct_cospi_12_20(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_4_12_20N_28,int32x4x2_t * const d0,int32x4x2_t * const d1)215 static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0,
216                                            const int32x4x2_t s1,
217                                            const int32x4_t cospi_4_12_20N_28,
218                                            int32x4x2_t *const d0,
219                                            int32x4x2_t *const d1) {
220   int64x2x2_t t[4];
221 
222   t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
223                                vget_low_s32(cospi_4_12_20N_28), 1);
224   t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
225                                vget_low_s32(cospi_4_12_20N_28), 1);
226   t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
227                                vget_low_s32(cospi_4_12_20N_28), 1);
228   t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
229                                vget_low_s32(cospi_4_12_20N_28), 1);
230   t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
231                                vget_low_s32(cospi_4_12_20N_28), 1);
232   t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
233                                vget_low_s32(cospi_4_12_20N_28), 1);
234   t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
235                                vget_low_s32(cospi_4_12_20N_28), 1);
236   t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
237                                vget_low_s32(cospi_4_12_20N_28), 1);
238   t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
239                                vget_high_s32(cospi_4_12_20N_28), 0);
240   t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
241                                vget_high_s32(cospi_4_12_20N_28), 0);
242   t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
243                                vget_high_s32(cospi_4_12_20N_28), 0);
244   t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
245                                vget_high_s32(cospi_4_12_20N_28), 0);
246   t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
247                                vget_high_s32(cospi_4_12_20N_28), 0);
248   t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
249                                vget_high_s32(cospi_4_12_20N_28), 0);
250   t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
251                                vget_high_s32(cospi_4_12_20N_28), 0);
252   t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
253                                vget_high_s32(cospi_4_12_20N_28), 0);
254   dct_const_round_shift_high_4x2x2(t, d0, d1);
255 }
256 
highbd_idct_cospi_14_18(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_6_26N_14_18N,int32x4x2_t * const d0,int32x4x2_t * const d1)257 static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0,
258                                            const int32x4x2_t s1,
259                                            const int32x4_t cospi_6_26N_14_18N,
260                                            int32x4x2_t *const d0,
261                                            int32x4x2_t *const d1) {
262   int64x2x2_t t[4];
263 
264   t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
265                                vget_high_s32(cospi_6_26N_14_18N), 0);
266   t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
267                                vget_high_s32(cospi_6_26N_14_18N), 0);
268   t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
269                                vget_high_s32(cospi_6_26N_14_18N), 0);
270   t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
271                                vget_high_s32(cospi_6_26N_14_18N), 0);
272   t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
273                                vget_high_s32(cospi_6_26N_14_18N), 0);
274   t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
275                                vget_high_s32(cospi_6_26N_14_18N), 0);
276   t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
277                                vget_high_s32(cospi_6_26N_14_18N), 0);
278   t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
279                                vget_high_s32(cospi_6_26N_14_18N), 0);
280   t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
281                                vget_high_s32(cospi_6_26N_14_18N), 1);
282   t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
283                                vget_high_s32(cospi_6_26N_14_18N), 1);
284   t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
285                                vget_high_s32(cospi_6_26N_14_18N), 1);
286   t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
287                                vget_high_s32(cospi_6_26N_14_18N), 1);
288   t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
289                                vget_high_s32(cospi_6_26N_14_18N), 1);
290   t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
291                                vget_high_s32(cospi_6_26N_14_18N), 1);
292   t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
293                                vget_high_s32(cospi_6_26N_14_18N), 1);
294   t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
295                                vget_high_s32(cospi_6_26N_14_18N), 1);
296   dct_const_round_shift_high_4x2x2(t, d0, d1);
297 }
298 
highbd_idct_cospi_8_24_q_kernel(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_0_8_16_24,int64x2x2_t * const t)299 static INLINE void highbd_idct_cospi_8_24_q_kernel(
300     const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24,
301     int64x2x2_t *const t) {
302   t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
303                                vget_high_s32(cospi_0_8_16_24), 1);
304   t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
305                                vget_high_s32(cospi_0_8_16_24), 1);
306   t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
307                                vget_high_s32(cospi_0_8_16_24), 1);
308   t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
309                                vget_high_s32(cospi_0_8_16_24), 1);
310   t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
311                                vget_high_s32(cospi_0_8_16_24), 1);
312   t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
313                                vget_high_s32(cospi_0_8_16_24), 1);
314   t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
315                                vget_high_s32(cospi_0_8_16_24), 1);
316   t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
317                                vget_high_s32(cospi_0_8_16_24), 1);
318   t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
319                                vget_low_s32(cospi_0_8_16_24), 1);
320   t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
321                                vget_low_s32(cospi_0_8_16_24), 1);
322   t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
323                                vget_low_s32(cospi_0_8_16_24), 1);
324   t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
325                                vget_low_s32(cospi_0_8_16_24), 1);
326   t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
327                                vget_low_s32(cospi_0_8_16_24), 1);
328   t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
329                                vget_low_s32(cospi_0_8_16_24), 1);
330   t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
331                                vget_low_s32(cospi_0_8_16_24), 1);
332   t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
333                                vget_low_s32(cospi_0_8_16_24), 1);
334 }
335 
highbd_idct_cospi_8_24_d_kernel(const int32x4_t s0,const int32x4_t s1,const int32x4_t cospi_0_8_16_24,int64x2x2_t * const t)336 static INLINE void highbd_idct_cospi_8_24_d_kernel(
337     const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24,
338     int64x2x2_t *const t) {
339   t[0].val[0] =
340       vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
341   t[0].val[1] =
342       vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
343   t[1].val[0] =
344       vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
345   t[1].val[1] =
346       vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
347   t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1),
348                                vget_low_s32(cospi_0_8_16_24), 1);
349   t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1),
350                                vget_low_s32(cospi_0_8_16_24), 1);
351   t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0),
352                                vget_low_s32(cospi_0_8_16_24), 1);
353   t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0),
354                                vget_low_s32(cospi_0_8_16_24), 1);
355 }
356 
highbd_idct_cospi_8_24_q(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_0_8_16_24,int32x4x2_t * const d0,int32x4x2_t * const d1)357 static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0,
358                                             const int32x4x2_t s1,
359                                             const int32x4_t cospi_0_8_16_24,
360                                             int32x4x2_t *const d0,
361                                             int32x4x2_t *const d1) {
362   int64x2x2_t t[4];
363 
364   highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
365   dct_const_round_shift_high_4x2x2(t, d0, d1);
366 }
367 
highbd_idct_cospi_8_24_d(const int32x4_t s0,const int32x4_t s1,const int32x4_t cospi_0_8_16_24,int32x4_t * const d0,int32x4_t * const d1)368 static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0,
369                                             const int32x4_t s1,
370                                             const int32x4_t cospi_0_8_16_24,
371                                             int32x4_t *const d0,
372                                             int32x4_t *const d1) {
373   int64x2x2_t t[2];
374 
375   highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
376   dct_const_round_shift_high_4_dual(t, d0, d1);
377 }
378 
highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_0_8_16_24,int32x4x2_t * const d0,int32x4x2_t * const d1)379 static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0,
380                                                 const int32x4x2_t s1,
381                                                 const int32x4_t cospi_0_8_16_24,
382                                                 int32x4x2_t *const d0,
383                                                 int32x4x2_t *const d1) {
384   int64x2x2_t t[4];
385 
386   highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
387   t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]);
388   t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]);
389   t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]);
390   t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]);
391   dct_const_round_shift_high_4x2x2(t, d0, d1);
392 }
393 
highbd_idct_cospi_8_24_neg_d(const int32x4_t s0,const int32x4_t s1,const int32x4_t cospi_0_8_16_24,int32x4_t * const d0,int32x4_t * const d1)394 static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0,
395                                                 const int32x4_t s1,
396                                                 const int32x4_t cospi_0_8_16_24,
397                                                 int32x4_t *const d0,
398                                                 int32x4_t *const d1) {
399   int64x2x2_t t[2];
400 
401   highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
402   t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]);
403   t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]);
404   dct_const_round_shift_high_4_dual(t, d0, d1);
405 }
406 
highbd_idct_cospi_16_16_q(const int32x4x2_t s0,const int32x4x2_t s1,const int32x4_t cospi_0_8_16_24,int32x4x2_t * const d0,int32x4x2_t * const d1)407 static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0,
408                                              const int32x4x2_t s1,
409                                              const int32x4_t cospi_0_8_16_24,
410                                              int32x4x2_t *const d0,
411                                              int32x4x2_t *const d1) {
412   int64x2x2_t t[6];
413 
414   t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
415                                vget_high_s32(cospi_0_8_16_24), 0);
416   t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
417                                vget_high_s32(cospi_0_8_16_24), 0);
418   t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
419                                vget_high_s32(cospi_0_8_16_24), 0);
420   t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
421                                vget_high_s32(cospi_0_8_16_24), 0);
422   t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
423                                vget_high_s32(cospi_0_8_16_24), 0);
424   t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
425                                vget_high_s32(cospi_0_8_16_24), 0);
426   t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
427                                vget_high_s32(cospi_0_8_16_24), 0);
428   t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
429                                vget_high_s32(cospi_0_8_16_24), 0);
430   t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
431                                vget_high_s32(cospi_0_8_16_24), 0);
432   t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
433                                vget_high_s32(cospi_0_8_16_24), 0);
434   t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
435                                vget_high_s32(cospi_0_8_16_24), 0);
436   t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
437                                vget_high_s32(cospi_0_8_16_24), 0);
438   dct_const_round_shift_high_4x2x2(t, d0, d1);
439 }
440 
highbd_idct_cospi_16_16_d(const int32x4_t s0,const int32x4_t s1,const int32x4_t cospi_0_8_16_24,int32x4_t * const d0,int32x4_t * const d1)441 static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0,
442                                              const int32x4_t s1,
443                                              const int32x4_t cospi_0_8_16_24,
444                                              int32x4_t *const d0,
445                                              int32x4_t *const d1) {
446   int64x2x2_t t[3];
447 
448   t[2].val[0] =
449       vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
450   t[2].val[1] =
451       vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
452   t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0),
453                                vget_high_s32(cospi_0_8_16_24), 0);
454   t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0),
455                                vget_high_s32(cospi_0_8_16_24), 0);
456   t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0),
457                                vget_high_s32(cospi_0_8_16_24), 0);
458   t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0),
459                                vget_high_s32(cospi_0_8_16_24), 0);
460   dct_const_round_shift_high_4_dual(t, d0, d1);
461 }
462 
highbd_idct16x16_add_stage7_dual(const int32x4x2_t * const step2,int32x4x2_t * const out)463 static INLINE void highbd_idct16x16_add_stage7_dual(
464     const int32x4x2_t *const step2, int32x4x2_t *const out) {
465   out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]);
466   out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]);
467   out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]);
468   out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]);
469   out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]);
470   out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]);
471   out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]);
472   out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]);
473   out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]);
474   out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]);
475   out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]);
476   out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]);
477   out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]);
478   out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]);
479   out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]);
480   out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]);
481   out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]);
482   out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]);
483   out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]);
484   out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]);
485   out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]);
486   out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]);
487   out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]);
488   out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]);
489   out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]);
490   out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]);
491   out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]);
492   out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]);
493   out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]);
494   out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]);
495   out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]);
496   out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]);
497 }
498 
// Stage 7 butterfly for the single-vector (int32x4_t per coefficient) flavour
// of the 16x16 inverse transform.
//
// step2: 16 input vectors.
// out:   16 output vectors; out[i] = step2[i] + step2[15 - i] for i in [0, 7]
//        and out[i] = step2[15 - i] - step2[i] for i in [8, 15].
static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2,
                                               int32x4_t *const out) {
  int i;

  // Sums of mirrored pairs go to the first half of out.
  for (i = 0; i < 8; ++i) {
    out[i] = vaddq_s32(step2[i], step2[15 - i]);
  }

  // Differences of the same pairs go to the second half, written in ascending
  // index order.
  for (i = 8; i < 16; ++i) {
    out[i] = vsubq_s32(step2[15 - i], step2[i]);
  }
}
518 
// One 1-D pass of the full 16x16 inverse transform over a 16x8 slab of
// 32-bit coefficients.
//
// input:  8 consecutive rows of 16 int32_t values (128 values are read).
// output: when non-NULL, the stage-7 result is passed to
//         highbd_idct16x16_store_pass1() together with this pointer.
// dest, stride, bd: when output is NULL, forwarded with the stage-7 result to
//         highbd_idct16x16_add_store().
void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
                                         uint16_t *dest, const int stride,
                                         const int bd) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  int32x4x2_t in[16], step1[16], step2[16], out[16];
  int i, k;

  // Load input (16x8): the left 8 values of row i land in in[i], the right 8
  // values in in[i + 8].
  for (i = 0; i < 8; ++i) {
    const int32_t *const row = input + 16 * i;
    in[i].val[0] = vld1q_s32(row);
    in[i].val[1] = vld1q_s32(row + 4);
    in[i + 8].val[0] = vld1q_s32(row + 8);
    in[i + 8].val[1] = vld1q_s32(row + 12);
  }

  // Transpose each 8x8 half independently.
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);
  transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
                    &in[15]);

  // stage 1: reorder the inputs.
  step1[0] = in[0];
  step1[1] = in[8];
  step1[2] = in[4];
  step1[3] = in[12];
  step1[4] = in[2];
  step1[5] = in[10];
  step1[6] = in[6];
  step1[7] = in[14];
  step1[8] = in[1];
  step1[9] = in[9];
  step1[10] = in[5];
  step1[11] = in[13];
  step1[12] = in[3];
  step1[13] = in[11];
  step1[14] = in[7];
  step1[15] = in[15];

  // stage 2: the lower eight entries pass through unchanged.
  for (i = 0; i < 8; ++i) {
    step2[i] = step1[i];
  }
  highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8],
                         &step2[15]);
  highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
                          &step2[14]);
  highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
                          &step2[13]);
  highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
                         &step2[12]);

  // stage 3
  for (i = 0; i < 4; ++i) {
    step1[i] = step2[i];
  }
  highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4],
                         &step1[7]);
  highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5],
                          &step1[6]);
  // k selects the low/high int32x4_t of each int32x4x2_t pair.
  for (k = 0; k < 2; ++k) {
    step1[8].val[k] = vaddq_s32(step2[8].val[k], step2[9].val[k]);
    step1[9].val[k] = vsubq_s32(step2[8].val[k], step2[9].val[k]);
    step1[10].val[k] = vsubq_s32(step2[11].val[k], step2[10].val[k]);
    step1[11].val[k] = vaddq_s32(step2[11].val[k], step2[10].val[k]);
    step1[12].val[k] = vaddq_s32(step2[12].val[k], step2[13].val[k]);
    step1[13].val[k] = vsubq_s32(step2[12].val[k], step2[13].val[k]);
    step1[14].val[k] = vsubq_s32(step2[15].val[k], step2[14].val[k]);
    step1[15].val[k] = vaddq_s32(step2[15].val[k], step2[14].val[k]);
  }

  // stage 4
  highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1],
                            &step2[0]);
  highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2],
                           &step2[3]);
  for (k = 0; k < 2; ++k) {
    step2[4].val[k] = vaddq_s32(step1[4].val[k], step1[5].val[k]);
    step2[5].val[k] = vsubq_s32(step1[4].val[k], step1[5].val[k]);
    step2[6].val[k] = vsubq_s32(step1[7].val[k], step1[6].val[k]);
    step2[7].val[k] = vaddq_s32(step1[7].val[k], step1[6].val[k]);
  }
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  for (k = 0; k < 2; ++k) {
    step1[0].val[k] = vaddq_s32(step2[0].val[k], step2[3].val[k]);
    step1[1].val[k] = vaddq_s32(step2[1].val[k], step2[2].val[k]);
    step1[2].val[k] = vsubq_s32(step2[1].val[k], step2[2].val[k]);
    step1[3].val[k] = vsubq_s32(step2[0].val[k], step2[3].val[k]);
  }
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  for (k = 0; k < 2; ++k) {
    step1[8].val[k] = vaddq_s32(step2[8].val[k], step2[11].val[k]);
    step1[9].val[k] = vaddq_s32(step2[9].val[k], step2[10].val[k]);
    step1[10].val[k] = vsubq_s32(step2[9].val[k], step2[10].val[k]);
    step1[11].val[k] = vsubq_s32(step2[8].val[k], step2[11].val[k]);
    step1[12].val[k] = vsubq_s32(step2[15].val[k], step2[12].val[k]);
    step1[13].val[k] = vsubq_s32(step2[14].val[k], step2[13].val[k]);
    step1[14].val[k] = vaddq_s32(step2[14].val[k], step2[13].val[k]);
    step1[15].val[k] = vaddq_s32(step2[15].val[k], step2[12].val[k]);
  }

  // stage 6: mirrored add/sub over entries 0..7.
  for (i = 0; i < 4; ++i) {
    for (k = 0; k < 2; ++k) {
      step2[i].val[k] = vaddq_s32(step1[i].val[k], step1[7 - i].val[k]);
      step2[7 - i].val[k] = vsubq_s32(step1[i].val[k], step1[7 - i].val[k]);
    }
  }
  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7_dual(step2, out);

  // A non-NULL output selects the pass-1 store; otherwise the result goes
  // through the add-and-store path on dest.
  if (output) {
    highbd_idct16x16_store_pass1(out, output);
  } else {
    highbd_idct16x16_add_store(out, dest, stride, bd);
  }
}
732 
// Multiplies all eight 32-bit elements of s by lane 0 of coef with widening
// multiplies, then narrows the 64-bit products back to 32 bits through
// dct_const_round_shift_high_4x2_int64x2x2().
static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s,
                                                       const int32x2_t coef) {
  int64x2x2_t prod[2];
  int half;

  // prod[half] holds the products for s.val[half], split into low/high pairs.
  for (half = 0; half < 2; ++half) {
    const int32x4_t v = s.val[half];
    prod[half].val[0] = vmull_lane_s32(vget_low_s32(v), coef, 0);
    prod[half].val[1] = vmull_lane_s32(vget_high_s32(v), coef, 0);
  }
  return dct_const_round_shift_high_4x2_int64x2x2(prod);
}
743 
// Multiplies the four 32-bit elements of s by lane 0 of coef with widening
// multiplies, then narrows the 64-bit products back to 32 bits through
// dct_const_round_shift_high_4().
static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s,
                                                const int32x2_t coef) {
  const int32x2_t lo = vget_low_s32(s);
  const int32x2_t hi = vget_high_s32(s);
  int64x2x2_t prod;

  prod.val[0] = vmull_lane_s32(lo, coef, 0);
  prod.val[1] = vmull_lane_s32(hi, coef, 0);
  return dct_const_round_shift_high_4(prod);
}
752 
// Multiplies all eight 32-bit elements of s by lane 1 of coef with widening
// multiplies, then narrows the 64-bit products back to 32 bits through
// dct_const_round_shift_high_4x2_int64x2x2().
static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s,
                                                       const int32x2_t coef) {
  int64x2x2_t prod[2];
  int half;

  // prod[half] holds the products for s.val[half], split into low/high pairs.
  for (half = 0; half < 2; ++half) {
    const int32x4_t v = s.val[half];
    prod[half].val[0] = vmull_lane_s32(vget_low_s32(v), coef, 1);
    prod[half].val[1] = vmull_lane_s32(vget_high_s32(v), coef, 1);
  }
  return dct_const_round_shift_high_4x2_int64x2x2(prod);
}
763 
// Multiplies the four 32-bit elements of s by lane 1 of coef with widening
// multiplies, then narrows the 64-bit products back to 32 bits through
// dct_const_round_shift_high_4().
static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s,
                                                const int32x2_t coef) {
  const int32x2_t lo = vget_low_s32(s);
  const int32x2_t hi = vget_high_s32(s);
  int64x2x2_t prod;

  prod.val[0] = vmull_lane_s32(lo, coef, 1);
  prod.val[1] = vmull_lane_s32(hi, coef, 1);
  return dct_const_round_shift_high_4(prod);
}
772 
// One 1-D pass of the reduced 16x16 inverse transform that only consumes an
// 8x8 block of 32-bit coefficients.
//
// input:  8 rows with a stride of 16 int32_t; the first 8 values of each row
//         are read.
// output: when non-NULL, the stage-7 result is passed to
//         highbd_idct16x16_store_pass1() together with this pointer.
// dest, stride, bd: when output is NULL, forwarded with the stage-7 result to
//         highbd_idct16x16_add_store().
static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input,
                                               int32_t *output, uint16_t *dest,
                                               const int stride, const int bd) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  // Two-lane halves of the coefficient vectors above, named after the lanes
  // they hold.
  const int32x2_t cospi_0_8 = vget_low_s32(cospi_0_8_16_24);
  const int32x2_t cospi_16_24 = vget_high_s32(cospi_0_8_16_24);
  const int32x2_t cospi_4_12 = vget_low_s32(cospi_4_12_20N_28);
  const int32x2_t cospi_20N_28 = vget_high_s32(cospi_4_12_20N_28);
  const int32x2_t cospi_2_30 = vget_low_s32(cospi_2_30_10_22);
  const int32x2_t cospi_10_22 = vget_high_s32(cospi_2_30_10_22);
  const int32x2_t cospi_6_26N = vget_low_s32(cospi_6_26N_14_18N);
  const int32x2_t cospi_14_18N = vget_high_s32(cospi_6_26N_14_18N);
  int32x4x2_t in[8], step1[16], step2[16], out[16];
  int i;

  // Load input (8x8): 8 values from the start of each of 8 rows.
  for (i = 0; i < 8; ++i) {
    const int32_t *const row = input + 16 * i;
    in[i].val[0] = vld1q_s32(row);
    in[i].val[1] = vld1q_s32(row + 4);
  }

  // Transpose
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1: only the even-indexed step1 entries are populated.
  step1[0] = in[0];
  step1[2] = in[4];
  step1[4] = in[2];
  step1[6] = in[6];
  step1[8] = in[1];
  step1[10] = in[5];
  step1[12] = in[3];
  step1[14] = in[7];  // 0 in pass 1

  // stage 2
  for (i = 0; i < 8; i += 2) {
    step2[i] = step1[i];
  }
  step2[8] = highbd_idct_cospi_lane1_dual(step1[8], cospi_2_30);
  step2[9] = highbd_idct_cospi_lane1_dual(step1[14], cospi_14_18N);
  step2[10] = highbd_idct_cospi_lane1_dual(step1[10], cospi_10_22);
  step2[11] = highbd_idct_cospi_lane1_dual(step1[12], cospi_6_26N);
  step2[12] = highbd_idct_cospi_lane0_dual(step1[12], cospi_6_26N);
  step2[13] = highbd_idct_cospi_lane0_dual(step1[10], cospi_10_22);
  step2[14] = highbd_idct_cospi_lane0_dual(step1[14], cospi_14_18N);
  step2[15] = highbd_idct_cospi_lane0_dual(step1[8], cospi_2_30);

  // stage 3
  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = highbd_idct_cospi_lane1_dual(step2[4], cospi_20N_28);
  step1[5] = highbd_idct_cospi_lane0_dual(step2[6], cospi_20N_28);
  step1[6] = highbd_idct_cospi_lane1_dual(step2[6], cospi_4_12);
  step1[7] = highbd_idct_cospi_lane0_dual(step2[4], cospi_4_12);
  step1[8] = highbd_idct_add_dual(step2[8], step2[9]);
  step1[9] = highbd_idct_sub_dual(step2[8], step2[9]);
  step1[10] = highbd_idct_sub_dual(step2[11], step2[10]);
  step1[11] = highbd_idct_add_dual(step2[11], step2[10]);
  step1[12] = highbd_idct_add_dual(step2[12], step2[13]);
  step1[13] = highbd_idct_sub_dual(step2[12], step2[13]);
  step1[14] = highbd_idct_sub_dual(step2[15], step2[14]);
  step1[15] = highbd_idct_add_dual(step2[15], step2[14]);

  // stage 4: entries 0 and 1 share the same value.
  step2[1] = highbd_idct_cospi_lane0_dual(step1[0], cospi_16_24);
  step2[0] = step2[1];
  step2[2] = highbd_idct_cospi_lane1_dual(step1[2], cospi_16_24);
  step2[3] = highbd_idct_cospi_lane1_dual(step1[2], cospi_0_8);
  step2[4] = highbd_idct_add_dual(step1[4], step1[5]);
  step2[5] = highbd_idct_sub_dual(step1[4], step1[5]);
  step2[6] = highbd_idct_sub_dual(step1[7], step1[6]);
  step2[7] = highbd_idct_add_dual(step1[7], step1[6]);
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = highbd_idct_add_dual(step2[0], step2[3]);
  step1[1] = highbd_idct_add_dual(step2[1], step2[2]);
  step1[2] = highbd_idct_sub_dual(step2[1], step2[2]);
  step1[3] = highbd_idct_sub_dual(step2[0], step2[3]);
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);

  // stage 6: mirrored add/sub over entries 0..7.
  for (i = 0; i < 4; ++i) {
    step2[i] = highbd_idct_add_dual(step1[i], step1[7 - i]);
    step2[7 - i] = highbd_idct_sub_dual(step1[i], step1[7 - i]);
  }
  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7_dual(step2, out);

  // A non-NULL output selects the pass-1 store; otherwise the result goes
  // through the add-and-store path on dest.
  if (output) {
    highbd_idct16x16_store_pass1(out, output);
  } else {
    highbd_idct16x16_add_store(out, dest, stride, bd);
  }
}
928 
// First 1-D pass of the reduced 16x16 inverse transform that only consumes a
// 4x4 block of coefficients.
//
// input:  4 rows with a stride of 16 values; the first 4 values of each row
//         are read.
// output: receives the 16 stage-7 vectors back to back (64 int32_t values).
static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                                 int32_t *output) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  // Two-lane halves of the coefficient vectors above, named after the lanes
  // they hold.
  const int32x2_t cospi_16_24 = vget_high_s32(cospi_0_8_16_24);
  const int32x2_t cospi_4_12 = vget_low_s32(cospi_4_12_20N_28);
  const int32x2_t cospi_20N_28 = vget_high_s32(cospi_4_12_20N_28);
  const int32x2_t cospi_2_30 = vget_low_s32(cospi_2_30_10_22);
  const int32x2_t cospi_6_26N = vget_low_s32(cospi_6_26N_14_18N);
  int32x4_t in[4], step1[16], step2[16], out[16];
  int i;

  // Load input (4x4)
  for (i = 0; i < 4; ++i) {
    in[i] = vld1q_s32(input + 16 * i);
  }

  // Transpose
  transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);

  // stage 1: only four step1 entries are populated.
  step1[0] = in[0];
  step1[4] = in[2];
  step1[8] = in[1];
  step1[12] = in[3];

  // stage 2
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = highbd_idct_cospi_lane1(step1[8], cospi_2_30);
  step2[11] = highbd_idct_cospi_lane1(step1[12], cospi_6_26N);
  step2[12] = highbd_idct_cospi_lane0(step1[12], cospi_6_26N);
  step2[15] = highbd_idct_cospi_lane0(step1[8], cospi_2_30);

  // stage 3: each computed value is duplicated into its neighbouring slot.
  step1[0] = step2[0];
  step1[4] = highbd_idct_cospi_lane1(step2[4], cospi_20N_28);
  step1[7] = highbd_idct_cospi_lane0(step2[4], cospi_4_12);
  step1[8] = step1[9] = step2[8];
  step1[10] = step1[11] = step2[11];
  step1[12] = step1[13] = step2[12];
  step1[14] = step1[15] = step2[15];

  // stage 4
  step2[1] = highbd_idct_cospi_lane0(step1[0], cospi_16_24);
  step2[0] = step2[1];
  step2[4] = step2[5] = step1[4];
  step2[6] = step2[7] = step1[7];
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step1[3] = step2[0];
  step1[1] = step1[2] = step2[1];
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s32(step2[8], step2[11]);
  step1[9] = vaddq_s32(step2[9], step2[10]);
  step1[10] = vsubq_s32(step2[9], step2[10]);
  step1[11] = vsubq_s32(step2[8], step2[11]);
  step1[12] = vsubq_s32(step2[15], step2[12]);
  step1[13] = vsubq_s32(step2[14], step2[13]);
  step1[14] = vaddq_s32(step2[14], step2[13]);
  step1[15] = vaddq_s32(step2[15], step2[12]);

  // stage 6: mirrored add/sub over entries 0..7.
  for (i = 0; i < 4; ++i) {
    step2[i] = vaddq_s32(step1[i], step1[7 - i]);
    step2[7 - i] = vsubq_s32(step1[i], step1[7 - i]);
  }
  highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7(step2, out);

  // pass 1: save the result into output, four values per vector.
  for (i = 0; i < 16; ++i) {
    vst1q_s32(output + 4 * i, out[i]);
  }
}
1067 
// Second 1-D pass of the reduced 16x16 inverse transform that only consumes
// four coefficients per column.
//
// input:  32 contiguous int32_t values, loaded as eight vectors of four and
//         rearranged by transpose_s32_4x8().
// output: when non-NULL, the stage-7 result is passed to
//         highbd_idct16x16_store_pass1() together with this pointer.
// dest, stride, bd: when output is NULL, forwarded with the stage-7 result to
//         highbd_idct16x16_add_store().
static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
                                                 int32_t *const output,
                                                 uint16_t *const dest,
                                                 const int stride,
                                                 const int bd) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  // Two-lane halves of the coefficient vectors above, named after the lanes
  // they hold.
  const int32x2_t cospi_16_24 = vget_high_s32(cospi_0_8_16_24);
  const int32x2_t cospi_4_12 = vget_low_s32(cospi_4_12_20N_28);
  const int32x2_t cospi_20N_28 = vget_high_s32(cospi_4_12_20N_28);
  const int32x2_t cospi_2_30 = vget_low_s32(cospi_2_30_10_22);
  const int32x2_t cospi_6_26N = vget_low_s32(cospi_6_26N_14_18N);
  int32x4x2_t in[4], step1[16], step2[16], out[16];
  int i;

  // Load input (4x8): eight consecutive vectors, two per in[] entry.
  for (i = 0; i < 4; ++i) {
    in[i].val[0] = vld1q_s32(input + 8 * i);
    in[i].val[1] = vld1q_s32(input + 8 * i + 4);
  }

  // Transpose
  transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1],
                    &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]);

  // stage 1: only four step1 entries are populated.
  step1[0] = in[0];
  step1[4] = in[2];
  step1[8] = in[1];
  step1[12] = in[3];

  // stage 2
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = highbd_idct_cospi_lane1_dual(step1[8], cospi_2_30);
  step2[11] = highbd_idct_cospi_lane1_dual(step1[12], cospi_6_26N);
  step2[12] = highbd_idct_cospi_lane0_dual(step1[12], cospi_6_26N);
  step2[15] = highbd_idct_cospi_lane0_dual(step1[8], cospi_2_30);

  // stage 3: each computed value is duplicated into its neighbouring slot.
  step1[0] = step2[0];
  step1[4] = highbd_idct_cospi_lane1_dual(step2[4], cospi_20N_28);
  step1[7] = highbd_idct_cospi_lane0_dual(step2[4], cospi_4_12);
  step1[8] = step1[9] = step2[8];
  step1[10] = step1[11] = step2[11];
  step1[12] = step1[13] = step2[12];
  step1[14] = step1[15] = step2[15];

  // stage 4
  step2[1] = highbd_idct_cospi_lane0_dual(step1[0], cospi_16_24);
  step2[0] = step2[1];
  step2[4] = step2[5] = step1[4];
  step2[6] = step2[7] = step1[7];
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step1[3] = step2[0];
  step1[1] = step1[2] = step2[1];
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);

  // stage 6: mirrored add/sub over entries 0..7.
  for (i = 0; i < 4; ++i) {
    step2[i] = highbd_idct_add_dual(step1[i], step1[7 - i]);
    step2[7 - i] = highbd_idct_sub_dual(step1[i], step1[7 - i]);
  }
  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7_dual(step2, out);

  // A non-NULL output selects the pass-1 store; otherwise the result goes
  // through the add-and-store path on dest.
  if (output) {
    highbd_idct16x16_store_pass1(out, output);
  } else {
    highbd_idct16x16_add_store(out, dest, stride, bd);
  }
}
1194 
// Full 16x16 inverse transform with add-to-destination, built from four
// half-1D calls: two for pass 1 (upper and lower 8 rows) and two for pass 2
// (left and right 8 columns).
//
// input:  16x16 coefficient block.
// dest:   destination pixels, `stride` elements per row.
// bd:     bit depth. bd == 8 uses the vpx_idct16x16_256_add_half1d() kernel
//         with an int16_t intermediate buffer and 1 as its final argument
//         (presumably a high-bitdepth flag; confirm against its declaration).
//         Any other value uses the 32-bit kernel defined in this file.
void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  const tran_low_t *const lower_rows = input + 8 * 16;
  uint16_t *const right_dest = dest + 8;

  if (bd != 8) {
    int32_t pass1_out[16 * 16];

    // pass 1: upper 8 rows, then lower 8 rows.
    vpx_highbd_idct16x16_256_add_half1d(input, pass1_out, dest, stride, bd);
    vpx_highbd_idct16x16_256_add_half1d(lower_rows, pass1_out + 8, dest, stride,
                                        bd);

    // pass 2: left 8 columns, then right 8 columns. A NULL output pointer
    // makes the kernel write to the destination instead.
    vpx_highbd_idct16x16_256_add_half1d(pass1_out, NULL, dest, stride, bd);
    vpx_highbd_idct16x16_256_add_half1d(pass1_out + 8 * 16, NULL, right_dest,
                                        stride, bd);
  } else {
    int16_t pass1_out[16 * 16];

    // pass 1: upper 8 rows, then lower 8 rows.
    vpx_idct16x16_256_add_half1d(input, pass1_out, dest, stride, 1);
    vpx_idct16x16_256_add_half1d(lower_rows, pass1_out + 8, dest, stride, 1);

    // pass 2: left 8 columns, then right 8 columns.
    vpx_idct16x16_256_add_half1d(pass1_out, NULL, dest, stride, 1);
    vpx_idct16x16_256_add_half1d(pass1_out + 8 * 16, NULL, right_dest, stride,
                                 1);
  }
}
1237 
// Reduced 16x16 inverse transform with add-to-destination, built from three
// half-1D calls: one for pass 1 (upper 8 rows only) and two for pass 2 (left
// and right 8 columns).
//
// input:  16x16 coefficient block.
// dest:   destination pixels, `stride` elements per row.
// bd:     bit depth. bd == 8 uses the vpx_idct16x16_38_add_half1d() kernel
//         with an int16_t intermediate buffer and 1 as its final argument
//         (presumably a high-bitdepth flag; confirm against its declaration).
//         Any other value uses the 32-bit kernel defined in this file.
void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  uint16_t *const right_dest = dest + 8;

  if (bd != 8) {
    int32_t pass1_out[16 * 16];

    // pass 1: upper 8 rows.
    vpx_highbd_idct16x16_38_add_half1d(input, pass1_out, dest, stride, bd);

    // pass 2: left 8 columns, then right 8 columns. A NULL output pointer
    // makes the kernel write to the destination instead.
    vpx_highbd_idct16x16_38_add_half1d(pass1_out, NULL, dest, stride, bd);
    vpx_highbd_idct16x16_38_add_half1d(pass1_out + 16 * 8, NULL, right_dest,
                                       stride, bd);
  } else {
    int16_t pass1_out[16 * 16];

    // pass 1: upper 8 rows.
    vpx_idct16x16_38_add_half1d(input, pass1_out, dest, stride, 1);

    // pass 2: left 8 columns, then right 8 columns.
    vpx_idct16x16_38_add_half1d(pass1_out, NULL, dest, stride, 1);
    vpx_idct16x16_38_add_half1d(pass1_out + 16 * 8, NULL, right_dest, stride,
                                1);
  }
}
1271 
// Smallest non-DC 16x16 inverse DCT path, reconstructing into a uint16_t
// destination. Pass 1 is a single call writing a 4 * 16 element scratch
// buffer; pass 2 runs once per 8-column half, the second call reading the
// scratch buffer at offset 4 * 8.
// NOTE(review): the "10" presumably refers to the maximum number of non-zero
// coefficients this path is selected for -- confirm against the caller.
//
// input:  coefficient block, passed as-is to the single pass-1 call.
// dest:   destination pixels; the second column pass targets dest + 8.
// stride: forwarded unchanged to both pass-2 calls.
// bd:     bit depth. 8 selects the vpx_idct16x16_10_* helpers working on an
//         int16_t scratch buffer, any other value selects the
//         highbd_idct16x16_10_* helpers working on an int32_t scratch buffer.
void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  uint16_t *const right_cols = dest + 8;

  if (bd != 8) {
    int32_t pass1_out[4 * 16];

    // Pass 1: row transform into the scratch buffer.
    highbd_idct16x16_10_add_half1d_pass1(input, pass1_out);

    // Pass 2: left 8 columns, then right 8 columns. The scratch output
    // argument is NULL here.
    highbd_idct16x16_10_add_half1d_pass2(pass1_out, NULL, dest, stride, bd);
    highbd_idct16x16_10_add_half1d_pass2(pass1_out + 4 * 8, NULL, right_cols,
                                         stride, bd);
  } else {
    int16_t pass1_out[4 * 16];

    // The trailing 1 is presumably the helper's high-bitdepth flag, since
    // dest is uint16_t -- confirm against the helper's declaration.

    // Pass 1: row transform into the scratch buffer.
    vpx_idct16x16_10_add_half1d_pass1(input, pass1_out);

    // Pass 2: left 8 columns, then right 8 columns. The scratch output
    // argument is NULL here.
    vpx_idct16x16_10_add_half1d_pass2(pass1_out, NULL, dest, stride, 1);
    vpx_idct16x16_10_add_half1d_pass2(pass1_out + 4 * 8, NULL, right_cols,
                                      stride, 1);
  }
}
1305 
highbd_idct16x16_1_add_pos_kernel(uint16_t ** dest,const int stride,const int16x8_t res,const int16x8_t max)1306 static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
1307                                                      const int stride,
1308                                                      const int16x8_t res,
1309                                                      const int16x8_t max) {
1310   const uint16x8_t a0 = vld1q_u16(*dest + 0);
1311   const uint16x8_t a1 = vld1q_u16(*dest + 8);
1312   const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
1313   const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
1314   const int16x8_t c0 = vminq_s16(b0, max);
1315   const int16x8_t c1 = vminq_s16(b1, max);
1316   vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0));
1317   vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
1318   *dest += stride;
1319 }
1320 
highbd_idct16x16_1_add_neg_kernel(uint16_t ** dest,const int stride,const int16x8_t res)1321 static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
1322                                                      const int stride,
1323                                                      const int16x8_t res) {
1324   const uint16x8_t a0 = vld1q_u16(*dest + 0);
1325   const uint16x8_t a1 = vld1q_u16(*dest + 8);
1326   const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
1327   const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
1328   const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
1329   const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
1330   vst1q_u16(*dest + 0, c0);
1331   vst1q_u16(*dest + 8, c1);
1332   *dest += stride;
1333 }
1334 
vpx_highbd_idct16x16_1_add_neon(const tran_low_t * input,uint16_t * dest,int stride,int bd)1335 void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
1336                                      int stride, int bd) {
1337   const tran_low_t out0 = HIGHBD_WRAPLOW(
1338       dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
1339   const tran_low_t out1 = HIGHBD_WRAPLOW(
1340       dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
1341   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
1342   const int16x8_t dc = vdupq_n_s16(a1);
1343   int i;
1344 
1345   if (a1 >= 0) {
1346     const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
1347     for (i = 0; i < 4; ++i) {
1348       highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
1349       highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
1350       highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
1351       highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
1352     }
1353   } else {
1354     for (i = 0; i < 4; ++i) {
1355       highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
1356       highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
1357       highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
1358       highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
1359     }
1360   }
1361 }
1362