1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stdlib.h>
13 #include "av1/encoder/av1_fwd_txfm1d.h"
14 #include "av1/common/av1_txfm.h"
15 
// Forward 4-point DCT (per the function name), computed as a fixed chain of
// butterfly stages that ping-pong between |output| and a local buffer.
//
//   input        4 values, read only.
//   output       receives the 4 results. It is also used as working storage
//                from stage 1 onward while input[] is still being read, so it
//                must not alias |input| (this function does not check that).
//   cos_bit      passed to cospi_arr() and to every half_btf() call.
//   stage_range  entries [0..3]; entry s is handed to av1_range_check_buf()
//                together with the buffer produced by stage s.
void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                   const int8_t *stage_range) {
  const int32_t len = 4;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *src, *dst;
  int32_t scratch[4];

  // stage 0: check the untouched input.
  av1_range_check_buf(stage, input, input, len, stage_range[stage]);

  // stage 1: mirrored sums and differences, input -> output.
  ++stage;
  dst = output;
  dst[0] = input[0] + input[3];
  dst[1] = input[1] + input[2];
  dst[2] = input[1] - input[2];
  dst[3] = input[0] - input[3];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 2: half_btf() combinations, output -> scratch.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 3: final reordering, scratch -> output.
  ++stage;
  src = scratch;
  dst = output;
  dst[0] = src[0];
  dst[1] = src[2];
  dst[2] = src[1];
  dst[3] = src[3];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);
}
58 
// Forward 8-point DCT (per the function name), computed as a fixed chain of
// butterfly stages that ping-pong between |output| and a local buffer.
//
//   input        8 values, read only.
//   output       receives the 8 results. It is also used as working storage
//                from stage 1 onward while input[] is still being read, so it
//                must not alias |input| (this function does not check that).
//   cos_bit      passed to cospi_arr() and to every half_btf() call.
//   stage_range  entries [0..5]; entry s is handed to av1_range_check_buf()
//                together with the buffer produced by stage s.
void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                   const int8_t *stage_range) {
  const int32_t len = 8;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *src, *dst;
  int32_t scratch[8];

  // stage 0: check the untouched input.
  av1_range_check_buf(stage, input, input, len, stage_range[stage]);

  // stage 1: mirrored sums and differences, input -> output.
  ++stage;
  dst = output;
  dst[0] = input[0] + input[7];
  dst[1] = input[1] + input[6];
  dst[2] = input[2] + input[5];
  dst[3] = input[3] + input[4];
  dst[4] = input[3] - input[4];
  dst[5] = input[2] - input[5];
  dst[6] = input[1] - input[6];
  dst[7] = input[0] - input[7];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 2: output -> scratch.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = src[0] + src[3];
  dst[1] = src[1] + src[2];
  dst[2] = src[1] - src[2];
  dst[3] = src[0] - src[3];
  dst[4] = src[4];
  dst[5] = half_btf(-cospi[32], src[5], cospi[32], src[6], cos_bit);
  dst[6] = half_btf(cospi[32], src[6], cospi[32], src[5], cos_bit);
  dst[7] = src[7];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 3: scratch -> output.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  dst[4] = src[4] + src[5];
  dst[5] = src[4] - src[5];
  dst[6] = src[7] - src[6];
  dst[7] = src[7] + src[6];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 4: output -> scratch; entries 0..3 pass through unchanged.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[56], src[4], cospi[8], src[7], cos_bit);
  dst[5] = half_btf(cospi[24], src[5], cospi[40], src[6], cos_bit);
  dst[6] = half_btf(cospi[24], src[6], -cospi[40], src[5], cos_bit);
  dst[7] = half_btf(cospi[56], src[7], -cospi[8], src[4], cos_bit);
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 5: final reordering, scratch -> output.
  ++stage;
  src = scratch;
  dst = output;
  dst[0] = src[0];
  dst[1] = src[4];
  dst[2] = src[2];
  dst[3] = src[6];
  dst[4] = src[1];
  dst[5] = src[5];
  dst[6] = src[3];
  dst[7] = src[7];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);
}
143 
// Forward 16-point DCT (per the function name), computed as a fixed chain of
// butterfly stages that ping-pong between |output| and a local buffer.
//
//   input        16 values, read only.
//   output       receives the 16 results. It is also used as working storage
//                from stage 1 onward while input[] is still being read, so it
//                must not alias |input| (this function does not check that).
//   cos_bit      passed to cospi_arr() and to every half_btf() call.
//   stage_range  entries [0..7]; entry s is handed to av1_range_check_buf()
//                together with the buffer produced by stage s.
void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  const int32_t len = 16;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *src, *dst;
  int32_t scratch[16];

  // stage 0: check the untouched input.
  av1_range_check_buf(stage, input, input, len, stage_range[stage]);

  // stage 1: mirrored sums and differences, input -> output.
  ++stage;
  dst = output;
  dst[0] = input[0] + input[15];
  dst[1] = input[1] + input[14];
  dst[2] = input[2] + input[13];
  dst[3] = input[3] + input[12];
  dst[4] = input[4] + input[11];
  dst[5] = input[5] + input[10];
  dst[6] = input[6] + input[9];
  dst[7] = input[7] + input[8];
  dst[8] = input[7] - input[8];
  dst[9] = input[6] - input[9];
  dst[10] = input[5] - input[10];
  dst[11] = input[4] - input[11];
  dst[12] = input[3] - input[12];
  dst[13] = input[2] - input[13];
  dst[14] = input[1] - input[14];
  dst[15] = input[0] - input[15];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 2: output -> scratch.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = src[0] + src[7];
  dst[1] = src[1] + src[6];
  dst[2] = src[2] + src[5];
  dst[3] = src[3] + src[4];
  dst[4] = src[3] - src[4];
  dst[5] = src[2] - src[5];
  dst[6] = src[1] - src[6];
  dst[7] = src[0] - src[7];
  dst[8] = src[8];
  dst[9] = src[9];
  dst[10] = half_btf(-cospi[32], src[10], cospi[32], src[13], cos_bit);
  dst[11] = half_btf(-cospi[32], src[11], cospi[32], src[12], cos_bit);
  dst[12] = half_btf(cospi[32], src[12], cospi[32], src[11], cos_bit);
  dst[13] = half_btf(cospi[32], src[13], cospi[32], src[10], cos_bit);
  dst[14] = src[14];
  dst[15] = src[15];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 3: scratch -> output.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  dst[0] = src[0] + src[3];
  dst[1] = src[1] + src[2];
  dst[2] = src[1] - src[2];
  dst[3] = src[0] - src[3];
  dst[4] = src[4];
  dst[5] = half_btf(-cospi[32], src[5], cospi[32], src[6], cos_bit);
  dst[6] = half_btf(cospi[32], src[6], cospi[32], src[5], cos_bit);
  dst[7] = src[7];
  dst[8] = src[8] + src[11];
  dst[9] = src[9] + src[10];
  dst[10] = src[9] - src[10];
  dst[11] = src[8] - src[11];
  dst[12] = src[15] - src[12];
  dst[13] = src[14] - src[13];
  dst[14] = src[14] + src[13];
  dst[15] = src[15] + src[12];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 4: output -> scratch.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  dst[4] = src[4] + src[5];
  dst[5] = src[4] - src[5];
  dst[6] = src[7] - src[6];
  dst[7] = src[7] + src[6];
  dst[8] = src[8];
  dst[9] = half_btf(-cospi[16], src[9], cospi[48], src[14], cos_bit);
  dst[10] = half_btf(-cospi[48], src[10], -cospi[16], src[13], cos_bit);
  dst[11] = src[11];
  dst[12] = src[12];
  dst[13] = half_btf(cospi[48], src[13], -cospi[16], src[10], cos_bit);
  dst[14] = half_btf(cospi[16], src[14], cospi[48], src[9], cos_bit);
  dst[15] = src[15];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 5: scratch -> output; entries 0..3 pass through unchanged.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[56], src[4], cospi[8], src[7], cos_bit);
  dst[5] = half_btf(cospi[24], src[5], cospi[40], src[6], cos_bit);
  dst[6] = half_btf(cospi[24], src[6], -cospi[40], src[5], cos_bit);
  dst[7] = half_btf(cospi[56], src[7], -cospi[8], src[4], cos_bit);
  dst[8] = src[8] + src[9];
  dst[9] = src[8] - src[9];
  dst[10] = src[11] - src[10];
  dst[11] = src[11] + src[10];
  dst[12] = src[12] + src[13];
  dst[13] = src[12] - src[13];
  dst[14] = src[15] - src[14];
  dst[15] = src[15] + src[14];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 6: output -> scratch; entries 0..7 pass through unchanged.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int i = 0; i < 8; ++i) {
    dst[i] = src[i];
  }
  dst[8] = half_btf(cospi[60], src[8], cospi[4], src[15], cos_bit);
  dst[9] = half_btf(cospi[28], src[9], cospi[36], src[14], cos_bit);
  dst[10] = half_btf(cospi[44], src[10], cospi[20], src[13], cos_bit);
  dst[11] = half_btf(cospi[12], src[11], cospi[52], src[12], cos_bit);
  dst[12] = half_btf(cospi[12], src[12], -cospi[52], src[11], cos_bit);
  dst[13] = half_btf(cospi[44], src[13], -cospi[20], src[10], cos_bit);
  dst[14] = half_btf(cospi[28], src[14], -cospi[36], src[9], cos_bit);
  dst[15] = half_btf(cospi[60], src[15], -cospi[4], src[8], cos_bit);
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 7: final reordering, scratch -> output.
  ++stage;
  src = scratch;
  dst = output;
  dst[0] = src[0];
  dst[1] = src[8];
  dst[2] = src[4];
  dst[3] = src[12];
  dst[4] = src[2];
  dst[5] = src[10];
  dst[6] = src[6];
  dst[7] = src[14];
  dst[8] = src[1];
  dst[9] = src[9];
  dst[10] = src[5];
  dst[11] = src[13];
  dst[12] = src[3];
  dst[13] = src[11];
  dst[14] = src[7];
  dst[15] = src[15];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);
}
314 
// Forward 32-point DCT (per the function name), computed as a fixed chain of
// butterfly stages that ping-pong between |output| and a local buffer.
//
//   input        32 values, read only.
//   output       receives the 32 results. It is also used as working storage
//                from stage 1 onward while input[] is still being read, so it
//                must not alias |input| (this function does not check that).
//   cos_bit      passed to cospi_arr() and to every half_btf() call.
//   stage_range  entries [0..9]; entry s is handed to av1_range_check_buf()
//                together with the buffer produced by stage s.
void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  const int32_t len = 32;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *src, *dst;
  int32_t scratch[32];

  // stage 0: check the untouched input.
  av1_range_check_buf(stage, input, input, len, stage_range[stage]);

  // stage 1: mirrored sums and differences, input -> output.
  ++stage;
  dst = output;
  dst[0] = input[0] + input[31];
  dst[1] = input[1] + input[30];
  dst[2] = input[2] + input[29];
  dst[3] = input[3] + input[28];
  dst[4] = input[4] + input[27];
  dst[5] = input[5] + input[26];
  dst[6] = input[6] + input[25];
  dst[7] = input[7] + input[24];
  dst[8] = input[8] + input[23];
  dst[9] = input[9] + input[22];
  dst[10] = input[10] + input[21];
  dst[11] = input[11] + input[20];
  dst[12] = input[12] + input[19];
  dst[13] = input[13] + input[18];
  dst[14] = input[14] + input[17];
  dst[15] = input[15] + input[16];
  dst[16] = input[15] - input[16];
  dst[17] = input[14] - input[17];
  dst[18] = input[13] - input[18];
  dst[19] = input[12] - input[19];
  dst[20] = input[11] - input[20];
  dst[21] = input[10] - input[21];
  dst[22] = input[9] - input[22];
  dst[23] = input[8] - input[23];
  dst[24] = input[7] - input[24];
  dst[25] = input[6] - input[25];
  dst[26] = input[5] - input[26];
  dst[27] = input[4] - input[27];
  dst[28] = input[3] - input[28];
  dst[29] = input[2] - input[29];
  dst[30] = input[1] - input[30];
  dst[31] = input[0] - input[31];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 2: output -> scratch.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = src[0] + src[15];
  dst[1] = src[1] + src[14];
  dst[2] = src[2] + src[13];
  dst[3] = src[3] + src[12];
  dst[4] = src[4] + src[11];
  dst[5] = src[5] + src[10];
  dst[6] = src[6] + src[9];
  dst[7] = src[7] + src[8];
  dst[8] = src[7] - src[8];
  dst[9] = src[6] - src[9];
  dst[10] = src[5] - src[10];
  dst[11] = src[4] - src[11];
  dst[12] = src[3] - src[12];
  dst[13] = src[2] - src[13];
  dst[14] = src[1] - src[14];
  dst[15] = src[0] - src[15];
  dst[16] = src[16];
  dst[17] = src[17];
  dst[18] = src[18];
  dst[19] = src[19];
  dst[20] = half_btf(-cospi[32], src[20], cospi[32], src[27], cos_bit);
  dst[21] = half_btf(-cospi[32], src[21], cospi[32], src[26], cos_bit);
  dst[22] = half_btf(-cospi[32], src[22], cospi[32], src[25], cos_bit);
  dst[23] = half_btf(-cospi[32], src[23], cospi[32], src[24], cos_bit);
  dst[24] = half_btf(cospi[32], src[24], cospi[32], src[23], cos_bit);
  dst[25] = half_btf(cospi[32], src[25], cospi[32], src[22], cos_bit);
  dst[26] = half_btf(cospi[32], src[26], cospi[32], src[21], cos_bit);
  dst[27] = half_btf(cospi[32], src[27], cospi[32], src[20], cos_bit);
  dst[28] = src[28];
  dst[29] = src[29];
  dst[30] = src[30];
  dst[31] = src[31];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 3: scratch -> output.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  dst[0] = src[0] + src[7];
  dst[1] = src[1] + src[6];
  dst[2] = src[2] + src[5];
  dst[3] = src[3] + src[4];
  dst[4] = src[3] - src[4];
  dst[5] = src[2] - src[5];
  dst[6] = src[1] - src[6];
  dst[7] = src[0] - src[7];
  dst[8] = src[8];
  dst[9] = src[9];
  dst[10] = half_btf(-cospi[32], src[10], cospi[32], src[13], cos_bit);
  dst[11] = half_btf(-cospi[32], src[11], cospi[32], src[12], cos_bit);
  dst[12] = half_btf(cospi[32], src[12], cospi[32], src[11], cos_bit);
  dst[13] = half_btf(cospi[32], src[13], cospi[32], src[10], cos_bit);
  dst[14] = src[14];
  dst[15] = src[15];
  dst[16] = src[16] + src[23];
  dst[17] = src[17] + src[22];
  dst[18] = src[18] + src[21];
  dst[19] = src[19] + src[20];
  dst[20] = src[19] - src[20];
  dst[21] = src[18] - src[21];
  dst[22] = src[17] - src[22];
  dst[23] = src[16] - src[23];
  dst[24] = src[31] - src[24];
  dst[25] = src[30] - src[25];
  dst[26] = src[29] - src[26];
  dst[27] = src[28] - src[27];
  dst[28] = src[28] + src[27];
  dst[29] = src[29] + src[26];
  dst[30] = src[30] + src[25];
  dst[31] = src[31] + src[24];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 4: output -> scratch.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = src[0] + src[3];
  dst[1] = src[1] + src[2];
  dst[2] = src[1] - src[2];
  dst[3] = src[0] - src[3];
  dst[4] = src[4];
  dst[5] = half_btf(-cospi[32], src[5], cospi[32], src[6], cos_bit);
  dst[6] = half_btf(cospi[32], src[6], cospi[32], src[5], cos_bit);
  dst[7] = src[7];
  dst[8] = src[8] + src[11];
  dst[9] = src[9] + src[10];
  dst[10] = src[9] - src[10];
  dst[11] = src[8] - src[11];
  dst[12] = src[15] - src[12];
  dst[13] = src[14] - src[13];
  dst[14] = src[14] + src[13];
  dst[15] = src[15] + src[12];
  dst[16] = src[16];
  dst[17] = src[17];
  dst[18] = half_btf(-cospi[16], src[18], cospi[48], src[29], cos_bit);
  dst[19] = half_btf(-cospi[16], src[19], cospi[48], src[28], cos_bit);
  dst[20] = half_btf(-cospi[48], src[20], -cospi[16], src[27], cos_bit);
  dst[21] = half_btf(-cospi[48], src[21], -cospi[16], src[26], cos_bit);
  dst[22] = src[22];
  dst[23] = src[23];
  dst[24] = src[24];
  dst[25] = src[25];
  dst[26] = half_btf(cospi[48], src[26], -cospi[16], src[21], cos_bit);
  dst[27] = half_btf(cospi[48], src[27], -cospi[16], src[20], cos_bit);
  dst[28] = half_btf(cospi[16], src[28], cospi[48], src[19], cos_bit);
  dst[29] = half_btf(cospi[16], src[29], cospi[48], src[18], cos_bit);
  dst[30] = src[30];
  dst[31] = src[31];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 5: scratch -> output.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  dst[4] = src[4] + src[5];
  dst[5] = src[4] - src[5];
  dst[6] = src[7] - src[6];
  dst[7] = src[7] + src[6];
  dst[8] = src[8];
  dst[9] = half_btf(-cospi[16], src[9], cospi[48], src[14], cos_bit);
  dst[10] = half_btf(-cospi[48], src[10], -cospi[16], src[13], cos_bit);
  dst[11] = src[11];
  dst[12] = src[12];
  dst[13] = half_btf(cospi[48], src[13], -cospi[16], src[10], cos_bit);
  dst[14] = half_btf(cospi[16], src[14], cospi[48], src[9], cos_bit);
  dst[15] = src[15];
  dst[16] = src[16] + src[19];
  dst[17] = src[17] + src[18];
  dst[18] = src[17] - src[18];
  dst[19] = src[16] - src[19];
  dst[20] = src[23] - src[20];
  dst[21] = src[22] - src[21];
  dst[22] = src[22] + src[21];
  dst[23] = src[23] + src[20];
  dst[24] = src[24] + src[27];
  dst[25] = src[25] + src[26];
  dst[26] = src[25] - src[26];
  dst[27] = src[24] - src[27];
  dst[28] = src[31] - src[28];
  dst[29] = src[30] - src[29];
  dst[30] = src[30] + src[29];
  dst[31] = src[31] + src[28];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 6: output -> scratch; entries 0..3 pass through unchanged.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[56], src[4], cospi[8], src[7], cos_bit);
  dst[5] = half_btf(cospi[24], src[5], cospi[40], src[6], cos_bit);
  dst[6] = half_btf(cospi[24], src[6], -cospi[40], src[5], cos_bit);
  dst[7] = half_btf(cospi[56], src[7], -cospi[8], src[4], cos_bit);
  dst[8] = src[8] + src[9];
  dst[9] = src[8] - src[9];
  dst[10] = src[11] - src[10];
  dst[11] = src[11] + src[10];
  dst[12] = src[12] + src[13];
  dst[13] = src[12] - src[13];
  dst[14] = src[15] - src[14];
  dst[15] = src[15] + src[14];
  dst[16] = src[16];
  dst[17] = half_btf(-cospi[8], src[17], cospi[56], src[30], cos_bit);
  dst[18] = half_btf(-cospi[56], src[18], -cospi[8], src[29], cos_bit);
  dst[19] = src[19];
  dst[20] = src[20];
  dst[21] = half_btf(-cospi[40], src[21], cospi[24], src[26], cos_bit);
  dst[22] = half_btf(-cospi[24], src[22], -cospi[40], src[25], cos_bit);
  dst[23] = src[23];
  dst[24] = src[24];
  dst[25] = half_btf(cospi[24], src[25], -cospi[40], src[22], cos_bit);
  dst[26] = half_btf(cospi[40], src[26], cospi[24], src[21], cos_bit);
  dst[27] = src[27];
  dst[28] = src[28];
  dst[29] = half_btf(cospi[56], src[29], -cospi[8], src[18], cos_bit);
  dst[30] = half_btf(cospi[8], src[30], cospi[56], src[17], cos_bit);
  dst[31] = src[31];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 7: scratch -> output; entries 0..7 pass through unchanged.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  for (int i = 0; i < 8; ++i) {
    dst[i] = src[i];
  }
  dst[8] = half_btf(cospi[60], src[8], cospi[4], src[15], cos_bit);
  dst[9] = half_btf(cospi[28], src[9], cospi[36], src[14], cos_bit);
  dst[10] = half_btf(cospi[44], src[10], cospi[20], src[13], cos_bit);
  dst[11] = half_btf(cospi[12], src[11], cospi[52], src[12], cos_bit);
  dst[12] = half_btf(cospi[12], src[12], -cospi[52], src[11], cos_bit);
  dst[13] = half_btf(cospi[44], src[13], -cospi[20], src[10], cos_bit);
  dst[14] = half_btf(cospi[28], src[14], -cospi[36], src[9], cos_bit);
  dst[15] = half_btf(cospi[60], src[15], -cospi[4], src[8], cos_bit);
  dst[16] = src[16] + src[17];
  dst[17] = src[16] - src[17];
  dst[18] = src[19] - src[18];
  dst[19] = src[19] + src[18];
  dst[20] = src[20] + src[21];
  dst[21] = src[20] - src[21];
  dst[22] = src[23] - src[22];
  dst[23] = src[23] + src[22];
  dst[24] = src[24] + src[25];
  dst[25] = src[24] - src[25];
  dst[26] = src[27] - src[26];
  dst[27] = src[27] + src[26];
  dst[28] = src[28] + src[29];
  dst[29] = src[28] - src[29];
  dst[30] = src[31] - src[30];
  dst[31] = src[31] + src[30];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 8: output -> scratch; entries 0..15 pass through unchanged.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int i = 0; i < 16; ++i) {
    dst[i] = src[i];
  }
  dst[16] = half_btf(cospi[62], src[16], cospi[2], src[31], cos_bit);
  dst[17] = half_btf(cospi[30], src[17], cospi[34], src[30], cos_bit);
  dst[18] = half_btf(cospi[46], src[18], cospi[18], src[29], cos_bit);
  dst[19] = half_btf(cospi[14], src[19], cospi[50], src[28], cos_bit);
  dst[20] = half_btf(cospi[54], src[20], cospi[10], src[27], cos_bit);
  dst[21] = half_btf(cospi[22], src[21], cospi[42], src[26], cos_bit);
  dst[22] = half_btf(cospi[38], src[22], cospi[26], src[25], cos_bit);
  dst[23] = half_btf(cospi[6], src[23], cospi[58], src[24], cos_bit);
  dst[24] = half_btf(cospi[6], src[24], -cospi[58], src[23], cos_bit);
  dst[25] = half_btf(cospi[38], src[25], -cospi[26], src[22], cos_bit);
  dst[26] = half_btf(cospi[22], src[26], -cospi[42], src[21], cos_bit);
  dst[27] = half_btf(cospi[54], src[27], -cospi[10], src[20], cos_bit);
  dst[28] = half_btf(cospi[14], src[28], -cospi[50], src[19], cos_bit);
  dst[29] = half_btf(cospi[46], src[29], -cospi[18], src[18], cos_bit);
  dst[30] = half_btf(cospi[30], src[30], -cospi[34], src[17], cos_bit);
  dst[31] = half_btf(cospi[62], src[31], -cospi[2], src[16], cos_bit);
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 9: final reordering, scratch -> output.
  ++stage;
  src = scratch;
  dst = output;
  dst[0] = src[0];
  dst[1] = src[16];
  dst[2] = src[8];
  dst[3] = src[24];
  dst[4] = src[4];
  dst[5] = src[20];
  dst[6] = src[12];
  dst[7] = src[28];
  dst[8] = src[2];
  dst[9] = src[18];
  dst[10] = src[10];
  dst[11] = src[26];
  dst[12] = src[6];
  dst[13] = src[22];
  dst[14] = src[14];
  dst[15] = src[30];
  dst[16] = src[1];
  dst[17] = src[17];
  dst[18] = src[9];
  dst[19] = src[25];
  dst[20] = src[5];
  dst[21] = src[21];
  dst[22] = src[13];
  dst[23] = src[29];
  dst[24] = src[3];
  dst[25] = src[19];
  dst[26] = src[11];
  dst[27] = src[27];
  dst[28] = src[7];
  dst[29] = src[23];
  dst[30] = src[15];
  dst[31] = src[31];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);
}
675 
// Forward 4-point ADST (per the function name), built from sinpi[] products
// instead of the cospi butterfly network used by the DCT routines.
//
//   input        4 values, read only. If all four are zero, output is set to
//                zero and the function returns right after the stage-0 check.
//   output       4 results, each passed through round_shift() by cos_bit.
//   cos_bit      passed to sinpi_arr() and used as the final shift amount.
//   stage_range  entries [0..6]. Scaled intermediates are checked with
//                range_check_value() against cos_bit + stage_range[s]; the
//                unscaled sum/difference in stages 1-2 is checked against
//                stage_range[s] alone.
void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  const int bit = cos_bit;
  const int32_t *sinpi = sinpi_arr(bit);
  int32_t in0, in1, in2, in3;
  int32_t in0_s1, in0_s4, in1_s2, in1_s1, in2_s3, in3_s4, in3_s2, mix;
  int32_t acc0, acc1, acc2, acc3;
  int32_t res0, res1, res2, res3;

  // stage 0
  av1_range_check_buf(0, input, input, 4, stage_range[0]);
  in0 = input[0];
  in1 = input[1];
  in2 = input[2];
  in3 = input[3];

  // Nothing to transform: an all-zero input yields an all-zero output.
  if ((in0 | in1 | in2 | in3) == 0) {
    output[3] = 0;
    output[2] = 0;
    output[1] = 0;
    output[0] = 0;
    return;
  }

  // stage 1: individual sinpi products plus the raw in0 + in1 sum.
  in0_s1 = range_check_value(sinpi[1] * in0, bit + stage_range[1]);
  in0_s4 = range_check_value(sinpi[4] * in0, bit + stage_range[1]);
  in1_s2 = range_check_value(sinpi[2] * in1, bit + stage_range[1]);
  in1_s1 = range_check_value(sinpi[1] * in1, bit + stage_range[1]);
  in2_s3 = range_check_value(sinpi[3] * in2, bit + stage_range[1]);
  in3_s4 = range_check_value(sinpi[4] * in3, bit + stage_range[1]);
  in3_s2 = range_check_value(sinpi[2] * in3, bit + stage_range[1]);
  mix = range_check_value(in0 + in1, stage_range[1]);

  // stage 2: mix becomes in0 + in1 - in3.
  mix = range_check_value(mix - in3, stage_range[2]);

  // stage 3
  acc0 = range_check_value(in0_s1 + in1_s2, bit + stage_range[3]);
  acc1 = range_check_value(sinpi[3] * mix, bit + stage_range[3]);
  acc2 = range_check_value(in0_s4 - in1_s1, bit + stage_range[3]);
  acc3 = range_check_value(in2_s3, bit + stage_range[3]);

  // stage 4
  acc0 = range_check_value(acc0 + in3_s4, bit + stage_range[4]);
  acc2 = range_check_value(acc2 + in3_s2, bit + stage_range[4]);

  // stage 5
  res0 = range_check_value(acc0 + acc3, bit + stage_range[5]);
  res1 = range_check_value(acc1, bit + stage_range[5]);
  res2 = range_check_value(acc2 - acc3, bit + stage_range[5]);
  res3 = range_check_value(acc2 - acc0, bit + stage_range[5]);

  // stage 6
  res3 = range_check_value(res3 + acc3, bit + stage_range[6]);

  // 1-D transform scaling factor is sqrt(2).
  output[0] = round_shift(res0, bit);
  output[1] = round_shift(res1, bit);
  output[2] = round_shift(res2, bit);
  output[3] = round_shift(res3, bit);
  av1_range_check_buf(6, input, output, 4, stage_range[6]);
}
734 
// Forward 8-point ADST (per the function name), computed as a fixed chain of
// stages that ping-pong between |output| and a local buffer.
//
//   input        8 values, read only.
//   output       receives the 8 results and is used as working storage;
//                asserted to be a different pointer than |input|.
//   cos_bit      passed to cospi_arr() and to every half_btf() call.
//   stage_range  entries [0..7]; entry s is handed to av1_range_check_buf()
//                together with the buffer produced by stage s.
void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  const int32_t len = 8;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *src, *dst;
  int32_t scratch[8];

  // stage 0: check the untouched input.
  av1_range_check_buf(stage, input, input, len, stage_range[stage]);

  // stage 1: signed permutation of the input, input -> output.
  ++stage;
  assert(output != input);
  dst = output;
  dst[0] = input[0];
  dst[1] = -input[7];
  dst[2] = -input[3];
  dst[3] = input[4];
  dst[4] = -input[1];
  dst[5] = input[6];
  dst[6] = input[2];
  dst[7] = -input[5];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 2: output -> scratch. In each group of four, the first two entries
  // pass through and the last two are combined with half_btf().
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int g = 0; g < 8; g += 4) {
    dst[g] = src[g];
    dst[g + 1] = src[g + 1];
    dst[g + 2] =
        half_btf(cospi[32], src[g + 2], cospi[32], src[g + 3], cos_bit);
    dst[g + 3] =
        half_btf(cospi[32], src[g + 2], -cospi[32], src[g + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 3: scratch -> output. Sum/difference at distance 2 within each
  // group of four.
  ++stage;
  src = scratch;
  dst = output;
  for (int g = 0; g < 8; g += 4) {
    dst[g] = src[g] + src[g + 2];
    dst[g + 1] = src[g + 1] + src[g + 3];
    dst[g + 2] = src[g] - src[g + 2];
    dst[g + 3] = src[g + 1] - src[g + 3];
  }
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 4: output -> scratch; entries 0..3 pass through unchanged.
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[16], src[4], cospi[48], src[5], cos_bit);
  dst[5] = half_btf(cospi[48], src[4], -cospi[16], src[5], cos_bit);
  dst[6] = half_btf(-cospi[48], src[6], cospi[16], src[7], cos_bit);
  dst[7] = half_btf(cospi[16], src[6], cospi[48], src[7], cos_bit);
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 5: scratch -> output. Sums of the two halves, then differences.
  ++stage;
  src = scratch;
  dst = output;
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[i] + src[i + 4];
  }
  for (int i = 0; i < 4; ++i) {
    dst[i + 4] = src[i] - src[i + 4];
  }
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 6: output -> scratch; adjacent pairs combined with half_btf().
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = half_btf(cospi[4], src[0], cospi[60], src[1], cos_bit);
  dst[1] = half_btf(cospi[60], src[0], -cospi[4], src[1], cos_bit);
  dst[2] = half_btf(cospi[20], src[2], cospi[44], src[3], cos_bit);
  dst[3] = half_btf(cospi[44], src[2], -cospi[20], src[3], cos_bit);
  dst[4] = half_btf(cospi[36], src[4], cospi[28], src[5], cos_bit);
  dst[5] = half_btf(cospi[28], src[4], -cospi[36], src[5], cos_bit);
  dst[6] = half_btf(cospi[52], src[6], cospi[12], src[7], cos_bit);
  dst[7] = half_btf(cospi[12], src[6], -cospi[52], src[7], cos_bit);
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);

  // stage 7: final reordering, scratch -> output.
  ++stage;
  src = scratch;
  dst = output;
  dst[0] = src[1];
  dst[1] = src[6];
  dst[2] = src[3];
  dst[3] = src[4];
  dst[4] = src[5];
  dst[5] = src[2];
  dst[6] = src[7];
  dst[7] = src[0];
  av1_range_check_buf(stage, input, dst, len, stage_range[stage]);
}
848 
// Add/subtract butterfly over 16 values: inside each consecutive group of
// 2 * span elements, the lower half of dst receives the sums and the upper
// half receives the differences of the src elements that sit `span` apart.
// src and dst must not overlap.
static void fadst16_add_sub(const int32_t *src, int32_t *dst, int span) {
  for (int base = 0; base < 16; base += 2 * span) {
    for (int j = base; j < base + span; ++j) {
      const int32_t lo = src[j];
      const int32_t hi = src[j + span];
      dst[j] = lo + hi;
      dst[j + span] = lo - hi;
    }
  }
}

// Forward 16-point ADST, evaluated as a nine-stage butterfly network.
//
// Stage 1 gathers (and selectively negates) the 16 input values into
// `output`. From then on the data ping-pongs between `output` and a local
// scratch buffer: the even stages (2, 4, 6, 8) apply half_btf() with weights
// taken from cospi_arr(cos_bit), the odd stages (3, 5, 7) are add/subtract
// butterflies, and stage 9 writes the final ordering into `output`.
//
// av1_range_check_buf() is called after every stage with that stage's entry
// of `stage_range`, so entries 0 through 9 are read. `input` and `output`
// must be different pointers (asserted).
void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                     const int8_t *stage_range) {
  // Stage-1 gather: output[i] is input[gather_idx[i]], negated where
  // gather_neg[i] is nonzero.
  static const int8_t gather_idx[16] = { 0, 15, 7, 8, 3, 12, 4, 11,
                                         1, 14, 6, 9, 2, 13, 5, 10 };
  static const int8_t gather_neg[16] = { 0, 1, 1, 0, 1, 0, 0, 1,
                                         1, 0, 0, 1, 0, 1, 1, 0 };
  const int32_t n = 16;
  const int32_t *cospi;
  int32_t scratch[16];

  // stage 0: check the raw input.
  av1_range_check_buf(0, input, input, n, stage_range[0]);

  // stage 1: signed gather, input -> output.
  assert(output != input);
  for (int i = 0; i < 16; ++i) {
    const int32_t v = input[gather_idx[i]];
    output[i] = gather_neg[i] ? -v : v;
  }
  av1_range_check_buf(1, input, output, n, stage_range[1]);

  // stage 2: output -> scratch. In every group of four, the first two values
  // pass through and the last two are combined with cospi[32] weights.
  cospi = cospi_arr(cos_bit);
  for (int k = 0; k < 16; k += 4) {
    const int32_t *s = output + k;
    int32_t *d = scratch + k;
    d[0] = s[0];
    d[1] = s[1];
    d[2] = half_btf(cospi[32], s[2], cospi[32], s[3], cos_bit);
    d[3] = half_btf(cospi[32], s[2], -cospi[32], s[3], cos_bit);
  }
  av1_range_check_buf(2, input, scratch, n, stage_range[2]);

  // stage 3: scratch -> output, add/sub between elements 2 apart.
  fadst16_add_sub(scratch, output, 2);
  av1_range_check_buf(3, input, output, n, stage_range[3]);

  // stage 4: output -> scratch. In every group of eight, the first four
  // values pass through and the last four are combined with cospi[16] and
  // cospi[48] weights.
  cospi = cospi_arr(cos_bit);
  for (int k = 0; k < 16; k += 8) {
    const int32_t *s = output + k;
    int32_t *d = scratch + k;
    for (int j = 0; j < 4; ++j) d[j] = s[j];
    d[4] = half_btf(cospi[16], s[4], cospi[48], s[5], cos_bit);
    d[5] = half_btf(cospi[48], s[4], -cospi[16], s[5], cos_bit);
    d[6] = half_btf(-cospi[48], s[6], cospi[16], s[7], cos_bit);
    d[7] = half_btf(cospi[16], s[6], cospi[48], s[7], cos_bit);
  }
  av1_range_check_buf(4, input, scratch, n, stage_range[4]);

  // stage 5: scratch -> output, add/sub between elements 4 apart.
  fadst16_add_sub(scratch, output, 4);
  av1_range_check_buf(5, input, output, n, stage_range[5]);

  // stage 6: output -> scratch. The lower eight values pass through; the
  // upper eight are combined pairwise.
  cospi = cospi_arr(cos_bit);
  for (int j = 0; j < 8; ++j) scratch[j] = output[j];
  {
    const int32_t *s = output;
    int32_t *d = scratch;
    d[8] = half_btf(cospi[8], s[8], cospi[56], s[9], cos_bit);
    d[9] = half_btf(cospi[56], s[8], -cospi[8], s[9], cos_bit);
    d[10] = half_btf(cospi[40], s[10], cospi[24], s[11], cos_bit);
    d[11] = half_btf(cospi[24], s[10], -cospi[40], s[11], cos_bit);
    d[12] = half_btf(-cospi[56], s[12], cospi[8], s[13], cos_bit);
    d[13] = half_btf(cospi[8], s[12], cospi[56], s[13], cos_bit);
    d[14] = half_btf(-cospi[24], s[14], cospi[40], s[15], cos_bit);
    d[15] = half_btf(cospi[40], s[14], cospi[24], s[15], cos_bit);
  }
  av1_range_check_buf(6, input, scratch, n, stage_range[6]);

  // stage 7: scratch -> output, add/sub between elements 8 apart.
  fadst16_add_sub(scratch, output, 8);
  av1_range_check_buf(7, input, output, n, stage_range[7]);

  // stage 8: output -> scratch. Adjacent pair i uses the weight indices
  // a = 8 * i + 2 (2, 10, ..., 58) and b = 64 - a (62, 54, ..., 6).
  cospi = cospi_arr(cos_bit);
  for (int i = 0; i < 8; ++i) {
    const int a = 8 * i + 2;
    const int b = 64 - a;
    const int32_t x = output[2 * i];
    const int32_t y = output[2 * i + 1];
    scratch[2 * i] = half_btf(cospi[a], x, cospi[b], y, cos_bit);
    scratch[2 * i + 1] = half_btf(cospi[b], x, -cospi[a], y, cos_bit);
  }
  av1_range_check_buf(8, input, scratch, n, stage_range[8]);

  // stage 9: scratch -> output, final reordering. Even outputs take the odd
  // scratch entries in ascending order; odd outputs take the even scratch
  // entries in descending order.
  for (int i = 0; i < 8; ++i) {
    output[2 * i] = scratch[2 * i + 1];
    output[2 * i + 1] = scratch[14 - 2 * i];
  }
  av1_range_check_buf(9, input, output, n, stage_range[9]);
}
1063 
av1_fidentity4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1064 void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1065                       const int8_t *stage_range) {
1066   (void)cos_bit;
1067   for (int i = 0; i < 4; ++i)
1068     output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
1069   assert(stage_range[0] + NewSqrt2Bits <= 32);
1070   av1_range_check_buf(0, input, output, 4, stage_range[0]);
1071 }
1072 
// 8-point forward identity transform: every output element is twice the
// corresponding input element. cos_bit is unused, and only stage_range[0] is
// read (for the range check on the result).
void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  enum { kLen = 8 };
  (void)cos_bit;
  int idx = 0;
  while (idx < kLen) {
    output[idx] = 2 * input[idx];
    ++idx;
  }
  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
}
1079 
av1_fidentity16_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1080 void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1081                        const int8_t *stage_range) {
1082   (void)cos_bit;
1083   for (int i = 0; i < 16; ++i)
1084     output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
1085   assert(stage_range[0] + NewSqrt2Bits <= 32);
1086   av1_range_check_buf(0, input, output, 16, stage_range[0]);
1087 }
1088 
// 32-point forward identity transform: every output element is four times
// the corresponding input element. cos_bit is unused, and only stage_range[0]
// is read (for the range check on the result).
void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  enum { kLen = 32 };
  (void)cos_bit;
  int idx = 0;
  while (idx < kLen) {
    output[idx] = 4 * input[idx];
    ++idx;
  }
  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
}
1095 
av1_fdct64_new(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1096 void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
1097                     const int8_t *stage_range) {
1098   const int32_t size = 64;
1099   const int32_t *cospi;
1100 
1101   int32_t stage = 0;
1102   int32_t *bf0, *bf1;
1103   int32_t step[64];
1104 
1105   // stage 0;
1106   av1_range_check_buf(stage, input, input, size, stage_range[stage]);
1107 
1108   // stage 1;
1109   stage++;
1110   bf1 = output;
1111   bf1[0] = input[0] + input[63];
1112   bf1[1] = input[1] + input[62];
1113   bf1[2] = input[2] + input[61];
1114   bf1[3] = input[3] + input[60];
1115   bf1[4] = input[4] + input[59];
1116   bf1[5] = input[5] + input[58];
1117   bf1[6] = input[6] + input[57];
1118   bf1[7] = input[7] + input[56];
1119   bf1[8] = input[8] + input[55];
1120   bf1[9] = input[9] + input[54];
1121   bf1[10] = input[10] + input[53];
1122   bf1[11] = input[11] + input[52];
1123   bf1[12] = input[12] + input[51];
1124   bf1[13] = input[13] + input[50];
1125   bf1[14] = input[14] + input[49];
1126   bf1[15] = input[15] + input[48];
1127   bf1[16] = input[16] + input[47];
1128   bf1[17] = input[17] + input[46];
1129   bf1[18] = input[18] + input[45];
1130   bf1[19] = input[19] + input[44];
1131   bf1[20] = input[20] + input[43];
1132   bf1[21] = input[21] + input[42];
1133   bf1[22] = input[22] + input[41];
1134   bf1[23] = input[23] + input[40];
1135   bf1[24] = input[24] + input[39];
1136   bf1[25] = input[25] + input[38];
1137   bf1[26] = input[26] + input[37];
1138   bf1[27] = input[27] + input[36];
1139   bf1[28] = input[28] + input[35];
1140   bf1[29] = input[29] + input[34];
1141   bf1[30] = input[30] + input[33];
1142   bf1[31] = input[31] + input[32];
1143   bf1[32] = -input[32] + input[31];
1144   bf1[33] = -input[33] + input[30];
1145   bf1[34] = -input[34] + input[29];
1146   bf1[35] = -input[35] + input[28];
1147   bf1[36] = -input[36] + input[27];
1148   bf1[37] = -input[37] + input[26];
1149   bf1[38] = -input[38] + input[25];
1150   bf1[39] = -input[39] + input[24];
1151   bf1[40] = -input[40] + input[23];
1152   bf1[41] = -input[41] + input[22];
1153   bf1[42] = -input[42] + input[21];
1154   bf1[43] = -input[43] + input[20];
1155   bf1[44] = -input[44] + input[19];
1156   bf1[45] = -input[45] + input[18];
1157   bf1[46] = -input[46] + input[17];
1158   bf1[47] = -input[47] + input[16];
1159   bf1[48] = -input[48] + input[15];
1160   bf1[49] = -input[49] + input[14];
1161   bf1[50] = -input[50] + input[13];
1162   bf1[51] = -input[51] + input[12];
1163   bf1[52] = -input[52] + input[11];
1164   bf1[53] = -input[53] + input[10];
1165   bf1[54] = -input[54] + input[9];
1166   bf1[55] = -input[55] + input[8];
1167   bf1[56] = -input[56] + input[7];
1168   bf1[57] = -input[57] + input[6];
1169   bf1[58] = -input[58] + input[5];
1170   bf1[59] = -input[59] + input[4];
1171   bf1[60] = -input[60] + input[3];
1172   bf1[61] = -input[61] + input[2];
1173   bf1[62] = -input[62] + input[1];
1174   bf1[63] = -input[63] + input[0];
1175   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1176 
1177   // stage 2
1178   stage++;
1179   cospi = cospi_arr(cos_bit);
1180   bf0 = output;
1181   bf1 = step;
1182   bf1[0] = bf0[0] + bf0[31];
1183   bf1[1] = bf0[1] + bf0[30];
1184   bf1[2] = bf0[2] + bf0[29];
1185   bf1[3] = bf0[3] + bf0[28];
1186   bf1[4] = bf0[4] + bf0[27];
1187   bf1[5] = bf0[5] + bf0[26];
1188   bf1[6] = bf0[6] + bf0[25];
1189   bf1[7] = bf0[7] + bf0[24];
1190   bf1[8] = bf0[8] + bf0[23];
1191   bf1[9] = bf0[9] + bf0[22];
1192   bf1[10] = bf0[10] + bf0[21];
1193   bf1[11] = bf0[11] + bf0[20];
1194   bf1[12] = bf0[12] + bf0[19];
1195   bf1[13] = bf0[13] + bf0[18];
1196   bf1[14] = bf0[14] + bf0[17];
1197   bf1[15] = bf0[15] + bf0[16];
1198   bf1[16] = -bf0[16] + bf0[15];
1199   bf1[17] = -bf0[17] + bf0[14];
1200   bf1[18] = -bf0[18] + bf0[13];
1201   bf1[19] = -bf0[19] + bf0[12];
1202   bf1[20] = -bf0[20] + bf0[11];
1203   bf1[21] = -bf0[21] + bf0[10];
1204   bf1[22] = -bf0[22] + bf0[9];
1205   bf1[23] = -bf0[23] + bf0[8];
1206   bf1[24] = -bf0[24] + bf0[7];
1207   bf1[25] = -bf0[25] + bf0[6];
1208   bf1[26] = -bf0[26] + bf0[5];
1209   bf1[27] = -bf0[27] + bf0[4];
1210   bf1[28] = -bf0[28] + bf0[3];
1211   bf1[29] = -bf0[29] + bf0[2];
1212   bf1[30] = -bf0[30] + bf0[1];
1213   bf1[31] = -bf0[31] + bf0[0];
1214   bf1[32] = bf0[32];
1215   bf1[33] = bf0[33];
1216   bf1[34] = bf0[34];
1217   bf1[35] = bf0[35];
1218   bf1[36] = bf0[36];
1219   bf1[37] = bf0[37];
1220   bf1[38] = bf0[38];
1221   bf1[39] = bf0[39];
1222   bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1223   bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1224   bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1225   bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1226   bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1227   bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1228   bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1229   bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1230   bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
1231   bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
1232   bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
1233   bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
1234   bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
1235   bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
1236   bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
1237   bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
1238   bf1[56] = bf0[56];
1239   bf1[57] = bf0[57];
1240   bf1[58] = bf0[58];
1241   bf1[59] = bf0[59];
1242   bf1[60] = bf0[60];
1243   bf1[61] = bf0[61];
1244   bf1[62] = bf0[62];
1245   bf1[63] = bf0[63];
1246   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1247 
1248   // stage 3
1249   stage++;
1250   cospi = cospi_arr(cos_bit);
1251   bf0 = step;
1252   bf1 = output;
1253   bf1[0] = bf0[0] + bf0[15];
1254   bf1[1] = bf0[1] + bf0[14];
1255   bf1[2] = bf0[2] + bf0[13];
1256   bf1[3] = bf0[3] + bf0[12];
1257   bf1[4] = bf0[4] + bf0[11];
1258   bf1[5] = bf0[5] + bf0[10];
1259   bf1[6] = bf0[6] + bf0[9];
1260   bf1[7] = bf0[7] + bf0[8];
1261   bf1[8] = -bf0[8] + bf0[7];
1262   bf1[9] = -bf0[9] + bf0[6];
1263   bf1[10] = -bf0[10] + bf0[5];
1264   bf1[11] = -bf0[11] + bf0[4];
1265   bf1[12] = -bf0[12] + bf0[3];
1266   bf1[13] = -bf0[13] + bf0[2];
1267   bf1[14] = -bf0[14] + bf0[1];
1268   bf1[15] = -bf0[15] + bf0[0];
1269   bf1[16] = bf0[16];
1270   bf1[17] = bf0[17];
1271   bf1[18] = bf0[18];
1272   bf1[19] = bf0[19];
1273   bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1274   bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1275   bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1276   bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1277   bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
1278   bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
1279   bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
1280   bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
1281   bf1[28] = bf0[28];
1282   bf1[29] = bf0[29];
1283   bf1[30] = bf0[30];
1284   bf1[31] = bf0[31];
1285   bf1[32] = bf0[32] + bf0[47];
1286   bf1[33] = bf0[33] + bf0[46];
1287   bf1[34] = bf0[34] + bf0[45];
1288   bf1[35] = bf0[35] + bf0[44];
1289   bf1[36] = bf0[36] + bf0[43];
1290   bf1[37] = bf0[37] + bf0[42];
1291   bf1[38] = bf0[38] + bf0[41];
1292   bf1[39] = bf0[39] + bf0[40];
1293   bf1[40] = -bf0[40] + bf0[39];
1294   bf1[41] = -bf0[41] + bf0[38];
1295   bf1[42] = -bf0[42] + bf0[37];
1296   bf1[43] = -bf0[43] + bf0[36];
1297   bf1[44] = -bf0[44] + bf0[35];
1298   bf1[45] = -bf0[45] + bf0[34];
1299   bf1[46] = -bf0[46] + bf0[33];
1300   bf1[47] = -bf0[47] + bf0[32];
1301   bf1[48] = -bf0[48] + bf0[63];
1302   bf1[49] = -bf0[49] + bf0[62];
1303   bf1[50] = -bf0[50] + bf0[61];
1304   bf1[51] = -bf0[51] + bf0[60];
1305   bf1[52] = -bf0[52] + bf0[59];
1306   bf1[53] = -bf0[53] + bf0[58];
1307   bf1[54] = -bf0[54] + bf0[57];
1308   bf1[55] = -bf0[55] + bf0[56];
1309   bf1[56] = bf0[56] + bf0[55];
1310   bf1[57] = bf0[57] + bf0[54];
1311   bf1[58] = bf0[58] + bf0[53];
1312   bf1[59] = bf0[59] + bf0[52];
1313   bf1[60] = bf0[60] + bf0[51];
1314   bf1[61] = bf0[61] + bf0[50];
1315   bf1[62] = bf0[62] + bf0[49];
1316   bf1[63] = bf0[63] + bf0[48];
1317   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1318 
1319   // stage 4
1320   stage++;
1321   cospi = cospi_arr(cos_bit);
1322   bf0 = output;
1323   bf1 = step;
1324   bf1[0] = bf0[0] + bf0[7];
1325   bf1[1] = bf0[1] + bf0[6];
1326   bf1[2] = bf0[2] + bf0[5];
1327   bf1[3] = bf0[3] + bf0[4];
1328   bf1[4] = -bf0[4] + bf0[3];
1329   bf1[5] = -bf0[5] + bf0[2];
1330   bf1[6] = -bf0[6] + bf0[1];
1331   bf1[7] = -bf0[7] + bf0[0];
1332   bf1[8] = bf0[8];
1333   bf1[9] = bf0[9];
1334   bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1335   bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1336   bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
1337   bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
1338   bf1[14] = bf0[14];
1339   bf1[15] = bf0[15];
1340   bf1[16] = bf0[16] + bf0[23];
1341   bf1[17] = bf0[17] + bf0[22];
1342   bf1[18] = bf0[18] + bf0[21];
1343   bf1[19] = bf0[19] + bf0[20];
1344   bf1[20] = -bf0[20] + bf0[19];
1345   bf1[21] = -bf0[21] + bf0[18];
1346   bf1[22] = -bf0[22] + bf0[17];
1347   bf1[23] = -bf0[23] + bf0[16];
1348   bf1[24] = -bf0[24] + bf0[31];
1349   bf1[25] = -bf0[25] + bf0[30];
1350   bf1[26] = -bf0[26] + bf0[29];
1351   bf1[27] = -bf0[27] + bf0[28];
1352   bf1[28] = bf0[28] + bf0[27];
1353   bf1[29] = bf0[29] + bf0[26];
1354   bf1[30] = bf0[30] + bf0[25];
1355   bf1[31] = bf0[31] + bf0[24];
1356   bf1[32] = bf0[32];
1357   bf1[33] = bf0[33];
1358   bf1[34] = bf0[34];
1359   bf1[35] = bf0[35];
1360   bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1361   bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1362   bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1363   bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1364   bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1365   bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1366   bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1367   bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1368   bf1[44] = bf0[44];
1369   bf1[45] = bf0[45];
1370   bf1[46] = bf0[46];
1371   bf1[47] = bf0[47];
1372   bf1[48] = bf0[48];
1373   bf1[49] = bf0[49];
1374   bf1[50] = bf0[50];
1375   bf1[51] = bf0[51];
1376   bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
1377   bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
1378   bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
1379   bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
1380   bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
1381   bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
1382   bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
1383   bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
1384   bf1[60] = bf0[60];
1385   bf1[61] = bf0[61];
1386   bf1[62] = bf0[62];
1387   bf1[63] = bf0[63];
1388   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1389 
1390   // stage 5
1391   stage++;
1392   cospi = cospi_arr(cos_bit);
1393   bf0 = step;
1394   bf1 = output;
1395   bf1[0] = bf0[0] + bf0[3];
1396   bf1[1] = bf0[1] + bf0[2];
1397   bf1[2] = -bf0[2] + bf0[1];
1398   bf1[3] = -bf0[3] + bf0[0];
1399   bf1[4] = bf0[4];
1400   bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1401   bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
1402   bf1[7] = bf0[7];
1403   bf1[8] = bf0[8] + bf0[11];
1404   bf1[9] = bf0[9] + bf0[10];
1405   bf1[10] = -bf0[10] + bf0[9];
1406   bf1[11] = -bf0[11] + bf0[8];
1407   bf1[12] = -bf0[12] + bf0[15];
1408   bf1[13] = -bf0[13] + bf0[14];
1409   bf1[14] = bf0[14] + bf0[13];
1410   bf1[15] = bf0[15] + bf0[12];
1411   bf1[16] = bf0[16];
1412   bf1[17] = bf0[17];
1413   bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1414   bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1415   bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1416   bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1417   bf1[22] = bf0[22];
1418   bf1[23] = bf0[23];
1419   bf1[24] = bf0[24];
1420   bf1[25] = bf0[25];
1421   bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
1422   bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
1423   bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
1424   bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
1425   bf1[30] = bf0[30];
1426   bf1[31] = bf0[31];
1427   bf1[32] = bf0[32] + bf0[39];
1428   bf1[33] = bf0[33] + bf0[38];
1429   bf1[34] = bf0[34] + bf0[37];
1430   bf1[35] = bf0[35] + bf0[36];
1431   bf1[36] = -bf0[36] + bf0[35];
1432   bf1[37] = -bf0[37] + bf0[34];
1433   bf1[38] = -bf0[38] + bf0[33];
1434   bf1[39] = -bf0[39] + bf0[32];
1435   bf1[40] = -bf0[40] + bf0[47];
1436   bf1[41] = -bf0[41] + bf0[46];
1437   bf1[42] = -bf0[42] + bf0[45];
1438   bf1[43] = -bf0[43] + bf0[44];
1439   bf1[44] = bf0[44] + bf0[43];
1440   bf1[45] = bf0[45] + bf0[42];
1441   bf1[46] = bf0[46] + bf0[41];
1442   bf1[47] = bf0[47] + bf0[40];
1443   bf1[48] = bf0[48] + bf0[55];
1444   bf1[49] = bf0[49] + bf0[54];
1445   bf1[50] = bf0[50] + bf0[53];
1446   bf1[51] = bf0[51] + bf0[52];
1447   bf1[52] = -bf0[52] + bf0[51];
1448   bf1[53] = -bf0[53] + bf0[50];
1449   bf1[54] = -bf0[54] + bf0[49];
1450   bf1[55] = -bf0[55] + bf0[48];
1451   bf1[56] = -bf0[56] + bf0[63];
1452   bf1[57] = -bf0[57] + bf0[62];
1453   bf1[58] = -bf0[58] + bf0[61];
1454   bf1[59] = -bf0[59] + bf0[60];
1455   bf1[60] = bf0[60] + bf0[59];
1456   bf1[61] = bf0[61] + bf0[58];
1457   bf1[62] = bf0[62] + bf0[57];
1458   bf1[63] = bf0[63] + bf0[56];
1459   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1460 
1461   // stage 6
1462   stage++;
1463   cospi = cospi_arr(cos_bit);
1464   bf0 = output;
1465   bf1 = step;
1466   bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1467   bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1468   bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1469   bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1470   bf1[4] = bf0[4] + bf0[5];
1471   bf1[5] = -bf0[5] + bf0[4];
1472   bf1[6] = -bf0[6] + bf0[7];
1473   bf1[7] = bf0[7] + bf0[6];
1474   bf1[8] = bf0[8];
1475   bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1476   bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1477   bf1[11] = bf0[11];
1478   bf1[12] = bf0[12];
1479   bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1480   bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1481   bf1[15] = bf0[15];
1482   bf1[16] = bf0[16] + bf0[19];
1483   bf1[17] = bf0[17] + bf0[18];
1484   bf1[18] = -bf0[18] + bf0[17];
1485   bf1[19] = -bf0[19] + bf0[16];
1486   bf1[20] = -bf0[20] + bf0[23];
1487   bf1[21] = -bf0[21] + bf0[22];
1488   bf1[22] = bf0[22] + bf0[21];
1489   bf1[23] = bf0[23] + bf0[20];
1490   bf1[24] = bf0[24] + bf0[27];
1491   bf1[25] = bf0[25] + bf0[26];
1492   bf1[26] = -bf0[26] + bf0[25];
1493   bf1[27] = -bf0[27] + bf0[24];
1494   bf1[28] = -bf0[28] + bf0[31];
1495   bf1[29] = -bf0[29] + bf0[30];
1496   bf1[30] = bf0[30] + bf0[29];
1497   bf1[31] = bf0[31] + bf0[28];
1498   bf1[32] = bf0[32];
1499   bf1[33] = bf0[33];
1500   bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1501   bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1502   bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1503   bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1504   bf1[38] = bf0[38];
1505   bf1[39] = bf0[39];
1506   bf1[40] = bf0[40];
1507   bf1[41] = bf0[41];
1508   bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1509   bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1510   bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1511   bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1512   bf1[46] = bf0[46];
1513   bf1[47] = bf0[47];
1514   bf1[48] = bf0[48];
1515   bf1[49] = bf0[49];
1516   bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1517   bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1518   bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1519   bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1520   bf1[54] = bf0[54];
1521   bf1[55] = bf0[55];
1522   bf1[56] = bf0[56];
1523   bf1[57] = bf0[57];
1524   bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1525   bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1526   bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1527   bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1528   bf1[62] = bf0[62];
1529   bf1[63] = bf0[63];
1530   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1531 
1532   // stage 7
1533   stage++;
1534   cospi = cospi_arr(cos_bit);
1535   bf0 = step;
1536   bf1 = output;
1537   bf1[0] = bf0[0];
1538   bf1[1] = bf0[1];
1539   bf1[2] = bf0[2];
1540   bf1[3] = bf0[3];
1541   bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1542   bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1543   bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1544   bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1545   bf1[8] = bf0[8] + bf0[9];
1546   bf1[9] = -bf0[9] + bf0[8];
1547   bf1[10] = -bf0[10] + bf0[11];
1548   bf1[11] = bf0[11] + bf0[10];
1549   bf1[12] = bf0[12] + bf0[13];
1550   bf1[13] = -bf0[13] + bf0[12];
1551   bf1[14] = -bf0[14] + bf0[15];
1552   bf1[15] = bf0[15] + bf0[14];
1553   bf1[16] = bf0[16];
1554   bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1555   bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1556   bf1[19] = bf0[19];
1557   bf1[20] = bf0[20];
1558   bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1559   bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1560   bf1[23] = bf0[23];
1561   bf1[24] = bf0[24];
1562   bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1563   bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1564   bf1[27] = bf0[27];
1565   bf1[28] = bf0[28];
1566   bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1567   bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1568   bf1[31] = bf0[31];
1569   bf1[32] = bf0[32] + bf0[35];
1570   bf1[33] = bf0[33] + bf0[34];
1571   bf1[34] = -bf0[34] + bf0[33];
1572   bf1[35] = -bf0[35] + bf0[32];
1573   bf1[36] = -bf0[36] + bf0[39];
1574   bf1[37] = -bf0[37] + bf0[38];
1575   bf1[38] = bf0[38] + bf0[37];
1576   bf1[39] = bf0[39] + bf0[36];
1577   bf1[40] = bf0[40] + bf0[43];
1578   bf1[41] = bf0[41] + bf0[42];
1579   bf1[42] = -bf0[42] + bf0[41];
1580   bf1[43] = -bf0[43] + bf0[40];
1581   bf1[44] = -bf0[44] + bf0[47];
1582   bf1[45] = -bf0[45] + bf0[46];
1583   bf1[46] = bf0[46] + bf0[45];
1584   bf1[47] = bf0[47] + bf0[44];
1585   bf1[48] = bf0[48] + bf0[51];
1586   bf1[49] = bf0[49] + bf0[50];
1587   bf1[50] = -bf0[50] + bf0[49];
1588   bf1[51] = -bf0[51] + bf0[48];
1589   bf1[52] = -bf0[52] + bf0[55];
1590   bf1[53] = -bf0[53] + bf0[54];
1591   bf1[54] = bf0[54] + bf0[53];
1592   bf1[55] = bf0[55] + bf0[52];
1593   bf1[56] = bf0[56] + bf0[59];
1594   bf1[57] = bf0[57] + bf0[58];
1595   bf1[58] = -bf0[58] + bf0[57];
1596   bf1[59] = -bf0[59] + bf0[56];
1597   bf1[60] = -bf0[60] + bf0[63];
1598   bf1[61] = -bf0[61] + bf0[62];
1599   bf1[62] = bf0[62] + bf0[61];
1600   bf1[63] = bf0[63] + bf0[60];
1601   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1602 
1603   // stage 8
1604   stage++;
1605   cospi = cospi_arr(cos_bit);
1606   bf0 = output;
1607   bf1 = step;
1608   bf1[0] = bf0[0];
1609   bf1[1] = bf0[1];
1610   bf1[2] = bf0[2];
1611   bf1[3] = bf0[3];
1612   bf1[4] = bf0[4];
1613   bf1[5] = bf0[5];
1614   bf1[6] = bf0[6];
1615   bf1[7] = bf0[7];
1616   bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1617   bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1618   bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1619   bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1620   bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1621   bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1622   bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1623   bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1624   bf1[16] = bf0[16] + bf0[17];
1625   bf1[17] = -bf0[17] + bf0[16];
1626   bf1[18] = -bf0[18] + bf0[19];
1627   bf1[19] = bf0[19] + bf0[18];
1628   bf1[20] = bf0[20] + bf0[21];
1629   bf1[21] = -bf0[21] + bf0[20];
1630   bf1[22] = -bf0[22] + bf0[23];
1631   bf1[23] = bf0[23] + bf0[22];
1632   bf1[24] = bf0[24] + bf0[25];
1633   bf1[25] = -bf0[25] + bf0[24];
1634   bf1[26] = -bf0[26] + bf0[27];
1635   bf1[27] = bf0[27] + bf0[26];
1636   bf1[28] = bf0[28] + bf0[29];
1637   bf1[29] = -bf0[29] + bf0[28];
1638   bf1[30] = -bf0[30] + bf0[31];
1639   bf1[31] = bf0[31] + bf0[30];
1640   bf1[32] = bf0[32];
1641   bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1642   bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1643   bf1[35] = bf0[35];
1644   bf1[36] = bf0[36];
1645   bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1646   bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1647   bf1[39] = bf0[39];
1648   bf1[40] = bf0[40];
1649   bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1650   bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1651   bf1[43] = bf0[43];
1652   bf1[44] = bf0[44];
1653   bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1654   bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1655   bf1[47] = bf0[47];
1656   bf1[48] = bf0[48];
1657   bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1658   bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1659   bf1[51] = bf0[51];
1660   bf1[52] = bf0[52];
1661   bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1662   bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1663   bf1[55] = bf0[55];
1664   bf1[56] = bf0[56];
1665   bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1666   bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1667   bf1[59] = bf0[59];
1668   bf1[60] = bf0[60];
1669   bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1670   bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1671   bf1[63] = bf0[63];
1672   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1673 
1674   // stage 9
1675   stage++;
1676   cospi = cospi_arr(cos_bit);
1677   bf0 = step;
1678   bf1 = output;
1679   bf1[0] = bf0[0];
1680   bf1[1] = bf0[1];
1681   bf1[2] = bf0[2];
1682   bf1[3] = bf0[3];
1683   bf1[4] = bf0[4];
1684   bf1[5] = bf0[5];
1685   bf1[6] = bf0[6];
1686   bf1[7] = bf0[7];
1687   bf1[8] = bf0[8];
1688   bf1[9] = bf0[9];
1689   bf1[10] = bf0[10];
1690   bf1[11] = bf0[11];
1691   bf1[12] = bf0[12];
1692   bf1[13] = bf0[13];
1693   bf1[14] = bf0[14];
1694   bf1[15] = bf0[15];
1695   bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1696   bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1697   bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1698   bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1699   bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1700   bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1701   bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1702   bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1703   bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1704   bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1705   bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1706   bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1707   bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1708   bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1709   bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1710   bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1711   bf1[32] = bf0[32] + bf0[33];
1712   bf1[33] = -bf0[33] + bf0[32];
1713   bf1[34] = -bf0[34] + bf0[35];
1714   bf1[35] = bf0[35] + bf0[34];
1715   bf1[36] = bf0[36] + bf0[37];
1716   bf1[37] = -bf0[37] + bf0[36];
1717   bf1[38] = -bf0[38] + bf0[39];
1718   bf1[39] = bf0[39] + bf0[38];
1719   bf1[40] = bf0[40] + bf0[41];
1720   bf1[41] = -bf0[41] + bf0[40];
1721   bf1[42] = -bf0[42] + bf0[43];
1722   bf1[43] = bf0[43] + bf0[42];
1723   bf1[44] = bf0[44] + bf0[45];
1724   bf1[45] = -bf0[45] + bf0[44];
1725   bf1[46] = -bf0[46] + bf0[47];
1726   bf1[47] = bf0[47] + bf0[46];
1727   bf1[48] = bf0[48] + bf0[49];
1728   bf1[49] = -bf0[49] + bf0[48];
1729   bf1[50] = -bf0[50] + bf0[51];
1730   bf1[51] = bf0[51] + bf0[50];
1731   bf1[52] = bf0[52] + bf0[53];
1732   bf1[53] = -bf0[53] + bf0[52];
1733   bf1[54] = -bf0[54] + bf0[55];
1734   bf1[55] = bf0[55] + bf0[54];
1735   bf1[56] = bf0[56] + bf0[57];
1736   bf1[57] = -bf0[57] + bf0[56];
1737   bf1[58] = -bf0[58] + bf0[59];
1738   bf1[59] = bf0[59] + bf0[58];
1739   bf1[60] = bf0[60] + bf0[61];
1740   bf1[61] = -bf0[61] + bf0[60];
1741   bf1[62] = -bf0[62] + bf0[63];
1742   bf1[63] = bf0[63] + bf0[62];
1743   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1744 
1745   // stage 10
1746   stage++;
1747   cospi = cospi_arr(cos_bit);
1748   bf0 = output;
1749   bf1 = step;
1750   bf1[0] = bf0[0];
1751   bf1[1] = bf0[1];
1752   bf1[2] = bf0[2];
1753   bf1[3] = bf0[3];
1754   bf1[4] = bf0[4];
1755   bf1[5] = bf0[5];
1756   bf1[6] = bf0[6];
1757   bf1[7] = bf0[7];
1758   bf1[8] = bf0[8];
1759   bf1[9] = bf0[9];
1760   bf1[10] = bf0[10];
1761   bf1[11] = bf0[11];
1762   bf1[12] = bf0[12];
1763   bf1[13] = bf0[13];
1764   bf1[14] = bf0[14];
1765   bf1[15] = bf0[15];
1766   bf1[16] = bf0[16];
1767   bf1[17] = bf0[17];
1768   bf1[18] = bf0[18];
1769   bf1[19] = bf0[19];
1770   bf1[20] = bf0[20];
1771   bf1[21] = bf0[21];
1772   bf1[22] = bf0[22];
1773   bf1[23] = bf0[23];
1774   bf1[24] = bf0[24];
1775   bf1[25] = bf0[25];
1776   bf1[26] = bf0[26];
1777   bf1[27] = bf0[27];
1778   bf1[28] = bf0[28];
1779   bf1[29] = bf0[29];
1780   bf1[30] = bf0[30];
1781   bf1[31] = bf0[31];
1782   bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
1783   bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
1784   bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
1785   bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
1786   bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
1787   bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
1788   bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
1789   bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
1790   bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
1791   bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
1792   bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
1793   bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
1794   bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
1795   bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
1796   bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
1797   bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
1798   bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
1799   bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
1800   bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
1801   bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
1802   bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
1803   bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
1804   bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
1805   bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
1806   bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
1807   bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
1808   bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
1809   bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
1810   bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
1811   bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
1812   bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
1813   bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
1814   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1815 
1816   // stage 11
1817   stage++;
1818   bf0 = step;
1819   bf1 = output;
1820   bf1[0] = bf0[0];
1821   bf1[1] = bf0[32];
1822   bf1[2] = bf0[16];
1823   bf1[3] = bf0[48];
1824   bf1[4] = bf0[8];
1825   bf1[5] = bf0[40];
1826   bf1[6] = bf0[24];
1827   bf1[7] = bf0[56];
1828   bf1[8] = bf0[4];
1829   bf1[9] = bf0[36];
1830   bf1[10] = bf0[20];
1831   bf1[11] = bf0[52];
1832   bf1[12] = bf0[12];
1833   bf1[13] = bf0[44];
1834   bf1[14] = bf0[28];
1835   bf1[15] = bf0[60];
1836   bf1[16] = bf0[2];
1837   bf1[17] = bf0[34];
1838   bf1[18] = bf0[18];
1839   bf1[19] = bf0[50];
1840   bf1[20] = bf0[10];
1841   bf1[21] = bf0[42];
1842   bf1[22] = bf0[26];
1843   bf1[23] = bf0[58];
1844   bf1[24] = bf0[6];
1845   bf1[25] = bf0[38];
1846   bf1[26] = bf0[22];
1847   bf1[27] = bf0[54];
1848   bf1[28] = bf0[14];
1849   bf1[29] = bf0[46];
1850   bf1[30] = bf0[30];
1851   bf1[31] = bf0[62];
1852   bf1[32] = bf0[1];
1853   bf1[33] = bf0[33];
1854   bf1[34] = bf0[17];
1855   bf1[35] = bf0[49];
1856   bf1[36] = bf0[9];
1857   bf1[37] = bf0[41];
1858   bf1[38] = bf0[25];
1859   bf1[39] = bf0[57];
1860   bf1[40] = bf0[5];
1861   bf1[41] = bf0[37];
1862   bf1[42] = bf0[21];
1863   bf1[43] = bf0[53];
1864   bf1[44] = bf0[13];
1865   bf1[45] = bf0[45];
1866   bf1[46] = bf0[29];
1867   bf1[47] = bf0[61];
1868   bf1[48] = bf0[3];
1869   bf1[49] = bf0[35];
1870   bf1[50] = bf0[19];
1871   bf1[51] = bf0[51];
1872   bf1[52] = bf0[11];
1873   bf1[53] = bf0[43];
1874   bf1[54] = bf0[27];
1875   bf1[55] = bf0[59];
1876   bf1[56] = bf0[7];
1877   bf1[57] = bf0[39];
1878   bf1[58] = bf0[23];
1879   bf1[59] = bf0[55];
1880   bf1[60] = bf0[15];
1881   bf1[61] = bf0[47];
1882   bf1[62] = bf0[31];
1883   bf1[63] = bf0[63];
1884   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1885 }
1886