/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
#include <assert.h>
#include <stdlib.h>
#include "av1/common/av1_inv_txfm1d.h"
#include "av1/common/av1_txfm.h"
15 
// TODO(angiebird): Make the 1-D txfm functions static.
18 
// 4-point 1-D transform of the av1_idct*_new family (an inverse DCT, going by
// the function name).
//
//   input       - 4 source coefficients; must not alias `output` (asserted).
//   output      - receives the 4 results; also holds intermediate stage data.
//   cos_bit     - selects the table returned by cospi_arr() and is forwarded
//                 to every half_btf() call.
//   stage_range - per-stage range values indexed by stage number; entries
//                 1..3 are read here.
//
// Stages ping-pong between `output` and the local `step` buffer. Every stage
// except the last is handed to av1_range_check_buf(); the last stage's values
// only go through clamp_value().
void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                   const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 4;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[4];

  // stage 0;

  // stage 1: copy the input into `output` in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[2];
  bf1[2] = input[1];
  bf1[3] = input[3];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: half_btf() on pairs (0,1) and (2,3), output -> step.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: clamped sums/differences of mirrored elements, step -> output.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
}
59 
// 8-point 1-D transform of the av1_idct*_new family (an inverse DCT, going by
// the function name).
//
//   input       - 8 source coefficients; must not alias `output` (asserted).
//   output      - receives the 8 results; also holds intermediate stage data.
//   cos_bit     - selects the table returned by cospi_arr() and is forwarded
//                 to every half_btf() call.
//   stage_range - per-stage range values indexed by stage number; entries
//                 1..5 are read here.
//
// Stages ping-pong between `output` and the local `step` buffer. Every stage
// except the last is handed to av1_range_check_buf(); the last stage's values
// only go through clamp_value().
void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                   const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 8;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[8];

  // stage 0;

  // stage 1: copy the input into `output` in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[4];
  bf1[2] = input[2];
  bf1[3] = input[6];
  bf1[4] = input[1];
  bf1[5] = input[5];
  bf1[6] = input[3];
  bf1[7] = input[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: elements 0..3 pass through; 4..7 go through half_btf().
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: same operations on 0..3 as stages 2-3 of av1_idct4_new; 4..7 are
  // combined with clamped add/sub.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[7] = bf0[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: final clamped sums/differences of mirrored elements (i, 7 - i).
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
}
140 
// 16-point 1-D transform of the av1_idct*_new family (an inverse DCT, going
// by the function name).
//
//   input       - 16 source coefficients; must not alias `output` (asserted).
//   output      - receives the 16 results; also holds intermediate stage data.
//   cos_bit     - selects the table returned by cospi_arr() and is forwarded
//                 to every half_btf() call.
//   stage_range - per-stage range values indexed by stage number; entries
//                 1..7 are read here.
//
// Stages ping-pong between `output` and the local `step` buffer. Every stage
// except the last is handed to av1_range_check_buf(); the last stage's values
// only go through clamp_value().
void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 16;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[16];

  // stage 0;

  // stage 1: copy the input into `output` in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[8];
  bf1[2] = input[4];
  bf1[3] = input[12];
  bf1[4] = input[2];
  bf1[5] = input[10];
  bf1[6] = input[6];
  bf1[7] = input[14];
  bf1[8] = input[1];
  bf1[9] = input[9];
  bf1[10] = input[5];
  bf1[11] = input[13];
  bf1[12] = input[3];
  bf1[13] = input[11];
  bf1[14] = input[7];
  bf1[15] = input[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: elements 0..7 pass through; 8..15 go through half_btf().
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
  bf1[15] = bf0[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[7] = bf0[7];
  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: final clamped sums/differences of mirrored elements (i, 15 - i).
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
}
305 
// 32-point 1-D transform of the av1_idct*_new family (an inverse DCT, going
// by the function name).
//
//   input       - 32 source coefficients; must not alias `output` (asserted).
//   output      - receives the 32 results; also holds intermediate stage data.
//   cos_bit     - selects the table returned by cospi_arr() and is forwarded
//                 to every half_btf() call.
//   stage_range - per-stage range values indexed by stage number; entries
//                 1..9 are read here.
//
// Stages ping-pong between `output` and the local `step` buffer. Every stage
// except the last is handed to av1_range_check_buf(); the last stage's values
// only go through clamp_value().
void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 32;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[32];

  // stage 0;

  // stage 1: copy the input into `output` in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[16];
  bf1[2] = input[8];
  bf1[3] = input[24];
  bf1[4] = input[4];
  bf1[5] = input[20];
  bf1[6] = input[12];
  bf1[7] = input[28];
  bf1[8] = input[2];
  bf1[9] = input[18];
  bf1[10] = input[10];
  bf1[11] = input[26];
  bf1[12] = input[6];
  bf1[13] = input[22];
  bf1[14] = input[14];
  bf1[15] = input[30];
  bf1[16] = input[1];
  bf1[17] = input[17];
  bf1[18] = input[9];
  bf1[19] = input[25];
  bf1[20] = input[5];
  bf1[21] = input[21];
  bf1[22] = input[13];
  bf1[23] = input[29];
  bf1[24] = input[3];
  bf1[25] = input[19];
  bf1[26] = input[11];
  bf1[27] = input[27];
  bf1[28] = input[7];
  bf1[29] = input[23];
  bf1[30] = input[15];
  bf1[31] = input[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: elements 0..15 pass through; 16..31 go through half_btf().
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
  bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
  bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
  bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
  bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
  bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
  bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
  bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
  bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
  bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
  bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
  bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
  bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
  bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
  bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
  bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
  bf1[16] = bf0[16];
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
  bf1[19] = bf0[19];
  bf1[20] = bf0[20];
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
  bf1[23] = bf0[23];
  bf1[24] = bf0[24];
  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
  bf1[27] = bf0[27];
  bf1[28] = bf0[28];
  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
  bf1[15] = bf0[15];
  bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
  bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
  bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
  bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
  bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
  bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
  bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
  bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
  bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
  bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
  bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
  bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
  bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
  bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
  bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[7] = bf0[7];
  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
  bf1[22] = bf0[22];
  bf1[23] = bf0[23];
  bf1[24] = bf0[24];
  bf1[25] = bf0[25];
  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
  bf1[30] = bf0[30];
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
  bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
  bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
  bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
  bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
  bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
  bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
  bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
  bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
  bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
  bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
  bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
  bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
  bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
  bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 8
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = bf0[18];
  bf1[19] = bf0[19];
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
  bf1[28] = bf0[28];
  bf1[29] = bf0[29];
  bf1[30] = bf0[30];
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 9: final clamped sums/differences of mirrored elements (i, 31 - i).
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
  bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
  bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
  bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
  bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
  bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
  bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
}
658 
// 4-point 1-D transform built on the sinpi_arr() table (an inverse ADST, going
// by the function name).
//
//   input       - 4 source coefficients. All four are copied into locals
//                 before `output` is written.
//   output      - receives the 4 results.
//   cos_bit     - selects the table returned by sinpi_arr() and is the shift
//                 amount of the final round_shift().
//   stage_range - per-stage range values indexed by stage number; entries
//                 1..6 are read here. `bit` is added to the range for every
//                 value that still carries the sinpi scaling.
//
// An all-zero input writes zeros to `output` and returns before the table
// assertion and any arithmetic.
void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  int bit = cos_bit;
  const int32_t *sinpi = sinpi_arr(bit);
  int32_t s0, s1, s2, s3, s4, s5, s6, s7;

  int32_t x0 = input[0];
  int32_t x1 = input[1];
  int32_t x2 = input[2];
  int32_t x3 = input[3];

  // Early out: nothing to compute when every coefficient is zero.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  assert(sinpi[1] + sinpi[2] == sinpi[4]);

  // stage 1
  s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
  s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
  s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
  s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
  s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
  s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
  s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);

  // stage 2
  // NOTICE: (x0 - x2) here may use one extra bit compared to the
  // opt_range_row/col specified in av1_gen_inv_stage_range()
  s7 = range_check_value((x0 - x2) + x3, stage_range[2]);

  // stage 3
  s0 = range_check_value(s0 + s3, stage_range[3] + bit);
  s1 = range_check_value(s1 - s4, stage_range[3] + bit);
  // s3 takes over the old s2 (sinpi[3] * x1) before s2 is overwritten below.
  s3 = range_check_value(s2, stage_range[3] + bit);
  s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);

  // stage 4
  s0 = range_check_value(s0 + s5, stage_range[4] + bit);
  s1 = range_check_value(s1 - s6, stage_range[4] + bit);

  // stage 5
  x0 = range_check_value(s0 + s3, stage_range[5] + bit);
  x1 = range_check_value(s1 + s3, stage_range[5] + bit);
  x2 = range_check_value(s2, stage_range[5] + bit);
  x3 = range_check_value(s0 + s1, stage_range[5] + bit);

  // stage 6
  x3 = range_check_value(x3 - s3, stage_range[6] + bit);

  // Remove the sinpi scaling.
  output[0] = round_shift(x0, bit);
  output[1] = round_shift(x1, bit);
  output[2] = round_shift(x2, bit);
  output[3] = round_shift(x3, bit);
}
715 
// 8-point 1-D transform kernel (an inverse ADST, going by the function name).
// Reads 8 coefficients from `input` and leaves the 8 results in `output`; the
// two buffers must be distinct (asserted below). Intermediate values ping-pong
// between `output` and the local `step` array, swapping roles at every stage.
//   cos_bit     - selects the table returned by cospi_arr() and is forwarded
//                 to every half_btf() call.
//   stage_range - indexed by the stage number; the entry is passed to
//                 clamp_value() and av1_range_check_buf() for that stage.
// NOTE(review): half_btf() is defined elsewhere; presumably it returns a
// rounded weighted sum of its two inputs -- confirm against av1_txfm.h.
void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 8;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;  // bf0: source of the current stage, bf1: destination
  int32_t step[8];

  // stage 0: nothing to do, the input is used as-is.

  // stage 1: permute the input into output.
  stage++;
  bf1 = output;
  bf1[0] = input[7];
  bf1[1] = input[0];
  bf1[2] = input[5];
  bf1[3] = input[2];
  bf1[4] = input[3];
  bf1[5] = input[4];
  bf1[6] = input[1];
  bf1[7] = input[6];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: each adjacent pair (0,1), (2,3), (4,5), (6,7) is combined by two
  // half_btf() calls using a complementary pair of table entries.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: clamped sums (0..3) and differences (4..7) of elements 4 apart.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: elements 0..3 pass through; pairs (4,5) and (6,7) go through
  // half_btf() with the cospi[16]/cospi[48] entries.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: within each half, clamped sums and differences of elements 2
  // apart.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: in each group of four the first two elements pass through and
  // the last two are combined by half_btf() with cospi[32].
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: final permutation into output; every odd-indexed output is
  // negated. No range check is run after this stage.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[4];
  bf1[2] = bf0[6];
  bf1[3] = -bf0[2];
  bf1[4] = bf0[3];
  bf1[5] = -bf0[7];
  bf1[6] = bf0[5];
  bf1[7] = -bf0[1];
}
824 
// 16-point 1-D transform kernel (an inverse ADST, going by the function name).
// Reads 16 coefficients from `input` and leaves the 16 results in `output`;
// the two buffers must be distinct (asserted below). Intermediate values
// ping-pong between `output` and the local `step` array, swapping roles at
// every stage.
//   cos_bit     - selects the table returned by cospi_arr() and is forwarded
//                 to every half_btf() call.
//   stage_range - indexed by the stage number; the entry is passed to
//                 clamp_value() and av1_range_check_buf() for that stage.
// NOTE(review): half_btf() is defined elsewhere; presumably it returns a
// rounded weighted sum of its two inputs -- confirm against av1_txfm.h.
void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                     const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 16;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;  // bf0: source of the current stage, bf1: destination
  int32_t step[16];

  // stage 0: nothing to do, the input is used as-is.

  // stage 1: permute the input into output.
  stage++;
  bf1 = output;
  bf1[0] = input[15];
  bf1[1] = input[0];
  bf1[2] = input[13];
  bf1[3] = input[2];
  bf1[4] = input[11];
  bf1[5] = input[4];
  bf1[6] = input[9];
  bf1[7] = input[6];
  bf1[8] = input[7];
  bf1[9] = input[8];
  bf1[10] = input[5];
  bf1[11] = input[10];
  bf1[12] = input[3];
  bf1[13] = input[12];
  bf1[14] = input[1];
  bf1[15] = input[14];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: each adjacent pair (2k, 2k+1) is combined by two half_btf()
  // calls using a complementary pair of table entries.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: clamped sums (0..7) and differences (8..15) of elements 8 apart.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
  bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
  bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: elements 0..7 pass through; the adjacent pairs in 8..15 go
  // through half_btf().
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: within each half, clamped sums and differences of elements 4
  // apart.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
  bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
  bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
  bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: in each half the first four elements pass through and the last
  // four go through half_btf() with the cospi[16]/cospi[48] entries.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: within each group of four, clamped sums and differences of
  // elements 2 apart.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
  bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
  bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
  bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 8: in each group of four the first two elements pass through and
  // the last two are combined by half_btf() with cospi[32].
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 9: final permutation into output; every odd-indexed output is
  // negated. No range check is run after this stage.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[8];
  bf1[2] = bf0[12];
  bf1[3] = -bf0[4];
  bf1[4] = bf0[6];
  bf1[5] = -bf0[14];
  bf1[6] = bf0[10];
  bf1[7] = -bf0[2];
  bf1[8] = bf0[3];
  bf1[9] = -bf0[11];
  bf1[10] = bf0[15];
  bf1[11] = -bf0[7];
  bf1[12] = bf0[5];
  bf1[13] = -bf0[13];
  bf1[14] = bf0[9];
  bf1[15] = -bf0[1];
}
1033 
av1_iidentity4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1034 void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1035                       const int8_t *stage_range) {
1036   (void)cos_bit;
1037   (void)stage_range;
1038   for (int i = 0; i < 4; ++i) {
1039     output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
1040   }
1041   assert(stage_range[0] + NewSqrt2Bits <= 32);
1042 }
1043 
// 8-point identity kernel: every output element is twice the matching input
// element. The doubling is done in 64 bits and then narrowed back to int32_t.
// cos_bit and stage_range are unused.
void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  const int32_t *src = input;
  int32_t *dst = output;
  for (int left = 8; left > 0; --left) {
    const int64_t wide = *src++;
    *dst++ = (int32_t)(wide * 2);
  }
}
1050 
av1_iidentity16_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1051 void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1052                        const int8_t *stage_range) {
1053   (void)cos_bit;
1054   (void)stage_range;
1055   for (int i = 0; i < 16; ++i)
1056     output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
1057   assert(stage_range[0] + NewSqrt2Bits <= 32);
1058 }
1059 
// 32-point identity kernel: every output element is four times the matching
// input element. The scaling is done in 64 bits and then narrowed back to
// int32_t. cos_bit and stage_range are unused.
void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  const int32_t *src = input;
  int32_t *dst = output;
  for (int left = 32; left > 0; --left) {
    const int64_t wide = *src++;
    *dst++ = (int32_t)(wide * 4);
  }
}
1066 
av1_idct64_new(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1067 void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
1068                     const int8_t *stage_range) {
1069   assert(output != input);
1070   const int32_t size = 64;
1071   const int32_t *cospi = cospi_arr(cos_bit);
1072 
1073   int32_t stage = 0;
1074   int32_t *bf0, *bf1;
1075   int32_t step[64];
1076 
1077   // stage 0;
1078 
1079   // stage 1;
1080   stage++;
1081   bf1 = output;
1082   bf1[0] = input[0];
1083   bf1[1] = input[32];
1084   bf1[2] = input[16];
1085   bf1[3] = input[48];
1086   bf1[4] = input[8];
1087   bf1[5] = input[40];
1088   bf1[6] = input[24];
1089   bf1[7] = input[56];
1090   bf1[8] = input[4];
1091   bf1[9] = input[36];
1092   bf1[10] = input[20];
1093   bf1[11] = input[52];
1094   bf1[12] = input[12];
1095   bf1[13] = input[44];
1096   bf1[14] = input[28];
1097   bf1[15] = input[60];
1098   bf1[16] = input[2];
1099   bf1[17] = input[34];
1100   bf1[18] = input[18];
1101   bf1[19] = input[50];
1102   bf1[20] = input[10];
1103   bf1[21] = input[42];
1104   bf1[22] = input[26];
1105   bf1[23] = input[58];
1106   bf1[24] = input[6];
1107   bf1[25] = input[38];
1108   bf1[26] = input[22];
1109   bf1[27] = input[54];
1110   bf1[28] = input[14];
1111   bf1[29] = input[46];
1112   bf1[30] = input[30];
1113   bf1[31] = input[62];
1114   bf1[32] = input[1];
1115   bf1[33] = input[33];
1116   bf1[34] = input[17];
1117   bf1[35] = input[49];
1118   bf1[36] = input[9];
1119   bf1[37] = input[41];
1120   bf1[38] = input[25];
1121   bf1[39] = input[57];
1122   bf1[40] = input[5];
1123   bf1[41] = input[37];
1124   bf1[42] = input[21];
1125   bf1[43] = input[53];
1126   bf1[44] = input[13];
1127   bf1[45] = input[45];
1128   bf1[46] = input[29];
1129   bf1[47] = input[61];
1130   bf1[48] = input[3];
1131   bf1[49] = input[35];
1132   bf1[50] = input[19];
1133   bf1[51] = input[51];
1134   bf1[52] = input[11];
1135   bf1[53] = input[43];
1136   bf1[54] = input[27];
1137   bf1[55] = input[59];
1138   bf1[56] = input[7];
1139   bf1[57] = input[39];
1140   bf1[58] = input[23];
1141   bf1[59] = input[55];
1142   bf1[60] = input[15];
1143   bf1[61] = input[47];
1144   bf1[62] = input[31];
1145   bf1[63] = input[63];
1146   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1147 
1148   // stage 2
1149   stage++;
1150   bf0 = output;
1151   bf1 = step;
1152   bf1[0] = bf0[0];
1153   bf1[1] = bf0[1];
1154   bf1[2] = bf0[2];
1155   bf1[3] = bf0[3];
1156   bf1[4] = bf0[4];
1157   bf1[5] = bf0[5];
1158   bf1[6] = bf0[6];
1159   bf1[7] = bf0[7];
1160   bf1[8] = bf0[8];
1161   bf1[9] = bf0[9];
1162   bf1[10] = bf0[10];
1163   bf1[11] = bf0[11];
1164   bf1[12] = bf0[12];
1165   bf1[13] = bf0[13];
1166   bf1[14] = bf0[14];
1167   bf1[15] = bf0[15];
1168   bf1[16] = bf0[16];
1169   bf1[17] = bf0[17];
1170   bf1[18] = bf0[18];
1171   bf1[19] = bf0[19];
1172   bf1[20] = bf0[20];
1173   bf1[21] = bf0[21];
1174   bf1[22] = bf0[22];
1175   bf1[23] = bf0[23];
1176   bf1[24] = bf0[24];
1177   bf1[25] = bf0[25];
1178   bf1[26] = bf0[26];
1179   bf1[27] = bf0[27];
1180   bf1[28] = bf0[28];
1181   bf1[29] = bf0[29];
1182   bf1[30] = bf0[30];
1183   bf1[31] = bf0[31];
1184   bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
1185   bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
1186   bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
1187   bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
1188   bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
1189   bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
1190   bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
1191   bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
1192   bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
1193   bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
1194   bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
1195   bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
1196   bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
1197   bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
1198   bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
1199   bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
1200   bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
1201   bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
1202   bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
1203   bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
1204   bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
1205   bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
1206   bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
1207   bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
1208   bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
1209   bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
1210   bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
1211   bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
1212   bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
1213   bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
1214   bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
1215   bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
1216   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1217 
1218   // stage 3
1219   stage++;
1220   bf0 = step;
1221   bf1 = output;
1222   bf1[0] = bf0[0];
1223   bf1[1] = bf0[1];
1224   bf1[2] = bf0[2];
1225   bf1[3] = bf0[3];
1226   bf1[4] = bf0[4];
1227   bf1[5] = bf0[5];
1228   bf1[6] = bf0[6];
1229   bf1[7] = bf0[7];
1230   bf1[8] = bf0[8];
1231   bf1[9] = bf0[9];
1232   bf1[10] = bf0[10];
1233   bf1[11] = bf0[11];
1234   bf1[12] = bf0[12];
1235   bf1[13] = bf0[13];
1236   bf1[14] = bf0[14];
1237   bf1[15] = bf0[15];
1238   bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
1239   bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
1240   bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
1241   bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
1242   bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
1243   bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
1244   bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
1245   bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
1246   bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
1247   bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
1248   bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
1249   bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
1250   bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
1251   bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
1252   bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
1253   bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
1254   bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
1255   bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
1256   bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
1257   bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
1258   bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
1259   bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
1260   bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
1261   bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
1262   bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
1263   bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
1264   bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
1265   bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
1266   bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
1267   bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
1268   bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
1269   bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
1270   bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
1271   bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
1272   bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
1273   bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
1274   bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
1275   bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
1276   bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
1277   bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
1278   bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
1279   bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
1280   bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
1281   bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
1282   bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
1283   bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
1284   bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
1285   bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
1286   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1287 
1288   // stage 4
1289   stage++;
1290   bf0 = output;
1291   bf1 = step;
1292   bf1[0] = bf0[0];
1293   bf1[1] = bf0[1];
1294   bf1[2] = bf0[2];
1295   bf1[3] = bf0[3];
1296   bf1[4] = bf0[4];
1297   bf1[5] = bf0[5];
1298   bf1[6] = bf0[6];
1299   bf1[7] = bf0[7];
1300   bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
1301   bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
1302   bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
1303   bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
1304   bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
1305   bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
1306   bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
1307   bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
1308   bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
1309   bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
1310   bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
1311   bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
1312   bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
1313   bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
1314   bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
1315   bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
1316   bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
1317   bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
1318   bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
1319   bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
1320   bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
1321   bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
1322   bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
1323   bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
1324   bf1[32] = bf0[32];
1325   bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1326   bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1327   bf1[35] = bf0[35];
1328   bf1[36] = bf0[36];
1329   bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1330   bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1331   bf1[39] = bf0[39];
1332   bf1[40] = bf0[40];
1333   bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1334   bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1335   bf1[43] = bf0[43];
1336   bf1[44] = bf0[44];
1337   bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1338   bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1339   bf1[47] = bf0[47];
1340   bf1[48] = bf0[48];
1341   bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
1342   bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
1343   bf1[51] = bf0[51];
1344   bf1[52] = bf0[52];
1345   bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
1346   bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
1347   bf1[55] = bf0[55];
1348   bf1[56] = bf0[56];
1349   bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
1350   bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
1351   bf1[59] = bf0[59];
1352   bf1[60] = bf0[60];
1353   bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
1354   bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
1355   bf1[63] = bf0[63];
1356   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1357 
1358   // stage 5
1359   stage++;
1360   bf0 = step;
1361   bf1 = output;
1362   bf1[0] = bf0[0];
1363   bf1[1] = bf0[1];
1364   bf1[2] = bf0[2];
1365   bf1[3] = bf0[3];
1366   bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
1367   bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
1368   bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
1369   bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
1370   bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
1371   bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
1372   bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
1373   bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
1374   bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
1375   bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
1376   bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
1377   bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
1378   bf1[16] = bf0[16];
1379   bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1380   bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1381   bf1[19] = bf0[19];
1382   bf1[20] = bf0[20];
1383   bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1384   bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1385   bf1[23] = bf0[23];
1386   bf1[24] = bf0[24];
1387   bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
1388   bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
1389   bf1[27] = bf0[27];
1390   bf1[28] = bf0[28];
1391   bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
1392   bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
1393   bf1[31] = bf0[31];
1394   bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
1395   bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
1396   bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
1397   bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
1398   bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
1399   bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
1400   bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
1401   bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
1402   bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
1403   bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
1404   bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
1405   bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
1406   bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
1407   bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
1408   bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
1409   bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
1410   bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
1411   bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
1412   bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
1413   bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
1414   bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
1415   bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
1416   bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
1417   bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
1418   bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
1419   bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
1420   bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
1421   bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
1422   bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
1423   bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
1424   bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
1425   bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
1426   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1427 
1428   // stage 6
1429   stage++;
1430   bf0 = output;
1431   bf1 = step;
1432   bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1433   bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
1434   bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
1435   bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
1436   bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
1437   bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
1438   bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
1439   bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
1440   bf1[8] = bf0[8];
1441   bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1442   bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1443   bf1[11] = bf0[11];
1444   bf1[12] = bf0[12];
1445   bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
1446   bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
1447   bf1[15] = bf0[15];
1448   bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
1449   bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
1450   bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
1451   bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
1452   bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
1453   bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
1454   bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
1455   bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
1456   bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
1457   bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
1458   bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
1459   bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
1460   bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
1461   bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
1462   bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
1463   bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
1464   bf1[32] = bf0[32];
1465   bf1[33] = bf0[33];
1466   bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1467   bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1468   bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1469   bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1470   bf1[38] = bf0[38];
1471   bf1[39] = bf0[39];
1472   bf1[40] = bf0[40];
1473   bf1[41] = bf0[41];
1474   bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1475   bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1476   bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1477   bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1478   bf1[46] = bf0[46];
1479   bf1[47] = bf0[47];
1480   bf1[48] = bf0[48];
1481   bf1[49] = bf0[49];
1482   bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
1483   bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
1484   bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
1485   bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
1486   bf1[54] = bf0[54];
1487   bf1[55] = bf0[55];
1488   bf1[56] = bf0[56];
1489   bf1[57] = bf0[57];
1490   bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
1491   bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
1492   bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
1493   bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
1494   bf1[62] = bf0[62];
1495   bf1[63] = bf0[63];
1496   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1497 
1498   // stage 7
1499   stage++;
1500   bf0 = step;
1501   bf1 = output;
1502   bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
1503   bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
1504   bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
1505   bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
1506   bf1[4] = bf0[4];
1507   bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1508   bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1509   bf1[7] = bf0[7];
1510   bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
1511   bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
1512   bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
1513   bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
1514   bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
1515   bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
1516   bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
1517   bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
1518   bf1[16] = bf0[16];
1519   bf1[17] = bf0[17];
1520   bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1521   bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1522   bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1523   bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1524   bf1[22] = bf0[22];
1525   bf1[23] = bf0[23];
1526   bf1[24] = bf0[24];
1527   bf1[25] = bf0[25];
1528   bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
1529   bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
1530   bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
1531   bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
1532   bf1[30] = bf0[30];
1533   bf1[31] = bf0[31];
1534   bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
1535   bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
1536   bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
1537   bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
1538   bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
1539   bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
1540   bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
1541   bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
1542   bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
1543   bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
1544   bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
1545   bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
1546   bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
1547   bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
1548   bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
1549   bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
1550   bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
1551   bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
1552   bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
1553   bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
1554   bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
1555   bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
1556   bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
1557   bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
1558   bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
1559   bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
1560   bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
1561   bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
1562   bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
1563   bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
1564   bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
1565   bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
1566   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1567 
1568   // stage 8
1569   stage++;
1570   bf0 = output;
1571   bf1 = step;
1572   bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
1573   bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
1574   bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
1575   bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
1576   bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
1577   bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
1578   bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
1579   bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
1580   bf1[8] = bf0[8];
1581   bf1[9] = bf0[9];
1582   bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1583   bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1584   bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1585   bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1586   bf1[14] = bf0[14];
1587   bf1[15] = bf0[15];
1588   bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
1589   bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
1590   bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
1591   bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
1592   bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
1593   bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
1594   bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
1595   bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
1596   bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
1597   bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
1598   bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
1599   bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
1600   bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
1601   bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
1602   bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
1603   bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
1604   bf1[32] = bf0[32];
1605   bf1[33] = bf0[33];
1606   bf1[34] = bf0[34];
1607   bf1[35] = bf0[35];
1608   bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1609   bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1610   bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1611   bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1612   bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1613   bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1614   bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1615   bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1616   bf1[44] = bf0[44];
1617   bf1[45] = bf0[45];
1618   bf1[46] = bf0[46];
1619   bf1[47] = bf0[47];
1620   bf1[48] = bf0[48];
1621   bf1[49] = bf0[49];
1622   bf1[50] = bf0[50];
1623   bf1[51] = bf0[51];
1624   bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
1625   bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
1626   bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
1627   bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
1628   bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
1629   bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
1630   bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
1631   bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
1632   bf1[60] = bf0[60];
1633   bf1[61] = bf0[61];
1634   bf1[62] = bf0[62];
1635   bf1[63] = bf0[63];
1636   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1637 
1638   // stage 9
1639   stage++;
1640   bf0 = step;
1641   bf1 = output;
1642   bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
1643   bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
1644   bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
1645   bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
1646   bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
1647   bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
1648   bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
1649   bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
1650   bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
1651   bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
1652   bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
1653   bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
1654   bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
1655   bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
1656   bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
1657   bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
1658   bf1[16] = bf0[16];
1659   bf1[17] = bf0[17];
1660   bf1[18] = bf0[18];
1661   bf1[19] = bf0[19];
1662   bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1663   bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1664   bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1665   bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1666   bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1667   bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1668   bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1669   bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1670   bf1[28] = bf0[28];
1671   bf1[29] = bf0[29];
1672   bf1[30] = bf0[30];
1673   bf1[31] = bf0[31];
1674   bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
1675   bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
1676   bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
1677   bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
1678   bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
1679   bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
1680   bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
1681   bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
1682   bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
1683   bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
1684   bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
1685   bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
1686   bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
1687   bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
1688   bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
1689   bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
1690   bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
1691   bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
1692   bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
1693   bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
1694   bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
1695   bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
1696   bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
1697   bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
1698   bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
1699   bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
1700   bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
1701   bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
1702   bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
1703   bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
1704   bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
1705   bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
1706   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1707 
1708   // stage 10
1709   stage++;
1710   bf0 = output;
1711   bf1 = step;
1712   bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
1713   bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
1714   bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
1715   bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
1716   bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
1717   bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
1718   bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
1719   bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
1720   bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
1721   bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
1722   bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
1723   bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
1724   bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
1725   bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
1726   bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
1727   bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
1728   bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
1729   bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
1730   bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
1731   bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
1732   bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
1733   bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
1734   bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
1735   bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
1736   bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
1737   bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
1738   bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
1739   bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
1740   bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
1741   bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
1742   bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
1743   bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
1744   bf1[32] = bf0[32];
1745   bf1[33] = bf0[33];
1746   bf1[34] = bf0[34];
1747   bf1[35] = bf0[35];
1748   bf1[36] = bf0[36];
1749   bf1[37] = bf0[37];
1750   bf1[38] = bf0[38];
1751   bf1[39] = bf0[39];
1752   bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1753   bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1754   bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1755   bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1756   bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1757   bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1758   bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1759   bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1760   bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1761   bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1762   bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1763   bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1764   bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1765   bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1766   bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1767   bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1768   bf1[56] = bf0[56];
1769   bf1[57] = bf0[57];
1770   bf1[58] = bf0[58];
1771   bf1[59] = bf0[59];
1772   bf1[60] = bf0[60];
1773   bf1[61] = bf0[61];
1774   bf1[62] = bf0[62];
1775   bf1[63] = bf0[63];
1776   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1777 
1778   // stage 11
1779   stage++;
1780   bf0 = step;
1781   bf1 = output;
1782   bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
1783   bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
1784   bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
1785   bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
1786   bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
1787   bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
1788   bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
1789   bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
1790   bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
1791   bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
1792   bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
1793   bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
1794   bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
1795   bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
1796   bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
1797   bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
1798   bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
1799   bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
1800   bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
1801   bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
1802   bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
1803   bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
1804   bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
1805   bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
1806   bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
1807   bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
1808   bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
1809   bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
1810   bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
1811   bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
1812   bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
1813   bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
1814   bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
1815   bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
1816   bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
1817   bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
1818   bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
1819   bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
1820   bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
1821   bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
1822   bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
1823   bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
1824   bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
1825   bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
1826   bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
1827   bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
1828   bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
1829   bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
1830   bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
1831   bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
1832   bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
1833   bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
1834   bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
1835   bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
1836   bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
1837   bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
1838   bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
1839   bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
1840   bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
1841   bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
1842   bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
1843   bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
1844   bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
1845   bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
1846 }
1847