1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "aom_dsp/aom_dsp_common.h"
13 #include "aom_dsp/fft_common.h"
14 
simple_transpose(const float * A,float * B,int n)15 static INLINE void simple_transpose(const float *A, float *B, int n) {
16   for (int y = 0; y < n; y++) {
17     for (int x = 0; x < n; x++) {
18       B[y * n + x] = A[x * n + y];
19     }
20   }
21 }
22 
23 // The 1d transform is real to complex and packs the complex results in
24 // a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
25 // components, followed by the n/2 - 1 imaginary components). After the
26 // transform is done on the rows, the first n/2 + 1 columns are real, and
27 // the remaining are the imaginary components. After the transform on the
28 // columns, the region of [0, n/2]x[0, n/2] contains the real part of
29 // fft of the real columns. The real part of the 2d fft also includes the
30 // imaginary part of transformed imaginary columns. This function assembles
31 // the correct outputs while putting the real and imaginary components
32 // next to each other.
unpack_2d_output(const float * col_fft,float * output,int n)33 static INLINE void unpack_2d_output(const float *col_fft, float *output,
34                                     int n) {
35   for (int y = 0; y <= n / 2; ++y) {
36     const int y2 = y + n / 2;
37     const int y_extra = y2 > n / 2 && y2 < n;
38 
39     for (int x = 0; x <= n / 2; ++x) {
40       const int x2 = x + n / 2;
41       const int x_extra = x2 > n / 2 && x2 < n;
42       output[2 * (y * n + x)] =
43           col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
44       output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
45                                     (x_extra ? col_fft[y * n + x2] : 0);
46       if (y_extra) {
47         output[2 * ((n - y) * n + x)] =
48             col_fft[y * n + x] +
49             (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
50         output[2 * ((n - y) * n + x) + 1] =
51             -(y_extra ? col_fft[y2 * n + x] : 0) +
52             (x_extra ? col_fft[y * n + x2] : 0);
53       }
54     }
55   }
56 }
57 
aom_fft_2d_gen(const float * input,float * temp,float * output,int n,aom_fft_1d_func_t tform,aom_fft_transpose_func_t transpose,aom_fft_unpack_func_t unpack,int vec_size)58 void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
59                     aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
60                     aom_fft_unpack_func_t unpack, int vec_size) {
61   for (int x = 0; x < n; x += vec_size) {
62     tform(input + x, output + x, n);
63   }
64   transpose(output, temp, n);
65 
66   for (int x = 0; x < n; x += vec_size) {
67     tform(temp + x, output + x, n);
68   }
69   transpose(output, temp, n);
70 
71   unpack(temp, output, n);
72 }
73 
store_float(float * output,float input)74 static INLINE void store_float(float *output, float input) { *output = input; }
add_float(float a,float b)75 static INLINE float add_float(float a, float b) { return a + b; }
sub_float(float a,float b)76 static INLINE float sub_float(float a, float b) { return a - b; }
mul_float(float a,float b)77 static INLINE float mul_float(float a, float b) { return a * b; }
78 
79 GEN_FFT_2(void, float, float, float, *, store_float);
80 GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
81           sub_float);
82 GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
83           sub_float, mul_float);
84 GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
85            sub_float, mul_float);
86 GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
87            sub_float, mul_float);
88 
aom_fft2x2_float_c(const float * input,float * temp,float * output)89 void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
90   aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
91                  unpack_2d_output, 1);
92 }
93 
aom_fft4x4_float_c(const float * input,float * temp,float * output)94 void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
95   aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
96                  unpack_2d_output, 1);
97 }
98 
aom_fft8x8_float_c(const float * input,float * temp,float * output)99 void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
100   aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
101                  unpack_2d_output, 1);
102 }
103 
aom_fft16x16_float_c(const float * input,float * temp,float * output)104 void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
105   aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
106                  unpack_2d_output, 1);
107 }
108 
aom_fft32x32_float_c(const float * input,float * temp,float * output)109 void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
110   aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
111                  unpack_2d_output, 1);
112 }
113 
aom_ifft_2d_gen(const float * input,float * temp,float * output,int n,aom_fft_1d_func_t fft_single,aom_fft_1d_func_t fft_multi,aom_fft_1d_func_t ifft_multi,aom_fft_transpose_func_t transpose,int vec_size)114 void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
115                      aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
116                      aom_fft_1d_func_t ifft_multi,
117                      aom_fft_transpose_func_t transpose, int vec_size) {
118   // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
119   // and get real outputs.
120   for (int y = 0; y <= n / 2; ++y) {
121     output[y * n] = input[2 * y * n];
122     output[y * n + 1] = input[2 * (y * n + n / 2)];
123   }
124   for (int y = n / 2 + 1; y < n; ++y) {
125     output[y * n] = input[2 * (y - n / 2) * n + 1];
126     output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
127   }
128 
129   for (int i = 0; i < 2; i += vec_size) {
130     ifft_multi(output + i, temp + i, n);
131   }
132 
133   // For the other columns, since we don't have a full ifft for complex inputs
134   // we have to split them into the real and imaginary counterparts.
135   // Pack the real component, then the imaginary components.
136   for (int y = 0; y < n; ++y) {
137     for (int x = 1; x < n / 2; ++x) {
138       output[y * n + (x + 1)] = input[2 * (y * n + x)];
139     }
140     for (int x = 1; x < n / 2; ++x) {
141       output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
142     }
143   }
144   for (int y = 2; y < vec_size; y++) {
145     fft_single(output + y, temp + y, n);
146   }
147   // This is the part that can be sped up with SIMD
148   for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
149     fft_multi(output + y, temp + y, n);
150   }
151 
152   // Put the 0 and n/2 th results in the correct place.
153   for (int x = 0; x < n; ++x) {
154     output[x] = temp[x * n];
155     output[(n / 2) * n + x] = temp[x * n + 1];
156   }
157   // This rearranges and transposes.
158   for (int y = 1; y < n / 2; ++y) {
159     // Fill in the real columns
160     for (int x = 0; x <= n / 2; ++x) {
161       output[x + y * n] =
162           temp[(y + 1) + x * n] +
163           ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
164     }
165     for (int x = n / 2 + 1; x < n; ++x) {
166       output[x + y * n] = temp[(y + 1) + (n - x) * n] -
167                           temp[(y + n / 2) + ((n - x) + n / 2) * n];
168     }
169     // Fill in the imag columns
170     for (int x = 0; x <= n / 2; ++x) {
171       output[x + (y + n / 2) * n] =
172           temp[(y + n / 2) + x * n] -
173           ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
174     }
175     for (int x = n / 2 + 1; x < n; ++x) {
176       output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
177                                     temp[(y + n / 2) + (n - x) * n];
178     }
179   }
180   for (int y = 0; y < n; y += vec_size) {
181     ifft_multi(output + y, temp + y, n);
182   }
183   transpose(temp, output, n);
184 }
185 
186 GEN_IFFT_2(void, float, float, float, *, store_float);
187 GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
188            sub_float);
189 GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
190            sub_float, mul_float);
191 GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
192             sub_float, mul_float);
193 GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
194             sub_float, mul_float);
195 
aom_ifft2x2_float_c(const float * input,float * temp,float * output)196 void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
197   aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
198                   aom_ifft1d_2_float, simple_transpose, 1);
199 }
200 
aom_ifft4x4_float_c(const float * input,float * temp,float * output)201 void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
202   aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
203                   aom_ifft1d_4_float, simple_transpose, 1);
204 }
205 
aom_ifft8x8_float_c(const float * input,float * temp,float * output)206 void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
207   aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
208                   aom_ifft1d_8_float, simple_transpose, 1);
209 }
210 
aom_ifft16x16_float_c(const float * input,float * temp,float * output)211 void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
212   aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
213                   aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
214 }
215 
aom_ifft32x32_float_c(const float * input,float * temp,float * output)216 void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
217   aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
218                   aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
219 }
220