1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 
13 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h"
15 #include "vpx_ports/mem.h"
16 
/*
 * Function type shared by every vp9_filter_block1d* kernel declared in this
 * file.
 *
 * As invoked from FUN_CONV_1D the arguments are, in order:
 *   src_ptr        - source pointer (src, or an expression derived from it)
 *   src_pitch      - src_stride
 *   output_ptr     - dst
 *   out_pitch      - dst_stride
 *   output_height  - h
 *   filter         - the filter_x or filter_y coefficient pointer
 *
 * src_pitch is taken by value, so a top-level const qualifier on it would not
 * be part of the function type and is not spelled out here.
 */
typedef void filter8_1dfunction(const unsigned char *src_ptr,
                                ptrdiff_t src_pitch,
                                unsigned char *output_ptr,
                                ptrdiff_t out_pitch,
                                unsigned int output_height,
                                const short *filter);
25 
/*
 * FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
 *
 * Expands to the definition of vp9_convolve8_<name>_<opt>(), a one-direction
 * convolve wrapper built on the
 * vp9_filter_block1d{16,8,4}_<dir>{8,2}_<avg><opt> kernels.
 *
 *   name       suffix of the generated function and of its fallback,
 *              vp9_convolve8_<name>_c
 *   step_q4    the step argument to test (x_step_q4 or y_step_q4)
 *   filter     the coefficient pointer to use (filter_x or filter_y)
 *   dir        direction letter pasted into the kernel names
 *   src_start  source expression passed to the `8` kernels
 *   avg        optional infix pasted in front of <opt> in the kernel names
 *   opt        instruction-set suffix
 *
 * Behaviour of the generated function:
 *   - Kernels run only when step_q4 == 16 and filter[3] != 128.
 *   - If filter[0], filter[1] and filter[2] are all zero the `2` kernels are
 *     used and receive src itself; otherwise the `8` kernels are used and
 *     receive src_start.
 *   - Columns are consumed 16 at a time, then 8, then 4.
 *   - Any width still left (the full width when the kernels were skipped) is
 *     forwarded to vp9_convolve8_<name>_c.
 */
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                    uint8_t *dst, ptrdiff_t dst_stride, \
                                    const int16_t *filter_x, int x_step_q4, \
                                    const int16_t *filter_y, int y_step_q4, \
                                    int w, int h) { \
    if (step_q4 == 16 && filter[3] != 128) { \
      if (!(filter[0] || filter[1] || filter[2])) { \
        /* filter[0..2] are all zero: `2` kernels, fed src directly. */ \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
                                                   dst_stride, h, filter); \
        } \
        for (; w >= 8; src += 8, dst += 8, w -= 8) { \
          vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
                                                  dst_stride, h, filter); \
        } \
        for (; w >= 4; src += 4, dst += 4, w -= 4) { \
          vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
                                                  dst_stride, h, filter); \
        } \
      } else { \
        /* src_start is expanded at every call site, so it follows src. */ \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                   dst_stride, h, filter); \
        } \
        for (; w >= 8; src += 8, dst += 8, w -= 8) { \
          vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter); \
        } \
        for (; w >= 4; src += 4, dst += 4, w -= 4) { \
          vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter); \
        } \
      } \
    } \
    if (w != 0) { \
      /* Leftover columns go to the C implementation. */ \
      vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h); \
    } \
  }
109 
/*
 * FUN_CONV_2D(avg, opt)
 *
 * Expands to the definition of vp9_convolve8_<avg><opt>(), a two-pass
 * convolve built from vp9_convolve8_horiz_<opt> followed by
 * vp9_convolve8_<avg>vert_<opt>.
 *
 *   avg  optional infix ("" or "avg_") applied to the generated name, to the
 *        second (vertical) pass and to the C fallback
 *   opt  instruction-set suffix
 *
 * Behaviour of the generated function:
 *   - w and h are asserted to be at most 64.
 *   - Unless both x_step_q4 and y_step_q4 equal 16, the whole call is
 *     forwarded to vp9_convolve8_<avg>c.
 *   - If either filter has a non-zero entry among indices 0..2 or has
 *     index 3 equal to 128, the horizontal pass starts three source rows
 *     above src and writes h + 7 rows into a 64-wide scratch buffer; the
 *     vertical pass then reads from scratch row 3.
 *   - Otherwise the horizontal pass starts at src and writes h + 1 rows; the
 *     vertical pass reads from scratch row 0.
 */
#define FUN_CONV_2D(avg, opt) \
  void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                uint8_t *dst, ptrdiff_t dst_stride, \
                                const int16_t *filter_x, int x_step_q4, \
                                const int16_t *filter_y, int y_step_q4, \
                                int w, int h) { \
    assert(w <= 64); \
    assert(h <= 64); \
    if (x_step_q4 != 16 || y_step_q4 != 16) { \
      vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
                             filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
    } else if (filter_x[0] || filter_x[1] || filter_x[2] || \
               filter_x[3] == 128 || \
               filter_y[0] || filter_y[1] || filter_y[2] || \
               filter_y[3] == 128) { \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, scratch, 64 * 71); \
      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                scratch, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 7); \
      vp9_convolve8_##avg##vert_##opt(scratch + 3 * 64, 64, \
                                      dst, dst_stride, \
                                      filter_x, x_step_q4, \
                                      filter_y, y_step_q4, w, h); \
    } else { \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, scratch, 64 * 65); \
      vp9_convolve8_horiz_##opt(src, src_stride, scratch, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 1); \
      vp9_convolve8_##avg##vert_##opt(scratch, 64, dst, dst_stride, \
                                      filter_x, x_step_q4, \
                                      filter_y, y_step_q4, w, h); \
    } \
  }
#if HAVE_AVX2
// Kernels that have their own *_avx2 symbol: only the 16-wide h8/v8 pair.
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
filter8_1dfunction vp9_filter_block1d16_v8_avx2;

// Every other *_avx2 kernel name pasted together by FUN_CONV_1D is an alias
// for an SSSE3 symbol.
// NOTE(review): this section is guarded by HAVE_AVX2 alone although it
// references *_ssse3 symbols - confirm the build always provides them when
// AVX2 is enabled.
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3

#if ARCH_X86_64
// With ARCH_X86_64 set, the 8-wide h8/v8 and 4-wide h8 names resolve to the
// *_intrin_ssse3 symbols.
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
#else
// Otherwise they resolve to the plain *_ssse3 symbols.
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#endif

// All h2/v2 kernels are aliased to their SSSE3 counterparts.
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3

// Functions defined by the expansions below:
//   vp9_convolve8_horiz_avx2
//   vp9_convolve8_vert_avx2
//   vp9_convolve8_avx2
// Each one has the signature
//   void fn(const uint8_t *src, ptrdiff_t src_stride,
//           uint8_t *dst, ptrdiff_t dst_stride,
//           const int16_t *filter_x, int x_step_q4,
//           const int16_t *filter_y, int y_step_q4,
//           int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

FUN_CONV_2D(, avx2);
#endif
#if HAVE_SSSE3
// h8/v8 kernels. vp9_filter_block1d4_v8_ssse3 is declared under its own name
// in both configurations; the other five names depend on ARCH_X86_64.
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
// With ARCH_X86_64 set, the *_ssse3 names are aliases for *_intrin_ssse3.
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
#else
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#endif

// h8/v8 kernels, avg variants.
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;

// h2/v2 kernels, plain and avg variants.
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;

// Functions defined by the expansions below:
//   vp9_convolve8_horiz_ssse3      vp9_convolve8_vert_ssse3
//   vp9_convolve8_avg_horiz_ssse3  vp9_convolve8_avg_vert_ssse3
//   vp9_convolve8_ssse3            vp9_convolve8_avg_ssse3
// Each one has the signature
//   void fn(const uint8_t *src, ptrdiff_t src_stride,
//           uint8_t *dst, ptrdiff_t dst_stride,
//           const int16_t *filter_x, int x_step_q4,
//           const int16_t *filter_y, int y_step_q4,
//           int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3,
            avg_, ssse3);

FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
#endif
274 
#if HAVE_SSE2
// External SSE2 kernels, grouped by block width. Each width comes in h/v
// direction, 8/2 variant, and plain/avg flavour.

// 16-wide.
filter8_1dfunction vp9_filter_block1d16_h8_sse2;
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_sse2;
filter8_1dfunction vp9_filter_block1d16_v2_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;

// 8-wide.
filter8_1dfunction vp9_filter_block1d8_h8_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;

// 4-wide.
filter8_1dfunction vp9_filter_block1d4_h8_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;

// Functions defined by the expansions below:
//   vp9_convolve8_horiz_sse2      vp9_convolve8_vert_sse2
//   vp9_convolve8_avg_horiz_sse2  vp9_convolve8_avg_vert_sse2
//   vp9_convolve8_sse2            vp9_convolve8_avg_sse2
// Each one has the signature
//   void fn(const uint8_t *src, ptrdiff_t src_stride,
//           uint8_t *dst, ptrdiff_t dst_stride,
//           const int16_t *filter_x, int x_step_q4,
//           const int16_t *filter_y, int y_step_q4,
//           int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3,
            avg_, sse2);

FUN_CONV_2D(, sse2);
FUN_CONV_2D(avg_, sse2);
#endif
340