1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 13 #include "./vpx_config.h" 14 #include "./vp9_rtcd.h" 15 #include "vpx_ports/mem.h" 16 17 typedef void filter8_1dfunction ( 18 const unsigned char *src_ptr, 19 const ptrdiff_t src_pitch, 20 unsigned char *output_ptr, 21 ptrdiff_t out_pitch, 22 unsigned int output_height, 23 const short *filter 24 ); 25 26 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ 27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ 28 uint8_t *dst, ptrdiff_t dst_stride, \ 29 const int16_t *filter_x, int x_step_q4, \ 30 const int16_t *filter_y, int y_step_q4, \ 31 int w, int h) { \ 32 if (step_q4 == 16 && filter[3] != 128) { \ 33 if (filter[0] || filter[1] || filter[2]) { \ 34 while (w >= 16) { \ 35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ 36 src_stride, \ 37 dst, \ 38 dst_stride, \ 39 h, \ 40 filter); \ 41 src += 16; \ 42 dst += 16; \ 43 w -= 16; \ 44 } \ 45 while (w >= 8) { \ 46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ 47 src_stride, \ 48 dst, \ 49 dst_stride, \ 50 h, \ 51 filter); \ 52 src += 8; \ 53 dst += 8; \ 54 w -= 8; \ 55 } \ 56 while (w >= 4) { \ 57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ 58 src_stride, \ 59 dst, \ 60 dst_stride, \ 61 h, \ 62 filter); \ 63 src += 4; \ 64 dst += 4; \ 65 w -= 4; \ 66 } \ 67 } else { \ 68 while (w >= 16) { \ 69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \ 70 src_stride, \ 71 dst, \ 72 dst_stride, \ 73 h, \ 74 filter); \ 75 src += 16; \ 76 dst += 16; \ 77 w -= 16; \ 78 } \ 79 while (w >= 8) { \ 80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \ 81 src_stride, \ 82 dst, \ 83 dst_stride, \ 84 h, \ 85 filter); \ 86 src += 8; \ 87 dst += 8; \ 88 w -= 8; \ 89 } \ 90 while (w >= 4) { \ 91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \ 92 src_stride, \ 93 dst, \ 94 dst_stride, \ 95 h, \ 96 filter); \ 97 src += 4; \ 98 dst += 4; \ 99 w -= 4; \ 100 } \ 101 } \ 102 } \ 103 if (w) { \ 104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ 105 filter_x, x_step_q4, filter_y, y_step_q4, \ 106 w, h); \ 107 } \ 108 } 109 110 #define FUN_CONV_2D(avg, opt) \ 111 void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ 112 uint8_t *dst, ptrdiff_t dst_stride, \ 113 const int16_t *filter_x, int x_step_q4, \ 114 const int16_t *filter_y, int y_step_q4, \ 115 int w, int h) { \ 116 assert(w <= 64); \ 117 assert(h <= 64); \ 118 if (x_step_q4 == 16 && y_step_q4 == 16) { \ 119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ 120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ 121 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ 122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ 123 filter_x, x_step_q4, filter_y, y_step_q4, \ 124 w, h + 7); \ 125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ 126 filter_x, x_step_q4, filter_y, \ 127 y_step_q4, w, h); \ 128 } else { \ 129 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \ 130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ 131 filter_x, x_step_q4, filter_y, y_step_q4, \ 132 w, h + 1); \ 133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ 134 filter_x, x_step_q4, filter_y, \ 135 y_step_q4, w, h); \ 136 } \ 137 } else { \ 138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ 139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ 140 } \ 141 } 142 #if HAVE_AVX2 143 filter8_1dfunction vp9_filter_block1d16_v8_avx2; 144 filter8_1dfunction vp9_filter_block1d16_h8_avx2; 145 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 146 #if (ARCH_X86_64) 147 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; 148 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; 149 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; 150 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 151 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 152 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 153 #else 154 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; 155 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; 156 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; 157 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 158 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 159 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 160 #endif 161 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; 162 filter8_1dfunction vp9_filter_block1d16_h2_ssse3; 163 filter8_1dfunction vp9_filter_block1d8_v2_ssse3; 164 filter8_1dfunction vp9_filter_block1d8_h2_ssse3; 165 filter8_1dfunction vp9_filter_block1d4_v2_ssse3; 166 filter8_1dfunction vp9_filter_block1d4_h2_ssse3; 167 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 168 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 169 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 170 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 171 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 172 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 173 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 174 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, 175 // uint8_t *dst, ptrdiff_t dst_stride, 176 // const int16_t *filter_x, int x_step_q4, 177 // const int16_t *filter_y, int y_step_q4, 178 // int w, int h); 179 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, 180 // uint8_t *dst, ptrdiff_t dst_stride, 181 // const int16_t *filter_x, int x_step_q4, 182 // const int16_t *filter_y, int y_step_q4, 183 // int w, int h); 184 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); 185 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); 186 187 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, 188 // uint8_t *dst, ptrdiff_t dst_stride, 189 // const int16_t *filter_x, int x_step_q4, 190 // const int16_t *filter_y, int y_step_q4, 191 // int w, int h); 192 FUN_CONV_2D(, avx2); 193 #endif 194 #if HAVE_SSSE3 195 #if (ARCH_X86_64) 196 filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; 197 filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; 198 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; 199 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; 200 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 201 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; 202 #define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 203 #define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 204 #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 205 #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 206 #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 207 #else 208 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; 209 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; 210 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; 211 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; 212 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 213 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; 214 #endif 215 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; 216 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; 217 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; 218 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; 219 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; 220 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; 221 222 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; 223 filter8_1dfunction vp9_filter_block1d16_h2_ssse3; 224 filter8_1dfunction vp9_filter_block1d8_v2_ssse3; 225 filter8_1dfunction vp9_filter_block1d8_h2_ssse3; 226 filter8_1dfunction vp9_filter_block1d4_v2_ssse3; 227 filter8_1dfunction vp9_filter_block1d4_h2_ssse3; 228 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; 229 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; 230 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; 231 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; 232 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; 233 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; 234 235 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 236 // uint8_t *dst, ptrdiff_t dst_stride, 237 // const int16_t *filter_x, int x_step_q4, 238 // const int16_t *filter_y, int y_step_q4, 239 // int w, int h); 240 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 241 // uint8_t *dst, ptrdiff_t dst_stride, 242 // const int16_t *filter_x, int x_step_q4, 243 // const int16_t *filter_y, int y_step_q4, 244 // int w, int h); 245 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 246 // uint8_t *dst, ptrdiff_t dst_stride, 247 // const int16_t *filter_x, int x_step_q4, 248 // const int16_t *filter_y, int y_step_q4, 249 // int w, int h); 250 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 251 // uint8_t *dst, ptrdiff_t dst_stride, 252 // const int16_t *filter_x, int x_step_q4, 253 // const int16_t *filter_y, int y_step_q4, 254 // int w, int h); 255 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); 256 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); 257 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); 258 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, 259 ssse3); 260 261 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, 262 // uint8_t *dst, ptrdiff_t dst_stride, 263 // const int16_t *filter_x, int x_step_q4, 264 // const int16_t *filter_y, int y_step_q4, 265 // int w, int h); 266 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, 267 // uint8_t *dst, ptrdiff_t dst_stride, 268 // const int16_t *filter_x, int x_step_q4, 269 // const int16_t *filter_y, int y_step_q4, 270 // int w, int h); 271 FUN_CONV_2D(, ssse3); 272 FUN_CONV_2D(avg_ , ssse3); 273 #endif 274 275 #if HAVE_SSE2 276 filter8_1dfunction vp9_filter_block1d16_v8_sse2; 277 filter8_1dfunction vp9_filter_block1d16_h8_sse2; 278 filter8_1dfunction vp9_filter_block1d8_v8_sse2; 279 filter8_1dfunction vp9_filter_block1d8_h8_sse2; 280 filter8_1dfunction vp9_filter_block1d4_v8_sse2; 281 filter8_1dfunction vp9_filter_block1d4_h8_sse2; 282 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; 283 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; 284 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; 285 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; 286 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; 287 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; 288 289 filter8_1dfunction vp9_filter_block1d16_v2_sse2; 290 filter8_1dfunction vp9_filter_block1d16_h2_sse2; 291 filter8_1dfunction vp9_filter_block1d8_v2_sse2; 292 filter8_1dfunction vp9_filter_block1d8_h2_sse2; 293 filter8_1dfunction vp9_filter_block1d4_v2_sse2; 294 filter8_1dfunction vp9_filter_block1d4_h2_sse2; 295 filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; 296 filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; 297 filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; 298 filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; 299 filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; 300 filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; 301 302 // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 303 // uint8_t *dst, ptrdiff_t dst_stride, 304 // const int16_t *filter_x, int x_step_q4, 305 // const int16_t *filter_y, int y_step_q4, 306 // int w, int h); 307 // void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, 308 // uint8_t *dst, ptrdiff_t dst_stride, 309 // const int16_t *filter_x, int x_step_q4, 310 // const int16_t *filter_y, int y_step_q4, 311 // int w, int h); 312 // void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 313 // uint8_t *dst, ptrdiff_t dst_stride, 314 // const int16_t *filter_x, int x_step_q4, 315 // const int16_t *filter_y, int y_step_q4, 316 // int w, int h); 317 // void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, 318 // uint8_t *dst, ptrdiff_t dst_stride, 319 // const int16_t *filter_x, int x_step_q4, 320 // const int16_t *filter_y, int y_step_q4, 321 // int w, int h); 322 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); 323 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); 324 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); 325 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); 326 327 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, 328 // uint8_t *dst, ptrdiff_t dst_stride, 329 // const int16_t *filter_x, int x_step_q4, 330 // const int16_t *filter_y, int y_step_q4, 331 // int w, int h); 332 // void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, 333 // uint8_t *dst, ptrdiff_t dst_stride, 334 // const int16_t *filter_x, int x_step_q4, 335 // const int16_t *filter_y, int y_step_q4, 336 // int w, int h); 337 FUN_CONV_2D(, sse2); 338 FUN_CONV_2D(avg_ , sse2); 339 #endif 340