/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>
#include "vpx_ports/mem.h"

// filters for 16_h8 and 16_v8
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
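// Each table above is a _mm256_shuffle_epi8 mask: the byte indices are listed
// in adjacent pairs, so shuffling a row of source pixels with it lines up two
// neighbouring pixels in every 16 bit lane. _mm256_maddubs_epi16 can then
// multiply each pixel pair by the matching pair of filter taps and sum the
// two products in one instruction. The 16-entry pattern is repeated in both
// 128-bit halves because the AVX2 byte shuffle cannot cross lanes.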

#if defined(__clang__)
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# else  // clang > 3.3
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // clang <= 3.3
#elif defined(__GNUC__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
# else  // gcc > 4.7
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // gcc <= 4.6
#else  // !(gcc || clang)
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
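// The macro above papers over a compiler quirk: early clang (<= 3.3) and gcc
// (<= 4.6) exposed the vbroadcasti128 intrinsic as _mm_broadcastsi128_si256
// taking a pointer, gcc 4.7 kept that legacy name but with the by-value
// signature, and everything newer uses the standard
// _mm256_broadcastsi128_si256.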

void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with sixteen 16 bit words of value 64 (rounding offset)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) filter taps to 8 bit (byte) and keep the
  // same data in both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                                     _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                                      _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                                     _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                                     _mm256_set1_epi16(0x706u));
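
  // After this setup every 16 bit lane of firstFilters holds taps (0,1),
  // secondFilters taps (2,3), thirdFilters taps (4,5) and forthFilters taps
  // (6,7), matching the pixel pairs produced by the filtN shuffle masks, so
  // one _mm256_maddubs_epi16 per pair computes tap*pixel + tap*pixel.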

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr + src_pixels_per_line - 3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
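    // The partial sums can exceed the 16 bit range, so the smaller of the
    // two inner partial sums is added before the larger: with saturating
    // adds this ordering avoids a premature positive overflow that a
    // negative partial sum could no longer undo, keeping the result
    // consistent with the reference implementation.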

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr + src_pixels_per_line + 5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);
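
    // The vp9 sub-pixel filter taps sum to 128, so adding 64 and
    // arithmetic-shifting right by 7 rounds the convolution to the nearest
    // integer before the unsigned pack clamps each value to 0..255.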

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if the number of output rows is odd, process the last row (16 bytes)
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first half contains the first
    // convolve result and the second half contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with sixteen 16 bit words of value 64 (rounding offset)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) filter taps to 8 bit (byte) and keep the
  // same data in both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                                     _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                                      _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                                     _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                                     _mm256_set1_epi16(0x706u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)));
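
  // An 8-tap vertical filter needs 8 consecutive rows per output row; rows
  // 0 to 6 are loaded once here, so each loop iteration below only has to
  // fetch the two new rows of its window.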

  // put every two consecutive loads in the same 256 bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);
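
  // srcReg32bN now holds row N-1 in its low lane and row N in its high
  // lane, so the low lanes compute the even output row and the high lanes
  // compute the odd output row in parallel.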

  // merge every two consecutive registers except the last one; the
  // interleaved rows are carried across loop iterations by the register
  // rotation at the bottom of the loop
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
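
  // After the unpack, each 16 bit lane holds two vertically adjacent pixels
  // (row k above row k+1), which is exactly the operand layout
  // _mm256_maddubs_epi16 needs to apply one pair of filter taps per lane.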

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                 _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                 _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
    srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_min_epi16(srcReg32b6, srcReg32b13));

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_max_epi16(srcReg32b6, srcReg32b13));

    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // slide the row window down two rows for the next iteration
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
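  // if the number of output rows is odd, compute the final row with 128 bit
  // operations on the low lanes of the registers carried out of the loop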
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits, the first half contains the first
    // convolve result and the second half contains the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}
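
// A minimal, hypothetical smoke test for the horizontal path, compiled only
// when VP9_AVX2_FILTER_DEMO is defined (the guard macro, buffer names and
// expected values below are illustrative, not part of libvpx). The taps
// {0, 0, 0, 64, 64, 0, 0, 0} sum to 128, so each output pixel should be the
// rounded average of source pixels x and x+1.
#ifdef VP9_AVX2_FILTER_DEMO
#include <stdio.h>

int main(void) {
  DECLARE_ALIGNED(16, int16_t, taps[8]) = { 0, 0, 0, 64, 64, 0, 0, 0 };
  // two rows of 32 bytes: the filter reads bytes x-3 .. x+20 around each
  // row start, so src_ptr is pointed 3 bytes into the row
  DECLARE_ALIGNED(16, unsigned char, src[2 * 32]);
  // destination of _mm_store_si128 must be 16 byte aligned
  DECLARE_ALIGNED(16, unsigned char, dst[2 * 16]);
  unsigned int c;
  for (c = 0; c < 2 * 32; ++c)
    src[c] = (unsigned char)c;  // ramp: pixel value equals its index
  vp9_filter_block1d16_h8_avx2(src + 3, 32, dst, 16, 2, taps);
  for (c = 0; c < 16; ++c)
    printf("%d ", dst[c]);  // row 0: expect 4, 5, ..., 19
  printf("\n");
  return 0;
}
#endif  // VP9_AVX2_FILTER_DEMO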