/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"

/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1,
  15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3,
  13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5,
  11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
   9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,
   9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,
   7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,  7,  9,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,
   5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,  5, 11,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,
   3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,  3, 13,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
   1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,
   1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,  1, 15,
};
/* clang-format on */
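
// Each 32-byte row of bilinear_filters_avx2 holds one two-tap filter pair
// (16 - k, k), repeated 16 times so the row can be loaded directly as a
// __m256i for _mm256_maddubs_epi16. A sub-pel offset k in [0, 16) selects
// its row via k << 5 (k * 32 bytes); the taps always sum to 16, which is
// why FILTER_SRC below rounds with +8 and shifts right by 4.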

#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
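
// FILTER_SRC computes, per 16-bit lane, (p0 * f0 + p1 * f1 + 8) >> 4:
// _mm256_maddubs_epi16 multiplies each unsigned source byte by its signed
// filter tap and adds adjacent products, then the +8 / >> 4 is a rounded
// division by the tap sum of 16.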

#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average the current source with the source size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
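
// For the half-pel case (offset 8, taps 8/8), AVG_NEXT_SRC replaces the
// filter with _mm256_avg_epu8, whose (a + b + 1) >> 1 rounding matches the
// two-tap result (8 * a + 8 * b + 8) >> 4 exactly.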

#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
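
// MERGE_NEXT_SRC interleaves each source byte with the byte size_stride
// positions ahead of it, forming the (p0, p1) pairs that FILTER_SRC feeds
// to _mm256_maddubs_epi16.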

#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
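
// The sum of differences is accumulated in 16-bit lanes: each lane collects
// two differences per row, so for the heights handled here (up to 64) the
// magnitude stays below 128 * 255 = 32640 and cannot overflow int16_t. The
// squared differences are widened to 32-bit lanes by _mm256_madd_epi16.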

// final horizontal reduction of sum and sse
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
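
// In CALC_SUM_AND_SSE, the cmpgt-against-zero mask holds the sign bits of
// the 16-bit partial sums, so unpacking sum_reg with it sign-extends each
// sum to 32 bits. The shifts and adds that follow reduce the 32-bit lanes
// of both accumulators (and the upper 128-bit half) to single scalars.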

unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous and current packed rows
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}
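
// Illustrative only (not part of the original file): a fixed-size wrapper
// would combine the returned sum with the SSE written through 'sse' to get
// the variance. Sketch for a 32x32 block:
//
//   unsigned int sse;
//   const int sum = (int)aom_sub_pixel_variance32xh_avx2(
//       src, src_stride, xoffset, yoffset, dst, dst_stride, 32, &sse);
//   const unsigned int var =
//       sse - (unsigned int)(((int64_t)sum * sum) >> 10);
//
// where the >> 10 divides by the 32 * 32 = 1024 pixels in the block.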

unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);
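
  // This function differs from aom_sub_pixel_variance32xh_avx2 above only in
  // that the filtered source is additionally averaged with the second
  // predictor 'sec' before the difference against 'dst' is taken.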

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous and current packed rows
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}