1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <immintrin.h>  // AVX2
12 #include "vpx_ports/mem.h"
13 #include "vp9/encoder/vp9_variance.h"
14 
/*
 * 2-tap bilinear filter coefficients for the 16 sub-pixel offsets 0..15.
 * The entry for offset k is the byte pair (16 - k, k) repeated 16 times, i.e.
 * 32 bytes, so one 32-byte load yields a full vector of taps. The functions
 * in this file locate an entry with `offset << 5`. Each pair sums to 16,
 * which matches the "+ 8, >> 4" rounding performed by FILTER_SRC.
 */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
};
49 
#define FILTER_SRC(filter) \
  /* Apply the 2-tap filter to the interleaved byte pairs held in the */ \
  /* caller's exp_src_lo / exp_src_hi: multiply-accumulate each pair with */ \
  /* the taps, add the rounding constant pw8 and shift right by 4. */ \
  exp_src_lo = _mm256_srai_epi16( \
      _mm256_add_epi16(_mm256_maddubs_epi16(exp_src_lo, filter), pw8), 4); \
  exp_src_hi = _mm256_srai_epi16( \
      _mm256_add_epi16(_mm256_maddubs_epi16(exp_src_hi, filter), pw8), 4);
62 
/*
 * Interleave the bytes of src_reg with the bytes of reg and store the two
 * halves in the caller's exp_src_lo / exp_src_hi. Called with a zero register
 * as `reg` this widens every source byte to 16 bits; called with a second
 * pixel vector it builds the byte pairs that FILTER_SRC consumes.
 */
#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
66 
#define LOAD_SRC_DST \
  /* Load 32 bytes from the caller's src and dst pointers into src_reg */ \
  /* and dst_reg. Both loads are unaligned: the pointers are advanced by */ \
  /* arbitrary strides, so no 32-byte alignment can be assumed for */ \
  /* either of them. */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
71 
#define AVG_NEXT_SRC(src_reg, size_stride) \
  /* Load the 32 bytes located size_stride bytes after the caller's src */ \
  /* (1 = next pixel, src_stride = next row). size_stride is */ \
  /* parenthesized so that an expression argument is added as a whole. */ \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + (size_stride))); \
  /* rounded byte-wise average of the current and the neighbouring pixels */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
77 
#define MERGE_NEXT_SRC(src_reg, size_stride) \
  /* Load the 32 bytes located size_stride bytes after the caller's src */ \
  /* and interleave them with src_reg, ready for FILTER_SRC. */ \
  /* size_stride is parenthesized so that an expression argument is */ \
  /* added as a whole. */ \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + (size_stride))); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
82 
#define CALC_SUM_SSE_INSIDE_LOOP \
  /* widen the 32 destination bytes to 16-bit values */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* difference: source - destination, 16 bits per pixel */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* accumulate the differences; sum_reg keeps 16-bit lanes and every */ \
  /* lane receives two differences per invocation */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  /* square the differences and add adjacent pairs into 32-bit values */ \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* accumulate the squared differences in 32-bit lanes */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
98 
// Horizontal reduction of the two accumulators: folds the 32-bit lanes of
// sse_reg into *sse and the 16-bit lanes of sum_reg (sign-extended to 32 bits
// first) into the caller's `sum`.
#define CALC_SUM_AND_SSE \
  /* sse: fold the four 32-bit values of each 128-bit half into its */ \
  /* lowest element, then add the two halves */ \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  /* sum: res_cmp is all ones in every negative 16-bit lane, so */ \
  /* interleaving with it sign-extends the lanes to 32 bits */ \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  /* fold each 128-bit half into its lowest element, then add the halves */ \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
119 
120 
vp9_sub_pixel_variance32xh_avx2(const uint8_t * src,int src_stride,int x_offset,int y_offset,const uint8_t * dst,int dst_stride,int height,unsigned int * sse)121 unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
122                                              int src_stride,
123                                              int x_offset,
124                                              int y_offset,
125                                              const uint8_t *dst,
126                                              int dst_stride,
127                                              int height,
128                                              unsigned int *sse) {
129   __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
130   __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
131   __m256i zero_reg;
132   int i, sum;
133   sum_reg = _mm256_set1_epi16(0);
134   sse_reg = _mm256_set1_epi16(0);
135   zero_reg = _mm256_set1_epi16(0);
136 
137   // x_offset = 0 and y_offset = 0
138   if (x_offset == 0) {
139     if (y_offset == 0) {
140       for (i = 0; i < height ; i++) {
141         LOAD_SRC_DST
142         // expend each byte to 2 bytes
143         MERGE_WITH_SRC(src_reg, zero_reg)
144         CALC_SUM_SSE_INSIDE_LOOP
145         src+= src_stride;
146         dst+= dst_stride;
147       }
148     // x_offset = 0 and y_offset = 8
149     } else if (y_offset == 8) {
150       __m256i src_next_reg;
151       for (i = 0; i < height ; i++) {
152         LOAD_SRC_DST
153         AVG_NEXT_SRC(src_reg, src_stride)
154         // expend each byte to 2 bytes
155         MERGE_WITH_SRC(src_reg, zero_reg)
156         CALC_SUM_SSE_INSIDE_LOOP
157         src+= src_stride;
158         dst+= dst_stride;
159       }
160     // x_offset = 0 and y_offset = bilin interpolation
161     } else {
162       __m256i filter, pw8, src_next_reg;
163 
164       y_offset <<= 5;
165       filter = _mm256_load_si256((__m256i const *)
166                (bilinear_filters_avx2 + y_offset));
167       pw8 = _mm256_set1_epi16(8);
168       for (i = 0; i < height ; i++) {
169         LOAD_SRC_DST
170         MERGE_NEXT_SRC(src_reg, src_stride)
171         FILTER_SRC(filter)
172         CALC_SUM_SSE_INSIDE_LOOP
173         src+= src_stride;
174         dst+= dst_stride;
175       }
176     }
177   // x_offset = 8  and y_offset = 0
178   } else if (x_offset == 8) {
179     if (y_offset == 0) {
180       __m256i src_next_reg;
181       for (i = 0; i < height ; i++) {
182         LOAD_SRC_DST
183         AVG_NEXT_SRC(src_reg, 1)
184         // expand each byte to 2 bytes
185         MERGE_WITH_SRC(src_reg, zero_reg)
186         CALC_SUM_SSE_INSIDE_LOOP
187         src+= src_stride;
188         dst+= dst_stride;
189       }
190     // x_offset = 8  and y_offset = 8
191     } else if (y_offset == 8) {
192       __m256i src_next_reg, src_avg;
193       // load source and another source starting from the next
194       // following byte
195       src_reg = _mm256_loadu_si256((__m256i const *) (src));
196       AVG_NEXT_SRC(src_reg, 1)
197       for (i = 0; i < height ; i++) {
198         src_avg = src_reg;
199         src+= src_stride;
200         LOAD_SRC_DST
201         AVG_NEXT_SRC(src_reg, 1)
202         // average between previous average to current average
203         src_avg = _mm256_avg_epu8(src_avg, src_reg);
204         // expand each byte to 2 bytes
205         MERGE_WITH_SRC(src_avg, zero_reg)
206         // save current source average
207         CALC_SUM_SSE_INSIDE_LOOP
208         dst+= dst_stride;
209       }
210     // x_offset = 8  and y_offset = bilin interpolation
211     } else {
212       __m256i filter, pw8, src_next_reg, src_avg;
213       y_offset <<= 5;
214       filter = _mm256_load_si256((__m256i const *)
215                (bilinear_filters_avx2 + y_offset));
216       pw8 = _mm256_set1_epi16(8);
217       // load source and another source starting from the next
218       // following byte
219       src_reg = _mm256_loadu_si256((__m256i const *) (src));
220       AVG_NEXT_SRC(src_reg, 1)
221       for (i = 0; i < height ; i++) {
222         // save current source average
223         src_avg = src_reg;
224         src+= src_stride;
225         LOAD_SRC_DST
226         AVG_NEXT_SRC(src_reg, 1)
227         MERGE_WITH_SRC(src_avg, src_reg)
228         FILTER_SRC(filter)
229         CALC_SUM_SSE_INSIDE_LOOP
230         dst+= dst_stride;
231       }
232     }
233   // x_offset = bilin interpolation and y_offset = 0
234   } else {
235     if (y_offset == 0) {
236       __m256i filter, pw8, src_next_reg;
237       x_offset <<= 5;
238       filter = _mm256_load_si256((__m256i const *)
239                (bilinear_filters_avx2 + x_offset));
240       pw8 = _mm256_set1_epi16(8);
241       for (i = 0; i < height ; i++) {
242         LOAD_SRC_DST
243         MERGE_NEXT_SRC(src_reg, 1)
244         FILTER_SRC(filter)
245         CALC_SUM_SSE_INSIDE_LOOP
246         src+= src_stride;
247         dst+= dst_stride;
248       }
249     // x_offset = bilin interpolation and y_offset = 8
250     } else if (y_offset == 8) {
251       __m256i filter, pw8, src_next_reg, src_pack;
252       x_offset <<= 5;
253       filter = _mm256_load_si256((__m256i const *)
254                (bilinear_filters_avx2 + x_offset));
255       pw8 = _mm256_set1_epi16(8);
256       src_reg = _mm256_loadu_si256((__m256i const *) (src));
257       MERGE_NEXT_SRC(src_reg, 1)
258       FILTER_SRC(filter)
259       // convert each 16 bit to 8 bit to each low and high lane source
260       src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
261       for (i = 0; i < height ; i++) {
262         src+= src_stride;
263         LOAD_SRC_DST
264         MERGE_NEXT_SRC(src_reg, 1)
265         FILTER_SRC(filter)
266         src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
267         // average between previous pack to the current
268         src_pack = _mm256_avg_epu8(src_pack, src_reg);
269         MERGE_WITH_SRC(src_pack, zero_reg)
270         CALC_SUM_SSE_INSIDE_LOOP
271         src_pack = src_reg;
272         dst+= dst_stride;
273       }
274     // x_offset = bilin interpolation and y_offset = bilin interpolation
275     } else {
276       __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
277       x_offset <<= 5;
278       xfilter = _mm256_load_si256((__m256i const *)
279                 (bilinear_filters_avx2 + x_offset));
280       y_offset <<= 5;
281       yfilter = _mm256_load_si256((__m256i const *)
282                 (bilinear_filters_avx2 + y_offset));
283       pw8 = _mm256_set1_epi16(8);
284       // load source and another source starting from the next
285       // following byte
286       src_reg = _mm256_loadu_si256((__m256i const *) (src));
287       MERGE_NEXT_SRC(src_reg, 1)
288 
289       FILTER_SRC(xfilter)
290       // convert each 16 bit to 8 bit to each low and high lane source
291       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
292       for (i = 0; i < height ; i++) {
293         src+= src_stride;
294         LOAD_SRC_DST
295         MERGE_NEXT_SRC(src_reg, 1)
296         FILTER_SRC(xfilter)
297         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
298         // merge previous pack to current pack source
299         MERGE_WITH_SRC(src_pack, src_reg)
300         // filter the source
301         FILTER_SRC(yfilter)
302         src_pack = src_reg;
303         CALC_SUM_SSE_INSIDE_LOOP
304         dst+= dst_stride;
305       }
306     }
307   }
308   CALC_SUM_AND_SSE
309   return sum;
310 }
311 
vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t * src,int src_stride,int x_offset,int y_offset,const uint8_t * dst,int dst_stride,const uint8_t * sec,int sec_stride,int height,unsigned int * sse)312 unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
313                                              int src_stride,
314                                              int x_offset,
315                                              int y_offset,
316                                              const uint8_t *dst,
317                                              int dst_stride,
318                                              const uint8_t *sec,
319                                              int sec_stride,
320                                              int height,
321                                              unsigned int *sse) {
322   __m256i sec_reg;
323   __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
324   __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
325   __m256i zero_reg;
326   int i, sum;
327   sum_reg = _mm256_set1_epi16(0);
328   sse_reg = _mm256_set1_epi16(0);
329   zero_reg = _mm256_set1_epi16(0);
330 
331   // x_offset = 0 and y_offset = 0
332   if (x_offset == 0) {
333     if (y_offset == 0) {
334       for (i = 0; i < height ; i++) {
335         LOAD_SRC_DST
336         sec_reg = _mm256_load_si256((__m256i const *) (sec));
337         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
338         sec+= sec_stride;
339         // expend each byte to 2 bytes
340         MERGE_WITH_SRC(src_reg, zero_reg)
341         CALC_SUM_SSE_INSIDE_LOOP
342         src+= src_stride;
343         dst+= dst_stride;
344       }
345     } else if (y_offset == 8) {
346       __m256i src_next_reg;
347       for (i = 0; i < height ; i++) {
348         LOAD_SRC_DST
349         AVG_NEXT_SRC(src_reg, src_stride)
350         sec_reg = _mm256_load_si256((__m256i const *) (sec));
351         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
352         sec+= sec_stride;
353         // expend each byte to 2 bytes
354         MERGE_WITH_SRC(src_reg, zero_reg)
355         CALC_SUM_SSE_INSIDE_LOOP
356         src+= src_stride;
357         dst+= dst_stride;
358       }
359     // x_offset = 0 and y_offset = bilin interpolation
360     } else {
361       __m256i filter, pw8, src_next_reg;
362 
363       y_offset <<= 5;
364       filter = _mm256_load_si256((__m256i const *)
365                  (bilinear_filters_avx2 + y_offset));
366       pw8 = _mm256_set1_epi16(8);
367       for (i = 0; i < height ; i++) {
368         LOAD_SRC_DST
369         MERGE_NEXT_SRC(src_reg, src_stride)
370         FILTER_SRC(filter)
371         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
372         sec_reg = _mm256_load_si256((__m256i const *) (sec));
373         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
374         sec+= sec_stride;
375         MERGE_WITH_SRC(src_reg, zero_reg)
376         CALC_SUM_SSE_INSIDE_LOOP
377         src+= src_stride;
378         dst+= dst_stride;
379       }
380     }
381   // x_offset = 8  and y_offset = 0
382   } else if (x_offset == 8) {
383     if (y_offset == 0) {
384       __m256i src_next_reg;
385       for (i = 0; i < height ; i++) {
386         LOAD_SRC_DST
387         AVG_NEXT_SRC(src_reg, 1)
388         sec_reg = _mm256_load_si256((__m256i const *) (sec));
389         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
390         sec+= sec_stride;
391         // expand each byte to 2 bytes
392         MERGE_WITH_SRC(src_reg, zero_reg)
393         CALC_SUM_SSE_INSIDE_LOOP
394         src+= src_stride;
395         dst+= dst_stride;
396       }
397     // x_offset = 8  and y_offset = 8
398     } else if (y_offset == 8) {
399       __m256i src_next_reg, src_avg;
400       // load source and another source starting from the next
401       // following byte
402       src_reg = _mm256_loadu_si256((__m256i const *) (src));
403       AVG_NEXT_SRC(src_reg, 1)
404       for (i = 0; i < height ; i++) {
405         // save current source average
406         src_avg = src_reg;
407         src+= src_stride;
408         LOAD_SRC_DST
409         AVG_NEXT_SRC(src_reg, 1)
410         // average between previous average to current average
411         src_avg = _mm256_avg_epu8(src_avg, src_reg);
412         sec_reg = _mm256_load_si256((__m256i const *) (sec));
413         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
414         sec+= sec_stride;
415         // expand each byte to 2 bytes
416         MERGE_WITH_SRC(src_avg, zero_reg)
417         CALC_SUM_SSE_INSIDE_LOOP
418         dst+= dst_stride;
419       }
420     // x_offset = 8  and y_offset = bilin interpolation
421     } else {
422       __m256i filter, pw8, src_next_reg, src_avg;
423       y_offset <<= 5;
424       filter = _mm256_load_si256((__m256i const *)
425                (bilinear_filters_avx2 + y_offset));
426       pw8 = _mm256_set1_epi16(8);
427       // load source and another source starting from the next
428       // following byte
429       src_reg = _mm256_loadu_si256((__m256i const *) (src));
430       AVG_NEXT_SRC(src_reg, 1)
431       for (i = 0; i < height ; i++) {
432         // save current source average
433         src_avg = src_reg;
434         src+= src_stride;
435         LOAD_SRC_DST
436         AVG_NEXT_SRC(src_reg, 1)
437         MERGE_WITH_SRC(src_avg, src_reg)
438         FILTER_SRC(filter)
439         src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
440         sec_reg = _mm256_load_si256((__m256i const *) (sec));
441         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
442         // expand each byte to 2 bytes
443         MERGE_WITH_SRC(src_avg, zero_reg)
444         sec+= sec_stride;
445         CALC_SUM_SSE_INSIDE_LOOP
446         dst+= dst_stride;
447       }
448     }
449   // x_offset = bilin interpolation and y_offset = 0
450   } else {
451     if (y_offset == 0) {
452       __m256i filter, pw8, src_next_reg;
453       x_offset <<= 5;
454       filter = _mm256_load_si256((__m256i const *)
455                (bilinear_filters_avx2 + x_offset));
456       pw8 = _mm256_set1_epi16(8);
457       for (i = 0; i < height ; i++) {
458         LOAD_SRC_DST
459         MERGE_NEXT_SRC(src_reg, 1)
460         FILTER_SRC(filter)
461         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
462         sec_reg = _mm256_load_si256((__m256i const *) (sec));
463         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
464         MERGE_WITH_SRC(src_reg, zero_reg)
465         sec+= sec_stride;
466         CALC_SUM_SSE_INSIDE_LOOP
467         src+= src_stride;
468         dst+= dst_stride;
469       }
470     // x_offset = bilin interpolation and y_offset = 8
471     } else if (y_offset == 8) {
472       __m256i filter, pw8, src_next_reg, src_pack;
473       x_offset <<= 5;
474       filter = _mm256_load_si256((__m256i const *)
475                (bilinear_filters_avx2 + x_offset));
476       pw8 = _mm256_set1_epi16(8);
477       src_reg = _mm256_loadu_si256((__m256i const *) (src));
478       MERGE_NEXT_SRC(src_reg, 1)
479       FILTER_SRC(filter)
480       // convert each 16 bit to 8 bit to each low and high lane source
481       src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
482       for (i = 0; i < height ; i++) {
483         src+= src_stride;
484         LOAD_SRC_DST
485         MERGE_NEXT_SRC(src_reg, 1)
486         FILTER_SRC(filter)
487         src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
488         // average between previous pack to the current
489         src_pack = _mm256_avg_epu8(src_pack, src_reg);
490         sec_reg = _mm256_load_si256((__m256i const *) (sec));
491         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
492         sec+= sec_stride;
493         MERGE_WITH_SRC(src_pack, zero_reg)
494         src_pack = src_reg;
495         CALC_SUM_SSE_INSIDE_LOOP
496         dst+= dst_stride;
497       }
498     // x_offset = bilin interpolation and y_offset = bilin interpolation
499     } else {
500       __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
501       x_offset <<= 5;
502       xfilter = _mm256_load_si256((__m256i const *)
503                 (bilinear_filters_avx2 + x_offset));
504       y_offset <<= 5;
505       yfilter = _mm256_load_si256((__m256i const *)
506                 (bilinear_filters_avx2 + y_offset));
507       pw8 = _mm256_set1_epi16(8);
508       // load source and another source starting from the next
509       // following byte
510       src_reg = _mm256_loadu_si256((__m256i const *) (src));
511       MERGE_NEXT_SRC(src_reg, 1)
512 
513       FILTER_SRC(xfilter)
514       // convert each 16 bit to 8 bit to each low and high lane source
515       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
516       for (i = 0; i < height ; i++) {
517         src+= src_stride;
518         LOAD_SRC_DST
519         MERGE_NEXT_SRC(src_reg, 1)
520         FILTER_SRC(xfilter)
521         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
522         // merge previous pack to current pack source
523         MERGE_WITH_SRC(src_pack, src_reg)
524         // filter the source
525         FILTER_SRC(yfilter)
526         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
527         sec_reg = _mm256_load_si256((__m256i const *) (sec));
528         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
529         MERGE_WITH_SRC(src_pack, zero_reg)
530         src_pack = src_reg;
531         sec+= sec_stride;
532         CALC_SUM_SSE_INSIDE_LOOP
533         dst+= dst_stride;
534       }
535     }
536   }
537   CALC_SUM_AND_SSE
538   return sum;
539 }
540