/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"

DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
};
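// Each 32-byte row of the table above is one bilinear filter phase k
// (k = 0..15): the byte pair (16 - k, k) repeated sixteen times, so a row can
// be used directly as the second operand of _mm256_maddubs_epi16 against
// interleaved pixel pairs. An offset selects its row by byte index
// (offset << 5), i.e. offset * 32.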

#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
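// Per 16-bit lane, FILTER_SRC computes the rounded bilinear blend
//   out = (p0 * (16 - k) + p1 * k + 8) >> 4
// where (p0, p1) are the two interleaved source bytes and k is the filter
// phase selected from bilinear_filters_avx2.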

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
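// Interleaves the bytes of src_reg and reg into 16-bit units (per 128-bit
// lane). With reg == zero_reg this zero-extends each byte to 16 bits; with a
// second pixel row or column it builds the (p0, p1) pairs FILTER_SRC expects.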

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_load_si256((__m256i const *) (dst));
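// The source pointer may sit at any sub-pel position, so it is read with an
// unaligned load; dst is read with an aligned load and must be 32-byte
// aligned.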

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  /* average the current source with the source size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
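// Half-pel case: a rounding average ((a + b + 1) >> 1) of the current 32
// source bytes with the 32 bytes size_stride bytes ahead, one byte ahead for
// horizontal half-pel and one stride ahead for vertical half-pel. This is
// equivalent to FILTER_SRC with the (8, 8) phase.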

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
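// Like AVG_NEXT_SRC, but instead of averaging it interleaves the current
// source bytes with the bytes size_stride ahead so FILTER_SRC can apply an
// arbitrary filter phase.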

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
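// Differences are accumulated in the sixteen 16-bit lanes of sum_reg (two
// values per lane per row), while madd(diff, diff) squares and pairwise-adds
// them so sse_reg holds eight 32-bit partial sums of squares. With
// |diff| <= 255 and at most 64 rows per call (32-wide blocks up to 32x64),
// the 16-bit accumulation cannot overflow.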

// final calculation of sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                 _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
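// CALC_SUM_AND_SSE sign-extends the 16-bit sums to 32 bits by interleaving
// them with the 0/-1 mask from cmpgt, reduces sum and sse horizontally inside
// each 128-bit lane with byte shifts and adds, then adds the two lane totals
// via the cast/extract pair. It leaves the signed difference total in `sum`
// and stores the sum of squared differences through *sse.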
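// Computes, for a 32-pixel-wide block of `height` rows, the sum of
// differences and the sum of squared differences between the sub-pel
// interpolated source and dst. x_offset and y_offset select the phase in each
// direction: 0 means no filtering, 8 is the half-pel rounding average, and
// any other value indexes the 16-phase bilinear table above. Roughly, per
// pixel (with xk/yk the two phases, and the horizontal result rounded back to
// 8 bits before the vertical pass):
//   h0 = (src[x] * (16 - xk) + src[x + 1] * xk + 8) >> 4
//   h1 = the same for the next row
//   v  = (h0 * (16 - yk) + h1 * yk + 8) >> 4
//   diff = v - dst[x];  sum += diff;  sse += diff * diff;
// Returns the signed sum and writes the SSE through *sse; a caller typically
// derives the variance from these two values.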
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}

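// Same as vp9_sub_pixel_variance32xh_avx2, except that the filtered source is
// additionally averaged (with rounding) against the 32-byte-aligned second
// predictor `sec` before the difference with dst is taken.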
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}