1 /*
2  *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <xmmintrin.h>
13 
14 #include "./vp8_rtcd.h"
15 #include "./vpx_config.h"
16 #include "vp8/common/filter.h"
17 #include "vpx_dsp/x86/mem_sse2.h"
18 #include "vpx_ports/mem.h"
19 
horizontal_16x16(uint8_t * src,const int stride,uint16_t * dst,const int xoffset)20 static INLINE void horizontal_16x16(uint8_t *src, const int stride,
21                                     uint16_t *dst, const int xoffset) {
22   int h;
23   const __m128i zero = _mm_setzero_si128();
24 
25   if (xoffset == 0) {
26     for (h = 0; h < 17; ++h) {
27       const __m128i a = _mm_loadu_si128((__m128i *)src);
28       const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
29       const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
30       _mm_store_si128((__m128i *)dst, a_lo);
31       _mm_store_si128((__m128i *)(dst + 8), a_hi);
32       src += stride;
33       dst += 16;
34     }
35     return;
36   }
37 
38   {
39     const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
40     const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
41     const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
42 
43     for (h = 0; h < 17; ++h) {
44       const __m128i a = _mm_loadu_si128((__m128i *)src);
45       const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
46       const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
47       const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0);
48       const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0);
49 
50       const __m128i b = _mm_loadu_si128((__m128i *)(src + 1));
51       const __m128i b_lo = _mm_unpacklo_epi8(b, zero);
52       const __m128i b_hi = _mm_unpackhi_epi8(b, zero);
53       const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1);
54       const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1);
55 
56       const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered);
57       const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered);
58 
59       const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
60       const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);
61 
62       const __m128i shifted_lo =
63           _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
64       const __m128i shifted_hi =
65           _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);
66 
67       _mm_store_si128((__m128i *)dst, shifted_lo);
68       _mm_store_si128((__m128i *)(dst + 8), shifted_hi);
69       src += stride;
70       dst += 16;
71     }
72   }
73 }
74 
vertical_16x16(uint16_t * src,uint8_t * dst,const int stride,const int yoffset)75 static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
76                                   const int yoffset) {
77   int h;
78 
79   if (yoffset == 0) {
80     for (h = 0; h < 16; ++h) {
81       const __m128i row_lo = _mm_load_si128((__m128i *)src);
82       const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8));
83       const __m128i packed = _mm_packus_epi16(row_lo, row_hi);
84       _mm_store_si128((__m128i *)dst, packed);
85       src += 16;
86       dst += stride;
87     }
88     return;
89   }
90 
91   {
92     const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
93     const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
94     const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
95 
96     __m128i row_0_lo = _mm_load_si128((__m128i *)src);
97     __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8));
98     src += 16;
99     for (h = 0; h < 16; ++h) {
100       const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0);
101       const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0);
102 
103       const __m128i row_1_lo = _mm_load_si128((__m128i *)src);
104       const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8));
105       const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1);
106       const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1);
107 
108       const __m128i sum_lo =
109           _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered);
110       const __m128i sum_hi =
111           _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered);
112 
113       const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
114       const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);
115 
116       const __m128i shifted_lo =
117           _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
118       const __m128i shifted_hi =
119           _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);
120 
121       const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi);
122       _mm_store_si128((__m128i *)dst, packed);
123       row_0_lo = row_1_lo;
124       row_0_hi = row_1_hi;
125       src += 16;
126       dst += stride;
127     }
128   }
129 }
130 
vp8_bilinear_predict16x16_sse2(uint8_t * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,uint8_t * dst_ptr,int dst_pitch)131 void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
132                                     int xoffset, int yoffset, uint8_t *dst_ptr,
133                                     int dst_pitch) {
134   DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]);
135 
136   assert((xoffset | yoffset) != 0);
137 
138   horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset);
139 
140   vertical_16x16(FData, dst_ptr, dst_pitch, yoffset);
141 }
142 
horizontal_8xN(uint8_t * src,const int stride,uint16_t * dst,const int xoffset,const int height)143 static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
144                                   const int xoffset, const int height) {
145   int h;
146   const __m128i zero = _mm_setzero_si128();
147 
148   if (xoffset == 0) {
149     for (h = 0; h < height; ++h) {
150       const __m128i a = _mm_loadl_epi64((__m128i *)src);
151       const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
152       _mm_store_si128((__m128i *)dst, a_u16);
153       src += stride;
154       dst += 8;
155     }
156     return;
157   }
158 
159   {
160     const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
161     const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
162     const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
163 
164     // Filter horizontally. Rather than load the whole array and transpose, load
165     // 16 values (overreading) and shift to set up the second value. Do an
166     // "extra" 9th line so the vertical pass has the necessary context.
167     for (h = 0; h < height; ++h) {
168       const __m128i a = _mm_loadu_si128((__m128i *)src);
169       const __m128i b = _mm_srli_si128(a, 1);
170       const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
171       const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
172       const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
173       const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
174       const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
175       const __m128i compensated = _mm_add_epi16(sum, round_factor);
176       const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
177       _mm_store_si128((__m128i *)dst, shifted);
178       src += stride;
179       dst += 8;
180     }
181   }
182 }
183 
vertical_8xN(uint16_t * src,uint8_t * dst,const int stride,const int yoffset,const int height)184 static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
185                                 const int yoffset, const int height) {
186   int h;
187 
188   if (yoffset == 0) {
189     for (h = 0; h < height; ++h) {
190       const __m128i row = _mm_load_si128((__m128i *)src);
191       const __m128i packed = _mm_packus_epi16(row, row);
192       _mm_storel_epi64((__m128i *)dst, packed);
193       src += 8;
194       dst += stride;
195     }
196     return;
197   }
198 
199   {
200     const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
201     const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
202     const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
203 
204     __m128i row_0 = _mm_load_si128((__m128i *)src);
205     src += 8;
206     for (h = 0; h < height; ++h) {
207       const __m128i row_1 = _mm_load_si128((__m128i *)src);
208       const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
209       const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
210       const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
211       const __m128i compensated = _mm_add_epi16(sum, round_factor);
212       const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
213       const __m128i packed = _mm_packus_epi16(shifted, shifted);
214       _mm_storel_epi64((__m128i *)dst, packed);
215       row_0 = row_1;
216       src += 8;
217       dst += stride;
218     }
219   }
220 }
221 
vp8_bilinear_predict8x8_sse2(uint8_t * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,uint8_t * dst_ptr,int dst_pitch)222 void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
223                                   int xoffset, int yoffset, uint8_t *dst_ptr,
224                                   int dst_pitch) {
225   DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]);
226 
227   assert((xoffset | yoffset) != 0);
228 
229   horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9);
230 
231   vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8);
232 }
233 
vp8_bilinear_predict8x4_sse2(uint8_t * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,uint8_t * dst_ptr,int dst_pitch)234 void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
235                                   int xoffset, int yoffset, uint8_t *dst_ptr,
236                                   int dst_pitch) {
237   DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]);
238 
239   assert((xoffset | yoffset) != 0);
240 
241   horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);
242 
243   vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
244 }
245 
horizontal_4x4(uint8_t * src,const int stride,uint16_t * dst,const int xoffset)246 static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
247                                   const int xoffset) {
248   int h;
249   const __m128i zero = _mm_setzero_si128();
250 
251   if (xoffset == 0) {
252     for (h = 0; h < 5; ++h) {
253       const __m128i a = load_unaligned_u32(src);
254       const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
255       _mm_storel_epi64((__m128i *)dst, a_u16);
256       src += stride;
257       dst += 4;
258     }
259     return;
260   }
261 
262   {
263     const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
264     const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
265     const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
266 
267     for (h = 0; h < 5; ++h) {
268       const __m128i a = load_unaligned_u32(src);
269       const __m128i b = load_unaligned_u32(src + 1);
270       const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
271       const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
272       const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
273       const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
274       const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
275       const __m128i compensated = _mm_add_epi16(sum, round_factor);
276       const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
277       _mm_storel_epi64((__m128i *)dst, shifted);
278       src += stride;
279       dst += 4;
280     }
281   }
282 }
283 
vertical_4x4(uint16_t * src,uint8_t * dst,const int stride,const int yoffset)284 static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
285                                 const int yoffset) {
286   int h;
287 
288   if (yoffset == 0) {
289     for (h = 0; h < 4; h += 2) {
290       const __m128i row = _mm_load_si128((__m128i *)src);
291       __m128i packed = _mm_packus_epi16(row, row);
292       store_unaligned_u32(dst, packed);
293       dst += stride;
294       packed = _mm_srli_si128(packed, 4);
295       store_unaligned_u32(dst, packed);
296       dst += stride;
297       src += 8;
298     }
299     return;
300   }
301 
302   {
303     const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
304     const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
305     const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
306 
307     for (h = 0; h < 4; h += 2) {
308       const __m128i row_0 = _mm_load_si128((__m128i *)src);
309       const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
310       const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
311       const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
312       const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
313       const __m128i compensated = _mm_add_epi16(sum, round_factor);
314       const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
315       __m128i packed = _mm_packus_epi16(shifted, shifted);
316       storeu_uint32(dst, _mm_cvtsi128_si32(packed));
317       packed = _mm_srli_si128(packed, 4);
318       dst += stride;
319       storeu_uint32(dst, _mm_cvtsi128_si32(packed));
320       dst += stride;
321       src += 8;
322     }
323   }
324 }
325 
vp8_bilinear_predict4x4_sse2(uint8_t * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,uint8_t * dst_ptr,int dst_pitch)326 void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
327                                   int xoffset, int yoffset, uint8_t *dst_ptr,
328                                   int dst_pitch) {
329   DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]);
330 
331   assert((xoffset | yoffset) != 0);
332 
333   horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);
334 
335   vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
336 }
337