1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <emmintrin.h>
12 #include <assert.h>
13 
14 #include "config/aom_dsp_rtcd.h"
15 
16 #include "aom_dsp/aom_filter.h"
17 
// Copies one row of 64 16-bit samples (eight 128-bit vectors) from src to
// dst. Every vector is loaded before the first one is stored. The loads are
// unaligned; the stores use the aligned form, so dst must be 16-byte aligned.
static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
  const __m128i *const in = (const __m128i *)src;
  __m128i *const out = (__m128i *)dst;

  const __m128i v0 = _mm_loadu_si128(in + 0);
  const __m128i v1 = _mm_loadu_si128(in + 1);
  const __m128i v2 = _mm_loadu_si128(in + 2);
  const __m128i v3 = _mm_loadu_si128(in + 3);
  const __m128i v4 = _mm_loadu_si128(in + 4);
  const __m128i v5 = _mm_loadu_si128(in + 5);
  const __m128i v6 = _mm_loadu_si128(in + 6);
  const __m128i v7 = _mm_loadu_si128(in + 7);

  _mm_store_si128(out + 0, v0);
  _mm_store_si128(out + 1, v1);
  _mm_store_si128(out + 2, v2);
  _mm_store_si128(out + 3, v3);
  _mm_store_si128(out + 4, v4);
  _mm_store_si128(out + 5, v5);
  _mm_store_si128(out + 6, v6);
  _mm_store_si128(out + 7, v7);
}
37 
// Copies one row of 128 16-bit samples (sixteen 128-bit vectors) from src to
// dst. Every vector is loaded before the first one is stored. The loads are
// unaligned; the stores use the aligned form, so dst must be 16-byte aligned.
static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
  const __m128i *const in = (const __m128i *)src;
  __m128i *const out = (__m128i *)dst;
  __m128i v[16];

  v[0] = _mm_loadu_si128(in + 0);
  v[1] = _mm_loadu_si128(in + 1);
  v[2] = _mm_loadu_si128(in + 2);
  v[3] = _mm_loadu_si128(in + 3);
  v[4] = _mm_loadu_si128(in + 4);
  v[5] = _mm_loadu_si128(in + 5);
  v[6] = _mm_loadu_si128(in + 6);
  v[7] = _mm_loadu_si128(in + 7);
  v[8] = _mm_loadu_si128(in + 8);
  v[9] = _mm_loadu_si128(in + 9);
  v[10] = _mm_loadu_si128(in + 10);
  v[11] = _mm_loadu_si128(in + 11);
  v[12] = _mm_loadu_si128(in + 12);
  v[13] = _mm_loadu_si128(in + 13);
  v[14] = _mm_loadu_si128(in + 14);
  v[15] = _mm_loadu_si128(in + 15);

  _mm_store_si128(out + 0, v[0]);
  _mm_store_si128(out + 1, v[1]);
  _mm_store_si128(out + 2, v[2]);
  _mm_store_si128(out + 3, v[3]);
  _mm_store_si128(out + 4, v[4]);
  _mm_store_si128(out + 5, v[5]);
  _mm_store_si128(out + 6, v[6]);
  _mm_store_si128(out + 7, v[7]);
  _mm_store_si128(out + 8, v[8]);
  _mm_store_si128(out + 9, v[9]);
  _mm_store_si128(out + 10, v[10]);
  _mm_store_si128(out + 11, v[11]);
  _mm_store_si128(out + 12, v[12]);
  _mm_store_si128(out + 13, v[13]);
  _mm_store_si128(out + 14, v[14]);
  _mm_store_si128(out + 15, v[15]);
}
73 
// Unfiltered ("copy") high-bitdepth predictor: copies a w x h block of
// 16-bit samples from src to dst.
//
// src, src_stride: source block and its row pitch in uint16_t elements; rows
//                  are read with unaligned loads.
// dst, dst_stride: destination block and its row pitch in elements. For
//                  w >= 16 the rows are written with aligned stores, so dst
//                  must be 16-byte aligned and dst_stride a multiple of 16
//                  (both asserted).
// w:               block width; 2, 4, 8, 16, 32 and 64 have dedicated paths
//                  and the remaining path copies 128 samples per row.
// h:               block height; rows are copied two at a time, so it must
//                  be a positive even number.
// The filter, sub-pixel, convolve-parameter and bit-depth arguments are
// accepted but not used by this function.
void av1_highbd_convolve_2d_copy_sr_sse2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;
  (void)bd;

  // Each loop below consumes two rows per iteration and stops only when h
  // reaches exactly zero, so an odd or non-positive height would never
  // terminate.
  assert(h > 0 && (h & 1) == 0);
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2) {
    do {
      // Plain element copies: storing through a uint32_t lvalue would break
      // strict aliasing (and may be misaligned), and a 64-bit vector load
      // would read two samples beyond the 2-wide row.
      uint16_t p0 = src[0];
      uint16_t p1 = src[1];
      dst[0] = p0;
      dst[1] = p1;
      src += src_stride;
      dst += dst_stride;
      p0 = src[0];
      p1 = src[1];
      dst[0] = p0;
      dst[1] = p1;
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      // Four samples are exactly one 64-bit half-vector.
      const __m128i row0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      const __m128i row1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, row0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, row1);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      const __m128i row0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      const __m128i row1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      // Unaligned stores: the alignment asserts above only apply to w >= 16,
      // so nothing guarantees a 16-byte aligned dst on this path.
      _mm_storeu_si128((__m128i *)dst, row0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, row1);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      const __m128i a0 = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      const __m128i a1 = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      src += src_stride;
      const __m128i b0 = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      const __m128i b1 = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), a0);
      _mm_store_si128((__m128i *)(dst + 1 * 8), a1);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), b0);
      _mm_store_si128((__m128i *)(dst + 1 * 8), b1);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m128i top[4];
      __m128i bot[4];
      top[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      top[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      top[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
      top[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
      src += src_stride;
      bot[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      bot[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      bot[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
      bot[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), top[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), top[1]);
      _mm_store_si128((__m128i *)(dst + 2 * 8), top[2]);
      _mm_store_si128((__m128i *)(dst + 3 * 8), top[3]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), bot[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), bot[1]);
      _mm_store_si128((__m128i *)(dst + 2 * 8), bot[2]);
      _mm_store_si128((__m128i *)(dst + 3 * 8), bot[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    // Catch-all path: it always copies 128 samples per row, so any other
    // width reaching here would overrun both buffers.
    assert(w == 128);
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}
192