/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 #include <emmintrin.h>
12 #include <assert.h>
13
14 #include "config/aom_dsp_rtcd.h"
15
16 #include "aom_dsp/aom_filter.h"
17
/*
 * Copies 64 consecutive 16-bit samples (eight 128-bit vectors) from src to
 * dst.
 *
 * src is read with unaligned loads, so it may have any alignment. dst is
 * written with aligned stores (_mm_store_si128) and must therefore be
 * 16-byte aligned. All eight vectors are loaded before the first store is
 * issued.
 */
static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
  __m128i s[8];
  /* Each __m128i holds 8 samples, hence the "k * 8" element offsets. */
  s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
  s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
  s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
  s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
  s[4] = _mm_loadu_si128((const __m128i *)(src + 4 * 8));
  s[5] = _mm_loadu_si128((const __m128i *)(src + 5 * 8));
  s[6] = _mm_loadu_si128((const __m128i *)(src + 6 * 8));
  s[7] = _mm_loadu_si128((const __m128i *)(src + 7 * 8));
  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
}
37
/*
 * Copies 128 consecutive 16-bit samples (sixteen 128-bit vectors) from src
 * to dst.
 *
 * src is read with unaligned loads, so it may have any alignment. dst is
 * written with aligned stores (_mm_store_si128) and must therefore be
 * 16-byte aligned. All sixteen vectors are loaded before the first store is
 * issued.
 */
static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
  __m128i s[16];
  /* Each __m128i holds 8 samples, hence the "k * 8" element offsets. */
  s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
  s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
  s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
  s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
  s[4] = _mm_loadu_si128((const __m128i *)(src + 4 * 8));
  s[5] = _mm_loadu_si128((const __m128i *)(src + 5 * 8));
  s[6] = _mm_loadu_si128((const __m128i *)(src + 6 * 8));
  s[7] = _mm_loadu_si128((const __m128i *)(src + 7 * 8));
  s[8] = _mm_loadu_si128((const __m128i *)(src + 8 * 8));
  s[9] = _mm_loadu_si128((const __m128i *)(src + 9 * 8));
  s[10] = _mm_loadu_si128((const __m128i *)(src + 10 * 8));
  s[11] = _mm_loadu_si128((const __m128i *)(src + 11 * 8));
  s[12] = _mm_loadu_si128((const __m128i *)(src + 12 * 8));
  s[13] = _mm_loadu_si128((const __m128i *)(src + 13 * 8));
  s[14] = _mm_loadu_si128((const __m128i *)(src + 14 * 8));
  s[15] = _mm_loadu_si128((const __m128i *)(src + 15 * 8));
  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
  _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
  _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
  _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
  _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
  _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
  _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
  _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
  _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
}
73
/*
 * Copies a w x h block of 16-bit samples from src to dst without filtering.
 *
 * src_stride and dst_stride are expressed in uint16_t elements (they are
 * added directly to the uint16_t pointers). filter_params_x,
 * filter_params_y, subpel_x_q4, subpel_y_q4, conv_params and bd are accepted
 * but not used by this function.
 *
 * Requirements imposed by the implementation:
 *  - w is one of 2, 4, 8, 16, 32, 64 or 128 (any width other than the first
 *    six is handled by the 128-wide path).
 *  - h is a positive even number: every loop copies two rows per iteration
 *    and executes at least once.
 *  - For w >= 8, rows are written with aligned 128-bit stores, so dst must
 *    be 16-byte aligned and dst_stride must keep every row 16-byte aligned.
 *  - src may have any alignment; it is only read with unaligned loads.
 */
void av1_highbd_convolve_2d_copy_sr_sse2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;
  (void)bd;

  /* An odd or non-positive h would never hit the "while (h)" exit below. */
  assert(h >= 2 && !(h & 1));
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  } else if (w == 8) {
    /* The 8-wide path also uses _mm_store_si128. One row is exactly 16
     * bytes, so a stride that is a multiple of 8 samples keeps rows aligned. */
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 8));
  }

  if (w == 2) {
    do {
      /* Copy exactly two samples per row. A 64-bit vector load would read
       * two samples past the end of the row, and storing through a
       * uint32_t pointer would type-pun (and possibly misalign) dst. */
      uint16_t p0 = src[0];
      uint16_t p1 = src[1];
      dst[0] = p0;
      dst[1] = p1;
      src += src_stride;
      dst += dst_stride;
      p0 = src[0];
      p1 = src[1];
      dst[0] = p0;
      dst[1] = p1;
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      /* Four samples are 8 bytes: one 64-bit load/store per row. */
      __m128i s[2];
      s[0] = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      /* Eight samples are 16 bytes: one full vector per row. */
      __m128i s[2];
      s[0] = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      /* Two vectors per row, two rows per iteration. */
      __m128i s[4];
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      src += src_stride;
      s[2] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[3] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      /* Four vectors per row, two rows per iteration. */
      __m128i s[8];
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
      s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
      src += src_stride;
      s[4] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[5] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      s[6] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
      s[7] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    /* Only the 128-wide case is expected to reach this branch; any other
     * width would still be copied as 128 samples per row. */
    assert(w == 128);
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}
192