/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_

#include <stdio.h>
#include <stdlib.h>

#include "config/aom_config.h"

#include "aom_dsp/simd/v64_intrinsics_c.h"

typedef union {
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
  uint64_t u64[2];
  int8_t s8[16];
  int16_t s16[8];
  int32_t s32[4];
  int64_t s64[2];
  c_v64 v64[2];
} c_v128;

SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }

SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }

SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
  c_v128 t;
  t.u64[1] = hi;
  t.u64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
  c_v128 t;
  t.v64[1] = hi;
  t.v64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  c_v128 t;
  t.u32[3] = a;
  t.u32[2] = b;
  t.u32[1] = c;
  t.u32[0] = d;
  return t;
}
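
/* Usage sketch (illustrative only, not part of the original header): the
 * constructors place their first argument in the highest-numbered lanes.
 *
 *   c_v128 x = c_v128_from_32(3u, 2u, 1u, 0u);
 *   // x.u32[3] == 3 and x.u32[0] == 0, so c_v128_low_u32(x) == 0, and
 *   // c_v128_high_v64(x) holds the lanes with values 3 and 2.
 */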

SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
  c_v128 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 16; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
    abort();
  }
  return c_v128_load_unaligned(p);
}

SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 16; c++) pp[c] = q[c];
}

SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
    abort();
  }
  c_v128_store_unaligned(p, a);
}

SIMD_INLINE c_v128 c_v128_zero() {
  c_v128 t;
  t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
  c_v128 t;
  t.u64[1] = t.u64[0] = x;
  return t;
}

SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
         c_v64_dotp_su8(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
  return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
         c_v64_dotp_s16(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
  // 32 bit products, 64 bit sum
  return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
         (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
         (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
         (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
}

SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
  return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}

typedef uint32_t c_sad128_internal;

SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }

/* Implementation dependent return value.  Result must be finalised with
   v128_sad_u8_sum().
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++)
    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
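
/* Usage sketch (illustrative only; src, ref and stride are hypothetical names
 * not defined by this header): the accumulator is opaque, so initialise it
 * with c_v128_sad_u8_init(), feed it with c_v128_sad_u8(), and finalise with
 * c_v128_sad_u8_sum().
 *
 *   c_sad128_internal acc = c_v128_sad_u8_init();
 *   for (int row = 0; row < 16; row++)  // <= 32 calls, per the note above
 *     acc = c_v128_sad_u8(acc, c_v128_load_unaligned(src + row * stride),
 *                         c_v128_load_unaligned(ref + row * stride));
 *   uint32_t sad = c_v128_sad_u8_sum(acc);
 */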

typedef uint32_t c_ssd128_internal;

SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }

/* Implementation dependent return value.  Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }

SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
                         c_v64_or(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
                         c_v64_xor(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
                         c_v64_and(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
                         c_v64_andn(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
                         c_v64_add_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
                         c_v64_add_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                         c_v64_sadd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
                         c_v64_add_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
  // Two's complement overflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
                                   : a.v64[1].u64 + b.v64[1].u64,
      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
                                   : a.v64[0].u64 + b.v64[0].u64);
}

SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
  c_v128 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  return t;
}

SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
  c_v128 t;
  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
  return t;
}

SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                         c_v64_sub_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
                         c_v64_ssub_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
                         c_v64_ssub_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
                         c_v64_sub_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
                         c_v64_ssub_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
                         c_v64_ssub_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
                         c_v64_sub_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
  // Two's complement underflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
                                  : a.v64[1].u64 - b.v64[1].u64,
      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
                                  : a.v64[0].u64 - b.v64[0].u64);
}

SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
  c_v64 lo_bits = c_v64_mullo_s16(a, b);
  c_v64 hi_bits = c_v64_mulhi_s16(a, b);
  return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
                         c_v64_ziplo_16(hi_bits, lo_bits));
}
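
/* Illustrative note (not part of the original header): c_v128_mul_s16() is a
 * widening multiply. Interleaving the low and high halves of the products
 * reassembles each full 32-bit result in lane order, so (assuming the usual
 * little-endian lane numbering used throughout this file):
 *
 *   c_v128 p = c_v128_mul_s16(c_v64_dup_16(300), c_v64_dup_16(300));
 *   // every p.s32[i] == 90000, a product that does not fit in 16 bits
 */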

SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
                         c_v64_mullo_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
                         c_v64_mulhi_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
                         c_v64_mullo_s32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
                         c_v64_madd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
                         c_v64_madd_us8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
                         c_v64_avg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                         c_v64_avg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
                         c_v64_min_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
                         c_v64_max_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
                         c_v64_min_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
         ((a.s8[0] < 0) << 0);
}

SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
  c_v128 t;
  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
  return t;
}

SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                         c_v64_max_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
                         c_v64_min_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
                         c_v64_max_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                         c_v64_ziplo_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
                         c_v64_ziplo_8(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
                         c_v64_ziplo_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
                         c_v64_ziplo_16(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
                         c_v64_ziplo_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
                         c_v64_ziplo_32(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[0], b.v64[0]);
}

SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[1], b.v64[1]);
}

SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
}

SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u8[15] = b.u8[15];
    t.u8[14] = b.u8[13];
    t.u8[13] = b.u8[11];
    t.u8[12] = b.u8[9];
    t.u8[11] = b.u8[7];
    t.u8[10] = b.u8[5];
    t.u8[9] = b.u8[3];
    t.u8[8] = b.u8[1];
    t.u8[7] = a.u8[15];
    t.u8[6] = a.u8[13];
    t.u8[5] = a.u8[11];
    t.u8[4] = a.u8[9];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[15] = a.u8[14];
    t.u8[14] = a.u8[12];
    t.u8[13] = a.u8[10];
    t.u8[12] = a.u8[8];
    t.u8[11] = a.u8[6];
    t.u8[10] = a.u8[4];
    t.u8[9] = a.u8[2];
    t.u8[8] = a.u8[0];
    t.u8[7] = b.u8[14];
    t.u8[6] = b.u8[12];
    t.u8[5] = b.u8[10];
    t.u8[4] = b.u8[8];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
                           : _c_v128_unzip_8(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
                           : _c_v128_unzip_8(b, a, 1);
}
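
/* Illustrative note (not part of the original header): unzip is the inverse of
 * zip, i.e. a deinterleave. For bytes, c_v128_unziplo_8(a, b) gathers the
 * even-indexed bytes of the concatenation a:b and c_v128_unziphi_8(a, b) the
 * odd-indexed ones (the CONFIG_BIG_ENDIAN branch keeps the same logical
 * behaviour on big-endian builds). For example, on a little-endian build:
 *
 *   c_v128 ab = c_v128_zip_8(c_v64_dup_8(1), c_v64_dup_8(2));  // 2,1,2,1,...
 *   // every byte of c_v128_unziplo_8(ab, ab) == 2
 *   // every byte of c_v128_unziphi_8(ab, ab) == 1
 */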

SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u16[7] = b.u16[7];
    t.u16[6] = b.u16[5];
    t.u16[5] = b.u16[3];
    t.u16[4] = b.u16[1];
    t.u16[3] = a.u16[7];
    t.u16[2] = a.u16[5];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[7] = a.u16[6];
    t.u16[6] = a.u16[4];
    t.u16[5] = a.u16[2];
    t.u16[4] = a.u16[0];
    t.u16[3] = b.u16[6];
    t.u16[2] = b.u16[4];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
                           : _c_v128_unzip_16(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
                           : _c_v128_unzip_16(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u32[3] = b.u32[3];
    t.u32[2] = b.u32[1];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[3] = a.u32[2];
    t.u32[2] = a.u32[0];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
                           : _c_v128_unzip_32(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
                           : _c_v128_unzip_32(b, a, 1);
}

SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
                         c_v64_unpacklo_u8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
                         c_v64_unpacklo_u8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
                         c_v64_unpacklo_s8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
                         c_v64_unpacklo_s8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
                         c_v64_unpacklo_u16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
                         c_v64_unpacklo_s16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
                         c_v64_unpacklo_u16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
                         c_v64_unpacklo_s16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
  c_v128 t;
  int c;
  for (c = 0; c < 16; c++)
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                     : pattern.u8[c] & 15];

  return t;
}
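
/* Illustrative note (not part of the original header): each byte of `pattern`
 * selects a source byte of `a` by lane index (only the low 4 bits are used;
 * the CONFIG_BIG_ENDIAN branch remaps indices on big-endian builds). For
 * example, an all-zero pattern broadcasts lane 0:
 *
 *   c_v128 broadcast = c_v128_shuffle_8(a, c_v128_zero());
 *   // on a little-endian build, every byte of `broadcast` == a.u8[0]
 */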

SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
  if (n < 8)
    return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                    c_v64_shr_n_byte(a.v64[0], 8 - n)),
                           c_v64_shl_n_byte(a.v64[0], n));
  else
    return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
}

SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
  if (n < 8)
    return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                           c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
                                    c_v64_shl_n_byte(a.v64[1], 8 - n)));
  else
    return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
}

SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
  if (SIMD_CHECK && c > 15) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
           : b;
}
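
/* Illustrative note (not part of the original header): c_v128_align(a, b, c)
 * is the usual SIMD alignment/extract operation. It returns 16 consecutive
 * bytes of the 32-byte concatenation a:b (a in the high half), starting c
 * bytes into b: c == 0 simply returns b, while c == 1 drops the lowest byte
 * of b and pulls in the lowest byte of a at the top.
 */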

SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                         c_v64_shr_u16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                         c_v64_shr_s16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                         c_v64_shr_u32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                         c_v64_shr_s32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 <<= c;
  a.v64[0].u64 <<= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 >>= c;
  a.v64[0].u64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
  a.v64[1].s64 >>= c;
  a.v64[0].s64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
  return c_v128_shl_8(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
  return c_v128_shl_16(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
  return c_v128_shl_32(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
  return c_v128_shl_64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
  return c_v128_shr_u8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
  return c_v128_shr_u16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
  return c_v128_shr_u32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
  return c_v128_shr_u64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
  return c_v128_shr_s8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
  return c_v128_shr_s16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
  return c_v128_shr_s32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
  return c_v128_shr_s64(a, n);
}

typedef uint32_t c_sad128_internal_u16;

SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }

/* Implementation dependent return value.  Result must be finalised with
 * v128_sad_u16_sum(). */
SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }

typedef uint64_t c_ssd128_internal_s16;

SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }

/* Implementation dependent return value.  Result must be finalised with
 * v128_ssd_s16_sum(). */
SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
  return s;
}

SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
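
/* Usage sketch (illustrative only; x and y are hypothetical 16-bit-lane
 * vectors, not defined by this header): the 16-bit variants follow the same
 * init/accumulate/sum protocol as the 8-bit ones, but note that this SSD
 * accumulator is 64 bits wide, unlike the 32-bit c_ssd128_internal above.
 *
 *   c_ssd128_internal_s16 acc = c_v128_ssd_s16_init();
 *   acc = c_v128_ssd_s16(acc, x, y);
 *   uint64_t ssd = c_v128_ssd_s16_sum(acc);
 */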

#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_