1 /* chunkset_neon.c -- NEON inline functions to copy small data chunks.
2  * For conditions of distribution and use, see copyright notice in zlib.h
3  */
4 
5 #ifdef ARM_NEON_CHUNKSET
#ifdef _M_ARM64
#  include <arm64_neon.h>
#else
#  include <arm_neon.h>
#endif
#include <string.h>
#include "../../zbuild.h"
#include "../../zutil.h"
13 
/* Unit of work for the chunk helpers in this file: one NEON vector of
 * sixteen unsigned bytes. */
typedef uint8x16_t chunk_t;

/* Feature flags naming the pattern widths for which this file provides a
 * chunkmemset_N helper (1, 2, 3, 4 and 8 bytes).
 * NOTE(review): presumably tested by chunkset_tpl.h, which is included at
 * the end of this file -- confirm against the template. */
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_3
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
21 
/* Broadcast the single byte at FROM into all sixteen lanes of *CHUNK.
 *
 * from:  source of the one byte to replicate.
 * chunk: receives the replicated vector.
 */
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
    const uint8_t fill = from[0];

    *chunk = vdupq_n_u8(fill);
}
25 
/* Replicate the 2-byte pattern at FROM across the whole of *CHUNK.
 *
 * from:  start of the two pattern bytes.
 * chunk: receives the pattern repeated eight times.
 *
 * The pattern is fetched with memcpy rather than by dereferencing a cast
 * pointer: FROM is a byte pointer, so reading it through a 16-bit lvalue
 * would break the effective-type (strict aliasing) rules and would assume
 * an alignment that nothing here guarantees.
 */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(pattern));
}
29 
/* Replicate the 4-byte pattern at FROM across the whole of *CHUNK.
 *
 * from:  start of the four pattern bytes.
 * chunk: receives the pattern repeated four times.
 *
 * The pattern is fetched with memcpy rather than by dereferencing a cast
 * pointer: FROM is a byte pointer, so reading it through a 32-bit lvalue
 * would break the effective-type (strict aliasing) rules and would assume
 * an alignment that nothing here guarantees.
 */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(pattern));
}
33 
/* Replicate the 8-byte pattern at FROM into both halves of *CHUNK.
 *
 * from:  start of the eight pattern bytes.
 * chunk: receives the pattern twice, back to back.
 */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    const uint8x8_t half = vld1_u8(from);

    *chunk = vcombine_u8(half, half);
}
37 
/* Map the generic chunk routine names onto NEON-suffixed symbols.
 * NOTE(review): presumably chunkset_tpl.h, included at the end of this file,
 * defines its functions under these names -- confirm against the template. */
#define CHUNKSIZE        chunksize_neon
#define CHUNKCOPY        chunkcopy_neon
#define CHUNKCOPY_SAFE   chunkcopy_safe_neon
#define CHUNKUNROLL      chunkunroll_neon
#define CHUNKMEMSET      chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon

/* Forward declarations: the multi-store helpers below call these two
 * routines before any definition of them is visible in this file. */
uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len);
uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len);
47 
chunkmemset_3(uint8_t * out,uint8_t * from,unsigned dist,unsigned len)48 static inline uint8_t *chunkmemset_3(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
49     uint8x8x3_t chunks;
50     unsigned sz = sizeof(chunks);
51     if (len < sz) {
52         out = CHUNKUNROLL(out, &dist, &len);
53         return CHUNKCOPY(out, out - dist, len);
54     }
55 
56     /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
57        chunks[0] = {a,a,a,a,a,a,a,a}
58        chunks[1] = {b,b,b,b,b,b,b,b}
59        chunks[2] = {c,c,c,c,c,c,c,c}. */
60     chunks = vld3_dup_u8(from);
61 
62     unsigned rem = len % sz;
63     len -= rem;
64     while (len) {
65         /* Store "a,b,c, ..., a,b,c". */
66         vst3_u8(out, chunks);
67         out += sz;
68         len -= sz;
69     }
70 
71     if (!rem)
72         return out;
73 
74     /* Last, deal with the case when LEN is not a multiple of SZ. */
75     out = CHUNKUNROLL(out, &dist, &rem);
76     return CHUNKCOPY(out, out - dist, rem);
77 }
78 
#if defined(__aarch64__) || defined(_M_ARM64)

#define HAVE_CHUNKMEMSET_6

/* Fill LEN bytes at OUT with the 6-byte pattern found at FROM.
 *
 * out:  destination of the repeated pattern.
 * from: start of the six pattern bytes, treated as three 16-bit elements
 *       'ab,cd,ef'.
 * dist: forwarded by address to CHUNKUNROLL and then used to locate the
 *       source for CHUNKCOPY (out - dist).
 * len:  number of bytes to produce.
 *
 * Whole 48-byte groups (the size of a uint16x8x3_t) are written with
 * interleaving stores; anything left over, or the entire request when it is
 * shorter than one group, is handed to CHUNKUNROLL followed by CHUNKCOPY.
 *
 * Returns OUT advanced past the vector stores when LEN is a multiple of the
 * group size, otherwise the value returned by CHUNKCOPY.
 */
static inline uint8_t *chunkmemset_6(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
    const unsigned stride = (unsigned)sizeof(uint16x8x3_t);
    unsigned tail = len;

    if (len >= stride) {
        /* Broadcast each 16-bit element across its own 8-lane vector:
           val[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
           val[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
           val[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
        const uint16x8x3_t pattern = vld3q_dup_u16((unsigned short *)from);
        unsigned groups = len / stride;

        tail = len % stride;
        for (; groups != 0; --groups) {
            /* The interleaving store emits "ab,cd,ef, ..., ab,cd,ef". */
            vst3q_u16((unsigned short *)out, pattern);
            out += stride;
        }

        if (tail == 0) {
            return out;
        }
    }

    /* Short request, or the remainder that is not a whole group. */
    out = CHUNKUNROLL(out, &dist, &tail);
    return CHUNKCOPY(out, out - dist, tail);
}

#endif
115 
/* Read one full chunk (sixteen bytes) starting at S into *CHUNK.
 *
 * s:     source bytes.
 * chunk: receives the loaded vector.
 */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const chunk_t loaded = vld1q_u8(s);

    *chunk = loaded;
}
119 
/* Write the sixteen bytes held in *CHUNK to memory starting at OUT.
 *
 * out:   destination bytes.
 * chunk: vector to store; it is only read.
 */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const chunk_t value = *chunk;

    vst1q_u8(out, value);
}
123 
124 #include "chunkset_tpl.h"
125 
126 #endif
127