/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* This implementation of poly1305 is by Andrew Moon
 * (https://github.com/floodyberry/poly1305-donna) and released as public
 * domain. It implements SIMD vectorization based on the algorithm described in
 * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
 * block size */

#include <openssl/poly1305.h>

#include "../internal.h"


#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define U8TO64_LE(m) (*(const uint64_t *)(m))
#define U8TO32_LE(m) (*(const uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;

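/* Constants for the vector path, which works on five 26-bit limbs per lane:
 * poly1305_x64_sse2_message_mask keeps the low 26 bits of each 64-bit lane,
 * poly1305_x64_sse2_5 is used to fold carries out of the top limb back into
 * the bottom one (since 2^130 = 5 mod 2^130 - 5), and
 * poly1305_x64_sse2_1shl128 sets the 2^128 padding bit of a full 16-byte
 * block, which falls at bit 24 of the fifth limb. */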
static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

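/* Helpers for 128-bit intermediate arithmetic. The scalar portions of the
 * code (computing the powers of r and the final reduction) work in radix
 * 2^44, with limbs of 44, 44 and 42 bits, and rely on these for the wide
 * products and shifts. */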
static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

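/* One precomputed power of r. R20..R24 hold its five 26-bit limbs, one limb
 * per 64-bit lane, and S21..S24 hold the upper limbs premultiplied by 5 so
 * the reduction mod 2^130 - 5 can be folded into the multiply. The u/d views
 * let the otherwise unused 32-bit halves double as scratch storage. */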
typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

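/* Internal state: P[0] ends up holding [r^4,r^4] and P[1] [r^2,r^2], with the
 * original r and the key's pad stashed in the unused halves of P[1]. H is the
 * running accumulator, five 26-bit limbs per lane, and buffer/leftover hold
 * partial input until a full chunk is available. */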
typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5]; /* 80 bytes */
    uint64_t HH[10];
  };
  /* uint64_t r0,r1,r2; [24 bytes] */
  /* uint64_t pad0,pad1; [16 bytes] */
  uint64_t started;   /* 8 bytes */
  uint64_t leftover;  /* 8 bytes */
  uint8_t buffer[64]; /* 64 bytes */
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

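/* The caller-provided poly1305_state is treated as raw bytes; round the
 * pointer up to the next 64-byte boundary so the vector fields are suitably
 * aligned. */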
static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

/* copy 0-63 bytes */
static inline void
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
  size_t offset = src - dst;
  if (bytes & 32) {
    _mm_storeu_si128((xmmi *)(dst + 0),
                     _mm_loadu_si128((const xmmi *)(dst + offset + 0)));
    _mm_storeu_si128((xmmi *)(dst + 16),
                     _mm_loadu_si128((const xmmi *)(dst + offset + 16)));
    dst += 32;
  }
  if (bytes & 16) {
    _mm_storeu_si128((xmmi *)dst,
                     _mm_loadu_si128((const xmmi *)(dst + offset)));
    dst += 16;
  }
  if (bytes & 8) {
    *(uint64_t *)dst = *(const uint64_t *)(dst + offset);
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = *(const uint32_t *)(dst + offset);
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
  }
}

/* zero 0-15 bytes */
static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) {
  if (bytes & 8) {
    *(uint64_t *)dst = 0;
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = 0;
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = 0;
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = 0;
  }
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

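/* Key setup only: clamp r into three radix-2^44 limbs and stash r and the
 * 16-byte pad in the unused halves of st->P[1]. The vector powers of r are
 * not computed until the first block of input arrives. */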
void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  /* clamp key */
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  /* store r in un-used space of st->P[1] */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  /* store pad */
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  /* H = 0 */
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

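/* Called once at least 32 bytes are available: square r twice to get
 * [r^2,r^2] (stored in P[1]) and [r^4,r^4] (stored in P[0]) in 26-bit limb
 * form, put the stashed r and pad back, and load the first two 16-byte
 * blocks of m into H. */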
static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  /* pull out stored info */
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  /* compute powers r^2,r^4 */
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  /* put saved info back */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  /* H = [Mx,My] */
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

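/* The main loop: each iteration absorbs 64 bytes of message as
 * H = H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My'], followed by a partial
 * carry reduction. The outstanding factors of r^2 and r on the two lanes are
 * applied later, in poly1305_combine. */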
static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    /* H *= [r^4,r^4] */
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My]*[r^2,r^2] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

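/* Collapse the vector state: optionally absorb one final 32-byte chunk with
 * [r^2,r^2], multiply the lanes by [r^2,r] (r's limbs are written into the
 * d[2] slots of P[1]), add the two lanes and carry the result into the three
 * radix-2^44 limbs HH[0..2] for the scalar finish. Returns how many of the
 * buffered bytes were consumed here (0 or 32). */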
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  /* p = [r^2,r^2] */
  p = &st->P[1];

  if (bytes >= 32) {
    /* H *= [r^2,r^2] */
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  /* finalize, H *= [r^2,r] */
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  /* H *= [r^2,r] */
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  /* H = H[0]+H[1] */
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

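/* Streaming update: buffer input until 32 bytes are available to prime the
 * vector path, then keep a 64-byte buffer so poly1305_blocks only ever sees
 * whole 64-byte chunks. */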
void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  /* need at least 32 initial bytes to start the accelerated branch */
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      poly1305_block_copy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  /* handle leftover */
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    poly1305_block_copy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  /* process 64 byte blocks */
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

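/* Finalization: fold the vector state via poly1305_combine, absorb any
 * remaining 16-byte blocks and the padded final partial block in radix 2^44,
 * perform the full constant-time reduction mod 2^130 - 5, add the key's pad
 * and write the 16-byte tag. */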
void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  /* st->HH will either be 0 or have the combined result */
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

/* final bytes */
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  poly1305_block_zero(m + leftover, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  /* pad */
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */