/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* This implementation of poly1305 is by Andrew Moon
 * (https://github.com/floodyberry/poly1305-donna) and released as public
 * domain. It implements SIMD vectorization based on the algorithm described in
 * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
 * block size. */

#include <openssl/poly1305.h>

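/* Illustrative usage of the one-shot MAC interface defined in this file and
 * declared in <openssl/poly1305.h> (sketch only; |key| must be a one-time
 * key):
 *
 *   poly1305_state st;
 *   uint8_t mac[16];
 *   CRYPTO_poly1305_init(&st, key);
 *   CRYPTO_poly1305_update(&st, msg, msg_len);
 *   CRYPTO_poly1305_finish(&st, mac);
 */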

#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define ALIGN(x) __attribute__((aligned(x)))
/* inline is not a keyword in C89. */
#define INLINE
#define U8TO64_LE(m) (*(uint64_t *)(m))
#define U8TO32_LE(m) (*(uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;
typedef unsigned __int128 uint128_t;
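
/* The load/store macros above and the uint128_t type assume a little-endian
 * x86-64 target that tolerates unaligned accesses and a GCC/Clang-style
 * compiler providing the __int128 extension, matching the guard on this
 * file. */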

static const uint32_t ALIGN(16) poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_1shl128[4] = {(1 << 24), 0,
                                                                (1 << 24), 0};

static uint128_t INLINE add128(uint128_t a, uint128_t b) { return a + b; }

static uint128_t INLINE add128_64(uint128_t a, uint64_t b) { return a + b; }

static uint128_t INLINE mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static uint64_t INLINE lo128(uint128_t a) { return (uint64_t)a; }

static uint64_t INLINE shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static uint64_t INLINE shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5]; /* 80 bytes */
    uint64_t HH[10];
  };
  /* uint64_t r0,r1,r2; [24 bytes] */
  /* uint64_t pad0,pad1; [16 bytes] */
  uint64_t started;   /* 8 bytes */
  uint64_t leftover;  /* 8 bytes */
  uint8_t buffer[64]; /* 64 bytes */
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

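/* The public poly1305_state is an opaque buffer large enough to hold the
 * internal state at 64-byte alignment; round the pointer up to the next
 * 64-byte boundary. */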
static poly1305_state_internal INLINE *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

/* copy 0-63 bytes */
static void INLINE
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
  size_t offset = src - dst;
  if (bytes & 32) {
    _mm_storeu_si128((xmmi *)(dst + 0),
                     _mm_loadu_si128((xmmi *)(dst + offset + 0)));
    _mm_storeu_si128((xmmi *)(dst + 16),
                     _mm_loadu_si128((xmmi *)(dst + offset + 16)));
    dst += 32;
  }
  if (bytes & 16) {
    _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset)));
    dst += 16;
  }
  if (bytes & 8) {
    *(uint64_t *)dst = *(uint64_t *)(dst + offset);
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = *(uint32_t *)(dst + offset);
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
  }
}

/* zero 0-15 bytes */
static void INLINE poly1305_block_zero(uint8_t *dst, size_t bytes) {
  if (bytes & 8) {
    *(uint64_t *)dst = 0;
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = 0;
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = 0;
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = 0;
  }
}

static size_t INLINE poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

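  /* r is clamped as required by poly1305 and kept in three limbs of 44, 44
   * and 42 bits; the masks below both clear the clamped bits and repack
   * t0/t1 into that radix. */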
  /* clamp key */
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  /* store r in un-used space of st->P[1] */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  /* store pad */
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  /* H = 0 */
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  /* pull out stored info */
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

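  /* Square r twice in the 44/44/42-bit radix. After each squaring the result
   * is split into five 26-bit limbs, broadcast into both 64-bit lanes of an
   * SSE register, and stored: r^2 into st->P[1], then r^4 into st->P[0]. The
   * S2x values hold 5*R2x, which folds the 2^130 = 5 (mod p) reduction into
   * the multiplications. */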
  /* compute powers r^2,r^4 */
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  /* put saved info back */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

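  /* Load the first 32 bytes as two 16-byte blocks, one per 64-bit lane, split
   * each into five 26-bit limbs, and set the 2^128 padding bit for both
   * blocks (HIBIT places 1 << 24 in the top limb, i.e. bit 128). */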
  /* H = [Mx,My] */
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                          _mm_loadl_epi64((xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                          _mm_loadl_epi64((xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

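  /* Each iteration folds 64 bytes (four blocks) into the two parallel
   * accumulators held in H: H = H*[r^4,r^4] + M[0,1]*[r^2,r^2] + M[2,3],
   * where each 64-bit lane carries one of the two interleaved chains. */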
  while (bytes >= 64) {
    /* H *= [r^4,r^4] */
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My]*[r^2,r^2] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 32)),
                            _mm_loadl_epi64((xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 40)),
                            _mm_loadl_epi64((xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

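    /* Partial carry propagation over the five 26-bit limbs; the carry out of
     * the top limb wraps into limb 0 multiplied by 5, since 2^130 = 5 mod p. */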
    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  /* p = [r^2,r^2] */
  p = &st->P[1];

  if (bytes >= 32) {
    /* H *= [r^2,r^2] */
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

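  /* To collapse the two lanes back into one value, the second lane of each
   * power register (the d[2] slots below) is loaded with the 26-bit limbs of
   * r itself, so H *= [r^2,r] multiplies one chain by r^2 and the other by r;
   * summing the lanes afterwards gives the value a serial implementation
   * would hold. */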
  /* finalize, H *= [r^2,r] */
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  /* H *= [r^2,r] */
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  /* H = H[0]+H[1] */
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

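  /* Carry the summed 26-bit limbs and repack them into three 44/44/42-bit
   * words in st->HH for the scalar tail in CRYPTO_poly1305_finish. */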
  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & 0xfffffffffffull;
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & 0xfffffffffffull;
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & 0x3ffffffffffull;

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

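  /* Until 32 bytes have been seen, input is buffered; the first 32 bytes seed
   * the two SIMD lanes via poly1305_first_block. After that, whole 64-byte
   * chunks are processed and any remainder is buffered for the next call or
   * for finish. */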
  /* need at least 32 initial bytes to start the accelerated branch */
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      poly1305_block_copy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  /* handle leftover */
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    poly1305_block_copy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  /* process 64 byte blocks */
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  /* st->HH will either be 0 or have the combined result */
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

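  /* Scalar tail: process the remaining 16-byte blocks with 64-bit arithmetic
   * in the 44/44/42-bit radix, setting the 2^128 pad bit (1 << 40 in the top
   * limb) for full blocks. */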
poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

  /* final bytes */
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  poly1305_block_zero(m + leftover, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

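  /* Compute h + 5 - 2^130 (i.e. h - p) and select it over h in constant time
   * iff the subtraction did not borrow (top bit of g2 clear). */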
  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  /* pad */
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */