/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* This implementation of poly1305 is by Andrew Moon
 * (https://github.com/floodyberry/poly1305-donna) and released as public
 * domain. It implements SIMD vectorization based on the algorithm described in
 * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
 * block size */
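/* Sketch of the evaluation order used below: the message is split into
 * 16-byte blocks b0, b1, ... and the two 64-bit SIMD lanes accumulate the
 * even- and odd-indexed blocks respectively. poly1305_first_block seeds
 * H = [b0, b1], each 64-byte pass of poly1305_blocks computes
 * H = H*[r^4,r^4] + [b_i,b_{i+1}]*[r^2,r^2] + [b_{i+2},b_{i+3}], and
 * poly1305_combine multiplies the lanes by [r^2, r] and adds them, which
 * gives the same result as the serial Horner evaluation
 * b0*r^n + b1*r^(n-1) + ... + b_{n-1}*r. */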

#include <openssl/poly1305.h>

#include "../internal.h"


#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define U8TO64_LE(m) (*(const uint64_t *)(m))
#define U8TO32_LE(m) (*(const uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v
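/* These assume a little-endian target that tolerates unaligned loads and
 * stores; both hold for x86-64, the only target this file is built for. */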

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};
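/* poly1305_x64_sse2_message_mask keeps the low 26 bits of each 64-bit lane,
 * poly1305_x64_sse2_5 is used to fold carries out of the top limb back into
 * the bottom one (2^130 = 5 mod 2^130 - 5), and poly1305_x64_sse2_1shl128 is
 * 2^24 at limb weight 2^104, i.e. the 2^128 pad bit set on every full
 * 16-byte block. */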

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5]; /*  80 bytes  */
    uint64_t HH[10];
  };
  /* uint64_t r0,r1,r2;       [24 bytes] */
  /* uint64_t pad0,pad1;      [16 bytes] */
  uint64_t started;        /*   8 bytes  */
  uint64_t leftover;       /*   8 bytes  */
  uint8_t buffer[64];      /*  64 bytes  */
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

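/* Round the caller's opaque poly1305_state buffer up to the next 64-byte
 * boundary; this is what consumes the 63 alignment bytes accounted for in
 * the size comment above. */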
static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

/* copy 0-63 bytes */
static inline void
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
  size_t offset = src - dst;
  if (bytes & 32) {
    _mm_storeu_si128((xmmi *)(dst + 0),
                     _mm_loadu_si128((const xmmi *)(dst + offset + 0)));
    _mm_storeu_si128((xmmi *)(dst + 16),
                     _mm_loadu_si128((const xmmi *)(dst + offset + 16)));
    dst += 32;
  }
  if (bytes & 16) {
    _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((const xmmi *)(dst + offset)));
    dst += 16;
  }
  if (bytes & 8) {
    *(uint64_t *)dst = *(const uint64_t *)(dst + offset);
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = *(const uint32_t *)(dst + offset);
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
  }
}

/* zero 0-15 bytes */
static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) {
  if (bytes & 8) {
    *(uint64_t *)dst = 0;
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = 0;
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = 0;
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = 0;
  }
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  /* clamp key */
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;
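  /* The masks above are the standard Poly1305 clamp
   * 0x0ffffffc0ffffffc0ffffffc0fffffff applied to r, expressed in the
   * 44+44+42-bit limbs (r0, r1, r2) used by the scalar code. */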

  /* store r in un-used space of st->P[1] */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  /* store pad */
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  /* H = 0 */
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  /* pull out stored info */
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  /* compute powers r^2,r^4 */
  r20 = r0;
  r21 = r1;
  r22 = r2;
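  /* Each iteration squares (r20, r21, r22), storing the radix-2^26 form of
   * r^2 into st->P[1] and then r^4 into st->P[0]. s22 = 20*r22 pre-folds the
   * product terms of weight 2^132 and above, since the limbs sit at weights
   * 2^0, 2^44 and 2^88 and 2^132 = 4*2^130 = 20 (mod 2^130 - 5). */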
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  /* put saved info back */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  /* H = [Mx,My] */
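  /* Split two 16-byte blocks into five 26-bit limbs each, one block per
   * 64-bit lane, and set the 2^128 pad bit of each block via HIBIT. */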
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    /* H *= [r^4,r^4] */
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My]*[r^2,r^2] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  /* p = [r^2,r^2] */
  p = &st->P[1];

  if (bytes >= 32) {
    /* H *= [r^2,r^2] */
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  /* finalize, H *= [r^2,r] */
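  /* Element [2] of each power vector is the multiplier for lane 1. Writing r
   * (rather than r^2) there makes the multiply below scale lane 0 by r^2 and
   * lane 1 by r, so that summing the two lanes afterwards yields the serial
   * Poly1305 accumulator. */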
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  /* H *= [r^2,r] */
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  /* H = H[0]+H[1] */
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  /* need at least 32 initial bytes to start the accelerated branch */
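  /* (32 bytes is the pair of 16-byte blocks poly1305_first_block needs to
   * seed the two SIMD lanes.) */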
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      poly1305_block_copy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  /* handle leftover */
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    poly1305_block_copy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  /* process 64 byte blocks */
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  /* st->HH will either be 0 or have the combined result */
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

/* final bytes */
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  poly1305_block_zero(m + leftover, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

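  /* Compute g = h - (2^130 - 5) by adding 5 and subtracting 2^130 from the
   * top limb, then use the borrow (the sign bit of g2) to build a mask that
   * selects h when h < 2^130 - 5 and g otherwise, without branching. */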
  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  /* pad */
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += t1 + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */