1 /* Copyright (c) 2014, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 // This implementation of poly1305 is by Andrew Moon
16 // (https://github.com/floodyberry/poly1305-donna) and released as public
17 // domain. It implements SIMD vectorization based on the algorithm described in
18 // http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
19 // block size
20 
21 #include <openssl/poly1305.h>
22 
23 #include "../internal.h"
24 
25 
26 #if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)
27 
28 #include <emmintrin.h>
29 
// U8TO64_LE returns the 64-bit value stored little-endian in the eight bytes
// at |m|. The bytes are read one at a time, so |m| needs no particular
// alignment and nothing is accessed through an incompatible pointer type.
static inline uint64_t U8TO64_LE(const uint8_t *m) {
  uint64_t v = 0;
  for (int i = 7; i >= 0; i--) {
    v = (v << 8) | m[i];
  }
  return v;
}

// U8TO32_LE returns the 32-bit value stored little-endian in the four bytes
// at |m|. |m| may be unaligned.
static inline uint32_t U8TO32_LE(const uint8_t *m) {
  return (uint32_t)m[0] | ((uint32_t)m[1] << 8) | ((uint32_t)m[2] << 16) |
         ((uint32_t)m[3] << 24);
}

// U64TO8_LE writes |v| little-endian to the eight bytes at |m|. |m| may be
// unaligned.
static inline void U64TO8_LE(uint8_t *m, uint64_t v) {
  for (int i = 0; i < 8; i++) {
    m[i] = (uint8_t)(v >> (8 * i));
  }
}
33 
// xmmi is the 128-bit SSE2 integer vector type used throughout this file.
typedef __m128i xmmi;

// The tables below are 16-byte aligned so they can be fetched with
// _mm_load_si128. Each one holds the same 32-bit value in the low half of
// both 64-bit lanes and zero in the high halves.

// Mask selecting the low 26 bits of each 64-bit lane.
static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    0x3ffffff, 0, 0x3ffffff, 0};
// The multiplier 5 in each 64-bit lane.
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
// Bit 24 of each lane. It is OR-ed into the top 26-bit limb of every message
// block, which starts at bit 104, so it lands on bit 128 of the block.
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    0x1000000, 0, 0x1000000, 0};
41 
add128(uint128_t a,uint128_t b)42 static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }
43 
add128_64(uint128_t a,uint64_t b)44 static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }
45 
mul64x64_128(uint64_t a,uint64_t b)46 static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
47   return (uint128_t)a * b;
48 }
49 
// lo128 returns the low 64 bits of |a|.
static inline uint64_t lo128(uint128_t a) {
  const uint64_t low = (uint64_t)a;
  return low;
}
51 
// shr128 returns the low 64 bits of |v| shifted right by |shift| bits.
static inline uint64_t shr128(uint128_t v, const int shift) {
  const uint128_t shifted = v >> shift;
  return (uint64_t)shifted;
}
55 
shr128_pair(uint64_t hi,uint64_t lo,const int shift)56 static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
57   return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
58 }
59 
// poly1305_power holds one power of r in the form used by the two-lane SSE2
// multiply. R20..R24 are its five 26-bit limbs and S21..S24 are R21..R24
// multiplied by 5. poly1305_first_block writes the same limb to both 64-bit
// lanes. Only d[0] and d[2] (the low 32 bits of each lane) carry limb data;
// d[1] and d[3] of the R20..R24 members of P[1] are reused by this file to
// stash the clamped key and the pad.
typedef struct poly1305_power_t {
  union {
    xmmi v;         // the whole 128-bit vector
    uint64_t u[2];  // the two 64-bit lanes
    uint32_t d[4];  // the four 32-bit words
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;
67 
// poly1305_state_internal is the working state, located at the 64-byte
// aligned address that poly1305_aligned_state derives from the caller's
// |poly1305_state|.
typedef struct poly1305_state_internal_t {
  // P[0] holds r^4 and P[1] holds r^2 once poly1305_first_block has run.
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  // Accumulator. H[] is the two-lane, five-limb form used by the vector code;
  // HH[0..2] is the three-limb scalar form written by poly1305_combine and
  // read by CRYPTO_poly1305_finish.
  union {
    xmmi H[5];  //  80 bytes
    uint64_t HH[10];
  };
  // The following values have no fields of their own; they live in the unused
  // d[1]/d[3] words of P[1].R20..R24:
  // uint64_t r0,r1,r2;       [24 bytes]
  // uint64_t pad0,pad1;      [16 bytes]
  uint64_t started;        //   8 bytes; non-zero once the first 32-byte block
                           //   has been absorbed by poly1305_first_block
  uint64_t leftover;       //   8 bytes; number of bytes held in |buffer|
  uint8_t buffer[64];      //  64 bytes; input not yet processed
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */
82 
poly1305_aligned_state(poly1305_state * state)83 static inline poly1305_state_internal *poly1305_aligned_state(
84     poly1305_state *state) {
85   return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
86 }
87 
// poly1305_min returns the smaller of |a| and |b|.
static inline size_t poly1305_min(size_t a, size_t b) {
  if (b <= a) {
    return b;
  }
  return a;
}
91 
// CRYPTO_poly1305_init sets up |state| for a new computation keyed by the 32
// bytes at |key|. The first 16 key bytes are clamped into r and the last 16
// are kept as the pad; both are stored in the otherwise unused 32-bit words of
// st->P[1]. The accumulator and the buffering counters are cleared.
void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *const saved = &st->P[1];
  size_t limb;

  // Clamp r and split it into three limbs starting at bits 0, 44 and 88.
  const uint64_t key_lo = U8TO64_LE(key + 0);
  const uint64_t key_hi = U8TO64_LE(key + 8);
  const uint64_t r0 = key_lo & 0xffc0fffffff;
  const uint64_t r1 = ((key_lo >> 44) | (key_hi << 20)) & 0xfffffc0ffff;
  const uint64_t r2 = (key_hi >> 24) & 0x00ffffffc0f;

  // Each 64-bit limb of r is split across words d[1] (low half) and d[3]
  // (high half) of one member of st->P[1].
  saved->R20.d[1] = (uint32_t)(r0);
  saved->R20.d[3] = (uint32_t)(r0 >> 32);
  saved->R21.d[1] = (uint32_t)(r1);
  saved->R21.d[3] = (uint32_t)(r1 >> 32);
  saved->R22.d[1] = (uint32_t)(r2);
  saved->R22.d[3] = (uint32_t)(r2 >> 32);

  // The pad (key bytes 16..31) is kept the same way in R23 and R24.
  saved->R23.d[1] = U8TO32_LE(key + 16);
  saved->R23.d[3] = U8TO32_LE(key + 20);
  saved->R24.d[1] = U8TO32_LE(key + 24);
  saved->R24.d[3] = U8TO32_LE(key + 28);

  // Start with a zero accumulator in both lanes.
  for (limb = 0; limb < 5; limb++) {
    st->H[limb] = _mm_setzero_si128();
  }

  st->started = 0;
  st->leftover = 0;
}
133 
// poly1305_first_block starts the vectorized computation. It reads r and the
// pad from the spare words of st->P[1], computes r^2 into st->P[1] and r^4
// into st->P[0] (as five 26-bit limbs duplicated in both lanes, plus the
// limbs multiplied by 5), restores r and the pad, and finally loads the 32
// bytes at |m| into the accumulator st->H as two 16-byte blocks, one per lane.
// |m| must point to at least 32 readable bytes.
static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    // The first pass squares r and stores r^2 in P[1]; the second pass squares
    // that and stores r^4 in P[0]. The destination is selected by index
    // rather than by decrementing a pointer, so no pointer before the start
    // of |st->P| is ever formed.
    p = &st->P[1 - i];

    s22 = r22 * (5 << 2);

    // Square the three-limb value (limbs at bits 0, 44 and 88).
    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    // Carry between limbs; the carry out of the top limb re-enters the bottom
    // limb multiplied by 5.
    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    // Re-split into five 26-bit limbs, each copied into the low 32 bits of
    // both 64-bit lanes. These stores overwrite all four words of each
    // member, which is why r and the pad are written back after the loop.
    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
  }

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]: lane 0 takes bytes 0..15 and lane 1 takes bytes 16..31, each
  // split into five 26-bit limbs with bit 128 set in the top limb.
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}
226 
// poly1305_blocks absorbs message data in 64-byte chunks. For every full 64
// bytes at |m| it updates the two-lane accumulator as
//   H = H*[r^4,r^4] + [M0,M1]*[r^2,r^2] + [M2,M3]
// where M0..M3 are the four 16-byte blocks of the chunk, st->P[0] holds r^4
// and st->P[1] holds r^2. The accumulator is read from st->H on entry and
// written back on exit. Any trailing |bytes| % 64 bytes are not read.
static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    // H *= [r^4,r^4]
    // T0..T4 accumulate the five limbs of the product. Terms that would land
    // above the top limb use the S2x values (limb * 5) instead of R2x.
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    // Lane 0 takes bytes 0..15 and lane 1 takes bytes 16..31; each block is
    // split into five 26-bit limbs M0..M4 with HIBIT OR-ed into the top limb.
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    // Second half of the chunk: lane 0 takes bytes 32..47, lane 1 takes bytes
    // 48..63. These blocks are added without a multiplication.
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    // Two carry chains run interleaved (T0->T1->T2->T3->T4 and T3->T4->T0->T1).
    // The carry out of the top limb T4 is multiplied by 5 before it is added
    // to T0.
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}
416 
// poly1305_combine folds the two-lane accumulator st->H into a single value.
// If |bytes| >= 32 it first absorbs the 32 bytes at |m| as
// H = H*[r^2,r^2] + [M0,M1]. It then multiplies lane 0 by r^2 and lane 1 by r,
// adds the two lanes, carries the 26-bit limbs and stores the result in
// st->HH[0..2] as limbs of 44, 44 and 42 bits. It returns the number of bytes
// of |m| it consumed, which is either 0 or 32.
//
// Side effect: word d[2] of the R2x/S2x members of st->P[1] is overwritten
// with the limbs of r (and 5*r).
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    // Lane 0 takes bytes 0..15 of |m| and lane 1 takes bytes 16..31.
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
  // Recover r from the spare words of P[1] and write its 26-bit limbs into
  // word d[2], i.e. the multiplier used for lane 1. Lane 0 (word d[0]) keeps
  // r^2.
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  // Same interleaved carry propagation as after the multiplication above.
  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  // Shifting right by 8 bytes moves lane 1 onto lane 0, so the low lane of
  // each sum holds the combined limb.
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  // Scalar carry pass over the five 26-bit limbs; the carry out of the top
  // limb is multiplied by 5 and added to the bottom limb.
  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  // Repack the five 26-bit limbs as three limbs of 44, 44 and 42 bits.
  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}
659 
// CRYPTO_poly1305_update feeds |bytes| bytes at |m| into |state|. Input is
// buffered in st->buffer until there is enough to process: the first 32 bytes
// go to poly1305_first_block and everything after that is handled by
// poly1305_blocks in multiples of 64 bytes. Whatever is left over stays in the
// buffer for a later call or for CRYPTO_poly1305_finish.
void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);

  if (!st->started) {
    // The vector path cannot start until 32 bytes are available, and it only
    // starts once there is at least one more byte after them.
    if (st->leftover == 0 && bytes > 32) {
      // Nothing buffered: take the first block straight from the input.
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      const size_t take = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, take);
      m += take;
      bytes -= take;
      st->leftover += take;
      if (bytes == 0 || st->leftover < 32) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // Top up a partially filled buffer to 64 bytes before touching the bulk
  // input.
  if (st->leftover != 0) {
    const size_t take = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, take);
    m += take;
    bytes -= take;
    st->leftover += take;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // Process as many whole 64-byte chunks as possible directly from |m|.
  if (bytes >= 64) {
    const size_t whole = bytes - (bytes % 64);
    poly1305_blocks(st, m, whole);
    m += whole;
    bytes -= whole;
  }

  // Keep the tail (fewer than 64 bytes) for later.
  if (bytes != 0) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}
713 
// CRYPTO_poly1305_finish completes the computation and writes the 16-byte tag
// to |mac|. If the vector path was started, poly1305_combine first folds the
// two-lane accumulator (absorbing 32 buffered bytes when available) into the
// scalar limbs st->HH[0..2]. Any bytes still buffered are then processed 16 at
// a time with 64-bit scalar arithmetic, the result is fully reduced, the pad
// is added and the low 128 bits are stored little-endian.
void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  const uint64_t mask44 = 0xfffffffffff;
  const uint64_t mask42 = 0x3ffffffffff;
  uint8_t *in = st->buffer;
  size_t remaining = st->leftover;
  poly1305_power *saved;
  uint64_t h0, h1, h2;
  uint64_t r0, r1, r2, s1, s2;
  uint64_t g0, g1, g2;
  uint64_t pad_lo, pad_hi;
  uint64_t carry, take_g, keep_h;

  if (st->started) {
    const size_t used = poly1305_combine(st, in, remaining);
    in += used;
    remaining -= used;
  }

  // st->HH is either still zero (vector path never started) or holds the
  // combined accumulator.
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  // Recover r from the spare words of P[1].
  saved = &st->P[1];
  r0 = ((uint64_t)saved->R20.d[3] << 32) | (uint64_t)saved->R20.d[1];
  r1 = ((uint64_t)saved->R21.d[3] << 32) | (uint64_t)saved->R21.d[1];
  r2 = ((uint64_t)saved->R22.d[3] << 32) | (uint64_t)saved->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  // Absorb the buffered bytes one 16-byte block at a time. Full blocks get
  // bit 128 set via |top_bit|; a final short block is instead terminated with
  // a 1 byte and zero-padded to 16 bytes inside the buffer.
  while (remaining != 0) {
    uint64_t top_bit, lo, hi;
    uint128_t d0, d1, d2;

    if (remaining >= 16) {
      top_bit = (uint64_t)1 << 40;
    } else {
      in[remaining++] = 1;
      OPENSSL_memset(in + remaining, 0, 16 - remaining);
      remaining = 16;
      top_bit = 0;
    }

    // h += block, split into limbs at bits 0, 44 and 88.
    lo = U8TO64_LE(in + 0);
    hi = U8TO64_LE(in + 8);
    h0 += lo & mask44;
    h1 += shr128_pair(hi, lo, 44) & mask44;
    h2 += (hi >> 24) | top_bit;

    // h *= r, with a partial carry.
    d0 = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
    d1 = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
    d2 = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
    h0 = lo128(d0) & mask44;
    carry = shr128(d0, 44);
    d1 = add128_64(d1, carry);
    h1 = lo128(d1) & mask44;
    carry = shr128(d1, 44);
    d2 = add128_64(d2, carry);
    h2 = lo128(d2) & mask42;
    carry = shr128(d2, 42);
    h0 += carry * 5;

    in += 16;
    remaining -= 16;
  }

  // Carry h through all three limbs.
  carry = h0 >> 44;
  h0 &= mask44;
  h1 += carry;
  carry = h1 >> 44;
  h1 &= mask44;
  h2 += carry;
  carry = h2 >> 42;
  h2 &= mask42;
  h0 += carry * 5;

  // g = h + 5 - 2^130.
  g0 = h0 + 5;
  carry = g0 >> 44;
  g0 &= mask44;
  g1 = h1 + carry;
  carry = g1 >> 44;
  g1 &= mask44;
  g2 = h2 + carry - ((uint64_t)1 << 42);

  // Branch-free select: keep h if the subtraction borrowed (top bit of g2
  // set), otherwise take g.
  take_g = (g2 >> 63) - 1;
  keep_h = ~take_g;
  h0 = (h0 & keep_h) | (g0 & take_g);
  h1 = (h1 & keep_h) | (g1 & take_g);
  h2 = (h2 & keep_h) | (g2 & take_g);

  // Add the pad stored in the spare words of R23/R24.
  pad_lo = ((uint64_t)saved->R23.d[3] << 32) | (uint64_t)saved->R23.d[1];
  pad_hi = ((uint64_t)saved->R24.d[3] << 32) | (uint64_t)saved->R24.d[1];
  h0 += pad_lo & mask44;
  carry = h0 >> 44;
  h0 &= mask44;
  h1 += (shr128_pair(pad_hi, pad_lo, 44) & mask44) + carry;
  carry = h1 >> 44;
  h1 &= mask44;
  h2 += (pad_hi >> 24) + carry;

  // Repack the three limbs into two 64-bit words and emit the tag.
  U64TO8_LE(mac + 0, (h0 | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}
838 
839 #endif  // !OPENSSL_WINDOWS && OPENSSL_X86_64
840