1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build !amd64,!arm gccgo appengine
6
7package poly1305
8
9// Based on original, public domain implementation from NaCl by D. J.
10// Bernstein.
11
12import "math"
13
// Floating-point constants used by Sum, spelled out as exact expressions in
// powers of two (Go evaluates untyped constant expressions exactly).
//
// alphaN is 1.5 * 2^(52+N). Sum adds and then subtracts an alpha constant to
// round a float64 to a multiple of 2^N, and uses alpha0/32/64/96 as the bias
// when moving 32-bit words in and out of float64 bit patterns.
//
// scale is 5 * 2^-130; Sum multiplies by it when folding values back down
// (2^130 is congruent to 5 modulo 2^130 - 5).
//
// offsetK is the matching alpha constant plus a bias that Sum adds to each
// limb of the result before reading the limb back out of the float64 bits.
const (
	alpham80 = 1.5 / (1 << 28)
	alpham48 = 1.5 * (1 << 4)
	alpham16 = 1.5 * (1 << 36)
	alpha0   = 1.5 * (1 << 52)
	alpha18  = 1.5 * (1 << 70)
	alpha32  = 1.5 * (1 << 84)
	alpha50  = 1.5 * (1 << 102)
	alpha64  = 1.5 * (1 << 116)
	alpha82  = 1.5 * (1 << 134)
	alpha96  = 1.5 * (1 << 148)
	alpha112 = 1.5 * (1 << 164)
	alpha130 = 1.5 * (1 << 182)
	scale    = 5.0 / (1 << 130)
	offset0  = alpha0 + (1 << 33) - 5
	offset1  = alpha32 + (1 << 65) - (1 << 33)
	offset2  = alpha64 + (1 << 97) - (1 << 65)
	offset3  = alpha96 + (1 << 130) - (1 << 97)
)
33
34// Sum generates an authenticator for m using a one-time key and puts the
35// 16-byte result into out. Authenticating two different messages with the same
36// key allows an attacker to forge messages at will.
37func Sum(out *[16]byte, m []byte, key *[32]byte) {
38	r := key
39	s := key[16:]
40	var (
41		y7        float64
42		y6        float64
43		y1        float64
44		y0        float64
45		y5        float64
46		y4        float64
47		x7        float64
48		x6        float64
49		x1        float64
50		x0        float64
51		y3        float64
52		y2        float64
53		x5        float64
54		r3lowx0   float64
55		x4        float64
56		r0lowx6   float64
57		x3        float64
58		r3highx0  float64
59		x2        float64
60		r0highx6  float64
61		r0lowx0   float64
62		sr1lowx6  float64
63		r0highx0  float64
64		sr1highx6 float64
65		sr3low    float64
66		r1lowx0   float64
67		sr2lowx6  float64
68		r1highx0  float64
69		sr2highx6 float64
70		r2lowx0   float64
71		sr3lowx6  float64
72		r2highx0  float64
73		sr3highx6 float64
74		r1highx4  float64
75		r1lowx4   float64
76		r0highx4  float64
77		r0lowx4   float64
78		sr3highx4 float64
79		sr3lowx4  float64
80		sr2highx4 float64
81		sr2lowx4  float64
82		r0lowx2   float64
83		r0highx2  float64
84		r1lowx2   float64
85		r1highx2  float64
86		r2lowx2   float64
87		r2highx2  float64
88		sr3lowx2  float64
89		sr3highx2 float64
90		z0        float64
91		z1        float64
92		z2        float64
93		z3        float64
94		m0        int64
95		m1        int64
96		m2        int64
97		m3        int64
98		m00       uint32
99		m01       uint32
100		m02       uint32
101		m03       uint32
102		m10       uint32
103		m11       uint32
104		m12       uint32
105		m13       uint32
106		m20       uint32
107		m21       uint32
108		m22       uint32
109		m23       uint32
110		m30       uint32
111		m31       uint32
112		m32       uint32
113		m33       uint64
114		lbelow2   int32
115		lbelow3   int32
116		lbelow4   int32
117		lbelow5   int32
118		lbelow6   int32
119		lbelow7   int32
120		lbelow8   int32
121		lbelow9   int32
122		lbelow10  int32
123		lbelow11  int32
124		lbelow12  int32
125		lbelow13  int32
126		lbelow14  int32
127		lbelow15  int32
128		s00       uint32
129		s01       uint32
130		s02       uint32
131		s03       uint32
132		s10       uint32
133		s11       uint32
134		s12       uint32
135		s13       uint32
136		s20       uint32
137		s21       uint32
138		s22       uint32
139		s23       uint32
140		s30       uint32
141		s31       uint32
142		s32       uint32
143		s33       uint32
144		bits32    uint64
145		f         uint64
146		f0        uint64
147		f1        uint64
148		f2        uint64
149		f3        uint64
150		f4        uint64
151		g         uint64
152		g0        uint64
153		g1        uint64
154		g2        uint64
155		g3        uint64
156		g4        uint64
157	)
158
159	var p int32
160
161	l := int32(len(m))
162
163	r00 := uint32(r[0])
164
165	r01 := uint32(r[1])
166
167	r02 := uint32(r[2])
168	r0 := int64(2151)
169
170	r03 := uint32(r[3])
171	r03 &= 15
172	r0 <<= 51
173
174	r10 := uint32(r[4])
175	r10 &= 252
176	r01 <<= 8
177	r0 += int64(r00)
178
179	r11 := uint32(r[5])
180	r02 <<= 16
181	r0 += int64(r01)
182
183	r12 := uint32(r[6])
184	r03 <<= 24
185	r0 += int64(r02)
186
187	r13 := uint32(r[7])
188	r13 &= 15
189	r1 := int64(2215)
190	r0 += int64(r03)
191
192	d0 := r0
193	r1 <<= 51
194	r2 := int64(2279)
195
196	r20 := uint32(r[8])
197	r20 &= 252
198	r11 <<= 8
199	r1 += int64(r10)
200
201	r21 := uint32(r[9])
202	r12 <<= 16
203	r1 += int64(r11)
204
205	r22 := uint32(r[10])
206	r13 <<= 24
207	r1 += int64(r12)
208
209	r23 := uint32(r[11])
210	r23 &= 15
211	r2 <<= 51
212	r1 += int64(r13)
213
214	d1 := r1
215	r21 <<= 8
216	r2 += int64(r20)
217
218	r30 := uint32(r[12])
219	r30 &= 252
220	r22 <<= 16
221	r2 += int64(r21)
222
223	r31 := uint32(r[13])
224	r23 <<= 24
225	r2 += int64(r22)
226
227	r32 := uint32(r[14])
228	r2 += int64(r23)
229	r3 := int64(2343)
230
231	d2 := r2
232	r3 <<= 51
233
234	r33 := uint32(r[15])
235	r33 &= 15
236	r31 <<= 8
237	r3 += int64(r30)
238
239	r32 <<= 16
240	r3 += int64(r31)
241
242	r33 <<= 24
243	r3 += int64(r32)
244
245	r3 += int64(r33)
246	h0 := alpha32 - alpha32
247
248	d3 := r3
249	h1 := alpha32 - alpha32
250
251	h2 := alpha32 - alpha32
252
253	h3 := alpha32 - alpha32
254
255	h4 := alpha32 - alpha32
256
257	r0low := math.Float64frombits(uint64(d0))
258	h5 := alpha32 - alpha32
259
260	r1low := math.Float64frombits(uint64(d1))
261	h6 := alpha32 - alpha32
262
263	r2low := math.Float64frombits(uint64(d2))
264	h7 := alpha32 - alpha32
265
266	r0low -= alpha0
267
268	r1low -= alpha32
269
270	r2low -= alpha64
271
272	r0high := r0low + alpha18
273
274	r3low := math.Float64frombits(uint64(d3))
275
276	r1high := r1low + alpha50
277	sr1low := scale * r1low
278
279	r2high := r2low + alpha82
280	sr2low := scale * r2low
281
282	r0high -= alpha18
283	r0high_stack := r0high
284
285	r3low -= alpha96
286
287	r1high -= alpha50
288	r1high_stack := r1high
289
290	sr1high := sr1low + alpham80
291
292	r0low -= r0high
293
294	r2high -= alpha82
295	sr3low = scale * r3low
296
297	sr2high := sr2low + alpham48
298
299	r1low -= r1high
300	r1low_stack := r1low
301
302	sr1high -= alpham80
303	sr1high_stack := sr1high
304
305	r2low -= r2high
306	r2low_stack := r2low
307
308	sr2high -= alpham48
309	sr2high_stack := sr2high
310
311	r3high := r3low + alpha112
312	r0low_stack := r0low
313
314	sr1low -= sr1high
315	sr1low_stack := sr1low
316
317	sr3high := sr3low + alpham16
318	r2high_stack := r2high
319
320	sr2low -= sr2high
321	sr2low_stack := sr2low
322
323	r3high -= alpha112
324	r3high_stack := r3high
325
326	sr3high -= alpham16
327	sr3high_stack := sr3high
328
329	r3low -= r3high
330	r3low_stack := r3low
331
332	sr3low -= sr3high
333	sr3low_stack := sr3low
334
335	if l < 16 {
336		goto addatmost15bytes
337	}
338
339	m00 = uint32(m[p+0])
340	m0 = 2151
341
342	m0 <<= 51
343	m1 = 2215
344	m01 = uint32(m[p+1])
345
346	m1 <<= 51
347	m2 = 2279
348	m02 = uint32(m[p+2])
349
350	m2 <<= 51
351	m3 = 2343
352	m03 = uint32(m[p+3])
353
354	m10 = uint32(m[p+4])
355	m01 <<= 8
356	m0 += int64(m00)
357
358	m11 = uint32(m[p+5])
359	m02 <<= 16
360	m0 += int64(m01)
361
362	m12 = uint32(m[p+6])
363	m03 <<= 24
364	m0 += int64(m02)
365
366	m13 = uint32(m[p+7])
367	m3 <<= 51
368	m0 += int64(m03)
369
370	m20 = uint32(m[p+8])
371	m11 <<= 8
372	m1 += int64(m10)
373
374	m21 = uint32(m[p+9])
375	m12 <<= 16
376	m1 += int64(m11)
377
378	m22 = uint32(m[p+10])
379	m13 <<= 24
380	m1 += int64(m12)
381
382	m23 = uint32(m[p+11])
383	m1 += int64(m13)
384
385	m30 = uint32(m[p+12])
386	m21 <<= 8
387	m2 += int64(m20)
388
389	m31 = uint32(m[p+13])
390	m22 <<= 16
391	m2 += int64(m21)
392
393	m32 = uint32(m[p+14])
394	m23 <<= 24
395	m2 += int64(m22)
396
397	m33 = uint64(m[p+15])
398	m2 += int64(m23)
399
400	d0 = m0
401	m31 <<= 8
402	m3 += int64(m30)
403
404	d1 = m1
405	m32 <<= 16
406	m3 += int64(m31)
407
408	d2 = m2
409	m33 += 256
410
411	m33 <<= 24
412	m3 += int64(m32)
413
414	m3 += int64(m33)
415	d3 = m3
416
417	p += 16
418	l -= 16
419
420	z0 = math.Float64frombits(uint64(d0))
421
422	z1 = math.Float64frombits(uint64(d1))
423
424	z2 = math.Float64frombits(uint64(d2))
425
426	z3 = math.Float64frombits(uint64(d3))
427
428	z0 -= alpha0
429
430	z1 -= alpha32
431
432	z2 -= alpha64
433
434	z3 -= alpha96
435
436	h0 += z0
437
438	h1 += z1
439
440	h3 += z2
441
442	h5 += z3
443
444	if l < 16 {
445		goto multiplyaddatmost15bytes
446	}
447
448multiplyaddatleast16bytes:
449
450	m2 = 2279
451	m20 = uint32(m[p+8])
452	y7 = h7 + alpha130
453
454	m2 <<= 51
455	m3 = 2343
456	m21 = uint32(m[p+9])
457	y6 = h6 + alpha130
458
459	m3 <<= 51
460	m0 = 2151
461	m22 = uint32(m[p+10])
462	y1 = h1 + alpha32
463
464	m0 <<= 51
465	m1 = 2215
466	m23 = uint32(m[p+11])
467	y0 = h0 + alpha32
468
469	m1 <<= 51
470	m30 = uint32(m[p+12])
471	y7 -= alpha130
472
473	m21 <<= 8
474	m2 += int64(m20)
475	m31 = uint32(m[p+13])
476	y6 -= alpha130
477
478	m22 <<= 16
479	m2 += int64(m21)
480	m32 = uint32(m[p+14])
481	y1 -= alpha32
482
483	m23 <<= 24
484	m2 += int64(m22)
485	m33 = uint64(m[p+15])
486	y0 -= alpha32
487
488	m2 += int64(m23)
489	m00 = uint32(m[p+0])
490	y5 = h5 + alpha96
491
492	m31 <<= 8
493	m3 += int64(m30)
494	m01 = uint32(m[p+1])
495	y4 = h4 + alpha96
496
497	m32 <<= 16
498	m02 = uint32(m[p+2])
499	x7 = h7 - y7
500	y7 *= scale
501
502	m33 += 256
503	m03 = uint32(m[p+3])
504	x6 = h6 - y6
505	y6 *= scale
506
507	m33 <<= 24
508	m3 += int64(m31)
509	m10 = uint32(m[p+4])
510	x1 = h1 - y1
511
512	m01 <<= 8
513	m3 += int64(m32)
514	m11 = uint32(m[p+5])
515	x0 = h0 - y0
516
517	m3 += int64(m33)
518	m0 += int64(m00)
519	m12 = uint32(m[p+6])
520	y5 -= alpha96
521
522	m02 <<= 16
523	m0 += int64(m01)
524	m13 = uint32(m[p+7])
525	y4 -= alpha96
526
527	m03 <<= 24
528	m0 += int64(m02)
529	d2 = m2
530	x1 += y7
531
532	m0 += int64(m03)
533	d3 = m3
534	x0 += y6
535
536	m11 <<= 8
537	m1 += int64(m10)
538	d0 = m0
539	x7 += y5
540
541	m12 <<= 16
542	m1 += int64(m11)
543	x6 += y4
544
545	m13 <<= 24
546	m1 += int64(m12)
547	y3 = h3 + alpha64
548
549	m1 += int64(m13)
550	d1 = m1
551	y2 = h2 + alpha64
552
553	x0 += x1
554
555	x6 += x7
556
557	y3 -= alpha64
558	r3low = r3low_stack
559
560	y2 -= alpha64
561	r0low = r0low_stack
562
563	x5 = h5 - y5
564	r3lowx0 = r3low * x0
565	r3high = r3high_stack
566
567	x4 = h4 - y4
568	r0lowx6 = r0low * x6
569	r0high = r0high_stack
570
571	x3 = h3 - y3
572	r3highx0 = r3high * x0
573	sr1low = sr1low_stack
574
575	x2 = h2 - y2
576	r0highx6 = r0high * x6
577	sr1high = sr1high_stack
578
579	x5 += y3
580	r0lowx0 = r0low * x0
581	r1low = r1low_stack
582
583	h6 = r3lowx0 + r0lowx6
584	sr1lowx6 = sr1low * x6
585	r1high = r1high_stack
586
587	x4 += y2
588	r0highx0 = r0high * x0
589	sr2low = sr2low_stack
590
591	h7 = r3highx0 + r0highx6
592	sr1highx6 = sr1high * x6
593	sr2high = sr2high_stack
594
595	x3 += y1
596	r1lowx0 = r1low * x0
597	r2low = r2low_stack
598
599	h0 = r0lowx0 + sr1lowx6
600	sr2lowx6 = sr2low * x6
601	r2high = r2high_stack
602
603	x2 += y0
604	r1highx0 = r1high * x0
605	sr3low = sr3low_stack
606
607	h1 = r0highx0 + sr1highx6
608	sr2highx6 = sr2high * x6
609	sr3high = sr3high_stack
610
611	x4 += x5
612	r2lowx0 = r2low * x0
613	z2 = math.Float64frombits(uint64(d2))
614
615	h2 = r1lowx0 + sr2lowx6
616	sr3lowx6 = sr3low * x6
617
618	x2 += x3
619	r2highx0 = r2high * x0
620	z3 = math.Float64frombits(uint64(d3))
621
622	h3 = r1highx0 + sr2highx6
623	sr3highx6 = sr3high * x6
624
625	r1highx4 = r1high * x4
626	z2 -= alpha64
627
628	h4 = r2lowx0 + sr3lowx6
629	r1lowx4 = r1low * x4
630
631	r0highx4 = r0high * x4
632	z3 -= alpha96
633
634	h5 = r2highx0 + sr3highx6
635	r0lowx4 = r0low * x4
636
637	h7 += r1highx4
638	sr3highx4 = sr3high * x4
639
640	h6 += r1lowx4
641	sr3lowx4 = sr3low * x4
642
643	h5 += r0highx4
644	sr2highx4 = sr2high * x4
645
646	h4 += r0lowx4
647	sr2lowx4 = sr2low * x4
648
649	h3 += sr3highx4
650	r0lowx2 = r0low * x2
651
652	h2 += sr3lowx4
653	r0highx2 = r0high * x2
654
655	h1 += sr2highx4
656	r1lowx2 = r1low * x2
657
658	h0 += sr2lowx4
659	r1highx2 = r1high * x2
660
661	h2 += r0lowx2
662	r2lowx2 = r2low * x2
663
664	h3 += r0highx2
665	r2highx2 = r2high * x2
666
667	h4 += r1lowx2
668	sr3lowx2 = sr3low * x2
669
670	h5 += r1highx2
671	sr3highx2 = sr3high * x2
672
673	p += 16
674	l -= 16
675	h6 += r2lowx2
676
677	h7 += r2highx2
678
679	z1 = math.Float64frombits(uint64(d1))
680	h0 += sr3lowx2
681
682	z0 = math.Float64frombits(uint64(d0))
683	h1 += sr3highx2
684
685	z1 -= alpha32
686
687	z0 -= alpha0
688
689	h5 += z3
690
691	h3 += z2
692
693	h1 += z1
694
695	h0 += z0
696
697	if l >= 16 {
698		goto multiplyaddatleast16bytes
699	}
700
701multiplyaddatmost15bytes:
702
703	y7 = h7 + alpha130
704
705	y6 = h6 + alpha130
706
707	y1 = h1 + alpha32
708
709	y0 = h0 + alpha32
710
711	y7 -= alpha130
712
713	y6 -= alpha130
714
715	y1 -= alpha32
716
717	y0 -= alpha32
718
719	y5 = h5 + alpha96
720
721	y4 = h4 + alpha96
722
723	x7 = h7 - y7
724	y7 *= scale
725
726	x6 = h6 - y6
727	y6 *= scale
728
729	x1 = h1 - y1
730
731	x0 = h0 - y0
732
733	y5 -= alpha96
734
735	y4 -= alpha96
736
737	x1 += y7
738
739	x0 += y6
740
741	x7 += y5
742
743	x6 += y4
744
745	y3 = h3 + alpha64
746
747	y2 = h2 + alpha64
748
749	x0 += x1
750
751	x6 += x7
752
753	y3 -= alpha64
754	r3low = r3low_stack
755
756	y2 -= alpha64
757	r0low = r0low_stack
758
759	x5 = h5 - y5
760	r3lowx0 = r3low * x0
761	r3high = r3high_stack
762
763	x4 = h4 - y4
764	r0lowx6 = r0low * x6
765	r0high = r0high_stack
766
767	x3 = h3 - y3
768	r3highx0 = r3high * x0
769	sr1low = sr1low_stack
770
771	x2 = h2 - y2
772	r0highx6 = r0high * x6
773	sr1high = sr1high_stack
774
775	x5 += y3
776	r0lowx0 = r0low * x0
777	r1low = r1low_stack
778
779	h6 = r3lowx0 + r0lowx6
780	sr1lowx6 = sr1low * x6
781	r1high = r1high_stack
782
783	x4 += y2
784	r0highx0 = r0high * x0
785	sr2low = sr2low_stack
786
787	h7 = r3highx0 + r0highx6
788	sr1highx6 = sr1high * x6
789	sr2high = sr2high_stack
790
791	x3 += y1
792	r1lowx0 = r1low * x0
793	r2low = r2low_stack
794
795	h0 = r0lowx0 + sr1lowx6
796	sr2lowx6 = sr2low * x6
797	r2high = r2high_stack
798
799	x2 += y0
800	r1highx0 = r1high * x0
801	sr3low = sr3low_stack
802
803	h1 = r0highx0 + sr1highx6
804	sr2highx6 = sr2high * x6
805	sr3high = sr3high_stack
806
807	x4 += x5
808	r2lowx0 = r2low * x0
809
810	h2 = r1lowx0 + sr2lowx6
811	sr3lowx6 = sr3low * x6
812
813	x2 += x3
814	r2highx0 = r2high * x0
815
816	h3 = r1highx0 + sr2highx6
817	sr3highx6 = sr3high * x6
818
819	r1highx4 = r1high * x4
820
821	h4 = r2lowx0 + sr3lowx6
822	r1lowx4 = r1low * x4
823
824	r0highx4 = r0high * x4
825
826	h5 = r2highx0 + sr3highx6
827	r0lowx4 = r0low * x4
828
829	h7 += r1highx4
830	sr3highx4 = sr3high * x4
831
832	h6 += r1lowx4
833	sr3lowx4 = sr3low * x4
834
835	h5 += r0highx4
836	sr2highx4 = sr2high * x4
837
838	h4 += r0lowx4
839	sr2lowx4 = sr2low * x4
840
841	h3 += sr3highx4
842	r0lowx2 = r0low * x2
843
844	h2 += sr3lowx4
845	r0highx2 = r0high * x2
846
847	h1 += sr2highx4
848	r1lowx2 = r1low * x2
849
850	h0 += sr2lowx4
851	r1highx2 = r1high * x2
852
853	h2 += r0lowx2
854	r2lowx2 = r2low * x2
855
856	h3 += r0highx2
857	r2highx2 = r2high * x2
858
859	h4 += r1lowx2
860	sr3lowx2 = sr3low * x2
861
862	h5 += r1highx2
863	sr3highx2 = sr3high * x2
864
865	h6 += r2lowx2
866
867	h7 += r2highx2
868
869	h0 += sr3lowx2
870
871	h1 += sr3highx2
872
873addatmost15bytes:
874
875	if l == 0 {
876		goto nomorebytes
877	}
878
879	lbelow2 = l - 2
880
881	lbelow3 = l - 3
882
883	lbelow2 >>= 31
884	lbelow4 = l - 4
885
886	m00 = uint32(m[p+0])
887	lbelow3 >>= 31
888	p += lbelow2
889
890	m01 = uint32(m[p+1])
891	lbelow4 >>= 31
892	p += lbelow3
893
894	m02 = uint32(m[p+2])
895	p += lbelow4
896	m0 = 2151
897
898	m03 = uint32(m[p+3])
899	m0 <<= 51
900	m1 = 2215
901
902	m0 += int64(m00)
903	m01 &^= uint32(lbelow2)
904
905	m02 &^= uint32(lbelow3)
906	m01 -= uint32(lbelow2)
907
908	m01 <<= 8
909	m03 &^= uint32(lbelow4)
910
911	m0 += int64(m01)
912	lbelow2 -= lbelow3
913
914	m02 += uint32(lbelow2)
915	lbelow3 -= lbelow4
916
917	m02 <<= 16
918	m03 += uint32(lbelow3)
919
920	m03 <<= 24
921	m0 += int64(m02)
922
923	m0 += int64(m03)
924	lbelow5 = l - 5
925
926	lbelow6 = l - 6
927	lbelow7 = l - 7
928
929	lbelow5 >>= 31
930	lbelow8 = l - 8
931
932	lbelow6 >>= 31
933	p += lbelow5
934
935	m10 = uint32(m[p+4])
936	lbelow7 >>= 31
937	p += lbelow6
938
939	m11 = uint32(m[p+5])
940	lbelow8 >>= 31
941	p += lbelow7
942
943	m12 = uint32(m[p+6])
944	m1 <<= 51
945	p += lbelow8
946
947	m13 = uint32(m[p+7])
948	m10 &^= uint32(lbelow5)
949	lbelow4 -= lbelow5
950
951	m10 += uint32(lbelow4)
952	lbelow5 -= lbelow6
953
954	m11 &^= uint32(lbelow6)
955	m11 += uint32(lbelow5)
956
957	m11 <<= 8
958	m1 += int64(m10)
959
960	m1 += int64(m11)
961	m12 &^= uint32(lbelow7)
962
963	lbelow6 -= lbelow7
964	m13 &^= uint32(lbelow8)
965
966	m12 += uint32(lbelow6)
967	lbelow7 -= lbelow8
968
969	m12 <<= 16
970	m13 += uint32(lbelow7)
971
972	m13 <<= 24
973	m1 += int64(m12)
974
975	m1 += int64(m13)
976	m2 = 2279
977
978	lbelow9 = l - 9
979	m3 = 2343
980
981	lbelow10 = l - 10
982	lbelow11 = l - 11
983
984	lbelow9 >>= 31
985	lbelow12 = l - 12
986
987	lbelow10 >>= 31
988	p += lbelow9
989
990	m20 = uint32(m[p+8])
991	lbelow11 >>= 31
992	p += lbelow10
993
994	m21 = uint32(m[p+9])
995	lbelow12 >>= 31
996	p += lbelow11
997
998	m22 = uint32(m[p+10])
999	m2 <<= 51
1000	p += lbelow12
1001
1002	m23 = uint32(m[p+11])
1003	m20 &^= uint32(lbelow9)
1004	lbelow8 -= lbelow9
1005
1006	m20 += uint32(lbelow8)
1007	lbelow9 -= lbelow10
1008
1009	m21 &^= uint32(lbelow10)
1010	m21 += uint32(lbelow9)
1011
1012	m21 <<= 8
1013	m2 += int64(m20)
1014
1015	m2 += int64(m21)
1016	m22 &^= uint32(lbelow11)
1017
1018	lbelow10 -= lbelow11
1019	m23 &^= uint32(lbelow12)
1020
1021	m22 += uint32(lbelow10)
1022	lbelow11 -= lbelow12
1023
1024	m22 <<= 16
1025	m23 += uint32(lbelow11)
1026
1027	m23 <<= 24
1028	m2 += int64(m22)
1029
1030	m3 <<= 51
1031	lbelow13 = l - 13
1032
1033	lbelow13 >>= 31
1034	lbelow14 = l - 14
1035
1036	lbelow14 >>= 31
1037	p += lbelow13
1038	lbelow15 = l - 15
1039
1040	m30 = uint32(m[p+12])
1041	lbelow15 >>= 31
1042	p += lbelow14
1043
1044	m31 = uint32(m[p+13])
1045	p += lbelow15
1046	m2 += int64(m23)
1047
1048	m32 = uint32(m[p+14])
1049	m30 &^= uint32(lbelow13)
1050	lbelow12 -= lbelow13
1051
1052	m30 += uint32(lbelow12)
1053	lbelow13 -= lbelow14
1054
1055	m3 += int64(m30)
1056	m31 &^= uint32(lbelow14)
1057
1058	m31 += uint32(lbelow13)
1059	m32 &^= uint32(lbelow15)
1060
1061	m31 <<= 8
1062	lbelow14 -= lbelow15
1063
1064	m3 += int64(m31)
1065	m32 += uint32(lbelow14)
1066	d0 = m0
1067
1068	m32 <<= 16
1069	m33 = uint64(lbelow15 + 1)
1070	d1 = m1
1071
1072	m33 <<= 24
1073	m3 += int64(m32)
1074	d2 = m2
1075
1076	m3 += int64(m33)
1077	d3 = m3
1078
1079	z3 = math.Float64frombits(uint64(d3))
1080
1081	z2 = math.Float64frombits(uint64(d2))
1082
1083	z1 = math.Float64frombits(uint64(d1))
1084
1085	z0 = math.Float64frombits(uint64(d0))
1086
1087	z3 -= alpha96
1088
1089	z2 -= alpha64
1090
1091	z1 -= alpha32
1092
1093	z0 -= alpha0
1094
1095	h5 += z3
1096
1097	h3 += z2
1098
1099	h1 += z1
1100
1101	h0 += z0
1102
1103	y7 = h7 + alpha130
1104
1105	y6 = h6 + alpha130
1106
1107	y1 = h1 + alpha32
1108
1109	y0 = h0 + alpha32
1110
1111	y7 -= alpha130
1112
1113	y6 -= alpha130
1114
1115	y1 -= alpha32
1116
1117	y0 -= alpha32
1118
1119	y5 = h5 + alpha96
1120
1121	y4 = h4 + alpha96
1122
1123	x7 = h7 - y7
1124	y7 *= scale
1125
1126	x6 = h6 - y6
1127	y6 *= scale
1128
1129	x1 = h1 - y1
1130
1131	x0 = h0 - y0
1132
1133	y5 -= alpha96
1134
1135	y4 -= alpha96
1136
1137	x1 += y7
1138
1139	x0 += y6
1140
1141	x7 += y5
1142
1143	x6 += y4
1144
1145	y3 = h3 + alpha64
1146
1147	y2 = h2 + alpha64
1148
1149	x0 += x1
1150
1151	x6 += x7
1152
1153	y3 -= alpha64
1154	r3low = r3low_stack
1155
1156	y2 -= alpha64
1157	r0low = r0low_stack
1158
1159	x5 = h5 - y5
1160	r3lowx0 = r3low * x0
1161	r3high = r3high_stack
1162
1163	x4 = h4 - y4
1164	r0lowx6 = r0low * x6
1165	r0high = r0high_stack
1166
1167	x3 = h3 - y3
1168	r3highx0 = r3high * x0
1169	sr1low = sr1low_stack
1170
1171	x2 = h2 - y2
1172	r0highx6 = r0high * x6
1173	sr1high = sr1high_stack
1174
1175	x5 += y3
1176	r0lowx0 = r0low * x0
1177	r1low = r1low_stack
1178
1179	h6 = r3lowx0 + r0lowx6
1180	sr1lowx6 = sr1low * x6
1181	r1high = r1high_stack
1182
1183	x4 += y2
1184	r0highx0 = r0high * x0
1185	sr2low = sr2low_stack
1186
1187	h7 = r3highx0 + r0highx6
1188	sr1highx6 = sr1high * x6
1189	sr2high = sr2high_stack
1190
1191	x3 += y1
1192	r1lowx0 = r1low * x0
1193	r2low = r2low_stack
1194
1195	h0 = r0lowx0 + sr1lowx6
1196	sr2lowx6 = sr2low * x6
1197	r2high = r2high_stack
1198
1199	x2 += y0
1200	r1highx0 = r1high * x0
1201	sr3low = sr3low_stack
1202
1203	h1 = r0highx0 + sr1highx6
1204	sr2highx6 = sr2high * x6
1205	sr3high = sr3high_stack
1206
1207	x4 += x5
1208	r2lowx0 = r2low * x0
1209
1210	h2 = r1lowx0 + sr2lowx6
1211	sr3lowx6 = sr3low * x6
1212
1213	x2 += x3
1214	r2highx0 = r2high * x0
1215
1216	h3 = r1highx0 + sr2highx6
1217	sr3highx6 = sr3high * x6
1218
1219	r1highx4 = r1high * x4
1220
1221	h4 = r2lowx0 + sr3lowx6
1222	r1lowx4 = r1low * x4
1223
1224	r0highx4 = r0high * x4
1225
1226	h5 = r2highx0 + sr3highx6
1227	r0lowx4 = r0low * x4
1228
1229	h7 += r1highx4
1230	sr3highx4 = sr3high * x4
1231
1232	h6 += r1lowx4
1233	sr3lowx4 = sr3low * x4
1234
1235	h5 += r0highx4
1236	sr2highx4 = sr2high * x4
1237
1238	h4 += r0lowx4
1239	sr2lowx4 = sr2low * x4
1240
1241	h3 += sr3highx4
1242	r0lowx2 = r0low * x2
1243
1244	h2 += sr3lowx4
1245	r0highx2 = r0high * x2
1246
1247	h1 += sr2highx4
1248	r1lowx2 = r1low * x2
1249
1250	h0 += sr2lowx4
1251	r1highx2 = r1high * x2
1252
1253	h2 += r0lowx2
1254	r2lowx2 = r2low * x2
1255
1256	h3 += r0highx2
1257	r2highx2 = r2high * x2
1258
1259	h4 += r1lowx2
1260	sr3lowx2 = sr3low * x2
1261
1262	h5 += r1highx2
1263	sr3highx2 = sr3high * x2
1264
1265	h6 += r2lowx2
1266
1267	h7 += r2highx2
1268
1269	h0 += sr3lowx2
1270
1271	h1 += sr3highx2
1272
1273nomorebytes:
1274
1275	y7 = h7 + alpha130
1276
1277	y0 = h0 + alpha32
1278
1279	y1 = h1 + alpha32
1280
1281	y2 = h2 + alpha64
1282
1283	y7 -= alpha130
1284
1285	y3 = h3 + alpha64
1286
1287	y4 = h4 + alpha96
1288
1289	y5 = h5 + alpha96
1290
1291	x7 = h7 - y7
1292	y7 *= scale
1293
1294	y0 -= alpha32
1295
1296	y1 -= alpha32
1297
1298	y2 -= alpha64
1299
1300	h6 += x7
1301
1302	y3 -= alpha64
1303
1304	y4 -= alpha96
1305
1306	y5 -= alpha96
1307
1308	y6 = h6 + alpha130
1309
1310	x0 = h0 - y0
1311
1312	x1 = h1 - y1
1313
1314	x2 = h2 - y2
1315
1316	y6 -= alpha130
1317
1318	x0 += y7
1319
1320	x3 = h3 - y3
1321
1322	x4 = h4 - y4
1323
1324	x5 = h5 - y5
1325
1326	x6 = h6 - y6
1327
1328	y6 *= scale
1329
1330	x2 += y0
1331
1332	x3 += y1
1333
1334	x4 += y2
1335
1336	x0 += y6
1337
1338	x5 += y3
1339
1340	x6 += y4
1341
1342	x2 += x3
1343
1344	x0 += x1
1345
1346	x4 += x5
1347
1348	x6 += y5
1349
1350	x2 += offset1
1351	d1 = int64(math.Float64bits(x2))
1352
1353	x0 += offset0
1354	d0 = int64(math.Float64bits(x0))
1355
1356	x4 += offset2
1357	d2 = int64(math.Float64bits(x4))
1358
1359	x6 += offset3
1360	d3 = int64(math.Float64bits(x6))
1361
1362	f0 = uint64(d0)
1363
1364	f1 = uint64(d1)
1365	bits32 = math.MaxUint64
1366
1367	f2 = uint64(d2)
1368	bits32 >>= 32
1369
1370	f3 = uint64(d3)
1371	f = f0 >> 32
1372
1373	f0 &= bits32
1374	f &= 255
1375
1376	f1 += f
1377	g0 = f0 + 5
1378
1379	g = g0 >> 32
1380	g0 &= bits32
1381
1382	f = f1 >> 32
1383	f1 &= bits32
1384
1385	f &= 255
1386	g1 = f1 + g
1387
1388	g = g1 >> 32
1389	f2 += f
1390
1391	f = f2 >> 32
1392	g1 &= bits32
1393
1394	f2 &= bits32
1395	f &= 255
1396
1397	f3 += f
1398	g2 = f2 + g
1399
1400	g = g2 >> 32
1401	g2 &= bits32
1402
1403	f4 = f3 >> 32
1404	f3 &= bits32
1405
1406	f4 &= 255
1407	g3 = f3 + g
1408
1409	g = g3 >> 32
1410	g3 &= bits32
1411
1412	g4 = f4 + g
1413
1414	g4 = g4 - 4
1415	s00 = uint32(s[0])
1416
1417	f = uint64(int64(g4) >> 63)
1418	s01 = uint32(s[1])
1419
1420	f0 &= f
1421	g0 &^= f
1422	s02 = uint32(s[2])
1423
1424	f1 &= f
1425	f0 |= g0
1426	s03 = uint32(s[3])
1427
1428	g1 &^= f
1429	f2 &= f
1430	s10 = uint32(s[4])
1431
1432	f3 &= f
1433	g2 &^= f
1434	s11 = uint32(s[5])
1435
1436	g3 &^= f
1437	f1 |= g1
1438	s12 = uint32(s[6])
1439
1440	f2 |= g2
1441	f3 |= g3
1442	s13 = uint32(s[7])
1443
1444	s01 <<= 8
1445	f0 += uint64(s00)
1446	s20 = uint32(s[8])
1447
1448	s02 <<= 16
1449	f0 += uint64(s01)
1450	s21 = uint32(s[9])
1451
1452	s03 <<= 24
1453	f0 += uint64(s02)
1454	s22 = uint32(s[10])
1455
1456	s11 <<= 8
1457	f1 += uint64(s10)
1458	s23 = uint32(s[11])
1459
1460	s12 <<= 16
1461	f1 += uint64(s11)
1462	s30 = uint32(s[12])
1463
1464	s13 <<= 24
1465	f1 += uint64(s12)
1466	s31 = uint32(s[13])
1467
1468	f0 += uint64(s03)
1469	f1 += uint64(s13)
1470	s32 = uint32(s[14])
1471
1472	s21 <<= 8
1473	f2 += uint64(s20)
1474	s33 = uint32(s[15])
1475
1476	s22 <<= 16
1477	f2 += uint64(s21)
1478
1479	s23 <<= 24
1480	f2 += uint64(s22)
1481
1482	s31 <<= 8
1483	f3 += uint64(s30)
1484
1485	s32 <<= 16
1486	f3 += uint64(s31)
1487
1488	s33 <<= 24
1489	f3 += uint64(s32)
1490
1491	f2 += uint64(s23)
1492	f3 += uint64(s33)
1493
1494	out[0] = byte(f0)
1495	f0 >>= 8
1496	out[1] = byte(f0)
1497	f0 >>= 8
1498	out[2] = byte(f0)
1499	f0 >>= 8
1500	out[3] = byte(f0)
1501	f0 >>= 8
1502	f1 += f0
1503
1504	out[4] = byte(f1)
1505	f1 >>= 8
1506	out[5] = byte(f1)
1507	f1 >>= 8
1508	out[6] = byte(f1)
1509	f1 >>= 8
1510	out[7] = byte(f1)
1511	f1 >>= 8
1512	f2 += f1
1513
1514	out[8] = byte(f2)
1515	f2 >>= 8
1516	out[9] = byte(f2)
1517	f2 >>= 8
1518	out[10] = byte(f2)
1519	f2 >>= 8
1520	out[11] = byte(f2)
1521	f2 >>= 8
1522	f3 += f2
1523
1524	out[12] = byte(f3)
1525	f3 >>= 8
1526	out[13] = byte(f3)
1527	f3 >>= 8
1528	out[14] = byte(f3)
1529	f3 >>= 8
1530	out[15] = byte(f3)
1531}
1532