1# Copyright (c) 2018, Amazon Inc.
2#
3# Permission to use, copy, modify, and/or distribute this software for any
4# purpose with or without fee is hereby granted, provided that the above
5# copyright notice and this permission notice appear in all copies.
6#
7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14#
15# Written by Nir Drucker, and Shay Gueron
16# AWS Cryptographic Algorithms Group
17# (ndrucker@amazon.com, gueron@amazon.com)
18# based on BN_mod_inverse_odd
19
20$flavour = shift;
21$output  = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
32*STDOUT=*OUT;
33
34#############################################################################
35# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
36#                                     BN_ULONG a[P256_LIMBS],
37#                                     BN_ULONG n[P256_LIMBS]);
38#
39# (Binary Extended Euclidean Algorithm.
40#  See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
41#
42# Assumption 1: n is odd for the BEEU
43# Assumption 2: 1 < a < n < 2^256
44
45$out = "%rdi";
46$a = "%rsi";
47$n = "%rdx";
48
49# X/Y will hold the inverse parameter
50# Assumption: X,Y<2^(256)
51$x0 = "%r8";
52$x1 = "%r9";
53$x2 = "%r10";
54$x3 = "%r11";
55# borrow from out (out is needed only at the end)
56$x4 = "%rdi";
57$y0 = "%r12";
58$y1 = "%r13";
59$y2 = "%r14";
60$y3 = "%r15";
61$y4 = "%rbp";
62$shift = "%rcx";
63$t0 = "%rax";
64$t1 = "%rbx";
65$t2 = "%rsi";
66# borrow
67$t3 = "%rcx";
68
69$T0 = "%xmm0";
70$T1 = "%xmm1";
71
72# Offsets on the stack
73$out_rsp = 0;
74$shift_rsp = $out_rsp+0x8;
75$a_rsp0 = $shift_rsp+0x8;
76$a_rsp1 = $a_rsp0+0x8;
77$a_rsp2 = $a_rsp1+0x8;
78$a_rsp3 = $a_rsp2+0x8;
79$b_rsp0 = $a_rsp3+0x8;
80$b_rsp1 = $b_rsp0+0x8;
81$b_rsp2 = $b_rsp1+0x8;
82$b_rsp3 = $b_rsp2+0x8;
83
84# Borrow when a_rsp/b_rsp are no longer needed.
85$y_rsp0 = $a_rsp0;
86$y_rsp1 = $y_rsp0+0x8;
87$y_rsp2 = $y_rsp1+0x8;
88$y_rsp3 = $y_rsp2+0x8;
89$y_rsp4 = $y_rsp3+0x8;
90$last_rsp_offset = $b_rsp3+0x8;
91
92sub TEST_B_ZERO {
93  return <<___;
94    xorq $t1, $t1
95    or $b_rsp0(%rsp), $t1
96    or $b_rsp1(%rsp), $t1
97    or $b_rsp2(%rsp), $t1
98    or $b_rsp3(%rsp), $t1
99    jz .Lbeeu_loop_end
100___
101}
102
103$g_next_label = 0;
104
105sub SHIFT1 {
106  my ($var0, $var1, $var2, $var3, $var4) = @_;
107  my $label = ".Lshift1_${g_next_label}";
108  $g_next_label++;
109
110  return <<___;
111    # Ensure X is even and divide by two.
112    movq \$1, $t1
113    andq $var0, $t1
114    jz $label
115    add 0*8($n), $var0
116    adc 1*8($n), $var1
117    adc 2*8($n), $var2
118    adc 3*8($n), $var3
119    adc \$0, $var4
120
121$label:
122    shrdq \$1, $var1, $var0
123    shrdq \$1, $var2, $var1
124    shrdq \$1, $var3, $var2
125    shrdq \$1, $var4, $var3
126    shrq  \$1, $var4
127___
128}
129
130sub SHIFT256 {
131  my ($var) = @_;
132  return <<___;
133    # Copy shifted values.
134    # Remember not to override t3=rcx
135    movq 1*8+$var(%rsp), $t0
136    movq 2*8+$var(%rsp), $t1
137    movq 3*8+$var(%rsp), $t2
138
139    shrdq %cl, $t0, 0*8+$var(%rsp)
140    shrdq %cl, $t1, 1*8+$var(%rsp)
141    shrdq %cl, $t2, 2*8+$var(%rsp)
142
143    shrq  %cl, $t2
144    mov $t2, 3*8+$var(%rsp)
145___
146}
147
148$code.=<<___;
149.text
150
151.type beeu_mod_inverse_vartime,\@function
152.hidden beeu_mod_inverse_vartime
153.globl  beeu_mod_inverse_vartime
154.align 32
155beeu_mod_inverse_vartime:
156.cfi_startproc
157    push %rbp
158.cfi_push rbp
159    push %r12
160.cfi_push r12
161    push %r13
162.cfi_push r13
163    push %r14
164.cfi_push r14
165    push %r15
166.cfi_push r15
167    push %rbx
168.cfi_push rbx
169    push %rsi
170.cfi_push rsi
171
172    sub \$$last_rsp_offset, %rsp
173.cfi_adjust_cfa_offset  $last_rsp_offset
174    movq $out, $out_rsp(%rsp)
175
176    # X=1, Y=0
177    movq \$1, $x0
178    xorq $x1, $x1
179    xorq $x2, $x2
180    xorq $x3, $x3
181    xorq $x4, $x4
182
183    xorq $y0, $y0
184    xorq $y1, $y1
185    xorq $y2, $y2
186    xorq $y3, $y3
187    xorq $y4, $y4
188
189    # Copy a/n into B/A on the stack.
190    vmovdqu 0*8($a), $T0
191    vmovdqu 2*8($a), $T1
192    vmovdqu $T0, $b_rsp0(%rsp)
193    vmovdqu $T1, $b_rsp2(%rsp)
194
195    vmovdqu 0*8($n), $T0
196    vmovdqu 2*8($n), $T1
197    vmovdqu $T0, $a_rsp0(%rsp)
198    vmovdqu $T1, $a_rsp2(%rsp)
199
200.Lbeeu_loop:
201    ${\TEST_B_ZERO}
202
203    # 0 < B < |n|,
204    # 0 < A <= |n|,
205    # (1)      X*a  ==  B   (mod |n|),
206    # (2) (-1)*Y*a  ==  A   (mod |n|)
207
208    # Now divide B by the maximum possible power of two in the
209    # integers, and divide X by the same value mod |n|. When we're
210    # done, (1) still holds.
211    movq \$1, $shift
212
213    # Note that B > 0
214.Lbeeu_shift_loop_XB:
215    movq $shift, $t1
216    andq $b_rsp0(%rsp), $t1
217    jnz .Lbeeu_shift_loop_end_XB
218
219    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
220    shl \$1, $shift
221
222    # Test wraparound of the shift parameter. The probability to have 32 zeroes
223    # in a row is small Therefore having the value below equal \$0x8000000 or
224    # \$0x8000 does not affect the performance. We choose 0x8000000 because it
225    # is the maximal immediate value possible.
226    cmp \$0x8000000, $shift
227    jne .Lbeeu_shift_loop_XB
228
229.Lbeeu_shift_loop_end_XB:
230    bsf $shift, $shift
231    test $shift, $shift
232    jz .Lbeeu_no_shift_XB
233
234    ${\SHIFT256($b_rsp0)}
235
236.Lbeeu_no_shift_XB:
237    # Same for A and Y.  Afterwards, (2) still holds.
238    movq \$1, $shift
239
240    # Note that A > 0
241.Lbeeu_shift_loop_YA:
242    movq $shift, $t1
243    andq $a_rsp0(%rsp), $t1
244    jnz .Lbeeu_shift_loop_end_YA
245
246    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
247    shl \$1, $shift
248
249    # Test wraparound of the shift parameter. The probability to have 32 zeroes
250    # in a row is small therefore having the value below equal \$0x8000000 or
251    # \$0x8000 Does not affect the performance. We choose 0x8000000 because it
252    # is the maximal immediate value possible.
253    cmp \$0x8000000, $shift
254    jne .Lbeeu_shift_loop_YA
255
256.Lbeeu_shift_loop_end_YA:
257    bsf $shift, $shift
258    test $shift, $shift
259    jz .Lbeeu_no_shift_YA
260
261    ${\SHIFT256($a_rsp0)}
262
263.Lbeeu_no_shift_YA:
264    # T = B-A (A,B < 2^256)
265    mov $b_rsp0(%rsp), $t0
266    mov $b_rsp1(%rsp), $t1
267    mov $b_rsp2(%rsp), $t2
268    mov $b_rsp3(%rsp), $t3
269    sub $a_rsp0(%rsp), $t0
270    sbb $a_rsp1(%rsp), $t1
271    sbb $a_rsp2(%rsp), $t2
272    sbb $a_rsp3(%rsp), $t3  # borrow from shift
273    jnc .Lbeeu_B_bigger_than_A
274
275    # A = A - B
276    mov $a_rsp0(%rsp), $t0
277    mov $a_rsp1(%rsp), $t1
278    mov $a_rsp2(%rsp), $t2
279    mov $a_rsp3(%rsp), $t3
280    sub $b_rsp0(%rsp), $t0
281    sbb $b_rsp1(%rsp), $t1
282    sbb $b_rsp2(%rsp), $t2
283    sbb $b_rsp3(%rsp), $t3
284    mov $t0, $a_rsp0(%rsp)
285    mov $t1, $a_rsp1(%rsp)
286    mov $t2, $a_rsp2(%rsp)
287    mov $t3, $a_rsp3(%rsp)
288
289    # Y = Y + X
290    add $x0, $y0
291    adc $x1, $y1
292    adc $x2, $y2
293    adc $x3, $y3
294    adc $x4, $y4
295    jmp .Lbeeu_loop
296
297.Lbeeu_B_bigger_than_A:
298    # B = T = B - A
299    mov $t0, $b_rsp0(%rsp)
300    mov $t1, $b_rsp1(%rsp)
301    mov $t2, $b_rsp2(%rsp)
302    mov $t3, $b_rsp3(%rsp)
303
304    # X = Y + X
305    add $y0, $x0
306    adc $y1, $x1
307    adc $y2, $x2
308    adc $y3, $x3
309    adc $y4, $x4
310
311    jmp .Lbeeu_loop
312
313.Lbeeu_loop_end:
314    # The Euclid's algorithm loop ends when A == beeu(a,n);
315    # Therefore (-1)*Y*a == A (mod |n|), Y>0
316
317    # Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
318    mov $a_rsp0(%rsp), $t1
319    sub \$1, $t1
320    or $a_rsp1(%rsp), $t1
321    or $a_rsp2(%rsp), $t1
322    or $a_rsp3(%rsp), $t1
323    # If not, fail.
324    jnz .Lbeeu_err
325
326    # From this point on, we no longer need X
327    # Therefore we use it as a temporary storage.
328    # X = n
329    movq 0*8($n), $x0
330    movq 1*8($n), $x1
331    movq 2*8($n), $x2
332    movq 3*8($n), $x3
333    xorq $x4, $x4
334
335.Lbeeu_reduction_loop:
336    movq $y0, $y_rsp0(%rsp)
337    movq $y1, $y_rsp1(%rsp)
338    movq $y2, $y_rsp2(%rsp)
339    movq $y3, $y_rsp3(%rsp)
340    movq $y4, $y_rsp4(%rsp)
341
342    # If Y>n ==> Y=Y-n
343    sub $x0, $y0
344    sbb $x1, $y1
345    sbb $x2, $y2
346    sbb $x3, $y3
347    sbb \$0, $y4
348
349    # Choose old Y or new Y
350    cmovc $y_rsp0(%rsp), $y0
351    cmovc $y_rsp1(%rsp), $y1
352    cmovc $y_rsp2(%rsp), $y2
353    cmovc $y_rsp3(%rsp), $y3
354    jnc .Lbeeu_reduction_loop
355
356    # X = n - Y (n, Y < 2^256), (Cancel the (-1))
357    sub $y0, $x0
358    sbb $y1, $x1
359    sbb $y2, $x2
360    sbb $y3, $x3
361
362.Lbeeu_save:
363    # Save the inverse(<2^256) to out.
364    mov $out_rsp(%rsp), $out
365
366    movq $x0, 0*8($out)
367    movq $x1, 1*8($out)
368    movq $x2, 2*8($out)
369    movq $x3, 3*8($out)
370
371    # Return 1.
372    movq \$1, %rax
373    jmp .Lbeeu_finish
374
375.Lbeeu_err:
376    # Return 0.
377    xorq %rax, %rax
378
379.Lbeeu_finish:
380    add \$$last_rsp_offset, %rsp
381.cfi_adjust_cfa_offset  -$last_rsp_offset
382    pop %rsi
383.cfi_pop rsi
384    pop %rbx
385.cfi_pop rbx
386    pop %r15
387.cfi_pop r15
388    pop %r14
389.cfi_pop r14
390    pop %r13
391.cfi_pop r13
392    pop %r12
393.cfi_pop r12
394    pop %rbp
395.cfi_pop rbp
396    ret
397.cfi_endproc
398
399.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
400___
401
402print $code;
403close STDOUT;
404