1#!/usr/bin/env perl
2
# Directory part of this script's path (with its trailing separator); it and
# the perlasm directory two levels above it are appended to @INC so that
# x86asm.pl can be found by the require below.
($dir)=($0 =~ m/(.*[\/\\])[^\/\\]+$/);
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# SSE2 code paths are generated only when -DOPENSSL_IA32_SSE2 appears among
# the command-line arguments.  $sse2 is deliberately kept a package global.
$sse2=(grep(/-DOPENSSL_IA32_SSE2/,@ARGV))?1:0;

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Emit the routines in this fixed order.  Each generator sub defined in this
# file has the same name as the assembly routine it produces and receives
# that name as its only argument.
foreach my $routine ("bn_mul_add_words",
		     "bn_mul_words",
		     "bn_sqr_words",
		     "bn_div_words",
		     "bn_add_words",
		     "bn_sub_words",
		     "bn_sub_part_words")
	{
	&{\&{$routine}}($routine);
	}

&asm_finish();
23
# bn_mul_add_words(name)
#
# Emits the x86 routine NAME, which computes r[i] += a[i]*w for num 32-bit
# words, propagating the carry from word to word, and returns the final
# carry word in eax.  Stack arguments of the generated routine:
#   0 = r, 1 = a, 2 = num, 3 = w.
#
# When $sse2 is set, an MMX/SSE2 path is emitted first and selected at run
# time by testing bit 26 of the first dword of OPENSSL_ia32cap_P (presumably
# the SSE2 capability bit -- confirm against the capability vector layout).
# Otherwise, or when the bit is clear, the integer path below is used.
#
# A zero word count is handled on both paths: nothing is read or written and
# the returned carry is 0.
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register roles for the SSE2 path only; no callee-saved register is
	# touched there, so it returns with a plain ret.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		# Guard against num == 0.  Without it the single-word loop below
		# would process r[0]/a[0] and then wrap the counter to 0xffffffff.
		# MMX instructions leave EFLAGS alone, so the order relative to
		# pxor does not matter; mm1 is already the zero carry to return.
		&test($c,$c);
		&jz(&label("maw_sse2_exit"));
		&jmp(&label("maw_sse2_entry"));

	# Eight words per iteration; multiplications are issued ahead of the
	# carry chain that consumes them.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

	# One word per iteration for the remaining 1..7 words.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea/psrlq keep the flags of sub
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue, written out by hand because the function
	# was opened with function_begin_B; function_end below is expected to
	# emit the matching epilogue.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles for the integer path.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num rounded down to a multiple of 8
	&mov($w,&wparam(3));	#

	&push("ecx");		# Up the stack for a tmp variable

	# mov and push leave the flags alone, so this still tests the and.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	# Eight unrolled rounds; $i is the byte offset of the word.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));	# lea does not alter the flags set by sub
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);		# 0..7 words left over
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	# Up to seven tail rounds; every round but the last counts ecx down
	# and leaves as soon as it reaches zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);	# flags from dec
		}
	&set_label("maw_end",0);
	&mov("eax",$c);		# return the carry

	&pop("ecx");	# clear variable from

	&function_end($name);
	}
209
# bn_mul_words(name)
#
# Emits the x86 routine NAME, which computes r[i] = a[i]*w plus the carry of
# the previous word for num 32-bit words and returns the final carry word in
# eax.  Stack arguments of the generated routine:
#   0 = r, 1 = a, 2 = num, 3 = w.
#
# When $sse2 is set, an MMX/SSE2 path is emitted first and selected at run
# time by testing bit 26 of the first dword of OPENSSL_ia32cap_P (presumably
# the SSE2 capability bit -- confirm against the capability vector layout).
#
# A zero word count is handled on both paths: nothing is read or written and
# the returned carry is 0.
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register roles for the SSE2 path only.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0
		# Guard against num == 0.  The loop below is bottom-tested, so
		# without this it would process a[0]/r[0] and then wrap the
		# counter to 0xffffffff.
		&test($c,$c);
		&jz(&label("mw_sse2_exit"));

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea/psrlq keep the flags of sub
		&jnz(&label("mw_sse2_loop"));

	&set_label("mw_sse2_exit");
		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue, written out by hand because the function
	# was opened with function_begin_B; function_end below is expected to
	# emit the matching epilogue.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles for the integer path.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	# Eight unrolled rounds; $i is the byte offset of the word.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX

		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# 0..7 words left over
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	# Up to seven tail rounds; every round but the last counts $num down
	# and leaves as soon as it reaches zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);		# return the carry

	&function_end($name);
	}
320
# bn_sqr_words(name)
#
# Emits the x86 routine NAME, which stores the 64-bit square of each of the
# num 32-bit words a[i] into the word pair r[2*i] (low), r[2*i+1] (high).
# Stack arguments of the generated routine: 0 = r, 1 = a, 2 = num.
#
# When $sse2 is set, an MMX/SSE2 path is emitted first and selected at run
# time by testing bit 26 of the first dword of OPENSSL_ia32cap_P (presumably
# the SSE2 capability bit -- confirm against the capability vector layout).
#
# A zero word count is handled on both paths: nothing is read or written.
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register roles for the SSE2 path only.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		# Guard against num == 0.  The loop below is bottom-tested, so
		# without this it would process a[0]/r[0..1] and then wrap the
		# counter to 0xffffffff.
		&test($c,$c);
		&jz(&label("sqr_sse2_exit"));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));	# flags from sub; lea keeps them

	&set_label("sqr_sse2_exit");
		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue, written out by hand because the function
	# was opened with function_begin_B; function_end below is expected to
	# emit the matching epilogue.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles for the integer path.
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	# Eight unrolled rounds; $i is the byte offset into a, the result
	# pair lives at twice that offset in r.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	#
		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# 0..7 words left over
	&jz(&label("sw_end"));

	# Up to seven tail rounds; every round but the last counts $num down
	# and leaves as soon as it reaches zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	#
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		 &jz(&label("sw_end")) if ($i != 7-1);	# flags from dec
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
409
# bn_div_words(name)
#
# Emits the x86 routine NAME: it loads its three stack arguments into
# edx, eax and ecx (in that order), executes "div ecx" and returns.  No
# prologue or epilogue is generated; the result is whatever div leaves in
# the registers.
sub bn_div_words
	{
	local($name)=@_;

	# Destination register for stack argument 0, 1 and 2 respectively.
	my @arg_reg=("edx","eax","ecx");

	&function_begin_B($name,"");
	foreach my $argno (0..$#arg_reg)
		{
		&mov($arg_reg[$argno],&wparam($argno));
		}
	&div($arg_reg[2]);	# divide edx:eax by the third argument
	&ret();
	&function_end_B($name);
	}
422
# bn_add_words(name)
#
# Emits the x86 routine NAME, which computes r[i] = a[i] + b[i] + carry for
# num 32-bit words and leaves the final carry (0 or 1) in eax.
# Stack arguments of the generated routine: 0 = r, 1 = a, 2 = b, 3 = num.
sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register roles.
	($r,$a,$b,$num)=("ebx","esi","edi","ebp");
	($tmp1,$tmp2)=("ecx","edx");
	$c="eax";

	# Emits the addition of the word at byte offset $off.  The carry is
	# rebuilt in $c without a branch: "mov $c,0" does not touch the flags,
	# so "adc $c,$c" picks up the carry of the first add and "adc $c,0"
	# adds the carry of the second one.  With $counted set, the word count
	# is decremented before the store and the round leaves for aw_end
	# once the count has reached zero.
	my $emit_word=sub
		{
		my($title,$off,$counted)=@_;

		&comment($title);
		&mov($tmp1,&DWP($off,$a,"",0));		# *a
		&mov($tmp2,&DWP($off,$b,"",0));		# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&add($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($counted);
		&mov(&DWP($off,$r,"",0),$tmp1);		# *r
		&jz(&label("aw_end")) if ($counted);
		};

	&mov($r,&wparam(0));		# get r
	&mov($a,&wparam(1));		# get a
	&mov($b,&wparam(2));		# get b
	&mov($num,&wparam(3));		# get num
	&xor($c,$c);			# clear carry
	&and($num,0xfffffff8);		# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0..7)
		{
		$emit_word->("Round $word",$word*4,0);
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	# Tail: the remaining num mod 8 words.
	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# The seventh tail word is necessarily the last one, so it needs
	# neither the count-down nor the early exit.
	foreach my $word (0..6)
		{
		$emit_word->("Tail Round $word",$word*4,$word!=6);
		}
	&set_label("aw_end",0);

	# The carry is already in eax ($c), so no move is needed here.

	&function_end($name);
	}
494
# bn_sub_words(name)
#
# Emits the x86 routine NAME, which computes r[i] = a[i] - b[i] - borrow for
# num 32-bit words and leaves the final borrow (0 or 1) in eax.
# Stack arguments of the generated routine: 0 = r, 1 = a, 2 = b, 3 = num.
sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register roles.
	($r,$a,$b,$num)=("ebx","esi","edi","ebp");
	($tmp1,$tmp2)=("ecx","edx");
	$c="eax";

	# Emits the subtraction of the word at byte offset $off.  The borrow
	# is rebuilt in $c without a branch: "mov $c,0" does not touch the
	# flags, so "adc $c,$c" picks up the borrow of the first sub and
	# "adc $c,0" adds the borrow of the second one.  With $counted set,
	# the word count is decremented before the store and the round leaves
	# for aw_end once the count has reached zero.
	my $emit_word=sub
		{
		my($title,$off,$counted)=@_;

		&comment($title);
		&mov($tmp1,&DWP($off,$a,"",0));		# *a
		&mov($tmp2,&DWP($off,$b,"",0));		# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($counted);
		&mov(&DWP($off,$r,"",0),$tmp1);		# *r
		&jz(&label("aw_end")) if ($counted);
		};

	&mov($r,&wparam(0));		# get r
	&mov($a,&wparam(1));		# get a
	&mov($b,&wparam(2));		# get b
	&mov($num,&wparam(3));		# get num
	&xor($c,$c);			# clear borrow
	&and($num,0xfffffff8);		# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0..7)
		{
		$emit_word->("Round $word",$word*4,0);
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	# Tail: the remaining num mod 8 words.
	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# The seventh tail word is necessarily the last one, so it needs
	# neither the count-down nor the early exit.
	foreach my $word (0..6)
		{
		$emit_word->("Tail Round $word",$word*4,$word!=6);
		}
	&set_label("aw_end",0);

	# The borrow is already in eax ($c), so no move is needed here.

	&function_end($name);
	}
566
# bn_sub_part_words(name)
#
# Emits the x86 routine NAME.  Stack arguments of the generated routine:
#   0 = r, 1 = a, 2 = b, 3 = num, 4 = dl.
# The generated code works in two stages:
#   1. r[i] = a[i] - b[i] - borrow for the first num words.
#   2. Depending on the sign of dl:
#        dl < 0:  r[i] = 0 - b[i] - borrow for a further -dl words;
#        dl > 0:  r[i] = a[i] - borrow for a further dl words, switching to
#                 a plain copy of a[] as soon as a word produces no borrow;
#        dl == 0: nothing more.
# The borrow is kept in eax ($c), which is also what is left there on return.
sub bn_sub_part_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register roles.
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	# Stage 1, main loop: eight words per iteration.
	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);		# mov leaves the flags untouched...
		&adc($c,$c);		# ...so this captures the borrow of the sub
		 &sub($tmp1,$tmp2);
		&adc($c,0);		# add the borrow of the second sub
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	# Stage 1, tail: the remaining num mod 8 words.
	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Unlike the main loop, the tail advances the three pointers after
	# every word (always using offset 0), so that they point just past
	# the words handled so far when stage 2 starts.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		 &mov($tmp2,&DWP(0,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		 &dec($num) if ($i != 6);
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	# Stage 2: dispatch on dl.  The zero test is performed twice, once on
	# the stack slot and once on the loaded register.
	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	# dl < 0: continue with r[i] = 0 - b[i] - borrow for -dl words.
	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);	# $tmp2 = -dl
	&mov($num,$tmp2);
	&and($num,0xfffffff8);	# -dl rounded down to a multiple of 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &comment("dl<0 Round $i");

	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
	}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);	# $num = -dl
	&and($num,7);		# remaining -dl mod 8 words
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
	{
	    &comment("dl<0 Tail Round $i");
	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &dec($num) if ($i != 6);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jz(&label("pw_end")) if ($i != 6);
	}

	&jmp(&label("pw_end"));

	# dl > 0: continue with r[i] = a[i] - borrow for dl words.
	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# dl rounded down to a multiple of 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	# $c is not rewritten in these rounds.  When a subtraction produces
	# no borrow, control moves to the matching position of the copy loop
	# (pw_nc<i>), which handles the rest without arithmetic.
	for ($i=0; $i<8; $i++)
	{
	    &comment("dl>0 Round $i");

	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jnc(&label("pw_nc".$i));
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);		# remaining dl mod 8 words
	&jz(&label("pw_end"));

	# Tail of the borrow-propagating path; a round without borrow moves
	# to the matching position of the copy tail (pw_tail_nc<i>).
	for ($i=0; $i<7; $i++)
	{
	    &comment("dl>0 Tail Round $i");
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jnc(&label("pw_tail_nc".$i));
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_end")) if ($i != 6);
	}
	&mov($c,1);		# all seven tail words borrowed
	&jmp(&label("pw_end"));

	# Copy loop, used once no borrow is pending.  Each pw_nc<i> label
	# sits after the copy of word i, because the jump into it comes from
	# a round that has already stored word i.
	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_nc".$i,0);
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);		# remaining dl mod 8 words
	&jz(&label("pw_nc_end"));

	# Copy tail; pw_tail_nc<i> likewise sits after the copy of word i and
	# before the count-down for that word.
	for ($i=0; $i<7; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_tail_nc".$i,0);
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_nc_end")) if ($i != 6);
	}

	&set_label("pw_nc_end",0);
	&mov($c,0);		# the copy path always ends without a borrow

	&set_label("pw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
774
775