1#!/usr/bin/env perl
2
# Driver: locate and load the perlasm framework, redirect STDOUT to the
# requested output file, then emit every bignum primitive in a fixed order.
#
# Command line: the LAST argument is taken as the output file name; what
# remains in @ARGV is handed to asm_init (first element) and scanned for
# -DOPENSSL_IA32_SSE2.

# Directory part of $0, including the trailing separator.  When $0 has no
# path component the match fails, so fall back to "./" instead of reading a
# stale/undefined $1 (which used to put an empty entry into @INC).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "./";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# Re-open STDOUT on the output file.  Three-argument open keeps odd file
# names from being interpreted as mode characters, and a failure is fatal
# instead of silently discarding all generated code.  Without an output
# argument STDOUT is left untouched.
$output = pop;
if (defined($output) && $output ne "") {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

&asm_init($ARGV[0],$0);

# The MMX/SSE2 code paths are only generated when the build asks for them.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Buffered write errors (disk full, closed pipe, ...) only surface here; a
# silently truncated assembly file must not look like a successful run.
close(STDOUT) or die "error closing STDOUT: $!";
28
# bn_mul_add_words(NAME)
#
# Emits the routine NAME.  Going by the operand loads and the per-line
# notes, the generated code performs r[i] += a[i]*w for num words and
# leaves the final carry word in eax.  Stack arguments are fetched with
# wparam(): 0 = r, 1 = a, 2 = num, 3 = w.
#
# When the generator was started with -DOPENSSL_IA32_SSE2 an MMX/SSE2
# variant is emitted first; at run time it is selected by testing bit 26
# of the first dword of OPENSSL_ia32cap_P (presumably the SSE2 capability
# flag), otherwise control falls through to the integer variant.
#
# All emitters are called as &name(...) on purpose: several of them (sub,
# and, xor, push, pop) share their name with Perl keywords or builtins.
sub bn_mul_add_words {
	local($name)=@_;

	my $extern = $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "";
	&function_begin_B($name,$extern);

	# Register roles for the MMX/SSE2 variant.
	($r,$a,$c)=("eax","edx","ecx");

	if ($sse2) {
		# Operand for word number k of r[] / a[] (k*4 bytes from the
		# current pointer).
		my $rw = sub { &DWP(4*$_[0],$r,"",0) };
		my $aw = sub { &DWP(4*$_[0],$a,"",0) };

		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));		# mm0 holds w throughout
		&pxor("mm1","mm1");			# mm1 is the running carry
		&jmp(&label("maw_sse2_entry"));

		# Eight words per pass.  The loads and multiplies for later
		# words are interleaved with the carry chain of earlier ones,
		# so the order of the calls below is significant.
		&set_label("maw_sse2_unrolled",16);
		&movd("mm3",$rw->(0));
		&paddq("mm1","mm3");			# carry_in + r[0]
		&movd("mm2",$aw->(0));
		&pmuludq("mm2","mm0");			# w*a[0]
		&movd("mm4",$aw->(1));
		&pmuludq("mm4","mm0");			# w*a[1]
		&movd("mm6",$aw->(2));
		&pmuludq("mm6","mm0");			# w*a[2]
		&movd("mm7",$aw->(3));
		&pmuludq("mm7","mm0");			# w*a[3]
		&paddq("mm1","mm2");			# word 0 complete
		&movd("mm3",$rw->(1));
		&paddq("mm3","mm4");			# r[1] + w*a[1]
		&movd("mm5",$rw->(2));
		&paddq("mm5","mm6");			# r[2] + w*a[2]
		&movd("mm4",$rw->(3));
		&paddq("mm7","mm4");			# r[3] + w*a[3]
		&movd($rw->(0),"mm1");			# store word 0
		&movd("mm2",$aw->(4));
		&pmuludq("mm2","mm0");			# w*a[4]
		&psrlq("mm1",32);			# carry out of word 0
		&movd("mm4",$aw->(5));
		&pmuludq("mm4","mm0");			# w*a[5]
		&paddq("mm1","mm3");			# word 1 complete
		&movd("mm6",$aw->(6));
		&pmuludq("mm6","mm0");			# w*a[6]
		&movd($rw->(1),"mm1");			# store word 1
		&psrlq("mm1",32);			# carry out of word 1
		&movd("mm3",$aw->(7));
		&add($a,32);				# a[] fully read for this pass
		&pmuludq("mm3","mm0");			# w*a[7]
		&paddq("mm1","mm5");			# word 2 complete
		&movd("mm5",$rw->(4));
		&paddq("mm2","mm5");			# r[4] + w*a[4]
		&movd($rw->(2),"mm1");			# store word 2
		&psrlq("mm1",32);			# carry out of word 2
		&paddq("mm1","mm7");			# word 3 complete
		&movd("mm5",$rw->(5));
		&paddq("mm4","mm5");			# r[5] + w*a[5]
		&movd($rw->(3),"mm1");			# store word 3
		&psrlq("mm1",32);			# carry out of word 3
		&paddq("mm1","mm2");			# word 4 complete
		&movd("mm5",$rw->(6));
		&paddq("mm6","mm5");			# r[6] + w*a[6]
		&movd($rw->(4),"mm1");			# store word 4
		&psrlq("mm1",32);			# carry out of word 4
		&paddq("mm1","mm4");			# word 5 complete
		&movd("mm5",$rw->(7));
		&paddq("mm3","mm5");			# r[7] + w*a[7]
		&movd($rw->(5),"mm1");			# store word 5
		&psrlq("mm1",32);			# carry out of word 5
		&paddq("mm1","mm6");			# word 6 complete
		&movd($rw->(6),"mm1");			# store word 6
		&psrlq("mm1",32);			# carry out of word 6
		&paddq("mm1","mm3");			# word 7 complete
		&movd($rw->(7),"mm1");			# store word 7
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);			# carry into the next pass

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
		&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);			# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

		# One word per pass for the remainder.
		# NOTE(review): nothing emitted on this path checks for
		# num == 0 before the first pass - confirm callers never
		# pass an empty vector.
		&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));
		&movd("mm3",&DWP(0,$r));
		&pmuludq("mm2","mm0");			# a[i]*w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");			# carry += r[i]
		&paddq("mm1","mm2");			# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");		# r[i] = low half
		&sub($c,1);
		&psrlq("mm1",32);			# carry = high half
		&lea($r,&DWP(4,$r));			# lea leaves the flags of sub intact
		&jnz(&label("maw_sse2_loop"));
		&set_label("maw_sse2_exit");
		&movd("eax","mm1");			# return value = carry
		&emms();
		&ret();

		&set_label("maw_non_sse2",16);
	}

	# Integer variant.  The register saves that function_begin would
	# normally emit are issued by hand, in the same order.
	foreach my $saved ("ebp","ebx","esi","edi") {
		&push($saved);
	}

	&comment("");
	($Low,$High)=("eax","edx");
	($a,$w,$r,$c)=("ebx","ebp","edi","esi");

	# Shared front half of one round: edx:eax = a[disp]*w + c + r[disp].
	my $mul_add_step = sub {
		my ($disp)=@_;
		&mov("eax",&DWP($disp,$a));
		&mul($w);
		&add("eax",$c);
		&adc("edx",0);
		&add("eax",&DWP($disp,$r));
		&adc("edx",0);
	};

	&xor($c,$c);			# carry starts at zero
	&mov($r,&wparam(0));

	&mov("ecx",&wparam(2));
	&mov($a,&wparam(1));

	&and("ecx",0xfffffff8);		# words handled by the unrolled loop
	&mov($w,&wparam(3));

	&push("ecx");			# extra stack slot; released before function_end

	&jz(&label("maw_finish"));	# flags still come from the and above

	&set_label("maw_loop",16);

	# Eight unrolled rounds; the round comment shows the byte offset.
	foreach my $disp (map { 4*$_ } 0..7) {
		&comment("Round $disp");
		$mul_add_step->($disp);
		&mov(&DWP($disp,$r),"eax");
		&mov($c,"edx");
	}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));		# reload num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to seven leftover words; every round but the last counts ecx
	# down and leaves as soon as it reaches zero.
	&set_label("maw_finish2",1);
	foreach my $n (0..6) {
		my $more = ($n != 6);
		&comment("Tail Round $n");
		$mul_add_step->($n*4);
		&dec("ecx") if ($more);
		&mov(&DWP($n*4,$r),"eax");
		&mov($c,"edx");
		&jz(&label("maw_end")) if ($more);
	}
	&set_label("maw_end",0);
	&mov("eax",$c);			# return the carry

	&pop("ecx");			# drop the extra stack slot

	&function_end($name);
}
214
# bn_mul_words(NAME)
#
# Emits the routine NAME.  Going by the operand loads and per-line notes,
# the generated code stores the low word of a[i]*w + carry into r[i] for
# num words and leaves the final carry word in eax.  Stack arguments via
# wparam(): 0 = r, 1 = a, 2 = num, 3 = w.
#
# With -DOPENSSL_IA32_SSE2 an MMX/SSE2 loop is emitted first and chosen at
# run time by testing bit 26 of the first dword of OPENSSL_ia32cap_P
# (presumably the SSE2 capability flag).
#
# Emitters keep the &name(...) call form because some of them (sub, and,
# xor, push) share their name with Perl keywords or builtins.
sub bn_mul_words {
	local($name)=@_;

	my $extern = $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "";
	&function_begin_B($name,$extern);

	# Register roles for the MMX/SSE2 variant.
	($r,$a,$c)=("eax","edx","ecx");

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));		# mm0 holds w throughout
		&pxor("mm1","mm1");			# running carry = 0

		# One word per pass.
		# NOTE(review): nothing emitted on this path checks for
		# num == 0 before the first pass - confirm callers never
		# pass an empty vector.
		&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));
		&pmuludq("mm2","mm0");			# a[i]*w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");			# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");		# r[i] = low half
		&sub($c,1);
		&psrlq("mm1",32);			# carry = high half
		&lea($r,&DWP(4,$r));			# lea leaves the flags of sub intact
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");			# return value = carry
		&emms();
		&ret();
		&set_label("mw_non_sse2",16);
	}

	# Integer variant.  The register saves that function_begin would
	# normally emit are issued by hand, in the same order.
	foreach my $saved ("ebp","ebx","esi","edi") {
		&push($saved);
	}

	&comment("");
	($Low,$High)=("eax","edx");
	($a,$w,$r,$c,$num)=("ebx","ecx","edi","esi","ebp");

	# One full round: edx:eax = a[disp]*w + c, low word stored to
	# r[disp], high word becomes the new carry.
	my $mul_round = sub {
		my ($disp)=@_;
		&mov("eax",&DWP($disp,$a,"",0));
		&mul($w);
		&add("eax",$c);
		&adc("edx",0);
		&mov(&DWP($disp,$r,"",0),"eax");
		&mov($c,"edx");
	};

	&xor($c,$c);			# carry starts at zero
	&mov($r,&wparam(0));
	&mov($a,&wparam(1));
	&mov($num,&wparam(2));
	&mov($w,&wparam(3));

	&and($num,0xfffffff8);		# words handled by the unrolled loop
	&jz(&label("mw_finish"));

	# Eight unrolled rounds; the round comment shows the byte offset.
	&set_label("mw_loop",0);
	foreach my $disp (map { 4*$_ } 0..7) {
		&comment("Round $disp");
		$mul_round->($disp);
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));		# reload num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to seven leftover words; every round but the last counts $num
	# down and leaves as soon as it reaches zero.
	&set_label("mw_finish2",1);
	foreach my $n (0..6) {
		&comment("Tail Round $n");
		$mul_round->($n*4);
		if ($n != 6) {
			&dec($num);
			&jz(&label("mw_end"));
		}
	}
	&set_label("mw_end",0);
	&mov("eax",$c);			# return the carry

	&function_end($name);
}
325
# bn_sqr_words(NAME)
#
# Emits the routine NAME.  The generated code squares each of the num
# input words a[i] and stores the double-width product in r[2*i] (low
# word) and r[2*i+1] (high word).  Stack arguments via wparam(): 0 = r,
# 1 = a, 2 = num.  Nothing is loaded into eax as a return value.
#
# With -DOPENSSL_IA32_SSE2 an MMX/SSE2 loop is emitted first and chosen at
# run time by testing bit 26 of the first dword of OPENSSL_ia32cap_P
# (presumably the SSE2 capability flag).
#
# Emitters keep the &name(...) call form because some of them (sub, and,
# push) share their name with Perl keywords or builtins.
sub bn_sqr_words {
	local($name)=@_;

	my $extern = $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "";
	&function_begin_B($name,$extern);

	# Register roles for the MMX/SSE2 variant.
	($r,$a,$c)=("eax","edx","ecx");

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

		# One input word, two output words per pass.
		# NOTE(review): nothing emitted on this path checks for
		# num == 0 before the first pass - confirm callers never
		# pass an empty vector.
		&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));
		&pmuludq("mm0","mm0");			# a[i]*a[i]
		&lea($a,&DWP(4,$a));			# next input word
		&movq(&QWP(0,$r),"mm0");		# store the 64-bit square
		&sub($c,1);
		&lea($r,&DWP(8,$r));			# next output pair; flags of sub kept
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
		&set_label("sqr_non_sse2",16);
	}

	# Integer variant.  The register saves that function_begin would
	# normally emit are issued by hand, in the same order.
	foreach my $saved ("ebp","ebx","esi","edi") {
		&push($saved);
	}

	&comment("");
	($r,$a,$num)=("esi","edi","ebx");

	&mov($r,&wparam(0));
	&mov($a,&wparam(1));
	&mov($num,&wparam(2));

	&and($num,0xfffffff8);		# words handled by the unrolled loop
	&jz(&label("sw_finish"));

	# Eight unrolled rounds.  $src is the byte offset into a[] (also
	# shown in the round comment); the result lands at twice that offset.
	&set_label("sw_loop",0);
	foreach my $k (0..7) {
		my $src = 4*$k;
		&comment("Round $src");
		&mov("eax",&DWP($src,$a,"",0));
		&mul("eax");
		&mov(&DWP($src*2,$r,"",0),"eax");
		&mov(&DWP($src*2+4,$r,"",0),"edx");
	}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));		# reload num
	&and($num,7);
	&jz(&label("sw_end"));

	# Up to seven leftover words; every round but the last counts $num
	# down (between the two stores) and leaves once it reaches zero.
	foreach my $k (0..6) {
		my $more = ($k != 6);
		&comment("Tail Round $k");
		&mov("eax",&DWP($k*4,$a,"",0));
		&mul("eax");
		&mov(&DWP($k*8,$r,"",0),"eax");
		&dec($num) if ($more);
		&mov(&DWP($k*8+4,$r,"",0),"edx");
		&jz(&label("sw_end")) if ($more);
	}
	&set_label("sw_end",0);

	&function_end($name);
}
414
# bn_div_words(NAME)
#
# Emits the routine NAME: the three stack arguments are loaded into edx,
# eax and ecx (in that order) and a single DIV by ecx is issued, i.e. the
# 64-bit value edx:eax is divided by ecx and the quotient is left in eax.
# No registers are saved and no operand checks are emitted.
sub bn_div_words {
	local($name)=@_;

	# Destination register for wparam(0), wparam(1), wparam(2).
	my @arg_regs = ("edx","eax","ecx");

	&function_begin_B($name,"");
	foreach my $slot (0..$#arg_regs) {
		&mov($arg_regs[$slot],&wparam($slot));
	}
	&div("ecx");
	&ret();
	&function_end_B($name);
}
427
# bn_add_words(NAME)
#
# Emits the routine NAME.  The generated code adds the num-word vectors a
# and b with carry propagation, writes the sums to r and leaves the final
# carry (0 or 1) in eax.  Stack arguments via wparam(): 0 = r, 1 = a,
# 2 = b, 3 = num.
#
# Emitters keep the &name(...) call form because some of them (sub, and,
# xor) share their name with Perl keywords.
sub bn_add_words {
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	($a,$b,$c,$r)=("esi","edi","eax","ebx");
	($tmp1,$tmp2,$num)=("ecx","edx","ebp");

	# Arithmetic half of one round: $tmp1 = a[disp] + carry + b[disp],
	# with the carries of both additions collected into $c.
	my $carry_add = sub {
		my ($disp)=@_;
		&mov($tmp1,&DWP($disp,$a,"",0));
		&mov($tmp2,&DWP($disp,$b,"",0));
		&add($tmp1,$c);
		&mov($c,0);			# mov keeps CF from the add
		&adc($c,$c);			# $c = carry of the first add
		&add($tmp1,$tmp2);
		&adc($c,0);			# plus carry of the second add
	};

	&mov($r,&wparam(0));
	&mov($a,&wparam(1));
	&mov($b,&wparam(2));
	&mov($num,&wparam(3));
	&xor($c,$c);			# carry starts at zero
	&and($num,0xfffffff8);		# words handled by the unrolled loop

	&jz(&label("aw_finish"));

	# Eight unrolled rounds per pass.
	&set_label("aw_loop",0);
	foreach my $k (0..7) {
		&comment("Round $k");
		$carry_add->($k*4);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
	}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# reload num
	&and($num,7);
	&jz(&label("aw_end"));

	# Up to seven leftover words; every round but the last counts $num
	# down (before the store) and leaves once it reaches zero.
	foreach my $k (0..6) {
		my $more = ($k != 6);
		&comment("Tail Round $k");
		$carry_add->($k*4);
		&dec($num) if ($more);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
		&jz(&label("aw_end")) if ($more);
	}
	&set_label("aw_end",0);

	# The carry already sits in eax ($c), so no extra move is needed.

	&function_end($name);
}
499
# bn_sub_words(NAME)
#
# Emits the routine NAME.  The generated code subtracts the num-word
# vector b from a with borrow propagation, writes the differences to r
# and leaves the final borrow (0 or 1) in eax.  Stack arguments via
# wparam(): 0 = r, 1 = a, 2 = b, 3 = num.
#
# The local labels reuse the "aw_" names of bn_add_words.
#
# Emitters keep the &name(...) call form because some of them (sub, and,
# xor) share their name with Perl keywords.
sub bn_sub_words {
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	($a,$b,$c,$r)=("esi","edi","eax","ebx");
	($tmp1,$tmp2,$num)=("ecx","edx","ebp");

	# Arithmetic half of one round: $tmp1 = a[disp] - borrow - b[disp],
	# with the borrows of both subtractions collected into $c.
	my $borrow_sub = sub {
		my ($disp)=@_;
		&mov($tmp1,&DWP($disp,$a,"",0));
		&mov($tmp2,&DWP($disp,$b,"",0));
		&sub($tmp1,$c);
		&mov($c,0);			# mov keeps CF from the sub
		&adc($c,$c);			# $c = borrow of the first sub
		&sub($tmp1,$tmp2);
		&adc($c,0);			# plus borrow of the second sub
	};

	&mov($r,&wparam(0));
	&mov($a,&wparam(1));
	&mov($b,&wparam(2));
	&mov($num,&wparam(3));
	&xor($c,$c);			# borrow starts at zero
	&and($num,0xfffffff8);		# words handled by the unrolled loop

	&jz(&label("aw_finish"));

	# Eight unrolled rounds per pass.
	&set_label("aw_loop",0);
	foreach my $k (0..7) {
		&comment("Round $k");
		$borrow_sub->($k*4);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
	}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# reload num
	&and($num,7);
	&jz(&label("aw_end"));

	# Up to seven leftover words; every round but the last counts $num
	# down (before the store) and leaves once it reaches zero.
	foreach my $k (0..6) {
		my $more = ($k != 6);
		&comment("Tail Round $k");
		$borrow_sub->($k*4);
		&dec($num) if ($more);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
		&jz(&label("aw_end")) if ($more);
	}
	&set_label("aw_end",0);

	# The borrow already sits in eax ($c), so no extra move is needed.

	&function_end($name);
}
571
# bn_sub_part_words(NAME)
#
# Emits the routine NAME.  Stack arguments via wparam(): 0 = r, 1 = a,
# 2 = b, 3 = num and 4 = "dl" (name taken from the original notes).
#
# Shape of the generated code:
#   1. r = a - b over num words with borrow propagation, exactly as in
#      bn_sub_words except that the leftover rounds advance the three
#      pointers word by word, so that a, b and r end up just past the
#      common part.
#   2. dl == 0: finished.
#   3. dl <  0: for -dl further words, r = 0 - b - borrow (a is not read).
#   4. dl >  0: for dl further words, r = a - borrow (b is not read).
#      As soon as a word produces no borrow, control jumps into a plain
#      copy loop ("pw_nc...") at the matching position and the borrow is
#      reported as zero.
# The borrow is left in eax.
#
# The local labels of step 1 reuse the "aw_" names of bn_add_words.
#
# Emitters keep the &name(...) call form because some of them (sub, and,
# xor) share their name with Perl keywords.
sub bn_sub_part_words {
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	($a,$b,$c,$r)=("esi","edi","eax","ebx");
	($tmp1,$tmp2,$num)=("ecx","edx","ebp");

	# $tmp1 = a[disp] - borrow - b[disp]; both borrows collected in $c.
	my $borrow_sub = sub {
		my ($disp)=@_;
		&mov($tmp1,&DWP($disp,$a,"",0));
		&mov($tmp2,&DWP($disp,$b,"",0));
		&sub($tmp1,$c);
		&mov($c,0);			# mov keeps CF from the sub
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
	};

	# $tmp1 = 0 - borrow - b[disp]; both borrows collected in $c.
	my $borrow_neg = sub {
		my ($disp)=@_;
		&mov($tmp1,0);
		&mov($tmp2,&DWP($disp,$b,"",0));
		&sub($tmp1,$c);
		&mov($c,0);			# mov keeps CF from the sub
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
	};

	# Unconditional word copy r[disp] = a[disp].
	my $copy_word = sub {
		my ($disp)=@_;
		&mov($tmp1,&DWP($disp,$a,"",0));
		&mov(&DWP($disp,$r,"",0),$tmp1);
	};

	&mov($r,&wparam(0));
	&mov($a,&wparam(1));
	&mov($b,&wparam(2));
	&mov($num,&wparam(3));
	&xor($c,$c);			# borrow starts at zero
	&and($num,0xfffffff8);		# words handled by the unrolled loop

	&jz(&label("aw_finish"));

	# ---- step 1: common part, eight unrolled rounds per pass ----
	&set_label("aw_loop",0);
	foreach my $k (0..7) {
		&comment("Round $k");
		$borrow_sub->($k*4);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
	}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# reload num
	&and($num,7);
	&jz(&label("aw_end"));

	# Leftover rounds always address offset 0 and bump the pointers
	# afterwards, so they are valid for the later steps.
	foreach my $k (0..6) {
		&comment("Tail Round $k");
		$borrow_sub->(0);
		&mov(&DWP(0,$r,"",0),$tmp1);
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		if ($k != 6) {
			&dec($num);
			&jz(&label("aw_end"));
		}
	}
	&set_label("aw_end",0);

	# ---- step 2: dispatch on the sign of dl ----
	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));		# dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	# ---- step 3: dl < 0 ----
	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);
	&mov($num,$tmp2);		# $num = -dl
	&and($num,0xfffffff8);		# words handled by the unrolled loop
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	foreach my $k (0..7) {
		&comment("dl<0 Round $k");
		$borrow_neg->($k*4);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
	}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));		# dl
	&mov($num,0);
	&sub($num,$tmp2);		# $num = -dl
	&and($num,7);
	&jz(&label("pw_end"));

	foreach my $k (0..6) {
		my $more = ($k != 6);
		&comment("dl<0 Tail Round $k");
		$borrow_neg->($k*4);
		&dec($num) if ($more);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
		&jz(&label("pw_end")) if ($more);
	}

	&jmp(&label("pw_end"));

	# ---- step 4: dl > 0, borrow still pending ----
	&set_label("pw_pos",0);

	&and($num,0xfffffff8);		# words handled by the unrolled loop
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	foreach my $k (0..7) {
		&comment("dl>0 Round $k");
		&mov($tmp1,&DWP($k*4,$a,"",0));
		&sub($tmp1,$c);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
		&jnc(&label("pw_nc".$k));	# no borrow: continue in the copy loop
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));		# dl
	&and($num,7);
	&jz(&label("pw_end"));

	foreach my $k (0..6) {
		&comment("dl>0 Tail Round $k");
		&mov($tmp1,&DWP($k*4,$a,"",0));
		&sub($tmp1,$c);
		&mov(&DWP($k*4,$r,"",0),$tmp1);
		&jnc(&label("pw_tail_nc".$k));	# no borrow: continue in the copy tail
		if ($k != 6) {
			&dec($num);
			&jz(&label("pw_end"));
		}
	}
	&mov($c,1);			# borrow survived every word
	&jmp(&label("pw_end"));

	# ---- step 4, continued: dl > 0 once the borrow is gone ----
	# Each pw_nc<k> label sits right after the copy of word k, which is
	# where the borrow loop above jumps to after storing that word.
	&set_label("pw_nc_loop",0);
	foreach my $k (0..7) {
		$copy_word->($k*4);
		&set_label("pw_nc".$k,0);
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));		# dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	foreach my $k (0..6) {
		$copy_word->($k*4);
		&set_label("pw_tail_nc".$k,0);
		if ($k != 6) {
			&dec($num);
			&jz(&label("pw_nc_end"));
		}
	}

	&set_label("pw_nc_end",0);
	&mov($c,0);			# no borrow to report

	&set_label("pw_end",0);

	# The borrow already sits in eax ($c), so no extra move is needed.

	&function_end($name);
}
779