6 files changed, 3592 insertions, 0 deletions
diff --git a/openssl/crypto/des/asm/crypt586.pl b/openssl/crypto/des/asm/crypt586.pl
new file mode 100644
index 0000000..e36f7d4
--- /dev/null
+++ b/openssl/crypto/des/asm/crypt586.pl
@@ -0,0 +1,209 @@
+#!/usr/local/bin/perl
+#
+# The inner loop instruction sequence and the IP/FP modifications are from
+# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk>
+# I've added the stuff needed for crypt() but I've not worried about making
+# things perfect.
+#
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"crypt586.pl");
+
+$L="edi";
+$R="esi";
+
+&external_label("DES_SPtrans");
+&fcrypt_body("fcrypt_body");
+&asm_finish();
+
+sub fcrypt_body
+	{
+	local($name,$do_ip)=@_;
+
+	&function_begin($name);
+
+	&comment("");
+	&comment("Load the 2 words");
+	$trans="ebp";
+
+	&xor(	$L,	$L);
+	&xor(	$R,	$R);
+
+	# PIC-ification:-)
+	&picmeup("edx","DES_SPtrans");
+	#if ($cpp)	{ &picmeup("edx","DES_SPtrans");   }
+	#else		{ &lea("edx",&DWP("DES_SPtrans")); }
+	&push("edx");	# becomes &swtmp(1)
+	#
+	&mov($trans,&wparam(1)); # reloaded with DES_SPtrans in D_ENCRYPT
+
+	&push(&DWC(25)); # add a variable
+
+	&set_label("start");
+	for ($i=0; $i<16; $i+=2)
+		{
+		&comment("");
+		&comment("Round $i");
+		&D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx");
+
+		&comment("");
+		&comment("Round ".sprintf("%d",$i+1));
+		&D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx");
+		}
+	 &mov("ebx",	&swtmp(0));
+	&mov("eax",	$L);
+	 &dec("ebx");
+	&mov($L,	$R);
+	 &mov($R,	"eax");
+	&mov(&swtmp(0),	"ebx");
+	 &jnz(&label("start"));
+
+	&comment("");
+	&comment("FP");
+	&mov("edx",&wparam(0));
+
+	&FP_new($R,$L,"eax",3);
+	&mov(&DWP(0,"edx","",0),"eax");
+	&mov(&DWP(4,"edx","",0),$L);
+
+	&add("esp",8);	# remove variables
+
+	&function_end($name);
+	}
+
+sub D_ENCRYPT
+	{
+	local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t)=@_;
+
+	&mov(	$u,		&wparam(2));			# 2
+	&mov(	$t,		$R);
+	&shr(	$t,		16);				# 1
+	&mov(	$tmp2,		&wparam(3));			# 2
+	&xor(	$t,		$R);				# 1
+
+	&and(	$u,		$t);				# 2
+	&and(	$t,		$tmp2);				# 2
+
+	&mov(	$tmp1,		$u);
+	&shl(	$tmp1,		16); 				# 1
+	&mov(	$tmp2,		$t);
+	&shl(	$tmp2,		16); 				# 1
+	&xor(	$u,		$tmp1);				# 2
+	&xor(	$t,		$tmp2);				# 2
+	&mov(	$tmp1,		&DWP(&n2a($S*4),$trans,"",0));	# 2
+	&xor(	$u,		$tmp1);
+	&mov(	$tmp2,		&DWP(&n2a(($S+1)*4),$trans,"",0));	# 2
+	&xor(	$u,		$R);
+	&xor(	$t,		$R);
+	&xor(	$t,		$tmp2);
+
+	&and(	$u,		"0xfcfcfcfc"	);		# 2
+	&xor(	$tmp1,		$tmp1);				# 1
+	&and(	$t,		"0xcfcfcfcf"	);		# 2
+	&xor(	$tmp2,		$tmp2);	
+	&movb(	&LB($tmp1),	&LB($u)	);
+	&movb(	&LB($tmp2),	&HB($u)	);
+	&rotr(	$t,		4		);
+	&mov(	$trans,		&swtmp(1));
+	&xor(	$L,		&DWP("     ",$trans,$tmp1,0));
+	&movb(	&LB($tmp1),	&LB($t)	);
+	&xor(	$L,		&DWP("0x200",$trans,$tmp2,0));
+	&movb(	&LB($tmp2),	&HB($t)	);
+	&shr(	$u,		16);
+	&xor(	$L,		&DWP("0x100",$trans,$tmp1,0));
+	&movb(	&LB($tmp1),	&HB($u)	);
+	&shr(	$t,		16);
+	&xor(	$L,		&DWP("0x300",$trans,$tmp2,0));
+	&movb(	&LB($tmp2),	&HB($t)	);
+	&and(	$u,		"0xff"	);
+	&and(	$t,		"0xff"	);
+	&mov(	$tmp1,		&DWP("0x600",$trans,$tmp1,0));
+	&xor(	$L,		$tmp1);
+	&mov(	$tmp1,		&DWP("0x700",$trans,$tmp2,0));
+	&xor(	$L,		$tmp1);
+	&mov(	$tmp1,		&DWP("0x400",$trans,$u,0));
+	&xor(	$L,		$tmp1);
+	&mov(	$tmp1,		&DWP("0x500",$trans,$t,0));
+	&xor(	$L,		$tmp1);
+	&mov(	$trans,		&wparam(1));
+	}
+
+sub n2a
+	{
+	sprintf("%d",$_[0]);
+	}
+
+# now has a side affect of rotating $a by $shift
+sub R_PERM_OP
+	{
+	local($a,$b,$tt,$shift,$mask,$last)=@_;
+
+	&rotl(	$a,		$shift		) if ($shift != 0);
+	&mov(	$tt,		$a		);
+	&xor(	$a,		$b		);
+	&and(	$a,		$mask		);
+	if ($notlast eq $b)
+		{
+		&xor(	$b,		$a		);
+		&xor(	$tt,		$a		);
+		}
+	else
+		{
+		&xor(	$tt,		$a		);
+		&xor(	$b,		$a		);
+		}
+	&comment("");
+	}
+
+sub IP_new
+	{
+	local($l,$r,$tt,$lr)=@_;
+
+	&R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l);
+	&R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l);
+	&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
+	&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
+	&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
+	
+	if ($lr != 3)
+		{
+		if (($lr-3) < 0)
+			{ &rotr($tt,	3-$lr); }
+		else	{ &rotl($tt,	$lr-3); }
+		}
+	if ($lr != 2)
+		{
+		if (($lr-2) < 0)
+			{ &rotr($r,	2-$lr); }
+		else	{ &rotl($r,	$lr-2); }
+		}
+	}
+
+sub FP_new
+	{
+	local($l,$r,$tt,$lr)=@_;
+
+	if ($lr != 2)
+		{
+		if (($lr-2) < 0)
+			{ &rotl($r,	2-$lr); }
+		else	{ &rotr($r,	$lr-2); }
+		}
+	if ($lr != 3)
+		{
+		if (($lr-3) < 0)
+			{ &rotl($l,	3-$lr); }
+		else	{ &rotr($l,	$lr-3); }
+		}
+
+	&R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r);
+	&R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r);
+	&R_PERM_OP($l,$r,$tt,10,"0x33333333",$l);
+	&R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l);
+	&R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r);
+	&rotr($tt	, 4);
+	}
+
diff --git a/openssl/crypto/des/asm/des-586.pl b/openssl/crypto/des/asm/des-586.pl
new file mode 100644
index 0000000..bd6a7dd
--- /dev/null
+++ b/openssl/crypto/des/asm/des-586.pl
@@ -0,0 +1,455 @@
+#!/usr/local/bin/perl
+#
+# The inner loop instruction sequence and the IP/FP modifications are from
+# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk>
+#
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+require "cbc.pl";
+require "desboth.pl";
+
+# base code is in microsft
+# op dest, source
+# format.
+#
+
+&asm_init($ARGV[0],"des-586.pl");
+
+$L="edi";
+$R="esi";
+$trans="ebp";
+$small_footprint=1 if (grep(/\-DOPENSSL_SMALL_FOOTPRINT/,@ARGV));
+# one can discuss setting this variable to 1 unconditionally, as
+# the folded loop is only 3% slower than unrolled, but >7 times smaller
+
+&public_label("DES_SPtrans");
+&static_label("des_sptrans");
+
+&DES_encrypt_internal();
+&DES_decrypt_internal();
+&DES_encrypt("DES_encrypt1",1);
+&DES_encrypt("DES_encrypt2",0);
+&DES_encrypt3("DES_encrypt3",1);
+&DES_encrypt3("DES_decrypt3",0);
+&cbc("DES_ncbc_encrypt","DES_encrypt1","DES_encrypt1",0,4,5,3,5,-1);
+&cbc("DES_ede3_cbc_encrypt","DES_encrypt3","DES_decrypt3",0,6,7,3,4,5);
+&DES_SPtrans();
+
+&asm_finish();
+
+sub DES_encrypt_internal()
+	{
+	&function_begin_B("_x86_DES_encrypt");
+
+	if ($small_footprint)
+	    {
+	    &lea("edx",&DWP(128,"ecx"));
+	    &push("edx");
+	    &push("ecx");
+	    &set_label("eloop");
+		&D_ENCRYPT(0,$L,$R,0,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		&comment("");
+		&D_ENCRYPT(1,$R,$L,2,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		&comment("");
+		&add("ecx",16);
+		&cmp("ecx",&swtmp(1));
+		&mov(&swtmp(0),"ecx");
+		&jb(&label("eloop"));
+	    &add("esp",8);
+	    }
+	else
+	    {
+	    &push("ecx");
+	    for ($i=0; $i<16; $i+=2)
+		{
+		&comment("Round $i");
+		&D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		&comment("Round ".sprintf("%d",$i+1));
+		&D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		}
+	    &add("esp",4);
+	}
+	&ret();
+
+	&function_end_B("_x86_DES_encrypt");
+	}
+	
+sub DES_decrypt_internal()
+	{
+	&function_begin_B("_x86_DES_decrypt");
+
+	if ($small_footprint)
+	    {
+	    &push("ecx");
+	    &lea("ecx",&DWP(128,"ecx"));
+	    &push("ecx");
+	    &set_label("dloop");
+		&D_ENCRYPT(0,$L,$R,-2,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		&comment("");
+		&D_ENCRYPT(1,$R,$L,-4,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		&comment("");
+		&sub("ecx",16);
+		&cmp("ecx",&swtmp(1));
+		&mov(&swtmp(0),"ecx");
+		&ja(&label("dloop"));
+	    &add("esp",8);
+	    }
+	else
+	    {
+	    &push("ecx");
+	    for ($i=15; $i>0; $i-=2)
+		{
+		&comment("Round $i");
+		&D_ENCRYPT(15-$i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		&comment("Round ".sprintf("%d",$i-1));
+		&D_ENCRYPT(15-$i+1,$R,$L,($i-1)*2,$trans,"eax","ebx","ecx","edx",&swtmp(0));
+		}
+	    &add("esp",4);
+	    }
+	&ret();
+
+	&function_end_B("_x86_DES_decrypt");
+	}
+	
+sub DES_encrypt
+	{
+	local($name,$do_ip)=@_;
+
+	&function_begin_B($name);
+
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	&comment("Load the 2 words");
+
+	if ($do_ip)
+		{
+		&mov($R,&wparam(0));
+		 &xor(	"ecx",		"ecx"		);
+
+		&push("ebx");
+		&push("ebp");
+
+		&mov("eax",&DWP(0,$R,"",0));
+		 &mov("ebx",&wparam(2));	# get encrypt flag
+		&mov($L,&DWP(4,$R,"",0));
+		&comment("");
+		&comment("IP");
+		&IP_new("eax",$L,$R,3);
+		}
+	else
+		{
+		&mov("eax",&wparam(0));
+		 &xor(	"ecx",		"ecx"		);
+
+		&push("ebx");
+		&push("ebp");
+
+		&mov($R,&DWP(0,"eax","",0));
+		 &mov("ebx",&wparam(2));	# get encrypt flag
+		&rotl($R,3);
+		&mov($L,&DWP(4,"eax","",0));
+		&rotl($L,3);
+		}
+
+	# PIC-ification:-)
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop($trans);
+	&lea	($trans,&DWP(&label("des_sptrans")."-".&label("pic_point"),$trans));
+
+	&mov(	"ecx",	&wparam(1)	);
+
+	&cmp("ebx","0");
+	&je(&label("decrypt"));
+	&call("_x86_DES_encrypt");
+	&jmp(&label("done"));
+	&set_label("decrypt");
+	&call("_x86_DES_decrypt");
+	&set_label("done");
+
+	if ($do_ip)
+		{
+		&comment("");
+		&comment("FP");
+		&mov("edx",&wparam(0));
+		&FP_new($L,$R,"eax",3);
+
+		&mov(&DWP(0,"edx","",0),"eax");
+		&mov(&DWP(4,"edx","",0),$R);
+		}
+	else
+		{
+		&comment("");
+		&comment("Fixup");
+		&rotr($L,3);		# r
+		 &mov("eax",&wparam(0));
+		&rotr($R,3);		# l
+		 &mov(&DWP(0,"eax","",0),$L);
+		 &mov(&DWP(4,"eax","",0),$R);
+		}
+
+	&pop("ebp");
+	&pop("ebx");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+
+	&function_end_B($name);
+	}
+
+sub D_ENCRYPT
+	{
+	local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t,$wp1)=@_;
+
+	 &mov(	$u,		&DWP(&n2a($S*4),$tmp2,"",0));
+	&xor(	$tmp1,		$tmp1);
+	 &mov(	$t,		&DWP(&n2a(($S+1)*4),$tmp2,"",0));
+	&xor(	$u,		$R);
+	&xor(	$tmp2,		$tmp2);
+	 &xor(	$t,		$R);
+	&and(	$u,		"0xfcfcfcfc"	);
+	 &and(	$t,		"0xcfcfcfcf"	);
+	&movb(	&LB($tmp1),	&LB($u)	);
+	 &movb(	&LB($tmp2),	&HB($u)	);
+	&rotr(	$t,		4		);
+	&xor(	$L,		&DWP("     ",$trans,$tmp1,0));
+	 &movb(	&LB($tmp1),	&LB($t)	);
+	 &xor(	$L,		&DWP("0x200",$trans,$tmp2,0));
+	 &movb(	&LB($tmp2),	&HB($t)	);
+	&shr(	$u,		16);
+	 &xor(	$L,		&DWP("0x100",$trans,$tmp1,0));
+	 &movb(	&LB($tmp1),	&HB($u)	);
+	&shr(	$t,		16);
+	 &xor(	$L,		&DWP("0x300",$trans,$tmp2,0));
+	&movb(	&LB($tmp2),	&HB($t)	);
+	 &and(	$u,		"0xff"	);
+	&and(	$t,		"0xff"	);
+	 &xor(	$L,		&DWP("0x600",$trans,$tmp1,0));
+	 &xor(	$L,		&DWP("0x700",$trans,$tmp2,0));
+	&mov(	$tmp2,		$wp1	);
+	 &xor(	$L,		&DWP("0x400",$trans,$u,0));
+	 &xor(	$L,		&DWP("0x500",$trans,$t,0));
+	}
+
+sub n2a
+	{
+	sprintf("%d",$_[0]);
+	}
+
+# now has a side affect of rotating $a by $shift
+sub R_PERM_OP
+	{
+	local($a,$b,$tt,$shift,$mask,$last)=@_;
+
+	&rotl(	$a,		$shift		) if ($shift != 0);
+	&mov(	$tt,		$a		);
+	&xor(	$a,		$b		);
+	&and(	$a,		$mask		);
+	# This can never succeed, and besides it is difficult to see what the
+	# idea was - Ben 13 Feb 99
+	if (!$last eq $b)
+		{
+		&xor(	$b,		$a		);
+		&xor(	$tt,		$a		);
+		}
+	else
+		{
+		&xor(	$tt,		$a		);
+		&xor(	$b,		$a		);
+		}
+	&comment("");
+	}
+
+sub IP_new
+	{
+	local($l,$r,$tt,$lr)=@_;
+
+	&R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l);
+	&R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l);
+	&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
+	&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
+	&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
+	
+	if ($lr != 3)
+		{
+		if (($lr-3) < 0)
+			{ &rotr($tt,	3-$lr); }
+		else	{ &rotl($tt,	$lr-3); }
+		}
+	if ($lr != 2)
+		{
+		if (($lr-2) < 0)
+			{ &rotr($r,	2-$lr); }
+		else	{ &rotl($r,	$lr-2); }
+		}
+	}
+
+sub FP_new
+	{
+	local($l,$r,$tt,$lr)=@_;
+
+	if ($lr != 2)
+		{
+		if (($lr-2) < 0)
+			{ &rotl($r,	2-$lr); }
+		else	{ &rotr($r,	$lr-2); }
+		}
+	if ($lr != 3)
+		{
+		if (($lr-3) < 0)
+			{ &rotl($l,	3-$lr); }
+		else	{ &rotr($l,	$lr-3); }
+		}
+
+	&R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r);
+	&R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r);
+	&R_PERM_OP($l,$r,$tt,10,"0x33333333",$l);
+	&R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l);
+	&R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r);
+	&rotr($tt	, 4);
+	}
+
+sub DES_SPtrans
+	{
+	&set_label("DES_SPtrans",64);
+	&set_label("des_sptrans");
+	&data_word(0x02080800, 0x00080000, 0x02000002, 0x02080802);
+	&data_word(0x02000000, 0x00080802, 0x00080002, 0x02000002);
+	&data_word(0x00080802, 0x02080800, 0x02080000, 0x00000802);
+	&data_word(0x02000802, 0x02000000, 0x00000000, 0x00080002);
+	&data_word(0x00080000, 0x00000002, 0x02000800, 0x00080800);
+	&data_word(0x02080802, 0x02080000, 0x00000802, 0x02000800);
+	&data_word(0x00000002, 0x00000800, 0x00080800, 0x02080002);
+	&data_word(0x00000800, 0x02000802, 0x02080002, 0x00000000);
+	&data_word(0x00000000, 0x02080802, 0x02000800, 0x00080002);
+	&data_word(0x02080800, 0x00080000, 0x00000802, 0x02000800);
+	&data_word(0x02080002, 0x00000800, 0x00080800, 0x02000002);
+	&data_word(0x00080802, 0x00000002, 0x02000002, 0x02080000);
+	&data_word(0x02080802, 0x00080800, 0x02080000, 0x02000802);
+	&data_word(0x02000000, 0x00000802, 0x00080002, 0x00000000);
+	&data_word(0x00080000, 0x02000000, 0x02000802, 0x02080800);
+	&data_word(0x00000002, 0x02080002, 0x00000800, 0x00080802);
+	# nibble 1
+	&data_word(0x40108010, 0x00000000, 0x00108000, 0x40100000);
+	&data_word(0x40000010, 0x00008010, 0x40008000, 0x00108000);
+	&data_word(0x00008000, 0x40100010, 0x00000010, 0x40008000);
+	&data_word(0x00100010, 0x40108000, 0x40100000, 0x00000010);
+	&data_word(0x00100000, 0x40008010, 0x40100010, 0x00008000);
+	&data_word(0x00108010, 0x40000000, 0x00000000, 0x00100010);
+	&data_word(0x40008010, 0x00108010, 0x40108000, 0x40000010);
+	&data_word(0x40000000, 0x00100000, 0x00008010, 0x40108010);
+	&data_word(0x00100010, 0x40108000, 0x40008000, 0x00108010);
+	&data_word(0x40108010, 0x00100010, 0x40000010, 0x00000000);
+	&data_word(0x40000000, 0x00008010, 0x00100000, 0x40100010);
+	&data_word(0x00008000, 0x40000000, 0x00108010, 0x40008010);
+	&data_word(0x40108000, 0x00008000, 0x00000000, 0x40000010);
+	&data_word(0x00000010, 0x40108010, 0x00108000, 0x40100000);
+	&data_word(0x40100010, 0x00100000, 0x00008010, 0x40008000);
+	&data_word(0x40008010, 0x00000010, 0x40100000, 0x00108000);
+	# nibble 2
+	&data_word(0x04000001, 0x04040100, 0x00000100, 0x04000101);
+	&data_word(0x00040001, 0x04000000, 0x04000101, 0x00040100);
+	&data_word(0x04000100, 0x00040000, 0x04040000, 0x00000001);
+	&data_word(0x04040101, 0x00000101, 0x00000001, 0x04040001);
+	&data_word(0x00000000, 0x00040001, 0x04040100, 0x00000100);
+	&data_word(0x00000101, 0x04040101, 0x00040000, 0x04000001);
+	&data_word(0x04040001, 0x04000100, 0x00040101, 0x04040000);
+	&data_word(0x00040100, 0x00000000, 0x04000000, 0x00040101);
+	&data_word(0x04040100, 0x00000100, 0x00000001, 0x00040000);
+	&data_word(0x00000101, 0x00040001, 0x04040000, 0x04000101);
+	&data_word(0x00000000, 0x04040100, 0x00040100, 0x04040001);
+	&data_word(0x00040001, 0x04000000, 0x04040101, 0x00000001);
+	&data_word(0x00040101, 0x04000001, 0x04000000, 0x04040101);
+	&data_word(0x00040000, 0x04000100, 0x04000101, 0x00040100);
+	&data_word(0x04000100, 0x00000000, 0x04040001, 0x00000101);
+	&data_word(0x04000001, 0x00040101, 0x00000100, 0x04040000);
+	# nibble 3
+	&data_word(0x00401008, 0x10001000, 0x00000008, 0x10401008);
+	&data_word(0x00000000, 0x10400000, 0x10001008, 0x00400008);
+	&data_word(0x10401000, 0x10000008, 0x10000000, 0x00001008);
+	&data_word(0x10000008, 0x00401008, 0x00400000, 0x10000000);
+	&data_word(0x10400008, 0x00401000, 0x00001000, 0x00000008);
+	&data_word(0x00401000, 0x10001008, 0x10400000, 0x00001000);
+	&data_word(0x00001008, 0x00000000, 0x00400008, 0x10401000);
+	&data_word(0x10001000, 0x10400008, 0x10401008, 0x00400000);
+	&data_word(0x10400008, 0x00001008, 0x00400000, 0x10000008);
+	&data_word(0x00401000, 0x10001000, 0x00000008, 0x10400000);
+	&data_word(0x10001008, 0x00000000, 0x00001000, 0x00400008);
+	&data_word(0x00000000, 0x10400008, 0x10401000, 0x00001000);
+	&data_word(0x10000000, 0x10401008, 0x00401008, 0x00400000);
+	&data_word(0x10401008, 0x00000008, 0x10001000, 0x00401008);
+	&data_word(0x00400008, 0x00401000, 0x10400000, 0x10001008);
+	&data_word(0x00001008, 0x10000000, 0x10000008, 0x10401000);
+	# nibble 4
+	&data_word(0x08000000, 0x00010000, 0x00000400, 0x08010420);
+	&data_word(0x08010020, 0x08000400, 0x00010420, 0x08010000);
+	&data_word(0x00010000, 0x00000020, 0x08000020, 0x00010400);
+	&data_word(0x08000420, 0x08010020, 0x08010400, 0x00000000);
+	&data_word(0x00010400, 0x08000000, 0x00010020, 0x00000420);
+	&data_word(0x08000400, 0x00010420, 0x00000000, 0x08000020);
+	&data_word(0x00000020, 0x08000420, 0x08010420, 0x00010020);
+	&data_word(0x08010000, 0x00000400, 0x00000420, 0x08010400);
+	&data_word(0x08010400, 0x08000420, 0x00010020, 0x08010000);
+	&data_word(0x00010000, 0x00000020, 0x08000020, 0x08000400);
+	&data_word(0x08000000, 0x00010400, 0x08010420, 0x00000000);
+	&data_word(0x00010420, 0x08000000, 0x00000400, 0x00010020);
+	&data_word(0x08000420, 0x00000400, 0x00000000, 0x08010420);
+	&data_word(0x08010020, 0x08010400, 0x00000420, 0x00010000);
+	&data_word(0x00010400, 0x08010020, 0x08000400, 0x00000420);
+	&data_word(0x00000020, 0x00010420, 0x08010000, 0x08000020);
+	# nibble 5
+	&data_word(0x80000040, 0x00200040, 0x00000000, 0x80202000);
+	&data_word(0x00200040, 0x00002000, 0x80002040, 0x00200000);
+	&data_word(0x00002040, 0x80202040, 0x00202000, 0x80000000);
+	&data_word(0x80002000, 0x80000040, 0x80200000, 0x00202040);
+	&data_word(0x00200000, 0x80002040, 0x80200040, 0x00000000);
+	&data_word(0x00002000, 0x00000040, 0x80202000, 0x80200040);
+	&data_word(0x80202040, 0x80200000, 0x80000000, 0x00002040);
+	&data_word(0x00000040, 0x00202000, 0x00202040, 0x80002000);
+	&data_word(0x00002040, 0x80000000, 0x80002000, 0x00202040);
+	&data_word(0x80202000, 0x00200040, 0x00000000, 0x80002000);
+	&data_word(0x80000000, 0x00002000, 0x80200040, 0x00200000);
+	&data_word(0x00200040, 0x80202040, 0x00202000, 0x00000040);
+	&data_word(0x80202040, 0x00202000, 0x00200000, 0x80002040);
+	&data_word(0x80000040, 0x80200000, 0x00202040, 0x00000000);
+	&data_word(0x00002000, 0x80000040, 0x80002040, 0x80202000);
+	&data_word(0x80200000, 0x00002040, 0x00000040, 0x80200040);
+	# nibble 6
+	&data_word(0x00004000, 0x00000200, 0x01000200, 0x01000004);
+	&data_word(0x01004204, 0x00004004, 0x00004200, 0x00000000);
+	&data_word(0x01000000, 0x01000204, 0x00000204, 0x01004000);
+	&data_word(0x00000004, 0x01004200, 0x01004000, 0x00000204);
+	&data_word(0x01000204, 0x00004000, 0x00004004, 0x01004204);
+	&data_word(0x00000000, 0x01000200, 0x01000004, 0x00004200);
+	&data_word(0x01004004, 0x00004204, 0x01004200, 0x00000004);
+	&data_word(0x00004204, 0x01004004, 0x00000200, 0x01000000);
+	&data_word(0x00004204, 0x01004000, 0x01004004, 0x00000204);
+	&data_word(0x00004000, 0x00000200, 0x01000000, 0x01004004);
+	&data_word(0x01000204, 0x00004204, 0x00004200, 0x00000000);
+	&data_word(0x00000200, 0x01000004, 0x00000004, 0x01000200);
+	&data_word(0x00000000, 0x01000204, 0x01000200, 0x00004200);
+	&data_word(0x00000204, 0x00004000, 0x01004204, 0x01000000);
+	&data_word(0x01004200, 0x00000004, 0x00004004, 0x01004204);
+	&data_word(0x01000004, 0x01004200, 0x01004000, 0x00004004);
+	# nibble 7
+	&data_word(0x20800080, 0x20820000, 0x00020080, 0x00000000);
+	&data_word(0x20020000, 0x00800080, 0x20800000, 0x20820080);
+	&data_word(0x00000080, 0x20000000, 0x00820000, 0x00020080);
+	&data_word(0x00820080, 0x20020080, 0x20000080, 0x20800000);
+	&data_word(0x00020000, 0x00820080, 0x00800080, 0x20020000);
+	&data_word(0x20820080, 0x20000080, 0x00000000, 0x00820000);
+	&data_word(0x20000000, 0x00800000, 0x20020080, 0x20800080);
+	&data_word(0x00800000, 0x00020000, 0x20820000, 0x00000080);
+	&data_word(0x00800000, 0x00020000, 0x20000080, 0x20820080);
+	&data_word(0x00020080, 0x20000000, 0x00000000, 0x00820000);
+	&data_word(0x20800080, 0x20020080, 0x20020000, 0x00800080);
+	&data_word(0x20820000, 0x00000080, 0x00800080, 0x20020000);
+	&data_word(0x20820080, 0x00800000, 0x20800000, 0x20000080);
+	&data_word(0x00820000, 0x00020080, 0x20020080, 0x20800000);
+	&data_word(0x00000080, 0x20820000, 0x00820080, 0x00000000);
+	&data_word(0x20000000, 0x20800080, 0x00020000, 0x00820080);
+	}
diff --git a/openssl/crypto/des/asm/des_enc.m4 b/openssl/crypto/des/asm/des_enc.m4
new file mode 100644
index 0000000..dda08e1
--- /dev/null
+++ b/openssl/crypto/des/asm/des_enc.m4
@@ -0,0 +1,2101 @@
+!  des_enc.m4
+!  des_enc.S  (generated from des_enc.m4)
+!
+!  UltraSPARC assembler version of the LibDES/SSLeay/OpenSSL des_enc.c file.
+!
+!  Version 1.0. 32-bit version.
+!
+!  June 8, 2000.
+!
+!  Version 2.0. 32/64-bit, PIC-ification, blended CPU adaptation
+!		by Andy Polyakov.
+!
+!  January 1, 2003.
+!
+!  Assembler version: Copyright Svend Olaf Mikkelsen.
+!
+!  Original C code: Copyright Eric A. Young.
+!
+!  This code can be freely used by LibDES/SSLeay/OpenSSL users.
+!
+!  The LibDES/SSLeay/OpenSSL copyright notices must be respected.
+!
+!  This version can be redistributed.
+!
+!  To expand the m4 macros: m4 -B 8192 des_enc.m4 > des_enc.S
+!
+!  Global registers 1 to 5 are used. This is the same as done by the
+!  cc compiler. The UltraSPARC load/store little endian feature is used.
+!
+!  Instruction grouping often refers to one CPU cycle.
+!
+!  Assemble through gcc: gcc -c -mcpu=ultrasparc -o des_enc.o des_enc.S
+!
+!  Assemble through cc:  cc -c -xarch=v8plusa -o des_enc.o des_enc.S
+!
+!  Performance improvement according to './apps/openssl speed des'
+!
+!	32-bit build:
+!		23%  faster than cc-5.2 -xarch=v8plus -xO5
+!		115% faster than gcc-3.2.1 -m32 -mcpu=ultrasparc -O5
+!	64-bit build:
+!		50%  faster than cc-5.2 -xarch=v9 -xO5
+!		100% faster than gcc-3.2.1 -m64 -mcpu=ultrasparc -O5
+!
+
+.ident "des_enc.m4 2.1"
+.file  "des_enc-sparc.S"
+
+#include <openssl/opensslconf.h>
+
+#if defined(__SUNPRO_C) && defined(__sparcv9)
+# define ABI64  /* They've said -xarch=v9 at command line */
+#elif defined(__GNUC__) && defined(__arch64__)
+# define ABI64  /* They've said -m64 at command line */
+#endif
+
+#ifdef ABI64
+  .register	%g2,#scratch
+  .register	%g3,#scratch
+# define	FRAME	-192
+# define	BIAS	2047
+# define	LDPTR	ldx
+# define	STPTR	stx
+# define	ARG0	128
+# define	ARGSZ	8
+# ifndef OPENSSL_SYSNAME_ULTRASPARC
+# define OPENSSL_SYSNAME_ULTRASPARC
+# endif
+#else
+# define	FRAME	-96
+# define	BIAS	0
+# define	LDPTR	ld
+# define	STPTR	st
+# define	ARG0	68
+# define	ARGSZ	4
+#endif
+
+#define LOOPS 7
+
+#define global0 %g0
+#define global1 %g1
+#define global2 %g2
+#define global3 %g3
+#define global4 %g4
+#define global5 %g5
+
+#define local0 %l0
+#define local1 %l1
+#define local2 %l2
+#define local3 %l3
+#define local4 %l4
+#define local5 %l5
+#define local7 %l6
+#define local6 %l7
+
+#define in0 %i0
+#define in1 %i1
+#define in2 %i2
+#define in3 %i3
+#define in4 %i4
+#define in5 %i5
+#define in6 %i6
+#define in7 %i7
+
+#define out0 %o0
+#define out1 %o1
+#define out2 %o2
+#define out3 %o3
+#define out4 %o4
+#define out5 %o5
+#define out6 %o6
+#define out7 %o7
+
+#define stub stb
+
+changequote({,})
+
+
+! Macro definitions:
+
+
+! {ip_macro}
+!
+! The logic used in initial and final permutations is the same as in
+! the C code. The permutations are done with a clever shift, xor, and
+! technique.
+!
+! The macro also loads address sbox 1 to 5 to global 1 to 5, address
+! sbox 6 to local6, and addres sbox 8 to out3.
+!
+! Rotates the halfs 3 left to bring the sbox bits in convenient positions.
+!
+! Loads key first round from address in parameter 5 to out0, out1.
+!
+! After the the original LibDES initial permutation, the resulting left
+! is in the variable initially used for right and vice versa. The macro
+! implements the possibility to keep the halfs in the original registers.
+!
+! parameter 1  left
+! parameter 2  right
+! parameter 3  result left (modify in first round)
+! parameter 4  result right (use in first round)
+! parameter 5  key address
+! parameter 6  1/2 for include encryption/decryption
+! parameter 7  1 for move in1 to in3
+! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
+! parameter 9  1 for load ks3 and ks2 to in4 and in3
+
+define(ip_macro, {
+
+! {ip_macro}
+! $1 $2 $4 $3 $5 $6 $7 $8 $9
+
+	ld	[out2+256], local1
+	srl	$2, 4, local4
+
+	xor	local4, $1, local4
+	ifelse($7,1,{mov in1, in3},{nop})
+
+	ld	[out2+260], local2
+	and	local4, local1, local4
+	ifelse($8,1,{mov in3, in4},{})
+	ifelse($8,2,{mov in4, in3},{})
+
+	ld	[out2+280], out4          ! loop counter
+	sll	local4, 4, local1
+	xor	$1, local4, $1
+
+	ld	[out2+264], local3
+	srl	$1, 16, local4
+	xor	$2, local1, $2
+
+	ifelse($9,1,{LDPTR	KS3, in4},{})
+	xor	local4, $2, local4
+	nop	!sethi	%hi(DES_SPtrans), global1 ! sbox addr
+
+	ifelse($9,1,{LDPTR	KS2, in3},{})
+	and	local4, local2, local4
+	nop	!or	global1, %lo(DES_SPtrans), global1   ! sbox addr
+
+	sll	local4, 16, local1
+	xor	$2, local4, $2
+
+	srl	$2, 2, local4
+	xor	$1, local1, $1
+
+	sethi	%hi(16711680), local5
+	xor	local4, $1, local4
+
+	and	local4, local3, local4
+	or	local5, 255, local5
+
+	sll	local4, 2, local2
+	xor	$1, local4, $1
+
+	srl	$1, 8, local4
+	xor	$2, local2, $2
+
+	xor	local4, $2, local4
+	add	global1, 768, global4
+
+	and	local4, local5, local4
+	add	global1, 1024, global5
+
+	ld	[out2+272], local7
+	sll	local4, 8, local1
+	xor	$2, local4, $2
+
+	srl	$2, 1, local4
+	xor	$1, local1, $1
+
+	ld	[$5], out0                ! key 7531
+	xor	local4, $1, local4
+	add	global1, 256, global2
+
+	ld	[$5+4], out1              ! key 8642
+	and	local4, local7, local4
+	add	global1, 512, global3
+
+	sll	local4, 1, local1
+	xor	$1, local4, $1
+
+	sll	$1, 3, local3
+	xor	$2, local1, $2
+
+	sll	$2, 3, local2
+	add	global1, 1280, local6     ! address sbox 8
+
+	srl	$1, 29, local4
+	add	global1, 1792, out3       ! address sbox 8
+
+	srl	$2, 29, local1
+	or	local4, local3, $4
+
+	or	local2, local1, $3
+
+	ifelse($6, 1, {
+
+		ld	[out2+284], local5     ! 0x0000FC00 used in the rounds
+		or	local2, local1, $3
+		xor	$4, out0, local1
+
+		call .des_enc.1
+		and	local1, 252, local1
+
+	},{})
+
+	ifelse($6, 2, {
+
+		ld	[out2+284], local5     ! 0x0000FC00 used in the rounds
+		or	local2, local1, $3
+		xor	$4, out0, local1
+
+		call .des_dec.1
+		and	local1, 252, local1
+
+	},{})
+})
+
+
+! {rounds_macro}
+!
+! The logic used in the DES rounds is the same as in the C code,
+! except that calculations for sbox 1 and sbox 5 begin before
+! the previous round is finished.
+!
+! In each round one half (work) is modified based on key and the
+! other half (use).
+!
+! In this version we do two rounds in a loop repeated 7 times
+! and two rounds seperately.
+!
+! One half has the bits for the sboxes in the following positions:
+!
+!	777777xx555555xx333333xx111111xx
+!
+!	88xx666666xx444444xx222222xx8888
+!
+! The bits for each sbox are xor-ed with the key bits for that box.
+! The above xx bits are cleared, and the result used for lookup in
+! the sbox table. Each sbox entry contains the 4 output bits permuted
+! into 32 bits according to the P permutation.
+!
+! In the description of DES, left and right are switched after
+! each round, except after last round. In this code the original
+! left and right are kept in the same register in all rounds, meaning
+! that after the 16 rounds the result for right is in the register
+! originally used for left.
+!
+! parameter 1  first work (left in first round)
+! parameter 2  first use (right in first round)
+! parameter 3  enc/dec  1/-1
+! parameter 4  loop label
+! parameter 5  key address register
+! parameter 6  optional address for key next encryption/decryption
+! parameter 7  not empty for include retl
+!
+! also compares in2 to 8
+
+define(rounds_macro, {
+
+! {rounds_macro}
+! $1 $2 $3 $4 $5 $6 $7 $8 $9
+
+	xor	$2, out0, local1
+
+	ld	[out2+284], local5        ! 0x0000FC00
+	ba	$4
+	and	local1, 252, local1
+
+	.align 32
+
+$4:
+	! local6 is address sbox 6
+	! out3   is address sbox 8
+	! out4   is loop counter
+
+	ld	[global1+local1], local1
+	xor	$2, out1, out1            ! 8642
+	xor	$2, out0, out0            ! 7531
+	! fmovs	%f0, %f0                  ! fxor used for alignment
+
+	srl	out1, 4, local0           ! rotate 4 right
+	and	out0, local5, local3      ! 3
+	! fmovs	%f0, %f0
+
+	ld	[$5+$3*8], local7         ! key 7531 next round
+	srl	local3, 8, local3         ! 3
+	and	local0, 252, local2       ! 2
+	! fmovs	%f0, %f0
+
+	ld	[global3+local3],local3   ! 3
+	sll	out1, 28, out1            ! rotate
+	xor	$1, local1, $1            ! 1 finished, local1 now sbox 7
+
+	ld	[global2+local2], local2  ! 2 
+	srl	out0, 24, local1          ! 7
+	or	out1, local0, out1        ! rotate
+
+	ldub	[out2+local1], local1     ! 7 (and 0xFC)
+	srl	out1, 24, local0          ! 8
+	and	out1, local5, local4      ! 4
+
+	ldub	[out2+local0], local0     ! 8 (and 0xFC)
+	srl	local4, 8, local4         ! 4
+	xor	$1, local2, $1            ! 2 finished local2 now sbox 6
+
+	ld	[global4+local4],local4   ! 4
+	srl	out1, 16, local2          ! 6
+	xor	$1, local3, $1            ! 3 finished local3 now sbox 5
+
+	ld	[out3+local0],local0      ! 8
+	and	local2, 252, local2       ! 6
+	add	global1, 1536, local5     ! address sbox 7
+
+	ld	[local6+local2], local2   ! 6
+	srl	out0, 16, local3          ! 5
+	xor	$1, local4, $1            ! 4 finished
+
+	ld	[local5+local1],local1    ! 7
+	and	local3, 252, local3       ! 5
+	xor	$1, local0, $1            ! 8 finished
+
+	ld	[global5+local3],local3   ! 5
+	xor	$1, local2, $1            ! 6 finished
+	subcc	out4, 1, out4
+
+	ld	[$5+$3*8+4], out0         ! key 8642 next round
+	xor	$1, local7, local2        ! sbox 5 next round
+	xor	$1, local1, $1            ! 7 finished
+
+	srl	local2, 16, local2        ! sbox 5 next round
+	xor	$1, local3, $1            ! 5 finished
+
+	ld	[$5+$3*16+4], out1        ! key 8642 next round again
+	and	local2, 252, local2       ! sbox5 next round
+! next round
+	xor	$1, local7, local7        ! 7531
+
+	ld	[global5+local2], local2  ! 5
+	srl	local7, 24, local3        ! 7
+	xor	$1, out0, out0            ! 8642
+
+	ldub	[out2+local3], local3     ! 7 (and 0xFC)
+	srl	out0, 4, local0           ! rotate 4 right
+	and	local7, 252, local1       ! 1
+
+	sll	out0, 28, out0            ! rotate
+	xor	$2, local2, $2            ! 5 finished local2 used
+
+	srl	local0, 8, local4         ! 4
+	and	local0, 252, local2       ! 2
+	ld	[local5+local3], local3   ! 7
+
+	srl	local0, 16, local5        ! 6
+	or	out0, local0, out0        ! rotate
+	ld	[global2+local2], local2  ! 2
+
+	srl	out0, 24, local0
+	ld	[$5+$3*16], out0          ! key 7531 next round
+	and	local4, 252, local4	  ! 4
+
+	and	local5, 252, local5       ! 6
+	ld	[global4+local4], local4  ! 4
+	xor	$2, local3, $2            ! 7 finished local3 used
+
+	and	local0, 252, local0       ! 8
+	ld	[local6+local5], local5   ! 6
+	xor	$2, local2, $2            ! 2 finished local2 now sbox 3
+
+	srl	local7, 8, local2         ! 3 start
+	ld	[out3+local0], local0     ! 8
+	xor	$2, local4, $2            ! 4 finished
+
+	and	local2, 252, local2       ! 3
+	ld	[global1+local1], local1  ! 1
+	xor	$2, local5, $2            ! 6 finished local5 used
+
+	ld	[global3+local2], local2  ! 3
+	xor	$2, local0, $2            ! 8 finished
+	add	$5, $3*16, $5             ! enc add 8, dec add -8 to key pointer
+
+	ld	[out2+284], local5        ! 0x0000FC00
+	xor	$2, out0, local4          ! sbox 1 next round
+	xor	$2, local1, $2            ! 1 finished
+
+	xor	$2, local2, $2            ! 3 finished
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bne,pt	%icc, $4
+#else
+	bne	$4
+#endif
+	and	local4, 252, local1       ! sbox 1 next round
+
+! two rounds more:
+
+	ld	[global1+local1], local1
+	xor	$2, out1, out1
+	xor	$2, out0, out0
+
+	srl	out1, 4, local0           ! rotate
+	and	out0, local5, local3
+
+	ld	[$5+$3*8], local7         ! key 7531
+	srl	local3, 8, local3
+	and	local0, 252, local2
+
+	ld	[global3+local3],local3
+	sll	out1, 28, out1            ! rotate
+	xor	$1, local1, $1            ! 1 finished, local1 now sbox 7
+
+	ld	[global2+local2], local2
+	srl	out0, 24, local1
+	or	out1, local0, out1        ! rotate
+
+	ldub	[out2+local1], local1
+	srl	out1, 24, local0
+	and	out1, local5, local4
+
+	ldub	[out2+local0], local0
+	srl	local4, 8, local4
+	xor	$1, local2, $1            ! 2 finished local2 now sbox 6
+
+	ld	[global4+local4],local4
+	srl	out1, 16, local2
+	xor	$1, local3, $1            ! 3 finished local3 now sbox 5
+
+	ld	[out3+local0],local0
+	and	local2, 252, local2
+	add	global1, 1536, local5     ! address sbox 7
+
+	ld	[local6+local2], local2
+	srl	out0, 16, local3
+	xor	$1, local4, $1            ! 4 finished
+
+	ld	[local5+local1],local1
+	and	local3, 252, local3
+	xor	$1, local0, $1
+
+	ld	[global5+local3],local3
+	xor	$1, local2, $1            ! 6 finished
+	cmp	in2, 8
+
+	ifelse($6,{}, {}, {ld	[out2+280], out4})  ! loop counter
+	xor	$1, local7, local2        ! sbox 5 next round
+	xor	$1, local1, $1            ! 7 finished
+
+	ld	[$5+$3*8+4], out0
+	srl	local2, 16, local2        ! sbox 5 next round
+	xor	$1, local3, $1            ! 5 finished
+
+	and	local2, 252, local2
+! next round (two rounds more)
+	xor	$1, local7, local7        ! 7531
+
+	ld	[global5+local2], local2
+	srl	local7, 24, local3
+	xor	$1, out0, out0            ! 8642
+
+	ldub	[out2+local3], local3
+	srl	out0, 4, local0           ! rotate
+	and	local7, 252, local1
+
+	sll	out0, 28, out0            ! rotate
+	xor	$2, local2, $2            ! 5 finished local2 used
+
+	srl	local0, 8, local4
+	and	local0, 252, local2
+	ld	[local5+local3], local3
+
+	srl	local0, 16, local5
+	or	out0, local0, out0        ! rotate
+	ld	[global2+local2], local2
+
+	srl	out0, 24, local0
+	ifelse($6,{}, {}, {ld	[$6], out0})   ! key next encryption/decryption
+	and	local4, 252, local4
+
+	and	local5, 252, local5
+	ld	[global4+local4], local4
+	xor	$2, local3, $2            ! 7 finished local3 used
+
+	and	local0, 252, local0
+	ld	[local6+local5], local5
+	xor	$2, local2, $2            ! 2 finished local2 now sbox 3
+
+	srl	local7, 8, local2         ! 3 start
+	ld	[out3+local0], local0
+	xor	$2, local4, $2
+
+	and	local2, 252, local2
+	ld	[global1+local1], local1
+	xor	$2, local5, $2            ! 6 finished local5 used
+
+	ld	[global3+local2], local2
+	srl	$1, 3, local3
+	xor	$2, local0, $2
+
+	ifelse($6,{}, {}, {ld	[$6+4], out1}) ! key next encryption/decryption
+	sll	$1, 29, local4
+	xor	$2, local1, $2
+
+	ifelse($7,{}, {}, {retl})
+	xor	$2, local2, $2
+})
+
+
+! {fp_macro}
+!
+!  parameter 1   right (original left)
+!  parameter 2   left (original right)
+!  parameter 3   1 for optional store to [in0]
+!  parameter 4   1 for load input/output address to local5/7
+!
+!  The final permutation logic switches the halfes, meaning that
+!  left and right ends up the the registers originally used.
+
+define(fp_macro, {
+
+! {fp_macro}
+! $1 $2 $3 $4 $5 $6 $7 $8 $9
+
+	! initially undo the rotate 3 left done after initial permutation
+	! original left is received shifted 3 right and 29 left in local3/4
+
+	sll	$2, 29, local1
+	or	local3, local4, $1
+
+	srl	$2, 3, $2
+	sethi	%hi(0x55555555), local2
+
+	or	$2, local1, $2
+	or	local2, %lo(0x55555555), local2
+
+	srl	$2, 1, local3
+	sethi	%hi(0x00ff00ff), local1
+	xor	local3, $1, local3
+	or	local1, %lo(0x00ff00ff), local1
+	and	local3, local2, local3
+	sethi	%hi(0x33333333), local4
+	sll	local3, 1, local2
+
+	xor	$1, local3, $1
+
+	srl	$1, 8, local3
+	xor	$2, local2, $2
+	xor	local3, $2, local3
+	or	local4, %lo(0x33333333), local4
+	and	local3, local1, local3
+	sethi	%hi(0x0000ffff), local1
+	sll	local3, 8, local2
+
+	xor	$2, local3, $2
+
+	srl	$2, 2, local3
+	xor	$1, local2, $1
+	xor	local3, $1, local3
+	or	local1, %lo(0x0000ffff), local1
+	and	local3, local4, local3
+	sethi	%hi(0x0f0f0f0f), local4
+	sll	local3, 2, local2
+
+	ifelse($4,1, {LDPTR INPUT, local5})
+	xor	$1, local3, $1
+
+	ifelse($4,1, {LDPTR OUTPUT, local7})
+	srl	$1, 16, local3
+	xor	$2, local2, $2
+	xor	local3, $2, local3
+	or	local4, %lo(0x0f0f0f0f), local4
+	and	local3, local1, local3
+	sll	local3, 16, local2
+
+	xor	$2, local3, local1
+
+	srl	local1, 4, local3
+	xor	$1, local2, $1
+	xor	local3, $1, local3
+	and	local3, local4, local3
+	sll	local3, 4, local2
+
+	xor	$1, local3, $1
+
+	! optional store:
+
+	ifelse($3,1, {st $1, [in0]})
+
+	xor	local1, local2, $2
+
+	ifelse($3,1, {st $2, [in0+4]})
+
+})
+
+
+! {fp_ip_macro}
+!
+! Does initial permutation for next block mixed with
+! final permutation for current block.
+!
+! parameter 1   original left
+! parameter 2   original right
+! parameter 3   left ip
+! parameter 4   right ip
+! parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4
+!                2: mov in4 to in3
+!
+! also adds -8 to length in2 and loads loop counter to out4
+
+define(fp_ip_macro, {
+
+! {fp_ip_macro}
+! $1 $2 $3 $4 $5 $6 $7 $8 $9
+
+	define({temp1},{out4})
+	define({temp2},{local3})
+
+	define({ip1},{local1})
+	define({ip2},{local2})
+	define({ip4},{local4})
+	define({ip5},{local5})
+
+	! $1 in local3, local4
+
+	ld	[out2+256], ip1
+	sll	out5, 29, temp1
+	or	local3, local4, $1
+
+	srl	out5, 3, $2
+	ifelse($5,2,{mov in4, in3})
+
+	ld	[out2+272], ip5
+	srl	$4, 4, local0
+	or	$2, temp1, $2
+
+	srl	$2, 1, temp1
+	xor	temp1, $1, temp1
+
+	and	temp1, ip5, temp1
+	xor	local0, $3, local0
+
+	sll	temp1, 1, temp2
+	xor	$1, temp1, $1
+
+	and	local0, ip1, local0
+	add	in2, -8, in2
+
+	sll	local0, 4, local7
+	xor	$3, local0, $3
+
+	ld	[out2+268], ip4
+	srl	$1, 8, temp1
+	xor	$2, temp2, $2
+	ld	[out2+260], ip2
+	srl	$3, 16, local0
+	xor	$4, local7, $4
+	xor	temp1, $2, temp1
+	xor	local0, $4, local0
+	and	temp1, ip4, temp1
+	and	local0, ip2, local0
+	sll	temp1, 8, temp2
+	xor	$2, temp1, $2
+	sll	local0, 16, local7
+	xor	$4, local0, $4
+
+	srl	$2, 2, temp1
+	xor	$1, temp2, $1
+
+	ld	[out2+264], temp2         ! ip3
+	srl	$4, 2, local0
+	xor	$3, local7, $3
+	xor	temp1, $1, temp1
+	xor	local0, $3, local0
+	and	temp1, temp2, temp1
+	and	local0, temp2, local0
+	sll	temp1, 2, temp2
+	xor	$1, temp1, $1
+	sll	local0, 2, local7
+	xor	$3, local0, $3
+
+	srl	$1, 16, temp1
+	xor	$2, temp2, $2
+	srl	$3, 8, local0
+	xor	$4, local7, $4
+	xor	temp1, $2, temp1
+	xor	local0, $4, local0
+	and	temp1, ip2, temp1
+	and	local0, ip4, local0
+	sll	temp1, 16, temp2
+	xor	$2, temp1, local4
+	sll	local0, 8, local7
+	xor	$4, local0, $4
+
+	srl	$4, 1, local0
+	xor	$3, local7, $3
+
+	srl	local4, 4, temp1
+	xor	local0, $3, local0
+
+	xor	$1, temp2, $1
+	and	local0, ip5, local0
+
+	sll	local0, 1, local7
+	xor	temp1, $1, temp1
+
+	xor	$3, local0, $3
+	xor	$4, local7, $4
+
+	sll	$3, 3, local5
+	and	temp1, ip1, temp1
+
+	sll	temp1, 4, temp2
+	xor	$1, temp1, $1
+
+	ifelse($5,1,{LDPTR	KS2, in4})
+	sll	$4, 3, local2
+	xor	local4, temp2, $2
+
+	! reload since used as temporar:
+
+	ld	[out2+280], out4          ! loop counter
+
+	srl	$3, 29, local0
+	ifelse($5,1,{add in4, 120, in4})
+
+	ifelse($5,1,{LDPTR	KS1, in3})
+	srl	$4, 29, local7
+
+	or	local0, local5, $4
+	or	local2, local7, $3
+
+})
+
+
+
+! {load_little_endian}
+!
+! parameter 1  address
+! parameter 2  destination left
+! parameter 3  destination right
+! parameter 4  temporar
+! parameter 5  label
+
+define(load_little_endian, {
+
+! {load_little_endian}
+! $1 $2 $3 $4 $5 $6 $7 $8 $9
+
+	! first in memory to rightmost in register
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	andcc	$1, 3, global0
+	bne,pn	%icc, $5
+	nop
+
+	lda	[$1] 0x88, $2
+	add	$1, 4, $4
+
+	ba,pt	%icc, $5a
+	lda	[$4] 0x88, $3
+#endif
+
+$5:
+	ldub	[$1+3], $2
+
+	ldub	[$1+2], $4
+	sll	$2, 8, $2
+	or	$2, $4, $2
+
+	ldub	[$1+1], $4
+	sll	$2, 8, $2
+	or	$2, $4, $2
+
+	ldub	[$1+0], $4
+	sll	$2, 8, $2
+	or	$2, $4, $2
+
+
+	ldub	[$1+3+4], $3
+
+	ldub	[$1+2+4], $4
+	sll	$3, 8, $3
+	or	$3, $4, $3
+
+	ldub	[$1+1+4], $4
+	sll	$3, 8, $3
+	or	$3, $4, $3
+
+	ldub	[$1+0+4], $4
+	sll	$3, 8, $3
+	or	$3, $4, $3
+$5a:
+
+})
+
+
+! {load_little_endian_inc}
+!
+! parameter 1  address
+! parameter 2  destination left
+! parameter 3  destination right
+! parameter 4  temporar
+! parameter 4  label
+!
+! adds 8 to address
+
+define(load_little_endian_inc, {
+
+! {load_little_endian_inc}
+! $1 $2 $3 $4 $5 $6 $7 $8 $9
+
+	! first in memory to rightmost in register
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	andcc	$1, 3, global0
+	bne,pn	%icc, $5
+	nop
+
+	lda	[$1] 0x88, $2
+	add	$1, 4, $1
+
+	lda	[$1] 0x88, $3
+	ba,pt	%icc, $5a
+	add	$1, 4, $1
+#endif
+
+$5:
+	ldub	[$1+3], $2
+
+	ldub	[$1+2], $4
+	sll	$2, 8, $2
+	or	$2, $4, $2
+
+	ldub	[$1+1], $4
+	sll	$2, 8, $2
+	or	$2, $4, $2
+
+	ldub	[$1+0], $4
+	sll	$2, 8, $2
+	or	$2, $4, $2
+
+	ldub	[$1+3+4], $3
+	add	$1, 8, $1
+
+	ldub	[$1+2+4-8], $4
+	sll	$3, 8, $3
+	or	$3, $4, $3
+
+	ldub	[$1+1+4-8], $4
+	sll	$3, 8, $3
+	or	$3, $4, $3
+
+	ldub	[$1+0+4-8], $4
+	sll	$3, 8, $3
+	or	$3, $4, $3
+$5a:
+
+})
+
+
+! {load_n_bytes}
+!
+! Loads 1 to 7 bytes little endian
+! Remaining bytes are zeroed.
+!
+! parameter 1  address
+! parameter 2  length
+! parameter 3  destination register left
+! parameter 4  destination register right
+! parameter 5  temp
+! parameter 6  temp2
+! parameter 7  label
+! parameter 8  return label
+
+define(load_n_bytes, {
+
+! {load_n_bytes}
+! $1 $2 $5 $6 $7 $8 $7 $8 $9
+
+$7.0:	call	.+8
+	sll	$2, 2, $6
+
+	add	%o7,$7.jmp.table-$7.0,$5
+
+	add	$5, $6, $5
+	mov	0, $4
+
+	ld	[$5], $5
+
+	jmp	%o7+$5
+	mov	0, $3
+
+$7.7:
+	ldub	[$1+6], $5
+	sll	$5, 16, $5
+	or	$3, $5, $3
+$7.6:
+	ldub	[$1+5], $5
+	sll	$5, 8, $5
+	or	$3, $5, $3
+$7.5:
+	ldub	[$1+4], $5
+	or	$3, $5, $3
+$7.4:
+	ldub	[$1+3], $5
+	sll	$5, 24, $5
+	or	$4, $5, $4
+$7.3:
+	ldub	[$1+2], $5
+	sll	$5, 16, $5
+	or	$4, $5, $4
+$7.2:
+	ldub	[$1+1], $5
+	sll	$5, 8, $5
+	or	$4, $5, $4
+$7.1:
+	ldub	[$1+0], $5
+	ba	$8
+	or	$4, $5, $4
+
+	.align 4
+
+$7.jmp.table:
+	.word	0
+	.word	$7.1-$7.0
+	.word	$7.2-$7.0
+	.word	$7.3-$7.0
+	.word	$7.4-$7.0
+	.word	$7.5-$7.0
+	.word	$7.6-$7.0
+	.word	$7.7-$7.0
+})
+
+
+! {store_little_endian}
+!
+! parameter 1  address
+! parameter 2  source left
+! parameter 3  source right
+! parameter 4  temporar
+
+define(store_little_endian, {
+
+! {store_little_endian}
+! $1 $2 $3 $4 $5 $6 $7 $8 $9
+
+	! rightmost in register to first in memory
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	andcc	$1, 3, global0
+	bne,pn	%icc, $5
+	nop
+
+	sta	$2, [$1] 0x88
+	add	$1, 4, $4
+
+	ba,pt	%icc, $5a
+	sta	$3, [$4] 0x88
+#endif
+
+$5:
+	and	$2, 255, $4
+	stub	$4, [$1+0]
+
+	srl	$2, 8, $4
+	and	$4, 255, $4
+	stub	$4, [$1+1]
+
+	srl	$2, 16, $4
+	and	$4, 255, $4
+	stub	$4, [$1+2]
+
+	srl	$2, 24, $4
+	stub	$4, [$1+3]
+
+
+	and	$3, 255, $4
+	stub	$4, [$1+0+4]
+
+	srl	$3, 8, $4
+	and	$4, 255, $4
+	stub	$4, [$1+1+4]
+
+	srl	$3, 16, $4
+	and	$4, 255, $4
+	stub	$4, [$1+2+4]
+
+	srl	$3, 24, $4
+	stub	$4, [$1+3+4]
+
+$5a:
+
+})
+
+
+! {store_n_bytes}
+!
+! Stores 1 to 7 bytes little endian
+!
+! parameter 1  address
+! parameter 2  length
+! parameter 3  source register left
+! parameter 4  source register right
+! parameter 5  temp
+! parameter 6  temp2
+! parameter 7  label
+! parameter 8  return label
+
+define(store_n_bytes, {
+
+! {store_n_bytes}
+! $1 $2 $5 $6 $7 $8 $7 $8 $9
+
+$7.0:	call	.+8
+	sll	$2, 2, $6
+
+	add	%o7,$7.jmp.table-$7.0,$5
+
+	add	$5, $6, $5
+
+	ld	[$5], $5
+
+	jmp	%o7+$5
+	nop
+
+$7.7:
+	srl	$3, 16, $5
+	and	$5, 0xff, $5
+	stub	$5, [$1+6]
+$7.6:
+	srl	$3, 8, $5
+	and	$5, 0xff, $5
+	stub	$5, [$1+5]
+$7.5:
+	and	$3, 0xff, $5
+	stub	$5, [$1+4]
+$7.4:
+	srl	$4, 24, $5
+	stub	$5, [$1+3]
+$7.3:
+	srl	$4, 16, $5
+	and	$5, 0xff, $5
+	stub	$5, [$1+2]
+$7.2:
+	srl	$4, 8, $5
+	and	$5, 0xff, $5
+	stub	$5, [$1+1]
+$7.1:
+	and	$4, 0xff, $5
+
+
+	ba	$8
+	stub	$5, [$1]
+
+	.align 4
+
+$7.jmp.table:
+
+	.word	0
+	.word	$7.1-$7.0
+	.word	$7.2-$7.0
+	.word	$7.3-$7.0
+	.word	$7.4-$7.0
+	.word	$7.5-$7.0
+	.word	$7.6-$7.0
+	.word	$7.7-$7.0
+})
+
+
+define(testvalue,{1})
+
+define(register_init, {
+
+! For test purposes:
+
+	sethi	%hi(testvalue), local0
+	or	local0, %lo(testvalue), local0
+
+	ifelse($1,{},{}, {mov	local0, $1})
+	ifelse($2,{},{}, {mov	local0, $2})
+	ifelse($3,{},{}, {mov	local0, $3})
+	ifelse($4,{},{}, {mov	local0, $4})
+	ifelse($5,{},{}, {mov	local0, $5})
+	ifelse($6,{},{}, {mov	local0, $6})
+	ifelse($7,{},{}, {mov	local0, $7})
+	ifelse($8,{},{}, {mov	local0, $8})
+
+	mov	local0, local1
+	mov	local0, local2
+	mov	local0, local3
+	mov	local0, local4
+	mov	local0, local5
+	mov	local0, local7
+	mov	local0, local6
+	mov	local0, out0
+	mov	local0, out1
+	mov	local0, out2
+	mov	local0, out3
+	mov	local0, out4
+	mov	local0, out5
+	mov	local0, global1
+	mov	local0, global2
+	mov	local0, global3
+	mov	local0, global4
+	mov	local0, global5
+
+})
+
+.section	".text"
+
+	.align 32
+
+.des_enc:
+
+	! key address in3
+	! loads key next encryption/decryption first round from [in4]
+
+	rounds_macro(in5, out5, 1, .des_enc.1, in3, in4, retl)
+
+
+	.align 32
+
+.des_dec:
+
+	! implemented with out5 as first parameter to avoid
+	! register exchange in ede modes
+
+	! key address in4
+	! loads key next encryption/decryption first round from [in3]
+
+	rounds_macro(out5, in5, -1, .des_dec.1, in4, in3, retl)
+
+
+
+! void DES_encrypt1(data, ks, enc)
+! *******************************
+
+	.align 32
+	.global DES_encrypt1
+	.type	 DES_encrypt1,#function
+
+DES_encrypt1:
+
+	save	%sp, FRAME, %sp
+
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
+
+	ld	[in0], in5                ! left
+	cmp	in2, 0                    ! enc
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	be,pn	%icc, .encrypt.dec        ! enc/dec
+#else
+	be	.encrypt.dec
+#endif
+	ld	[in0+4], out5             ! right
+
+	! parameter 6  1/2 for include encryption/decryption
+	! parameter 7  1 for move in1 to in3
+	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
+
+	ip_macro(in5, out5, in5, out5, in3, 0, 1, 1)
+
+	rounds_macro(in5, out5, 1, .des_encrypt1.1, in3, in4) ! in4 not used
+
+	fp_macro(in5, out5, 1)            ! 1 for store to [in0]
+
+	ret
+	restore
+
+.encrypt.dec:
+
+	add	in1, 120, in3             ! use last subkey for first round
+
+	! parameter 6  1/2 for include encryption/decryption
+	! parameter 7  1 for move in1 to in3
+	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
+
+	ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include dec,  ks in4
+
+	fp_macro(out5, in5, 1)            ! 1 for store to [in0]
+
+	ret
+	restore
+
+.DES_encrypt1.end:
+	.size	 DES_encrypt1,.DES_encrypt1.end-DES_encrypt1
+
+
+! void DES_encrypt2(data, ks, enc)
+!*********************************
+
+	! encrypts/decrypts without initial/final permutation
+
+	.align 32
+	.global DES_encrypt2
+	.type	 DES_encrypt2,#function
+
+DES_encrypt2:
+
+	save	%sp, FRAME, %sp
+
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
+
+	! Set sbox address 1 to 6 and rotate halfs 3 left
+	! Errors caught by destest? Yes. Still? *NO*
+
+	!sethi	%hi(DES_SPtrans), global1 ! address sbox 1
+
+	!or	global1, %lo(DES_SPtrans), global1  ! sbox 1
+
+	add	global1, 256, global2     ! sbox 2
+	add	global1, 512, global3     ! sbox 3
+
+	ld	[in0], out5               ! right
+	add	global1, 768, global4     ! sbox 4
+	add	global1, 1024, global5    ! sbox 5
+
+	ld	[in0+4], in5              ! left
+	add	global1, 1280, local6     ! sbox 6
+	add	global1, 1792, out3       ! sbox 8
+
+	! rotate
+
+	sll	in5, 3, local5
+	mov	in1, in3                  ! key address to in3
+
+	sll	out5, 3, local7
+	srl	in5, 29, in5
+
+	srl	out5, 29, out5
+	add	in5, local5, in5
+
+	add	out5, local7, out5
+	cmp	in2, 0
+
+	! we use our own stackframe
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	be,pn	%icc, .encrypt2.dec       ! decryption
+#else
+	be	.encrypt2.dec
+#endif
+	STPTR	in0, [%sp+BIAS+ARG0+0*ARGSZ]
+
+	ld	[in3], out0               ! key 7531 first round
+	mov	LOOPS, out4               ! loop counter
+
+	ld	[in3+4], out1             ! key 8642 first round
+	sethi	%hi(0x0000FC00), local5
+
+	call .des_enc
+	mov	in3, in4
+
+	! rotate
+	sll	in5, 29, in0
+	srl	in5, 3, in5
+	sll	out5, 29, in1
+	add	in5, in0, in5
+	srl	out5, 3, out5
+	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0
+	add	out5, in1, out5
+	st	in5, [in0]
+	st	out5, [in0+4]
+
+	ret
+	restore
+
+
+.encrypt2.dec:
+
+	add in3, 120, in4
+
+	ld	[in4], out0               ! key 7531 first round
+	mov	LOOPS, out4               ! loop counter
+
+	ld	[in4+4], out1             ! key 8642 first round
+	sethi	%hi(0x0000FC00), local5
+
+	mov	in5, local1               ! left expected in out5
+	mov	out5, in5
+
+	call .des_dec
+	mov	local1, out5
+
+.encrypt2.finish:
+
+	! rotate
+	sll	in5, 29, in0
+	srl	in5, 3, in5
+	sll	out5, 29, in1
+	add	in5, in0, in5
+	srl	out5, 3, out5
+	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0
+	add	out5, in1, out5
+	st	out5, [in0]
+	st	in5, [in0+4]
+
+	ret
+	restore
+
+.DES_encrypt2.end:
+	.size	 DES_encrypt2, .DES_encrypt2.end-DES_encrypt2
+
+
+! void DES_encrypt3(data, ks1, ks2, ks3)
+! **************************************
+
+	.align 32
+	.global DES_encrypt3
+	.type	 DES_encrypt3,#function
+
+DES_encrypt3:
+
+	save	%sp, FRAME, %sp
+	
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
+
+	ld	[in0], in5                ! left
+	add	in2, 120, in4             ! ks2
+
+	ld	[in0+4], out5             ! right
+	mov	in3, in2                  ! save ks3
+
+	! parameter 6  1/2 for include encryption/decryption
+	! parameter 7  1 for mov in1 to in3
+	! parameter 8  1 for mov in3 to in4
+	! parameter 9  1 for load ks3 and ks2 to in4 and in3
+
+	ip_macro(in5, out5, in5, out5, in3, 1, 1, 0, 0)
+
+	call	.des_dec
+	mov	in2, in3                  ! preload ks3
+
+	call	.des_enc
+	nop
+
+	fp_macro(in5, out5, 1)
+
+	ret
+	restore
+
+.DES_encrypt3.end:
+	.size	 DES_encrypt3,.DES_encrypt3.end-DES_encrypt3
+
+
+! void DES_decrypt3(data, ks1, ks2, ks3)
+! **************************************
+
+	.align 32
+	.global DES_decrypt3
+	.type	 DES_decrypt3,#function
+
+DES_decrypt3:
+
+	save	%sp, FRAME, %sp
+	
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
+
+	ld	[in0], in5                ! left
+	add	in3, 120, in4             ! ks3
+
+	ld	[in0+4], out5             ! right
+	mov	in2, in3                  ! ks2
+
+	! parameter 6  1/2 for include encryption/decryption
+	! parameter 7  1 for mov in1 to in3
+	! parameter 8  1 for mov in3 to in4
+	! parameter 9  1 for load ks3 and ks2 to in4 and in3
+
+	ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 0)
+
+	call	.des_enc
+	add	in1, 120, in4             ! preload ks1
+
+	call	.des_dec
+	nop
+
+	fp_macro(out5, in5, 1)
+
+	ret
+	restore
+
+.DES_decrypt3.end:
+	.size	 DES_decrypt3,.DES_decrypt3.end-DES_decrypt3
+
+! void DES_ncbc_encrypt(input, output, length, schedule, ivec, enc)
+! *****************************************************************
+
+
+	.align 32
+	.global DES_ncbc_encrypt
+	.type	 DES_ncbc_encrypt,#function
+
+DES_ncbc_encrypt:
+
+	save	%sp, FRAME, %sp
+	
+	define({INPUT},  { [%sp+BIAS+ARG0+0*ARGSZ] })
+	define({OUTPUT}, { [%sp+BIAS+ARG0+1*ARGSZ] })
+	define({IVEC},   { [%sp+BIAS+ARG0+4*ARGSZ] })
+
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
+
+	cmp	in5, 0                    ! enc   
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	be,pn	%icc, .ncbc.dec
+#else
+	be	.ncbc.dec
+#endif
+	STPTR	in4, IVEC
+
+	! addr  left  right  temp  label
+	load_little_endian(in4, in5, out5, local3, .LLE1)  ! iv
+
+	addcc	in2, -8, in2              ! bytes missing when first block done
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bl,pn	%icc, .ncbc.enc.seven.or.less
+#else
+	bl	.ncbc.enc.seven.or.less
+#endif
+	mov	in3, in4                  ! schedule
+
+.ncbc.enc.next.block:
+
+	load_little_endian(in0, out4, global4, local3, .LLE2)  ! block
+
+.ncbc.enc.next.block_1:
+
+	xor	in5, out4, in5            ! iv xor
+	xor	out5, global4, out5       ! iv xor
+
+	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
+	ip_macro(in5, out5, in5, out5, in3, 0, 0, 2)
+
+.ncbc.enc.next.block_2:
+
+!//	call .des_enc                     ! compares in2 to 8
+!	rounds inlined for alignment purposes
+
+	add	global1, 768, global4     ! address sbox 4 since register used below
+
+	rounds_macro(in5, out5, 1, .ncbc.enc.1, in3, in4) ! include encryption  ks in3
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bl,pn	%icc, .ncbc.enc.next.block_fp
+#else
+	bl	.ncbc.enc.next.block_fp
+#endif
+	add	in0, 8, in0               ! input address
+
+	! If 8 or more bytes are to be encrypted after this block,
+	! we combine final permutation for this block with initial
+	! permutation for next block. Load next block:
+
+	load_little_endian(in0, global3, global4, local5, .LLE12)
+
+	!  parameter 1   original left
+	!  parameter 2   original right
+	!  parameter 3   left ip
+	!  parameter 4   right ip
+	!  parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4
+	!                2: mov in4 to in3
+	!
+	! also adds -8 to length in2 and loads loop counter to out4
+
+	fp_ip_macro(out0, out1, global3, global4, 2)
+
+	store_little_endian(in1, out0, out1, local3, .SLE10)  ! block
+
+	ld	[in3], out0               ! key 7531 first round next block
+	mov 	in5, local1
+	xor	global3, out5, in5        ! iv xor next block
+
+	ld	[in3+4], out1             ! key 8642
+	add	global1, 512, global3     ! address sbox 3 since register used
+	xor	global4, local1, out5     ! iv xor next block
+
+	ba	.ncbc.enc.next.block_2
+	add	in1, 8, in1               ! output adress
+
+.ncbc.enc.next.block_fp:
+
+	fp_macro(in5, out5)
+
+	store_little_endian(in1, in5, out5, local3, .SLE1)  ! block
+
+	addcc   in2, -8, in2              ! bytes missing when next block done
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bpos,pt	%icc, .ncbc.enc.next.block  ! also jumps if 0
+#else
+	bpos	.ncbc.enc.next.block
+#endif
+	add	in1, 8, in1
+
+.ncbc.enc.seven.or.less:
+
+	cmp	in2, -8
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	ble,pt	%icc, .ncbc.enc.finish
+#else
+	ble	.ncbc.enc.finish
+#endif
+	nop
+
+	add	in2, 8, local1            ! bytes to load
+
+	! addr, length, dest left, dest right, temp, temp2, label, ret label
+	load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB1, .ncbc.enc.next.block_1)
+
+	! Loads 1 to 7 bytes little endian to global4, out4
+
+
+.ncbc.enc.finish:
+
+	LDPTR	IVEC, local4
+	store_little_endian(local4, in5, out5, local5, .SLE2)  ! ivec
+
+	ret
+	restore
+
+
+.ncbc.dec:
+
+	STPTR	in0, INPUT
+	cmp	in2, 0                    ! length
+	add	in3, 120, in3
+
+	LDPTR	IVEC, local7              ! ivec
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	ble,pn	%icc, .ncbc.dec.finish
+#else
+	ble	.ncbc.dec.finish
+#endif
+	mov	in3, in4                  ! schedule
+
+	STPTR	in1, OUTPUT
+	mov	in0, local5               ! input
+
+	load_little_endian(local7, in0, in1, local3, .LLE3)   ! ivec
+
+.ncbc.dec.next.block:
+
+	load_little_endian(local5, in5, out5, local3, .LLE4)  ! block
+
+	! parameter 6  1/2 for include encryption/decryption
+	! parameter 7  1 for mov in1 to in3
+	! parameter 8  1 for mov in3 to in4
+
+	ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include decryprion  ks in4
+
+	fp_macro(out5, in5, 0, 1) ! 1 for input and output address to local5/7
+
+	! in2 is bytes left to be stored
+	! in2 is compared to 8 in the rounds
+
+	xor	out5, in0, out4           ! iv xor
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bl,pn	%icc, .ncbc.dec.seven.or.less
+#else
+	bl	.ncbc.dec.seven.or.less
+#endif
+	xor	in5, in1, global4         ! iv xor
+
+	! Load ivec next block now, since input and output address might be the same.
+
+	load_little_endian_inc(local5, in0, in1, local3, .LLE5)  ! iv
+
+	store_little_endian(local7, out4, global4, local3, .SLE3)
+
+	STPTR	local5, INPUT
+	add	local7, 8, local7
+	addcc   in2, -8, in2
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bg,pt	%icc, .ncbc.dec.next.block
+#else
+	bg	.ncbc.dec.next.block
+#endif
+	STPTR	local7, OUTPUT
+
+
+.ncbc.dec.store.iv:
+
+	LDPTR	IVEC, local4              ! ivec
+	store_little_endian(local4, in0, in1, local5, .SLE4)
+
+.ncbc.dec.finish:
+
+	ret
+	restore
+
+.ncbc.dec.seven.or.less:
+
+	load_little_endian_inc(local5, in0, in1, local3, .LLE13)     ! ivec
+
+	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB1, .ncbc.dec.store.iv)
+
+
+.DES_ncbc_encrypt.end:
+	.size	 DES_ncbc_encrypt, .DES_ncbc_encrypt.end-DES_ncbc_encrypt
+
+
+! void DES_ede3_cbc_encrypt(input, output, lenght, ks1, ks2, ks3, ivec, enc)
+! **************************************************************************
+
+
+	.align 32
+	.global DES_ede3_cbc_encrypt
+	.type	 DES_ede3_cbc_encrypt,#function
+
+DES_ede3_cbc_encrypt:
+
+	save	%sp, FRAME, %sp
+
+	define({KS1}, { [%sp+BIAS+ARG0+3*ARGSZ] })
+	define({KS2}, { [%sp+BIAS+ARG0+4*ARGSZ] })
+	define({KS3}, { [%sp+BIAS+ARG0+5*ARGSZ] })
+
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
+
+	LDPTR	[%fp+BIAS+ARG0+7*ARGSZ], local3          ! enc
+	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
+	cmp	local3, 0                 ! enc
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	be,pn	%icc, .ede3.dec
+#else
+	be	.ede3.dec
+#endif
+	STPTR	in4, KS2
+
+	STPTR	in5, KS3
+
+	load_little_endian(local4, in5, out5, local3, .LLE6)  ! ivec
+
+	addcc	in2, -8, in2              ! bytes missing after next block
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bl,pn	%icc,  .ede3.enc.seven.or.less
+#else
+	bl	.ede3.enc.seven.or.less
+#endif
+	STPTR	in3, KS1
+
+.ede3.enc.next.block:
+
+	load_little_endian(in0, out4, global4, local3, .LLE7)
+
+.ede3.enc.next.block_1:
+
+	LDPTR	KS2, in4
+	xor	in5, out4, in5            ! iv xor
+	xor	out5, global4, out5       ! iv xor
+
+	LDPTR	KS1, in3
+	add	in4, 120, in4             ! for decryption we use last subkey first
+	nop
+
+	ip_macro(in5, out5, in5, out5, in3)
+
+.ede3.enc.next.block_2:
+
+	call .des_enc                     ! ks1 in3
+	nop
+
+	call .des_dec                     ! ks2 in4
+	LDPTR	KS3, in3
+
+	call .des_enc                     ! ks3 in3  compares in2 to 8
+	nop
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bl,pn	%icc, .ede3.enc.next.block_fp
+#else
+	bl	.ede3.enc.next.block_fp
+#endif
+	add	in0, 8, in0
+
+	! If 8 or more bytes are to be encrypted after this block,
+	! we combine final permutation for this block with initial
+	! permutation for next block. Load next block:
+
+	load_little_endian(in0, global3, global4, local5, .LLE11)
+
+	!  parameter 1   original left
+	!  parameter 2   original right
+	!  parameter 3   left ip
+	!  parameter 4   right ip
+	!  parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4
+	!                2: mov in4 to in3
+	!
+	! also adds -8 to length in2 and loads loop counter to out4
+
+	fp_ip_macro(out0, out1, global3, global4, 1)
+
+	store_little_endian(in1, out0, out1, local3, .SLE9)  ! block
+
+	mov 	in5, local1
+	xor	global3, out5, in5        ! iv xor next block
+
+	ld	[in3], out0               ! key 7531
+	add	global1, 512, global3     ! address sbox 3
+	xor	global4, local1, out5     ! iv xor next block
+
+	ld	[in3+4], out1             ! key 8642
+	add	global1, 768, global4     ! address sbox 4
+	ba	.ede3.enc.next.block_2
+	add	in1, 8, in1
+
+.ede3.enc.next.block_fp:
+
+	fp_macro(in5, out5)
+
+	store_little_endian(in1, in5, out5, local3, .SLE5)  ! block
+
+	addcc   in2, -8, in2              ! bytes missing when next block done
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bpos,pt	%icc, .ede3.enc.next.block
+#else
+	bpos	.ede3.enc.next.block
+#endif
+	add	in1, 8, in1
+
+.ede3.enc.seven.or.less:
+
+	cmp	in2, -8
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	ble,pt	%icc, .ede3.enc.finish
+#else
+	ble	.ede3.enc.finish
+#endif
+	nop
+
+	add	in2, 8, local1            ! bytes to load
+
+	! addr, length, dest left, dest right, temp, temp2, label, ret label
+	load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB2, .ede3.enc.next.block_1)
+
+.ede3.enc.finish:
+
+	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
+	store_little_endian(local4, in5, out5, local5, .SLE6)  ! ivec
+
+	ret
+	restore
+
+.ede3.dec:
+
+	STPTR	in0, INPUT
+	add	in5, 120, in5
+
+	STPTR	in1, OUTPUT
+	mov	in0, local5
+	add	in3, 120, in3
+
+	STPTR	in3, KS1
+	cmp	in2, 0
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	ble	%icc, .ede3.dec.finish
+#else
+	ble	.ede3.dec.finish
+#endif
+	STPTR	in5, KS3
+
+	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local7          ! iv
+	load_little_endian(local7, in0, in1, local3, .LLE8)
+
+.ede3.dec.next.block:
+
+	load_little_endian(local5, in5, out5, local3, .LLE9)
+
+	! parameter 6  1/2 for include encryption/decryption
+	! parameter 7  1 for mov in1 to in3
+	! parameter 8  1 for mov in3 to in4
+	! parameter 9  1 for load ks3 and ks2 to in4 and in3
+
+	ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 1) ! inc .des_dec ks3 in4
+
+	call .des_enc                     ! ks2 in3
+	LDPTR	KS1, in4
+
+	call .des_dec                     ! ks1 in4
+	nop
+
+	fp_macro(out5, in5, 0, 1)   ! 1 for input and output address local5/7
+
+	! in2 is bytes left to be stored
+	! in2 is compared to 8 in the rounds
+
+	xor	out5, in0, out4
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bl,pn	%icc, .ede3.dec.seven.or.less
+#else
+	bl	.ede3.dec.seven.or.less
+#endif
+	xor	in5, in1, global4
+
+	load_little_endian_inc(local5, in0, in1, local3, .LLE10)   ! iv next block
+
+	store_little_endian(local7, out4, global4, local3, .SLE7)  ! block
+
+	STPTR	local5, INPUT
+	addcc   in2, -8, in2
+	add	local7, 8, local7
+
+#ifdef OPENSSL_SYSNAME_ULTRASPARC
+	bg,pt	%icc, .ede3.dec.next.block
+#else
+	bg	.ede3.dec.next.block
+#endif
+	STPTR	local7, OUTPUT
+
+.ede3.dec.store.iv:
+
+	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
+	store_little_endian(local4, in0, in1, local5, .SLE8)  ! ivec
+
+.ede3.dec.finish:
+
+	ret
+	restore
+
+.ede3.dec.seven.or.less:
+
+	load_little_endian_inc(local5, in0, in1, local3, .LLE14)     ! iv
+
+	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB2, .ede3.dec.store.iv)
+
+
+.DES_ede3_cbc_encrypt.end:
+	.size	 DES_ede3_cbc_encrypt,.DES_ede3_cbc_encrypt.end-DES_ede3_cbc_encrypt
+
+	.align	256
+	.type	 .des_and,#object
+	.size	 .des_and,284
+
+.des_and:
+
+! This table is used for AND 0xFC when it is known that register
+! bits 8-31 are zero. Makes it possible to do three arithmetic
+! operations in one cycle.
+
+	.byte  0, 0, 0, 0, 4, 4, 4, 4
+	.byte  8, 8, 8, 8, 12, 12, 12, 12
+	.byte  16, 16, 16, 16, 20, 20, 20, 20
+	.byte  24, 24, 24, 24, 28, 28, 28, 28
+	.byte  32, 32, 32, 32, 36, 36, 36, 36
+	.byte  40, 40, 40, 40, 44, 44, 44, 44
+	.byte  48, 48, 48, 48, 52, 52, 52, 52
+	.byte  56, 56, 56, 56, 60, 60, 60, 60
+	.byte  64, 64, 64, 64, 68, 68, 68, 68
+	.byte  72, 72, 72, 72, 76, 76, 76, 76
+	.byte  80, 80, 80, 80, 84, 84, 84, 84
+	.byte  88, 88, 88, 88, 92, 92, 92, 92
+	.byte  96, 96, 96, 96, 100, 100, 100, 100
+	.byte  104, 104, 104, 104, 108, 108, 108, 108
+	.byte  112, 112, 112, 112, 116, 116, 116, 116
+	.byte  120, 120, 120, 120, 124, 124, 124, 124
+	.byte  128, 128, 128, 128, 132, 132, 132, 132
+	.byte  136, 136, 136, 136, 140, 140, 140, 140
+	.byte  144, 144, 144, 144, 148, 148, 148, 148
+	.byte  152, 152, 152, 152, 156, 156, 156, 156
+	.byte  160, 160, 160, 160, 164, 164, 164, 164
+	.byte  168, 168, 168, 168, 172, 172, 172, 172
+	.byte  176, 176, 176, 176, 180, 180, 180, 180
+	.byte  184, 184, 184, 184, 188, 188, 188, 188
+	.byte  192, 192, 192, 192, 196, 196, 196, 196
+	.byte  200, 200, 200, 200, 204, 204, 204, 204
+	.byte  208, 208, 208, 208, 212, 212, 212, 212
+	.byte  216, 216, 216, 216, 220, 220, 220, 220
+	.byte  224, 224, 224, 224, 228, 228, 228, 228
+	.byte  232, 232, 232, 232, 236, 236, 236, 236
+	.byte  240, 240, 240, 240, 244, 244, 244, 244
+	.byte  248, 248, 248, 248, 252, 252, 252, 252
+
+	! 5 numbers for initil/final permutation
+
+	.word   0x0f0f0f0f                ! offset 256
+	.word	0x0000ffff                ! 260
+	.word	0x33333333                ! 264
+	.word	0x00ff00ff                ! 268
+	.word	0x55555555                ! 272
+
+	.word	0                         ! 276
+	.word	LOOPS                     ! 280
+	.word	0x0000FC00                ! 284
+
+	.global	DES_SPtrans
+	.type	DES_SPtrans,#object
+	.size	DES_SPtrans,2048
+.align	64
+DES_SPtrans:
+.PIC.DES_SPtrans:
+	! nibble 0
+	.word	0x02080800, 0x00080000, 0x02000002, 0x02080802
+	.word	0x02000000, 0x00080802, 0x00080002, 0x02000002
+	.word	0x00080802, 0x02080800, 0x02080000, 0x00000802
+	.word	0x02000802, 0x02000000, 0x00000000, 0x00080002
+	.word	0x00080000, 0x00000002, 0x02000800, 0x00080800
+	.word	0x02080802, 0x02080000, 0x00000802, 0x02000800
+	.word	0x00000002, 0x00000800, 0x00080800, 0x02080002
+	.word	0x00000800, 0x02000802, 0x02080002, 0x00000000
+	.word	0x00000000, 0x02080802, 0x02000800, 0x00080002
+	.word	0x02080800, 0x00080000, 0x00000802, 0x02000800
+	.word	0x02080002, 0x00000800, 0x00080800, 0x02000002
+	.word	0x00080802, 0x00000002, 0x02000002, 0x02080000
+	.word	0x02080802, 0x00080800, 0x02080000, 0x02000802
+	.word	0x02000000, 0x00000802, 0x00080002, 0x00000000
+	.word	0x00080000, 0x02000000, 0x02000802, 0x02080800
+	.word	0x00000002, 0x02080002, 0x00000800, 0x00080802
+	! nibble 1
+	.word	0x40108010, 0x00000000, 0x00108000, 0x40100000
+	.word	0x40000010, 0x00008010, 0x40008000, 0x00108000
+	.word	0x00008000, 0x40100010, 0x00000010, 0x40008000
+	.word	0x00100010, 0x40108000, 0x40100000, 0x00000010
+	.word	0x00100000, 0x40008010, 0x40100010, 0x00008000
+	.word	0x00108010, 0x40000000, 0x00000000, 0x00100010
+	.word	0x40008010, 0x00108010, 0x40108000, 0x40000010
+	.word	0x40000000, 0x00100000, 0x00008010, 0x40108010
+	.word	0x00100010, 0x40108000, 0x40008000, 0x00108010
+	.word	0x40108010, 0x00100010, 0x40000010, 0x00000000
+	.word	0x40000000, 0x00008010, 0x00100000, 0x40100010
+	.word	0x00008000, 0x40000000, 0x00108010, 0x40008010
+	.word	0x40108000, 0x00008000, 0x00000000, 0x40000010
+	.word	0x00000010, 0x40108010, 0x00108000, 0x40100000
+	.word	0x40100010, 0x00100000, 0x00008010, 0x40008000
+	.word	0x40008010, 0x00000010, 0x40100000, 0x00108000
+	! nibble 2
+	.word	0x04000001, 0x04040100, 0x00000100, 0x04000101
+	.word	0x00040001, 0x04000000, 0x04000101, 0x00040100
+	.word	0x04000100, 0x00040000, 0x04040000, 0x00000001
+	.word	0x04040101, 0x00000101, 0x00000001, 0x04040001
+	.word	0x00000000, 0x00040001, 0x04040100, 0x00000100
+	.word	0x00000101, 0x04040101, 0x00040000, 0x04000001
+	.word	0x04040001, 0x04000100, 0x00040101, 0x04040000
+	.word	0x00040100, 0x00000000, 0x04000000, 0x00040101
+	.word	0x04040100, 0x00000100, 0x00000001, 0x00040000
+	.word	0x00000101, 0x00040001, 0x04040000, 0x04000101
+	.word	0x00000000, 0x04040100, 0x00040100, 0x04040001
+	.word	0x00040001, 0x04000000, 0x04040101, 0x00000001
+	.word	0x00040101, 0x04000001, 0x04000000, 0x04040101
+	.word	0x00040000, 0x04000100, 0x04000101, 0x00040100
+	.word	0x04000100, 0x00000000, 0x04040001, 0x00000101
+	.word	0x04000001, 0x00040101, 0x00000100, 0x04040000
+	! nibble 3
+	.word	0x00401008, 0x10001000, 0x00000008, 0x10401008
+	.word	0x00000000, 0x10400000, 0x10001008, 0x00400008
+	.word	0x10401000, 0x10000008, 0x10000000, 0x00001008
+	.word	0x10000008, 0x00401008, 0x00400000, 0x10000000
+	.word	0x10400008, 0x00401000, 0x00001000, 0x00000008
+	.word	0x00401000, 0x10001008, 0x10400000, 0x00001000
+	.word	0x00001008, 0x00000000, 0x00400008, 0x10401000
+	.word	0x10001000, 0x10400008, 0x10401008, 0x00400000
+	.word	0x10400008, 0x00001008, 0x00400000, 0x10000008
+	.word	0x00401000, 0x10001000, 0x00000008, 0x10400000
+	.word	0x10001008, 0x00000000, 0x00001000, 0x00400008
+	.word	0x00000000, 0x10400008, 0x10401000, 0x00001000
+	.word	0x10000000, 0x10401008, 0x00401008, 0x00400000
+	.word	0x10401008, 0x00000008, 0x10001000, 0x00401008
+	.word	0x00400008, 0x00401000, 0x10400000, 0x10001008
+	.word	0x00001008, 0x10000000, 0x10000008, 0x10401000
+	! nibble 4
+	.word	0x08000000, 0x00010000, 0x00000400, 0x08010420
+	.word	0x08010020, 0x08000400, 0x00010420, 0x08010000
+	.word	0x00010000, 0x00000020, 0x08000020, 0x00010400
+	.word	0x08000420, 0x08010020, 0x08010400, 0x00000000
+	.word	0x00010400, 0x08000000, 0x00010020, 0x00000420
+	.word	0x08000400, 0x00010420, 0x00000000, 0x08000020
+	.word	0x00000020, 0x08000420, 0x08010420, 0x00010020
+	.word	0x08010000, 0x00000400, 0x00000420, 0x08010400
+	.word	0x08010400, 0x08000420, 0x00010020, 0x08010000
+	.word	0x00010000, 0x00000020, 0x08000020, 0x08000400
+	.word	0x08000000, 0x00010400, 0x08010420, 0x00000000
+	.word	0x00010420, 0x08000000, 0x00000400, 0x00010020
+	.word	0x08000420, 0x00000400, 0x00000000, 0x08010420
+	.word	0x08010020, 0x08010400, 0x00000420, 0x00010000
+	.word	0x00010400, 0x08010020, 0x08000400, 0x00000420
+	.word	0x00000020, 0x00010420, 0x08010000, 0x08000020
+	! nibble 5
+	.word	0x80000040, 0x00200040, 0x00000000, 0x80202000
+	.word	0x00200040, 0x00002000, 0x80002040, 0x00200000
+	.word	0x00002040, 0x80202040, 0x00202000, 0x80000000
+	.word	0x80002000, 0x80000040, 0x80200000, 0x00202040
+	.word	0x00200000, 0x80002040, 0x80200040, 0x00000000
+	.word	0x00002000, 0x00000040, 0x80202000, 0x80200040
+	.word	0x80202040, 0x80200000, 0x80000000, 0x00002040
+	.word	0x00000040, 0x00202000, 0x00202040, 0x80002000
+	.word	0x00002040, 0x80000000, 0x80002000, 0x00202040
+	.word	0x80202000, 0x00200040, 0x00000000, 0x80002000
+	.word	0x80000000, 0x00002000, 0x80200040, 0x00200000
+	.word	0x00200040, 0x80202040, 0x00202000, 0x00000040
+	.word	0x80202040, 0x00202000, 0x00200000, 0x80002040
+	.word	0x80000040, 0x80200000, 0x00202040, 0x00000000
+	.word	0x00002000, 0x80000040, 0x80002040, 0x80202000
+	.word	0x80200000, 0x00002040, 0x00000040, 0x80200040
+	! nibble 6
+	.word	0x00004000, 0x00000200, 0x01000200, 0x01000004
+	.word	0x01004204, 0x00004004, 0x00004200, 0x00000000
+	.word	0x01000000, 0x01000204, 0x00000204, 0x01004000
+	.word	0x00000004, 0x01004200, 0x01004000, 0x00000204
+	.word	0x01000204, 0x00004000, 0x00004004, 0x01004204
+	.word	0x00000000, 0x01000200, 0x01000004, 0x00004200
+	.word	0x01004004, 0x00004204, 0x01004200, 0x00000004
+	.word	0x00004204, 0x01004004, 0x00000200, 0x01000000
+	.word	0x00004204, 0x01004000, 0x01004004, 0x00000204
+	.word	0x00004000, 0x00000200, 0x01000000, 0x01004004
+	.word	0x01000204, 0x00004204, 0x00004200, 0x00000000
+	.word	0x00000200, 0x01000004, 0x00000004, 0x01000200
+	.word	0x00000000, 0x01000204, 0x01000200, 0x00004200
+	.word	0x00000204, 0x00004000, 0x01004204, 0x01000000
+	.word	0x01004200, 0x00000004, 0x00004004, 0x01004204
+	.word	0x01000004, 0x01004200, 0x01004000, 0x00004004
+	! nibble 7
+	.word	0x20800080, 0x20820000, 0x00020080, 0x00000000
+	.word	0x20020000, 0x00800080, 0x20800000, 0x20820080
+	.word	0x00000080, 0x20000000, 0x00820000, 0x00020080
+	.word	0x00820080, 0x20020080, 0x20000080, 0x20800000
+	.word	0x00020000, 0x00820080, 0x00800080, 0x20020000
+	.word	0x20820080, 0x20000080, 0x00000000, 0x00820000
+	.word	0x20000000, 0x00800000, 0x20020080, 0x20800080
+	.word	0x00800000, 0x00020000, 0x20820000, 0x00000080
+	.word	0x00800000, 0x00020000, 0x20000080, 0x20820080
+	.word	0x00020080, 0x20000000, 0x00000000, 0x00820000
+	.word	0x20800080, 0x20020080, 0x20020000, 0x00800080
+	.word	0x20820000, 0x00000080, 0x00800080, 0x20020000
+	.word	0x20820080, 0x00800000, 0x20800000, 0x20000080
+	.word	0x00820000, 0x00020080, 0x20020080, 0x20800000
+	.word	0x00000080, 0x20820000, 0x00820080, 0x00000000
+	.word	0x20000000, 0x20800080, 0x00020000, 0x00820080
+
diff --git a/openssl/crypto/des/asm/desboth.pl b/openssl/crypto/des/asm/desboth.pl
new file mode 100644
index 0000000..eec0088
--- /dev/null
+++ b/openssl/crypto/des/asm/desboth.pl
@@ -0,0 +1,79 @@
+#!/usr/local/bin/perl
+
+$L="edi";
+$R="esi";
+
+sub DES_encrypt3
+	{
+	local($name,$enc)=@_;
+
+	&function_begin_B($name,"");
+	&push("ebx");
+	&mov("ebx",&wparam(0));
+
+	&push("ebp");
+	&push("esi");
+
+	&push("edi");
+
+	&comment("");
+	&comment("Load the data words");
+	&mov($L,&DWP(0,"ebx","",0));
+	&mov($R,&DWP(4,"ebx","",0));
+	&stack_push(3);
+
+	&comment("");
+	&comment("IP");
+	&IP_new($L,$R,"edx",0);
+
+	# put them back
+	
+	if ($enc)
+		{
+		&mov(&DWP(4,"ebx","",0),$R);
+		 &mov("eax",&wparam(1));
+		&mov(&DWP(0,"ebx","",0),"edx");
+		 &mov("edi",&wparam(2));
+		 &mov("esi",&wparam(3));
+		}
+	else
+		{
+		&mov(&DWP(4,"ebx","",0),$R);
+		 &mov("esi",&wparam(1));
+		&mov(&DWP(0,"ebx","",0),"edx");
+		 &mov("edi",&wparam(2));
+		 &mov("eax",&wparam(3));
+		}
+	&mov(&swtmp(2),	(DWC(($enc)?"1":"0")));
+	&mov(&swtmp(1),	"eax");
+	&mov(&swtmp(0),	"ebx");
+	&call("DES_encrypt2");
+	&mov(&swtmp(2),	(DWC(($enc)?"0":"1")));
+	&mov(&swtmp(1),	"edi");
+	&mov(&swtmp(0),	"ebx");
+	&call("DES_encrypt2");
+	&mov(&swtmp(2),	(DWC(($enc)?"1":"0")));
+	&mov(&swtmp(1),	"esi");
+	&mov(&swtmp(0),	"ebx");
+	&call("DES_encrypt2");
+
+	&stack_pop(3);
+	&mov($L,&DWP(0,"ebx","",0));
+	&mov($R,&DWP(4,"ebx","",0));
+
+	&comment("");
+	&comment("FP");
+	&FP_new($L,$R,"eax",0);
+
+	&mov(&DWP(0,"ebx","",0),"eax");
+	&mov(&DWP(4,"ebx","",0),$R);
+
+	&pop("edi");
+	&pop("esi");
+	&pop("ebp");
+	&pop("ebx");
+	&ret();
+	&function_end_B($name);
+	}
+
+
diff --git a/openssl/crypto/des/asm/dest4-sparcv9.pl b/openssl/crypto/des/asm/dest4-sparcv9.pl
new file mode 100644
index 0000000..5f3a511
--- /dev/null
+++ b/openssl/crypto/des/asm/dest4-sparcv9.pl
@@ -0,0 +1,617 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
+# <appro@openssl.org>. The module is licensed under 2-clause BSD
+# license. March 2013. All rights reserved.
+# ====================================================================
+
+######################################################################
+# DES for SPARC T4.
+#
+# As with other hardware-assisted ciphers CBC encrypt results [for
+# aligned data] are virtually identical to critical path lengths:
+#
+#		DES		Triple-DES
+# CBC encrypt	4.14/4.15(*)	11.7/11.7
+# CBC decrypt	1.77/4.11(**)	6.42/7.47
+#
+#			 (*)	numbers after slash are for
+#				misaligned data;
+#			 (**)	this is result for largest
+#				block size, unlike all other
+#				cases smaller blocks results
+#				are better[?];
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+&asm_init(@ARGV);
+
+$code.=<<___ if ($::abibits==64);
+.register       %g2,#scratch
+.register       %g3,#scratch
+___
+
+$code.=<<___;
+.text
+___
+
+{ my ($inp,$out)=("%o0","%o1");
+
+$code.=<<___;
+.align	32
+.globl	des_t4_key_expand
+.type	des_t4_key_expand,#function
+des_t4_key_expand:
+	andcc		$inp, 0x7, %g0
+	alignaddr	$inp, %g0, $inp
+	bz,pt		%icc, 1f
+	ldd		[$inp + 0x00], %f0
+	ldd		[$inp + 0x08], %f2
+	faligndata	%f0, %f2, %f0
+1:	des_kexpand	%f0, 0, %f0
+	des_kexpand	%f0, 1, %f2
+	std		%f0, [$out + 0x00]
+	des_kexpand	%f2, 3, %f6
+	std		%f2, [$out + 0x08]
+	des_kexpand	%f2, 2, %f4
+	des_kexpand	%f6, 3, %f10
+	std		%f6, [$out + 0x18]
+	des_kexpand	%f6, 2, %f8
+	std		%f4, [$out + 0x10]
+	des_kexpand	%f10, 3, %f14
+	std		%f10, [$out + 0x28]
+	des_kexpand	%f10, 2, %f12
+	std		%f8, [$out + 0x20]
+	des_kexpand	%f14, 1, %f16
+	std		%f14, [$out + 0x38]
+	des_kexpand	%f16, 3, %f20
+	std		%f12, [$out + 0x30]
+	des_kexpand	%f16, 2, %f18
+	std		%f16, [$out + 0x40]
+	des_kexpand	%f20, 3, %f24
+	std		%f20, [$out + 0x50]
+	des_kexpand	%f20, 2, %f22
+	std		%f18, [$out + 0x48]
+	des_kexpand	%f24, 3, %f28
+	std		%f24, [$out + 0x60]
+	des_kexpand	%f24, 2, %f26
+	std		%f22, [$out + 0x58]
+	des_kexpand	%f28, 1, %f30
+	std		%f28, [$out + 0x70]
+	std		%f26, [$out + 0x68]
+	retl
+	std		%f30, [$out + 0x78]
+.size	des_t4_key_expand,.-des_t4_key_expand
+___
+}
+{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
+  my ($ileft,$iright,$omask) = map("%g$_",(1..3));
+
+$code.=<<___;
+.globl	des_t4_cbc_encrypt
+.align	32
+des_t4_cbc_encrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f0	! load ivec
+	ld		[$ivec + 4], %f1
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x00], %f4	! load key schedule
+	ldd		[$key + 0x08], %f6
+	ldd		[$key + 0x10], %f8
+	ldd		[$key + 0x18], %f10
+	ldd		[$key + 0x20], %f12
+	ldd		[$key + 0x28], %f14
+	ldd		[$key + 0x30], %f16
+	ldd		[$key + 0x38], %f18
+	ldd		[$key + 0x40], %f20
+	ldd		[$key + 0x48], %f22
+	ldd		[$key + 0x50], %f24
+	ldd		[$key + 0x58], %f26
+	ldd		[$key + 0x60], %f28
+	ldd		[$key + 0x68], %f30
+	ldd		[$key + 0x70], %f32
+	ldd		[$key + 0x78], %f34
+
+.Ldes_cbc_enc_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f2
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	fxor		%f2, %f0, %f0		! ^= ivec
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	des_round	%f20, %f22, %f0, %f0
+	des_round	%f24, %f26, %f0, %f0
+	des_round	%f28, %f30, %f0, %f0
+	des_round	%f32, %f34, %f0, %f0
+	des_iip		%f0, %f0
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_cbc_enc_loop
+	add		$out, 8, $out
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+.Lcbc_abort:
+	retl
+	nop
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~4x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f2		! handle unaligned output
+
+	stda		%f2, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f2, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_cbc_enc_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+.type	des_t4_cbc_encrypt,#function
+.size	des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
+
+.globl	des_t4_cbc_decrypt
+.align	32
+des_t4_cbc_decrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f2	! load ivec
+	ld		[$ivec + 4], %f3
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x78], %f4	! load key schedule
+	ldd		[$key + 0x70], %f6
+	ldd		[$key + 0x68], %f8
+	ldd		[$key + 0x60], %f10
+	ldd		[$key + 0x58], %f12
+	ldd		[$key + 0x50], %f14
+	ldd		[$key + 0x48], %f16
+	ldd		[$key + 0x40], %f18
+	ldd		[$key + 0x38], %f20
+	ldd		[$key + 0x30], %f22
+	ldd		[$key + 0x28], %f24
+	ldd		[$key + 0x20], %f26
+	ldd		[$key + 0x18], %f28
+	ldd		[$key + 0x10], %f30
+	ldd		[$key + 0x08], %f32
+	ldd		[$key + 0x00], %f34
+
+.Ldes_cbc_dec_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f0
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	des_round	%f20, %f22, %f0, %f0
+	des_round	%f24, %f26, %f0, %f0
+	des_round	%f28, %f30, %f0, %f0
+	des_round	%f32, %f34, %f0, %f0
+	des_iip		%f0, %f0
+
+	fxor		%f2, %f0, %f0		! ^= ivec
+	movxtod		%g4, %f2
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_cbc_dec_loop
+	add		$out, 8, $out
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~4x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f0		! handle unaligned output
+
+	stda		%f0, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f0, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_cbc_dec_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+.type	des_t4_cbc_decrypt,#function
+.size	des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
+___
+
+# One might wonder why does one have back-to-back des_iip/des_ip
+# pairs between EDE passes. Indeed, aren't they inverse of each other?
+# They almost are. Outcome of the pair is 32-bit words being swapped
+# in target register. Consider pair of des_iip/des_ip as a way to
+# perform the due swap, it's actually fastest way in this case.
+
+$code.=<<___;
+.globl	des_t4_ede3_cbc_encrypt
+.align	32
+des_t4_ede3_cbc_encrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f0	! load ivec
+	ld		[$ivec + 4], %f1
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x00], %f4	! load key schedule
+	ldd		[$key + 0x08], %f6
+	ldd		[$key + 0x10], %f8
+	ldd		[$key + 0x18], %f10
+	ldd		[$key + 0x20], %f12
+	ldd		[$key + 0x28], %f14
+	ldd		[$key + 0x30], %f16
+	ldd		[$key + 0x38], %f18
+	ldd		[$key + 0x40], %f20
+	ldd		[$key + 0x48], %f22
+	ldd		[$key + 0x50], %f24
+	ldd		[$key + 0x58], %f26
+	ldd		[$key + 0x60], %f28
+	ldd		[$key + 0x68], %f30
+	ldd		[$key + 0x70], %f32
+	ldd		[$key + 0x78], %f34
+
+.Ldes_ede3_cbc_enc_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f2
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	fxor		%f2, %f0, %f0		! ^= ivec
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	ldd		[$key + 0x100-0x08], %f36
+	ldd		[$key + 0x100-0x10], %f38
+	des_round	%f20, %f22, %f0, %f0
+	ldd		[$key + 0x100-0x18], %f40
+	ldd		[$key + 0x100-0x20], %f42
+	des_round	%f24, %f26, %f0, %f0
+	ldd		[$key + 0x100-0x28], %f44
+	ldd		[$key + 0x100-0x30], %f46
+	des_round	%f28, %f30, %f0, %f0
+	ldd		[$key + 0x100-0x38], %f48
+	ldd		[$key + 0x100-0x40], %f50
+	des_round	%f32, %f34, %f0, %f0
+	ldd		[$key + 0x100-0x48], %f52
+	ldd		[$key + 0x100-0x50], %f54
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x100-0x58], %f56
+	ldd		[$key + 0x100-0x60], %f58
+	des_ip		%f0, %f0
+	ldd		[$key + 0x100-0x68], %f60
+	ldd		[$key + 0x100-0x70], %f62
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x100-0x78], %f36
+	ldd		[$key + 0x100-0x80], %f38
+	des_round	%f40, %f42, %f0, %f0
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	ldd		[$key + 0x100+0x00], %f40
+	ldd		[$key + 0x100+0x08], %f42
+	des_round	%f52, %f54, %f0, %f0
+	ldd		[$key + 0x100+0x10], %f44
+	ldd		[$key + 0x100+0x18], %f46
+	des_round	%f56, %f58, %f0, %f0
+	ldd		[$key + 0x100+0x20], %f48
+	ldd		[$key + 0x100+0x28], %f50
+	des_round	%f60, %f62, %f0, %f0
+	ldd		[$key + 0x100+0x30], %f52
+	ldd		[$key + 0x100+0x38], %f54
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x100+0x40], %f56
+	ldd		[$key + 0x100+0x48], %f58
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x100+0x50], %f60
+	ldd		[$key + 0x100+0x58], %f62
+	des_ip		%f0, %f0
+	ldd		[$key + 0x100+0x60], %f36
+	ldd		[$key + 0x100+0x68], %f38
+	des_round	%f40, %f42, %f0, %f0
+	ldd		[$key + 0x100+0x70], %f40
+	ldd		[$key + 0x100+0x78], %f42
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	des_round	%f52, %f54, %f0, %f0
+	des_round	%f56, %f58, %f0, %f0
+	des_round	%f60, %f62, %f0, %f0
+	des_round	%f36, %f38, %f0, %f0
+	des_round	%f40, %f42, %f0, %f0
+	des_iip		%f0, %f0
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop
+	add		$out, 8, $out
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~2x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f2		! handle unaligned output
+
+	stda		%f2, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f2, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+.type	des_t4_ede3_cbc_encrypt,#function
+.size	des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
+
+.globl	des_t4_ede3_cbc_decrypt
+.align	32
+des_t4_ede3_cbc_decrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f2	! load ivec
+	ld		[$ivec + 4], %f3
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x100+0x78], %f4	! load key schedule
+	ldd		[$key + 0x100+0x70], %f6
+	ldd		[$key + 0x100+0x68], %f8
+	ldd		[$key + 0x100+0x60], %f10
+	ldd		[$key + 0x100+0x58], %f12
+	ldd		[$key + 0x100+0x50], %f14
+	ldd		[$key + 0x100+0x48], %f16
+	ldd		[$key + 0x100+0x40], %f18
+	ldd		[$key + 0x100+0x38], %f20
+	ldd		[$key + 0x100+0x30], %f22
+	ldd		[$key + 0x100+0x28], %f24
+	ldd		[$key + 0x100+0x20], %f26
+	ldd		[$key + 0x100+0x18], %f28
+	ldd		[$key + 0x100+0x10], %f30
+	ldd		[$key + 0x100+0x08], %f32
+	ldd		[$key + 0x100+0x00], %f34
+
+.Ldes_ede3_cbc_dec_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f0
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	ldd		[$key + 0x80+0x00], %f36
+	ldd		[$key + 0x80+0x08], %f38
+	des_round	%f20, %f22, %f0, %f0
+	ldd		[$key + 0x80+0x10], %f40
+	ldd		[$key + 0x80+0x18], %f42
+	des_round	%f24, %f26, %f0, %f0
+	ldd		[$key + 0x80+0x20], %f44
+	ldd		[$key + 0x80+0x28], %f46
+	des_round	%f28, %f30, %f0, %f0
+	ldd		[$key + 0x80+0x30], %f48
+	ldd		[$key + 0x80+0x38], %f50
+	des_round	%f32, %f34, %f0, %f0
+	ldd		[$key + 0x80+0x40], %f52
+	ldd		[$key + 0x80+0x48], %f54
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x80+0x50], %f56
+	ldd		[$key + 0x80+0x58], %f58
+	des_ip		%f0, %f0
+	ldd		[$key + 0x80+0x60], %f60
+	ldd		[$key + 0x80+0x68], %f62
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x80+0x70], %f36
+	ldd		[$key + 0x80+0x78], %f38
+	des_round	%f40, %f42, %f0, %f0
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	ldd		[$key + 0x80-0x08], %f40
+	ldd		[$key + 0x80-0x10], %f42
+	des_round	%f52, %f54, %f0, %f0
+	ldd		[$key + 0x80-0x18], %f44
+	ldd		[$key + 0x80-0x20], %f46
+	des_round	%f56, %f58, %f0, %f0
+	ldd		[$key + 0x80-0x28], %f48
+	ldd		[$key + 0x80-0x30], %f50
+	des_round	%f60, %f62, %f0, %f0
+	ldd		[$key + 0x80-0x38], %f52
+	ldd		[$key + 0x80-0x40], %f54
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x80-0x48], %f56
+	ldd		[$key + 0x80-0x50], %f58
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x80-0x58], %f60
+	ldd		[$key + 0x80-0x60], %f62
+	des_ip		%f0, %f0
+	ldd		[$key + 0x80-0x68], %f36
+	ldd		[$key + 0x80-0x70], %f38
+	des_round	%f40, %f42, %f0, %f0
+	ldd		[$key + 0x80-0x78], %f40
+	ldd		[$key + 0x80-0x80], %f42
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	des_round	%f52, %f54, %f0, %f0
+	des_round	%f56, %f58, %f0, %f0
+	des_round	%f60, %f62, %f0, %f0
+	des_round	%f36, %f38, %f0, %f0
+	des_round	%f40, %f42, %f0, %f0
+	des_iip		%f0, %f0
+
+	fxor		%f2, %f0, %f0		! ^= ivec
+	movxtod		%g4, %f2
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop
+	add		$out, 8, $out
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f0		! handle unaligned output
+
+	stda		%f0, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f0, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+.type	des_t4_ede3_cbc_decrypt,#function
+.size	des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
+___
+}
+$code.=<<___;
+.asciz  "DES for SPARC T4, David S. Miller, Andy Polyakov"
+.align  4
+___
+
+&emit_assembler();
+
+close STDOUT;
diff --git a/openssl/crypto/des/asm/readme b/openssl/crypto/des/asm/readme
new file mode 100644
index 0000000..1beafe2
--- /dev/null
+++ b/openssl/crypto/des/asm/readme
@@ -0,0 +1,131 @@
+First up, let me say I don't like writing in assembler.  It is not portable,
+dependant on the particular CPU architecture release and is generally a pig
+to debug and get right.  Having said that, the x86 architecture is probably
+the most important for speed due to number of boxes and since
+it appears to be the worst architecture to to get
+good C compilers for.  So due to this, I have lowered myself to do
+assembler for the inner DES routines in libdes :-).
+
+The file to implement in assembler is des_enc.c.  Replace the following
+4 functions
+des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt);
+des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt);
+des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
+des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
+
+They encrypt/decrypt the 64 bits held in 'data' using
+the 'ks' key schedules.   The only difference between the 4 functions is that
+des_encrypt2() does not perform IP() or FP() on the data (this is an
+optimization for when doing triple DES and des_encrypt3() and des_decrypt3()
+perform triple des.  The triple DES routines are in here because it does
+make a big difference to have them located near the des_encrypt2 function
+at link time..
+
+Now as we all know, there are lots of different operating systems running on
+x86 boxes, and unfortunately they normally try to make sure their assembler
+formating is not the same as the other peoples.
+The 4 main formats I know of are
+Microsoft	Windows 95/Windows NT
+Elf		Includes Linux and FreeBSD(?).
+a.out		The older Linux.
+Solaris		Same as Elf but different comments :-(.
+
+Now I was not overly keen to write 4 different copies of the same code,
+so I wrote a few perl routines to output the correct assembler, given
+a target assembler type.  This code is ugly and is just a hack.
+The libraries are x86unix.pl and x86ms.pl.
+des586.pl, des686.pl and des-som[23].pl are the programs to actually
+generate the assembler.
+
+So to generate elf assembler
+perl des-som3.pl elf >dx86-elf.s
+For Windows 95/NT
+perl des-som2.pl win32 >win32.asm
+
+[ update 4 Jan 1996 ]
+I have added another way to do things.
+perl des-som3.pl cpp >dx86-cpp.s
+generates a file that will be included by dx86unix.cpp when it is compiled.
+To build for elf, a.out, solaris, bsdi etc,
+cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o
+cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o
+cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o
+cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o
+This was done to cut down the number of files in the distribution.
+
+Now the ugly part.  I acquired my copy of Intels
+"Optimization's For Intel's 32-Bit Processors" and found a few interesting
+things.  First, the aim of the exersize is to 'extract' one byte at a time
+from a word and do an array lookup.  This involves getting the byte from
+the 4 locations in the word and moving it to a new word and doing the lookup.
+The most obvious way to do this is
+xor	eax,	eax				# clear word
+movb	al,	cl				# get low byte
+xor	edi	DWORD PTR 0x100+des_SP[eax] 	# xor in word
+movb	al,	ch				# get next byte
+xor	edi	DWORD PTR 0x300+des_SP[eax] 	# xor in word
+shr	ecx	16
+which seems ok.  For the pentium, this system appears to be the best.
+One has to do instruction interleaving to keep both functional units
+operating, but it is basically very efficient.
+
+Now the crunch.  When a full register is used after a partial write, eg.
+mov	al,	cl
+xor	edi,	DWORD PTR 0x100+des_SP[eax]
+386	- 1 cycle stall
+486	- 1 cycle stall
+586	- 0 cycle stall
+686	- at least 7 cycle stall (page 22 of the above mentioned document).
+
+So the technique that produces the best results on a pentium, according to
+the documentation, will produce hideous results on a pentium pro.
+
+To get around this, des686.pl will generate code that is not as fast on
+a pentium, should be very good on a pentium pro.
+mov	eax,	ecx				# copy word 
+shr	ecx,	8				# line up next byte
+and	eax,	0fch				# mask byte
+xor	edi	DWORD PTR 0x100+des_SP[eax] 	# xor in array lookup
+mov	eax,	ecx				# get word
+shr	ecx	8				# line up next byte
+and	eax,	0fch				# mask byte
+xor	edi	DWORD PTR 0x300+des_SP[eax] 	# xor in array lookup
+
+Due to the execution units in the pentium, this actually works quite well.
+For a pentium pro it should be very good.  This is the type of output
+Visual C++ generates.
+
+There is a third option.  instead of using
+mov	al,	ch
+which is bad on the pentium pro, one may be able to use
+movzx	eax,	ch
+which may not incur the partial write penalty.  On the pentium,
+this instruction takes 4 cycles so is not worth using but on the
+pentium pro it appears it may be worth while.  I need access to one to
+experiment :-).
+
+eric (20 Oct 1996)
+
+22 Nov 1996 - I have asked people to run the 2 different version on pentium
+pros and it appears that the intel documentation is wrong.  The
+mov al,bh is still faster on a pentium pro, so just use the des586.pl
+install des686.pl
+
+3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these
+functions into des_enc.c because it does make a massive performance
+difference on some boxes to have the functions code located close to
+the des_encrypt2() function.
+
+9 Jan 1997 - des-som2.pl is now the correct perl script to use for
+pentiums.  It contains an inner loop from
+Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at
+273,000 per second.  He had a previous version at 250,000 and the best
+I was able to get was 203,000.  The content has not changed, this is all
+due to instruction sequencing (and actual instructions choice) which is able
+to keep both functional units of the pentium going.
+We may have lost the ugly register usage restrictions when x86 went 32 bit
+but for the pentium it has been replaced by evil instruction ordering tricks.
+
+13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf.
+raw DES at 281,000 per second on a pentium 100.
+