openssl-1.0.1 upgrade Bug: 6168278 Change-Id: I648f9172828120df5d19a14425e9ceec92647921

commit: 392aa7cc7d2b122614c5393c3e357da07fd07af3 [log] [tgz]
author: Brian Carlstrom <[email protected]> Thu Mar 15 16:03:43 2012 -0700
committer: Brian Carlstrom <[email protected]> Wed Mar 21 11:09:32 2012 -0700
tree: 69f0b217fb624fdc56abb9f659c9bdea1b1865aa
parent: 7f1d63479ce92a2a4a0874b007e49f8acb13a0d9 [diff]
diff --git a/ThirdPartyProject.prop b/ThirdPartyProject.prop
index 9c17f70..3898e40 100644
--- a/ThirdPartyProject.prop
+++ b/ThirdPartyProject.prop

@@ -1,7 +1,7 @@
 # Copyright 2010 Google Inc. All Rights Reserved.
 #Fri Jul 16 10:03:09 PDT 2010
-currentVersion=1.0.0h
-version=1.0.0h
+currentVersion=1.0.1
+version=1.0.1
 isNative=true
 feedurl=http\://www.openssl.org/
 name=openssl

diff --git a/android-config.mk b/android-config.mk
index 82a0ed7..2198ef6 100644
--- a/android-config.mk
+++ b/android-config.mk

@@ -11,7 +11,7 @@
 LOCAL_CFLAGS += -DOPENSSL_NO_CAMELLIA -DOPENSSL_NO_CAPIENG -DOPENSSL_NO_CAST -DOPENSSL_NO_CMS -DOPENSSL_NO_GMP -DOPENSSL_NO_IDEA -DOPENSSL_NO_JPAKE -DOPENSSL_NO_MD2 -DOPENSSL_NO_MDC2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_SHA0 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SEED -DOPENSSL_NO_STORE -DOPENSSL_NO_WHIRLPOOL
 
 # Extra
-LOCAL_CFLAGS += -DOPENSSL_NO_STATIC_ENGINE -DOPENSSL_NO_GOST -DZLIB -DOPENSSL_NO_DTLS1
+LOCAL_CFLAGS += -DOPENSSL_NO_STATIC_ENGINE -DOPENSSL_NO_GOST -DZLIB -DOPENSSL_NO_DTLS1 -DOPENSSL_NO_RSAX -DOPENSSL_NO_RDRAND -DOPENSSL_NO_SCTP
 
 # Directories
 LOCAL_CFLAGS += -DOPENSSLDIR="\"/system/lib/ssl\"" -DENGINESDIR="\"/system/lib/ssl/engines\""

diff --git a/android.testssl/CAss.cnf b/android.testssl/CAss.cnf
index 1173c08..77c01c3 100644
--- a/android.testssl/CAss.cnf
+++ b/android.testssl/CAss.cnf

@@ -7,7 +7,7 @@
 
 ####################################################################
 [ req ]
-default_bits		= 512
+default_bits		= 2048
 default_keyfile 	= keySS.pem
 distinguished_name	= req_distinguished_name
 encrypt_rsa_key		= no

diff --git a/android.testssl/Uss.cnf b/android.testssl/Uss.cnf
index 56dcdd5..317ab6d 100644
--- a/android.testssl/Uss.cnf
+++ b/android.testssl/Uss.cnf

@@ -7,11 +7,11 @@
 
 ####################################################################
 [ req ]
-default_bits		= 512
+default_bits		= 2048
 default_keyfile 	= keySS.pem
 distinguished_name	= req_distinguished_name
 encrypt_rsa_key		= no
-default_md		= md2
+default_md		= sha256
 
 [ req_distinguished_name ]
 countryName			= Country Name (2 letter code)

diff --git a/android.testssl/server2.pem b/android.testssl/server2.pem
index 8bb6641..a3927cf 100644
--- a/android.testssl/server2.pem
+++ b/android.testssl/server2.pem

@@ -1,376 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (1024 bit)
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert #2
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
 -----BEGIN CERTIFICATE-----
-MIICLjCCAZcCAQEwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU0WhcNOTgwNjA5
-MTM1NzU0WjBkMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxJDAiBgNVBAMTG1NlcnZlciB0ZXN0IGNl
-cnQgKDEwMjQgYml0KTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAsxH1PBPm
-RkxrR11eV4bzNi4N9n11CI8nV29+ARlT1+qDe/mjVUvXlmsr1v/vf71G9GgqopSa
-6RXrICLVdk/FYYYzhPvl1M+OrjaXDFO8BzBAF1Lnz6c7aRZvGRJNrRSr2nZEkqDf
-JW9dY7r2VZEpD5QeuaRYUnuECkqeieB65GMCAwEAATANBgkqhkiG9w0BAQQFAAOB
-gQCWsOta6C0wiVzXz8wPmJKyTrurMlgUss2iSuW9366iwofZddsNg7FXniMzkIf6
-dp7jnmWZwKZ9cXsNUS2o4OL07qOk2HOywC0YsNZQsOBu1CBTYYkIefDiKFL1zQHh
-8lwwNd4NP+OE3NzUNkCfh4DnFfg9WHkXUlD5UpxNRJ4gJA==
+MIID6jCCAtKgAwIBAgIJALnu1NlVpZ60MA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZzELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxHDAaBgNVBAMME1Rlc3QgU2VydmVyIENlcnQg
+IzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDrdi7j9yctG+L4EjBy
+gjPmEqZzOJEQba26MoQGzglU7e5Xf59Rb/hgVQuKAoiZe7/R8rK4zJ4W7iXdXw0L
+qBpyG8B5aGKeI32w+A9TcBApoXXL2CrYQEQjZwUIpLlYBIi2NkJj3nVkq5dgl1gO
+ALiQ+W8jg3kzg5Ec9rimp9r93N8wsSL3awsafurmYCvOf7leHaMP1WJ/zDRGUNHG
+/WtDjXc8ZUG1+6EXU9Jc2Fs+2Omf7fcN0l00AK/wPg8OaNS0rKyGq9JdIT9FRGV1
+bXe/rx58FaE5CItdwCSYhJvF/O95LWQoxJXye5bCFLmvDTEyVq9FMSCptfsmbXjE
+ZGsXAgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJ
+YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud
+DgQWBBR52UaWWTKzZGDH/X4mWNcuqeQVazAfBgNVHSMEGDAWgBQ2w2yI55X+sL3s
+zj49hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEANBW+XYLlHBqVY/31ie+3gRlS
+LPfy4SIqn0t3RJjagT29MXprblBO2cbMO8VGjkQdKGpmMXjxbht2arOOUXRHX4n/
+XTyn/QHEf0bcwIITMReO3DZUPAEw8hSjn9xEOM0IRVOCP+mH5fi74QzzQaZVCyYg
+5VtLKdww/+sc0nCbKl2KWgDluriH0nfVx95qgW3mg9dhXRr0zmf1w2zkBHYpARYL
+Dew6Z8EE4tS3HJu8/qM6meWzNtrfonQ3eiiMxjZBxzV46jchBwa2z9XYhP6AmpPb
+oeTSzcQNbWsxaGYzWo46oLDUZmJOwSBawbS31bZNMCoPIY6ukoesCzFSsUKZww==
 -----END CERTIFICATE-----
 -----BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQCzEfU8E+ZGTGtHXV5XhvM2Lg32fXUIjydXb34BGVPX6oN7+aNV
-S9eWayvW/+9/vUb0aCqilJrpFesgItV2T8VhhjOE++XUz46uNpcMU7wHMEAXUufP
-pztpFm8ZEk2tFKvadkSSoN8lb11juvZVkSkPlB65pFhSe4QKSp6J4HrkYwIDAQAB
-AoGBAKy8jvb0Lzby8q11yNLf7+78wCVdYi7ugMHcYA1JVFK8+zb1WfSm44FLQo/0
-dSChAjgz36TTexeLODPYxleJndjVcOMVzsLJjSM8dLpXsTS4FCeMbhw2s2u+xqKY
-bbPWfk+HOTyJjfnkcC5Nbg44eOmruq0gSmBeUXVM5UntlTnxAkEA7TGCA3h7kx5E
-Bl4zl2pc3gPAGt+dyfk5Po9mGJUUXhF5p2zueGmYWW74TmOWB1kzt4QRdYMzFePq
-zfDNXEa1CwJBAMFErdY0xp0UJ13WwBbUTk8rujqQdHtjw0klhpbuKkjxu2hN0wwM
-6p0D9qxF7JHaghqVRI0fAW/EE0OzdHMR9QkCQQDNR26dMFXKsoPu+vItljj/UEGf
-QG7gERiQ4yxaFBPHgdpGo0kT31eh9x9hQGDkxTe0GNG/YSgCRvm8+C3TMcKXAkBD
-dhGn36wkUFCddMSAM4NSJ1VN8/Z0y5HzCmI8dM3VwGtGMUQlxKxwOl30LEQzdS5M
-0SWojNYXiT2gOBfBwtbhAkEAhafl5QEOIgUz+XazS/IlZ8goNKdDVfYgK3mHHjvv
-nY5G+AuGebdNkXJr4KSWxDcN+C2i47zuj4QXA16MAOandA==
+MIIEowIBAAKCAQEA63Yu4/cnLRvi+BIwcoIz5hKmcziREG2tujKEBs4JVO3uV3+f
+UW/4YFULigKImXu/0fKyuMyeFu4l3V8NC6gachvAeWhiniN9sPgPU3AQKaF1y9gq
+2EBEI2cFCKS5WASItjZCY951ZKuXYJdYDgC4kPlvI4N5M4ORHPa4pqfa/dzfMLEi
+92sLGn7q5mArzn+5Xh2jD9Vif8w0RlDRxv1rQ413PGVBtfuhF1PSXNhbPtjpn+33
+DdJdNACv8D4PDmjUtKyshqvSXSE/RURldW13v68efBWhOQiLXcAkmISbxfzveS1k
+KMSV8nuWwhS5rw0xMlavRTEgqbX7Jm14xGRrFwIDAQABAoIBAHLsTPihIfLnYIE5
+x4GsQQ5zXeBw5ITDM37ktwHnQDC+rIzyUl1aLD1AZRBoKinXd4lOTqLZ4/NHKx4A
+DYr58mZtWyUmqLOMmQVuHXTZBlp7XtYuXMMNovQwjQlp9LicBeoBU6gQ5PVMtubD
+F4xGF89Sn0cTHW3iMkqTtQ5KcR1j57OcJO0FEb1vPvk2MXI5ZyAatUYE7YacbEzd
+rg02uIwx3FqNSkuSI79uz4hMdV5TPtuhxx9nTwj9aLUhXFeZ0mn2PVgVzEnnMoJb
++znlsZDgzDlJqdaD744YGWh8Z3OEssB35KfzFcdOeO6yH8lmv2Zfznk7pNPT7LTb
+Lae9VgkCgYEA92p1qnAB3NtJtNcaW53i0S5WJgS1hxWKvUDx3lTB9s8X9fHpqL1a
+E94fDfWzp/hax6FefUKIvBOukPLQ6bYjTMiFoOHzVirghAIuIUoMI5VtLhwD1hKs
+Lr7l/dptMgKb1nZHyXoKHRBthsy3K4+udsPi8TzMvYElgEqyQIe/Rk0CgYEA86GL
+8HC6zLszzKERDPBxrboRmoFvVUCTQDhsfj1M8aR3nQ8V5LkdIJc7Wqm/Ggfk9QRf
+rJ8M2WUMlU5CNnCn/KCrKzCNZIReze3fV+HnKdbcXGLvgbHPrhnz8yYehUFG+RGq
+bVyDWRU94T38izy2s5qMYrMJWZEYyXncSPbfcPMCgYAtaXfxcZ+V5xYPQFARMtiX
+5nZfggvDoJuXgx0h3tK/N2HBfcaSdzbaYLG4gTmZggc/jwnl2dl5E++9oSPhUdIG
+3ONSFUbxsOsGr9PBvnKd8WZZyUCXAVRjPBzAzF+whzQNWCZy/5htnz9LN7YDI9s0
+5113Q96cheDZPFydZY0hHQKBgQDVbEhNukM5xCiNcu+f2SaMnLp9EjQ4h5g3IvaP
+5B16daw/Dw8LzcohWboqIxeAsze0GD/D1ZUJAEd0qBjC3g+a9BjefervCjKOzXng
+38mEUm+6EwVjJSQcjSmycEs+Sr/kwr/8i5WYvU32+jk4tFgMoC+o6tQe/Uesf68k
+z/dPVwKBgGbF7Vv1/3SmhlOy+zYyvJ0CrWtKxH9QP6tLIEgEpd8x7YTSuCH94yok
+kToMXYA3sWNPt22GbRDZ+rcp4c7HkDx6I6vpdP9aQEwJTp0EPy0sgWr2XwYmreIQ
+NFmkk8Itn9EY2R9VBaP7GLv5kvwxDdLAnmwGmzVtbmaVdxCaBwUk
 -----END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
------END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
-	/OU=Certification Services Division/CN=Thawte Server CA
-	/[email protected]
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
-	/OU=Certification Services Division/CN=Thawte Server CA
-	/[email protected]
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8= 
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----

diff --git a/android.testssl/testssl b/android.testssl/testssl
index 0eee92d..96a90d1 100755
--- a/android.testssl/testssl
+++ b/android.testssl/testssl

@@ -160,4 +160,14 @@
 echo test tls1 with PSK via BIO pair
 $ssltest -bio_pair -tls1 -cipher PSK -psk abc123 $extra || exit 1
 
+if adb shell /system/bin/openssl no-srp; then
+  echo skipping SRP tests
+else
+  echo test tls1 with SRP
+  $ssltest -tls1 -cipher SRP -srpuser test -srppass abc123
+
+  echo test tls1 with SRP via BIO pair
+  $ssltest -bio_pair -tls1 -cipher SRP -srpuser test -srppass abc123
+fi
+
 exit 0

diff --git a/apps/Android.mk b/apps/Android.mk
index c2dc2d7..9110490 100644
--- a/apps/Android.mk
+++ b/apps/Android.mk

@@ -48,6 +48,7 @@
 	smime.c \
 	speed.c \
 	spkac.c \
+	srp.c \
 	verify.c \
 	version.c \
 	x509.c

diff --git a/apps/apps.c b/apps/apps.c
index 272fd78..4e11915 100644
--- a/apps/apps.c
+++ b/apps/apps.c

@@ -2258,6 +2258,7 @@
 	int purpose = 0, depth = -1;
 	char **oldargs = *pargs;
 	char *arg = **pargs, *argn = (*pargs)[1];
+	time_t at_time = 0;
 	if (!strcmp(arg, "-policy"))
 		{
 		if (!argn)
@@ -2310,6 +2311,27 @@
 			}
 		(*pargs)++;
 		}
+	else if (strcmp(arg,"-attime") == 0)
+		{
+		if (!argn)
+			*badarg = 1;
+		else
+			{
+			long timestamp;
+			/* interpret the -attime argument as seconds since
+			 * Epoch */
+			if (sscanf(argn, "%li", &timestamp) != 1)
+				{
+				BIO_printf(bio_err,
+						"Error parsing timestamp %s\n",
+					   	argn);
+				*badarg = 1;
+				}
+			/* on some platforms time_t may be a float */
+			at_time = (time_t) timestamp;
+			}
+		(*pargs)++;
+		}
 	else if (!strcmp(arg, "-ignore_critical"))
 		flags |= X509_V_FLAG_IGNORE_CRITICAL;
 	else if (!strcmp(arg, "-issuer_checks"))
@@ -2364,6 +2386,9 @@
 	if (depth >= 0)
 		X509_VERIFY_PARAM_set_depth(*pm, depth);
 
+	if (at_time) 
+		X509_VERIFY_PARAM_set_time(*pm, at_time);
+
 	end:
 
 	(*pargs)++;
@@ -2695,6 +2720,50 @@
 
 #endif
 
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+/* next_protos_parse parses a comma separated list of strings into a string
+ * in a format suitable for passing to SSL_CTX_set_next_protos_advertised.
+ *   outlen: (output) set to the length of the resulting buffer on success.
+ *   err: (maybe NULL) on failure, an error message line is written to this BIO.
+ *   in: a NUL termianted string like "abc,def,ghi"
+ *
+ *   returns: a malloced buffer or NULL on failure.
+ */
+unsigned char *next_protos_parse(unsigned short *outlen, const char *in)
+	{
+	size_t len;
+	unsigned char *out;
+	size_t i, start = 0;
+
+	len = strlen(in);
+	if (len >= 65535)
+		return NULL;
+
+	out = OPENSSL_malloc(strlen(in) + 1);
+	if (!out)
+		return NULL;
+
+	for (i = 0; i <= len; ++i)
+		{
+		if (i == len || in[i] == ',')
+			{
+			if (i - start > 255)
+				{
+				OPENSSL_free(out);
+				return NULL;
+				}
+			out[start] = i - start;
+			start = i + 1;
+			}
+		else
+			out[i+1] = in[i];
+		}
+
+	*outlen = len + 1;
+	return out;
+	}
+#endif  /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
+
 /*
  * Platform-specific sections
  */
@@ -3020,46 +3089,3 @@
 int raw_write_stdout(const void *buf,int siz)
 	{	return write(fileno(stdout),buf,siz);	}
 #endif
-
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-/* next_protos_parse parses a comma separated list of strings into a string
- * in a format suitable for passing to SSL_CTX_set_next_protos_advertised.
- *   outlen: (output) set to the length of the resulting buffer on success.
- *   in: a NUL termianted string like "abc,def,ghi"
- *
- *   returns: a malloced buffer or NULL on failure.
- */
-unsigned char *next_protos_parse(unsigned short *outlen, const char *in)
-	{
-	size_t len;
-	unsigned char *out;
-	size_t i, start = 0;
-
-	len = strlen(in);
-	if (len >= 65535)
-		return NULL;
-
-	out = OPENSSL_malloc(strlen(in) + 1);
-	if (!out)
-		return NULL;
-
-	for (i = 0; i <= len; ++i)
-		{
-		if (i == len || in[i] == ',')
-			{
-			if (i - start > 255)
-				{
-				OPENSSL_free(out);
-				return NULL;
-				}
-			out[start] = i - start;
-			start = i + 1;
-			}
-		else
-			out[i+1] = in[i];
-		}
-
-	*outlen = len + 1;
-	return out;
-	}
-#endif  /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */

diff --git a/apps/apps.h b/apps/apps.h
index 42072ec..c1ca99d 100644
--- a/apps/apps.h
+++ b/apps/apps.h

@@ -317,6 +317,12 @@
 int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value);
 int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx,
 			const char *algname, ENGINE *e, int do_param);
+int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
+			STACK_OF(OPENSSL_STRING) *sigopts);
+int do_X509_REQ_sign(BIO *err, X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md,
+			STACK_OF(OPENSSL_STRING) *sigopts);
+int do_X509_CRL_sign(BIO *err, X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md,
+			STACK_OF(OPENSSL_STRING) *sigopts);
 #ifndef OPENSSL_NO_PSK
 extern char *psk_key;
 #endif
@@ -325,6 +331,10 @@
 void jpake_server_auth(BIO *out, BIO *conn, const char *secret);
 #endif
 
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+unsigned char *next_protos_parse(unsigned short *outlen, const char *in);
+#endif  /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
+
 #define FORMAT_UNDEF    0
 #define FORMAT_ASN1     1
 #define FORMAT_TEXT     2
@@ -357,8 +367,7 @@
 #define TM_START	0
 #define TM_STOP		1
 double app_tminterval (int stop,int usertime);
-#endif
 
-#ifndef OPENSSL_NO_NEXTPROTONEG
-unsigned char *next_protos_parse(unsigned short *outlen, const char *in);
+#define OPENSSL_NO_SSL_INTERN
+
 #endif

diff --git a/apps/ca.c b/apps/ca.c
index 5d11948..2a83d19 100644
--- a/apps/ca.c
+++ b/apps/ca.c

@@ -197,26 +197,30 @@
 
 static void lookup_fail(const char *name, const char *tag);
 static int certify(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
-		   const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,CA_DB *db,
+		   const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+		   STACK_OF(CONF_VALUE) *policy,CA_DB *db,
 		   BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn, char *startdate,
 		   char *enddate, long days, int batch, char *ext_sect, CONF *conf,
 		   int verbose, unsigned long certopt, unsigned long nameopt,
 		   int default_op, int ext_copy, int selfsign);
 static int certify_cert(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
-			const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,
+			const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+			STACK_OF(CONF_VALUE) *policy,
 			CA_DB *db, BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn,
 			char *startdate, char *enddate, long days, int batch,
 			char *ext_sect, CONF *conf,int verbose, unsigned long certopt,
 			unsigned long nameopt, int default_op, int ext_copy,
 			ENGINE *e);
 static int certify_spkac(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
-			 const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,
+			 const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+			 STACK_OF(CONF_VALUE) *policy,
 			 CA_DB *db, BIGNUM *serial,char *subj,unsigned long chtype, int multirdn, int email_dn,
 			 char *startdate, char *enddate, long days, char *ext_sect,
 			 CONF *conf, int verbose, unsigned long certopt, 
 			 unsigned long nameopt, int default_op, int ext_copy);
 static void write_new_certificate(BIO *bp, X509 *x, int output_der, int notext);
 static int do_body(X509 **xret, EVP_PKEY *pkey, X509 *x509, const EVP_MD *dgst,
+	STACK_OF(OPENSSL_STRING) *sigopts,
 	STACK_OF(CONF_VALUE) *policy, CA_DB *db, BIGNUM *serial,char *subj,unsigned long chtype, int multirdn,
 	int email_dn, char *startdate, char *enddate, long days, int batch,
        	int verbose, X509_REQ *req, char *ext_sect, CONF *conf,
@@ -311,6 +315,7 @@
 	const EVP_MD *dgst=NULL;
 	STACK_OF(CONF_VALUE) *attribs=NULL;
 	STACK_OF(X509) *cert_sk=NULL;
+	STACK_OF(OPENSSL_STRING) *sigopts = NULL;
 #undef BSIZE
 #define BSIZE 256
 	MS_STATIC char buf[3][BSIZE];
@@ -435,6 +440,15 @@
 			if (--argc < 1) goto bad;
 			outdir= *(++argv);
 			}
+		else if (strcmp(*argv,"-sigopt") == 0)
+			{
+			if (--argc < 1)
+				goto bad;
+			if (!sigopts)
+				sigopts = sk_OPENSSL_STRING_new_null();
+			if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv)))
+				goto bad;
+			}
 		else if (strcmp(*argv,"-notext") == 0)
 			notext=1;
 		else if (strcmp(*argv,"-batch") == 0)
@@ -1170,8 +1184,9 @@
 		if (spkac_file != NULL)
 			{
 			total++;
-			j=certify_spkac(&x,spkac_file,pkey,x509,dgst,attribs,db,
-				serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,extensions,
+			j=certify_spkac(&x,spkac_file,pkey,x509,dgst,sigopts,
+				attribs,db, serial,subj,chtype,multirdn,
+				email_dn,startdate,enddate,days,extensions,
 				conf,verbose,certopt,nameopt,default_op,ext_copy);
 			if (j < 0) goto err;
 			if (j > 0)
@@ -1194,7 +1209,8 @@
 		if (ss_cert_file != NULL)
 			{
 			total++;
-			j=certify_cert(&x,ss_cert_file,pkey,x509,dgst,attribs,
+			j=certify_cert(&x,ss_cert_file,pkey,x509,dgst,sigopts,
+				attribs,
 				db,serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,batch,
 				extensions,conf,verbose, certopt, nameopt,
 				default_op, ext_copy, e);
@@ -1214,7 +1230,7 @@
 		if (infile != NULL)
 			{
 			total++;
-			j=certify(&x,infile,pkey,x509p,dgst,attribs,db,
+			j=certify(&x,infile,pkey,x509p,dgst,sigopts, attribs,db,
 				serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,batch,
 				extensions,conf,verbose, certopt, nameopt,
 				default_op, ext_copy, selfsign);
@@ -1234,7 +1250,7 @@
 		for (i=0; i<argc; i++)
 			{
 			total++;
-			j=certify(&x,argv[i],pkey,x509p,dgst,attribs,db,
+			j=certify(&x,argv[i],pkey,x509p,dgst,sigopts,attribs,db,
 				serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,batch,
 				extensions,conf,verbose, certopt, nameopt,
 				default_op, ext_copy, selfsign);
@@ -1483,7 +1499,7 @@
 			crlnumber = NULL;
 			}
 
-		if (!X509_CRL_sign(crl,pkey,dgst)) goto err;
+		if (!do_X509_CRL_sign(bio_err,crl,pkey,dgst,sigopts)) goto err;
 
 		PEM_write_bio_X509_CRL(Sout,crl);
 
@@ -1537,6 +1553,8 @@
 	BN_free(serial);
 	BN_free(crlnumber);
 	free_index(db);
+	if (sigopts)
+		sk_OPENSSL_STRING_free(sigopts);
 	EVP_PKEY_free(pkey);
 	if (x509) X509_free(x509);
 	X509_CRL_free(crl);
@@ -1553,8 +1571,10 @@
 	}
 
 static int certify(X509 **xret, char *infile, EVP_PKEY *pkey, X509 *x509,
-	     const EVP_MD *dgst, STACK_OF(CONF_VALUE) *policy, CA_DB *db,
-	     BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn, char *startdate, char *enddate,
+	     const EVP_MD *dgst, STACK_OF(OPENSSL_STRING) *sigopts,
+	     STACK_OF(CONF_VALUE) *policy, CA_DB *db,
+	     BIGNUM *serial, char *subj,unsigned long chtype, int multirdn,
+	     int email_dn, char *startdate, char *enddate,
 	     long days, int batch, char *ext_sect, CONF *lconf, int verbose,
 	     unsigned long certopt, unsigned long nameopt, int default_op,
 	     int ext_copy, int selfsign)
@@ -1610,7 +1630,8 @@
 	else
 		BIO_printf(bio_err,"Signature ok\n");
 
-	ok=do_body(xret,pkey,x509,dgst,policy,db,serial,subj,chtype,multirdn, email_dn,
+	ok=do_body(xret,pkey,x509,dgst,sigopts, policy,db,serial,subj,chtype,
+		multirdn, email_dn,
 		startdate,enddate,days,batch,verbose,req,ext_sect,lconf,
 		certopt, nameopt, default_op, ext_copy, selfsign);
 
@@ -1621,7 +1642,8 @@
 	}
 
 static int certify_cert(X509 **xret, char *infile, EVP_PKEY *pkey, X509 *x509,
-	     const EVP_MD *dgst, STACK_OF(CONF_VALUE) *policy, CA_DB *db,
+	     const EVP_MD *dgst, STACK_OF(OPENSSL_STRING) *sigopts,
+	     STACK_OF(CONF_VALUE) *policy, CA_DB *db,
 	     BIGNUM *serial, char *subj, unsigned long chtype, int multirdn, int email_dn, char *startdate, char *enddate,
 	     long days, int batch, char *ext_sect, CONF *lconf, int verbose,
 	     unsigned long certopt, unsigned long nameopt, int default_op,
@@ -1664,7 +1686,7 @@
 	if ((rreq=X509_to_X509_REQ(req,NULL,EVP_md5())) == NULL)
 		goto err;
 
-	ok=do_body(xret,pkey,x509,dgst,policy,db,serial,subj,chtype,multirdn,email_dn,startdate,enddate,
+	ok=do_body(xret,pkey,x509,dgst,sigopts,policy,db,serial,subj,chtype,multirdn,email_dn,startdate,enddate,
 		days,batch,verbose,rreq,ext_sect,lconf, certopt, nameopt, default_op,
 		ext_copy, 0);
 
@@ -1675,7 +1697,8 @@
 	}
 
 static int do_body(X509 **xret, EVP_PKEY *pkey, X509 *x509, const EVP_MD *dgst,
-	     STACK_OF(CONF_VALUE) *policy, CA_DB *db, BIGNUM *serial, char *subj,
+	     STACK_OF(OPENSSL_STRING) *sigopts, STACK_OF(CONF_VALUE) *policy,
+             CA_DB *db, BIGNUM *serial, char *subj,
 	     unsigned long chtype, int multirdn,
 	     int email_dn, char *startdate, char *enddate, long days, int batch,
 	     int verbose, X509_REQ *req, char *ext_sect, CONF *lconf,
@@ -2146,7 +2169,7 @@
 		EVP_PKEY_copy_parameters(pktmp,pkey);
 	EVP_PKEY_free(pktmp);
 
-	if (!X509_sign(ret,pkey,dgst))
+	if (!do_X509_sign(bio_err, ret,pkey,dgst, sigopts))
 		goto err;
 
 	/* We now just add it to the database */
@@ -2240,7 +2263,8 @@
 	}
 
 static int certify_spkac(X509 **xret, char *infile, EVP_PKEY *pkey, X509 *x509,
-	     const EVP_MD *dgst, STACK_OF(CONF_VALUE) *policy, CA_DB *db,
+	     const EVP_MD *dgst, STACK_OF(OPENSSL_STRING) *sigopts,
+	     STACK_OF(CONF_VALUE) *policy, CA_DB *db,
 	     BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn, char *startdate, char *enddate,
 	     long days, char *ext_sect, CONF *lconf, int verbose, unsigned long certopt,
 	     unsigned long nameopt, int default_op, int ext_copy)
@@ -2366,9 +2390,9 @@
 
 	X509_REQ_set_pubkey(req,pktmp);
 	EVP_PKEY_free(pktmp);
-	ok=do_body(xret,pkey,x509,dgst,policy,db,serial,subj,chtype,multirdn,email_dn,startdate,enddate,
-		   days,1,verbose,req,ext_sect,lconf, certopt, nameopt, default_op,
-			ext_copy, 0);
+	ok=do_body(xret,pkey,x509,dgst,sigopts,policy,db,serial,subj,chtype,
+		   multirdn,email_dn,startdate,enddate, days,1,verbose,req,
+		   ext_sect,lconf, certopt, nameopt, default_op, ext_copy, 0);
 err:
 	if (req != NULL) X509_REQ_free(req);
 	if (parms != NULL) CONF_free(parms);

diff --git a/apps/ciphers.c b/apps/ciphers.c
index 3d4c60d..5f2b739 100644
--- a/apps/ciphers.c
+++ b/apps/ciphers.c

@@ -196,7 +196,7 @@
 			
 			if (Verbose)
 				{
-				unsigned long id = c->id;
+				unsigned long id = SSL_CIPHER_get_id(c);
 				int id0 = (int)(id >> 24);
 				int id1 = (int)((id >> 16) & 0xffL);
 				int id2 = (int)((id >> 8) & 0xffL);

diff --git a/apps/client.pem b/apps/client.pem
index 307910e..e7a47a7 100644
--- a/apps/client.pem
+++ b/apps/client.pem

@@ -1,24 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Client test cert (512 bit)
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Client Cert
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
 -----BEGIN CERTIFICATE-----
-MIIB6TCCAVICAQIwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU2WhcNOTgwNjA5
-MTM1NzU2WjBjMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxIzAhBgNVBAMTGkNsaWVudCB0ZXN0IGNl
-cnQgKDUxMiBiaXQpMFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBALtv55QyzG6i2Plw
-Z1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexmq/R4KedLjFEIYjocDui+IXs62NNt
-XrT8odkCAwEAATANBgkqhkiG9w0BAQQFAAOBgQBwtMmI7oGUG8nKmftQssATViH5
-NRRtoEw07DxJp/LfatHdrhqQB73eGdL5WILZJXk46Xz2e9WMSUjVCSYhdKxtflU3
-UR2Ajv1Oo0sTNdfz0wDqJNirLNtzyhhsaq8qMTrLwXrCP31VxBiigFSQSUFnZyTE
-9TKwhS4GlwbtCfxSKQ==
+MIID5zCCAs+gAwIBAgIJALnu1NlVpZ6yMA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZDELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxGTAXBgNVBAMMEFRlc3QgQ2xpZW50IENlcnQw
+ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC0ranbHRLcLVqN+0BzcZpY
++yOLqxzDWT1LD9eW1stC4NzXX9/DCtSIVyN7YIHdGLrIPr64IDdXXaMRzgZ2rOKs
+lmHCAiFpO/ja99gGCJRxH0xwQatqAULfJVHeUhs7OEGOZc2nWifjqKvGfNTilP7D
+nwi69ipQFq9oS19FmhwVHk2wg7KZGHI1qDyG04UrfCZMRitvS9+UVhPpIPjuiBi2
+x3/FZIpL5gXJvvFK6xHY63oq2asyzBATntBgnP4qJFWWcvRx24wF1PnZabxuVoL2
+bPnQ/KvONDrw3IdqkKhYNTul7jEcu3OlcZIMw+7DiaKJLAzKb/bBF5gm/pwW6As9
+AgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJYIZI
+AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW
+BBSZHKyLoTh7Mb409Zn/mK1ceSDAjDAfBgNVHSMEGDAWgBQ2w2yI55X+sL3szj49
+hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEAD0mL7PtPYgCEuDyOQSbLpeND5hVS
+curxQdGnrJ6Acrhodb7E9ccATokeb0PLx6HBLQUicxhTZIQ9FbO43YkQcOU6C3BB
+IlwskqmtN6+VmrQzNolHCDzvxNZs9lYL2VbGPGqVRyjZeHpoAlf9cQr8PgDb4d4b
+vUx2KAhHQvV2nkmYvKyXcgnRuHggumF87mkxidriGAEFwH4qfOqetUg64WyxP7P2
+QLipm04SyQa7ONtIApfVXgHcE42Py4/f4arzCzMjKe3VyhGkS7nsT55X/fWgTaRm
+CQPkO+H94P958WTvQDt77bQ+D3IvYaVvfil8n6HJMOJfFT0LJuSUbpSXJg==
 -----END CERTIFICATE-----
 -----BEGIN RSA PRIVATE KEY-----
-MIIBOwIBAAJBALtv55QyzG6i2PlwZ1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexm
-q/R4KedLjFEIYjocDui+IXs62NNtXrT8odkCAwEAAQJAbwXq0vJ/+uyEvsNgxLko
-/V86mGXQ/KrSkeKlL0r4ENxjcyeMAGoKu6J9yMY7+X9+Zm4nxShNfTsf/+Freoe1
-HQIhAPOSm5Q1YI+KIsII2GeVJx1U69+wnd71OasIPakS1L1XAiEAxQAW+J3/JWE0
-ftEYakbhUOKL8tD1OaFZS71/5GdG7E8CIQCefUMmySSvwd6kC0VlATSWbW+d+jp/
-nWmM1KvqnAo5uQIhALqEADu5U1Wvt8UN8UDGBRPQulHWNycuNV45d3nnskWPAiAw
-ueTyr6WsZ5+SD8g/Hy3xuvF3nPmJRH+rwvVihlcFOg==
+MIIEpQIBAAKCAQEAtK2p2x0S3C1ajftAc3GaWPsji6scw1k9Sw/XltbLQuDc11/f
+wwrUiFcje2CB3Ri6yD6+uCA3V12jEc4GdqzirJZhwgIhaTv42vfYBgiUcR9McEGr
+agFC3yVR3lIbOzhBjmXNp1on46irxnzU4pT+w58IuvYqUBavaEtfRZocFR5NsIOy
+mRhyNag8htOFK3wmTEYrb0vflFYT6SD47ogYtsd/xWSKS+YFyb7xSusR2Ot6Ktmr
+MswQE57QYJz+KiRVlnL0cduMBdT52Wm8blaC9mz50PyrzjQ68NyHapCoWDU7pe4x
+HLtzpXGSDMPuw4miiSwMym/2wReYJv6cFugLPQIDAQABAoIBAAZOyc9MhIwLSU4L
+p4RgQvM4UVVe8/Id+3XTZ8NsXExJbWxXfIhiqGjaIfL8u4vsgRjcl+v1s/jo2/iT
+KMab4o4D8gXD7UavQVDjtjb/ta79WL3SjRl2Uc9YjjMkyq6WmDNQeo2NKDdafCTB
+1uzSJtLNipB8Z53ELPuHJhxX9QMHrMnuha49riQgXZ7buP9iQrHJFhImBjSzbxJx
+L+TI6rkyLSf9Wi0Pd3L27Ob3QWNfNRYNSeTE+08eSRChkur5W0RuXAcuAICdQlCl
+LBvWO/LmmvbzCqiDcgy/TliSb6CGGwgiNG7LJZmlkYNj8laGwalNlYZs3UrVv6NO
+Br2loAECgYEA2kvCvPGj0Dg/6g7WhXDvAkEbcaL1tSeCxBbNH+6HS2UWMWvyTtCn
+/bbD519QIdkvayy1QjEf32GV/UjUVmlULMLBcDy0DGjtL3+XpIhLKWDNxN1v1/ai
+1oz23ZJCOgnk6K4qtFtlRS1XtynjA+rBetvYvLP9SKeFrnpzCgaA2r0CgYEA0+KX
+1ACXDTNH5ySX3kMjSS9xdINf+OOw4CvPHFwbtc9aqk2HePlEsBTz5I/W3rKwXva3
+NqZ/bRqVVeZB/hHKFywgdUQk2Uc5z/S7Lw70/w1HubNTXGU06Ngb6zOFAo/o/TwZ
+zTP1BMIKSOB6PAZPS3l+aLO4FRIRotfFhgRHOoECgYEAmiZbqt8cJaJDB/5YYDzC
+mp3tSk6gIb936Q6M5VqkMYp9pIKsxhk0N8aDCnTU+kIK6SzWBpr3/d9Ecmqmfyq7
+5SvWO3KyVf0WWK9KH0abhOm2BKm2HBQvI0DB5u8sUx2/hsvOnjPYDISbZ11t0MtK
+u35Zy89yMYcSsIYJjG/ROCUCgYEAgI2P9G5PNxEP5OtMwOsW84Y3Xat/hPAQFlI+
+HES+AzbFGWJkeT8zL2nm95tVkFP1sggZ7Kxjz3w7cpx7GX0NkbWSE9O+T51pNASV
+tN1sQ3p5M+/a+cnlqgfEGJVvc7iAcXQPa3LEi5h2yPR49QYXAgG6cifn3dDSpmwn
+SUI7PQECgYEApGCIIpSRPLAEHTGmP87RBL1smurhwmy2s/pghkvUkWehtxg0sGHh
+kuaqDWcskogv+QC0sVdytiLSz8G0DwcEcsHK1Fkyb8A+ayiw6jWJDo2m9+IF4Fww
+1Te6jFPYDESnbhq7+TLGgHGhtwcu5cnb4vSuYXGXKupZGzoLOBbv1Zw=
 -----END RSA PRIVATE KEY-----

diff --git a/apps/cms.c b/apps/cms.c
index d15925a..d754140 100644
--- a/apps/cms.c
+++ b/apps/cms.c

@@ -136,6 +136,7 @@
 	char *engine=NULL;
 #endif
 	unsigned char *secret_key = NULL, *secret_keyid = NULL;
+	unsigned char *pwri_pass = NULL, *pwri_tmp = NULL;
 	size_t secret_keylen = 0, secret_keyidlen = 0;
 
 	ASN1_OBJECT *econtent_type = NULL;
@@ -326,6 +327,13 @@
 				}
 			secret_keyidlen = (size_t)ltmp;
 			}
+		else if (!strcmp(*args,"-pwri_password"))
+			{
+			if (!args[1])
+				goto argerr;
+			args++;
+			pwri_pass = (unsigned char *)*args;
+			}
 		else if (!strcmp(*args,"-econtent_type"))
 			{
 			if (!args[1])
@@ -559,7 +567,7 @@
 
 	else if (operation == SMIME_DECRYPT)
 		{
-		if (!recipfile && !keyfile && !secret_key)
+		if (!recipfile && !keyfile && !secret_key && !pwri_pass)
 			{
 			BIO_printf(bio_err, "No recipient certificate or key specified\n");
 			badarg = 1;
@@ -567,7 +575,7 @@
 		}
 	else if (operation == SMIME_ENCRYPT)
 		{
-		if (!*args && !secret_key)
+		if (!*args && !secret_key && !pwri_pass)
 			{
 			BIO_printf(bio_err, "No recipient(s) certificate(s) specified\n");
 			badarg = 1;
@@ -917,6 +925,17 @@
 			secret_key = NULL;
 			secret_keyid = NULL;
 			}
+		if (pwri_pass)
+			{
+			pwri_tmp = (unsigned char *)BUF_strdup((char *)pwri_pass);
+			if (!pwri_tmp)
+				goto end;
+			if (!CMS_add0_recipient_password(cms,
+						-1, NID_undef, NID_undef,
+						 pwri_tmp, -1, NULL))
+				goto end;
+			pwri_tmp = NULL;
+			}
 		if (!(flags & CMS_STREAM))
 			{
 			if (!CMS_final(cms, in, NULL, flags))
@@ -1043,6 +1062,16 @@
 				}
 			}
 
+		if (pwri_pass)
+			{
+			if (!CMS_decrypt_set1_password(cms, pwri_pass, -1))
+				{
+				BIO_puts(bio_err,
+					"Error decrypting CMS using password\n");
+				goto end;
+				}
+			}
+
 		if (!CMS_decrypt(cms, NULL, NULL, indata, out, flags))
 			{
 			BIO_printf(bio_err, "Error decrypting CMS structure\n");
@@ -1167,6 +1196,8 @@
 		OPENSSL_free(secret_key);
 	if (secret_keyid)
 		OPENSSL_free(secret_keyid);
+	if (pwri_tmp)
+		OPENSSL_free(pwri_tmp);
 	if (econtent_type)
 		ASN1_OBJECT_free(econtent_type);
 	if (rr)

diff --git a/apps/dgst.c b/apps/dgst.c
index 9bf38ce..b08e9a7 100644
--- a/apps/dgst.c
+++ b/apps/dgst.c

@@ -127,6 +127,7 @@
 #endif
 	char *hmac_key=NULL;
 	char *mac_name=NULL;
+	int non_fips_allow = 0;
 	STACK_OF(OPENSSL_STRING) *sigopts = NULL, *macopts = NULL;
 
 	apps_startup();
@@ -215,6 +216,10 @@
 			out_bin = 1;
 		else if (strcmp(*argv,"-d") == 0)
 			debug=1;
+		else if (strcmp(*argv,"-non-fips-allow") == 0)
+			non_fips_allow=1;
+		else if (!strcmp(*argv,"-fips-fingerprint"))
+			hmac_key = "etaonrishdlcupfm";
 		else if (!strcmp(*argv,"-hmac"))
 			{
 			if (--argc < 1)
@@ -395,6 +400,13 @@
 			goto end;
 		}
 
+	if (non_fips_allow)
+		{
+		EVP_MD_CTX *md_ctx;
+		BIO_get_md_ctx(bmd,&md_ctx);
+		EVP_MD_CTX_set_flags(md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+		}
+
 	if (hmac_key)
 		{
 		sigkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, e,

diff --git a/apps/enc.c b/apps/enc.c
index 076225c..719acc3 100644
--- a/apps/enc.c
+++ b/apps/enc.c

@@ -129,6 +129,7 @@
 	char *engine = NULL;
 #endif
 	const EVP_MD *dgst=NULL;
+	int non_fips_allow = 0;
 
 	apps_startup();
 
@@ -281,6 +282,8 @@
 			if (--argc < 1) goto bad;
 			md= *(++argv);
 			}
+		else if (strcmp(*argv,"-non-fips-allow") == 0)
+			non_fips_allow = 1;
 		else if	((argv[0][0] == '-') &&
 			((c=EVP_get_cipherbyname(&(argv[0][1]))) != NULL))
 			{
@@ -589,6 +592,11 @@
 		 */
 
 		BIO_get_cipher_ctx(benc, &ctx);
+
+		if (non_fips_allow)
+			EVP_CIPHER_CTX_set_flags(ctx,
+				EVP_CIPH_FLAG_NON_FIPS_ALLOW);
+
 		if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, enc))
 			{
 			BIO_printf(bio_err, "Error setting cipher %s\n",

diff --git a/apps/openssl.c b/apps/openssl.c
index 1068957..33b1655 100644
--- a/apps/openssl.c
+++ b/apps/openssl.c

@@ -129,6 +129,9 @@
 #include "progs.h"
 #include "s_apps.h"
 #include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 /* The LHASH callbacks ("hash" & "cmp") have been replaced by functions with the
  * base prototypes (we cast each variable inside the function to the required
@@ -310,6 +313,19 @@
 		CRYPTO_set_locking_callback(lock_dbg_cb);
 		}
 
+	if(getenv("OPENSSL_FIPS")) {
+#ifdef OPENSSL_FIPS
+		if (!FIPS_mode_set(1)) {
+			ERR_load_crypto_strings();
+			ERR_print_errors(BIO_new_fp(stderr,BIO_NOCLOSE));
+			EXIT(1);
+		}
+#else
+		fprintf(stderr, "FIPS mode not supported.\n");
+		EXIT(1);
+#endif
+		}
+
 	apps_startup();
 
 	/* Lets load up our environment a little */

diff --git a/apps/progs.h b/apps/progs.h
index 728bb6d..dd2298b 100644
--- a/apps/progs.h
+++ b/apps/progs.h

@@ -46,6 +46,7 @@
 extern int ocsp_main(int argc,char *argv[]);
 extern int prime_main(int argc,char *argv[]);
 extern int ts_main(int argc,char *argv[]);
+extern int srp_main(int argc,char *argv[]);
 
 #define FUNC_TYPE_GENERAL	1
 #define FUNC_TYPE_MD		2
@@ -149,6 +150,9 @@
 #if 0 /* ANDROID */
 	{FUNC_TYPE_GENERAL,"ts",ts_main},
 #endif
+#ifndef OPENSSL_NO_SRP
+	{FUNC_TYPE_GENERAL,"srp",srp_main},
+#endif
 #ifndef OPENSSL_NO_MD2
 	{FUNC_TYPE_MD,"md2",dgst_main},
 #endif

diff --git a/apps/progs.pl b/apps/progs.pl
index de6fdea..39ca8f7 100644
--- a/apps/progs.pl
+++ b/apps/progs.pl

@@ -51,6 +51,8 @@
 		{ print "#ifndef OPENSSL_NO_CMS\n${str}#endif\n"; }
 	elsif ( ($_ =~ /^ocsp$/))
 		{ print "#ifndef OPENSSL_NO_OCSP\n${str}#endif\n"; }
+	elsif ( ($_ =~ /^srp$/))
+		{ print "#ifndef OPENSSL_NO_SRP\n${str}#endif\n"; }
 	else
 		{ print $str; }
 	}

diff --git a/apps/req.c b/apps/req.c
index 820cd18..8552658 100644
--- a/apps/req.c
+++ b/apps/req.c

@@ -165,7 +165,7 @@
 	EVP_PKEY_CTX *genctx = NULL;
 	const char *keyalg = NULL;
 	char *keyalgstr = NULL;
-	STACK_OF(OPENSSL_STRING) *pkeyopts = NULL;
+	STACK_OF(OPENSSL_STRING) *pkeyopts = NULL, *sigopts = NULL;
 	EVP_PKEY *pkey=NULL;
 	int i=0,badops=0,newreq=0,verbose=0,pkey_type=-1;
 	long newkey = -1;
@@ -310,6 +310,15 @@
 			if (!pkeyopts || !sk_OPENSSL_STRING_push(pkeyopts, *(++argv)))
 				goto bad;
 			}
+		else if (strcmp(*argv,"-sigopt") == 0)
+			{
+			if (--argc < 1)
+				goto bad;
+			if (!sigopts)
+				sigopts = sk_OPENSSL_STRING_new_null();
+			if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv)))
+				goto bad;
+			}
 		else if (strcmp(*argv,"-batch") == 0)
 			batch=1;
 		else if (strcmp(*argv,"-newhdr") == 0)
@@ -858,8 +867,9 @@
 					extensions);
 				goto end;
 				}
-			
-			if (!(i=X509_sign(x509ss,pkey,digest)))
+
+			i=do_X509_sign(bio_err, x509ss, pkey, digest, sigopts);
+			if (!i)
 				{
 				ERR_print_errors(bio_err);
 				goto end;
@@ -883,7 +893,8 @@
 					req_exts);
 				goto end;
 				}
-			if (!(i=X509_REQ_sign(req,pkey,digest)))
+			i=do_X509_REQ_sign(bio_err, req, pkey, digest, sigopts);
+			if (!i)
 				{
 				ERR_print_errors(bio_err);
 				goto end;
@@ -1084,6 +1095,8 @@
 		EVP_PKEY_CTX_free(genctx);
 	if (pkeyopts)
 		sk_OPENSSL_STRING_free(pkeyopts);
+	if (sigopts)
+		sk_OPENSSL_STRING_free(sigopts);
 #ifndef OPENSSL_NO_ENGINE
 	if (gen_eng)
 		ENGINE_free(gen_eng);
@@ -1756,3 +1769,68 @@
 #endif
 	return 1;
 	}
+
+static int do_sign_init(BIO *err, EVP_MD_CTX *ctx, EVP_PKEY *pkey,
+			const EVP_MD *md, STACK_OF(OPENSSL_STRING) *sigopts)
+	{
+	EVP_PKEY_CTX *pkctx = NULL;
+	int i;
+	EVP_MD_CTX_init(ctx);
+	if (!EVP_DigestSignInit(ctx, &pkctx, md, NULL, pkey))
+		return 0;
+	for (i = 0; i < sk_OPENSSL_STRING_num(sigopts); i++)
+		{
+		char *sigopt = sk_OPENSSL_STRING_value(sigopts, i);
+		if (pkey_ctrl_string(pkctx, sigopt) <= 0)
+			{
+			BIO_printf(err, "parameter error \"%s\"\n", sigopt);
+			ERR_print_errors(bio_err);
+			return 0;
+			}
+		}
+	return 1;
+	}
+
+int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
+			STACK_OF(OPENSSL_STRING) *sigopts)
+	{
+	int rv;
+	EVP_MD_CTX mctx;
+	EVP_MD_CTX_init(&mctx);
+	rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+	if (rv > 0)
+		rv = X509_sign_ctx(x, &mctx);
+	EVP_MD_CTX_cleanup(&mctx);
+	return rv > 0 ? 1 : 0;
+	}
+
+
+int do_X509_REQ_sign(BIO *err, X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md,
+			STACK_OF(OPENSSL_STRING) *sigopts)
+	{
+	int rv;
+	EVP_MD_CTX mctx;
+	EVP_MD_CTX_init(&mctx);
+	rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+	if (rv > 0)
+		rv = X509_REQ_sign_ctx(x, &mctx);
+	EVP_MD_CTX_cleanup(&mctx);
+	return rv > 0 ? 1 : 0;
+	}
+		
+	
+
+int do_X509_CRL_sign(BIO *err, X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md,
+			STACK_OF(OPENSSL_STRING) *sigopts)
+	{
+	int rv;
+	EVP_MD_CTX mctx;
+	EVP_MD_CTX_init(&mctx);
+	rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+	if (rv > 0)
+		rv = X509_CRL_sign_ctx(x, &mctx);
+	EVP_MD_CTX_cleanup(&mctx);
+	return rv > 0 ? 1 : 0;
+	}
+		
+	

diff --git a/apps/s_cb.c b/apps/s_cb.c
index c4f5512..2cd7337 100644
--- a/apps/s_cb.c
+++ b/apps/s_cb.c

@@ -357,6 +357,12 @@
 	case TLS1_VERSION:
 		str_version = "TLS 1.0 ";
 		break;
+	case TLS1_1_VERSION:
+		str_version = "TLS 1.1 ";
+		break;
+	case TLS1_2_VERSION:
+		str_version = "TLS 1.2 ";
+		break;
 	case DTLS1_VERSION:
 		str_version = "DTLS 1.0 ";
 		break;
@@ -549,6 +555,9 @@
 				case 114:
 					str_details2 = " bad_certificate_hash_value";
 					break;
+				case 115:
+					str_details2 = " unknown_psk_identity";
+					break;
 					}
 				}
 			}
@@ -597,6 +606,26 @@
 					}
 				}
 			}
+
+#ifndef OPENSSL_NO_HEARTBEATS
+		if (content_type == 24) /* Heartbeat */
+			{
+			str_details1 = ", Heartbeat";
+			
+			if (len > 0)
+				{
+				switch (((const unsigned char*)buf)[0])
+					{
+				case 1:
+					str_details1 = ", HeartbeatRequest";
+					break;
+				case 2:
+					str_details1 = ", HeartbeatResponse";
+					break;
+					}
+				}
+			}
+#endif
 		}
 
 	BIO_printf(bio, "%s %s%s [length %04lx]%s%s\n", str_write_p, str_version, str_content_type, (unsigned long)len, str_details1, str_details2);
@@ -657,6 +686,22 @@
 		extname = "status request";
 		break;
 
+		case TLSEXT_TYPE_user_mapping:
+		extname = "user mapping";
+		break;
+
+		case TLSEXT_TYPE_client_authz:
+		extname = "client authz";
+		break;
+
+		case TLSEXT_TYPE_server_authz:
+		extname = "server authz";
+		break;
+
+		case TLSEXT_TYPE_cert_type:
+		extname = "cert type";
+		break;
+
 		case TLSEXT_TYPE_elliptic_curves:
 		extname = "elliptic curves";
 		break;
@@ -665,12 +710,28 @@
 		extname = "EC point formats";
 		break;
 
-		case TLSEXT_TYPE_session_ticket:
-		extname = "server ticket";
+		case TLSEXT_TYPE_srp:
+		extname = "SRP";
 		break;
 
-		case TLSEXT_TYPE_renegotiate:
-		extname = "renegotiate";
+		case TLSEXT_TYPE_signature_algorithms:
+		extname = "signature algorithms";
+		break;
+
+		case TLSEXT_TYPE_use_srtp:
+		extname = "use SRTP";
+		break;
+
+		case TLSEXT_TYPE_heartbeat:
+		extname = "heartbeat";
+		break;
+
+		case TLSEXT_TYPE_session_ticket:
+		extname = "session ticket";
+		break;
+
+		case TLSEXT_TYPE_renegotiate: 
+		extname = "renegotiation info";
 		break;
 
 #ifdef TLSEXT_TYPE_opaque_prf_input
@@ -678,6 +739,11 @@
 		extname = "opaque PRF input";
 		break;
 #endif
+#ifdef TLSEXT_TYPE_next_proto_neg
+		case TLSEXT_TYPE_next_proto_neg:
+		extname = "next protocol";
+		break;
+#endif
 
 		default:
 		extname = "unknown";

diff --git a/apps/s_client.c b/apps/s_client.c
index 8122e3d..f2aacaf 100644
--- a/apps/s_client.c
+++ b/apps/s_client.c

@@ -163,6 +163,9 @@
 #include <openssl/rand.h>
 #include <openssl/ocsp.h>
 #include <openssl/bn.h>
+#ifndef OPENSSL_NO_SRP
+#include <openssl/srp.h>
+#endif
 #include "s_apps.h"
 #include "timeouts.h"
 
@@ -203,6 +206,9 @@
 static int c_msg=0;
 static int c_showcerts=0;
 
+static char *keymatexportlabel=NULL;
+static int keymatexportlen=20;
+
 static void sc_usage(void);
 static void print_stuff(BIO *berr,SSL *con,int full);
 #ifndef OPENSSL_NO_TLSEXT
@@ -316,12 +322,21 @@
 	BIO_printf(bio_err," -jpake arg    - JPAKE secret to use\n");
 # endif
 #endif
+#ifndef OPENSSL_NO_SRP
+	BIO_printf(bio_err," -srpuser user     - SRP authentification for 'user'\n");
+	BIO_printf(bio_err," -srppass arg      - password for 'user'\n");
+	BIO_printf(bio_err," -srp_lateuser     - SRP username into second ClientHello message\n");
+	BIO_printf(bio_err," -srp_moregroups   - Tolerate other than the known g N values.\n");
+	BIO_printf(bio_err," -srp_strength int - minimal mength in bits for N (default %d).\n",SRP_MINIMAL_N);
+#endif
 	BIO_printf(bio_err," -ssl2         - just use SSLv2\n");
 	BIO_printf(bio_err," -ssl3         - just use SSLv3\n");
+	BIO_printf(bio_err," -tls1_2       - just use TLSv1.2\n");
+	BIO_printf(bio_err," -tls1_1       - just use TLSv1.1\n");
 	BIO_printf(bio_err," -tls1         - just use TLSv1\n");
 	BIO_printf(bio_err," -dtls1        - just use DTLSv1\n");    
 	BIO_printf(bio_err," -mtu          - set the link layer MTU\n");
-	BIO_printf(bio_err," -no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n");
+	BIO_printf(bio_err," -no_tls1_2/-no_tls1_1/-no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n");
 	BIO_printf(bio_err," -bugs         - Switch on all SSL implementation bug workarounds\n");
 	BIO_printf(bio_err," -serverpref   - Use server's cipher preferences (only SSLv2)\n");
 	BIO_printf(bio_err," -cipher       - preferred cipher to use, use the 'openssl ciphers'\n");
@@ -342,12 +357,15 @@
 	BIO_printf(bio_err," -tlsextdebug      - hex dump of all TLS extensions received\n");
 	BIO_printf(bio_err," -status           - request certificate status from server\n");
 	BIO_printf(bio_err," -no_ticket        - disable use of RFC4507bis session tickets\n");
-# ifndef OPENSSL_NO_NEXTPROTONEG
+	BIO_printf(bio_err," -cutthrough       - enable 1-RTT full-handshake for strong ciphers\n");
+# if !defined(OPENSSL_NO_NEXTPROTONEG)
 	BIO_printf(bio_err," -nextprotoneg arg - enable NPN extension, considering named protocols supported (comma-separated list)\n");
 # endif
-	BIO_printf(bio_err," -cutthrough       - enable 1-RTT full-handshake for strong ciphers\n");
 #endif
 	BIO_printf(bio_err," -legacy_renegotiation - enable use of legacy renegotiation (dangerous)\n");
+	BIO_printf(bio_err," -use_srtp profiles - Offer SRTP key management with a colon-separated profile list\n");
+ 	BIO_printf(bio_err," -keymatexport label   - Export keying material using label\n");
+ 	BIO_printf(bio_err," -keymatexportlen len  - Export len bytes of keying material (default 20)\n");
 	}
 
 #ifndef OPENSSL_NO_TLSEXT
@@ -371,6 +389,122 @@
 	return SSL_TLSEXT_ERR_OK;
 	}
 
+#ifndef OPENSSL_NO_SRP
+
+/* This is a context that we pass to all callbacks */
+typedef struct srp_arg_st
+	{
+	char *srppassin;
+	char *srplogin;
+	int msg;   /* copy from c_msg */
+	int debug; /* copy from c_debug */
+	int amp;   /* allow more groups */
+	int strength /* minimal size for N */ ;
+	} SRP_ARG;
+
+#define SRP_NUMBER_ITERATIONS_FOR_PRIME 64
+
+static int srp_Verify_N_and_g(BIGNUM *N, BIGNUM *g)
+	{
+	BN_CTX *bn_ctx = BN_CTX_new();
+	BIGNUM *p = BN_new();
+	BIGNUM *r = BN_new();
+	int ret =
+		g != NULL && N != NULL && bn_ctx != NULL && BN_is_odd(N) &&
+		BN_is_prime_ex(N, SRP_NUMBER_ITERATIONS_FOR_PRIME, bn_ctx, NULL) &&
+		p != NULL && BN_rshift1(p, N) &&
+
+		/* p = (N-1)/2 */
+		BN_is_prime_ex(p, SRP_NUMBER_ITERATIONS_FOR_PRIME, bn_ctx, NULL) &&
+		r != NULL &&
+
+		/* verify g^((N-1)/2) == -1 (mod N) */
+		BN_mod_exp(r, g, p, N, bn_ctx) &&
+		BN_add_word(r, 1) &&
+		BN_cmp(r, N) == 0;
+
+	if(r)
+		BN_free(r);
+	if(p)
+		BN_free(p);
+	if(bn_ctx)
+		BN_CTX_free(bn_ctx);
+	return ret;
+	}
+
+/* This callback is used here for two purposes:
+   - extended debugging
+   - making some primality tests for unknown groups
+   The callback is only called for a non default group.
+
+   An application does not need the call back at all if
+   only the stanard groups are used.  In real life situations, 
+   client and server already share well known groups, 
+   thus there is no need to verify them. 
+   Furthermore, in case that a server actually proposes a group that
+   is not one of those defined in RFC 5054, it is more appropriate 
+   to add the group to a static list and then compare since 
+   primality tests are rather cpu consuming.
+*/
+
+static int MS_CALLBACK ssl_srp_verify_param_cb(SSL *s, void *arg)
+	{
+	SRP_ARG *srp_arg = (SRP_ARG *)arg;
+	BIGNUM *N = NULL, *g = NULL;
+	if (!(N = SSL_get_srp_N(s)) || !(g = SSL_get_srp_g(s)))
+		return 0;
+	if (srp_arg->debug || srp_arg->msg || srp_arg->amp == 1)
+		{
+    		BIO_printf(bio_err, "SRP parameters:\n"); 
+		BIO_printf(bio_err,"\tN="); BN_print(bio_err,N);
+		BIO_printf(bio_err,"\n\tg="); BN_print(bio_err,g);
+		BIO_printf(bio_err,"\n");
+		}
+
+	if (SRP_check_known_gN_param(g,N))
+		return 1;
+
+	if (srp_arg->amp == 1)
+		{
+		if (srp_arg->debug)
+			BIO_printf(bio_err, "SRP param N and g are not known params, going to check deeper.\n");
+
+/* The srp_moregroups is a real debugging feature.
+   Implementors should rather add the value to the known ones.
+   The minimal size has already been tested.
+*/
+		if (BN_num_bits(g) <= BN_BITS && srp_Verify_N_and_g(N,g))
+			return 1;
+		}	
+	BIO_printf(bio_err, "SRP param N and g rejected.\n");
+	return 0;
+	}
+
+#define PWD_STRLEN 1024
+
+static char * MS_CALLBACK ssl_give_srp_client_pwd_cb(SSL *s, void *arg)
+	{
+	SRP_ARG *srp_arg = (SRP_ARG *)arg;
+	char *pass = (char *)OPENSSL_malloc(PWD_STRLEN+1);
+	PW_CB_DATA cb_tmp;
+	int l;
+
+	cb_tmp.password = (char *)srp_arg->srppassin;
+	cb_tmp.prompt_info = "SRP user";
+	if ((l = password_callback(pass, PWD_STRLEN, 0, &cb_tmp))<0)
+		{
+		BIO_printf (bio_err, "Can't read Password\n");
+		OPENSSL_free(pass);
+		return NULL;
+		}
+	*(pass+l)= '\0';
+
+	return pass;
+	}
+
+#endif
+	char *srtp_profiles = NULL;
+
 # ifndef OPENSSL_NO_NEXTPROTONEG
 /* This the context that we pass to next_proto_cb */
 typedef struct tlsextnextprotoctx_st {
@@ -403,7 +537,7 @@
 	ctx->status = SSL_select_next_proto(out, outlen, in, inlen, ctx->data, ctx->len);
 	return SSL_TLSEXT_ERR_OK;
 	}
-# endif  /* ndef OPENSSL_NO_NEXTPROTONEG */
+# endif
 #endif
 
 enum
@@ -422,6 +556,9 @@
 	{
 	unsigned int off=0, clr=0;
 	SSL *con=NULL;
+#ifndef OPENSSL_NO_KRB5
+	KSSL_CTX *kctx;
+#endif
 	int s,k,width,state=0;
 	char *cbuf=NULL,*sbuf=NULL,*mbuf=NULL;
 	int cbuf_len,cbuf_off;
@@ -481,6 +618,11 @@
 #ifndef OPENSSL_NO_JPAKE
 	char *jpake_secret = NULL;
 #endif
+#ifndef OPENSSL_NO_SRP
+	char * srppass = NULL;
+	int srp_lateuser = 0;
+	SRP_ARG srp_arg = {NULL,NULL,0,0,0,1024};
+#endif
 
 #if !defined(OPENSSL_NO_SSL2) && !defined(OPENSSL_NO_SSL3)
 	meth=SSLv23_client_method();
@@ -630,6 +772,37 @@
                                 }
 			}
 #endif
+#ifndef OPENSSL_NO_SRP
+		else if (strcmp(*argv,"-srpuser") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srp_arg.srplogin= *(++argv);
+			meth=TLSv1_client_method();
+			}
+		else if (strcmp(*argv,"-srppass") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srppass= *(++argv);
+			meth=TLSv1_client_method();
+			}
+		else if (strcmp(*argv,"-srp_strength") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srp_arg.strength=atoi(*(++argv));
+			BIO_printf(bio_err,"SRP minimal length for N is %d\n",srp_arg.strength);
+			meth=TLSv1_client_method();
+			}
+		else if (strcmp(*argv,"-srp_lateuser") == 0)
+			{
+			srp_lateuser= 1;
+			meth=TLSv1_client_method();
+			}
+		else if	(strcmp(*argv,"-srp_moregroups") == 0)
+			{
+			srp_arg.amp=1;
+			meth=TLSv1_client_method();
+			}
+#endif
 #ifndef OPENSSL_NO_SSL2
 		else if	(strcmp(*argv,"-ssl2") == 0)
 			meth=SSLv2_client_method();
@@ -639,6 +812,10 @@
 			meth=SSLv3_client_method();
 #endif
 #ifndef OPENSSL_NO_TLS1
+		else if	(strcmp(*argv,"-tls1_2") == 0)
+			meth=TLSv1_2_client_method();
+		else if	(strcmp(*argv,"-tls1_1") == 0)
+			meth=TLSv1_1_client_method();
 		else if	(strcmp(*argv,"-tls1") == 0)
 			meth=TLSv1_client_method();
 #endif
@@ -687,6 +864,10 @@
 			if (--argc < 1) goto bad;
 			CAfile= *(++argv);
 			}
+		else if (strcmp(*argv,"-no_tls1_2") == 0)
+			off|=SSL_OP_NO_TLSv1_2;
+		else if (strcmp(*argv,"-no_tls1_1") == 0)
+			off|=SSL_OP_NO_TLSv1_1;
 		else if (strcmp(*argv,"-no_tls1") == 0)
 			off|=SSL_OP_NO_TLSv1;
 		else if (strcmp(*argv,"-no_ssl3") == 0)
@@ -774,7 +955,23 @@
 			jpake_secret = *++argv;
 			}
 #endif
-		else
+		else if (strcmp(*argv,"-use_srtp") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srtp_profiles = *(++argv);
+			}
+		else if (strcmp(*argv,"-keymatexport") == 0)
+			{
+			if (--argc < 1) goto bad;
+			keymatexportlabel= *(++argv);
+			}
+		else if (strcmp(*argv,"-keymatexportlen") == 0)
+			{
+			if (--argc < 1) goto bad;
+			keymatexportlen=atoi(*(++argv));
+			if (keymatexportlen == 0) goto bad;
+			}
+                else
 			{
 			BIO_printf(bio_err,"unknown option %s\n",*argv);
 			badop=1;
@@ -800,14 +997,13 @@
 			goto end;
 			}
 		psk_identity = "JPAKE";
+		if (cipher)
+			{
+			BIO_printf(bio_err, "JPAKE sets cipher to PSK\n");
+			goto end;
+			}
+		cipher = "PSK";
 		}
-
-	if (cipher)
-		{
-		BIO_printf(bio_err, "JPAKE sets cipher to PSK\n");
-		goto end;
-		}
-	cipher = "PSK";
 #endif
 
 	OpenSSL_add_ssl_algorithms();
@@ -901,6 +1097,14 @@
 			}
 		}
 
+#ifndef OPENSSL_NO_SRP
+	if(!app_passwd(bio_err, srppass, NULL, &srp_arg.srppassin, NULL))
+		{
+		BIO_printf(bio_err, "Error getting password\n");
+		goto end;
+		}
+#endif
+
 	ctx=SSL_CTX_new(meth);
 	if (ctx == NULL)
 		{
@@ -936,6 +1140,8 @@
 			BIO_printf(bio_c_out, "PSK key given or JPAKE in use, setting client callback\n");
 		SSL_CTX_set_psk_client_callback(ctx, psk_client_cb);
 		}
+	if (srtp_profiles != NULL)
+		SSL_CTX_set_tlsext_use_srtp(ctx, srtp_profiles);
 #endif
 	if (bugs)
 		SSL_CTX_set_options(ctx,SSL_OP_ALL|off);
@@ -949,6 +1155,11 @@
 	 */
 	if (socket_type == SOCK_DGRAM) SSL_CTX_set_read_ahead(ctx, 1);
 
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+	if (next_proto.data)
+		SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto);
+#endif
+
 	/* Enable handshake cutthrough for client connections using
 	 * strong ciphers. */
 	if (cutthrough)
@@ -958,11 +1169,6 @@
 		SSL_CTX_set_mode(ctx, ssl_mode);
 		}
 
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-	if (next_proto.data)
-		SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto);
-#endif
-
 	if (state) SSL_CTX_set_info_callback(ctx,apps_ssl_info_callback);
 	if (cipher != NULL)
 		if(!SSL_CTX_set_cipher_list(ctx,cipher)) {
@@ -994,6 +1200,24 @@
 		SSL_CTX_set_tlsext_servername_callback(ctx, ssl_servername_cb);
 		SSL_CTX_set_tlsext_servername_arg(ctx, &tlsextcbp);
 		}
+#ifndef OPENSSL_NO_SRP
+        if (srp_arg.srplogin)
+		{
+		if (!srp_lateuser && !SSL_CTX_set_srp_username(ctx, srp_arg.srplogin))
+			{
+			BIO_printf(bio_err,"Unable to set SRP username\n");
+			goto end;
+			}
+		srp_arg.msg = c_msg;
+		srp_arg.debug = c_debug ;
+		SSL_CTX_set_srp_cb_arg(ctx,&srp_arg);
+		SSL_CTX_set_srp_client_pwd_callback(ctx, ssl_give_srp_client_pwd_cb);
+		SSL_CTX_set_srp_strength(ctx, srp_arg.strength);
+		if (c_msg || c_debug || srp_arg.amp == 0)
+			SSL_CTX_set_srp_verify_param_callback(ctx, ssl_srp_verify_param_cb);
+		}
+
+#endif
 #endif
 
 	con=SSL_new(ctx);
@@ -1032,9 +1256,10 @@
 		}
 #endif
 #ifndef OPENSSL_NO_KRB5
-	if (con  &&  (con->kssl_ctx = kssl_ctx_new()) != NULL)
+	if (con  &&  (kctx = kssl_ctx_new()) != NULL)
                 {
-                kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVER, host);
+		SSL_set0_kssl_ctx(con, kctx);
+                kssl_ctx_setstring(kctx, KSSL_SERVER, host);
 		}
 #endif	/* OPENSSL_NO_KRB5  */
 /*	SSL_set_cipher_list(con,"RC4-MD5"); */
@@ -1066,7 +1291,7 @@
 			}
 		}
 #endif                                              
-	if (c_Pause & 0x01) con->debug=1;
+	if (c_Pause & 0x01) SSL_set_debug(con, 1);
 
 	if ( SSL_version(con) == DTLS1_VERSION)
 		{
@@ -1115,7 +1340,7 @@
 
 	if (c_debug)
 		{
-		con->debug=1;
+		SSL_set_debug(con, 1);
 		BIO_set_callback(sbio,bio_dump_callback);
 		BIO_set_callback_arg(sbio,(char *)bio_c_out);
 		}
@@ -1649,6 +1874,14 @@
 				SSL_renegotiate(con);
 				cbuf_len=0;
 				}
+#ifndef OPENSSL_NO_HEARTBEATS
+			else if ((!c_ign_eof) && (cbuf[0] == 'B'))
+ 				{
+				BIO_printf(bio_err,"HEARTBEATING\n");
+				SSL_heartbeat(con);
+				cbuf_len=0;
+				}
+#endif
 			else
 				{
 				cbuf_len=i;
@@ -1710,6 +1943,7 @@
 #ifndef OPENSSL_NO_COMP
 	const COMP_METHOD *comp, *expansion;
 #endif
+	unsigned char *exportedkeymat;
 
 	if (full)
 		{
@@ -1800,7 +2034,7 @@
 			BIO_number_read(SSL_get_rbio(s)),
 			BIO_number_written(SSL_get_wbio(s)));
 		}
-	BIO_printf(bio,((s->hit)?"---\nReused, ":"---\nNew, "));
+	BIO_printf(bio,(SSL_cache_hit(s)?"---\nReused, ":"---\nNew, "));
 	c=SSL_get_current_cipher(s);
 	BIO_printf(bio,"%s, Cipher is %s\n",
 		SSL_CIPHER_get_version(c),
@@ -1822,6 +2056,18 @@
 	BIO_printf(bio,"Expansion: %s\n",
 		expansion ? SSL_COMP_get_name(expansion) : "NONE");
 #endif
+ 
+#ifdef SSL_DEBUG
+	{
+	/* Print out local port of connection: useful for debugging */
+	int sock;
+	struct sockaddr_in ladd;
+	socklen_t ladd_size = sizeof(ladd);
+	sock = SSL_get_fd(s);
+	getsockname(sock, (struct sockaddr *)&ladd, &ladd_size);
+	BIO_printf(bio_c_out, "LOCAL PORT is %u\n", ntohs(ladd.sin_port));
+	}
+#endif
 
 #if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
 	if (next_proto.status != -1) {
@@ -1834,7 +2080,42 @@
 	}
 #endif
 
+ 	{
+ 	SRTP_PROTECTION_PROFILE *srtp_profile=SSL_get_selected_srtp_profile(s);
+ 
+	if(srtp_profile)
+		BIO_printf(bio,"SRTP Extension negotiated, profile=%s\n",
+			   srtp_profile->name);
+	}
+ 
 	SSL_SESSION_print(bio,SSL_get_session(s));
+	if (keymatexportlabel != NULL)
+		{
+		BIO_printf(bio, "Keying material exporter:\n");
+		BIO_printf(bio, "    Label: '%s'\n", keymatexportlabel);
+		BIO_printf(bio, "    Length: %i bytes\n", keymatexportlen);
+		exportedkeymat = OPENSSL_malloc(keymatexportlen);
+		if (exportedkeymat != NULL)
+			{
+			if (!SSL_export_keying_material(s, exportedkeymat,
+						        keymatexportlen,
+						        keymatexportlabel,
+						        strlen(keymatexportlabel),
+						        NULL, 0, 0))
+				{
+				BIO_printf(bio, "    Error\n");
+				}
+			else
+				{
+				BIO_printf(bio, "    Keying material: ");
+				for (i=0; i<keymatexportlen; i++)
+					BIO_printf(bio, "%02X",
+						   exportedkeymat[i]);
+				BIO_printf(bio, "\n");
+				}
+			OPENSSL_free(exportedkeymat);
+			}
+		}
 	BIO_printf(bio,"---\n");
 	if (peer != NULL)
 		X509_free(peer);

diff --git a/apps/s_server.c b/apps/s_server.c
index a43bd74..fe29b4c 100644
--- a/apps/s_server.c
+++ b/apps/s_server.c

@@ -186,6 +186,9 @@
 #ifndef OPENSSL_NO_RSA
 #include <openssl/rsa.h>
 #endif
+#ifndef OPENSSL_NO_SRP
+#include <openssl/srp.h>
+#endif
 #include "s_apps.h"
 #include "timeouts.h"
 
@@ -290,6 +293,9 @@
 static int s_msg=0;
 static int s_quiet=0;
 
+static char *keymatexportlabel=NULL;
+static int keymatexportlen=20;
+
 static int hack=0;
 #ifndef OPENSSL_NO_ENGINE
 static char *engine_id=NULL;
@@ -302,6 +308,7 @@
 static int cert_chain = 0;
 #endif
 
+
 #ifndef OPENSSL_NO_PSK
 static char *psk_identity="Client_identity";
 char *psk_key=NULL; /* by default PSK is not used */
@@ -369,6 +376,52 @@
         }
 #endif
 
+#ifndef OPENSSL_NO_SRP
+/* This is a context that we pass to callbacks */
+typedef struct srpsrvparm_st
+	{
+	char *login;
+	SRP_VBASE *vb;
+	SRP_user_pwd *user;
+	} srpsrvparm;
+
+/* This callback pretends to require some asynchronous logic in order to obtain
+   a verifier. When the callback is called for a new connection we return
+   with a negative value. This will provoke the accept etc to return with
+   an LOOKUP_X509. The main logic of the reinvokes the suspended call 
+   (which would normally occur after a worker has finished) and we
+   set the user parameters. 
+*/
+static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
+	{
+	srpsrvparm *p = (srpsrvparm *)arg;
+	if (p->login == NULL && p->user == NULL )
+		{
+		p->login = SSL_get_srp_username(s);
+		BIO_printf(bio_err, "SRP username = \"%s\"\n", p->login);
+		return (-1) ;
+		}
+
+	if (p->user == NULL)
+		{
+		BIO_printf(bio_err, "User %s doesn't exist\n", p->login);
+		return SSL3_AL_FATAL;
+		}
+	if (SSL_set_srp_server_param(s, p->user->N, p->user->g, p->user->s, p->user->v,
+				     p->user->info) < 0)
+		{
+		*ad = SSL_AD_INTERNAL_ERROR;
+		return SSL3_AL_FATAL;
+		}
+	BIO_printf(bio_err, "SRP parameters set: username = \"%s\" info=\"%s\" \n", p->login,p->user->info);
+	/* need to check whether there are memory leaks */
+	p->user = NULL;
+	p->login = NULL;
+	return SSL_ERROR_NONE;
+	}
+
+#endif
+
 #ifdef MONOLITH
 static void s_server_init(void)
 	{
@@ -456,8 +509,14 @@
 	BIO_printf(bio_err," -jpake arg    - JPAKE secret to use\n");
 # endif
 #endif
+#ifndef OPENSSL_NO_SRP
+	BIO_printf(bio_err," -srpvfile file      - The verifier file for SRP\n");
+	BIO_printf(bio_err," -srpuserseed string - A seed string for a default user salt.\n");
+#endif
 	BIO_printf(bio_err," -ssl2         - Just talk SSLv2\n");
 	BIO_printf(bio_err," -ssl3         - Just talk SSLv3\n");
+	BIO_printf(bio_err," -tls1_2       - Just talk TLSv1.2\n");
+	BIO_printf(bio_err," -tls1_1       - Just talk TLSv1.1\n");
 	BIO_printf(bio_err," -tls1         - Just talk TLSv1\n");
 	BIO_printf(bio_err," -dtls1        - Just talk DTLSv1\n");
 	BIO_printf(bio_err," -timeout      - Enable timeouts\n");
@@ -466,6 +525,8 @@
 	BIO_printf(bio_err," -no_ssl2      - Just disable SSLv2\n");
 	BIO_printf(bio_err," -no_ssl3      - Just disable SSLv3\n");
 	BIO_printf(bio_err," -no_tls1      - Just disable TLSv1\n");
+	BIO_printf(bio_err," -no_tls1_1    - Just disable TLSv1.1\n");
+	BIO_printf(bio_err," -no_tls1_2    - Just disable TLSv1.2\n");
 #ifndef OPENSSL_NO_DH
 	BIO_printf(bio_err," -no_dhe       - Disable ephemeral DH\n");
 #endif
@@ -495,7 +556,10 @@
 # ifndef OPENSSL_NO_NEXTPROTONEG
 	BIO_printf(bio_err," -nextprotoneg arg - set the advertised protocols for the NPN extension (comma-separated list)\n");
 # endif
+        BIO_printf(bio_err," -use_srtp profiles - Offer SRTP key management with a colon-separated profile list\n");
 #endif
+	BIO_printf(bio_err," -keymatexport label   - Export keying material using label\n");
+	BIO_printf(bio_err," -keymatexportlen len  - Export len bytes of keying material (default 20)\n");
 	}
 
 static int local_argc=0;
@@ -846,7 +910,9 @@
 
 	return SSL_TLSEXT_ERR_OK;
 	}
-# endif  /* ndef OPENSSL_NO_NPN */
+# endif  /* ndef OPENSSL_NO_NEXTPROTONEG */
+
+
 #endif
 
 int MAIN(int, char **);
@@ -854,6 +920,10 @@
 #ifndef OPENSSL_NO_JPAKE
 static char *jpake_secret = NULL;
 #endif
+#ifndef OPENSSL_NO_SRP
+	static srpsrvparm srp_callback_parm;
+#endif
+static char *srtp_profiles = NULL;
 
 int MAIN(int argc, char *argv[])
 	{
@@ -885,8 +955,6 @@
 #ifndef OPENSSL_NO_TLSEXT
 	EVP_PKEY *s_key2 = NULL;
 	X509 *s_cert2 = NULL;
-#endif
-#ifndef OPENSSL_NO_TLSEXT
         tlsextctx tlsextcbp = {NULL, NULL, SSL_TLSEXT_ERR_ALERT_WARNING};
 # ifndef OPENSSL_NO_NEXTPROTONEG
 	const char *next_proto_neg_in = NULL;
@@ -897,12 +965,20 @@
 	/* by default do not send a PSK identity hint */
 	static char *psk_identity_hint=NULL;
 #endif
+#ifndef OPENSSL_NO_SRP
+	char *srpuserseed = NULL;
+	char *srp_verifier_file = NULL;
+#endif
 #if !defined(OPENSSL_NO_SSL2) && !defined(OPENSSL_NO_SSL3)
 	meth=SSLv23_server_method();
 #elif !defined(OPENSSL_NO_SSL3)
 	meth=SSLv3_server_method();
 #elif !defined(OPENSSL_NO_SSL2)
 	meth=SSLv2_server_method();
+#elif !defined(OPENSSL_NO_TLS1)
+	meth=TLSv1_server_method();
+#else
+  /*  #error no SSL version enabled */
 #endif
 
 	local_argc=argc;
@@ -1135,6 +1211,20 @@
 				}
 			}
 #endif
+#ifndef OPENSSL_NO_SRP
+		else if (strcmp(*argv, "-srpvfile") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srp_verifier_file = *(++argv);
+			meth=TLSv1_server_method();
+			}
+		else if (strcmp(*argv, "-srpuserseed") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srpuserseed = *(++argv);
+			meth=TLSv1_server_method();
+			}
+#endif
 		else if	(strcmp(*argv,"-www") == 0)
 			{ www=1; }
 		else if	(strcmp(*argv,"-WWW") == 0)
@@ -1147,6 +1237,10 @@
 			{ off|=SSL_OP_NO_SSLv3; }
 		else if	(strcmp(*argv,"-no_tls1") == 0)
 			{ off|=SSL_OP_NO_TLSv1; }
+		else if	(strcmp(*argv,"-no_tls1_1") == 0)
+			{ off|=SSL_OP_NO_TLSv1_1; }
+		else if	(strcmp(*argv,"-no_tls1_2") == 0)
+			{ off|=SSL_OP_NO_TLSv1_2; }
 		else if	(strcmp(*argv,"-no_comp") == 0)
 			{ off|=SSL_OP_NO_COMPRESSION; }
 #ifndef OPENSSL_NO_TLSEXT
@@ -1164,6 +1258,10 @@
 #ifndef OPENSSL_NO_TLS1
 		else if	(strcmp(*argv,"-tls1") == 0)
 			{ meth=TLSv1_server_method(); }
+		else if	(strcmp(*argv,"-tls1_1") == 0)
+			{ meth=TLSv1_1_server_method(); }
+		else if	(strcmp(*argv,"-tls1_2") == 0)
+			{ meth=TLSv1_2_server_method(); }
 #endif
 #ifndef OPENSSL_NO_DTLS1
 		else if	(strcmp(*argv,"-dtls1") == 0)
@@ -1231,6 +1329,22 @@
 			jpake_secret = *(++argv);
 			}
 #endif
+		else if (strcmp(*argv,"-use_srtp") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srtp_profiles = *(++argv);
+			}
+		else if (strcmp(*argv,"-keymatexport") == 0)
+			{
+			if (--argc < 1) goto bad;
+			keymatexportlabel= *(++argv);
+			}
+		else if (strcmp(*argv,"-keymatexportlen") == 0)
+			{
+			if (--argc < 1) goto bad;
+			keymatexportlen=atoi(*(++argv));
+			if (keymatexportlen == 0) goto bad;
+			}
 		else
 			{
 			BIO_printf(bio_err,"unknown option %s\n",*argv);
@@ -1327,6 +1441,22 @@
 				goto end;
 				}
 			}
+
+# ifndef OPENSSL_NO_NEXTPROTONEG
+		if (next_proto_neg_in)
+			{
+			unsigned short len;
+			next_proto.data = next_protos_parse(&len,
+				next_proto_neg_in);
+			if (next_proto.data == NULL)
+				goto end;
+			next_proto.len = len;
+			}
+		else
+			{
+			next_proto.data = NULL;
+			}
+# endif
 #endif
 		}
 
@@ -1430,6 +1560,9 @@
 	else
 		SSL_CTX_sess_set_cache_size(ctx,128);
 
+	if (srtp_profiles != NULL)
+		SSL_CTX_set_tlsext_use_srtp(ctx, srtp_profiles);
+
 #if 0
 	if (cipher == NULL) cipher=getenv("SSL_CIPHER");
 #endif
@@ -1653,21 +1786,6 @@
 					goto end;
 					}
 				}
-# ifndef OPENSSL_NO_NEXTPROTONEG
-		if (next_proto_neg_in)
-			{
-			unsigned short len;
-			next_proto.data = next_protos_parse(&len,
-				next_proto_neg_in);
-			if (next_proto.data == NULL)
-				goto end;
-			next_proto.len = len;
-			}
-		else
-			{
-			next_proto.data = NULL;
-			}
-# endif
 #endif
 		RSA_free(rsa);
 		BIO_printf(bio_s_out,"\n");
@@ -1735,6 +1853,25 @@
 		}
 #endif
 
+#ifndef OPENSSL_NO_SRP
+	if (srp_verifier_file != NULL)
+		{
+		srp_callback_parm.vb = SRP_VBASE_new(srpuserseed);
+		srp_callback_parm.user = NULL;
+		srp_callback_parm.login = NULL;
+		if ((ret = SRP_VBASE_init(srp_callback_parm.vb, srp_verifier_file)) != SRP_NO_ERROR)
+			{
+			BIO_printf(bio_err,
+				   "Cannot initialize SRP verifier file \"%s\":ret=%d\n",
+				   srp_verifier_file, ret);
+				goto end;
+			}
+		SSL_CTX_set_verify(ctx, SSL_VERIFY_NONE,verify_callback);
+		SSL_CTX_set_srp_cb_arg(ctx, &srp_callback_parm);  			
+		SSL_CTX_set_srp_username_callback(ctx, ssl_srp_server_param_cb);
+		}
+	else
+#endif
 	if (CAfile != NULL)
 		{
 		SSL_CTX_set_client_CA_list(ctx,SSL_load_client_CA_file(CAfile));
@@ -1816,6 +1953,9 @@
 	unsigned long l;
 	SSL *con=NULL;
 	BIO *sbio;
+#ifndef OPENSSL_NO_KRB5
+	KSSL_CTX *kctx;
+#endif
 	struct timeval timeout;
 #if defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_NETWARE) || defined(OPENSSL_SYS_BEOS_R5)
 	struct timeval tv;
@@ -1856,12 +1996,11 @@
 		}
 #endif
 #ifndef OPENSSL_NO_KRB5
-		if ((con->kssl_ctx = kssl_ctx_new()) != NULL)
+		if ((kctx = kssl_ctx_new()) != NULL)
                         {
-                        kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVICE,
-								KRB5SVC);
-                        kssl_ctx_setstring(con->kssl_ctx, KSSL_KEYTAB,
-								KRB5KEYTAB);
+			SSL_set0_kssl_ctx(con, kctx);
+                        kssl_ctx_setstring(kctx, KSSL_SERVICE, KRB5SVC);
+                        kssl_ctx_setstring(kctx, KSSL_KEYTAB, KRB5KEYTAB);
                         }
 #endif	/* OPENSSL_NO_KRB5 */
 		if(context)
@@ -1924,7 +2063,7 @@
 
 	if (s_debug)
 		{
-		con->debug=1;
+		SSL_set_debug(con, 1);
 		BIO_set_callback(SSL_get_rbio(con),bio_dump_callback);
 		BIO_set_callback_arg(SSL_get_rbio(con),(char *)bio_s_out);
 		}
@@ -2053,6 +2192,16 @@
 					goto err;
 					}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+				if ((buf[0] == 'B') &&
+					((buf[1] == '\n') || (buf[1] == '\r')))
+					{
+					BIO_printf(bio_err,"HEARTBEATING\n");
+					SSL_heartbeat(con);
+					i=0;
+					continue;
+					}
+#endif
 				if ((buf[0] == 'r') && 
 					((buf[1] == '\n') || (buf[1] == '\r')))
 					{
@@ -2096,6 +2245,18 @@
 { static count=0; if (++count == 100) { count=0; SSL_renegotiate(con); } }
 #endif
 				k=SSL_write(con,&(buf[l]),(unsigned int)i);
+#ifndef OPENSSL_NO_SRP
+				while (SSL_get_error(con,k) == SSL_ERROR_WANT_X509_LOOKUP)
+					{
+					BIO_printf(bio_s_out,"LOOKUP renego during write\n");
+					srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login); 
+					if (srp_callback_parm.user) 
+						BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+					else 
+						BIO_printf(bio_s_out,"LOOKUP not successful\n");
+						k=SSL_write(con,&(buf[l]),(unsigned int)i);
+					}
+#endif
 				switch (SSL_get_error(con,k))
 					{
 				case SSL_ERROR_NONE:
@@ -2143,6 +2304,18 @@
 				{
 again:	
 				i=SSL_read(con,(char *)buf,bufsize);
+#ifndef OPENSSL_NO_SRP
+				while (SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP)
+					{
+					BIO_printf(bio_s_out,"LOOKUP renego during read\n");
+					srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login); 
+					if (srp_callback_parm.user) 
+						BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+					else 
+						BIO_printf(bio_s_out,"LOOKUP not successful\n");
+					i=SSL_read(con,(char *)buf,bufsize);
+					}
+#endif
 				switch (SSL_get_error(con,i))
 					{
 				case SSL_ERROR_NONE:
@@ -2155,7 +2328,6 @@
 					break;
 				case SSL_ERROR_WANT_WRITE:
 				case SSL_ERROR_WANT_READ:
-				case SSL_ERROR_WANT_X509_LOOKUP:
 					BIO_printf(bio_s_out,"Read BLOCK\n");
 					break;
 				case SSL_ERROR_SYSCALL:
@@ -2210,12 +2382,30 @@
 	X509 *peer;
 	long verify_error;
 	MS_STATIC char buf[BUFSIZ];
+#ifndef OPENSSL_NO_KRB5
+	char *client_princ;
+#endif
 #if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
 	const unsigned char *next_proto_neg;
 	unsigned next_proto_neg_len;
 #endif
+	unsigned char *exportedkeymat;
 
-	if ((i=SSL_accept(con)) <= 0)
+
+	i=SSL_accept(con);
+#ifndef OPENSSL_NO_SRP
+	while (i <= 0 &&  SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP) 
+		{
+			BIO_printf(bio_s_out,"LOOKUP during accept %s\n",srp_callback_parm.login);
+			srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login); 
+			if (srp_callback_parm.user) 
+				BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+			else 
+				BIO_printf(bio_s_out,"LOOKUP not successful\n");
+			i=SSL_accept(con);
+		}
+#endif
+	if (i <= 0)
 		{
 		if (BIO_sock_should_retry(i))
 			{
@@ -2262,19 +2452,58 @@
 		BIO_printf(bio_s_out, "\n");
 		}
 #endif
-	if (con->hit) BIO_printf(bio_s_out,"Reused session-id\n");
+	{
+	SRTP_PROTECTION_PROFILE *srtp_profile
+	  = SSL_get_selected_srtp_profile(con);
+
+	if(srtp_profile)
+		BIO_printf(bio_s_out,"SRTP Extension negotiated, profile=%s\n",
+			   srtp_profile->name);
+	}
+	if (SSL_cache_hit(con)) BIO_printf(bio_s_out,"Reused session-id\n");
 	if (SSL_ctrl(con,SSL_CTRL_GET_FLAGS,0,NULL) &
 		TLS1_FLAGS_TLS_PADDING_BUG)
-		BIO_printf(bio_s_out,"Peer has incorrect TLSv1 block padding\n");
+		BIO_printf(bio_s_out,
+			   "Peer has incorrect TLSv1 block padding\n");
 #ifndef OPENSSL_NO_KRB5
-	if (con->kssl_ctx->client_princ != NULL)
+	client_princ = kssl_ctx_get0_client_princ(SSL_get0_kssl_ctx(con));
+	if (client_princ != NULL)
 		{
 		BIO_printf(bio_s_out,"Kerberos peer principal is %s\n",
-			con->kssl_ctx->client_princ);
+								client_princ);
 		}
 #endif /* OPENSSL_NO_KRB5 */
 	BIO_printf(bio_s_out, "Secure Renegotiation IS%s supported\n",
 		      SSL_get_secure_renegotiation_support(con) ? "" : " NOT");
+	if (keymatexportlabel != NULL)
+		{
+		BIO_printf(bio_s_out, "Keying material exporter:\n");
+		BIO_printf(bio_s_out, "    Label: '%s'\n", keymatexportlabel);
+		BIO_printf(bio_s_out, "    Length: %i bytes\n",
+			   keymatexportlen);
+		exportedkeymat = OPENSSL_malloc(keymatexportlen);
+		if (exportedkeymat != NULL)
+			{
+			if (!SSL_export_keying_material(con, exportedkeymat,
+						        keymatexportlen,
+						        keymatexportlabel,
+						        strlen(keymatexportlabel),
+						        NULL, 0, 0))
+				{
+				BIO_printf(bio_s_out, "    Error\n");
+				}
+			else
+				{
+				BIO_printf(bio_s_out, "    Keying material: ");
+				for (i=0; i<keymatexportlen; i++)
+					BIO_printf(bio_s_out, "%02X",
+						   exportedkeymat[i]);
+				BIO_printf(bio_s_out, "\n");
+				}
+			OPENSSL_free(exportedkeymat);
+			}
+		}
+
 	return(1);
 	}
 
@@ -2292,6 +2521,9 @@
 	return(ret);
 	}
 #endif
+#ifndef OPENSSL_NO_KRB5
+	char *client_princ;
+#endif
 
 #if 0
 static int load_CA(SSL_CTX *ctx, char *file)
@@ -2322,6 +2554,9 @@
 	SSL *con;
 	const SSL_CIPHER *c;
 	BIO *io,*ssl_bio,*sbio;
+#ifndef OPENSSL_NO_KRB5
+	KSSL_CTX *kctx;
+#endif
 
 	buf=OPENSSL_malloc(bufsize);
 	if (buf == NULL) return(0);
@@ -2353,10 +2588,10 @@
 			}
 #endif
 #ifndef OPENSSL_NO_KRB5
-	if ((con->kssl_ctx = kssl_ctx_new()) != NULL)
+	if ((kctx = kssl_ctx_new()) != NULL)
 		{
-		kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVICE, KRB5SVC);
-		kssl_ctx_setstring(con->kssl_ctx, KSSL_KEYTAB, KRB5KEYTAB);
+		kssl_ctx_setstring(kctx, KSSL_SERVICE, KRB5SVC);
+		kssl_ctx_setstring(kctx, KSSL_KEYTAB, KRB5KEYTAB);
 		}
 #endif	/* OPENSSL_NO_KRB5 */
 	if(context) SSL_set_session_id_context(con, context,
@@ -2382,7 +2617,7 @@
 
 	if (s_debug)
 		{
-		con->debug=1;
+		SSL_set_debug(con, 1);
 		BIO_set_callback(SSL_get_rbio(con),bio_dump_callback);
 		BIO_set_callback_arg(SSL_get_rbio(con),(char *)bio_s_out);
 		}
@@ -2397,7 +2632,18 @@
 		if (hack)
 			{
 			i=SSL_accept(con);
-
+#ifndef OPENSSL_NO_SRP
+			while (i <= 0 &&  SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP) 
+		{
+			BIO_printf(bio_s_out,"LOOKUP during accept %s\n",srp_callback_parm.login);
+			srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login); 
+			if (srp_callback_parm.user) 
+				BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+			else 
+				BIO_printf(bio_s_out,"LOOKUP not successful\n");
+			i=SSL_accept(con);
+		}
+#endif
 			switch (SSL_get_error(con,i))
 				{
 			case SSL_ERROR_NONE:
@@ -2503,7 +2749,7 @@
 					}
 				BIO_puts(io,"\n");
 				}
-			BIO_printf(io,((con->hit)
+			BIO_printf(io,(SSL_cache_hit(con)
 				?"---\nReused, "
 				:"---\nNew, "));
 			c=SSL_get_current_cipher(con);

diff --git a/apps/s_socket.c b/apps/s_socket.c
index c08544a..380efdb 100644
--- a/apps/s_socket.c
+++ b/apps/s_socket.c

@@ -238,11 +238,10 @@
 	{
 	unsigned char ip[4];
 
+	memset(ip, '\0', sizeof ip);
 	if (!host_ip(host,&(ip[0])))
-		{
-		return(0);
-		}
-	return(init_client_ip(sock,ip,port,type));
+		return 0;
+	return init_client_ip(sock,ip,port,type);
 	}
 
 static int init_client_ip(int *sock, unsigned char ip[4], int port, int type)

diff --git a/apps/server.pem b/apps/server.pem
index 56248e5..d0fc265 100644
--- a/apps/server.pem
+++ b/apps/server.pem

@@ -1,369 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (512 bit)
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
 -----BEGIN CERTIFICATE-----
-MIIB6TCCAVICAQYwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNMDAxMDE2MjIzMTAzWhcNMDMwMTE0
-MjIzMTAzWjBjMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxIzAhBgNVBAMTGlNlcnZlciB0ZXN0IGNl
-cnQgKDUxMiBiaXQpMFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBAJ+zw4Qnlf8SMVIP
-Fe9GEcStgOY2Ww/dgNdhjeD8ckUJNP5VZkVDTGiXav6ooKXfX3j/7tdkuD8Ey2//
-Kv7+ue0CAwEAATANBgkqhkiG9w0BAQQFAAOBgQCT0grFQeZaqYb5EYfk20XixZV4
-GmyAbXMftG1Eo7qGiMhYzRwGNWxEYojf5PZkYZXvSqZ/ZXHXa4g59jK/rJNnaVGM
-k+xIX8mxQvlV0n5O9PIha5BX5teZnkHKgL8aKKLKW1BK7YTngsfSzzaeame5iKfz
-itAE+OjGF+PFKbwX8Q==
+MIID5zCCAs+gAwIBAgIJALnu1NlVpZ6zMA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZDELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxGTAXBgNVBAMMEFRlc3QgU2VydmVyIENlcnQw
+ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDzhPOSNtyyRspmeuUpxfNJ
+KCLTuf7g3uQ4zu4iHOmRO5TQci+HhVlLZrHF9XqFXcIP0y4pWDbMSGuiorUmzmfi
+R7bfSdI/+qIQt8KXRH6HNG1t8ou0VSvWId5TS5Dq/er5ODUr9OaaDva7EquHIcMv
+vPQGuI+OEAcnleVCy9HVEIySrO4P3CNIicnGkwwiAud05yUAq/gPXBC1hTtmlPD7
+TVcGVSEiJdvzqqlgv02qedGrkki6GY4S7GjZxrrf7Foc2EP+51LJzwLQx3/JfrCU
+41NEWAsu/Sl0tQabXESN+zJ1pDqoZ3uHMgpQjeGiE0olr+YcsSW/tJmiU9OiAr8R
+AgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJYIZI
+AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW
+BBSCvM8AABPR9zklmifnr9LvIBturDAfBgNVHSMEGDAWgBQ2w2yI55X+sL3szj49
+hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEAqb1NV0B0/pbpK9Z4/bNjzPQLTRLK
+WnSNm/Jh5v0GEUOE/Beg7GNjNrmeNmqxAlpqWz9qoeoFZax+QBpIZYjROU3TS3fp
+yLsrnlr0CDQ5R7kCCDGa8dkXxemmpZZLbUCpW2Uoy8sAA4JjN9OtsZY7dvUXFgJ7
+vVNTRnI01ghknbtD+2SxSQd3CWF6QhcRMAzZJ1z1cbbwGDDzfvGFPzJ+Sq+zEPds
+xoVLLSetCiBc+40ZcDS5dV98h9XD7JMTQfxzA7mNGv73JoZJA6nFgj+ADSlJsY/t
+JBv+z1iQRueoh9Qeee+ZbRifPouCB8FDx+AltvHTANdAq0t/K3o+pplMVA==
 -----END CERTIFICATE-----
 -----BEGIN RSA PRIVATE KEY-----
-MIIBPAIBAAJBAJ+zw4Qnlf8SMVIPFe9GEcStgOY2Ww/dgNdhjeD8ckUJNP5VZkVD
-TGiXav6ooKXfX3j/7tdkuD8Ey2//Kv7+ue0CAwEAAQJAN6W31vDEP2DjdqhzCDDu
-OA4NACqoiFqyblo7yc2tM4h4xMbC3Yx5UKMN9ZkCtX0gzrz6DyF47bdKcWBzNWCj
-gQIhANEoojVt7hq+SQ6MCN6FTAysGgQf56Q3TYoJMoWvdiXVAiEAw3e3rc+VJpOz
-rHuDo6bgpjUAAXM+v3fcpsfZSNO6V7kCIQCtbVjanpUwvZkMI9by02oUk9taki3b
-PzPfAfNPYAbCJQIhAJXNQDWyqwn/lGmR11cqY2y9nZ1+5w3yHGatLrcDnQHxAiEA
-vnlEGo8K85u+KwIOimM48ZG8oTk7iFdkqLJR1utT3aU=
+MIIEpAIBAAKCAQEA84TzkjbcskbKZnrlKcXzSSgi07n+4N7kOM7uIhzpkTuU0HIv
+h4VZS2axxfV6hV3CD9MuKVg2zEhroqK1Js5n4ke230nSP/qiELfCl0R+hzRtbfKL
+tFUr1iHeU0uQ6v3q+Tg1K/Tmmg72uxKrhyHDL7z0BriPjhAHJ5XlQsvR1RCMkqzu
+D9wjSInJxpMMIgLndOclAKv4D1wQtYU7ZpTw+01XBlUhIiXb86qpYL9NqnnRq5JI
+uhmOEuxo2ca63+xaHNhD/udSyc8C0Md/yX6wlONTRFgLLv0pdLUGm1xEjfsydaQ6
+qGd7hzIKUI3hohNKJa/mHLElv7SZolPTogK/EQIDAQABAoIBAADq9FwNtuE5IRQn
+zGtO4q7Y5uCzZ8GDNYr9RKp+P2cbuWDbvVAecYq2NV9QoIiWJOAYZKklOvekIju3
+r0UZLA0PRiIrTg6NrESx3JrjWDK8QNlUO7CPTZ39/K+FrmMkV9lem9yxjJjyC34D
+AQB+YRTx+l14HppjdxNwHjAVQpIx/uO2F5xAMuk32+3K+pq9CZUtrofe1q4Agj9R
+5s8mSy9pbRo9kW9wl5xdEotz1LivFOEiqPUJTUq5J5PeMKao3vdK726XI4Z455Nm
+W2/MA0YV0ug2FYinHcZdvKM6dimH8GLfa3X8xKRfzjGjTiMSwsdjgMa4awY3tEHH
+674jhAECgYEA/zqMrc0zsbNk83sjgaYIug5kzEpN4ic020rSZsmQxSCerJTgNhmg
+utKSCt0Re09Jt3LqG48msahX8ycqDsHNvlEGPQSbMu9IYeO3Wr3fAm75GEtFWePY
+BhM73I7gkRt4s8bUiUepMG/wY45c5tRF23xi8foReHFFe9MDzh8fJFECgYEA9EFX
+4qAik1pOJGNei9BMwmx0I0gfVEIgu0tzeVqT45vcxbxr7RkTEaDoAG6PlbWP6D9a
+WQNLp4gsgRM90ZXOJ4up5DsAWDluvaF4/omabMA+MJJ5kGZ0gCj5rbZbKqUws7x8
+bp+6iBfUPJUbcqNqFmi/08Yt7vrDnMnyMw2A/sECgYEAiiuRMxnuzVm34hQcsbhH
+6ymVqf7j0PW2qK0F4H1ocT9qhzWFd+RB3kHWrCjnqODQoI6GbGr/4JepHUpre1ex
+4UEN5oSS3G0ru0rC3U4C59dZ5KwDHFm7ffZ1pr52ljfQDUsrjjIMRtuiwNK2OoRa
+WSsqiaL+SDzSB+nBmpnAizECgYBdt/y6rerWUx4MhDwwtTnel7JwHyo2MDFS6/5g
+n8qC2Lj6/fMDRE22w+CA2esp7EJNQJGv+b27iFpbJEDh+/Lf5YzIT4MwVskQ5bYB
+JFcmRxUVmf4e09D7o705U/DjCgMH09iCsbLmqQ38ONIRSHZaJtMDtNTHD1yi+jF+
+OT43gQKBgQC/2OHZoko6iRlNOAQ/tMVFNq7fL81GivoQ9F1U0Qr+DH3ZfaH8eIkX
+xT0ToMPJUzWAn8pZv0snA0um6SIgvkCuxO84OkANCVbttzXImIsL7pFzfcwV/ERK
+UM6j0ZuSMFOCr/lGPAoOQU0fskidGEHi1/kW+suSr28TqsyYZpwBDQ==
 -----END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
------END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
-	/OU=Certification Services Division/CN=Thawte Server CA
-	/[email protected]
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
-	/OU=Certification Services Division/CN=Thawte Server CA
-	/[email protected]
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8= 
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----

diff --git a/apps/server2.pem b/apps/server2.pem
index 8bb6641..a3927cf 100644
--- a/apps/server2.pem
+++ b/apps/server2.pem

@@ -1,376 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (1024 bit)
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert #2
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
 -----BEGIN CERTIFICATE-----
-MIICLjCCAZcCAQEwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU0WhcNOTgwNjA5
-MTM1NzU0WjBkMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxJDAiBgNVBAMTG1NlcnZlciB0ZXN0IGNl
-cnQgKDEwMjQgYml0KTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAsxH1PBPm
-RkxrR11eV4bzNi4N9n11CI8nV29+ARlT1+qDe/mjVUvXlmsr1v/vf71G9GgqopSa
-6RXrICLVdk/FYYYzhPvl1M+OrjaXDFO8BzBAF1Lnz6c7aRZvGRJNrRSr2nZEkqDf
-JW9dY7r2VZEpD5QeuaRYUnuECkqeieB65GMCAwEAATANBgkqhkiG9w0BAQQFAAOB
-gQCWsOta6C0wiVzXz8wPmJKyTrurMlgUss2iSuW9366iwofZddsNg7FXniMzkIf6
-dp7jnmWZwKZ9cXsNUS2o4OL07qOk2HOywC0YsNZQsOBu1CBTYYkIefDiKFL1zQHh
-8lwwNd4NP+OE3NzUNkCfh4DnFfg9WHkXUlD5UpxNRJ4gJA==
+MIID6jCCAtKgAwIBAgIJALnu1NlVpZ60MA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZzELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxHDAaBgNVBAMME1Rlc3QgU2VydmVyIENlcnQg
+IzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDrdi7j9yctG+L4EjBy
+gjPmEqZzOJEQba26MoQGzglU7e5Xf59Rb/hgVQuKAoiZe7/R8rK4zJ4W7iXdXw0L
+qBpyG8B5aGKeI32w+A9TcBApoXXL2CrYQEQjZwUIpLlYBIi2NkJj3nVkq5dgl1gO
+ALiQ+W8jg3kzg5Ec9rimp9r93N8wsSL3awsafurmYCvOf7leHaMP1WJ/zDRGUNHG
+/WtDjXc8ZUG1+6EXU9Jc2Fs+2Omf7fcN0l00AK/wPg8OaNS0rKyGq9JdIT9FRGV1
+bXe/rx58FaE5CItdwCSYhJvF/O95LWQoxJXye5bCFLmvDTEyVq9FMSCptfsmbXjE
+ZGsXAgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJ
+YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud
+DgQWBBR52UaWWTKzZGDH/X4mWNcuqeQVazAfBgNVHSMEGDAWgBQ2w2yI55X+sL3s
+zj49hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEANBW+XYLlHBqVY/31ie+3gRlS
+LPfy4SIqn0t3RJjagT29MXprblBO2cbMO8VGjkQdKGpmMXjxbht2arOOUXRHX4n/
+XTyn/QHEf0bcwIITMReO3DZUPAEw8hSjn9xEOM0IRVOCP+mH5fi74QzzQaZVCyYg
+5VtLKdww/+sc0nCbKl2KWgDluriH0nfVx95qgW3mg9dhXRr0zmf1w2zkBHYpARYL
+Dew6Z8EE4tS3HJu8/qM6meWzNtrfonQ3eiiMxjZBxzV46jchBwa2z9XYhP6AmpPb
+oeTSzcQNbWsxaGYzWo46oLDUZmJOwSBawbS31bZNMCoPIY6ukoesCzFSsUKZww==
 -----END CERTIFICATE-----
 -----BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQCzEfU8E+ZGTGtHXV5XhvM2Lg32fXUIjydXb34BGVPX6oN7+aNV
-S9eWayvW/+9/vUb0aCqilJrpFesgItV2T8VhhjOE++XUz46uNpcMU7wHMEAXUufP
-pztpFm8ZEk2tFKvadkSSoN8lb11juvZVkSkPlB65pFhSe4QKSp6J4HrkYwIDAQAB
-AoGBAKy8jvb0Lzby8q11yNLf7+78wCVdYi7ugMHcYA1JVFK8+zb1WfSm44FLQo/0
-dSChAjgz36TTexeLODPYxleJndjVcOMVzsLJjSM8dLpXsTS4FCeMbhw2s2u+xqKY
-bbPWfk+HOTyJjfnkcC5Nbg44eOmruq0gSmBeUXVM5UntlTnxAkEA7TGCA3h7kx5E
-Bl4zl2pc3gPAGt+dyfk5Po9mGJUUXhF5p2zueGmYWW74TmOWB1kzt4QRdYMzFePq
-zfDNXEa1CwJBAMFErdY0xp0UJ13WwBbUTk8rujqQdHtjw0klhpbuKkjxu2hN0wwM
-6p0D9qxF7JHaghqVRI0fAW/EE0OzdHMR9QkCQQDNR26dMFXKsoPu+vItljj/UEGf
-QG7gERiQ4yxaFBPHgdpGo0kT31eh9x9hQGDkxTe0GNG/YSgCRvm8+C3TMcKXAkBD
-dhGn36wkUFCddMSAM4NSJ1VN8/Z0y5HzCmI8dM3VwGtGMUQlxKxwOl30LEQzdS5M
-0SWojNYXiT2gOBfBwtbhAkEAhafl5QEOIgUz+XazS/IlZ8goNKdDVfYgK3mHHjvv
-nY5G+AuGebdNkXJr4KSWxDcN+C2i47zuj4QXA16MAOandA==
+MIIEowIBAAKCAQEA63Yu4/cnLRvi+BIwcoIz5hKmcziREG2tujKEBs4JVO3uV3+f
+UW/4YFULigKImXu/0fKyuMyeFu4l3V8NC6gachvAeWhiniN9sPgPU3AQKaF1y9gq
+2EBEI2cFCKS5WASItjZCY951ZKuXYJdYDgC4kPlvI4N5M4ORHPa4pqfa/dzfMLEi
+92sLGn7q5mArzn+5Xh2jD9Vif8w0RlDRxv1rQ413PGVBtfuhF1PSXNhbPtjpn+33
+DdJdNACv8D4PDmjUtKyshqvSXSE/RURldW13v68efBWhOQiLXcAkmISbxfzveS1k
+KMSV8nuWwhS5rw0xMlavRTEgqbX7Jm14xGRrFwIDAQABAoIBAHLsTPihIfLnYIE5
+x4GsQQ5zXeBw5ITDM37ktwHnQDC+rIzyUl1aLD1AZRBoKinXd4lOTqLZ4/NHKx4A
+DYr58mZtWyUmqLOMmQVuHXTZBlp7XtYuXMMNovQwjQlp9LicBeoBU6gQ5PVMtubD
+F4xGF89Sn0cTHW3iMkqTtQ5KcR1j57OcJO0FEb1vPvk2MXI5ZyAatUYE7YacbEzd
+rg02uIwx3FqNSkuSI79uz4hMdV5TPtuhxx9nTwj9aLUhXFeZ0mn2PVgVzEnnMoJb
++znlsZDgzDlJqdaD744YGWh8Z3OEssB35KfzFcdOeO6yH8lmv2Zfznk7pNPT7LTb
+Lae9VgkCgYEA92p1qnAB3NtJtNcaW53i0S5WJgS1hxWKvUDx3lTB9s8X9fHpqL1a
+E94fDfWzp/hax6FefUKIvBOukPLQ6bYjTMiFoOHzVirghAIuIUoMI5VtLhwD1hKs
+Lr7l/dptMgKb1nZHyXoKHRBthsy3K4+udsPi8TzMvYElgEqyQIe/Rk0CgYEA86GL
+8HC6zLszzKERDPBxrboRmoFvVUCTQDhsfj1M8aR3nQ8V5LkdIJc7Wqm/Ggfk9QRf
+rJ8M2WUMlU5CNnCn/KCrKzCNZIReze3fV+HnKdbcXGLvgbHPrhnz8yYehUFG+RGq
+bVyDWRU94T38izy2s5qMYrMJWZEYyXncSPbfcPMCgYAtaXfxcZ+V5xYPQFARMtiX
+5nZfggvDoJuXgx0h3tK/N2HBfcaSdzbaYLG4gTmZggc/jwnl2dl5E++9oSPhUdIG
+3ONSFUbxsOsGr9PBvnKd8WZZyUCXAVRjPBzAzF+whzQNWCZy/5htnz9LN7YDI9s0
+5113Q96cheDZPFydZY0hHQKBgQDVbEhNukM5xCiNcu+f2SaMnLp9EjQ4h5g3IvaP
+5B16daw/Dw8LzcohWboqIxeAsze0GD/D1ZUJAEd0qBjC3g+a9BjefervCjKOzXng
+38mEUm+6EwVjJSQcjSmycEs+Sr/kwr/8i5WYvU32+jk4tFgMoC+o6tQe/Uesf68k
+z/dPVwKBgGbF7Vv1/3SmhlOy+zYyvJ0CrWtKxH9QP6tLIEgEpd8x7YTSuCH94yok
+kToMXYA3sWNPt22GbRDZ+rcp4c7HkDx6I6vpdP9aQEwJTp0EPy0sgWr2XwYmreIQ
+NFmkk8Itn9EY2R9VBaP7GLv5kvwxDdLAnmwGmzVtbmaVdxCaBwUk
 -----END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
------END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
-	/OU=Certification Services Division/CN=Thawte Server CA
-	/[email protected]
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
-	/OU=Certification Services Division/CN=Thawte Server CA
-	/[email protected]
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8= 
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----

diff --git a/apps/sess_id.c b/apps/sess_id.c
index b99179f..b16686c 100644
--- a/apps/sess_id.c
+++ b/apps/sess_id.c

@@ -90,6 +90,7 @@
 int MAIN(int argc, char **argv)
 	{
 	SSL_SESSION *x=NULL;
+	X509 *peer = NULL;
 	int ret=1,i,num,badops=0;
 	BIO *out=NULL;
 	int informat,outformat;
@@ -163,16 +164,17 @@
 	ERR_load_crypto_strings();
 	x=load_sess_id(infile,informat);
 	if (x == NULL) { goto end; }
+	peer = SSL_SESSION_get0_peer(x);
 
 	if(context)
 	    {
-	    x->sid_ctx_length=strlen(context);
-	    if(x->sid_ctx_length > SSL_MAX_SID_CTX_LENGTH)
+	    size_t ctx_len = strlen(context);
+	    if(ctx_len > SSL_MAX_SID_CTX_LENGTH)
 		{
 		BIO_printf(bio_err,"Context too long\n");
 		goto end;
 		}
-	    memcpy(x->sid_ctx,context,x->sid_ctx_length);
+	    SSL_SESSION_set1_id_context(x, (unsigned char *)context, ctx_len);
 	    }
 
 #ifdef undef
@@ -231,10 +233,10 @@
 
 		if (cert)
 			{
-			if (x->peer == NULL)
+			if (peer == NULL)
 				BIO_puts(out,"No certificate present\n");
 			else
-				X509_print(out,x->peer);
+				X509_print(out,peer);
 			}
 		}
 
@@ -253,12 +255,12 @@
 			goto end;
 			}
 		}
-	else if (!noout && (x->peer != NULL)) /* just print the certificate */
+	else if (!noout && (peer != NULL)) /* just print the certificate */
 		{
 		if 	(outformat == FORMAT_ASN1)
-			i=(int)i2d_X509_bio(out,x->peer);
+			i=(int)i2d_X509_bio(out,peer);
 		else if (outformat == FORMAT_PEM)
-			i=PEM_write_bio_X509(out,x->peer);
+			i=PEM_write_bio_X509(out,peer);
 		else	{
 			BIO_printf(bio_err,"bad output format specified for outfile\n");
 			goto end;

diff --git a/apps/speed.c b/apps/speed.c
index b3c5442..ab62e01 100644
--- a/apps/speed.c
+++ b/apps/speed.c

@@ -108,8 +108,14 @@
 #include <signal.h>
 #endif
 
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__CYGWIN__)
 #include <windows.h>
+# if defined(__CYGWIN__) && !defined(_WIN32)
+  /* <windows.h> should define _WIN32, which normally is mutually
+   * exclusive with __CYGWIN__, but if it didn't... */
+#  define _WIN32
+  /* this is done because Cygwin alarm() fails sometimes. */
+# endif
 #endif
 
 #include <openssl/bn.h>
@@ -183,6 +189,25 @@
 #ifndef OPENSSL_NO_ECDH
 #include <openssl/ecdh.h>
 #endif
+#include <openssl/modes.h>
+
+#ifdef OPENSSL_FIPS
+#ifdef OPENSSL_DOING_MAKEDEPEND
+#undef AES_set_encrypt_key
+#undef AES_set_decrypt_key
+#undef DES_set_key_unchecked
+#endif
+#define BF_set_key	private_BF_set_key
+#define CAST_set_key	private_CAST_set_key
+#define idea_set_encrypt_key	private_idea_set_encrypt_key
+#define SEED_set_key	private_SEED_set_key
+#define RC2_set_key	private_RC2_set_key
+#define RC4_set_key	private_RC4_set_key
+#define DES_set_key_unchecked	private_DES_set_key_unchecked
+#define AES_set_encrypt_key	private_AES_set_encrypt_key
+#define AES_set_decrypt_key	private_AES_set_decrypt_key
+#define Camellia_set_key	private_Camellia_set_key
+#endif
 
 #ifndef HAVE_FORK
 # if defined(OPENSSL_SYS_VMS) || defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_MACINTOSH_CLASSIC) || defined(OPENSSL_SYS_OS2) || defined(OPENSSL_SYS_NETWARE)
@@ -214,7 +239,7 @@
 static int do_multi(int multi);
 #endif
 
-#define ALGOR_NUM	29
+#define ALGOR_NUM	30
 #define SIZE_NUM	5
 #define RSA_NUM		4
 #define DSA_NUM		3
@@ -229,7 +254,7 @@
   "aes-128 cbc","aes-192 cbc","aes-256 cbc",
   "camellia-128 cbc","camellia-192 cbc","camellia-256 cbc",
   "evp","sha256","sha512","whirlpool",
-  "aes-128 ige","aes-192 ige","aes-256 ige"};
+  "aes-128 ige","aes-192 ige","aes-256 ige","ghash"};
 static double results[ALGOR_NUM][SIZE_NUM];
 static int lengths[SIZE_NUM]={16,64,256,1024,8*1024};
 #ifndef OPENSSL_NO_RSA
@@ -273,9 +298,12 @@
 
 #if defined(_WIN32)
 
+#if !defined(SIGALRM)
 #define SIGALRM
+#endif
 static unsigned int lapse,schlock;
-static void alarm(unsigned int secs) { lapse = secs*1000; }
+static void alarm_win32(unsigned int secs) { lapse = secs*1000; }
+#define alarm alarm_win32
 
 static DWORD WINAPI sleepy(VOID *arg)
 	{
@@ -469,6 +497,7 @@
 #define D_IGE_128_AES   26
 #define D_IGE_192_AES   27
 #define D_IGE_256_AES   28
+#define D_GHASH		29
 	double d=0.0;
 	long c[ALGOR_NUM][SIZE_NUM];
 #define	R_DSA_512	0
@@ -894,6 +923,10 @@
 			doit[D_CBC_192_AES]=1;
 			doit[D_CBC_256_AES]=1;
 			}
+		else if (strcmp(*argv,"ghash") == 0)
+			{
+			doit[D_GHASH]=1;
+			}
 		else
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
@@ -1264,6 +1297,7 @@
 	c[D_IGE_128_AES][0]=count;
 	c[D_IGE_192_AES][0]=count;
 	c[D_IGE_256_AES][0]=count;
+	c[D_GHASH][0]=count;
 
 	for (i=1; i<SIZE_NUM; i++)
 		{
@@ -1458,7 +1492,7 @@
 # error "You cannot disable DES on systems without SIGALRM."
 #endif /* OPENSSL_NO_DES */
 #else
-#define COND(c)	(run)
+#define COND(c)	(run && count<0x7fffffff)
 #define COUNT(d) (count)
 #ifndef _WIN32
 	signal(SIGALRM,sig_done);
@@ -1755,7 +1789,22 @@
 			print_result(D_IGE_256_AES,j,count,d);
 			}
 		}
+	if (doit[D_GHASH])
+		{
+		GCM128_CONTEXT *ctx = CRYPTO_gcm128_new(&aes_ks1,(block128_f)AES_encrypt);
+		CRYPTO_gcm128_setiv (ctx,(unsigned char *)"0123456789ab",12);
 
+		for (j=0; j<SIZE_NUM; j++)
+			{
+			print_message(names[D_GHASH],c[D_GHASH][j],lengths[j]);
+			Time_F(START);
+			for (count=0,run=1; COND(c[D_GHASH][j]); count++)
+				CRYPTO_gcm128_aad(ctx,buf,lengths[j]);
+			d=Time_F(STOP);
+			print_result(D_GHASH,j,count,d);
+			}
+		CRYPTO_gcm128_release(ctx);
+		}
 
 #endif
 #endif
@@ -2550,7 +2599,7 @@
 	BIO_printf(bio_err,mr ? "+DTP:%d:%s:%s:%d\n"
 			   : "Doing %d bit %s %s's for %ds: ",bits,str,str2,tm);
 	(void)BIO_flush(bio_err);
-	alarm(RSA_SECONDS);
+	alarm(tm);
 #else
 	BIO_printf(bio_err,mr ? "+DNP:%ld:%d:%s:%s\n"
 			   : "Doing %ld %d bit %s %s's: ",num,bits,str,str2);
@@ -2610,7 +2659,11 @@
 	fds=malloc(multi*sizeof *fds);
 	for(n=0 ; n < multi ; ++n)
 		{
-		pipe(fd);
+		if (pipe(fd) == -1)
+			{
+			fprintf(stderr, "pipe failure\n");
+			exit(1);
+			}
 		fflush(stdout);
 		fflush(stderr);
 		if(fork())
@@ -2622,7 +2675,11 @@
 			{
 			close(fd[0]);
 			close(1);
-			dup(fd[1]);
+			if (dup(fd[1]) == -1)
+				{
+				fprintf(stderr, "dup failed\n");
+				exit(1);
+				}
 			close(fd[1]);
 			mr=1;
 			usertime=0;

diff --git a/apps/srp.c b/apps/srp.c
new file mode 100644
index 0000000..80e1b8a
--- /dev/null
+++ b/apps/srp.c

@@ -0,0 +1,756 @@
+/* apps/srp.c */
+/* Written by Peter Sylvester ([email protected])  
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+#include <openssl/opensslconf.h>
+
+#ifndef OPENSSL_NO_SRP
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <openssl/conf.h>
+#include <openssl/bio.h>
+#include <openssl/err.h>
+#include <openssl/txt_db.h>
+#include <openssl/buffer.h>
+#include <openssl/srp.h>
+
+#include "apps.h"
+
+#undef PROG
+#define PROG srp_main
+
+#define BASE_SECTION	"srp"
+#define CONFIG_FILE "openssl.cnf"
+
+#define ENV_RANDFILE		"RANDFILE"
+
+#define ENV_DATABASE		"srpvfile"
+#define ENV_DEFAULT_SRP		"default_srp"
+
+static char *srp_usage[]={
+"usage: srp [args] [user] \n",
+"\n",
+" -verbose        Talk alot while doing things\n",
+" -config file    A config file\n",
+" -name arg       The particular srp definition to use\n",
+" -srpvfile arg   The srp verifier file name\n",
+" -add            add an user and srp verifier\n",
+" -modify         modify the srp verifier of an existing user\n",
+" -delete         delete user from verifier file\n",
+" -list           list user\n",
+" -gn arg         g and N values to be used for new verifier\n",
+" -userinfo arg   additional info to be set for user\n",
+" -passin arg     input file pass phrase source\n",
+" -passout arg    output file pass phrase source\n",
+#ifndef OPENSSL_NO_ENGINE
+" -engine e         - use engine e, possibly a hardware device.\n",
+#endif
+NULL
+};
+
+#ifdef EFENCE
+extern int EF_PROTECT_FREE;
+extern int EF_PROTECT_BELOW;
+extern int EF_ALIGNMENT;
+#endif
+
+static CONF *conf=NULL;
+static char *section=NULL;
+
+#define VERBOSE if (verbose) 
+#define VVERBOSE if (verbose>1) 
+
+
+int MAIN(int, char **);
+
+static int get_index(CA_DB *db, char* id, char type)
+	{
+	char ** pp;
+	int i;
+	if (id == NULL) return -1;
+	if (type == DB_SRP_INDEX) 
+	for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+		{
+		pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i);
+		if (pp[DB_srptype][0] == DB_SRP_INDEX  && !strcmp(id, pp[DB_srpid])) 
+			return i;
+		}
+	else for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+		{
+		pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i);
+
+		if (pp[DB_srptype][0] != DB_SRP_INDEX && !strcmp(id,pp[DB_srpid])) 
+			return i;
+		}
+
+	return -1 ; 
+	}
+
+static void print_entry(CA_DB *db, BIO *bio, int indx, int verbose, char *s)
+	{
+	if (indx >= 0 && verbose)
+		{
+		int j;
+		char **pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, indx);
+		BIO_printf(bio, "%s \"%s\"\n", s, pp[DB_srpid]);
+		for (j = 0; j < DB_NUMBER; j++)
+			{
+			BIO_printf(bio_err,"  %d = \"%s\"\n", j, pp[j]);
+			}
+		}
+	}
+
+static void print_index(CA_DB *db, BIO *bio, int indexindex, int verbose)
+	{
+	print_entry(db, bio, indexindex, verbose, "g N entry") ;
+	}
+
+static void print_user(CA_DB *db, BIO *bio, int userindex, int verbose)
+	{
+	if (verbose > 0)
+		{
+		char **pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+
+		if (pp[DB_srptype][0] != 'I')
+			{
+			print_entry(db, bio, userindex, verbose, "User entry");
+			print_entry(db, bio, get_index(db, pp[DB_srpgN], 'I'), verbose, "g N entry");
+			}
+
+		}
+	}
+
+static int update_index(CA_DB *db, BIO *bio, char **row)
+	{
+	char ** irow;
+	int i;
+
+	if ((irow=(char **)OPENSSL_malloc(sizeof(char *)*(DB_NUMBER+1))) == NULL)
+		{
+		BIO_printf(bio_err,"Memory allocation failure\n");
+		return 0;
+		}
+
+	for (i=0; i<DB_NUMBER; i++)
+		{
+		irow[i]=row[i];
+		row[i]=NULL;
+		}
+	irow[DB_NUMBER]=NULL;
+
+	if (!TXT_DB_insert(db->db,irow))
+		{
+		BIO_printf(bio,"failed to update srpvfile\n");
+		BIO_printf(bio,"TXT_DB error number %ld\n",db->db->error);
+		OPENSSL_free(irow);
+		return 0;
+		}
+	return 1;
+	}
+
+static void lookup_fail(const char *name, char *tag)
+	{
+	BIO_printf(bio_err,"variable lookup failed for %s::%s\n",name,tag);
+	}
+
+
+static char *srp_verify_user(const char *user, const char *srp_verifier,
+			     char *srp_usersalt, const char *g, const char *N,
+			     const char *passin, BIO *bio, int verbose)
+	{
+	char password[1024];
+	PW_CB_DATA cb_tmp;
+	char *verifier = NULL;
+	char *gNid = NULL;
+
+	cb_tmp.prompt_info = user;
+	cb_tmp.password = passin;
+
+ 	if (password_callback(password, 1024, 0, &cb_tmp) >0)
+		{
+		VERBOSE BIO_printf(bio,"Validating\n   user=\"%s\"\n srp_verifier=\"%s\"\n srp_usersalt=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,srp_verifier,srp_usersalt, g, N);
+		BIO_printf(bio, "Pass %s\n", password);
+
+		if (!(gNid=SRP_create_verifier(user, password, &srp_usersalt, &verifier, N, g)))
+			{
+			BIO_printf(bio, "Internal error validating SRP verifier\n");
+			}
+		else
+			{
+			if (strcmp(verifier, srp_verifier))
+				gNid = NULL;
+			OPENSSL_free(verifier);
+			}
+		}
+	return gNid;
+	}
+
+static char *srp_create_user(char *user, char **srp_verifier,
+			     char **srp_usersalt, char *g, char *N,
+			     char *passout, BIO *bio, int verbose)
+	{
+ 	char password[1024];
+        PW_CB_DATA cb_tmp;
+	char *gNid = NULL;
+	char *salt = NULL;
+        cb_tmp.prompt_info = user;
+        cb_tmp.password = passout;
+
+	if (password_callback(password,1024,1,&cb_tmp) >0)
+		{
+		VERBOSE BIO_printf(bio,"Creating\n user=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,g,N);
+		if (!(gNid =SRP_create_verifier(user, password, &salt, srp_verifier, N, g)))
+			{
+			BIO_printf(bio,"Internal error creating SRP verifier\n");
+			}
+		else 
+			*srp_usersalt = salt;
+		VVERBOSE BIO_printf(bio,"gNid=%s salt =\"%s\"\n verifier =\"%s\"\n", gNid,salt, *srp_verifier);
+
+		}
+	return gNid;
+	}
+
+int MAIN(int argc, char **argv)
+	{
+	int add_user = 0;
+	int list_user= 0;
+	int delete_user= 0;
+	int modify_user= 0;
+	char * user = NULL;
+
+	char *passargin = NULL, *passargout = NULL;
+	char *passin = NULL, *passout = NULL;
+        char * gN = NULL;
+	int gNindex = -1;
+	char ** gNrow = NULL;
+	int maxgN = -1;
+
+	char * userinfo = NULL;
+
+	int badops=0;
+	int ret=1;
+	int errors=0;
+	int verbose=0;
+	int doupdatedb=0;
+	char *configfile=NULL;
+	char *dbfile=NULL;
+	CA_DB *db=NULL;
+	char **pp ;
+	int i;
+	long errorline = -1;
+	char *randfile=NULL;
+#ifndef OPENSSL_NO_ENGINE
+	char *engine = NULL;
+#endif
+	char *tofree=NULL;
+	DB_ATTR db_attr;
+
+#ifdef EFENCE
+EF_PROTECT_FREE=1;
+EF_PROTECT_BELOW=1;
+EF_ALIGNMENT=0;
+#endif
+
+	apps_startup();
+
+	conf = NULL;
+	section = NULL;
+
+	if (bio_err == NULL)
+		if ((bio_err=BIO_new(BIO_s_file())) != NULL)
+			BIO_set_fp(bio_err,stderr,BIO_NOCLOSE|BIO_FP_TEXT);
+
+	argc--;
+	argv++;
+	while (argc >= 1 && badops == 0)
+		{
+		if	(strcmp(*argv,"-verbose") == 0)
+			verbose++;
+		else if	(strcmp(*argv,"-config") == 0)
+			{
+			if (--argc < 1) goto bad;
+			configfile= *(++argv);
+			}
+		else if (strcmp(*argv,"-name") == 0)
+			{
+			if (--argc < 1) goto bad;
+			section= *(++argv);
+			}
+		else if	(strcmp(*argv,"-srpvfile") == 0)
+			{
+			if (--argc < 1) goto bad;
+			dbfile= *(++argv);
+			}
+		else if (strcmp(*argv,"-add") == 0)
+			add_user=1;
+		else if (strcmp(*argv,"-delete") == 0)
+			delete_user=1;
+		else if (strcmp(*argv,"-modify") == 0)
+			modify_user=1;
+		else if (strcmp(*argv,"-list") == 0)
+			list_user=1;
+		else if (strcmp(*argv,"-gn") == 0)
+			{
+			if (--argc < 1) goto bad;
+			gN= *(++argv);
+			}
+		else if (strcmp(*argv,"-userinfo") == 0)
+			{
+			if (--argc < 1) goto bad;
+			userinfo= *(++argv);
+			}
+		else if (strcmp(*argv,"-passin") == 0)
+			{
+			if (--argc < 1) goto bad;
+			passargin= *(++argv);
+			}
+		else if (strcmp(*argv,"-passout") == 0)
+			{
+			if (--argc < 1) goto bad;
+			passargout= *(++argv);
+			}
+#ifndef OPENSSL_NO_ENGINE
+		else if (strcmp(*argv,"-engine") == 0)
+			{
+			if (--argc < 1) goto bad;
+			engine= *(++argv);
+			}
+#endif
+
+		else if (**argv == '-')
+			{
+bad:
+			BIO_printf(bio_err,"unknown option %s\n",*argv);
+			badops=1;
+			break;
+			}
+		else 
+			break;
+	
+		argc--;
+		argv++;
+		}
+
+	if (dbfile && configfile)
+		{
+		BIO_printf(bio_err,"-dbfile and -configfile cannot be specified together.\n");
+		badops = 1;
+		}
+	if (add_user+delete_user+modify_user+list_user != 1)
+		{
+		BIO_printf(bio_err,"Exactly one of the options -add, -delete, -modify -list must be specified.\n");
+		badops = 1;
+		}
+	if (delete_user+modify_user+delete_user== 1 && argc <= 0)
+		{
+		BIO_printf(bio_err,"Need at least one user for options -add, -delete, -modify. \n");
+		badops = 1;
+		}
+	if ((passin || passout) && argc != 1 )
+		{
+		BIO_printf(bio_err,"-passin, -passout arguments only valid with one user.\n");
+		badops = 1;
+		}
+
+	if (badops)
+		{
+		for (pp=srp_usage; (*pp != NULL); pp++)
+			BIO_printf(bio_err,"%s",*pp);
+
+		BIO_printf(bio_err," -rand file%cfile%c...\n", LIST_SEPARATOR_CHAR, LIST_SEPARATOR_CHAR);
+		BIO_printf(bio_err,"                 load the file (or the files in the directory) into\n");
+		BIO_printf(bio_err,"                 the random number generator\n");
+		goto err;
+		}
+
+	ERR_load_crypto_strings();
+
+#ifndef OPENSSL_NO_ENGINE
+	setup_engine(bio_err, engine, 0);
+#endif
+
+	if(!app_passwd(bio_err, passargin, passargout, &passin, &passout))
+		{
+		BIO_printf(bio_err, "Error getting passwords\n");
+		goto err;
+		}
+
+        if (!dbfile)
+		{
+
+
+	/*****************************************************************/
+		tofree=NULL;
+		if (configfile == NULL) configfile = getenv("OPENSSL_CONF");
+		if (configfile == NULL) configfile = getenv("SSLEAY_CONF");
+		if (configfile == NULL)
+			{
+			const char *s=X509_get_default_cert_area();
+			size_t len;
+
+#ifdef OPENSSL_SYS_VMS
+			len = strlen(s)+sizeof(CONFIG_FILE);
+			tofree=OPENSSL_malloc(len);
+			strcpy(tofree,s);
+#else
+			len = strlen(s)+sizeof(CONFIG_FILE)+1;
+			tofree=OPENSSL_malloc(len);
+			BUF_strlcpy(tofree,s,len);
+			BUF_strlcat(tofree,"/",len);
+#endif
+			BUF_strlcat(tofree,CONFIG_FILE,len);
+			configfile=tofree;
+			}
+
+		VERBOSE BIO_printf(bio_err,"Using configuration from %s\n",configfile);
+		conf = NCONF_new(NULL);
+		if (NCONF_load(conf,configfile,&errorline) <= 0)
+			{
+			if (errorline <= 0)
+				BIO_printf(bio_err,"error loading the config file '%s'\n",
+					configfile);
+			else
+				BIO_printf(bio_err,"error on line %ld of config file '%s'\n"
+					,errorline,configfile);
+			goto err;
+			}
+		if(tofree)
+			{
+			OPENSSL_free(tofree);
+			tofree = NULL;
+			}
+
+		if (!load_config(bio_err, conf))
+			goto err;
+
+	/* Lets get the config section we are using */
+		if (section == NULL)
+			{
+			VERBOSE BIO_printf(bio_err,"trying to read " ENV_DEFAULT_SRP " in \" BASE_SECTION \"\n");
+
+			section=NCONF_get_string(conf,BASE_SECTION,ENV_DEFAULT_SRP);
+			if (section == NULL)
+				{
+				lookup_fail(BASE_SECTION,ENV_DEFAULT_SRP);
+				goto err;
+				}
+			}
+         
+		if (randfile == NULL && conf)
+	        	randfile = NCONF_get_string(conf, BASE_SECTION, "RANDFILE");
+
+	
+		VERBOSE BIO_printf(bio_err,"trying to read " ENV_DATABASE " in section \"%s\"\n",section);
+
+		if ((dbfile=NCONF_get_string(conf,section,ENV_DATABASE)) == NULL)
+			{
+			lookup_fail(section,ENV_DATABASE);
+			goto err;
+			}
+
+        	}
+	if (randfile == NULL)
+		ERR_clear_error();
+       	else 
+		app_RAND_load_file(randfile, bio_err, 0);
+
+	VERBOSE BIO_printf(bio_err,"Trying to read SRP verifier file \"%s\"\n",dbfile);
+
+	db = load_index(dbfile, &db_attr);
+	if (db == NULL) goto err;
+
+	/* Lets check some fields */
+	for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+		{
+		pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i);
+	
+		if (pp[DB_srptype][0] == DB_SRP_INDEX)
+			{
+			maxgN = i;
+			if (gNindex < 0 && gN != NULL && !strcmp(gN, pp[DB_srpid]))
+				gNindex = i;
+
+			print_index(db, bio_err, i, verbose > 1);
+			}
+		}
+	
+	VERBOSE BIO_printf(bio_err, "Database initialised\n");
+
+	if (gNindex >= 0)
+		{
+		gNrow = (char **)sk_OPENSSL_PSTRING_value(db->db->data, gNindex);
+		print_entry(db, bio_err, gNindex, verbose > 1, "Default g and N") ;
+		}
+	else if (maxgN > 0 && !SRP_get_default_gN(gN))
+		{
+		BIO_printf(bio_err, "No g and N value for index \"%s\"\n", gN);
+		goto err;
+		}
+	else
+		{
+		VERBOSE BIO_printf(bio_err, "Database has no g N information.\n");
+		gNrow = NULL;
+		}
+	
+
+	VVERBOSE BIO_printf(bio_err,"Starting user processing\n");
+
+	if (argc > 0)
+		user = *(argv++) ;
+
+	while (list_user || user)
+		{
+		int userindex = -1;
+		if (user) 
+			VVERBOSE BIO_printf(bio_err, "Processing user \"%s\"\n", user);
+		if ((userindex = get_index(db, user, 'U')) >= 0)
+			{
+			print_user(db, bio_err, userindex, (verbose > 0) || list_user);
+			}
+		
+		if (list_user)
+			{
+			if (user == NULL)
+				{
+				BIO_printf(bio_err,"List all users\n");
+
+				for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+					{
+					print_user(db,bio_err, i, 1);
+					}
+				list_user = 0;
+				}
+			else if (userindex < 0)
+				{
+				BIO_printf(bio_err, "user \"%s\" does not exist, ignored. t\n",
+					   user);
+				errors++;
+				}
+			}
+		else if (add_user)
+			{
+			if (userindex >= 0)
+				{
+				/* reactivation of a new user */
+				char **row = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+				BIO_printf(bio_err, "user \"%s\" reactivated.\n", user);
+				row[DB_srptype][0] = 'V';
+
+				doupdatedb = 1;
+				}
+			else
+				{
+				char *row[DB_NUMBER] ; char *gNid;
+				row[DB_srpverifier] = NULL;
+				row[DB_srpsalt] = NULL;
+				row[DB_srpinfo] = NULL;
+				if (!(gNid = srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:gN,gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose)))
+					{
+						BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned .\n", user);
+						errors++;
+						goto err;
+					}
+				row[DB_srpid] = BUF_strdup(user);
+				row[DB_srptype] = BUF_strdup("v");
+				row[DB_srpgN] = BUF_strdup(gNid);
+
+				if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] ||
+					(userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo)))) || 
+					!update_index(db, bio_err, row))
+					{
+					if (row[DB_srpid]) OPENSSL_free(row[DB_srpid]);
+					if (row[DB_srpgN]) OPENSSL_free(row[DB_srpgN]);
+					if (row[DB_srpinfo]) OPENSSL_free(row[DB_srpinfo]);
+					if (row[DB_srptype]) OPENSSL_free(row[DB_srptype]);
+					if (row[DB_srpverifier]) OPENSSL_free(row[DB_srpverifier]);
+					if (row[DB_srpsalt]) OPENSSL_free(row[DB_srpsalt]);
+					goto err;
+					}
+				doupdatedb = 1;
+				}
+			}
+		else if (modify_user)
+			{
+			if (userindex < 0)
+				{
+				BIO_printf(bio_err,"user \"%s\" does not exist, operation ignored.\n",user);
+				errors++;
+				}
+			else
+				{
+
+				char **row = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+				char type = row[DB_srptype][0];
+				if (type == 'v')
+					{
+					BIO_printf(bio_err,"user \"%s\" already updated, operation ignored.\n",user);
+					errors++;
+					}
+				else
+					{
+					char *gNid;
+
+					if (row[DB_srptype][0] == 'V')
+						{
+						int user_gN;
+						char **irow = NULL;
+						VERBOSE BIO_printf(bio_err,"Verifying password for user \"%s\"\n",user);
+						if ( (user_gN = get_index(db, row[DB_srpgN], DB_SRP_INDEX)) >= 0)
+							irow = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+
+ 						if (!srp_verify_user(user, row[DB_srpverifier], row[DB_srpsalt], irow ? irow[DB_srpsalt] : row[DB_srpgN], irow ? irow[DB_srpverifier] : NULL, passin, bio_err, verbose))
+							{
+							BIO_printf(bio_err, "Invalid password for user \"%s\", operation abandoned.\n", user);
+							errors++;
+							goto err;
+							}
+						} 
+					VERBOSE BIO_printf(bio_err,"Password for user \"%s\" ok.\n",user);
+
+					if (!(gNid=srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:NULL, gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose)))
+						{
+							BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned.\n", user);
+							errors++;
+							goto err;
+						}
+
+					row[DB_srptype][0] = 'v';
+					row[DB_srpgN] = BUF_strdup(gNid);
+ 
+					if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] ||
+						(userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo)))))  
+						goto err;
+
+					doupdatedb = 1;
+					}
+				}
+			}
+		else if (delete_user)
+			{
+			if (userindex < 0)
+				{
+				BIO_printf(bio_err, "user \"%s\" does not exist, operation ignored. t\n", user);
+				errors++;
+				}
+			else
+				{
+				char **xpp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+				BIO_printf(bio_err, "user \"%s\" revoked. t\n", user);
+
+				xpp[DB_srptype][0] = 'R';
+				
+				doupdatedb = 1;
+				}
+			}
+		if (--argc > 0)
+			user = *(argv++) ;
+		else
+			{
+			user = NULL;
+			list_user = 0;
+			}
+		}
+
+	VERBOSE BIO_printf(bio_err,"User procession done.\n");
+
+
+	if (doupdatedb)
+		{
+		/* Lets check some fields */
+		for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+			{
+			pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i);
+	
+			if (pp[DB_srptype][0] == 'v')
+				{
+				pp[DB_srptype][0] = 'V';
+				print_user(db, bio_err, i, verbose);
+				}
+			}
+
+		VERBOSE BIO_printf(bio_err, "Trying to update srpvfile.\n");
+		if (!save_index(dbfile, "new", db)) goto err;
+				
+		VERBOSE BIO_printf(bio_err, "Temporary srpvfile created.\n");
+		if (!rotate_index(dbfile, "new", "old")) goto err;
+
+		VERBOSE BIO_printf(bio_err, "srpvfile updated.\n");
+		}
+
+	ret = (errors != 0);
+err:
+	if (errors != 0)
+	VERBOSE BIO_printf(bio_err,"User errors %d.\n",errors);
+
+	VERBOSE BIO_printf(bio_err,"SRP terminating with code %d.\n",ret);
+	if(tofree)
+		OPENSSL_free(tofree);
+	if (ret) ERR_print_errors(bio_err);
+	if (randfile) app_RAND_write_file(randfile, bio_err);
+	if (conf) NCONF_free(conf);
+	if (db) free_index(db);
+
+	OBJ_cleanup();
+	apps_shutdown();
+	OPENSSL_EXIT(ret);
+	}
+
+
+
+#endif
+

diff --git a/apps/verify.c b/apps/verify.c
index 9163997..b9749dc 100644
--- a/apps/verify.c
+++ b/apps/verify.c

@@ -230,6 +230,7 @@
 end:
 	if (ret == 1) {
 		BIO_printf(bio_err,"usage: verify [-verbose] [-CApath path] [-CAfile file] [-purpose purpose] [-crl_check]");
+		BIO_printf(bio_err," [-attime timestamp]");
 #ifndef OPENSSL_NO_ENGINE
 		BIO_printf(bio_err," [-engine e]");
 #endif

diff --git a/apps/x509.c b/apps/x509.c
index 9f5eaeb..e6e5e0d 100644
--- a/apps/x509.c
+++ b/apps/x509.c

@@ -157,9 +157,10 @@
 static int sign (X509 *x, EVP_PKEY *pkey,int days,int clrext, const EVP_MD *digest,
 						CONF *conf, char *section);
 static int x509_certify (X509_STORE *ctx,char *CAfile,const EVP_MD *digest,
-			 X509 *x,X509 *xca,EVP_PKEY *pkey,char *serial,
-			 int create,int days, int clrext, CONF *conf, char *section,
-						ASN1_INTEGER *sno);
+			 X509 *x,X509 *xca,EVP_PKEY *pkey,
+			 STACK_OF(OPENSSL_STRING) *sigopts,
+			 char *serial, int create ,int days, int clrext,
+			 CONF *conf, char *section, ASN1_INTEGER *sno);
 static int purpose_print(BIO *bio, X509 *cert, X509_PURPOSE *pt);
 static int reqfile=0;
 
@@ -172,6 +173,7 @@
 	X509_REQ *req=NULL;
 	X509 *x=NULL,*xca=NULL;
 	ASN1_OBJECT *objtmp;
+	STACK_OF(OPENSSL_STRING) *sigopts = NULL;
 	EVP_PKEY *Upkey=NULL,*CApkey=NULL;
 	ASN1_INTEGER *sno = NULL;
 	int i,num,badops=0;
@@ -271,6 +273,15 @@
 			if (--argc < 1) goto bad;
 			CAkeyformat=str2fmt(*(++argv));
 			}
+		else if (strcmp(*argv,"-sigopt") == 0)
+			{
+			if (--argc < 1)
+				goto bad;
+			if (!sigopts)
+				sigopts = sk_OPENSSL_STRING_new_null();
+			if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv)))
+				goto bad;
+			}
 		else if (strcmp(*argv,"-days") == 0)
 			{
 			if (--argc < 1) goto bad;
@@ -970,7 +981,8 @@
 				
 				assert(need_rand);
 				if (!x509_certify(ctx,CAfile,digest,x,xca,
-					CApkey, CAserial,CA_createserial,days, clrext,
+					CApkey, sigopts,
+					CAserial,CA_createserial,days, clrext,
 					extconf, extsect, sno))
 					goto end;
 				}
@@ -1081,6 +1093,8 @@
 	X509_free(xca);
 	EVP_PKEY_free(Upkey);
 	EVP_PKEY_free(CApkey);
+	if (sigopts)
+		sk_OPENSSL_STRING_free(sigopts);
 	X509_REQ_free(rq);
 	ASN1_INTEGER_free(sno);
 	sk_ASN1_OBJECT_pop_free(trust, ASN1_OBJECT_free);
@@ -1131,8 +1145,11 @@
 	}
 
 static int x509_certify(X509_STORE *ctx, char *CAfile, const EVP_MD *digest,
-	     X509 *x, X509 *xca, EVP_PKEY *pkey, char *serialfile, int create,
-	     int days, int clrext, CONF *conf, char *section, ASN1_INTEGER *sno)
+	     		X509 *x, X509 *xca, EVP_PKEY *pkey,
+			STACK_OF(OPENSSL_STRING) *sigopts,
+	  		char *serialfile, int create,
+	     		int days, int clrext, CONF *conf, char *section,
+			ASN1_INTEGER *sno)
 	{
 	int ret=0;
 	ASN1_INTEGER *bs=NULL;
@@ -1191,7 +1208,8 @@
                 if (!X509V3_EXT_add_nconf(conf, &ctx2, section, x)) goto end;
 		}
 
-	if (!X509_sign(x,pkey,digest)) goto end;
+	if (!do_X509_sign(bio_err, x, pkey, digest, sigopts))
+		goto end;
 	ret=1;
 end:
 	X509_STORE_CTX_cleanup(&xsc);

diff --git a/crypto/Android.mk b/crypto/Android.mk
index 8090c12..fb599ce 100644
--- a/crypto/Android.mk
+++ b/crypto/Android.mk

@@ -169,7 +169,11 @@
 	bn/bn_sqrt.c \
 	bn/bn_word.c \
 	buffer/buf_err.c \
+	buffer/buf_str.c \
 	buffer/buffer.c \
+	cmac/cm_ameth.c \
+	cmac/cm_pmeth.c \
+	cmac/cmac.c \
 	comp/c_rle.c \
 	comp/c_zlib.c \
 	comp/comp_err.c \
@@ -235,6 +239,7 @@
 	dso/dso_null.c \
 	dso/dso_openssl.c \
 	ec/ec2_mult.c \
+	ec/ec2_oct.c \
 	ec/ec2_smpl.c \
 	ec/ec_ameth.c \
 	ec/ec_asn1.c \
@@ -245,11 +250,13 @@
 	ec/ec_key.c \
 	ec/ec_lib.c \
 	ec/ec_mult.c \
+	ec/ec_oct.c \
 	ec/ec_pmeth.c \
 	ec/ec_print.c \
 	ec/eck_prn.c \
 	ec/ecp_mont.c \
 	ec/ecp_nist.c \
+	ec/ecp_oct.c \
 	ec/ecp_smpl.c \
 	ecdh/ech_err.c \
 	ecdh/ech_key.c \
@@ -295,6 +302,7 @@
 	evp/c_alld.c \
 	evp/digest.c \
 	evp/e_aes.c \
+	evp/e_aes_cbc_hmac_sha1.c \
 	evp/e_bf.c \
 	evp/e_des.c \
 	evp/e_des3.c \
@@ -302,6 +310,7 @@
 	evp/e_old.c \
 	evp/e_rc2.c \
 	evp/e_rc4.c \
+	evp/e_rc4_hmac_md5.c \
 	evp/e_rc5.c \
 	evp/e_xcbc_d.c \
 	evp/encode.c \
@@ -347,9 +356,13 @@
 	md5/md5_dgst.c \
 	md5/md5_one.c \
 	modes/cbc128.c \
+	modes/ccm128.c \
 	modes/cfb128.c \
 	modes/ctr128.c \
+	modes/gcm128.c \
 	modes/ofb128.c \
+	modes/xts128.c \
+	o_init.c \
 	objects/o_names.c \
 	objects/obj_dat.c \
 	objects/obj_err.c \
@@ -398,6 +411,7 @@
 	pkcs7/pk7_mime.c \
 	pkcs7/pk7_smime.c \
 	pkcs7/pkcs7err.c \
+	pqueue/pqueue.c \
 	rand/md_rand.c \
 	rand/rand_egd.c \
 	rand/rand_err.c \
@@ -411,11 +425,13 @@
 	rc2/rc2ofb64.c \
 	rc4/rc4_enc.c \
 	rc4/rc4_skey.c \
+	rc4/rc4_utl.c \
 	ripemd/rmd_dgst.c \
 	ripemd/rmd_one.c \
 	rsa/rsa_ameth.c \
 	rsa/rsa_asn1.c \
 	rsa/rsa_chk.c \
+	rsa/rsa_crpt.c \
 	rsa/rsa_eay.c \
 	rsa/rsa_err.c \
 	rsa/rsa_gen.c \
@@ -436,6 +452,8 @@
 	sha/sha256.c \
 	sha/sha512.c \
 	sha/sha_dgst.c \
+	srp/srp_lib.c \
+	srp/srp_vfy.c \
 	stack/stack.c \
 	ts/ts_err.c \
 	txt_db/txt_db.c \
@@ -507,12 +525,15 @@
 	external/openssl \
 	external/openssl/crypto/asn1 \
 	external/openssl/crypto/evp \
+	external/openssl/crypto/modes \
 	external/openssl/include \
 	external/openssl/include/openssl \
 	external/zlib
 
 local_c_flags := -DNO_WINDOWS_BRAINDEATH
 
+local_as_flags := -x assembler-with-cpp
+
 #######################################
 # target static library
 include $(CLEAR_VARS)
@@ -525,6 +546,7 @@
 
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags)
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 ifeq ($(TARGET_ARCH),arm)
 	LOCAL_SRC_FILES += $(arm_src_files)
@@ -561,6 +583,7 @@
 
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags)
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 ifeq ($(TARGET_ARCH),arm)
 	LOCAL_SRC_FILES += $(arm_src_files)
@@ -587,6 +610,7 @@
 include $(LOCAL_PATH)/../android-config.mk
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 LOCAL_SRC_FILES += $(other_arch_src_files)
 LOCAL_STATIC_LIBRARIES += libz
@@ -602,6 +626,7 @@
 include $(LOCAL_PATH)/../android-config.mk
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 LOCAL_SRC_FILES += $(other_arch_src_files)
 LOCAL_STATIC_LIBRARIES += libz

diff --git a/crypto/aes/aes.h b/crypto/aes/aes.h
index d2c9973..031abf0 100644
--- a/crypto/aes/aes.h
+++ b/crypto/aes/aes.h

@@ -90,6 +90,11 @@
 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 	AES_KEY *key);
 
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+
 void AES_encrypt(const unsigned char *in, unsigned char *out,
 	const AES_KEY *key);
 void AES_decrypt(const unsigned char *in, unsigned char *out,

diff --git a/crypto/aes/aes_core.c b/crypto/aes/aes_core.c
index a7ec54f..8f5210a 100644
--- a/crypto/aes/aes_core.c
+++ b/crypto/aes/aes_core.c

@@ -625,7 +625,7 @@
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 			AES_KEY *key) {
 
 	u32 *rk;
@@ -726,7 +726,7 @@
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 			 AES_KEY *key) {
 
         u32 *rk;
@@ -734,7 +734,7 @@
 	u32 temp;
 
 	/* first, start with an encryption schedule */
-	status = AES_set_encrypt_key(userKey, bits, key);
+	status = private_AES_set_encrypt_key(userKey, bits, key);
 	if (status < 0)
 		return status;
 
@@ -1201,7 +1201,7 @@
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 			AES_KEY *key) {
 	u32 *rk;
    	int i = 0;
@@ -1301,7 +1301,7 @@
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 			 AES_KEY *key) {
 
         u32 *rk;
@@ -1309,7 +1309,7 @@
 	u32 temp;
 
 	/* first, start with an encryption schedule */
-	status = AES_set_encrypt_key(userKey, bits, key);
+	status = private_AES_set_encrypt_key(userKey, bits, key);
 	if (status < 0)
 		return status;
 

diff --git a/crypto/aes/aes_misc.c b/crypto/aes/aes_misc.c
index 4fead1b..f083488 100644
--- a/crypto/aes/aes_misc.c
+++ b/crypto/aes/aes_misc.c

@@ -50,6 +50,7 @@
  */
 
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
 #include <openssl/aes.h>
 #include "aes_locl.h"
 
@@ -62,3 +63,23 @@
         return "aes(partial)";
 #endif
 }
+
+/* FIPS wrapper functions to block low level AES calls in FIPS mode */
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+			AES_KEY *key)
+	{
+#ifdef OPENSSL_FIPS
+	fips_cipher_abort(AES);
+#endif
+	return private_AES_set_encrypt_key(userKey, bits, key);
+	}
+
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+			AES_KEY *key)
+	{
+#ifdef OPENSSL_FIPS
+	fips_cipher_abort(AES);
+#endif
+	return private_AES_set_decrypt_key(userKey, bits, key);
+	}

diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index aab40e6..687ed81 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl

@@ -39,7 +39,7 @@
 # but exhibits up to 10% improvement on other cores.
 #
 # Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
 # This made it possible to implement little-endian variant of the
 # algorithm without modifying the base C code. Motivating factor for
 # the undertaken effort was that it appeared that in tight IA-32
@@ -2854,12 +2854,12 @@
     &set_label("exit");
 &function_end("_x86_AES_set_encrypt_key");
 
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("AES_set_encrypt_key");
+&function_begin_B("private_AES_set_encrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&ret	();
-&function_end_B("AES_set_encrypt_key");
+&function_end_B("private_AES_set_encrypt_key");
 
 sub deckey()
 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2916,9 +2916,9 @@
 	&mov	(&DWP(4*$i,$key),$tp1);
 }
 
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("AES_set_decrypt_key");
+&function_begin_B("private_AES_set_decrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&cmp	("eax",0);
 	&je	(&label("proceed"));
@@ -2974,7 +2974,7 @@
 	&jb	(&label("permute"));
 
 	&xor	("eax","eax");			# return success
-&function_end("AES_set_decrypt_key");
+&function_end("private_AES_set_decrypt_key");
 &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();

diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index c51ee1f..943ce45 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl

@@ -27,6 +27,11 @@
 # Rescheduling for dual-issue pipeline resulted in 12% improvement on
 # Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~21.5 cycles per byte.
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
@@ -46,6 +51,7 @@
 $rounds="r12";
 
 $code=<<___;
+#include "arm_arch.h"
 .text
 .code	32
 
@@ -166,7 +172,7 @@
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
 	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
-
+#if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
 	ldrb	$t2,[$rounds,#1]
@@ -195,10 +201,33 @@
 	orr	$s3,$s3,$t1,lsl#8
 	orr	$s3,$s3,$t2,lsl#16
 	orr	$s3,$s3,$t3,lsl#24
-
+#else
+	ldr	$s0,[$rounds,#0]
+	ldr	$s1,[$rounds,#4]
+	ldr	$s2,[$rounds,#8]
+	ldr	$s3,[$rounds,#12]
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+#endif
 	bl	_armv4_AES_encrypt
 
 	ldr	$rounds,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+	str	$s0,[$rounds,#0]
+	str	$s1,[$rounds,#4]
+	str	$s2,[$rounds,#8]
+	str	$s3,[$rounds,#12]
+#else
 	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
 	mov	$t2,$s0,lsr#16		@ manner...
 	mov	$t3,$s0,lsr#8
@@ -227,11 +256,15 @@
 	strb	$t2,[$rounds,#13]
 	strb	$t3,[$rounds,#14]
 	strb	$s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	AES_encrypt,.-AES_encrypt
 
 .type   _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@
 	and	$i2,lr,$s2,lsr#16	@ i1
 	eor	$t3,$t3,$i3,ror#8
 	and	$i3,lr,$s2
-	eor	$s1,$s1,$t1,ror#24
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
+	eor	$s1,$s1,$t1,ror#24
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
 	mov	$s2,$s2,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
 	eor	$s0,$s0,$i1,ror#16
 	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
@@ -284,16 +317,16 @@
 	and	$i2,lr,$s3,lsr#8	@ i1
 	eor	$t3,$t3,$i3,ror#16
 	and	$i3,lr,$s3,lsr#16	@ i2
-	eor	$s2,$s2,$t2,ror#16
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
+	eor	$s2,$s2,$t2,ror#16
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
 	mov	$s3,$s3,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
 	eor	$s0,$s0,$i1,ror#24
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
-	eor	$s1,$s1,$i2,ror#16
 	ldr	$i1,[$key],#16
+	eor	$s1,$s1,$i2,ror#16
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
 	eor	$s2,$s2,$i3,ror#8
 	ldr	$t1,[$key,#-12]
 	eor	$s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@
 	and	$i2,lr,$s2,lsr#16	@ i1
 	eor	$t3,$i3,$t3,lsl#8
 	and	$i3,lr,$s2
-	eor	$s1,$t1,$s1,lsl#24
 	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
+	eor	$s1,$t1,$s1,lsl#24
+	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
 	mov	$s2,$s2,lsr#24
 
-	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
 	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
 	eor	$s0,$i1,$s0,lsl#8
 	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
@@ -346,15 +379,15 @@
 	and	$i2,lr,$s3,lsr#8	@ i1
 	eor	$t3,$i3,$t3,lsl#8
 	and	$i3,lr,$s3,lsr#16	@ i2
-	eor	$s2,$t2,$s2,lsl#24
 	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
+	eor	$s2,$t2,$s2,lsl#24
+	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
 	mov	$s3,$s3,lsr#24
 
-	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
 	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
 	eor	$s0,$i1,$s0,lsl#8
-	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
 	ldr	$i1,[$key,#0]
+	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
 	eor	$s1,$s1,$i2,lsl#8
 	ldr	$t1,[$key,#4]
 	eor	$s2,$s2,$i3,lsl#16
@@ -371,10 +404,10 @@
 	ldr	pc,[sp],#4		@ pop and return
 .size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
 
-.global AES_set_encrypt_key
-.type   AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type   private_AES_set_encrypt_key,%function
 .align	5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
 	sub	r3,pc,#8		@ AES_set_encrypt_key
 	teq	r0,#0
 	moveq	r0,#-1
@@ -392,12 +425,13 @@
 	bne	.Labrt
 
 .Lok:	stmdb   sp!,{r4-r12,lr}
-	sub	$tbl,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4
+	sub	$tbl,r3,#private_AES_set_encrypt_key-AES_Te-1024	@ Te4
 
 	mov	$rounds,r0		@ inp
 	mov	lr,r1			@ bits
 	mov	$key,r2			@ key
 
+#if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
 	ldrb	$t2,[$rounds,#1]
@@ -430,6 +464,22 @@
 	orr	$s3,$s3,$t3,lsl#24
 	str	$s2,[$key,#-8]
 	str	$s3,[$key,#-4]
+#else
+	ldr	$s0,[$rounds,#0]
+	ldr	$s1,[$rounds,#4]
+	ldr	$s2,[$rounds,#8]
+	ldr	$s3,[$rounds,#12]
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+	str	$s0,[$key],#16
+	str	$s1,[$key,#-12]
+	str	$s2,[$key,#-8]
+	str	$s3,[$key,#-4]
+#endif
 
 	teq	lr,#128
 	bne	.Lnot128
@@ -466,6 +516,7 @@
 	b	.Ldone
 
 .Lnot128:
+#if __ARM_ARCH__<7
 	ldrb	$i2,[$rounds,#19]
 	ldrb	$t1,[$rounds,#18]
 	ldrb	$t2,[$rounds,#17]
@@ -482,6 +533,16 @@
 	str	$i2,[$key],#8
 	orr	$i3,$i3,$t3,lsl#24
 	str	$i3,[$key,#-4]
+#else
+	ldr	$i2,[$rounds,#16]
+	ldr	$i3,[$rounds,#20]
+#ifdef __ARMEL__
+	rev	$i2,$i2
+	rev	$i3,$i3
+#endif
+	str	$i2,[$key],#8
+	str	$i3,[$key,#-4]
+#endif
 
 	teq	lr,#192
 	bne	.Lnot192
@@ -526,6 +587,7 @@
 	b	.L192_loop
 
 .Lnot192:
+#if __ARM_ARCH__<7
 	ldrb	$i2,[$rounds,#27]
 	ldrb	$t1,[$rounds,#26]
 	ldrb	$t2,[$rounds,#25]
@@ -542,6 +604,16 @@
 	str	$i2,[$key],#8
 	orr	$i3,$i3,$t3,lsl#24
 	str	$i3,[$key,#-4]
+#else
+	ldr	$i2,[$rounds,#24]
+	ldr	$i3,[$rounds,#28]
+#ifdef __ARMEL__
+	rev	$i2,$i2
+	rev	$i3,$i3
+#endif
+	str	$i2,[$key],#8
+	str	$i3,[$key,#-4]
+#endif
 
 	mov	$rounds,#14
 	str	$rounds,[$key,#240-32]
@@ -606,14 +678,14 @@
 .Labrt:	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 
-.global AES_set_decrypt_key
-.type   AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type   private_AES_set_decrypt_key,%function
 .align	5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
 	str	lr,[sp,#-4]!            @ push lr
-	bl	AES_set_encrypt_key
+	bl	private_AES_set_encrypt_key
 	teq	r0,#0
 	ldrne	lr,[sp],#4              @ pop lr
 	bne	.Labrt
@@ -692,11 +764,15 @@
 	bne	.Lmix
 
 	mov	r0,#0
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 
 .type	AES_Td,%object
 .align	5
@@ -811,7 +887,7 @@
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
 	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td
-
+#if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
 	ldrb	$t2,[$rounds,#1]
@@ -840,10 +916,33 @@
 	orr	$s3,$s3,$t1,lsl#8
 	orr	$s3,$s3,$t2,lsl#16
 	orr	$s3,$s3,$t3,lsl#24
-
+#else
+	ldr	$s0,[$rounds,#0]
+	ldr	$s1,[$rounds,#4]
+	ldr	$s2,[$rounds,#8]
+	ldr	$s3,[$rounds,#12]
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+#endif
 	bl	_armv4_AES_decrypt
 
 	ldr	$rounds,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+	str	$s0,[$rounds,#0]
+	str	$s1,[$rounds,#4]
+	str	$s2,[$rounds,#8]
+	str	$s3,[$rounds,#12]
+#else
 	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
 	mov	$t2,$s0,lsr#16		@ manner...
 	mov	$t3,$s0,lsr#8
@@ -872,11 +971,15 @@
 	strb	$t2,[$rounds,#13]
 	strb	$t3,[$rounds,#14]
 	strb	$s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	AES_decrypt,.-AES_decrypt
 
 .type   _armv4_AES_decrypt,%function
@@ -916,11 +1019,11 @@
 	and	$i2,lr,$s2		@ i1
 	eor	$t3,$i3,$t3,ror#8
 	and	$i3,lr,$s2,lsr#16
-	eor	$s1,$s1,$t1,ror#8
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
+	eor	$s1,$s1,$t1,ror#8
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
 	mov	$s2,$s2,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
 	eor	$s0,$s0,$i1,ror#16
 	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
@@ -929,22 +1032,22 @@
 	and	$i2,lr,$s3,lsr#8	@ i1
 	eor	$t3,$i3,$t3,ror#8
 	and	$i3,lr,$s3		@ i2
-	eor	$s2,$s2,$t2,ror#8
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
+	eor	$s2,$s2,$t2,ror#8
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
 	mov	$s3,$s3,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
 	eor	$s0,$s0,$i1,ror#8
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
-	eor	$s1,$s1,$i2,ror#16
-	eor	$s2,$s2,$i3,ror#24
 	ldr	$i1,[$key],#16
-	eor	$s3,$s3,$t3,ror#8
+	eor	$s1,$s1,$i2,ror#16
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
+	eor	$s2,$s2,$i3,ror#24
 
 	ldr	$t1,[$key,#-12]
-	ldr	$t2,[$key,#-8]
 	eor	$s0,$s0,$i1
+	ldr	$t2,[$key,#-8]
+	eor	$s3,$s3,$t3,ror#8
 	ldr	$t3,[$key,#-4]
 	and	$i1,lr,$s0,lsr#16
 	eor	$s1,$s1,$t1
@@ -985,11 +1088,11 @@
 	and	$i1,lr,$s2,lsr#8	@ i0
 	eor	$t2,$t2,$i2,lsl#8
 	and	$i2,lr,$s2		@ i1
-	eor	$t3,$t3,$i3,lsl#8
 	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
+	eor	$t3,$t3,$i3,lsl#8
+	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
 	and	$i3,lr,$s2,lsr#16
 
-	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
 	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
 	eor	$s0,$s0,$i1,lsl#8
 	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
@@ -997,11 +1100,11 @@
 	and	$i1,lr,$s3,lsr#16	@ i0
 	eor	$s2,$t2,$s2,lsl#16
 	and	$i2,lr,$s3,lsr#8	@ i1
-	eor	$t3,$t3,$i3,lsl#16
 	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
+	eor	$t3,$t3,$i3,lsl#16
+	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
 	and	$i3,lr,$s3		@ i2
 
-	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
 	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
 	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
 	eor	$s0,$s0,$i1,lsl#16

diff --git a/crypto/aes/asm/aes-armv4.s b/crypto/aes/asm/aes-armv4.s
index 27c681c..e57e0d0 100644
--- a/crypto/aes/asm/aes-armv4.s
+++ b/crypto/aes/asm/aes-armv4.s

@@ -1,3 +1,4 @@
+#include "arm_arch.h"
 .text
 .code	32
 
@@ -118,7 +119,7 @@
 	mov	r12,r0		@ inp
 	mov	r11,r2
 	sub	r10,r3,#AES_encrypt-AES_Te	@ Te
-
+#if __ARM_ARCH__<7
 	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
 	ldrb	r4,[r12,#2]	@ manner...
 	ldrb	r5,[r12,#1]
@@ -147,10 +148,33 @@
 	orr	r3,r3,r4,lsl#8
 	orr	r3,r3,r5,lsl#16
 	orr	r3,r3,r6,lsl#24
-
+#else
+	ldr	r0,[r12,#0]
+	ldr	r1,[r12,#4]
+	ldr	r2,[r12,#8]
+	ldr	r3,[r12,#12]
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#endif
 	bl	_armv4_AES_encrypt
 
 	ldr	r12,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+	str	r0,[r12,#0]
+	str	r1,[r12,#4]
+	str	r2,[r12,#8]
+	str	r3,[r12,#12]
+#else
 	mov	r4,r0,lsr#24		@ write output in endian-neutral
 	mov	r5,r0,lsr#16		@ manner...
 	mov	r6,r0,lsr#8
@@ -179,11 +203,15 @@
 	strb	r5,[r12,#13]
 	strb	r6,[r12,#14]
 	strb	r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
 .size	AES_encrypt,.-AES_encrypt
 
 .type   _armv4_AES_encrypt,%function
@@ -223,11 +251,11 @@
 	and	r8,lr,r2,lsr#16	@ i1
 	eor	r6,r6,r9,ror#8
 	and	r9,lr,r2
-	eor	r1,r1,r4,ror#24
 	ldr	r7,[r10,r7,lsl#2]	@ Te2[s2>>8]
+	eor	r1,r1,r4,ror#24
+	ldr	r8,[r10,r8,lsl#2]	@ Te1[s2>>16]
 	mov	r2,r2,lsr#24
 
-	ldr	r8,[r10,r8,lsl#2]	@ Te1[s2>>16]
 	ldr	r9,[r10,r9,lsl#2]	@ Te3[s2>>0]
 	eor	r0,r0,r7,ror#16
 	ldr	r2,[r10,r2,lsl#2]	@ Te0[s2>>24]
@@ -236,16 +264,16 @@
 	and	r8,lr,r3,lsr#8	@ i1
 	eor	r6,r6,r9,ror#16
 	and	r9,lr,r3,lsr#16	@ i2
-	eor	r2,r2,r5,ror#16
 	ldr	r7,[r10,r7,lsl#2]	@ Te3[s3>>0]
+	eor	r2,r2,r5,ror#16
+	ldr	r8,[r10,r8,lsl#2]	@ Te2[s3>>8]
 	mov	r3,r3,lsr#24
 
-	ldr	r8,[r10,r8,lsl#2]	@ Te2[s3>>8]
 	ldr	r9,[r10,r9,lsl#2]	@ Te1[s3>>16]
 	eor	r0,r0,r7,ror#24
-	ldr	r3,[r10,r3,lsl#2]	@ Te0[s3>>24]
-	eor	r1,r1,r8,ror#16
 	ldr	r7,[r11],#16
+	eor	r1,r1,r8,ror#16
+	ldr	r3,[r10,r3,lsl#2]	@ Te0[s3>>24]
 	eor	r2,r2,r9,ror#8
 	ldr	r4,[r11,#-12]
 	eor	r3,r3,r6,ror#8
@@ -285,11 +313,11 @@
 	and	r8,lr,r2,lsr#16	@ i1
 	eor	r6,r9,r6,lsl#8
 	and	r9,lr,r2
-	eor	r1,r4,r1,lsl#24
 	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s2>>8]
+	eor	r1,r4,r1,lsl#24
+	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s2>>16]
 	mov	r2,r2,lsr#24
 
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s2>>16]
 	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s2>>0]
 	eor	r0,r7,r0,lsl#8
 	ldrb	r2,[r10,r2,lsl#2]	@ Te4[s2>>24]
@@ -298,15 +326,15 @@
 	and	r8,lr,r3,lsr#8	@ i1
 	eor	r6,r9,r6,lsl#8
 	and	r9,lr,r3,lsr#16	@ i2
-	eor	r2,r5,r2,lsl#24
 	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s3>>0]
+	eor	r2,r5,r2,lsl#24
+	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s3>>8]
 	mov	r3,r3,lsr#24
 
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s3>>8]
 	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s3>>16]
 	eor	r0,r7,r0,lsl#8
-	ldrb	r3,[r10,r3,lsl#2]	@ Te4[s3>>24]
 	ldr	r7,[r11,#0]
+	ldrb	r3,[r10,r3,lsl#2]	@ Te4[s3>>24]
 	eor	r1,r1,r8,lsl#8
 	ldr	r4,[r11,#4]
 	eor	r2,r2,r9,lsl#16
@@ -323,10 +351,10 @@
 	ldr	pc,[sp],#4		@ pop and return
 .size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
 
-.global AES_set_encrypt_key
-.type   AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type   private_AES_set_encrypt_key,%function
 .align	5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
 	sub	r3,pc,#8		@ AES_set_encrypt_key
 	teq	r0,#0
 	moveq	r0,#-1
@@ -344,12 +372,13 @@
 	bne	.Labrt
 
 .Lok:	stmdb   sp!,{r4-r12,lr}
-	sub	r10,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4
+	sub	r10,r3,#private_AES_set_encrypt_key-AES_Te-1024	@ Te4
 
 	mov	r12,r0		@ inp
 	mov	lr,r1			@ bits
 	mov	r11,r2			@ key
 
+#if __ARM_ARCH__<7
 	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
 	ldrb	r4,[r12,#2]	@ manner...
 	ldrb	r5,[r12,#1]
@@ -382,6 +411,22 @@
 	orr	r3,r3,r6,lsl#24
 	str	r2,[r11,#-8]
 	str	r3,[r11,#-4]
+#else
+	ldr	r0,[r12,#0]
+	ldr	r1,[r12,#4]
+	ldr	r2,[r12,#8]
+	ldr	r3,[r12,#12]
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+	str	r0,[r11],#16
+	str	r1,[r11,#-12]
+	str	r2,[r11,#-8]
+	str	r3,[r11,#-4]
+#endif
 
 	teq	lr,#128
 	bne	.Lnot128
@@ -418,6 +463,7 @@
 	b	.Ldone
 
 .Lnot128:
+#if __ARM_ARCH__<7
 	ldrb	r8,[r12,#19]
 	ldrb	r4,[r12,#18]
 	ldrb	r5,[r12,#17]
@@ -434,6 +480,16 @@
 	str	r8,[r11],#8
 	orr	r9,r9,r6,lsl#24
 	str	r9,[r11,#-4]
+#else
+	ldr	r8,[r12,#16]
+	ldr	r9,[r12,#20]
+#ifdef __ARMEL__
+	rev	r8,r8
+	rev	r9,r9
+#endif
+	str	r8,[r11],#8
+	str	r9,[r11,#-4]
+#endif
 
 	teq	lr,#192
 	bne	.Lnot192
@@ -478,6 +534,7 @@
 	b	.L192_loop
 
 .Lnot192:
+#if __ARM_ARCH__<7
 	ldrb	r8,[r12,#27]
 	ldrb	r4,[r12,#26]
 	ldrb	r5,[r12,#25]
@@ -494,6 +551,16 @@
 	str	r8,[r11],#8
 	orr	r9,r9,r6,lsl#24
 	str	r9,[r11,#-4]
+#else
+	ldr	r8,[r12,#24]
+	ldr	r9,[r12,#28]
+#ifdef __ARMEL__
+	rev	r8,r8
+	rev	r9,r9
+#endif
+	str	r8,[r11],#8
+	str	r9,[r11,#-4]
+#endif
 
 	mov	r12,#14
 	str	r12,[r11,#240-32]
@@ -558,14 +625,14 @@
 .Labrt:	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 
-.global AES_set_decrypt_key
-.type   AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type   private_AES_set_decrypt_key,%function
 .align	5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
 	str	lr,[sp,#-4]!            @ push lr
-	bl	AES_set_encrypt_key
+	bl	private_AES_set_encrypt_key
 	teq	r0,#0
 	ldrne	lr,[sp],#4              @ pop lr
 	bne	.Labrt
@@ -639,11 +706,15 @@
 	bne	.Lmix
 
 	mov	r0,#0
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 
 .type	AES_Td,%object
 .align	5
@@ -758,7 +829,7 @@
 	mov	r12,r0		@ inp
 	mov	r11,r2
 	sub	r10,r3,#AES_decrypt-AES_Td		@ Td
-
+#if __ARM_ARCH__<7
 	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
 	ldrb	r4,[r12,#2]	@ manner...
 	ldrb	r5,[r12,#1]
@@ -787,10 +858,33 @@
 	orr	r3,r3,r4,lsl#8
 	orr	r3,r3,r5,lsl#16
 	orr	r3,r3,r6,lsl#24
-
+#else
+	ldr	r0,[r12,#0]
+	ldr	r1,[r12,#4]
+	ldr	r2,[r12,#8]
+	ldr	r3,[r12,#12]
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#endif
 	bl	_armv4_AES_decrypt
 
 	ldr	r12,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+	str	r0,[r12,#0]
+	str	r1,[r12,#4]
+	str	r2,[r12,#8]
+	str	r3,[r12,#12]
+#else
 	mov	r4,r0,lsr#24		@ write output in endian-neutral
 	mov	r5,r0,lsr#16		@ manner...
 	mov	r6,r0,lsr#8
@@ -819,11 +913,15 @@
 	strb	r5,[r12,#13]
 	strb	r6,[r12,#14]
 	strb	r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
 .size	AES_decrypt,.-AES_decrypt
 
 .type   _armv4_AES_decrypt,%function
@@ -863,11 +961,11 @@
 	and	r8,lr,r2		@ i1
 	eor	r6,r9,r6,ror#8
 	and	r9,lr,r2,lsr#16
-	eor	r1,r1,r4,ror#8
 	ldr	r7,[r10,r7,lsl#2]	@ Td2[s2>>8]
+	eor	r1,r1,r4,ror#8
+	ldr	r8,[r10,r8,lsl#2]	@ Td3[s2>>0]
 	mov	r2,r2,lsr#24
 
-	ldr	r8,[r10,r8,lsl#2]	@ Td3[s2>>0]
 	ldr	r9,[r10,r9,lsl#2]	@ Td1[s2>>16]
 	eor	r0,r0,r7,ror#16
 	ldr	r2,[r10,r2,lsl#2]	@ Td0[s2>>24]
@@ -876,22 +974,22 @@
 	and	r8,lr,r3,lsr#8	@ i1
 	eor	r6,r9,r6,ror#8
 	and	r9,lr,r3		@ i2
-	eor	r2,r2,r5,ror#8
 	ldr	r7,[r10,r7,lsl#2]	@ Td1[s3>>16]
+	eor	r2,r2,r5,ror#8
+	ldr	r8,[r10,r8,lsl#2]	@ Td2[s3>>8]
 	mov	r3,r3,lsr#24
 
-	ldr	r8,[r10,r8,lsl#2]	@ Td2[s3>>8]
 	ldr	r9,[r10,r9,lsl#2]	@ Td3[s3>>0]
 	eor	r0,r0,r7,ror#8
-	ldr	r3,[r10,r3,lsl#2]	@ Td0[s3>>24]
-	eor	r1,r1,r8,ror#16
-	eor	r2,r2,r9,ror#24
 	ldr	r7,[r11],#16
-	eor	r3,r3,r6,ror#8
+	eor	r1,r1,r8,ror#16
+	ldr	r3,[r10,r3,lsl#2]	@ Td0[s3>>24]
+	eor	r2,r2,r9,ror#24
 
 	ldr	r4,[r11,#-12]
-	ldr	r5,[r11,#-8]
 	eor	r0,r0,r7
+	ldr	r5,[r11,#-8]
+	eor	r3,r3,r6,ror#8
 	ldr	r6,[r11,#-4]
 	and	r7,lr,r0,lsr#16
 	eor	r1,r1,r4
@@ -932,11 +1030,11 @@
 	and	r7,lr,r2,lsr#8	@ i0
 	eor	r5,r5,r8,lsl#8
 	and	r8,lr,r2		@ i1
-	eor	r6,r6,r9,lsl#8
 	ldrb	r7,[r10,r7]		@ Td4[s2>>8]
+	eor	r6,r6,r9,lsl#8
+	ldrb	r8,[r10,r8]		@ Td4[s2>>0]
 	and	r9,lr,r2,lsr#16
 
-	ldrb	r8,[r10,r8]		@ Td4[s2>>0]
 	ldrb	r2,[r10,r2,lsr#24]	@ Td4[s2>>24]
 	eor	r0,r0,r7,lsl#8
 	ldrb	r9,[r10,r9]		@ Td4[s2>>16]
@@ -944,11 +1042,11 @@
 	and	r7,lr,r3,lsr#16	@ i0
 	eor	r2,r5,r2,lsl#16
 	and	r8,lr,r3,lsr#8	@ i1
-	eor	r6,r6,r9,lsl#16
 	ldrb	r7,[r10,r7]		@ Td4[s3>>16]
+	eor	r6,r6,r9,lsl#16
+	ldrb	r8,[r10,r8]		@ Td4[s3>>8]
 	and	r9,lr,r3		@ i2
 
-	ldrb	r8,[r10,r8]		@ Td4[s3>>8]
 	ldrb	r9,[r10,r9]		@ Td4[s3>>0]
 	ldrb	r3,[r10,r3,lsr#24]	@ Td4[s3>>24]
 	eor	r0,r0,r7,lsl#16

diff --git a/crypto/aes/asm/aes-parisc.pl b/crypto/aes/asm/aes-parisc.pl
new file mode 100644
index 0000000..c36b6a2
--- /dev/null
+++ b/crypto/aes/asm/aes-parisc.pl

@@ -0,0 +1,1021 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for PA-RISC.
+#
+# June 2009.
+#
+# The module is mechanical transliteration of aes-sparcv9.pl, but with
+# a twist: S-boxes are compressed even further down to 1K+256B. On
+# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
+# is about 33 cycles per byte processed with 128-bit key. Newer CPUs
+# perform at 16 cycles per byte. It's not faster than code generated
+# by vendor compiler, but recall that it has compressed S-boxes, which
+# requires extra processing.
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+				#                 [+ argument transfer]
+$inp="%r26";	# arg0
+$out="%r25";	# arg1
+$key="%r24";	# arg2
+
+($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
+($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
+
+($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
+ $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
+("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
+"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
+
+$tbl="%r28";
+$rounds="%r29";
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	64
+AES_encrypt
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
+	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+	blr	%r0,$tbl
+	ldi	3,$t0
+L\$enc_pic
+	andcm	$tbl,$t0,$tbl
+	ldo	L\$AES_Te-L\$enc_pic($tbl),$tbl
+
+	and	$inp,$t0,$t0
+	sub	$inp,$t0,$inp
+	ldw	0($inp),$s0
+	ldw	4($inp),$s1
+	ldw	8($inp),$s2
+	comib,=	0,$t0,L\$enc_inp_aligned
+	ldw	12($inp),$s3
+
+	sh3addl	$t0,%r0,$t0
+	subi	32,$t0,$t0
+	mtctl	$t0,%cr11
+	ldw	16($inp),$t1
+	vshd	$s0,$s1,$s0
+	vshd	$s1,$s2,$s1
+	vshd	$s2,$s3,$s2
+	vshd	$s3,$t1,$s3
+
+L\$enc_inp_aligned
+	bl	_parisc_AES_encrypt,%r31
+	nop
+
+	extru,<> $out,31,2,%r0
+	b	L\$enc_out_aligned
+	nop
+
+	_srm	$s0,24,$acc0
+	_srm	$s0,16,$acc1
+	stb	$acc0,0($out)
+	_srm	$s0,8,$acc2
+	stb	$acc1,1($out)
+	_srm	$s1,24,$acc4
+	stb	$acc2,2($out)
+	_srm	$s1,16,$acc5
+	stb	$s0,3($out)
+	_srm	$s1,8,$acc6
+	stb	$acc4,4($out)
+	_srm	$s2,24,$acc0
+	stb	$acc5,5($out)
+	_srm	$s2,16,$acc1
+	stb	$acc6,6($out)
+	_srm	$s2,8,$acc2
+	stb	$s1,7($out)
+	_srm	$s3,24,$acc4
+	stb	$acc0,8($out)
+	_srm	$s3,16,$acc5
+	stb	$acc1,9($out)
+	_srm	$s3,8,$acc6
+	stb	$acc2,10($out)
+	stb	$s2,11($out)
+	stb	$acc4,12($out)
+	stb	$acc5,13($out)
+	stb	$acc6,14($out)
+	b	L\$enc_done
+	stb	$s3,15($out)
+
+L\$enc_out_aligned
+	stw	$s0,0($out)
+	stw	$s1,4($out)
+	stw	$s2,8($out)
+	stw	$s3,12($out)
+
+L\$enc_done
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
+	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.ALIGN	16
+_parisc_AES_encrypt
+	.PROC
+	.CALLINFO	MILLICODE
+	.ENTRY
+	ldw	240($key),$rounds
+	ldw	0($key),$t0
+	ldw	4($key),$t1
+	ldw	8($key),$t2
+	_srm	$rounds,1,$rounds
+	xor	$t0,$s0,$s0
+	ldw	12($key),$t3
+	_srm	$s0,24,$acc0
+	xor	$t1,$s1,$s1
+	ldw	16($key),$t0
+	_srm	$s1,16,$acc1
+	xor	$t2,$s2,$s2
+	ldw	20($key),$t1
+	xor	$t3,$s3,$s3
+	ldw	24($key),$t2
+	ldw	28($key),$t3
+L\$enc_loop
+	_srm	$s2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$s3,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$s1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$s2,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$s3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$s0,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$s2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$s3,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$s0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$s1,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$s3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$s0,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$s1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$s2,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+	ldwx,s	$acc14($tbl),$acc14
+	ldwx,s	$acc15($tbl),$acc15
+	addib,= -1,$rounds,L\$enc_last
+	ldo	32($key),$key
+
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+	_srm	$t1,16,$acc1
+		xor	$acc15,$t3,$t3
+
+	_srm	$t2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$t3,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$t1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$t2,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$t3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$t0,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$t2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$t3,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$t0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$t1,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$t3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$t0,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$t1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$t2,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+		_ror	$acc1,8,$acc1
+	ldwx,s	$acc14($tbl),$acc14
+
+		_ror	$acc2,16,$acc2
+		xor	$acc0,$s0,$s0
+	ldwx,s	$acc15($tbl),$acc15
+		_ror	$acc3,24,$acc3
+		xor	$acc1,$s0,$s0
+	ldw	16($key),$t0
+		_ror	$acc5,8,$acc5
+		xor	$acc2,$s0,$s0
+	ldw	20($key),$t1
+		_ror	$acc6,16,$acc6
+		xor	$acc3,$s0,$s0
+	ldw	24($key),$t2
+		_ror	$acc7,24,$acc7
+		xor	$acc4,$s1,$s1
+	ldw	28($key),$t3
+		_ror	$acc9,8,$acc9
+		xor	$acc5,$s1,$s1
+	ldw	1024+0($tbl),%r0		; prefetch te4
+		_ror	$acc10,16,$acc10
+		xor	$acc6,$s1,$s1
+	ldw	1024+32($tbl),%r0		; prefetch te4
+		_ror	$acc11,24,$acc11
+		xor	$acc7,$s1,$s1
+	ldw	1024+64($tbl),%r0		; prefetch te4
+		_ror	$acc13,8,$acc13
+		xor	$acc8,$s2,$s2
+	ldw	1024+96($tbl),%r0		; prefetch te4
+		_ror	$acc14,16,$acc14
+		xor	$acc9,$s2,$s2
+	ldw	1024+128($tbl),%r0		; prefetch te4
+		_ror	$acc15,24,$acc15
+		xor	$acc10,$s2,$s2
+	ldw	1024+160($tbl),%r0		; prefetch te4
+	_srm	$s0,24,$acc0
+		xor	$acc11,$s2,$s2
+	ldw	1024+192($tbl),%r0		; prefetch te4
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+	ldw	1024+224($tbl),%r0		; prefetch te4
+	_srm	$s1,16,$acc1
+		xor	$acc14,$s3,$s3
+	b	L\$enc_loop
+		xor	$acc15,$s3,$s3
+
+	.ALIGN	16
+L\$enc_last
+	ldo	1024($tbl),$rounds
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+	_srm	$t1,16,$acc1
+		xor	$acc15,$t3,$t3
+
+	_srm	$t2,8,$acc2
+	ldbx	$acc0($rounds),$acc0
+	_srm	$t1,24,$acc4
+	ldbx	$acc1($rounds),$acc1
+	_srm	$t2,16,$acc5
+	_srm	$t3,0,$acc3
+	ldbx	$acc2($rounds),$acc2
+	ldbx	$acc3($rounds),$acc3
+	_srm	$t3,8,$acc6
+	ldbx	$acc4($rounds),$acc4
+	_srm	$t2,24,$acc8
+	ldbx	$acc5($rounds),$acc5
+	_srm	$t3,16,$acc9
+	_srm	$t0,0,$acc7
+	ldbx	$acc6($rounds),$acc6
+	ldbx	$acc7($rounds),$acc7
+	_srm	$t0,8,$acc10
+	ldbx	$acc8($rounds),$acc8
+	_srm	$t3,24,$acc12
+	ldbx	$acc9($rounds),$acc9
+	_srm	$t0,16,$acc13
+	_srm	$t1,0,$acc11
+	ldbx	$acc10($rounds),$acc10
+	_srm	$t1,8,$acc14
+	ldbx	$acc11($rounds),$acc11
+	ldbx	$acc12($rounds),$acc12
+	ldbx	$acc13($rounds),$acc13
+	_srm	$t2,0,$acc15
+	ldbx	$acc14($rounds),$acc14
+
+		dep	$acc0,7,8,$acc3
+	ldbx	$acc15($rounds),$acc15
+		dep	$acc4,7,8,$acc7
+		dep	$acc1,15,8,$acc3
+		dep	$acc5,15,8,$acc7
+		dep	$acc2,23,8,$acc3
+		dep	$acc6,23,8,$acc7
+		xor	$acc3,$s0,$s0
+		xor	$acc7,$s1,$s1
+		dep	$acc8,7,8,$acc11
+		dep	$acc12,7,8,$acc15
+		dep	$acc9,15,8,$acc11
+		dep	$acc13,15,8,$acc15
+		dep	$acc10,23,8,$acc11
+		dep	$acc14,23,8,$acc15
+		xor	$acc11,$s2,$s2
+
+	bv	(%r31)
+	.EXIT
+		xor	$acc15,$s3,$s3
+	.PROCEND
+
+	.ALIGN	64
+L\$AES_Te
+	.WORD	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+	.WORD	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+	.WORD	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+	.WORD	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+	.WORD	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+	.WORD	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+	.WORD	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+	.WORD	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+	.WORD	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+	.WORD	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+	.WORD	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+	.WORD	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+	.WORD	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+	.WORD	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+	.WORD	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+	.WORD	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+	.WORD	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+	.WORD	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+	.WORD	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+	.WORD	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+	.WORD	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+	.WORD	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+	.WORD	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+	.WORD	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+	.WORD	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+	.WORD	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+	.WORD	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+	.WORD	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+	.WORD	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+	.WORD	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+	.WORD	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+	.WORD	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+	.WORD	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+	.WORD	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+	.WORD	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+	.WORD	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+	.WORD	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+	.WORD	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+	.WORD	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+	.WORD	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+	.WORD	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+	.WORD	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+	.WORD	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+	.WORD	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+	.WORD	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+	.WORD	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+	.WORD	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+	.WORD	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+	.WORD	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+	.WORD	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+	.WORD	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+	.WORD	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+	.WORD	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+	.WORD	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+	.WORD	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+	.WORD	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+	.WORD	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+	.WORD	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+	.WORD	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+	.WORD	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+	.WORD	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+	.WORD	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+	.WORD	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+	.WORD	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+	.BYTE	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+	.BYTE	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+	.BYTE	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+	.BYTE	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+	.BYTE	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+	.BYTE	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+	.BYTE	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+	.BYTE	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+	.BYTE	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+	.BYTE	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+	.BYTE	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+	.BYTE	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+	.BYTE	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+	.BYTE	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+	.BYTE	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+	.BYTE	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+	.BYTE	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+	.BYTE	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+	.BYTE	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+	.BYTE	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+	.BYTE	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+	.BYTE	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+	.BYTE	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+	.BYTE	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+	.BYTE	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+	.BYTE	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+	.BYTE	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+	.BYTE	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+	.BYTE	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+	.BYTE	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+	.BYTE	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+	.BYTE	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+
+$code.=<<___;
+	.EXPORT	AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	16
+AES_decrypt
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
+	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+	blr	%r0,$tbl
+	ldi	3,$t0
+L\$dec_pic
+	andcm	$tbl,$t0,$tbl
+	ldo	L\$AES_Td-L\$dec_pic($tbl),$tbl
+
+	and	$inp,$t0,$t0
+	sub	$inp,$t0,$inp
+	ldw	0($inp),$s0
+	ldw	4($inp),$s1
+	ldw	8($inp),$s2
+	comib,=	0,$t0,L\$dec_inp_aligned
+	ldw	12($inp),$s3
+
+	sh3addl	$t0,%r0,$t0
+	subi	32,$t0,$t0
+	mtctl	$t0,%cr11
+	ldw	16($inp),$t1
+	vshd	$s0,$s1,$s0
+	vshd	$s1,$s2,$s1
+	vshd	$s2,$s3,$s2
+	vshd	$s3,$t1,$s3
+
+L\$dec_inp_aligned
+	bl	_parisc_AES_decrypt,%r31
+	nop
+
+	extru,<> $out,31,2,%r0
+	b	L\$dec_out_aligned
+	nop
+
+	_srm	$s0,24,$acc0
+	_srm	$s0,16,$acc1
+	stb	$acc0,0($out)
+	_srm	$s0,8,$acc2
+	stb	$acc1,1($out)
+	_srm	$s1,24,$acc4
+	stb	$acc2,2($out)
+	_srm	$s1,16,$acc5
+	stb	$s0,3($out)
+	_srm	$s1,8,$acc6
+	stb	$acc4,4($out)
+	_srm	$s2,24,$acc0
+	stb	$acc5,5($out)
+	_srm	$s2,16,$acc1
+	stb	$acc6,6($out)
+	_srm	$s2,8,$acc2
+	stb	$s1,7($out)
+	_srm	$s3,24,$acc4
+	stb	$acc0,8($out)
+	_srm	$s3,16,$acc5
+	stb	$acc1,9($out)
+	_srm	$s3,8,$acc6
+	stb	$acc2,10($out)
+	stb	$s2,11($out)
+	stb	$acc4,12($out)
+	stb	$acc5,13($out)
+	stb	$acc6,14($out)
+	b	L\$dec_done
+	stb	$s3,15($out)
+
+L\$dec_out_aligned
+	stw	$s0,0($out)
+	stw	$s1,4($out)
+	stw	$s2,8($out)
+	stw	$s3,12($out)
+
+L\$dec_done
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
+	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.ALIGN	16
+_parisc_AES_decrypt
+	.PROC
+	.CALLINFO	MILLICODE
+	.ENTRY
+	ldw	240($key),$rounds
+	ldw	0($key),$t0
+	ldw	4($key),$t1
+	ldw	8($key),$t2
+	ldw	12($key),$t3
+	_srm	$rounds,1,$rounds
+	xor	$t0,$s0,$s0
+	ldw	16($key),$t0
+	xor	$t1,$s1,$s1
+	ldw	20($key),$t1
+	_srm	$s0,24,$acc0
+	xor	$t2,$s2,$s2
+	ldw	24($key),$t2
+	xor	$t3,$s3,$s3
+	ldw	28($key),$t3
+	_srm	$s3,16,$acc1
+L\$dec_loop
+	_srm	$s2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$s1,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$s1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$s0,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$s3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$s2,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$s2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$s1,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$s0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$s3,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$s3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$s2,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$s1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$s0,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+	ldwx,s	$acc14($tbl),$acc14
+	ldwx,s	$acc15($tbl),$acc15
+	addib,= -1,$rounds,L\$dec_last
+	ldo	32($key),$key
+
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+		xor	$acc15,$t3,$t3
+	_srm	$t3,16,$acc1
+
+	_srm	$t2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$t1,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$t1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$t0,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$t3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$t2,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$t2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$t1,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$t0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$t3,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$t3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$t2,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$t1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$t0,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+		_ror	$acc1,8,$acc1
+	ldwx,s	$acc14($tbl),$acc14
+
+		_ror	$acc2,16,$acc2
+		xor	$acc0,$s0,$s0
+	ldwx,s	$acc15($tbl),$acc15
+		_ror	$acc3,24,$acc3
+		xor	$acc1,$s0,$s0
+	ldw	16($key),$t0
+		_ror	$acc5,8,$acc5
+		xor	$acc2,$s0,$s0
+	ldw	20($key),$t1
+		_ror	$acc6,16,$acc6
+		xor	$acc3,$s0,$s0
+	ldw	24($key),$t2
+		_ror	$acc7,24,$acc7
+		xor	$acc4,$s1,$s1
+	ldw	28($key),$t3
+		_ror	$acc9,8,$acc9
+		xor	$acc5,$s1,$s1
+	ldw	1024+0($tbl),%r0		; prefetch td4
+		_ror	$acc10,16,$acc10
+		xor	$acc6,$s1,$s1
+	ldw	1024+32($tbl),%r0		; prefetch td4
+		_ror	$acc11,24,$acc11
+		xor	$acc7,$s1,$s1
+	ldw	1024+64($tbl),%r0		; prefetch td4
+		_ror	$acc13,8,$acc13
+		xor	$acc8,$s2,$s2
+	ldw	1024+96($tbl),%r0		; prefetch td4
+		_ror	$acc14,16,$acc14
+		xor	$acc9,$s2,$s2
+	ldw	1024+128($tbl),%r0		; prefetch td4
+		_ror	$acc15,24,$acc15
+		xor	$acc10,$s2,$s2
+	ldw	1024+160($tbl),%r0		; prefetch td4
+	_srm	$s0,24,$acc0
+		xor	$acc11,$s2,$s2
+	ldw	1024+192($tbl),%r0		; prefetch td4
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+	ldw	1024+224($tbl),%r0		; prefetch td4
+		xor	$acc14,$s3,$s3
+		xor	$acc15,$s3,$s3
+	b	L\$dec_loop
+	_srm	$s3,16,$acc1
+
+	.ALIGN	16
+L\$dec_last
+	ldo	1024($tbl),$rounds
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+		xor	$acc15,$t3,$t3
+	_srm	$t3,16,$acc1
+
+	_srm	$t2,8,$acc2
+	ldbx	$acc0($rounds),$acc0
+	_srm	$t1,24,$acc4
+	ldbx	$acc1($rounds),$acc1
+	_srm	$t0,16,$acc5
+	_srm	$t1,0,$acc3
+	ldbx	$acc2($rounds),$acc2
+	ldbx	$acc3($rounds),$acc3
+	_srm	$t3,8,$acc6
+	ldbx	$acc4($rounds),$acc4
+	_srm	$t2,24,$acc8
+	ldbx	$acc5($rounds),$acc5
+	_srm	$t1,16,$acc9
+	_srm	$t2,0,$acc7
+	ldbx	$acc6($rounds),$acc6
+	ldbx	$acc7($rounds),$acc7
+	_srm	$t0,8,$acc10
+	ldbx	$acc8($rounds),$acc8
+	_srm	$t3,24,$acc12
+	ldbx	$acc9($rounds),$acc9
+	_srm	$t2,16,$acc13
+	_srm	$t3,0,$acc11
+	ldbx	$acc10($rounds),$acc10
+	_srm	$t1,8,$acc14
+	ldbx	$acc11($rounds),$acc11
+	ldbx	$acc12($rounds),$acc12
+	ldbx	$acc13($rounds),$acc13
+	_srm	$t0,0,$acc15
+	ldbx	$acc14($rounds),$acc14
+
+		dep	$acc0,7,8,$acc3
+	ldbx	$acc15($rounds),$acc15
+		dep	$acc4,7,8,$acc7
+		dep	$acc1,15,8,$acc3
+		dep	$acc5,15,8,$acc7
+		dep	$acc2,23,8,$acc3
+		dep	$acc6,23,8,$acc7
+		xor	$acc3,$s0,$s0
+		xor	$acc7,$s1,$s1
+		dep	$acc8,7,8,$acc11
+		dep	$acc12,7,8,$acc15
+		dep	$acc9,15,8,$acc11
+		dep	$acc13,15,8,$acc15
+		dep	$acc10,23,8,$acc11
+		dep	$acc14,23,8,$acc15
+		xor	$acc11,$s2,$s2
+
+	bv	(%r31)
+	.EXIT
+		xor	$acc15,$s3,$s3
+	.PROCEND
+
+	.ALIGN	64
+L\$AES_Td
+	.WORD	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+	.WORD	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+	.WORD	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+	.WORD	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+	.WORD	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+	.WORD	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+	.WORD	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+	.WORD	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+	.WORD	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+	.WORD	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+	.WORD	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+	.WORD	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+	.WORD	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+	.WORD	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+	.WORD	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+	.WORD	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+	.WORD	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+	.WORD	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+	.WORD	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+	.WORD	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+	.WORD	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+	.WORD	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+	.WORD	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+	.WORD	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+	.WORD	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+	.WORD	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+	.WORD	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+	.WORD	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+	.WORD	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+	.WORD	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+	.WORD	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+	.WORD	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+	.WORD	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+	.WORD	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+	.WORD	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+	.WORD	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+	.WORD	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+	.WORD	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+	.WORD	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+	.WORD	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+	.WORD	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+	.WORD	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+	.WORD	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+	.WORD	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+	.WORD	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+	.WORD	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+	.WORD	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+	.WORD	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+	.WORD	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+	.WORD	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+	.WORD	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+	.WORD	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+	.WORD	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+	.WORD	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+	.WORD	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+	.WORD	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+	.WORD	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+	.WORD	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+	.WORD	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+	.WORD	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+	.WORD	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+	.WORD	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+	.WORD	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+	.WORD	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+	.BYTE	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+	.BYTE	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+	.BYTE	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+	.BYTE	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+	.BYTE	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+	.BYTE	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+	.BYTE	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+	.BYTE	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+	.BYTE	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+	.BYTE	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+	.BYTE	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+	.BYTE	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+	.BYTE	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+	.BYTE	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+	.BYTE	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+	.BYTE	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+	.BYTE	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+	.BYTE	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+	.BYTE	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+	.BYTE	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+	.BYTE	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+	.BYTE	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+	.BYTE	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+	.BYTE	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+	.BYTE	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+	.BYTE	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+	.BYTE	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+	.BYTE	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+	.BYTE	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+	.BYTE	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+	.BYTE	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+	.BYTE	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+	.STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	# translate made up instructons: _ror, _srm
+	s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/				or
+
+	s/_srm(\s+%r[0-9]+),([0-9]+),/
+		$SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
+		:            sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
+
+	s/,\*/,/ if ($SIZE_T==4);
+	print $_,"\n";
+}
+close STDOUT;

diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl
index f82c5e1..7c52cbe 100644
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl

@@ -7,7 +7,7 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
 #
 # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
 # 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
 
 # February 2010
 #
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
 # performance improvement on the platfrom in question (and marginal
 # improvement even on others). It should be noted that Power6 fails
 # to process byte in 18 cycles, only in 23, because it fails to issue
@@ -33,11 +33,13 @@
 
 if ($flavour =~ /64/) {
 	$SIZE_T	=8;
+	$LRSAVE	=2*$SIZE_T;
 	$STU	="stdu";
 	$POP	="ld";
 	$PUSH	="std";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T	=4;
+	$LRSAVE	=$SIZE_T;
 	$STU	="stwu";
 	$POP	="lwz";
 	$PUSH	="stw";
@@ -116,15 +118,19 @@
 	addi	$Tbl0,$Tbl0,`128-8`
 	mtlr	r0
 	blr
-	.space	`32-24`
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`64-9*4`
 LAES_Td:
 	mflr	r0
 	bcl	20,31,\$+4
 	mflr	$Tbl0	;    vvvvvvvv "distance" between . and 1st data entry
-	addi	$Tbl0,$Tbl0,`128-8-32+2048+256`
+	addi	$Tbl0,$Tbl0,`128-64-8+2048+256`
 	mtlr	r0
 	blr
-	.space	`128-32-24`
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`128-64-9*4`
 ___
 &_data_word(
 	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@
 .globl	.AES_encrypt
 .align	7
 .AES_encrypt:
-	mflr	r0
 	$STU	$sp,-$FRAME($sp)
+	mflr	r0
 
-	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
 	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
 
+	andi.	$t0,$inp,3
+	andi.	$t1,$out,3
+	or.	$t0,$t0,$t1
+	bne	Lenc_unaligned
+
+Lenc_unaligned_ok:
 	lwz	$s0,0($inp)
 	lwz	$s1,4($inp)
 	lwz	$s2,8($inp)
@@ -363,8 +375,80 @@
 	stw	$s1,4($out)
 	stw	$s2,8($out)
 	stw	$s3,12($out)
+	b	Lenc_done
 
-	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
+Lenc_unaligned:
+	subfic	$t0,$inp,4096
+	subfic	$t1,$out,4096
+	andi.	$t0,$t0,4096-16
+	beq	Lenc_xpage
+	andi.	$t1,$t1,4096-16
+	bne	Lenc_unaligned_ok
+
+Lenc_xpage:
+	lbz	$acc00,0($inp)
+	lbz	$acc01,1($inp)
+	lbz	$acc02,2($inp)
+	lbz	$s0,3($inp)
+	lbz	$acc04,4($inp)
+	lbz	$acc05,5($inp)
+	lbz	$acc06,6($inp)
+	lbz	$s1,7($inp)
+	lbz	$acc08,8($inp)
+	lbz	$acc09,9($inp)
+	lbz	$acc10,10($inp)
+	insrwi	$s0,$acc00,8,0
+	lbz	$s2,11($inp)
+	insrwi	$s1,$acc04,8,0
+	lbz	$acc12,12($inp)
+	insrwi	$s0,$acc01,8,8
+	lbz	$acc13,13($inp)
+	insrwi	$s1,$acc05,8,8
+	lbz	$acc14,14($inp)
+	insrwi	$s0,$acc02,8,16
+	lbz	$s3,15($inp)
+	insrwi	$s1,$acc06,8,16
+	insrwi	$s2,$acc08,8,0
+	insrwi	$s3,$acc12,8,0
+	insrwi	$s2,$acc09,8,8
+	insrwi	$s3,$acc13,8,8
+	insrwi	$s2,$acc10,8,16
+	insrwi	$s3,$acc14,8,16
+
+	bl	LAES_Te
+	bl	Lppc_AES_encrypt_compact
+
+	extrwi	$acc00,$s0,8,0
+	extrwi	$acc01,$s0,8,8
+	stb	$acc00,0($out)
+	extrwi	$acc02,$s0,8,16
+	stb	$acc01,1($out)
+	stb	$acc02,2($out)
+	extrwi	$acc04,$s1,8,0
+	stb	$s0,3($out)
+	extrwi	$acc05,$s1,8,8
+	stb	$acc04,4($out)
+	extrwi	$acc06,$s1,8,16
+	stb	$acc05,5($out)
+	stb	$acc06,6($out)
+	extrwi	$acc08,$s2,8,0
+	stb	$s1,7($out)
+	extrwi	$acc09,$s2,8,8
+	stb	$acc08,8($out)
+	extrwi	$acc10,$s2,8,16
+	stb	$acc09,9($out)
+	stb	$acc10,10($out)
+	extrwi	$acc12,$s3,8,0
+	stb	$s2,11($out)
+	extrwi	$acc13,$s3,8,8
+	stb	$acc12,12($out)
+	extrwi	$acc14,$s3,8,16
+	stb	$acc13,13($out)
+	stb	$acc14,14($out)
+	stb	$s3,15($out)
+
+Lenc_done:
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
 	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,18 +472,21 @@
 	mtlr	r0
 	addi	$sp,$sp,$FRAME
 	blr
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
 
 .align	5
 Lppc_AES_encrypt:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,3
+	lwz	$t0,0($key)
 	addi	$Tbl2,$Tbl0,2
+	lwz	$t1,4($key)
 	addi	$Tbl3,$Tbl0,1
+	lwz	$t2,8($key)
 	addi	$acc00,$acc00,-1
+	lwz	$t3,12($key)
 	addi	$key,$key,16
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
@@ -413,44 +500,44 @@
 	rlwinm	$acc02,$s2,`32-24+3`,21,28
 	rlwinm	$acc03,$s3,`32-24+3`,21,28
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc04,$s1,`32-16+3`,21,28
+	lwz	$t1,4($key)
 	rlwinm	$acc05,$s2,`32-16+3`,21,28
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc06,$s3,`32-16+3`,21,28
+	lwz	$t3,12($key)
 	rlwinm	$acc07,$s0,`32-16+3`,21,28
 	lwzx	$acc00,$Tbl0,$acc00
-	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc08,$s2,`32-8+3`,21,28
+	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc09,$s3,`32-8+3`,21,28
 	lwzx	$acc02,$Tbl0,$acc02
-	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc10,$s0,`32-8+3`,21,28
+	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc11,$s1,`32-8+3`,21,28
 	lwzx	$acc04,$Tbl1,$acc04
-	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s3,`0+3`,21,28
+	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s0,`0+3`,21,28
 	lwzx	$acc06,$Tbl1,$acc06
-	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s1,`0+3`,21,28
+	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s2,`0+3`,21,28
 	lwzx	$acc08,$Tbl2,$acc08
-	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t0,$t0,$acc00
+	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t1,$t1,$acc01
 	lwzx	$acc10,$Tbl2,$acc10
-	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t2,$t2,$acc02
+	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t3,$t3,$acc03
 	lwzx	$acc12,$Tbl3,$acc12
-	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t0,$t0,$acc04
+	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t1,$t1,$acc05
 	lwzx	$acc14,$Tbl3,$acc14
-	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t2,$t2,$acc06
+	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t3,$t3,$acc07
 	xor	$t0,$t0,$acc08
 	xor	$t1,$t1,$acc09
@@ -466,60 +553,60 @@
 	addi	$Tbl2,$Tbl0,2048
 	nop
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	lwz	$t1,4($key)
 	rlwinm	$acc01,$s1,`32-24`,24,31
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc02,$s2,`32-24`,24,31
+	lwz	$t3,12($key)
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Te4
-	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc04,$s1,`32-16`,24,31
+	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc05,$s2,`32-16`,24,31
 	lwz	$acc10,`2048+64`($Tbl0)
-	lwz	$acc11,`2048+96`($Tbl0)
 	rlwinm	$acc06,$s3,`32-16`,24,31
+	lwz	$acc11,`2048+96`($Tbl0)
 	rlwinm	$acc07,$s0,`32-16`,24,31
 	lwz	$acc12,`2048+128`($Tbl0)
-	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lwz	$acc14,`2048+192`($Tbl0)
-	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc00,$Tbl2,$acc00
-	lbzx	$acc01,$Tbl2,$acc01
 	rlwinm	$acc12,$s3,`0`,24,31
+	lbzx	$acc01,$Tbl2,$acc01
 	rlwinm	$acc13,$s0,`0`,24,31
 	lbzx	$acc02,$Tbl2,$acc02
-	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc14,$s1,`0`,24,31
+	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc15,$s2,`0`,24,31
 	lbzx	$acc04,$Tbl2,$acc04
-	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc06,$Tbl2,$acc06
-	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc08,$Tbl2,$acc08
-	lbzx	$acc09,$Tbl2,$acc09
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc09,$Tbl2,$acc09
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc10,$Tbl2,$acc10
-	lbzx	$acc11,$Tbl2,$acc11
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc11,$Tbl2,$acc11
 	rlwimi	$s3,$acc07,16,8,15
 	lbzx	$acc12,$Tbl2,$acc12
-	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s0,$acc08,8,16,23
+	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s1,$acc09,8,16,23
 	lbzx	$acc14,$Tbl2,$acc14
-	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s2,$acc10,8,16,23
+	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s3,$acc11,8,16,23
 	or	$s0,$s0,$acc12
 	or	$s1,$s1,$acc13
@@ -530,29 +617,31 @@
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .align	4
 Lppc_AES_encrypt_compact:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,2048
+	lwz	$t0,0($key)
 	lis	$mask80,0x8080
+	lwz	$t1,4($key)
 	lis	$mask1b,0x1b1b
-	addi	$key,$key,16
+	lwz	$t2,8($key)
 	ori	$mask80,$mask80,0x8080
+	lwz	$t3,12($key)
 	ori	$mask1b,$mask1b,0x1b1b
+	addi	$key,$key,16
 	mtctr	$acc00
 .align	4
 Lenc_compact_loop:
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
-	xor	$s2,$s2,$t2
-	xor	$s3,$s3,$t3
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	xor	$s2,$s2,$t2
 	rlwinm	$acc01,$s1,`32-24`,24,31
+	xor	$s3,$s3,$t3
 	rlwinm	$acc02,$s2,`32-24`,24,31
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	rlwinm	$acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@
 	rlwinm	$acc06,$s3,`32-16`,24,31
 	rlwinm	$acc07,$s0,`32-16`,24,31
 	lbzx	$acc00,$Tbl1,$acc00
-	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lbzx	$acc02,$Tbl1,$acc02
-	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc04,$Tbl1,$acc04
-	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s3,`0`,24,31
+	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s0,`0`,24,31
 	lbzx	$acc06,$Tbl1,$acc06
-	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s1,`0`,24,31
+	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s2,`0`,24,31
 	lbzx	$acc08,$Tbl1,$acc08
-	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc10,$Tbl1,$acc10
-	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc12,$Tbl1,$acc12
-	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc14,$Tbl1,$acc14
-	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s3,$acc07,16,8,15
 	rlwimi	$s0,$acc08,8,16,23
 	rlwimi	$s1,$acc09,8,16,23
 	rlwimi	$s2,$acc10,8,16,23
 	rlwimi	$s3,$acc11,8,16,23
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	or	$s0,$s0,$acc12
+	lwz	$t1,4($key)
 	or	$s1,$s1,$acc13
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	or	$s2,$s2,$acc14
+	lwz	$t3,12($key)
 	or	$s3,$s3,$acc15
 
 	addi	$key,$key,16
@@ -612,12 +701,12 @@
 	and	$acc02,$s2,$mask80
 	and	$acc03,$s3,$mask80
 	srwi	$acc04,$acc00,7		# r1>>7
-	srwi	$acc05,$acc01,7
-	srwi	$acc06,$acc02,7
-	srwi	$acc07,$acc03,7
 	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
+	srwi	$acc05,$acc01,7
 	andc	$acc09,$s1,$mask80
+	srwi	$acc06,$acc02,7
 	andc	$acc10,$s2,$mask80
+	srwi	$acc07,$acc03,7
 	andc	$acc11,$s3,$mask80
 	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
 	sub	$acc01,$acc01,$acc05
@@ -633,32 +722,32 @@
 	and	$acc03,$acc03,$mask1b
 	xor	$acc00,$acc00,$acc08	# r2
 	xor	$acc01,$acc01,$acc09
+	 rotlwi	$acc12,$s0,16		# ROTATE(r0,16)
 	xor	$acc02,$acc02,$acc10
+	 rotlwi	$acc13,$s1,16
 	xor	$acc03,$acc03,$acc11
+	 rotlwi	$acc14,$s2,16
 
-	rotlwi	$acc12,$s0,16		# ROTATE(r0,16)
-	rotlwi	$acc13,$s1,16
-	rotlwi	$acc14,$s2,16
-	rotlwi	$acc15,$s3,16
 	xor	$s0,$s0,$acc00		# r0^r2
+	rotlwi	$acc15,$s3,16
 	xor	$s1,$s1,$acc01
-	xor	$s2,$s2,$acc02
-	xor	$s3,$s3,$acc03
 	rotrwi	$s0,$s0,24		# ROTATE(r2^r0,24)
+	xor	$s2,$s2,$acc02
 	rotrwi	$s1,$s1,24
+	xor	$s3,$s3,$acc03
 	rotrwi	$s2,$s2,24
-	rotrwi	$s3,$s3,24
 	xor	$s0,$s0,$acc00		# ROTATE(r2^r0,24)^r2
+	rotrwi	$s3,$s3,24
 	xor	$s1,$s1,$acc01
 	xor	$s2,$s2,$acc02
 	xor	$s3,$s3,$acc03
 	rotlwi	$acc08,$acc12,8		# ROTATE(r0,24)
-	rotlwi	$acc09,$acc13,8
-	rotlwi	$acc10,$acc14,8
-	rotlwi	$acc11,$acc15,8
 	xor	$s0,$s0,$acc12		#
+	rotlwi	$acc09,$acc13,8
 	xor	$s1,$s1,$acc13
+	rotlwi	$acc10,$acc14,8
 	xor	$s2,$s2,$acc14
+	rotlwi	$acc11,$acc15,8
 	xor	$s3,$s3,$acc15
 	xor	$s0,$s0,$acc08		#
 	xor	$s1,$s1,$acc09
@@ -673,14 +762,15 @@
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.AES_decrypt
 .align	7
 .AES_decrypt:
-	mflr	r0
 	$STU	$sp,-$FRAME($sp)
+	mflr	r0
 
-	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
 	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
 
+	andi.	$t0,$inp,3
+	andi.	$t1,$out,3
+	or.	$t0,$t0,$t1
+	bne	Ldec_unaligned
+
+Ldec_unaligned_ok:
 	lwz	$s0,0($inp)
 	lwz	$s1,4($inp)
 	lwz	$s2,8($inp)
@@ -712,8 +809,80 @@
 	stw	$s1,4($out)
 	stw	$s2,8($out)
 	stw	$s3,12($out)
+	b	Ldec_done
 
-	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
+Ldec_unaligned:
+	subfic	$t0,$inp,4096
+	subfic	$t1,$out,4096
+	andi.	$t0,$t0,4096-16
+	beq	Ldec_xpage
+	andi.	$t1,$t1,4096-16
+	bne	Ldec_unaligned_ok
+
+Ldec_xpage:
+	lbz	$acc00,0($inp)
+	lbz	$acc01,1($inp)
+	lbz	$acc02,2($inp)
+	lbz	$s0,3($inp)
+	lbz	$acc04,4($inp)
+	lbz	$acc05,5($inp)
+	lbz	$acc06,6($inp)
+	lbz	$s1,7($inp)
+	lbz	$acc08,8($inp)
+	lbz	$acc09,9($inp)
+	lbz	$acc10,10($inp)
+	insrwi	$s0,$acc00,8,0
+	lbz	$s2,11($inp)
+	insrwi	$s1,$acc04,8,0
+	lbz	$acc12,12($inp)
+	insrwi	$s0,$acc01,8,8
+	lbz	$acc13,13($inp)
+	insrwi	$s1,$acc05,8,8
+	lbz	$acc14,14($inp)
+	insrwi	$s0,$acc02,8,16
+	lbz	$s3,15($inp)
+	insrwi	$s1,$acc06,8,16
+	insrwi	$s2,$acc08,8,0
+	insrwi	$s3,$acc12,8,0
+	insrwi	$s2,$acc09,8,8
+	insrwi	$s3,$acc13,8,8
+	insrwi	$s2,$acc10,8,16
+	insrwi	$s3,$acc14,8,16
+
+	bl	LAES_Td
+	bl	Lppc_AES_decrypt_compact
+
+	extrwi	$acc00,$s0,8,0
+	extrwi	$acc01,$s0,8,8
+	stb	$acc00,0($out)
+	extrwi	$acc02,$s0,8,16
+	stb	$acc01,1($out)
+	stb	$acc02,2($out)
+	extrwi	$acc04,$s1,8,0
+	stb	$s0,3($out)
+	extrwi	$acc05,$s1,8,8
+	stb	$acc04,4($out)
+	extrwi	$acc06,$s1,8,16
+	stb	$acc05,5($out)
+	stb	$acc06,6($out)
+	extrwi	$acc08,$s2,8,0
+	stb	$s1,7($out)
+	extrwi	$acc09,$s2,8,8
+	stb	$acc08,8($out)
+	extrwi	$acc10,$s2,8,16
+	stb	$acc09,9($out)
+	stb	$acc10,10($out)
+	extrwi	$acc12,$s3,8,0
+	stb	$s2,11($out)
+	extrwi	$acc13,$s3,8,8
+	stb	$acc12,12($out)
+	extrwi	$acc14,$s3,8,16
+	stb	$acc13,13($out)
+	stb	$acc14,14($out)
+	stb	$s3,15($out)
+
+Ldec_done:
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
 	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@
 	mtlr	r0
 	addi	$sp,$sp,$FRAME
 	blr
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
 
 .align	5
 Lppc_AES_decrypt:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,3
+	lwz	$t0,0($key)
 	addi	$Tbl2,$Tbl0,2
+	lwz	$t1,4($key)
 	addi	$Tbl3,$Tbl0,1
+	lwz	$t2,8($key)
 	addi	$acc00,$acc00,-1
+	lwz	$t3,12($key)
 	addi	$key,$key,16
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
@@ -762,44 +934,44 @@
 	rlwinm	$acc02,$s2,`32-24+3`,21,28
 	rlwinm	$acc03,$s3,`32-24+3`,21,28
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc04,$s3,`32-16+3`,21,28
+	lwz	$t1,4($key)
 	rlwinm	$acc05,$s0,`32-16+3`,21,28
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc06,$s1,`32-16+3`,21,28
+	lwz	$t3,12($key)
 	rlwinm	$acc07,$s2,`32-16+3`,21,28
 	lwzx	$acc00,$Tbl0,$acc00
-	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc08,$s2,`32-8+3`,21,28
+	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc09,$s3,`32-8+3`,21,28
 	lwzx	$acc02,$Tbl0,$acc02
-	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc10,$s0,`32-8+3`,21,28
+	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc11,$s1,`32-8+3`,21,28
 	lwzx	$acc04,$Tbl1,$acc04
-	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s1,`0+3`,21,28
+	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s2,`0+3`,21,28
 	lwzx	$acc06,$Tbl1,$acc06
-	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s3,`0+3`,21,28
+	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s0,`0+3`,21,28
 	lwzx	$acc08,$Tbl2,$acc08
-	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t0,$t0,$acc00
+	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t1,$t1,$acc01
 	lwzx	$acc10,$Tbl2,$acc10
-	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t2,$t2,$acc02
+	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t3,$t3,$acc03
 	lwzx	$acc12,$Tbl3,$acc12
-	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t0,$t0,$acc04
+	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t1,$t1,$acc05
 	lwzx	$acc14,$Tbl3,$acc14
-	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t2,$t2,$acc06
+	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t3,$t3,$acc07
 	xor	$t0,$t0,$acc08
 	xor	$t1,$t1,$acc09
@@ -815,56 +987,56 @@
 	addi	$Tbl2,$Tbl0,2048
 	nop
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	lwz	$t1,4($key)
 	rlwinm	$acc01,$s1,`32-24`,24,31
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc02,$s2,`32-24`,24,31
+	lwz	$t3,12($key)
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Td4
-	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc04,$s3,`32-16`,24,31
+	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc05,$s0,`32-16`,24,31
 	lwz	$acc10,`2048+64`($Tbl0)
-	lwz	$acc11,`2048+96`($Tbl0)
 	lbzx	$acc00,$Tbl2,$acc00
+	lwz	$acc11,`2048+96`($Tbl0)
 	lbzx	$acc01,$Tbl2,$acc01
 	lwz	$acc12,`2048+128`($Tbl0)
-	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc06,$s1,`32-16`,24,31
+	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc07,$s2,`32-16`,24,31
 	lwz	$acc14,`2048+192`($Tbl0)
-	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lbzx	$acc02,$Tbl2,$acc02
-	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc04,$Tbl2,$acc04
-	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$acc12,$s1,`0`,24,31
+	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$acc13,$s2,`0`,24,31
 	lbzx	$acc06,$Tbl2,$acc06
-	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$acc14,$s3,`0`,24,31
+	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$acc15,$s0,`0`,24,31
 	lbzx	$acc08,$Tbl2,$acc08
-	lbzx	$acc09,$Tbl2,$acc09
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc09,$Tbl2,$acc09
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc10,$Tbl2,$acc10
-	lbzx	$acc11,$Tbl2,$acc11
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc11,$Tbl2,$acc11
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc12,$Tbl2,$acc12
-	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc14,$Tbl2,$acc14
-	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s3,$acc07,16,8,15
 	rlwimi	$s0,$acc08,8,16,23
 	rlwimi	$s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .align	4
 Lppc_AES_decrypt_compact:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,2048
+	lwz	$t0,0($key)
 	lis	$mask80,0x8080
+	lwz	$t1,4($key)
 	lis	$mask1b,0x1b1b
-	addi	$key,$key,16
+	lwz	$t2,8($key)
 	ori	$mask80,$mask80,0x8080
+	lwz	$t3,12($key)
 	ori	$mask1b,$mask1b,0x1b1b
+	addi	$key,$key,16
 ___
 $code.=<<___ if ($SIZE_T==8);
 	insrdi	$mask80,$mask80,32,0
@@ -904,10 +1078,10 @@
 Ldec_compact_loop:
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
-	xor	$s2,$s2,$t2
-	xor	$s3,$s3,$t3
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	xor	$s2,$s2,$t2
 	rlwinm	$acc01,$s1,`32-24`,24,31
+	xor	$s3,$s3,$t3
 	rlwinm	$acc02,$s2,`32-24`,24,31
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	rlwinm	$acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@
 	rlwinm	$acc06,$s1,`32-16`,24,31
 	rlwinm	$acc07,$s2,`32-16`,24,31
 	lbzx	$acc00,$Tbl1,$acc00
-	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lbzx	$acc02,$Tbl1,$acc02
-	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc04,$Tbl1,$acc04
-	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s1,`0`,24,31
+	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s2,`0`,24,31
 	lbzx	$acc06,$Tbl1,$acc06
-	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s3,`0`,24,31
+	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s0,`0`,24,31
 	lbzx	$acc08,$Tbl1,$acc08
-	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc10,$Tbl1,$acc10
-	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc12,$Tbl1,$acc12
-	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc14,$Tbl1,$acc14
-	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s3,$acc07,16,8,15
 	rlwimi	$s0,$acc08,8,16,23
 	rlwimi	$s1,$acc09,8,16,23
 	rlwimi	$s2,$acc10,8,16,23
 	rlwimi	$s3,$acc11,8,16,23
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	or	$s0,$s0,$acc12
+	lwz	$t1,4($key)
 	or	$s1,$s1,$acc13
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	or	$s2,$s2,$acc14
+	lwz	$t3,12($key)
 	or	$s3,$s3,$acc15
 
 	addi	$key,$key,16
@@ -1030,12 +1204,12 @@
 	and	$acc02,$s2,$mask80
 	and	$acc03,$s3,$mask80
 	srwi	$acc04,$acc00,7		# r1>>7
-	srwi	$acc05,$acc01,7
-	srwi	$acc06,$acc02,7
-	srwi	$acc07,$acc03,7
 	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
+	srwi	$acc05,$acc01,7
 	andc	$acc09,$s1,$mask80
+	srwi	$acc06,$acc02,7
 	andc	$acc10,$s2,$mask80
+	srwi	$acc07,$acc03,7
 	andc	$acc11,$s3,$mask80
 	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
 	sub	$acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@
 	and	$acc06,$acc02,$mask80
 	and	$acc07,$acc03,$mask80
 	srwi	$acc08,$acc04,7		# r1>>7
-	srwi	$acc09,$acc05,7
-	srwi	$acc10,$acc06,7
-	srwi	$acc11,$acc07,7
 	andc	$acc12,$acc00,$mask80	# r2&0x7f7f7f7f
+	srwi	$acc09,$acc05,7
 	andc	$acc13,$acc01,$mask80
+	srwi	$acc10,$acc06,7
 	andc	$acc14,$acc02,$mask80
+	srwi	$acc11,$acc07,7
 	andc	$acc15,$acc03,$mask80
 	sub	$acc04,$acc04,$acc08	# r1-(r1>>7)
 	sub	$acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@
 
 	and	$acc08,$acc04,$mask80	# r1=r4&0x80808080
 	and	$acc09,$acc05,$mask80
-	and	$acc10,$acc06,$mask80
-	and	$acc11,$acc07,$mask80
 	srwi	$acc12,$acc08,7		# r1>>7
+	and	$acc10,$acc06,$mask80
 	srwi	$acc13,$acc09,7
+	and	$acc11,$acc07,$mask80
 	srwi	$acc14,$acc10,7
-	srwi	$acc15,$acc11,7
 	sub	$acc08,$acc08,$acc12	# r1-(r1>>7)
+	srwi	$acc15,$acc11,7
 	sub	$acc09,$acc09,$acc13
 	sub	$acc10,$acc10,$acc14
 	sub	$acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@
 $code.=<<___;
 	rotrwi	$s0,$s0,8		# = ROTATE(r0,8)
 	rotrwi	$s1,$s1,8
-	rotrwi	$s2,$s2,8
-	rotrwi	$s3,$s3,8
 	xor	$s0,$s0,$acc00		# ^= r2^r0
+	rotrwi	$s2,$s2,8
 	xor	$s1,$s1,$acc01
+	rotrwi	$s3,$s3,8
 	xor	$s2,$s2,$acc02
 	xor	$s3,$s3,$acc03
 	xor	$acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@
 	xor	$acc02,$acc02,$acc10
 	xor	$acc03,$acc03,$acc11
 	xor	$s0,$s0,$acc04		# ^= r4^r0
-	xor	$s1,$s1,$acc05
-	xor	$s2,$s2,$acc06
-	xor	$s3,$s3,$acc07
 	rotrwi	$acc00,$acc00,24
+	xor	$s1,$s1,$acc05
 	rotrwi	$acc01,$acc01,24
+	xor	$s2,$s2,$acc06
 	rotrwi	$acc02,$acc02,24
+	xor	$s3,$s3,$acc07
 	rotrwi	$acc03,$acc03,24
 	xor	$acc04,$acc04,$acc08
 	xor	$acc05,$acc05,$acc09
 	xor	$acc06,$acc06,$acc10
 	xor	$acc07,$acc07,$acc11
 	xor	$s0,$s0,$acc08		# ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
-	xor	$s1,$s1,$acc09
-	xor	$s2,$s2,$acc10
-	xor	$s3,$s3,$acc11
 	rotrwi	$acc04,$acc04,16
+	xor	$s1,$s1,$acc09
 	rotrwi	$acc05,$acc05,16
+	xor	$s2,$s2,$acc10
 	rotrwi	$acc06,$acc06,16
+	xor	$s3,$s3,$acc11
 	rotrwi	$acc07,$acc07,16
 	xor	$s0,$s0,$acc00		# ^= ROTATE(r8^r2^r0,24)
-	xor	$s1,$s1,$acc01
-	xor	$s2,$s2,$acc02
-	xor	$s3,$s3,$acc03
 	rotrwi	$acc08,$acc08,8
+	xor	$s1,$s1,$acc01
 	rotrwi	$acc09,$acc09,8
+	xor	$s2,$s2,$acc02
 	rotrwi	$acc10,$acc10,8
+	xor	$s3,$s3,$acc03
 	rotrwi	$acc11,$acc11,8
 	xor	$s0,$s0,$acc04		# ^= ROTATE(r8^r4^r0,16)
 	xor	$s1,$s1,$acc05
@@ -1179,7 +1353,9 @@
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
-.long	0
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+
 .asciz	"AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 .align	7
 ___

diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl
index 7e01889..f749a52 100644
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl

@@ -44,12 +44,57 @@
 # Unlike previous version hardware support detection takes place only
 # at the moment of key schedule setup, which is denoted in key->rounds.
 # This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
 #
 # Add AES_cbc_encrypt, which gives incredible performance improvement,
 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
 # because software implementation was optimized.
 
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that exact throughput value depends on current stack
+# frame alignment within 4KB page. In worst case you get ~75% of the
+# maximum, but *on average* it would be as much as ~98%. Meaning that
+# worst case is unlike, it's like hitting ravine on plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement at 8KB block size over
+# vanilla km-based code, 37% - at most like 512-bytes block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
 $softonly=0;	# allow hardware support
 
 $t0="%r0";	$mask="%r0";
@@ -69,6 +114,8 @@
 $ra="%r14";
 $sp="%r15";
 
+$stdframe=16*$SIZE_T+4*8;
+
 sub _data_word()
 { my $i;
     while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@
 .Lesoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)
 
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@@ -220,20 +267,20 @@
 	larl	$tbl,AES_Te
 	bras	$ra,_s390x_AES_encrypt
 
-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)
 
-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_encrypt,.-AES_encrypt
 
 .type   _s390x_AES_encrypt,\@function
 .align	16
 _s390x_AES_encrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,15*$SIZE_T($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@@ -397,7 +444,7 @@
 	or	$s2,$i3
 	or	$s3,$t3
 
-	lg	$ra,152($sp)
+	l${g}	$ra,15*$SIZE_T($sp)
 	xr	$s0,$t0
 	xr	$s1,$t2
 	x	$s2,24($key)
@@ -536,7 +583,7 @@
 .Ldsoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)
 
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@@ -546,20 +593,20 @@
 	larl	$tbl,AES_Td
 	bras	$ra,_s390x_AES_decrypt
 
-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)
 
-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_decrypt,.-AES_decrypt
 
 .type   _s390x_AES_decrypt,\@function
 .align	16
 _s390x_AES_decrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,15*$SIZE_T($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@@ -703,7 +750,7 @@
 	nr	$i1,$mask
 	nr	$i2,$mask
 
-	lg	$ra,152($sp)
+	l${g}	$ra,15*$SIZE_T($sp)
 	or	$s1,$t1
 	l	$t0,16($key)
 	l	$t1,20($key)
@@ -732,14 +779,14 @@
 $code.=<<___;
 # void AES_set_encrypt_key(const unsigned char *in, int bits,
 # 		 AES_KEY *key) {
-.globl	AES_set_encrypt_key
-.type	AES_set_encrypt_key,\@function
+.globl	private_AES_set_encrypt_key
+.type	private_AES_set_encrypt_key,\@function
 .align	16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
 	lghi	$t0,0
-	clgr	$inp,$t0
+	cl${g}r	$inp,$t0
 	je	.Lminus1
-	clgr	$key,$t0
+	cl${g}r	$key,$t0
 	je	.Lminus1
 
 	lghi	$t0,128
@@ -797,7 +844,7 @@
 $code.=<<___;
 .align	16
 .Lekey_internal:
-	stmg	%r6,%r13,48($sp)	# all non-volatile regs
+	stm${g}	%r6,%r13,6*$SIZE_T($sp)	# all non-volatile regs
 
 	larl	$tbl,AES_Te+2048
 
@@ -858,7 +905,7 @@
 	la	$t3,4($t3)		# i++
 	brct	$rounds,.L128_loop
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)
 	br	$ra
 
 .align	16
@@ -906,7 +953,7 @@
 	st	$s3,36($key)
 	brct	$rounds,.L192_continue
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)
 	br	$ra
 
 .align	16
@@ -968,7 +1015,7 @@
 	st	$s3,44($key)
 	brct	$rounds,.L256_continue
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)
 	br	$ra
 
 .align	16
@@ -1011,19 +1058,19 @@
 .Lminus1:
 	lghi	%r2,-1
 	br	$ra
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 
 # void AES_set_decrypt_key(const unsigned char *in, int bits,
 # 		 AES_KEY *key) {
-.globl	AES_set_decrypt_key
-.type	AES_set_decrypt_key,\@function
+.globl	private_AES_set_decrypt_key
+.type	private_AES_set_decrypt_key,\@function
 .align	16
-AES_set_decrypt_key:
-	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
-	stg	$ra,112($sp)		# save non-volatile registers!
+private_AES_set_decrypt_key:
+	st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
+	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers!
 	bras	$ra,AES_set_encrypt_key
-	lg	$key,32($sp)
-	lg	$ra,112($sp)
+	l${g}	$key,4*$SIZE_T($sp)
+	l${g}	$ra,14*$SIZE_T($sp)
 	ltgr	%r2,%r2
 	bnzr	$ra
 ___
@@ -1038,11 +1085,11 @@
 
 .align	16
 .Ldkey_internal:
-	stg	$key,32($sp)
-	stg	$ra,40($sp)
+	st${g}	$key,4*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
 	bras	$ra,.Lekey_internal
-	lg	$key,32($sp)
-	lg	$ra,40($sp)
+	l${g}	$key,4*$SIZE_T($sp)
+	l${g}	$ra,14*$SIZE_T($sp)
 ___
 $code.=<<___;
 
@@ -1123,13 +1170,14 @@
 	la	$key,4($key)
 	brct	$rounds,.Lmix
 
-	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
 	lghi	%r2,0
 	br	$ra
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 ___
 
-#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+########################################################################
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
 #                     size_t length, const AES_KEY *key,
 #                     unsigned char *ivec, const int enc)
 {
@@ -1163,7 +1211,7 @@
 	l	%r0,240($key)	# load kmc code
 	lghi	$key,15		# res=len%16, len-=res;
 	ngr	$key,$len
-	slgr	$len,$key
+	sl${g}r	$len,$key
 	la	%r1,16($sp)	# parameter block - ivec || key
 	jz	.Lkmc_truncated
 	.long	0xb92f0042	# kmc %r4,%r2
@@ -1181,34 +1229,34 @@
 	tmll	%r0,0x80
 	jnz	.Lkmc_truncated_dec
 	lghi	%r1,0
-	stg	%r1,128($sp)
-	stg	%r1,136($sp)
+	stg	%r1,16*$SIZE_T($sp)
+	stg	%r1,16*$SIZE_T+8($sp)
 	bras	%r1,1f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 1:	ex	$key,0(%r1)
 	la	%r1,16($sp)	# restore parameter block
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
 	j	.Lkmc_done
 .align	16
 .Lkmc_truncated_dec:
-	stg	$out,64($sp)
-	la	$out,128($sp)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$out,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
-	lg	$out,64($sp)
+	l${g}	$out,4*$SIZE_T($sp)
 	bras	%r1,2f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 2:	ex	$key,0(%r1)
 	j	.Lkmc_done
 .align	16
 .Lcbc_software:
 ___
 $code.=<<___;
-	stmg	$key,$ra,40($sp)
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
 	lhi	%r0,0
-	cl	%r0,164($sp)
+	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
 	je	.Lcbc_decrypt
 
 	larl	$tbl,AES_Te
@@ -1219,10 +1267,10 @@
 	llgf	$s3,12($ivp)
 
 	lghi	$t0,16
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 .Lcbc_enc_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	x	$s0,0($inp)
 	x	$s1,4($inp)
 	x	$s2,8($inp)
@@ -1231,7 +1279,7 @@
 
 	bras	$ra,_s390x_AES_encrypt
 
-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
@@ -1240,33 +1288,33 @@
 	la	$inp,16($inp)
 	la	$out,16($out)
 	lghi	$t0,16
-	ltgr	$len,$len
+	lt${g}r	$len,$len
 	jz	.Lcbc_enc_done
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 	j	.Lcbc_enc_loop
 .align	16
 .Lcbc_enc_done:
-	lg	$ivp,48($sp)
+	l${g}	$ivp,6*$SIZE_T($sp)
 	st	$s0,0($ivp)
 	st	$s1,4($ivp)	
 	st	$s2,8($ivp)
 	st	$s3,12($ivp)
 
-	lmg	%r7,$ra,56($sp)
+	lm${g}	%r7,$ra,7*$SIZE_T($sp)
 	br	$ra
 
 .align	16
 .Lcbc_enc_tail:
 	aghi	$len,15
 	lghi	$t0,0
-	stg	$t0,128($sp)
-	stg	$t0,136($sp)
+	stg	$t0,16*$SIZE_T($sp)
+	stg	$t0,16*$SIZE_T+8($sp)
 	bras	$t1,3f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 3:	ex	$len,0($t1)
 	lghi	$len,0
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	j	.Lcbc_enc_loop
 
 .align	16
@@ -1275,10 +1323,10 @@
 
 	lg	$t0,0($ivp)
 	lg	$t1,8($ivp)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)
 
 .Lcbc_dec_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
 	llgf	$s2,8($inp)
@@ -1287,7 +1335,7 @@
 
 	bras	$ra,_s390x_AES_decrypt
 
-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	sllg	$s0,$s0,32
 	sllg	$s2,$s2,32
 	lr	$s0,$s1
@@ -1295,15 +1343,15 @@
 
 	lg	$t0,0($inp)
 	lg	$t1,8($inp)
-	xg	$s0,128($sp)
-	xg	$s2,136($sp)
+	xg	$s0,16*$SIZE_T($sp)
+	xg	$s2,16*$SIZE_T+8($sp)
 	lghi	$s1,16
-	slgr	$len,$s1
+	sl${g}r	$len,$s1
 	brc	4,.Lcbc_dec_tail	# if borrow
 	brc	2,.Lcbc_dec_done	# if zero
 	stg	$s0,0($out)
 	stg	$s2,8($out)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)
 
 	la	$inp,16($inp)
 	la	$out,16($out)
@@ -1313,7 +1361,7 @@
 	stg	$s0,0($out)
 	stg	$s2,8($out)
 .Lcbc_dec_exit:
-	lmg	$ivp,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	stmg	$t0,$t1,0($ivp)
 
 	br	$ra
@@ -1321,19 +1369,889 @@
 .align	16
 .Lcbc_dec_tail:
 	aghi	$len,15
-	stg	$s0,128($sp)
-	stg	$s2,136($sp)
+	stg	$s0,16*$SIZE_T($sp)
+	stg	$s2,16*$SIZE_T+8($sp)
 	bras	$s1,4f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 4:	ex	$len,0($s1)
 	j	.Lcbc_dec_exit
 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
-.comm  OPENSSL_s390xcap_P,8,8
+___
+}
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+#                     size_t blocks, const AES_KEY *key,
+#                     const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4";	# blocks and out are swapped
+my $len="%r3";
+my $key="%r5";	my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl	AES_ctr32_encrypt
+.type	AES_ctr32_encrypt,\@function
+.align	16
+AES_ctr32_encrypt:
+	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+	l	%r0,240($key)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lctr32_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+
+	slgr	$out,$inp
+	la	%r1,0($key)	# %r1 is permanent copy of $key
+	lg	$iv0,0($ivp)	# load ivec
+	lg	$ivp,8($ivp)
+
+	# prepare and allocate stack frame at the top of 4K page
+	# with 1K reserved for eventual signal handling
+	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
+	lghi	$s1,-4096
+	algr	$s0,$sp
+	lgr	$fp,$sp
+	ngr	$s0,$s1		# align at page boundary
+	slgr	$fp,$s0		# total buffer size
+	lgr	$s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
+	slgr	$fp,$s1		# deduct reservation to get usable buffer size
+	# buffer size is at lest 256 and at most 3072+256-16
+
+	la	$sp,1024($s0)	# alloca
+	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
+	st${g}	$s2,0($sp)	# back-chain
+	st${g}	$fp,$SIZE_T($sp)
+
+	slgr	$len,$fp
+	brc	1,.Lctr32_hw_switch	# not zero, no borrow
+	algr	$fp,$len	# input is shorter than allocated buffer
+	lghi	$len,0
+	st${g}	$fp,$SIZE_T($sp)
+
+.Lctr32_hw_switch:
+___
+$code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
+	larl	$s0,OPENSSL_s390xcap_P
+	lg	$s0,8($s0)
+	tmhh	$s0,0x0004	# check for message_security-assist-4
+	jz	.Lctr32_km_loop
+
+	llgfr	$s0,%r0
+	lgr	$s1,%r1
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb92d2042	# kmctr %r4,%r2,%r2
+
+	llihh	%r0,0x8000	# check if kmctr supports the function code
+	srlg	%r0,%r0,0($s0)
+	ng	%r0,16($sp)
+	lgr	%r0,$s0
+	lgr	%r1,$s1
+	jz	.Lctr32_km_loop
+
+####### kmctr code
+	algr	$out,$inp	# restore $out
+	lgr	$s1,$len	# $s1 undertakes $len
+	j	.Lctr32_kmctr_loop
+.align	16
+.Lctr32_kmctr_loop:
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+.Lctr32_kmctr_prepare:
+	stg	$iv0,0($s2)
+	stg	$ivp,8($s2)
+	la	$s2,16($s2)
+	ahi	$ivp,1		# 32-bit increment, preserves upper half
+	brct	$s3,.Lctr32_kmctr_prepare
+
+	#la	$inp,0($inp)	# inp
+	sllg	$len,$fp,4	# len
+	#la	$out,0($out)	# out
+	la	$s2,16($sp)	# iv
+	.long	0xb92da042	# kmctr $out,$s2,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+
+	slgr	$s1,$fp
+	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
+	algr	$fp,$s1
+	lghi	$s1,0
+	brc	4+1,.Lctr32_kmctr_loop	# not zero
+
+	l${g}	$sp,0($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+___
+$code.=<<___;
+.Lctr32_km_loop:
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+.Lctr32_km_prepare:
+	stg	$iv0,0($s2)
+	stg	$ivp,8($s2)
+	la	$s2,16($s2)
+	ahi	$ivp,1		# 32-bit increment, preserves upper half
+	brct	$s3,.Lctr32_km_prepare
+
+	la	$s0,16($sp)	# inp
+	sllg	$s1,$fp,4	# len
+	la	$s2,16($sp)	# out
+	.long	0xb92e00a8	# km %r10,%r8
+	brc	1,.-4		# pay attention to "partial completion"
+
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+	slgr	$s2,$inp
+.Lctr32_km_xor:
+	lg	$s0,0($inp)
+	lg	$s1,8($inp)
+	xg	$s0,0($s2,$inp)
+	xg	$s1,8($s2,$inp)
+	stg	$s0,0($out,$inp)
+	stg	$s1,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lctr32_km_xor
+
+	slgr	$len,$fp
+	brc	1,.Lctr32_km_loop	# not zero, no borrow
+	algr	$fp,$len
+	lghi	$len,0
+	brc	4+1,.Lctr32_km_loop	# not zero
+
+	l${g}	$s0,0($sp)
+	l${g}	$s1,$SIZE_T($sp)
+	la	$s2,16($sp)
+.Lctr32_km_zap:
+	stg	$s0,0($s2)
+	stg	$s0,8($s2)
+	la	$s2,16($s2)
+	brct	$s1,.Lctr32_km_zap
+
+	la	$sp,0($s0)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lctr32_software:
+___
+$code.=<<___;
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
+	sl${g}r	$inp,$out
+	larl	$tbl,AES_Te
+	llgf	$t1,12($ivp)
+
+.Lctr32_loop:
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
+	llgf	$s0,0($ivp)
+	llgf	$s1,4($ivp)
+	llgf	$s2,8($ivp)
+	lgr	$s3,$t1
+	st	$t1,16*$SIZE_T($sp)
+	lgr	%r4,$key
+
+	bras	$ra,_s390x_AES_encrypt
+
+	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
+	llgf	$t1,16*$SIZE_T($sp)
+	x	$s0,0($inp,$out)
+	x	$s1,4($inp,$out)
+	x	$s2,8($inp,$out)
+	x	$s3,12($inp,$out)
+	stm	$s0,$s3,0($out)
+
+	la	$out,16($out)
+	ahi	$t1,1		# 32-bit increment
+	brct	$len,.Lctr32_loop
+
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+#	const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4";	# len and out are swapped
+my $len="%r3";
+my $key1="%r5";	# $i1
+my $key2="%r6";	# $i2
+my $fp="%r7";	# $i3
+my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type	_s390x_xts_km,\@function
+.align	16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+	llgfr	$s0,%r0			# put aside the function code
+	lghi	$s1,0x7f
+	nr	$s1,%r0
+	lghi	%r0,0			# query capability vector
+	la	%r1,2*$SIZE_T($sp)
+	.long	0xb92e0042		# km %r4,%r2
+	llihh	%r1,0x8000
+	srlg	%r1,%r1,32($s1)		# check for 32+function code
+	ng	%r1,2*$SIZE_T($sp)
+	lgr	%r0,$s0			# restore the function code
+	la	%r1,0($key1)		# restore $key1
+	jz	.Lxts_km_vanilla
+
+	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
+	algr	$out,$inp
+
+	oill	%r0,32			# switch to xts function code
+	aghi	$s1,-18			#
+	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
+	la	%r1,$tweak-16($sp)
+	slgr	%r1,$s1			# parameter block position
+	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
+	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
+					# yes, it contains junk and overlaps
+					# with the tweak in 128-bit case.
+					# it's done to avoid conditional
+					# branch.
+	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value
+
+	.long	0xb92e0042		# km %r4,%r2
+	brc	1,.-4			# pay attention to "partial completion"
+
+	lrvg	$s0,$tweak+0($sp)	# load the last tweak
+	lrvg	$s1,$tweak+8($sp)
+	stmg	%r0,%r3,$tweak-32(%r1)	# wipe copy of the key
+
+	nill	%r0,0xffdf		# switch back to original function code
+	la	%r1,0($key1)		# restore pointer to $key1
+	slgr	$out,$inp
+
+	llgc	$len,2*$SIZE_T-1($sp)
+	nill	$len,0x0f		# $len%=16
+	br	$ra
+	
+.align	16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+	# prepare and allocate stack frame at the top of 4K page
+	# with 1K reserved for eventual signal handling
+	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
+	lghi	$s1,-4096
+	algr	$s0,$sp
+	lgr	$fp,$sp
+	ngr	$s0,$s1		# align at page boundary
+	slgr	$fp,$s0		# total buffer size
+	lgr	$s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
+	slgr	$fp,$s1		# deduct reservation to get usable buffer size
+	# buffer size is at lest 256 and at most 3072+256-16
+
+	la	$sp,1024($s0)	# alloca
+	nill	$fp,0xfff0	# round to 16*n
+	st${g}	$s2,0($sp)	# back-chain
+	nill	$len,0xfff0	# redundant
+	st${g}	$fp,$SIZE_T($sp)
+
+	slgr	$len,$fp
+	brc	1,.Lxts_km_go	# not zero, no borrow
+	algr	$fp,$len	# input is shorter than allocated buffer
+	lghi	$len,0
+	st${g}	$fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
+	lrvg	$s1,$tweak+8($s2)
+
+	la	$s2,16($sp)		# vector of ascending tweak values
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+	j	.Lxts_km_start
+
+.Lxts_km_loop:
+	la	$s2,16($sp)
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+.Lxts_km_prepare:
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+.Lxts_km_start:
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	stg	$i1,0($s2,$inp)
+	stg	$i2,8($s2,$inp)
+	xg	$i1,0($inp)
+	xg	$i2,8($inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lxts_km_prepare
+
+	slgr	$inp,$fp		# rewind $inp
+	la	$s2,0($out,$inp)
+	lgr	$s3,$fp
+	.long	0xb92e00aa		# km $s2,$s2
+	brc	1,.-4			# pay attention to "partial completion"
+
+	la	$s2,16($sp)
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+.Lxts_km_xor:
+	lg	$i1,0($out,$inp)
+	lg	$i2,8($out,$inp)
+	xg	$i1,0($s2,$inp)
+	xg	$i2,8($s2,$inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lxts_km_xor
+
+	slgr	$len,$fp
+	brc	1,.Lxts_km_loop		# not zero, no borrow
+	algr	$fp,$len
+	lghi	$len,0
+	brc	4+1,.Lxts_km_loop	# not zero
+
+	l${g}	$i1,0($sp)		# back-chain
+	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
+	la	$i2,16($sp)
+	srlg	$fp,$fp,4
+.Lxts_km_zap:
+	stg	$i1,0($i2)
+	stg	$i1,8($i2)
+	la	$i2,16($i2)
+	brct	$fp,.Lxts_km_zap
+
+	la	$sp,0($i1)
+	llgc	$len,2*$SIZE_T-1($i1)
+	nill	$len,0x0f		# $len%=16
+	bzr	$ra
+
+	# generate one more tweak...
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+
+	ltr	$len,$len		# clear zero flag
+	br	$ra
+.size	_s390x_xts_km,.-_s390x_xts_km
+
+.globl	AES_xts_encrypt
+.type	AES_xts_encrypt,\@function
+.align	16
+AES_xts_encrypt:
+	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
+	srag	$len,$len,4		# formally wrong, because it expands
+					# sign byte, but who can afford asking
+					# to process more than 2^63-1 bytes?
+					# I use it, because it sets condition
+					# code...
+	bcr	8,$ra			# abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+	llgf	%r0,240($key2)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lxts_enc_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
+
+	sllg	$len,$len,4		# $len&=~15
+	slgr	$out,$inp
+
+	# generate the tweak value
+	l${g}	$s3,$stdframe($sp)	# pointer to iv
+	la	$s2,$tweak($sp)
+	lmg	$s0,$s1,0($s3)
+	lghi	$s3,16
+	stmg	$s0,$s1,0($s2)
+	la	%r1,0($key2)		# $key2 is not needed anymore
+	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
+	brc	1,.-4			# can this happen?
+
+	l	%r0,240($key1)
+	la	%r1,0($key1)		# $key1 is not needed anymore
+	bras	$ra,_s390x_xts_km
+	jz	.Lxts_enc_km_done
+
+	aghi	$inp,-16		# take one step back
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_enc_km_steal:
+	llgc	$i1,16($inp)
+	llgc	$i2,0($out,$inp)
+	stc	$i1,0($out,$inp)
+	stc	$i2,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_enc_km_steal
+
+	la	$s2,0($i3)
+	lghi	$s3,16
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	xg	$i1,0($s2)
+	xg	$i2,8($s2)
+	stg	$i1,0($s2)
+	stg	$i2,8($s2)
+	.long	0xb92e00aa		# km $s2,$s2
+	brc	1,.-4			# can this happen?
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	xg	$i1,0($i3)
+	xg	$i2,8($i3)
+	stg	$i1,0($i3)
+	stg	$i2,8($i3)
+
+.Lxts_enc_km_done:
+	l${g}	$ra,14*$SIZE_T($sp)
+	st${g}	$sp,$tweak($sp)		# wipe tweak
+	st${g}	$sp,$tweak($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lxts_enc_software:
+___
+$code.=<<___;
+	stm${g}	%r6,$ra,6*$SIZE_T($sp)
+
+	slgr	$out,$inp
+
+	xgr	$s0,$s0			# clear upper half
+	xgr	$s1,$s1
+	lrv	$s0,$stdframe+4($sp)	# load secno
+	lrv	$s1,$stdframe+0($sp)
+	xgr	$s2,$s2
+	xgr	$s3,$s3
+	stm${g}	%r2,%r5,2*$SIZE_T($sp)
+	la	$key,0($key2)
+	larl	$tbl,AES_Te
+	bras	$ra,_s390x_AES_encrypt	# generate the tweak
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	stm	$s0,$s3,$tweak($sp)	# save the tweak
+	j	.Lxts_enc_enter
+
+.align	16
+.Lxts_enc_loop:
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+	la	$inp,16($inp)		# $inp+=16
+.Lxts_enc_enter:
+	x	$s0,0($inp)		# ^=*($inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_encrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+	brct${g}	$len,.Lxts_enc_loop
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	jz	.Lxts_enc_done
+
+	la	$i3,0($inp,$out)	# put aside real $out
+.Lxts_enc_steal:
+	llgc	%r0,16($inp)
+	llgc	%r1,0($out,$inp)
+	stc	%r0,0($out,$inp)
+	stc	%r1,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_enc_steal
+	la	$out,0($i3)		# restore real $out
+
+	# generate last tweak...
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+
+	x	$s0,0($out)		# ^=*(inp)|stolen cipther-text
+	x	$s1,4($out)
+	x	$s2,8($out)
+	x	$s3,12($out)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_encrypt
+	l${g}	$out,4*$SIZE_T($sp)
+	x	$s0,`$tweak+0`($sp)	# ^=tweak
+	x	$s1,`$tweak+4`($sp)
+	x	$s2,`$tweak+8`($sp)
+	x	$s3,`$tweak+12`($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+
+.Lxts_enc_done:
+	stg	$sp,$tweak+0($sp)	# wipe tweak
+	stg	$sp,$twesk+8($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,u64 secno);
+#
+$code.=<<___;
+.globl	AES_xts_decrypt
+.type	AES_xts_decrypt,\@function
+.align	16
+AES_xts_decrypt:
+	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
+	aghi	$len,-16
+	bcr	4,$ra			# abort if less than zero. formally
+					# wrong, because $len is unsigned,
+					# but who can afford asking to
+					# process more than 2^63-1 bytes?
+	tmll	$len,0x0f
+	jnz	.Lxts_dec_proceed
+	aghi	$len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+	llgf	%r0,240($key2)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lxts_dec_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
+
+	nill	$len,0xfff0		# $len&=~15
+	slgr	$out,$inp
+
+	# generate the tweak value
+	l${g}	$s3,$stdframe($sp)	# pointer to iv
+	la	$s2,$tweak($sp)
+	lmg	$s0,$s1,0($s3)
+	lghi	$s3,16
+	stmg	$s0,$s1,0($s2)
+	la	%r1,0($key2)		# $key2 is not needed past this point
+	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
+	brc	1,.-4			# can this happen?
+
+	l	%r0,240($key1)
+	la	%r1,0($key1)		# $key1 is not needed anymore
+
+	ltgr	$len,$len
+	jz	.Lxts_dec_km_short
+	bras	$ra,_s390x_xts_km
+	jz	.Lxts_dec_km_done
+
+	lrvgr	$s2,$s0			# make copy in reverse byte order
+	lrvgr	$s3,$s1
+	j	.Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%=16
+	lrvg	$s0,$tweak+0($sp)	# load the tweak
+	lrvg	$s1,$tweak+8($sp)
+	lrvgr	$s2,$s0			# make copy in reverse byte order
+	lrvgr	$s3,$s1
+
+.Lxts_dec_km_2ndtweak:
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+
+	xg	$i1,0($inp)
+	xg	$i2,8($inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$i2,0($out,$inp)
+	lghi	$i3,16
+	.long	0xb92e0066		# km $i2,$i2
+	brc	1,.-4			# can this happen?
+	lrvgr	$i1,$s0
+	lrvgr	$i2,$s1
+	xg	$i1,0($out,$inp)
+	xg	$i2,8($out,$inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_dec_km_steal:
+	llgc	$i1,16($inp)
+	llgc	$i2,0($out,$inp)
+	stc	$i1,0($out,$inp)
+	stc	$i2,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_dec_km_steal
+
+	lgr	$s0,$s2
+	lgr	$s1,$s3
+	xg	$s0,0($i3)
+	xg	$s1,8($i3)
+	stg	$s0,0($i3)
+	stg	$s1,8($i3)
+	la	$s0,0($i3)
+	lghi	$s1,16
+	.long	0xb92e0088		# km $s0,$s0
+	brc	1,.-4			# can this happen?
+	xg	$s2,0($i3)
+	xg	$s3,8($i3)
+	stg	$s2,0($i3)
+	stg	$s3,8($i3)
+.Lxts_dec_km_done:
+	l${g}	$ra,14*$SIZE_T($sp)
+	st${g}	$sp,$tweak($sp)		# wipe tweak
+	st${g}	$sp,$tweak($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lxts_dec_software:
+___
+$code.=<<___;
+	stm${g}	%r6,$ra,6*$SIZE_T($sp)
+
+	srlg	$len,$len,4
+	slgr	$out,$inp
+
+	xgr	$s0,$s0			# clear upper half
+	xgr	$s1,$s1
+	lrv	$s0,$stdframe+4($sp)	# load secno
+	lrv	$s1,$stdframe+0($sp)
+	xgr	$s2,$s2
+	xgr	$s3,$s3
+	stm${g}	%r2,%r5,2*$SIZE_T($sp)
+	la	$key,0($key2)
+	larl	$tbl,AES_Te
+	bras	$ra,_s390x_AES_encrypt	# generate the tweak
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	larl	$tbl,AES_Td
+	lt${g}r	$len,$len
+	stm	$s0,$s3,$tweak($sp)	# save the tweak
+	jz	.Lxts_dec_short
+	j	.Lxts_dec_enter
+
+.align	16
+.Lxts_dec_loop:
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+.Lxts_dec_enter:
+	x	$s0,0($inp)		# tweak^=*(inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+	la	$inp,16($inp)
+	brct${g}	$len,.Lxts_dec_loop
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	jz	.Lxts_dec_done
+
+	# generate pair of tweaks...
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$i2,$s1			# flip byte order
+	lrvgr	$i3,$s3
+	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
+	j	.Lxts_dec_2ndtweak
+
+.align	16
+.Lxts_dec_short:
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
+	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak-16+8($sp)
+	llgfr	$s3,$s3
+
+	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
+	x	$s1,$tweak-16+4($sp)
+	x	$s2,$tweak-16+8($sp)
+	x	$s3,$tweak-16+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_dec_steal:
+	llgc	%r0,16($inp)
+	llgc	%r1,0($out,$inp)
+	stc	%r0,0($out,$inp)
+	stc	%r1,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_dec_steal
+	la	$out,0($i3)		# restore real $out
+
+	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
+	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
+	x	$s1,4($out)
+	x	$s2,8($out)
+	x	$s3,12($out)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	l${g}	$out,4*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
+	stg	$sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+	stg	$sp,$tweak+0($sp)	# wipe tweak
+	stg	$sp,$twesk+8($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_xts_decrypt,.-AES_xts_decrypt
 ___
 }
 $code.=<<___;
 .string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.comm	OPENSSL_s390xcap_P,16,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
+close STDOUT;	# force flush

diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index a545e89..48fa857 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl

@@ -588,6 +588,9 @@
 .globl	AES_encrypt
 .type	AES_encrypt,\@function,3
 .align	16
+.globl	asm_AES_encrypt
+.hidden	asm_AES_encrypt
+asm_AES_encrypt:
 AES_encrypt:
 	push	%rbx
 	push	%rbp
@@ -1184,6 +1187,9 @@
 .globl	AES_decrypt
 .type	AES_decrypt,\@function,3
 .align	16
+.globl	asm_AES_decrypt
+.hidden	asm_AES_decrypt
+asm_AES_decrypt:
 AES_decrypt:
 	push	%rbx
 	push	%rbp
@@ -1277,13 +1283,13 @@
 ___
 }
 
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
 $code.=<<___;
-.globl	AES_set_encrypt_key
-.type	AES_set_encrypt_key,\@function,3
+.globl	private_AES_set_encrypt_key
+.type	private_AES_set_encrypt_key,\@function,3
 .align	16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
 	push	%rbx
 	push	%rbp
 	push	%r12			# redundant, but allows to share 
@@ -1304,7 +1310,7 @@
 	add	\$56,%rsp
 .Lenc_key_epilogue:
 	ret
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 
 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
 .align	16
@@ -1547,13 +1553,13 @@
 ___
 }
 
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
 $code.=<<___;
-.globl	AES_set_decrypt_key
-.type	AES_set_decrypt_key,\@function,3
+.globl	private_AES_set_decrypt_key
+.type	private_AES_set_decrypt_key,\@function,3
 .align	16
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -1622,7 +1628,7 @@
 	add	\$56,%rsp
 .Ldec_key_epilogue:
 	ret
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 ___
 
 # void AES_cbc_encrypt (const void char *inp, unsigned char *out,
@@ -1648,6 +1654,9 @@
 .type	AES_cbc_encrypt,\@function,6
 .align	16
 .extern	OPENSSL_ia32cap_P
+.globl	asm_AES_cbc_encrypt
+.hidden	asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
 AES_cbc_encrypt:
 	cmp	\$0,%rdx	# check length
 	je	.Lcbc_epilogue
@@ -2766,13 +2775,13 @@
 	.rva	.LSEH_end_AES_decrypt
 	.rva	.LSEH_info_AES_decrypt
 
-	.rva	.LSEH_begin_AES_set_encrypt_key
-	.rva	.LSEH_end_AES_set_encrypt_key
-	.rva	.LSEH_info_AES_set_encrypt_key
+	.rva	.LSEH_begin_private_AES_set_encrypt_key
+	.rva	.LSEH_end_private_AES_set_encrypt_key
+	.rva	.LSEH_info_private_AES_set_encrypt_key
 
-	.rva	.LSEH_begin_AES_set_decrypt_key
-	.rva	.LSEH_end_AES_set_decrypt_key
-	.rva	.LSEH_info_AES_set_decrypt_key
+	.rva	.LSEH_begin_private_AES_set_decrypt_key
+	.rva	.LSEH_end_private_AES_set_decrypt_key
+	.rva	.LSEH_info_private_AES_set_decrypt_key
 
 	.rva	.LSEH_begin_AES_cbc_encrypt
 	.rva	.LSEH_end_AES_cbc_encrypt
@@ -2788,11 +2797,11 @@
 	.byte	9,0,0,0
 	.rva	block_se_handler
 	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
+.LSEH_info_private_AES_set_encrypt_key:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
+.LSEH_info_private_AES_set_decrypt_key:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]

diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000..c6f6b33
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl

@@ -0,0 +1,1249 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibit *very* low instruction-level
+# parallelism, interleaving it with another algorithm would allow to
+# utilize processor resources better and achieve better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# AESNI code is weaved into it. Below are performance numbers in
+# cycles per processed byte, less is better, for standalone AESNI-CBC
+# encrypt, sum of the latter and standalone SHA1, and "stitched"
+# subroutine:
+#
+#		AES-128-CBC	+SHA1		stitch      gain
+# Westmere	3.77[+5.6]	9.37		6.65	    +41%
+# Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
+#
+#		AES-192-CBC
+# Westmere	4.51		10.11		6.97	    +45%
+# Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
+#
+#		AES-256-CBC
+# Westmere	5.25		10.85		7.25	    +50%
+# Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
+#
+# (*)	There are two code paths: SSSE3 and AVX. See sha1-568.pl for
+#	background information. Above numbers in parentheses are SSSE3
+#	results collected on AVX-capable CPU, i.e. apply on OSes that
+#	don't support AVX.
+#
+# Needless to mention that it makes no sense to implement "stitched"
+# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
+# fully utilize parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+#		AES-128-CBC	AES-192-CBC	AES-256-CBC
+# Westmere	1.31		1.55		1.80
+# Sandy Bridge	0.93		1.06		1.22
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+	   $1>=10);
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# void aesni_cbc_sha1_enc(const void *inp,
+#			void *out,
+#			size_t length,
+#			const AES_KEY *key,
+#			unsigned char *iv,
+#			SHA_CTX *ctx,
+#			const void *in0);
+
+$code.=<<___;
+.text
+.extern	OPENSSL_ia32cap_P
+
+.globl	aesni_cbc_sha1_enc
+.type	aesni_cbc_sha1_enc,\@abi-omnipotent
+.align	16
+aesni_cbc_sha1_enc:
+	# caller should check for SSSE3 and AES-NI bits
+	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
+	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+	and	\$`1<<28`,%r11d		# mask AVX bit
+	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
+	or	%r11d,%r10d
+	cmp	\$`1<<28|1<<30`,%r10d
+	je	aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+	jmp	aesni_cbc_sha1_enc_ssse3
+	ret
+.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type	aesni_cbc_sha1_enc_ssse3,\@function,6
+.align	16
+aesni_cbc_sha1_enc_ssse3:
+	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
+	#shr	\$6,$len			# debugging artefact
+	#jz	.Lepilogue_ssse3		# debugging artefact
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
+	#mov	$in0,$inp			# debugging artefact
+	#lea	64(%rsp),$ctx			# debugging artefact
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,96+0(%rsp)
+	movaps	%xmm7,96+16(%rsp)
+	movaps	%xmm8,96+32(%rsp)
+	movaps	%xmm9,96+48(%rsp)
+	movaps	%xmm10,96+64(%rsp)
+	movaps	%xmm11,96+80(%rsp)
+	movaps	%xmm12,96+96(%rsp)
+	movaps	%xmm13,96+112(%rsp)
+	movaps	%xmm14,96+128(%rsp)
+	movaps	%xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+	mov	$in0,%r12			# reassign arguments
+	mov	$out,%r13
+	mov	$len,%r14
+	mov	$key,%r15
+	movdqu	($ivp),$iv			# load IV
+	mov	$ivp,88(%rsp)			# save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+	shl	\$6,$len
+	sub	$in0,$out
+	mov	240($key),$rounds
+	add	$inp,$len		# end of input
+
+	lea	K_XX_XX(%rip),$K_XX_XX
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	movdqu	16($inp),@X[-3&7]
+	movdqu	32($inp),@X[-2&7]
+	movdqu	48($inp),@X[-1&7]
+	pshufb	@X[2],@X[-4&7]		# byte swap
+	add	\$64,$inp
+	pshufb	@X[2],@X[-3&7]
+	pshufb	@X[2],@X[-2&7]
+	pshufb	@X[2],@X[-1&7]
+	paddd	@Tx[1],@X[-4&7]		# add K_00_19
+	paddd	@Tx[1],@X[-3&7]
+	paddd	@Tx[1],@X[-2&7]
+	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
+	psubd	@Tx[1],@X[-4&7]		# restore X[]
+	movdqa	@X[-3&7],16(%rsp)
+	psubd	@Tx[1],@X[-3&7]
+	movdqa	@X[-2&7],32(%rsp)
+	psubd	@Tx[1],@X[-2&7]
+	movups	($key),$rndkey0		# $key[0]
+	movups	16($key),$rndkey[0]	# forward reference
+	jmp	.Loop_ssse3
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+	movups		`16*$n`($in0),$in		# load input
+	xorps		$rndkey0,$in
+___
+      $code.=<<___ if ($n);
+	movups		$iv,`16*($n-1)`($out,$in0)	# write output
+___
+      $code.=<<___;
+	xorps		$in,$iv
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*$k`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+	cmp		\$11,$rounds
+	jb		.Laesenclast$sn
+	movups		`32+16*($k+0)`($key),$rndkey[1]
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*($k+1)`($key),$rndkey[0]
+	aesenc		$rndkey[1],$iv
+	je		.Laesenclast$sn
+	movups		`32+16*($k+2)`($key),$rndkey[1]
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*($k+3)`($key),$rndkey[0]
+	aesenc		$rndkey[1],$iv
+.Laesenclast$sn:
+	aesenclast	$rndkey[0],$iv
+	movups		16($key),$rndkey[1]		# forward reference
+___
+    } else {
+      $code.=<<___;
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*$k`($key),$rndkey[1]
+___
+    }
+    $r++;	unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@X[0],@X[-3&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[0],@X[-1&7]);
+	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@Tx[2],@X[0]);
+	&movdqa	(@Tx[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[1],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[2],30);
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@Tx[1],2);
+	&pxor	(@X[0],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@Tx[0],@X[0]);
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@Tx[0],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$len);
+	&je	(".Ldone_ssse3");
+
+	unshift(@Tx,pop(@Tx));
+
+	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&movdqu	(@X[-4&7],"0($inp)");		# load input
+	&movdqu	(@X[-3&7],"16($inp)");
+	&movdqu	(@X[-2&7],"32($inp)");
+	&movdqu	(@X[-1&7],"48($inp)");
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@Tx[1]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	($c,$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&xor	($c,$d);',	# restore $c
+	'&xor	(@T[0],$d);',
+	'&add	($e,$a);',
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+
+sub body_20_39 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	(@T[0],$d);',	# ($b^$d)
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
+	'&add	($e,$a);',
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+
+sub body_40_59 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&mov	(@T[1],$c);',
+	'&xor	($c,$d);',
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&and	(@T[1],$d);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[1]);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	($c,$d);',	# restore $c
+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+$code.=<<___;
+.align	16
+.Loop_ssse3:
+___
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+				$saved_r=$r; @saved_rndkey=@rndkey;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+	movups	$iv,48($out,$in0)		# write output
+	lea	64($in0),$in0
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+___
+				$jj=$j=$saved_j; @V=@saved_V;
+				$r=$saved_r;     @rndkey=@saved_rndkey;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+	movups	$iv,48($out,$in0)		# write output
+	mov	88(%rsp),$ivp			# restore $ivp
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	movups	$iv,($ivp)			# write IV
+___
+$code.=<<___ if ($win64);
+	movaps	96+0(%rsp),%xmm6
+	movaps	96+16(%rsp),%xmm7
+	movaps	96+32(%rsp),%xmm8
+	movaps	96+48(%rsp),%xmm9
+	movaps	96+64(%rsp),%xmm10
+	movaps	96+80(%rsp),%xmm11
+	movaps	96+96(%rsp),%xmm12
+	movaps	96+112(%rsp),%xmm13
+	movaps	96+128(%rsp),%xmm14
+	movaps	96+144(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	`104+($win64?10*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lepilogue_ssse3:
+	ret
+.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type	aesni_cbc_sha1_enc_avx,\@function,6
+.align	16
+aesni_cbc_sha1_enc_avx:
+	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
+	#shr	\$6,$len			# debugging artefact
+	#jz	.Lepilogue_avx			# debugging artefact
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
+	#mov	$in0,$inp			# debugging artefact
+	#lea	64(%rsp),$ctx			# debugging artefact
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,96+0(%rsp)
+	movaps	%xmm7,96+16(%rsp)
+	movaps	%xmm8,96+32(%rsp)
+	movaps	%xmm9,96+48(%rsp)
+	movaps	%xmm10,96+64(%rsp)
+	movaps	%xmm11,96+80(%rsp)
+	movaps	%xmm12,96+96(%rsp)
+	movaps	%xmm13,96+112(%rsp)
+	movaps	%xmm14,96+128(%rsp)
+	movaps	%xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+	vzeroall
+	mov	$in0,%r12			# reassign arguments
+	mov	$out,%r13
+	mov	$len,%r14
+	mov	$key,%r15
+	vmovdqu	($ivp),$iv			# load IV
+	mov	$ivp,88(%rsp)			# save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+	shl	\$6,$len
+	sub	$in0,$out
+	mov	240($key),$rounds
+	add	\$112,$key		# size optimization
+	add	$inp,$len		# end of input
+
+	lea	K_XX_XX(%rip),$K_XX_XX
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	vmovdqu	16($inp),@X[-3&7]
+	vmovdqu	32($inp),@X[-2&7]
+	vmovdqu	48($inp),@X[-1&7]
+	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
+	add	\$64,$inp
+	vpshufb	@X[2],@X[-3&7],@X[-3&7]
+	vpshufb	@X[2],@X[-2&7],@X[-2&7]
+	vpshufb	@X[2],@X[-1&7],@X[-1&7]
+	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
+	vpaddd	@Tx[1],@X[-3&7],@X[1]
+	vpaddd	@Tx[1],@X[-2&7],@X[2]
+	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
+	vmovdqa	@X[1],16(%rsp)
+	vmovdqa	@X[2],32(%rsp)
+	vmovups	-112($key),$rndkey0	# $key[0]
+	vmovups	16-112($key),$rndkey[0]	# forward reference
+	jmp	.Loop_avx
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+	vmovups		`16*$n`($in0),$in		# load input
+	vxorps		$rndkey0,$in,$in
+___
+      $code.=<<___ if ($n);
+	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
+___
+      $code.=<<___;
+	vxorps		$in,$iv,$iv
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*$k-112`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+	cmp		\$11,$rounds
+	jb		.Lvaesenclast$sn
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
+	vaesenc		$rndkey[1],$iv,$iv
+	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
+	je		.Lvaesenclast$sn
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
+	vaesenc		$rndkey[1],$iv,$iv
+	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+	vaesenclast	$rndkey[0],$iv,$iv
+	vmovups		16-112($key),$rndkey[1]		# forward reference
+___
+    } else {
+      $code.=<<___;
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*$k-112`($key),$rndkey[1]
+___
+    }
+    $r++;	unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@Tx[0],@X[0],30);
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$len);
+	&je	(".Ldone_avx");
+
+	unshift(@Tx,pop(@Tx));
+
+	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&vmovdqu(@X[-4&7],"0($inp)");		# load input
+	&vmovdqu(@X[-3&7],"16($inp)");
+	&vmovdqu(@X[-2&7],"32($inp)");
+	&vmovdqu(@X[-1&7],"48($inp)");
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align	16
+.Loop_avx:
+___
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+				$saved_r=$r; @saved_rndkey=@rndkey;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+	vmovups	$iv,48($out,$in0)		# write output
+	lea	64($in0),$in0
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+___
+				$jj=$j=$saved_j; @V=@saved_V;
+				$r=$saved_r;     @rndkey=@saved_rndkey;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+	vmovups	$iv,48($out,$in0)		# write output
+	mov	88(%rsp),$ivp			# restore $ivp
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	vmovups	$iv,($ivp)			# write IV
+	vzeroall
+___
+$code.=<<___ if ($win64);
+	movaps	96+0(%rsp),%xmm6
+	movaps	96+16(%rsp),%xmm7
+	movaps	96+32(%rsp),%xmm8
+	movaps	96+48(%rsp),%xmm9
+	movaps	96+64(%rsp),%xmm10
+	movaps	96+80(%rsp),%xmm11
+	movaps	96+96(%rsp),%xmm12
+	movaps	96+112(%rsp),%xmm13
+	movaps	96+128(%rsp),%xmm14
+	movaps	96+144(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	`104+($win64?10*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lepilogue_avx:
+	ret
+.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
+
+.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	ssse3_handler,\@abi-omnipotent
+.align	16
+ssse3_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	96(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	`104+10*16`(%rax),%rax	# adjust stack pointer
+
+	mov	0(%rax),%r15
+	mov	8(%rax),%r14
+	mov	16(%rax),%r13
+	mov	24(%rax),%r12
+	mov	32(%rax),%rbp
+	mov	40(%rax),%rbx
+	lea	48(%rax),%rax
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	ssse3_handler,.-ssse3_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
+	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
+	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
+	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
+	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04			if($dst>=8);
+    $rex|=0x01			if($src>=8);
+    push @opcode,$rex|0x40	if($rex);
+}
+
+sub aesni {
+  my $line=shift;
+  my @opcode=(0x66);
+
+    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+	my %opcodelet = (
+		"aesenc" => 0xdc,	"aesenclast" => 0xdd
+	);
+	return undef if (!defined($opcodelet{$1}));
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x38,$opcodelet{$1};
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
+	return ".byte\t".join(',',@opcode);
+    }
+    return $line;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;

diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
new file mode 100644
index 0000000..3dc345b
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86.pl

@@ -0,0 +1,2189 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
+# details].
+#
+# Performance.
+#
+# To start with see corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling table similar to one found there I've chosen to
+# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
+# The simplified table below represents 32-bit performance relative
+# to 64-bit one in every given point. Ratios vary for different
+# encryption modes, therefore interval values.
+#
+#	16-byte     64-byte     256-byte    1-KB        8-KB
+#	53-67%      67-84%      91-94%      95-98%      97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. Largest
+# 8-KB block performance is virtually same: 32-bit code is less than
+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
+# interleaves at most 6 aes[enc|dec] instructions, because there are
+# not enough registers for 8x interleave [which should be optimal for
+# Sandy Bridge]. Actually, performance results for 6x interleave
+# factor presented in aesni-x86_64.pl (except for CTR) are for this
+# module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
+
+$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
+			# generates drop-in replacement for
+			# crypto/aes/asm/aes-586.pl:-)
+$inline=1;		# inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+if ($PREFIX eq "aesni")	{ $movekey=*movups; }
+else			{ $movekey=*movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx";	# backup copy for $rounds
+$key_="ebp";	# backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5";	$in1="xmm5";
+$inout4="xmm6";	$in0="xmm6";
+$inout5="xmm7";	$ivec="xmm7";
+
+# AESNI extenstion
+sub aeskeygenassist
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
+}
+sub aescommon
+{ my($opcodelet,$dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
+}
+sub aesimc	{ aescommon(0xdb,@_); }
+sub aesenc	{ aescommon(0xdc,@_); }
+sub aesenclast	{ aescommon(0xdd,@_); }
+sub aesdec	{ aescommon(0xde,@_); }
+sub aesdeclast	{ aescommon(0xdf,@_); }
+
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+  $sn++;
+
+    &$movekey		($rndkey0,&QWP(0,$key));
+    &$movekey		($rndkey1,&QWP(16,$key));
+    &xorps		($ivec,$rndkey0)	if (defined($ivec));
+    &lea		($key,&DWP(32,$key));
+    &xorps		($inout,$ivec)		if (defined($ivec));
+    &xorps		($inout,$rndkey0)	if (!defined($ivec));
+    &set_label("${p}1_loop_$sn");
+	eval"&aes${p}	($inout,$rndkey1)";
+	&dec		($rounds);
+	&$movekey	($rndkey1,&QWP(0,$key));
+	&lea		($key,&DWP(16,$key));
+    &jnz		(&label("${p}1_loop_$sn"));
+    eval"&aes${p}last	($inout,$rndkey1)";
+}}
+
+sub aesni_generate1	# fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+    &function_begin_B("_aesni_${p}rypt1");
+	&movups		($rndkey0,&QWP(0,$key));
+	&$movekey	($rndkey1,&QWP(0x10,$key));
+	&xorps		($inout,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0x20,$key));
+	&lea		($key,&DWP(0x30,$key));
+	&cmp		($rounds,11);
+	&jb		(&label("${p}128"));
+	&lea		($key,&DWP(0x20,$key));
+	&je		(&label("${p}192"));
+	&lea		($key,&DWP(0x20,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(-0x40,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(-0x30,$key));
+    &set_label("${p}192");
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(-0x20,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(-0x10,$key));
+    &set_label("${p}128");
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x10,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0x20,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x30,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0x40,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x50,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0x60,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x70,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+    eval"&aes${p}last	($inout,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline);
+&function_begin_B("${PREFIX}_encrypt");
+	&mov	("eax",&wparam(0));
+	&mov	($key,&wparam(2));
+	&movups	($inout0,&QWP(0,"eax"));
+	&mov	($rounds,&DWP(240,$key));
+	&mov	("eax",&wparam(1));
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	(&QWP(0,"eax"),$inout0);
+	&ret	();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if(!$inline);
+&function_begin_B("${PREFIX}_decrypt");
+	&mov	("eax",&wparam(0));
+	&mov	($key,&wparam(2));
+	&movups	($inout0,&QWP(0,"eax"));
+	&mov	($rounds,&DWP(240,$key));
+	&mov	("eax",&wparam(1));
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&movups	(&QWP(0,"eax"),$inout0);
+	&ret	();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x, but it's unfeasible to accommodate it
+# in XMM registers addreassable in 32-bit mode and therefore 6x is
+# used instead...
+
+sub aesni_generate3
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt3");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);
+	&pxor		($inout2,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+    &set_label("${p}3_loop");
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout2,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("${p}3_loop"));
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt3");
+}
+
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement  would be <20%,
+# so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt4");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&shr		($rounds,1);
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);
+	&pxor		($inout2,$rndkey0);
+	&pxor		($inout3,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+    &set_label("${p}4_loop");
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	eval"&aes${p}	($inout3,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout2,$rndkey0)";
+	eval"&aes${p}	($inout3,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+    &jnz		(&label("${p}4_loop"));
+
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}	($inout3,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    eval"&aes${p}last	($inout3,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt6");
+    &static_label("_aesni_${p}rypt6_enter");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);	# pxor does better here
+	eval"&aes${p}	($inout0,$rndkey1)";
+	&pxor		($inout2,$rndkey0);
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&pxor		($inout3,$rndkey0);
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	&pxor		($inout4,$rndkey0);
+	eval"&aes${p}	($inout3,$rndkey1)";
+	&pxor		($inout5,$rndkey0);
+	eval"&aes${p}	($inout4,$rndkey1)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+	eval"&aes${p}	($inout5,$rndkey1)";
+	&jmp		(&label("_aesni_${p}rypt6_enter"));
+
+    &set_label("${p}6_loop",16);
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	eval"&aes${p}	($inout3,$rndkey1)";
+	eval"&aes${p}	($inout4,$rndkey1)";
+	eval"&aes${p}	($inout5,$rndkey1)";
+    &set_label("_aesni_${p}rypt6_enter",16);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout2,$rndkey0)";
+	eval"&aes${p}	($inout3,$rndkey0)";
+	eval"&aes${p}	($inout4,$rndkey0)";
+	eval"&aes${p}	($inout5,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+    &jnz		(&label("${p}6_loop"));
+
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}	($inout3,$rndkey1)";
+    eval"&aes${p}	($inout4,$rndkey1)";
+    eval"&aes${p}	($inout5,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    eval"&aes${p}last	($inout3,$rndkey0)";
+    eval"&aes${p}last	($inout4,$rndkey0)";
+    eval"&aes${p}last	($inout5,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+#                         size_t length, const AES_KEY *key,
+#                         int enc);
+&function_begin("aesni_ecb_encrypt");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&and	($len,-16);
+	&jz	(&label("ecb_ret"));
+	&mov	($rounds,&DWP(240,$key));
+	&test	($rounds_,$rounds_);
+	&jz	(&label("ecb_decrypt"));
+
+	&mov	($key_,$key);		# backup $key
+	&mov	($rounds_,$rounds);	# backup $rounds
+	&cmp	($len,0x60);
+	&jb	(&label("ecb_enc_tail"));
+
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&sub	($len,0x60);
+	&jmp	(&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+	&call	("_aesni_encrypt6");
+
+	&mov	($key,$key_);		# restore $key
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&sub	($len,0x60);
+	&jnc	(&label("ecb_enc_loop6"));
+
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&add	($len,0x60);
+	&jz	(&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");
+	&movups	($inout0,&QWP(0,$inp));
+	&cmp	($len,0x20);
+	&jb	(&label("ecb_enc_one"));
+	&movups	($inout1,&QWP(0x10,$inp));
+	&je	(&label("ecb_enc_two"));
+	&movups	($inout2,&QWP(0x20,$inp));
+	&cmp	($len,0x40);
+	&jb	(&label("ecb_enc_three"));
+	&movups	($inout3,&QWP(0x30,$inp));
+	&je	(&label("ecb_enc_four"));
+	&movups	($inout4,&QWP(0x40,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_encrypt6");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	(&QWP(0,$out),$inout0);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+	&xorps	($inout2,$inout2);
+	&call	("_aesni_encrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+	&call	("_aesni_encrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+	&call	("_aesni_encrypt4");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&jmp	(&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+	&mov	($key_,$key);		# backup $key
+	&mov	($rounds_,$rounds);	# backup $rounds
+	&cmp	($len,0x60);
+	&jb	(&label("ecb_dec_tail"));
+
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&sub	($len,0x60);
+	&jmp	(&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+	&call	("_aesni_decrypt6");
+
+	&mov	($key,$key_);		# restore $key
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&sub	($len,0x60);
+	&jnc	(&label("ecb_dec_loop6"));
+
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&add	($len,0x60);
+	&jz	(&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+	&movups	($inout0,&QWP(0,$inp));
+	&cmp	($len,0x20);
+	&jb	(&label("ecb_dec_one"));
+	&movups	($inout1,&QWP(0x10,$inp));
+	&je	(&label("ecb_dec_two"));
+	&movups	($inout2,&QWP(0x20,$inp));
+	&cmp	($len,0x40);
+	&jb	(&label("ecb_dec_three"));
+	&movups	($inout3,&QWP(0x30,$inp));
+	&je	(&label("ecb_dec_four"));
+	&movups	($inout4,&QWP(0x40,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_decrypt6");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&movups	(&QWP(0,$out),$inout0);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+	&xorps	($inout2,$inout2);
+	&call	("_aesni_decrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+	&call	("_aesni_decrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+	&call	("_aesni_decrypt4");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&mov	($rounds,&wparam(5));
+	&mov	($key_,"esp");
+	&sub	("esp",60);
+	&and	("esp",-16);			# align stack
+	&mov	(&DWP(48,"esp"),$key_);
+
+	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
+	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
+	&mov	($rounds,&DWP(240,$key));
+
+	# compose byte-swap control mask for pshufb on stack
+	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
+	&mov	(&DWP(4,"esp"),0x08090a0b);
+	&mov	(&DWP(8,"esp"),0x04050607);
+	&mov	(&DWP(12,"esp"),0x00010203);
+
+	# compose counter increment vector on stack
+	&mov	($rounds_,1);
+	&xor	($key_,$key_);
+	&mov	(&DWP(16,"esp"),$rounds_);
+	&mov	(&DWP(20,"esp"),$key_);
+	&mov	(&DWP(24,"esp"),$key_);
+	&mov	(&DWP(28,"esp"),$key_);
+
+	&shr	($rounds,1);
+	&lea	($key_,&DWP(0,$key));
+	&movdqa	($inout3,&QWP(0,"esp"));
+	&movdqa	($inout0,$ivec);
+	&mov	($rounds_,$rounds);
+	&pshufb	($ivec,$inout3);
+
+&set_label("ccm64_enc_outer");
+	&$movekey	($rndkey0,&QWP(0,$key_));
+	&mov		($rounds,$rounds_);
+	&movups		($in0,&QWP(0,$inp));
+
+	&xorps		($inout0,$rndkey0);
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&xorps		($rndkey0,$in0);
+	&lea		($key,&DWP(32,$key_));
+	&xorps		($cmac,$rndkey0);		# cmac^=inp
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+	&aesenc		($inout0,$rndkey1);
+	&dec		($rounds);
+	&aesenc		($cmac,$rndkey1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&aesenc		($inout0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&aesenc		($cmac,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("ccm64_enc2_loop"));
+	&aesenc		($inout0,$rndkey1);
+	&aesenc		($cmac,$rndkey1);
+	&paddq		($ivec,&QWP(16,"esp"));
+	&aesenclast	($inout0,$rndkey0);
+	&aesenclast	($cmac,$rndkey0);
+
+	&dec	($len);
+	&lea	($inp,&DWP(16,$inp));
+	&xorps	($in0,$inout0);			# inp^=E(ivec)
+	&movdqa	($inout0,$ivec);
+	&movups	(&QWP(0,$out),$in0);		# save output
+	&lea	($out,&DWP(16,$out));
+	&pshufb	($inout0,$inout3);
+	&jnz	(&label("ccm64_enc_outer"));
+
+	&mov	("esp",&DWP(48,"esp"));
+	&mov	($out,&wparam(5));
+	&movups	(&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&mov	($rounds,&wparam(5));
+	&mov	($key_,"esp");
+	&sub	("esp",60);
+	&and	("esp",-16);			# align stack
+	&mov	(&DWP(48,"esp"),$key_);
+
+	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
+	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
+	&mov	($rounds,&DWP(240,$key));
+
+	# compose byte-swap control mask for pshufb on stack
+	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
+	&mov	(&DWP(4,"esp"),0x08090a0b);
+	&mov	(&DWP(8,"esp"),0x04050607);
+	&mov	(&DWP(12,"esp"),0x00010203);
+
+	# compose counter increment vector on stack
+	&mov	($rounds_,1);
+	&xor	($key_,$key_);
+	&mov	(&DWP(16,"esp"),$rounds_);
+	&mov	(&DWP(20,"esp"),$key_);
+	&mov	(&DWP(24,"esp"),$key_);
+	&mov	(&DWP(28,"esp"),$key_);
+
+	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
+	&movdqa	($inout0,$ivec);
+
+	&mov	($key_,$key);
+	&mov	($rounds_,$rounds);
+
+	&pshufb	($ivec,$inout3);
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	($in0,&QWP(0,$inp));		# load inp
+	&paddq	($ivec,&QWP(16,"esp"));
+	&lea	($inp,&QWP(16,$inp));
+	&jmp	(&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+	&xorps	($in0,$inout0);			# inp ^= E(ivec)
+	&movdqa	($inout0,$ivec);
+	&mov	($rounds,$rounds_);
+	&movups	(&QWP(0,$out),$in0);		# save output
+	&lea	($out,&DWP(16,$out));
+	&pshufb	($inout0,$inout3);
+
+	&sub	($len,1);
+	&jz	(&label("ccm64_dec_break"));
+
+	&$movekey	($rndkey0,&QWP(0,$key_));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&xorps		($in0,$rndkey0);
+	&lea		($key,&DWP(32,$key_));
+	&xorps		($inout0,$rndkey0);
+	&xorps		($cmac,$in0);		# cmac^=out
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_dec2_loop");
+	&aesenc		($inout0,$rndkey1);
+	&dec		($rounds);
+	&aesenc		($cmac,$rndkey1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&aesenc		($inout0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&aesenc		($cmac,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("ccm64_dec2_loop"));
+	&movups		($in0,&QWP(0,$inp));	# load inp
+	&paddq		($ivec,&QWP(16,"esp"));
+	&aesenc		($inout0,$rndkey1);
+	&aesenc		($cmac,$rndkey1);
+	&lea		($inp,&QWP(16,$inp));
+	&aesenclast	($inout0,$rndkey0);
+	&aesenclast	($cmac,$rndkey0);
+	&jmp	(&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+	&mov	($key,$key_);
+	if ($inline)
+	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
+	else
+	{   &call	("_aesni_encrypt1",$cmac);	}
+
+	&mov	("esp",&DWP(48,"esp"));
+	&mov	($out,&wparam(5));
+	&movups	(&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# stack layout:
+#	0	pshufb mask
+#	16	vector addend: 0,6,6,6
+# 	32	counter-less ivec
+#	48	1st triplet of counter vector
+#	64	2nd triplet of counter vector
+#	80	saved %esp
+
+&function_begin("aesni_ctr32_encrypt_blocks");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&mov	($key_,"esp");
+	&sub	("esp",88);
+	&and	("esp",-16);			# align stack
+	&mov	(&DWP(80,"esp"),$key_);
+
+	&cmp	($len,1);
+	&je	(&label("ctr32_one_shortcut"));
+
+	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
+
+	# compose byte-swap control mask for pshufb on stack
+	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
+	&mov	(&DWP(4,"esp"),0x08090a0b);
+	&mov	(&DWP(8,"esp"),0x04050607);
+	&mov	(&DWP(12,"esp"),0x00010203);
+
+	# compose counter increment vector on stack
+	&mov	($rounds,6);
+	&xor	($key_,$key_);
+	&mov	(&DWP(16,"esp"),$rounds);
+	&mov	(&DWP(20,"esp"),$rounds);
+	&mov	(&DWP(24,"esp"),$rounds);
+	&mov	(&DWP(28,"esp"),$key_);
+
+	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
+	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
+
+	&mov	($rounds,&DWP(240,$key));	# key->rounds
+
+	# compose 2 vectors of 3x32-bit counters
+	&bswap	($rounds_);
+	&pxor	($rndkey1,$rndkey1);
+	&pxor	($rndkey0,$rndkey0);
+	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
+	&pinsrd	($rndkey1,$rounds_,0);
+	&lea	($key_,&DWP(3,$rounds_));
+	&pinsrd	($rndkey0,$key_,0);
+	&inc	($rounds_);
+	&pinsrd	($rndkey1,$rounds_,1);
+	&inc	($key_);
+	&pinsrd	($rndkey0,$key_,1);
+	&inc	($rounds_);
+	&pinsrd	($rndkey1,$rounds_,2);
+	&inc	($key_);
+	&pinsrd	($rndkey0,$key_,2);
+	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
+	&pshufb	($rndkey1,$inout0);		# byte swap
+	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
+	&pshufb	($rndkey0,$inout0);		# byte swap
+
+	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
+	&pshufd	($inout1,$rndkey1,2<<6);
+	&cmp	($len,6);
+	&jb	(&label("ctr32_tail"));
+	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
+	&shr	($rounds,1);
+	&mov	($key_,$key);			# backup $key
+	&mov	($rounds_,$rounds);		# backup $rounds
+	&sub	($len,6);
+	&jmp	(&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+	&pshufd	($inout2,$rndkey1,1<<6);
+	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
+	&pshufd	($inout3,$rndkey0,3<<6);
+	&por	($inout0,$rndkey1);		# merge counter-less ivec
+	&pshufd	($inout4,$rndkey0,2<<6);
+	&por	($inout1,$rndkey1);
+	&pshufd	($inout5,$rndkey0,1<<6);
+	&por	($inout2,$rndkey1);
+	&por	($inout3,$rndkey1);
+	&por	($inout4,$rndkey1);
+	&por	($inout5,$rndkey1);
+
+	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
+	&$movekey	($rndkey0,&QWP(0,$key_));
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&lea		($key,&DWP(32,$key_));
+	&dec		($rounds);
+	&pxor		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);
+	&aesenc		($inout0,$rndkey1);
+	&pxor		($inout2,$rndkey0);
+	&aesenc		($inout1,$rndkey1);
+	&pxor		($inout3,$rndkey0);
+	&aesenc		($inout2,$rndkey1);
+	&pxor		($inout4,$rndkey0);
+	&aesenc		($inout3,$rndkey1);
+	&pxor		($inout5,$rndkey0);
+	&aesenc		($inout4,$rndkey1);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&aesenc		($inout5,$rndkey1);
+
+	&call		(&label("_aesni_encrypt6_enter"));
+
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout1,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
+	&xorps	($inout2,$rndkey1);
+	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+
+	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
+	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
+	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
+
+	&movups	($inout1,&QWP(0x30,$inp));
+	&movups	($inout2,&QWP(0x40,$inp));
+	&xorps	($inout3,$inout1);
+	&movups	($inout1,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
+	&pshufb	($rndkey1,$inout0);		# byte swap
+	&xorps	($inout4,$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&xorps	($inout5,$inout1);
+	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
+	&pshufb	($rndkey0,$inout0);		# byte swap
+	&movups	(&QWP(0x40,$out),$inout4);
+	&pshufd	($inout0,$rndkey1,3<<6);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+
+	&mov	($rounds,$rounds_);
+	&pshufd	($inout1,$rndkey1,2<<6);
+	&sub	($len,6);
+	&jnc	(&label("ctr32_loop6"));
+
+	&add	($len,6);
+	&jz	(&label("ctr32_ret"));
+	&mov	($key,$key_);
+	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
+	&movdqa	($inout5,&QWP(32,"esp"));	# pull count-less ivec
+
+&set_label("ctr32_tail");
+	&por	($inout0,$inout5);
+	&cmp	($len,2);
+	&jb	(&label("ctr32_one"));
+
+	&pshufd	($inout2,$rndkey1,1<<6);
+	&por	($inout1,$inout5);
+	&je	(&label("ctr32_two"));
+
+	&pshufd	($inout3,$rndkey0,3<<6);
+	&por	($inout2,$inout5);
+	&cmp	($len,4);
+	&jb	(&label("ctr32_three"));
+
+	&pshufd	($inout4,$rndkey0,2<<6);
+	&por	($inout3,$inout5);
+	&je	(&label("ctr32_four"));
+
+	&por	($inout4,$inout5);
+	&call	("_aesni_encrypt6");
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout1,$rndkey0);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout2,$rndkey1);
+	&movups	($rndkey1,&QWP(0x40,$inp));
+	&xorps	($inout3,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout4,$rndkey1);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
+	&mov	($rounds,&DWP(240,$key));
+	
+&set_label("ctr32_one");
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	($in0,&QWP(0,$inp));
+	&xorps	($in0,$inout0);
+	&movups	(&QWP(0,$out),$in0);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+	&call	("_aesni_encrypt3");
+	&movups	($inout3,&QWP(0,$inp));
+	&movups	($inout4,&QWP(0x10,$inp));
+	&xorps	($inout0,$inout3);
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+	&call	("_aesni_encrypt3");
+	&movups	($inout3,&QWP(0,$inp));
+	&movups	($inout4,&QWP(0x10,$inp));
+	&xorps	($inout0,$inout3);
+	&movups	($inout5,&QWP(0x20,$inp));
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+	&call	("_aesni_encrypt4");
+	&movups	($inout4,&QWP(0,$inp));
+	&movups	($inout5,&QWP(0x10,$inp));
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout0,$inout4);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout1,$inout5);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout2,$rndkey1);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&xorps	($inout3,$rndkey0);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+	&mov	("esp",&DWP(80,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2
+#	const unsigned char iv[16]);
+#
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+	&mov	($key,&wparam(4));		# key2
+	&mov	($inp,&wparam(5));		# clear-text tweak
+
+	&mov	($rounds,&DWP(240,$key));	# key2->rounds
+	&movups	($inout0,&QWP(0,$inp));
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));		# key1
+
+	&mov	($key_,"esp");
+	&sub	("esp",16*7+8);
+	&mov	($rounds,&DWP(240,$key));	# key1->rounds
+	&and	("esp",-16);			# align stack
+
+	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
+	&mov	(&DWP(16*6+4,"esp"),0);
+	&mov	(&DWP(16*6+8,"esp"),1);
+	&mov	(&DWP(16*6+12,"esp"),0);
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
+	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
+
+	&movdqa	($tweak,$inout0);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+
+	&and	($len,-16);
+	&mov	($key_,$key);			# backup $key
+	&mov	($rounds_,$rounds);		# backup $rounds
+	&sub	($len,16*6);
+	&jc	(&label("xts_enc_short"));
+
+	&shr	($rounds,1);
+	&mov	($rounds_,$rounds);
+	&jmp	(&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+	for ($i=0;$i<4;$i++) {
+	    &pshufd	($twres,$twtmp,0x13);
+	    &pxor	($twtmp,$twtmp);
+	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
+	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
+	    &pand	($twres,$twmask);	# isolate carry and residue
+	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
+	    &pxor	($tweak,$twres);
+	}
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	 &$movekey	($rndkey0,&QWP(0,$key_));
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	 &movups	($inout0,&QWP(0,$inp));	# load input
+	&pxor	($inout5,$tweak);
+
+	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	 &pxor		($inout1,$rndkey0);
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	 &pxor		($inout2,$rndkey0);
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	 &pxor		($inout3,$rndkey0);
+	&movdqu	($rndkey1,&QWP(16*5,$inp));
+	 &pxor		($inout4,$rndkey0);
+	&lea	($inp,&DWP(16*6,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
+	&pxor	($inout5,$rndkey1);
+
+	 &$movekey	($rndkey1,&QWP(16,$key_));
+	 &lea		($key,&DWP(32,$key_));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	 &aesenc	($inout0,$rndkey1);
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	 &aesenc	($inout1,$rndkey1);
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	 &dec		($rounds);
+	 &aesenc	($inout2,$rndkey1);
+	&pxor	($inout4,&QWP(16*4,"esp"));
+	 &aesenc	($inout3,$rndkey1);
+	&pxor		($inout5,$rndkey0);
+	 &aesenc	($inout4,$rndkey1);
+	 &$movekey	($rndkey0,&QWP(0,$key));
+	 &aesenc	($inout5,$rndkey1);
+	&call		(&label("_aesni_encrypt6_enter"));
+
+	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
+       &pxor	($twtmp,$twtmp);
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*2,$out),$inout2);
+	&xorps	($inout4,&QWP(16*4,"esp"));
+	&movups	(&QWP(16*3,$out),$inout3);
+	&xorps	($inout5,$tweak);
+	&movups	(&QWP(16*4,$out),$inout4);
+       &pshufd	($twres,$twtmp,0x13);
+	&movups	(&QWP(16*5,$out),$inout5);
+	&lea	($out,&DWP(16*6,$out));
+       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
+
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	($rounds,$rounds_);		# restore $rounds
+	&pxor	($tweak,$twres);
+
+	&sub	($len,16*6);
+	&jnc	(&label("xts_enc_loop6"));
+
+	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+	&add	($len,16*6);
+	&jz	(&label("xts_enc_done6x"));
+
+	&movdqa	($inout3,$tweak);		# put aside previous tweak
+	&cmp	($len,0x20);
+	&jb	(&label("xts_enc_one"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&je	(&label("xts_enc_two"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout4,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&cmp	($len,0x40);
+	&jb	(&label("xts_enc_three"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout5,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&movdqa	(&QWP(16*0,"esp"),$inout3);
+	&movdqa	(&QWP(16*1,"esp"),$inout4);
+	&je	(&label("xts_enc_four"));
+
+	&movdqa	(&QWP(16*2,"esp"),$inout5);
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	&pxor	($inout5,$tweak);
+
+	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	&lea	($inp,&DWP(16*5,$inp));
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
+	&pxor	($inout4,$inout5);
+
+	&call	("_aesni_encrypt6");
+
+	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout4,$tweak);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&movups	(&QWP(16*4,$out),$inout4);
+	&lea	($out,&DWP(16*5,$out));
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&lea	($inp,&DWP(16*1,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&lea	($out,&DWP(16*1,$out));
+
+	&movdqa	($tweak,$inout3);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&lea	($inp,&DWP(16*2,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout2);
+
+	&call	("_aesni_encrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&lea	($out,&DWP(16*2,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+	&movaps	($inout5,$tweak);		# put aside last tweak
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&lea	($inp,&DWP(16*3,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+
+	&call	("_aesni_encrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&lea	($out,&DWP(16*3,$out));
+
+	&movdqa	($tweak,$inout5);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movups	($inout3,&QWP(16*3,$inp));
+	&lea	($inp,&DWP(16*4,$inp));
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&xorps	($inout3,$inout4);
+
+	&call	("_aesni_encrypt4");
+
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,$inout4);
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&lea	($out,&DWP(16*4,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&and	($len,15);
+	&jz	(&label("xts_enc_ret"));
+	&movdqa	($inout3,$tweak);
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&jmp	(&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&pxor	($twtmp,$twtmp);
+	&and	($len,15);
+	&jz	(&label("xts_enc_ret"));
+
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&pshufd	($inout3,$twtmp,0x13);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
+	&pxor	($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+	&movz	($rounds,&BP(0,$inp));
+	&movz	($key,&BP(-16,$out));
+	&lea	($inp,&DWP(1,$inp));
+	&mov	(&BP(-16,$out),&LB($rounds));
+	&mov	(&BP(0,$out),&LB($key));
+	&lea	($out,&DWP(1,$out));
+	&sub	($len,1);
+	&jnz	(&label("xts_enc_steal"));
+
+	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds,$rounds_);		# restore $rounds
+
+	&movups	($inout0,&QWP(-16,$out));	# load input
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(-16,$out),$inout0);	# write output
+
+&set_label("xts_enc_ret");
+	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+	&mov	($key,&wparam(4));		# key2
+	&mov	($inp,&wparam(5));		# clear-text tweak
+
+	&mov	($rounds,&DWP(240,$key));	# key2->rounds
+	&movups	($inout0,&QWP(0,$inp));
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));		# key1
+
+	&mov	($key_,"esp");
+	&sub	("esp",16*7+8);
+	&and	("esp",-16);			# align stack
+
+	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
+	&test	($len,15);
+	&setnz	(&LB($rounds_));
+	&shl	($rounds_,4);
+	&sub	($len,$rounds_);
+
+	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
+	&mov	(&DWP(16*6+4,"esp"),0);
+	&mov	(&DWP(16*6+8,"esp"),1);
+	&mov	(&DWP(16*6+12,"esp"),0);
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
+	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
+
+	&mov	($rounds,&DWP(240,$key));	# key1->rounds
+	&mov	($key_,$key);			# backup $key
+	&mov	($rounds_,$rounds);		# backup $rounds
+
+	&movdqa	($tweak,$inout0);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+
+	&and	($len,-16);
+	&sub	($len,16*6);
+	&jc	(&label("xts_dec_short"));
+
+	&shr	($rounds,1);
+	&mov	($rounds_,$rounds);
+	&jmp	(&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+	for ($i=0;$i<4;$i++) {
+	    &pshufd	($twres,$twtmp,0x13);
+	    &pxor	($twtmp,$twtmp);
+	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
+	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
+	    &pand	($twres,$twmask);	# isolate carry and residue
+	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
+	    &pxor	($tweak,$twres);
+	}
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	 &$movekey	($rndkey0,&QWP(0,$key_));
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	 &movups	($inout0,&QWP(0,$inp));	# load input
+	&pxor	($inout5,$tweak);
+
+	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	 &pxor		($inout1,$rndkey0);
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	 &pxor		($inout2,$rndkey0);
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	 &pxor		($inout3,$rndkey0);
+	&movdqu	($rndkey1,&QWP(16*5,$inp));
+	 &pxor		($inout4,$rndkey0);
+	&lea	($inp,&DWP(16*6,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
+	&pxor	($inout5,$rndkey1);
+
+	 &$movekey	($rndkey1,&QWP(16,$key_));
+	 &lea		($key,&DWP(32,$key_));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	 &aesdec	($inout0,$rndkey1);
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	 &aesdec	($inout1,$rndkey1);
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	 &dec		($rounds);
+	 &aesdec	($inout2,$rndkey1);
+	&pxor	($inout4,&QWP(16*4,"esp"));
+	 &aesdec	($inout3,$rndkey1);
+	&pxor		($inout5,$rndkey0);
+	 &aesdec	($inout4,$rndkey1);
+	 &$movekey	($rndkey0,&QWP(0,$key));
+	 &aesdec	($inout5,$rndkey1);
+	&call		(&label("_aesni_decrypt6_enter"));
+
+	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
+       &pxor	($twtmp,$twtmp);
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*2,$out),$inout2);
+	&xorps	($inout4,&QWP(16*4,"esp"));
+	&movups	(&QWP(16*3,$out),$inout3);
+	&xorps	($inout5,$tweak);
+	&movups	(&QWP(16*4,$out),$inout4);
+       &pshufd	($twres,$twtmp,0x13);
+	&movups	(&QWP(16*5,$out),$inout5);
+	&lea	($out,&DWP(16*6,$out));
+       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
+
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	($rounds,$rounds_);		# restore $rounds
+	&pxor	($tweak,$twres);
+
+	&sub	($len,16*6);
+	&jnc	(&label("xts_dec_loop6"));
+
+	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+	&add	($len,16*6);
+	&jz	(&label("xts_dec_done6x"));
+
+	&movdqa	($inout3,$tweak);		# put aside previous tweak
+	&cmp	($len,0x20);
+	&jb	(&label("xts_dec_one"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&je	(&label("xts_dec_two"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout4,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&cmp	($len,0x40);
+	&jb	(&label("xts_dec_three"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout5,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&movdqa	(&QWP(16*0,"esp"),$inout3);
+	&movdqa	(&QWP(16*1,"esp"),$inout4);
+	&je	(&label("xts_dec_four"));
+
+	&movdqa	(&QWP(16*2,"esp"),$inout5);
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	&pxor	($inout5,$tweak);
+
+	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	&lea	($inp,&DWP(16*5,$inp));
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
+	&pxor	($inout4,$inout5);
+
+	&call	("_aesni_decrypt6");
+
+	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout4,$tweak);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&movups	(&QWP(16*4,$out),$inout4);
+	&lea	($out,&DWP(16*5,$out));
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&lea	($inp,&DWP(16*1,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&lea	($out,&DWP(16*1,$out));
+
+	&movdqa	($tweak,$inout3);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&lea	($inp,&DWP(16*2,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+
+	&call	("_aesni_decrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&lea	($out,&DWP(16*2,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+	&movaps	($inout5,$tweak);		# put aside last tweak
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&lea	($inp,&DWP(16*3,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+
+	&call	("_aesni_decrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&lea	($out,&DWP(16*3,$out));
+
+	&movdqa	($tweak,$inout5);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movups	($inout3,&QWP(16*3,$inp));
+	&lea	($inp,&DWP(16*4,$inp));
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&xorps	($inout3,$inout4);
+
+	&call	("_aesni_decrypt4");
+
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,$inout4);
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&lea	($out,&DWP(16*4,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&and	($len,15);
+	&jz	(&label("xts_dec_ret"));
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&jmp	(&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&pxor	($twtmp,$twtmp);
+	&and	($len,15);
+	&jz	(&label("xts_dec_ret"));
+
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($twmask,&QWP(16*6,"esp"));
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+	&pshufd	($inout3,$twtmp,0x13);
+	&movdqa	($inout4,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($inout3,$twmask);		# isolate carry and residue
+	&pxor	($inout3,$tweak);
+
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds,$rounds_);		# restore $rounds
+
+	&movups	($inout0,&QWP(0,$inp));		# load input
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(0,$out),$inout0);		# write output
+
+&set_label("xts_dec_steal");
+	&movz	($rounds,&BP(16,$inp));
+	&movz	($key,&BP(0,$out));
+	&lea	($inp,&DWP(1,$inp));
+	&mov	(&BP(0,$out),&LB($rounds));
+	&mov	(&BP(16,$out),&LB($key));
+	&lea	($out,&DWP(1,$out));
+	&sub	($len,1);
+	&jnz	(&label("xts_dec_steal"));
+
+	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds,$rounds_);		# restore $rounds
+
+	&movups	($inout0,&QWP(0,$out));		# load input
+	&xorps	($inout0,$inout4);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$inout4);		# output^=tweak
+	&movups	(&QWP(0,$out),$inout0);		# write output
+
+&set_label("xts_dec_ret");
+	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
+&function_end("aesni_xts_decrypt");
+}
+}
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+#                           size_t length, const AES_KEY *key,
+#                           unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+	&mov	($inp,&wparam(0));
+	&mov	($rounds_,"esp");
+	&mov	($out,&wparam(1));
+	&sub	($rounds_,24);
+	&mov	($len,&wparam(2));
+	&and	($rounds_,-16);
+	&mov	($key,&wparam(3));
+	&mov	($key_,&wparam(4));
+	&test	($len,$len);
+	&jz	(&label("cbc_abort"));
+
+	&cmp	(&wparam(5),0);
+	&xchg	($rounds_,"esp");		# alloca
+	&movups	($ivec,&QWP(0,$key_));		# load IV
+	&mov	($rounds,&DWP(240,$key));
+	&mov	($key_,$key);			# backup $key
+	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
+	&mov	($rounds_,$rounds);		# backup $rounds
+	&je	(&label("cbc_decrypt"));
+
+	&movaps	($inout0,$ivec);
+	&cmp	($len,16);
+	&jb	(&label("cbc_enc_tail"));
+	&sub	($len,16);
+	&jmp	(&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+	&movups	($ivec,&QWP(0,$inp));		# input actually
+	&lea	($inp,&DWP(16,$inp));
+	if ($inline)
+	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
+	else
+	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&mov	($key,$key_);		# restore $key
+	&movups	(&QWP(0,$out),$inout0);	# store output
+	&lea	($out,&DWP(16,$out));
+	&sub	($len,16);
+	&jnc	(&label("cbc_enc_loop"));
+	&add	($len,16);
+	&jnz	(&label("cbc_enc_tail"));
+	&movaps	($ivec,$inout0);
+	&jmp	(&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+	&mov	("ecx",$len);		# zaps $rounds
+	&data_word(0xA4F3F689);		# rep movsb
+	&mov	("ecx",16);		# zero tail
+	&sub	("ecx",$len);
+	&xor	("eax","eax");		# zaps $len
+	&data_word(0xAAF3F689);		# rep stosb
+	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&mov	($inp,$out);		# $inp and $out are the same
+	&mov	($key,$key_);		# restore $key
+	&jmp	(&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+	&cmp	($len,0x50);
+	&jbe	(&label("cbc_dec_tail"));
+	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
+	&sub	($len,0x50);
+	&jmp	(&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
+	&movups	(&QWP(0,$out),$inout5);
+	&lea	($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+
+	&call	("_aesni_decrypt6");
+
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
+	&xorps	($inout1,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout2,$rndkey0);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout3,$rndkey1);
+	&movups	($rndkey1,&QWP(0x40,$inp));
+	&xorps	($inout4,$rndkey0);
+	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
+	&xorps	($inout5,$rndkey1);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&lea	($inp,&DWP(0x60,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&mov	($rounds,$rounds_)		# restore $rounds
+	&movups	(&QWP(0x30,$out),$inout3);
+	&mov	($key,$key_);			# restore $key
+	&movups	(&QWP(0x40,$out),$inout4);
+	&lea	($out,&DWP(0x50,$out));
+	&sub	($len,0x60);
+	&ja	(&label("cbc_dec_loop6"));
+
+	&movaps	($inout0,$inout5);
+	&movaps	($ivec,$rndkey0);
+	&add	($len,0x50);
+	&jle	(&label("cbc_dec_tail_collected"));
+	&movups	(&QWP(0,$out),$inout0);
+	&lea	($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+	&movups	($inout0,&QWP(0,$inp));
+	&movaps	($in0,$inout0);
+	&cmp	($len,0x10);
+	&jbe	(&label("cbc_dec_one"));
+
+	&movups	($inout1,&QWP(0x10,$inp));
+	&movaps	($in1,$inout1);
+	&cmp	($len,0x20);
+	&jbe	(&label("cbc_dec_two"));
+
+	&movups	($inout2,&QWP(0x20,$inp));
+	&cmp	($len,0x30);
+	&jbe	(&label("cbc_dec_three"));
+
+	&movups	($inout3,&QWP(0x30,$inp));
+	&cmp	($len,0x40);
+	&jbe	(&label("cbc_dec_four"));
+
+	&movups	($inout4,&QWP(0x40,$inp));
+	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
+	&movups	($inout0,&QWP(0,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_decrypt6");
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
+	&xorps	($inout1,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout2,$rndkey0);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout3,$rndkey1);
+	&movups	($ivec,&QWP(0x40,$inp));	# IV
+	&xorps	($inout4,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&lea	($out,&DWP(0x40,$out));
+	&movaps	($inout0,$inout4);
+	&sub	($len,0x50);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$ivec);
+	&movaps	($ivec,$in0);
+	&sub	($len,0x10);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+	&xorps	($inout2,$inout2);
+	&call	("_aesni_decrypt3");
+	&xorps	($inout0,$ivec);
+	&xorps	($inout1,$in0);
+	&movups	(&QWP(0,$out),$inout0);
+	&movaps	($inout0,$inout1);
+	&lea	($out,&DWP(0x10,$out));
+	&movaps	($ivec,$in1);
+	&sub	($len,0x20);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+	&call	("_aesni_decrypt3");
+	&xorps	($inout0,$ivec);
+	&xorps	($inout1,$in0);
+	&xorps	($inout2,$in1);
+	&movups	(&QWP(0,$out),$inout0);
+	&movaps	($inout0,$inout2);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&lea	($out,&DWP(0x20,$out));
+	&movups	($ivec,&QWP(0x20,$inp));
+	&sub	($len,0x30);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+	&call	("_aesni_decrypt4");
+	&movups	($rndkey1,&QWP(0x10,$inp));
+	&movups	($rndkey0,&QWP(0x20,$inp));
+	&xorps	($inout0,$ivec);
+	&movups	($ivec,&QWP(0x30,$inp));
+	&xorps	($inout1,$in0);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout2,$rndkey1);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&xorps	($inout3,$rndkey0);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&lea	($out,&DWP(0x30,$out));
+	&movaps	($inout0,$inout3);
+	&sub	($len,0x40);
+
+&set_label("cbc_dec_tail_collected");
+	&and	($len,15);
+	&jnz	(&label("cbc_dec_tail_partial"));
+	&movups	(&QWP(0,$out),$inout0);
+	&jmp	(&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+	&movaps	(&QWP(0,"esp"),$inout0);
+	&mov	("ecx",16);
+	&mov	($inp,"esp");
+	&sub	("ecx",$len);
+	&data_word(0xA4F3F689);		# rep movsb
+
+&set_label("cbc_ret");
+	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
+	&mov	($key_,&wparam(4));
+	&movups	(&QWP(0,$key_),$ivec);	# output IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is private interface,
+# input:
+#	"eax"	const unsigned char *userKey
+#	$rounds	int bits
+#	$key	AES_KEY *key
+# output:
+#	"eax"	return code
+#	$round	rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+	&test	("eax","eax");
+	&jz	(&label("bad_pointer"));
+	&test	($key,$key);
+	&jz	(&label("bad_pointer"));
+
+	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
+	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
+	&lea	($key,&DWP(16,$key));
+	&cmp	($rounds,256);
+	&je	(&label("14rounds"));
+	&cmp	($rounds,192);
+	&je	(&label("12rounds"));
+	&cmp	($rounds,128);
+	&jne	(&label("bad_keybits"));
+
+&set_label("10rounds",16);
+	&mov		($rounds,9);
+	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
+	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
+	&call		(&label("key_128_cold"));
+	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
+	&call		(&label("key_128"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(80,$key),$rounds);
+	&xor		("eax","eax");
+	&ret();
+
+&set_label("key_128",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_128_cold");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm1","xmm1",0b11111111);	# critical path
+	&xorps		("xmm0","xmm1");
+	&ret();
+
+&set_label("12rounds",16);
+	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
+	&mov		($rounds,11);
+	&$movekey	(&QWP(-16,$key),"xmm0")		# round 0
+	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
+	&call		(&label("key_192a_cold"));
+	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
+	&call		(&label("key_192b"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(48,$key),$rounds);
+	&xor		("eax","eax");
+	&ret();
+
+&set_label("key_192a",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+	&movaps		("xmm5","xmm2");
+&set_label("key_192b_warm");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&movdqa		("xmm3","xmm2");
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&pslldq		("xmm3",4);
+	&xorps		("xmm0","xmm4");
+	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
+	&pxor		("xmm2","xmm3");
+	&pxor		("xmm0","xmm1");
+	&pshufd		("xmm3","xmm0",0b11111111);
+	&pxor		("xmm2","xmm3");
+	&ret();
+
+&set_label("key_192b",16);
+	&movaps		("xmm3","xmm0");
+	&shufps		("xmm5","xmm0",0b01000100);
+	&$movekey	(&QWP(0,$key),"xmm5");
+	&shufps		("xmm3","xmm2",0b01001110);
+	&$movekey	(&QWP(16,$key),"xmm3");
+	&lea		($key,&DWP(32,$key));
+	&jmp		(&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
+	&mov		($rounds,13);
+	&lea		($key,&DWP(16,$key));
+	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
+	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
+	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
+	&call		(&label("key_256a_cold"));
+	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
+	&call		(&label("key_256a"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(16,$key),$rounds);
+	&xor		("eax","eax");
+	&ret();
+
+&set_label("key_256a",16);
+	&$movekey	(&QWP(0,$key),"xmm2");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm1","xmm1",0b11111111);	# critical path
+	&xorps		("xmm0","xmm1");
+	&ret();
+
+&set_label("key_256b",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+
+	&shufps		("xmm4","xmm2",0b00010000);
+	&xorps		("xmm2","xmm4");
+	&shufps		("xmm4","xmm2",0b10001100);
+	&xorps		("xmm2","xmm4");
+	&shufps		("xmm1","xmm1",0b10101010);	# critical path
+	&xorps		("xmm2","xmm1");
+	&ret();
+
+&set_label("bad_pointer",4);
+	&mov	("eax",-1);
+	&ret	();
+&set_label("bad_keybits",4);
+	&mov	("eax",-2);
+	&ret	();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+#                              AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+	&mov	("eax",&wparam(0));
+	&mov	($rounds,&wparam(1));
+	&mov	($key,&wparam(2));
+	&call	("_aesni_set_encrypt_key");
+	&ret	();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+#                              AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+	&mov	("eax",&wparam(0));
+	&mov	($rounds,&wparam(1));
+	&mov	($key,&wparam(2));
+	&call	("_aesni_set_encrypt_key");
+	&mov	($key,&wparam(2));
+	&shl	($rounds,4)	# rounds-1 after _aesni_set_encrypt_key
+	&test	("eax","eax");
+	&jnz	(&label("dec_key_ret"));
+	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
+
+	&$movekey	("xmm0",&QWP(0,$key));	# just swap
+	&$movekey	("xmm1",&QWP(0,"eax"));
+	&$movekey	(&QWP(0,"eax"),"xmm0");
+	&$movekey	(&QWP(0,$key),"xmm1");
+	&lea		($key,&DWP(16,$key));
+	&lea		("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
+	&$movekey	("xmm1",&QWP(0,"eax"));
+	&aesimc		("xmm0","xmm0");
+	&aesimc		("xmm1","xmm1");
+	&lea		($key,&DWP(16,$key));
+	&lea		("eax",&DWP(-16,"eax"));
+	&$movekey	(&QWP(16,"eax"),"xmm0");
+	&$movekey	(&QWP(-16,$key),"xmm1");
+	&cmp		("eax",$key);
+	&ja		(&label("dec_key_inverse"));
+
+	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
+	&aesimc		("xmm0","xmm0");
+	&$movekey	(&QWP(0,$key),"xmm0");
+
+	&xor		("eax","eax");		# return success
+&set_label("dec_key_ret");
+	&ret	();
+&function_end_B("${PREFIX}_set_decrypt_key");
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();

diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
new file mode 100644
index 0000000..499f3b3
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86_64.pl

@@ -0,0 +1,3068 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
+# details].
+#
+# Performance.
+#
+# Given aes(enc|dec) instructions' latency asymptotic performance for
+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+# processed with 128-bit key. And given their throughput asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# asymptotic limit it's not something you commonly achieve in reality,
+# but how close does one get? Below are results collected for
+# different modes and block sized. Pairs of numbers are for en-/
+# decryption.
+#
+#	16-byte     64-byte     256-byte    1-KB        8-KB
+# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
+# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
+# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
+# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07   
+# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
+# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
+# The results were collected with specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All above results are consistently better. This
+# module also provides better performance for block sizes smaller than
+# 128 bytes in points *not* represented in the above table.
+#
+# Looking at the results for 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because implementation
+# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+# single-block aesni_encrypt, which is not the most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to AES unit the way it's done in CBC mode. There is
+# nothing one can do and the result appears optimal. CCM result is
+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
+# saving output. CCM CTR "stays invisible," because it's neatly
+# interleaved wih CBC-MAC. This provides ~30% improvement over
+# "straghtforward" CCM implementation with CTR and CBC-MAC performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
+# CTR curve doesn't follow this pattern and is "slowest" changing one
+# with "256-byte" result being 87% of "8-KB." This is because overhead
+# in CTR mode is most computationally intensive. Small-block CCM
+# decrypt is slower than encrypt, because first CTR and last CBC-MAC
+# iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with number of
+# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
+# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
+# are a tad smaller, because the above mentioned penalty biases all
+# results by same constant value. In similar way function call
+# overhead affects small-block performance, as well as OFB and CFB
+# results. Differences are not large, most common coefficients are
+# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While Westmere processor features 6 cycles latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with
+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In which case asymptotic limit for such modes
+# can be obtained by dividing above mentioned numbers by AES
+# instructions' interleave factor. Westmere can execute at most 3 
+# instructions at a time, meaning that optimal interleave factor is 3,
+# and that's where the "magic" number of 1.25 come from. "Optimal
+# interleave factor" means that increase of interleave factor does
+# not improve performance. The formula has proven to reflect reality
+# pretty well on Westmere... Sandy Bridge on the other hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor		3x	6x	8x
+# theoretical asymptotic limit		1.67	0.83	0.625
+# measured performance for 8KB block	1.05	0.86	0.84
+#
+# "as if" interleave factor		4.7x	5.8x	6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt				1.16	0.93	0.93
+# CTR					1.14	0.91	n/a
+#
+# Well, given 3x column it's probably inappropriate to call the limit
+# asymptotic, if it can be surpassed, isn't it? What happens there?
+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+# magic is responsible for this. Processor overlaps not only the
+# additional instructions with AES ones, but even AES instuctions
+# processing adjacent triplets of independent blocks. In the 6x case
+# additional instructions  still claim disproportionally small amount
+# of additional cycles, but in 8x case number of instructions must be
+# a tad too high for out-of-order logic to cope with, and AES unit
+# remains underutilized... As you can see 8x interleave is hardly
+# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
+# utilizies 6x interleave because of limited register bank capacity.
+#
+# Higher interleave factors do have negative impact on Westmere
+# performance. While for ECB mode it's negligible ~1.5%, other
+# parallelizables perform ~5% worse, which is outweighed by ~25%
+# improvement on Sandy Bridge. To balance regression on Westmere
+# CTR mode was implemented with 6x aesenc interleave factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# in CTR mode AES instruction interleave factor was chosen to be 6x.
+
+$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
+			# generates drop-in replacement for
+			# crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+		("%rdi","%rsi","%rdx","%rcx");	# Unix order
+
+$code=".text\n";
+
+$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8";	# cbc, ctr, ...
+
+$rnds_="%r10d";	# backup copy for $rounds
+$key_="%r11";	# backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0";	$rndkey1="%xmm1";
+$inout0="%xmm2";	$inout1="%xmm3";
+$inout2="%xmm4";	$inout3="%xmm5";
+$inout4="%xmm6";	$inout5="%xmm7";
+$inout6="%xmm8";	$inout7="%xmm9";
+
+$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
+$in0="%xmm8";		$iv="%xmm9";
+
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
+# cycles which take care of loop variables...
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+	$movkey	($key),$rndkey0
+	$movkey	16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+	xorps	$rndkey0,$ivec
+	lea	32($key),$key
+	xorps	$ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+	lea	32($key),$key
+	xorps	$rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+	aes${p}	$rndkey1,$inout
+	dec	$rounds
+	$movkey	($key),$rndkey1
+	lea	16($key),$key
+	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
+	aes${p}last	$rndkey1,$inout
+___
+}}
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl	${PREFIX}_encrypt
+.type	${PREFIX}_encrypt,\@abi-omnipotent
+.align	16
+${PREFIX}_encrypt:
+	movups	($inp),$inout0		# load input
+	mov	240($key),$rounds	# key->rounds
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	movups	$inout0,($out)		# output
+	ret
+.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl	${PREFIX}_decrypt
+.type	${PREFIX}_decrypt,\@abi-omnipotent
+.align	16
+${PREFIX}_decrypt:
+	movups	($inp),$inout0		# load input
+	mov	240($key),$rounds	# key->rounds
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	movups	$inout0,($out)		# output
+	ret
+.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x...
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt3,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt3:
+	$movkey	($key),$rndkey0
+	shr	\$1,$rounds
+	$movkey	16($key),$rndkey1
+	lea	32($key),$key
+	xorps	$rndkey0,$inout0
+	xorps	$rndkey0,$inout1
+	xorps	$rndkey0,$inout2
+	$movkey		($key),$rndkey0
+
+.L${dir}_loop3:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout2
+	$movkey		16($key),$rndkey1
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	lea		32($key),$key
+	aes${dir}	$rndkey0,$inout2
+	$movkey		($key),$rndkey0
+	jnz		.L${dir}_loop3
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	aes${dir}last	$rndkey0,$inout2
+	ret
+.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt4,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt4:
+	$movkey	($key),$rndkey0
+	shr	\$1,$rounds
+	$movkey	16($key),$rndkey1
+	lea	32($key),$key
+	xorps	$rndkey0,$inout0
+	xorps	$rndkey0,$inout1
+	xorps	$rndkey0,$inout2
+	xorps	$rndkey0,$inout3
+	$movkey	($key),$rndkey0
+
+.L${dir}_loop4:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	$movkey		16($key),$rndkey1
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	lea		32($key),$key
+	aes${dir}	$rndkey0,$inout2
+	aes${dir}	$rndkey0,$inout3
+	$movkey		($key),$rndkey0
+	jnz		.L${dir}_loop4
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	aes${dir}last	$rndkey0,$inout2
+	aes${dir}last	$rndkey0,$inout3
+	ret
+.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt6,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt6:
+	$movkey		($key),$rndkey0
+	shr		\$1,$rounds
+	$movkey		16($key),$rndkey1
+	lea		32($key),$key
+	xorps		$rndkey0,$inout0
+	pxor		$rndkey0,$inout1
+	aes${dir}	$rndkey1,$inout0
+	pxor		$rndkey0,$inout2
+	aes${dir}	$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	aes${dir}	$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	aes${dir}	$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout4
+	$movkey		($key),$rndkey0
+	aes${dir}	$rndkey1,$inout5
+	jmp		.L${dir}_loop6_enter
+.align	16
+.L${dir}_loop6:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+.L${dir}_loop6_enter:				# happens to be 16-byte aligned
+	$movkey		16($key),$rndkey1
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	lea		32($key),$key
+	aes${dir}	$rndkey0,$inout2
+	aes${dir}	$rndkey0,$inout3
+	aes${dir}	$rndkey0,$inout4
+	aes${dir}	$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.L${dir}_loop6
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	aes${dir}last	$rndkey0,$inout2
+	aes${dir}last	$rndkey0,$inout3
+	aes${dir}last	$rndkey0,$inout4
+	aes${dir}last	$rndkey0,$inout5
+	ret
+.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt8,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt8:
+	$movkey		($key),$rndkey0
+	shr		\$1,$rounds
+	$movkey		16($key),$rndkey1
+	lea		32($key),$key
+	xorps		$rndkey0,$inout0
+	xorps		$rndkey0,$inout1
+	aes${dir}	$rndkey1,$inout0
+	pxor		$rndkey0,$inout2
+	aes${dir}	$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	aes${dir}	$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	aes${dir}	$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout4
+	pxor		$rndkey0,$inout6
+	aes${dir}	$rndkey1,$inout5
+	pxor		$rndkey0,$inout7
+	$movkey		($key),$rndkey0
+	aes${dir}	$rndkey1,$inout6
+	aes${dir}	$rndkey1,$inout7
+	$movkey		16($key),$rndkey1
+	jmp		.L${dir}_loop8_enter
+.align	16
+.L${dir}_loop8:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+	aes${dir}	$rndkey1,$inout6
+	aes${dir}	$rndkey1,$inout7
+	$movkey		16($key),$rndkey1
+.L${dir}_loop8_enter:				# happens to be 16-byte aligned
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	lea		32($key),$key
+	aes${dir}	$rndkey0,$inout2
+	aes${dir}	$rndkey0,$inout3
+	aes${dir}	$rndkey0,$inout4
+	aes${dir}	$rndkey0,$inout5
+	aes${dir}	$rndkey0,$inout6
+	aes${dir}	$rndkey0,$inout7
+	$movkey		($key),$rndkey0
+	jnz		.L${dir}_loop8
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+	aes${dir}	$rndkey1,$inout6
+	aes${dir}	$rndkey1,$inout7
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	aes${dir}last	$rndkey0,$inout2
+	aes${dir}last	$rndkey0,$inout3
+	aes${dir}last	$rndkey0,$inout4
+	aes${dir}last	$rndkey0,$inout5
+	aes${dir}last	$rndkey0,$inout6
+	aes${dir}last	$rndkey0,$inout7
+	ret
+.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+#			  size_t length, const AES_KEY *key,
+#			  int enc);
+$code.=<<___;
+.globl	aesni_ecb_encrypt
+.type	aesni_ecb_encrypt,\@function,5
+.align	16
+aesni_ecb_encrypt:
+	and	\$-16,$len
+	jz	.Lecb_ret
+
+	mov	240($key),$rounds	# key->rounds
+	$movkey	($key),$rndkey0
+	mov	$key,$key_		# backup $key
+	mov	$rounds,$rnds_		# backup $rounds
+	test	%r8d,%r8d		# 5th argument
+	jz	.Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+	cmp	\$0x80,$len
+	jb	.Lecb_enc_tail
+
+	movdqu	($inp),$inout0
+	movdqu	0x10($inp),$inout1
+	movdqu	0x20($inp),$inout2
+	movdqu	0x30($inp),$inout3
+	movdqu	0x40($inp),$inout4
+	movdqu	0x50($inp),$inout5
+	movdqu	0x60($inp),$inout6
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+	sub	\$0x80,$len
+	jmp	.Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+	movups	$inout0,($out)
+	mov	$key_,$key		# restore $key
+	movdqu	($inp),$inout0
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout1,0x10($out)
+	movdqu	0x10($inp),$inout1
+	movups	$inout2,0x20($out)
+	movdqu	0x20($inp),$inout2
+	movups	$inout3,0x30($out)
+	movdqu	0x30($inp),$inout3
+	movups	$inout4,0x40($out)
+	movdqu	0x40($inp),$inout4
+	movups	$inout5,0x50($out)
+	movdqu	0x50($inp),$inout5
+	movups	$inout6,0x60($out)
+	movdqu	0x60($inp),$inout6
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+.Lecb_enc_loop8_enter:
+
+	call	_aesni_encrypt8
+
+	sub	\$0x80,$len
+	jnc	.Lecb_enc_loop8
+
+	movups	$inout0,($out)
+	mov	$key_,$key		# restore $key
+	movups	$inout1,0x10($out)
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	add	\$0x80,$len
+	jz	.Lecb_ret
+
+.Lecb_enc_tail:
+	movups	($inp),$inout0
+	cmp	\$0x20,$len
+	jb	.Lecb_enc_one
+	movups	0x10($inp),$inout1
+	je	.Lecb_enc_two
+	movups	0x20($inp),$inout2
+	cmp	\$0x40,$len
+	jb	.Lecb_enc_three
+	movups	0x30($inp),$inout3
+	je	.Lecb_enc_four
+	movups	0x40($inp),$inout4
+	cmp	\$0x60,$len
+	jb	.Lecb_enc_five
+	movups	0x50($inp),$inout5
+	je	.Lecb_enc_six
+	movdqu	0x60($inp),$inout6
+	call	_aesni_encrypt8
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_one:
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	movups	$inout0,($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_two:
+	xorps	$inout2,$inout2
+	call	_aesni_encrypt3
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_three:
+	call	_aesni_encrypt3
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_four:
+	call	_aesni_encrypt4
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_five:
+	xorps	$inout5,$inout5
+	call	_aesni_encrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_six:
+	call	_aesni_encrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	jmp	.Lecb_ret
+#--------------------------- ECB DECRYPT ------------------------------#
+.align	16
+.Lecb_decrypt:
+	cmp	\$0x80,$len
+	jb	.Lecb_dec_tail
+
+	movdqu	($inp),$inout0
+	movdqu	0x10($inp),$inout1
+	movdqu	0x20($inp),$inout2
+	movdqu	0x30($inp),$inout3
+	movdqu	0x40($inp),$inout4
+	movdqu	0x50($inp),$inout5
+	movdqu	0x60($inp),$inout6
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+	sub	\$0x80,$len
+	jmp	.Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+	movups	$inout0,($out)
+	mov	$key_,$key		# restore $key
+	movdqu	($inp),$inout0
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout1,0x10($out)
+	movdqu	0x10($inp),$inout1
+	movups	$inout2,0x20($out)
+	movdqu	0x20($inp),$inout2
+	movups	$inout3,0x30($out)
+	movdqu	0x30($inp),$inout3
+	movups	$inout4,0x40($out)
+	movdqu	0x40($inp),$inout4
+	movups	$inout5,0x50($out)
+	movdqu	0x50($inp),$inout5
+	movups	$inout6,0x60($out)
+	movdqu	0x60($inp),$inout6
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+.Lecb_dec_loop8_enter:
+
+	call	_aesni_decrypt8
+
+	$movkey	($key_),$rndkey0
+	sub	\$0x80,$len
+	jnc	.Lecb_dec_loop8
+
+	movups	$inout0,($out)
+	mov	$key_,$key		# restore $key
+	movups	$inout1,0x10($out)
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	add	\$0x80,$len
+	jz	.Lecb_ret
+
+.Lecb_dec_tail:
+	movups	($inp),$inout0
+	cmp	\$0x20,$len
+	jb	.Lecb_dec_one
+	movups	0x10($inp),$inout1
+	je	.Lecb_dec_two
+	movups	0x20($inp),$inout2
+	cmp	\$0x40,$len
+	jb	.Lecb_dec_three
+	movups	0x30($inp),$inout3
+	je	.Lecb_dec_four
+	movups	0x40($inp),$inout4
+	cmp	\$0x60,$len
+	jb	.Lecb_dec_five
+	movups	0x50($inp),$inout5
+	je	.Lecb_dec_six
+	movups	0x60($inp),$inout6
+	$movkey	($key),$rndkey0
+	call	_aesni_decrypt8
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_one:
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	movups	$inout0,($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_two:
+	xorps	$inout2,$inout2
+	call	_aesni_decrypt3
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_three:
+	call	_aesni_decrypt3
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_four:
+	call	_aesni_decrypt4
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_five:
+	xorps	$inout5,$inout5
+	call	_aesni_decrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_six:
+	call	_aesni_decrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+
+.Lecb_ret:
+	ret
+.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{
+my $cmac="%r9";	# 6th argument
+
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl	aesni_ccm64_encrypt_blocks
+.type	aesni_ccm64_encrypt_blocks,\@function,6
+.align	16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+	lea	-0x58(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+.Lccm64_enc_body:
+___
+$code.=<<___;
+	mov	240($key),$rounds		# key->rounds
+	movdqu	($ivp),$iv
+	movdqa	.Lincrement64(%rip),$increment
+	movdqa	.Lbswap_mask(%rip),$bswap_mask
+
+	shr	\$1,$rounds
+	lea	0($key),$key_
+	movdqu	($cmac),$inout1
+	movdqa	$iv,$inout0
+	mov	$rounds,$rnds_
+	pshufb	$bswap_mask,$iv
+	jmp	.Lccm64_enc_outer
+.align	16
+.Lccm64_enc_outer:
+	$movkey	($key_),$rndkey0
+	mov	$rnds_,$rounds
+	movups	($inp),$in0			# load inp
+
+	xorps	$rndkey0,$inout0		# counter
+	$movkey	16($key_),$rndkey1
+	xorps	$in0,$rndkey0
+	lea	32($key_),$key
+	xorps	$rndkey0,$inout1		# cmac^=inp
+	$movkey	($key),$rndkey0
+
+.Lccm64_enc2_loop:
+	aesenc	$rndkey1,$inout0
+	dec	$rounds
+	aesenc	$rndkey1,$inout1
+	$movkey	16($key),$rndkey1
+	aesenc	$rndkey0,$inout0
+	lea	32($key),$key
+	aesenc	$rndkey0,$inout1
+	$movkey	0($key),$rndkey0
+	jnz	.Lccm64_enc2_loop
+	aesenc	$rndkey1,$inout0
+	aesenc	$rndkey1,$inout1
+	paddq	$increment,$iv
+	aesenclast	$rndkey0,$inout0
+	aesenclast	$rndkey0,$inout1
+
+	dec	$len
+	lea	16($inp),$inp
+	xorps	$inout0,$in0			# inp ^= E(iv)
+	movdqa	$iv,$inout0
+	movups	$in0,($out)			# save output
+	lea	16($out),$out
+	pshufb	$bswap_mask,$inout0
+	jnz	.Lccm64_enc_outer
+
+	movups	$inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	lea	0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+	ret
+.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl	aesni_ccm64_decrypt_blocks
+.type	aesni_ccm64_decrypt_blocks,\@function,6
+.align	16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+	lea	-0x58(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+.Lccm64_dec_body:
+___
+$code.=<<___;
+	mov	240($key),$rounds		# key->rounds
+	movups	($ivp),$iv
+	movdqu	($cmac),$inout1
+	movdqa	.Lincrement64(%rip),$increment
+	movdqa	.Lbswap_mask(%rip),$bswap_mask
+
+	movaps	$iv,$inout0
+	mov	$rounds,$rnds_
+	mov	$key,$key_
+	pshufb	$bswap_mask,$iv
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	movups	($inp),$in0			# load inp
+	paddq	$increment,$iv
+	lea	16($inp),$inp
+	jmp	.Lccm64_dec_outer
+.align	16
+.Lccm64_dec_outer:
+	xorps	$inout0,$in0			# inp ^= E(iv)
+	movdqa	$iv,$inout0
+	mov	$rnds_,$rounds
+	movups	$in0,($out)			# save output
+	lea	16($out),$out
+	pshufb	$bswap_mask,$inout0
+
+	sub	\$1,$len
+	jz	.Lccm64_dec_break
+
+	$movkey	($key_),$rndkey0
+	shr	\$1,$rounds
+	$movkey	16($key_),$rndkey1
+	xorps	$rndkey0,$in0
+	lea	32($key_),$key
+	xorps	$rndkey0,$inout0
+	xorps	$in0,$inout1			# cmac^=out
+	$movkey	($key),$rndkey0
+
+.Lccm64_dec2_loop:
+	aesenc	$rndkey1,$inout0
+	dec	$rounds
+	aesenc	$rndkey1,$inout1
+	$movkey	16($key),$rndkey1
+	aesenc	$rndkey0,$inout0
+	lea	32($key),$key
+	aesenc	$rndkey0,$inout1
+	$movkey	0($key),$rndkey0
+	jnz	.Lccm64_dec2_loop
+	movups	($inp),$in0			# load inp
+	paddq	$increment,$iv
+	aesenc	$rndkey1,$inout0
+	aesenc	$rndkey1,$inout1
+	lea	16($inp),$inp
+	aesenclast	$rndkey0,$inout0
+	aesenclast	$rndkey0,$inout1
+	jmp	.Lccm64_dec_outer
+
+.align	16
+.Lccm64_dec_break:
+	#xorps	$in0,$inout1			# cmac^=out
+___
+	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+	movups	$inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	lea	0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+	ret
+.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+{
+my $reserved = $win64?0:-0x28;
+my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
+my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
+my $bswap_mask="%xmm15";
+
+$code.=<<___;
+.globl	aesni_ctr32_encrypt_blocks
+.type	aesni_ctr32_encrypt_blocks,\@function,5
+.align	16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+	lea	-0xc8(%rsp),%rsp
+	movaps	%xmm6,0x20(%rsp)
+	movaps	%xmm7,0x30(%rsp)
+	movaps	%xmm8,0x40(%rsp)
+	movaps	%xmm9,0x50(%rsp)
+	movaps	%xmm10,0x60(%rsp)
+	movaps	%xmm11,0x70(%rsp)
+	movaps	%xmm12,0x80(%rsp)
+	movaps	%xmm13,0x90(%rsp)
+	movaps	%xmm14,0xa0(%rsp)
+	movaps	%xmm15,0xb0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+	cmp	\$1,$len
+	je	.Lctr32_one_shortcut
+
+	movdqu	($ivp),$ivec
+	movdqa	.Lbswap_mask(%rip),$bswap_mask
+	xor	$rounds,$rounds
+	pextrd	\$3,$ivec,$rnds_		# pull 32-bit counter
+	pinsrd	\$3,$rounds,$ivec		# wipe 32-bit counter
+
+	mov	240($key),$rounds		# key->rounds
+	bswap	$rnds_
+	pxor	$iv0,$iv0			# vector of 3 32-bit counters
+	pxor	$iv1,$iv1			# vector of 3 32-bit counters
+	pinsrd	\$0,$rnds_,$iv0
+	lea	3($rnds_),$key_
+	pinsrd	\$0,$key_,$iv1
+	inc	$rnds_
+	pinsrd	\$1,$rnds_,$iv0
+	inc	$key_
+	pinsrd	\$1,$key_,$iv1
+	inc	$rnds_
+	pinsrd	\$2,$rnds_,$iv0
+	inc	$key_
+	pinsrd	\$2,$key_,$iv1
+	movdqa	$iv0,$reserved(%rsp)
+	pshufb	$bswap_mask,$iv0
+	movdqa	$iv1,`$reserved+0x10`(%rsp)
+	pshufb	$bswap_mask,$iv1
+
+	pshufd	\$`3<<6`,$iv0,$inout0		# place counter to upper dword
+	pshufd	\$`2<<6`,$iv0,$inout1
+	pshufd	\$`1<<6`,$iv0,$inout2
+	cmp	\$6,$len
+	jb	.Lctr32_tail
+	shr	\$1,$rounds
+	mov	$key,$key_			# backup $key
+	mov	$rounds,$rnds_			# backup $rounds
+	sub	\$6,$len
+	jmp	.Lctr32_loop6
+
+.align	16
+.Lctr32_loop6:
+	pshufd	\$`3<<6`,$iv1,$inout3
+	por	$ivec,$inout0			# merge counter-less ivec
+	 $movkey	($key_),$rndkey0
+	pshufd	\$`2<<6`,$iv1,$inout4
+	por	$ivec,$inout1
+	 $movkey	16($key_),$rndkey1
+	pshufd	\$`1<<6`,$iv1,$inout5
+	por	$ivec,$inout2
+	por	$ivec,$inout3
+	 xorps		$rndkey0,$inout0
+	por	$ivec,$inout4
+	por	$ivec,$inout5
+
+	# inline _aesni_encrypt6 and interleave last rounds
+	# with own code...
+
+	pxor		$rndkey0,$inout1
+	aesenc		$rndkey1,$inout0
+	lea		32($key_),$key
+	pxor		$rndkey0,$inout2
+	aesenc		$rndkey1,$inout1
+	 movdqa		.Lincrement32(%rip),$iv1
+	pxor		$rndkey0,$inout3
+	aesenc		$rndkey1,$inout2
+	 movdqa		$reserved(%rsp),$iv0
+	pxor		$rndkey0,$inout4
+	aesenc		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	dec		$rounds
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+	jmp		.Lctr32_enc_loop6_enter
+.align	16
+.Lctr32_enc_loop6:
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	dec		$rounds
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+.Lctr32_enc_loop6_enter:
+	$movkey		16($key),$rndkey1
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	lea		32($key),$key
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.Lctr32_enc_loop6
+
+	aesenc		$rndkey1,$inout0
+	 paddd		$iv1,$iv0		# increment counter vector
+	aesenc		$rndkey1,$inout1
+	 paddd		`$reserved+0x10`(%rsp),$iv1
+	aesenc		$rndkey1,$inout2
+	 movdqa		$iv0,$reserved(%rsp)	# save counter vector
+	aesenc		$rndkey1,$inout3
+	 movdqa		$iv1,`$reserved+0x10`(%rsp)
+	aesenc		$rndkey1,$inout4
+	 pshufb		$bswap_mask,$iv0	# byte swap
+	aesenc		$rndkey1,$inout5
+	 pshufb		$bswap_mask,$iv1
+
+	aesenclast	$rndkey0,$inout0
+	 movups		($inp),$in0		# load input
+	aesenclast	$rndkey0,$inout1
+	 movups		0x10($inp),$in1
+	aesenclast	$rndkey0,$inout2
+	 movups		0x20($inp),$in2
+	aesenclast	$rndkey0,$inout3
+	 movups		0x30($inp),$in3
+	aesenclast	$rndkey0,$inout4
+	 movups		0x40($inp),$rndkey1
+	aesenclast	$rndkey0,$inout5
+	 movups		0x50($inp),$rndkey0
+	 lea	0x60($inp),$inp
+
+	xorps	$inout0,$in0			# xor
+	 pshufd	\$`3<<6`,$iv0,$inout0
+	xorps	$inout1,$in1
+	 pshufd	\$`2<<6`,$iv0,$inout1
+	movups	$in0,($out)			# store output
+	xorps	$inout2,$in2
+	 pshufd	\$`1<<6`,$iv0,$inout2
+	movups	$in1,0x10($out)
+	xorps	$inout3,$in3
+	movups	$in2,0x20($out)
+	xorps	$inout4,$rndkey1
+	movups	$in3,0x30($out)
+	xorps	$inout5,$rndkey0
+	movups	$rndkey1,0x40($out)
+	movups	$rndkey0,0x50($out)
+	lea	0x60($out),$out
+	mov	$rnds_,$rounds
+	sub	\$6,$len
+	jnc	.Lctr32_loop6
+
+	add	\$6,$len
+	jz	.Lctr32_done
+	mov	$key_,$key			# restore $key
+	lea	1($rounds,$rounds),$rounds	# restore original value
+
+.Lctr32_tail:
+	por	$ivec,$inout0
+	movups	($inp),$in0
+	cmp	\$2,$len
+	jb	.Lctr32_one
+
+	por	$ivec,$inout1
+	movups	0x10($inp),$in1
+	je	.Lctr32_two
+
+	pshufd	\$`3<<6`,$iv1,$inout3
+	por	$ivec,$inout2
+	movups	0x20($inp),$in2
+	cmp	\$4,$len
+	jb	.Lctr32_three
+
+	pshufd	\$`2<<6`,$iv1,$inout4
+	por	$ivec,$inout3
+	movups	0x30($inp),$in3
+	je	.Lctr32_four
+
+	por	$ivec,$inout4
+	xorps	$inout5,$inout5
+
+	call	_aesni_encrypt6
+
+	movups	0x40($inp),$rndkey1
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	xorps	$inout2,$in2
+	movups	$in1,0x10($out)
+	xorps	$inout3,$in3
+	movups	$in2,0x20($out)
+	xorps	$inout4,$rndkey1
+	movups	$in3,0x30($out)
+	movups	$rndkey1,0x40($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_one_shortcut:
+	movups	($ivp),$inout0
+	movups	($inp),$in0
+	mov	240($key),$rounds		# key->rounds
+.Lctr32_one:
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	xorps	$inout0,$in0
+	movups	$in0,($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_two:
+	xorps	$inout2,$inout2
+	call	_aesni_encrypt3
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	movups	$in1,0x10($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_three:
+	call	_aesni_encrypt3
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	xorps	$inout2,$in2
+	movups	$in1,0x10($out)
+	movups	$in2,0x20($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_four:
+	call	_aesni_encrypt4
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	xorps	$inout2,$in2
+	movups	$in1,0x10($out)
+	xorps	$inout3,$in3
+	movups	$in2,0x20($out)
+	movups	$in3,0x30($out)
+
+.Lctr32_done:
+___
+$code.=<<___ if ($win64);
+	movaps	0x20(%rsp),%xmm6
+	movaps	0x30(%rsp),%xmm7
+	movaps	0x40(%rsp),%xmm8
+	movaps	0x50(%rsp),%xmm9
+	movaps	0x60(%rsp),%xmm10
+	movaps	0x70(%rsp),%xmm11
+	movaps	0x80(%rsp),%xmm12
+	movaps	0x90(%rsp),%xmm13
+	movaps	0xa0(%rsp),%xmm14
+	movaps	0xb0(%rsp),%xmm15
+	lea	0xc8(%rsp),%rsp
+.Lctr32_ret:
+___
+$code.=<<___;
+	ret
+.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2
+#	const unsigned char iv[16]);
+#
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x68 + ($win64?160:0);
+
+$code.=<<___;
+.globl	aesni_xts_encrypt
+.type	aesni_xts_encrypt,\@function,6
+.align	16
+aesni_xts_encrypt:
+	lea	-$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,0x60(%rsp)
+	movaps	%xmm7,0x70(%rsp)
+	movaps	%xmm8,0x80(%rsp)
+	movaps	%xmm9,0x90(%rsp)
+	movaps	%xmm10,0xa0(%rsp)
+	movaps	%xmm11,0xb0(%rsp)
+	movaps	%xmm12,0xc0(%rsp)
+	movaps	%xmm13,0xd0(%rsp)
+	movaps	%xmm14,0xe0(%rsp)
+	movaps	%xmm15,0xf0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+	movups	($ivp),@tweak[5]		# load clear-text tweak
+	mov	240(%r8),$rounds		# key2->rounds
+	mov	240($key),$rnds_		# key1->rounds
+___
+	# generate the tweak
+	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+	mov	$key,$key_			# backup $key
+	mov	$rnds_,$rounds			# backup $rounds
+	mov	$len,$len_			# backup $len
+	and	\$-16,$len
+
+	movdqa	.Lxts_magic(%rip),$twmask
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+___
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[$i]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+___
+    }
+$code.=<<___;
+	sub	\$16*6,$len
+	jc	.Lxts_enc_short
+
+	shr	\$1,$rounds
+	sub	\$1,$rounds
+	mov	$rounds,$rnds_
+	jmp	.Lxts_enc_grandloop
+
+.align	16
+.Lxts_enc_grandloop:
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	movdqu	`16*0`($inp),$inout0		# load input
+	pand	$twmask,$twres			# isolate carry and residue
+	movdqu	`16*1`($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	`16*2`($inp),$inout2
+	pxor	@tweak[0],$inout0		# input^=tweak
+	movdqu	`16*3`($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	`16*4`($inp),$inout4
+	pxor	@tweak[2],$inout2
+	movdqu	`16*5`($inp),$inout5
+	lea	`16*6`($inp),$inp
+	pxor	@tweak[3],$inout3
+	$movkey		($key_),$rndkey0
+	pxor	@tweak[4],$inout4
+	pxor	@tweak[5],$inout5
+
+	# inline _aesni_encrypt6 and interleave first and last rounds
+	# with own code...
+	$movkey		16($key_),$rndkey1
+	pxor		$rndkey0,$inout0
+	pxor		$rndkey0,$inout1
+	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
+	aesenc		$rndkey1,$inout0
+	lea		32($key_),$key
+	pxor		$rndkey0,$inout2
+	 movdqa	@tweak[1],`16*1`(%rsp)
+	aesenc		$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	 movdqa	@tweak[2],`16*2`(%rsp)
+	aesenc		$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	 movdqa	@tweak[3],`16*3`(%rsp)
+	aesenc		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	dec		$rounds
+	 movdqa	@tweak[4],`16*4`(%rsp)
+	aesenc		$rndkey1,$inout4
+	 movdqa	@tweak[5],`16*5`(%rsp)
+	aesenc		$rndkey1,$inout5
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp
+	jmp		.Lxts_enc_loop6_enter
+
+.align	16
+.Lxts_enc_loop6:
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	dec		$rounds
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+.Lxts_enc_loop6_enter:
+	$movkey		16($key),$rndkey1
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	lea		32($key),$key
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.Lxts_enc_loop6
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenc		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenc		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	 aesenc		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenc		$rndkey1,$inout3
+	 aesenc		$rndkey1,$inout4
+	 aesenc		$rndkey1,$inout5
+	 $movkey	16($key),$rndkey1
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[0]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenc		$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenc		$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesenc		$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenc		$rndkey0,$inout3
+	 aesenc		$rndkey0,$inout4
+	 aesenc		$rndkey0,$inout5
+	 $movkey	32($key),$rndkey0
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[1]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenc		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenc		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesenc		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenc		$rndkey1,$inout3
+	 aesenc		$rndkey1,$inout4
+	 aesenc		$rndkey1,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[2]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenclast	$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenclast	$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesenclast	$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenclast	$rndkey0,$inout3
+	 aesenclast	$rndkey0,$inout4
+	 aesenclast	$rndkey0,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[3]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	 xorps	`16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+
+	xorps	`16*2`(%rsp),$inout2
+	movups	$inout0,`16*0`($out)		# write output
+	xorps	`16*3`(%rsp),$inout3
+	movups	$inout1,`16*1`($out)
+	xorps	`16*4`(%rsp),$inout4
+	movups	$inout2,`16*2`($out)
+	xorps	`16*5`(%rsp),$inout5
+	movups	$inout3,`16*3`($out)
+	mov	$rnds_,$rounds			# restore $rounds
+	movups	$inout4,`16*4`($out)
+	movups	$inout5,`16*5`($out)
+	lea	`16*6`($out),$out
+	sub	\$16*6,$len
+	jnc	.Lxts_enc_grandloop
+
+	lea	3($rounds,$rounds),$rounds	# restore original value
+	mov	$key_,$key			# restore $key
+	mov	$rounds,$rnds_			# backup $rounds
+
+.Lxts_enc_short:
+	add	\$16*6,$len
+	jz	.Lxts_enc_done
+
+	cmp	\$0x20,$len
+	jb	.Lxts_enc_one
+	je	.Lxts_enc_two
+
+	cmp	\$0x40,$len
+	jb	.Lxts_enc_three
+	je	.Lxts_enc_four
+
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	 movdqu	($inp),$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 movdqu	16*1($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	16*2($inp),$inout2
+	pxor	@tweak[0],$inout0
+	movdqu	16*3($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	16*4($inp),$inout4
+	lea	16*5($inp),$inp
+	pxor	@tweak[2],$inout2
+	pxor	@tweak[3],$inout3
+	pxor	@tweak[4],$inout4
+
+	call	_aesni_encrypt6
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[5],@tweak[0]
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movdqu	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movdqu	$inout1,16*1($out)
+	xorps	@tweak[4],$inout4
+	movdqu	$inout2,16*2($out)
+	movdqu	$inout3,16*3($out)
+	movdqu	$inout4,16*4($out)
+	lea	16*5($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_one:
+	movups	($inp),$inout0
+	lea	16*1($inp),$inp
+	xorps	@tweak[0],$inout0
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[1],@tweak[0]
+	movups	$inout0,($out)
+	lea	16*1($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_two:
+	movups	($inp),$inout0
+	movups	16($inp),$inout1
+	lea	32($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+
+	call	_aesni_encrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[2],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movups	$inout0,($out)
+	movups	$inout1,16*1($out)
+	lea	16*2($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_three:
+	movups	($inp),$inout0
+	movups	16*1($inp),$inout1
+	movups	16*2($inp),$inout2
+	lea	16*3($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+
+	call	_aesni_encrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[3],@tweak[0]
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movups	$inout0,($out)
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	lea	16*3($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_four:
+	movups	($inp),$inout0
+	movups	16*1($inp),$inout1
+	movups	16*2($inp),$inout2
+	xorps	@tweak[0],$inout0
+	movups	16*3($inp),$inout3
+	lea	16*4($inp),$inp
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	xorps	@tweak[3],$inout3
+
+	call	_aesni_encrypt4
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[5],@tweak[0]
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movups	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	movups	$inout3,16*3($out)
+	lea	16*4($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_done:
+	and	\$15,$len_
+	jz	.Lxts_enc_ret
+	mov	$len_,$len
+
+.Lxts_enc_steal:
+	movzb	($inp),%eax			# borrow $rounds ...
+	movzb	-16($out),%ecx			# ... and $key
+	lea	1($inp),$inp
+	mov	%al,-16($out)
+	mov	%cl,0($out)
+	lea	1($out),$out
+	sub	\$1,$len
+	jnz	.Lxts_enc_steal
+
+	sub	$len_,$out			# rewind $out
+	mov	$key_,$key			# restore $key
+	mov	$rnds_,$rounds			# restore $rounds
+
+	movups	-16($out),$inout0
+	xorps	@tweak[0],$inout0
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[0],$inout0
+	movups	$inout0,-16($out)
+
+.Lxts_enc_ret:
+___
+$code.=<<___ if ($win64);
+	movaps	0x60(%rsp),%xmm6
+	movaps	0x70(%rsp),%xmm7
+	movaps	0x80(%rsp),%xmm8
+	movaps	0x90(%rsp),%xmm9
+	movaps	0xa0(%rsp),%xmm10
+	movaps	0xb0(%rsp),%xmm11
+	movaps	0xc0(%rsp),%xmm12
+	movaps	0xd0(%rsp),%xmm13
+	movaps	0xe0(%rsp),%xmm14
+	movaps	0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	$frame_size(%rsp),%rsp
+.Lxts_enc_epilogue:
+	ret
+.size	aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl	aesni_xts_decrypt
+.type	aesni_xts_decrypt,\@function,6
+.align	16
+aesni_xts_decrypt:
+	lea	-$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,0x60(%rsp)
+	movaps	%xmm7,0x70(%rsp)
+	movaps	%xmm8,0x80(%rsp)
+	movaps	%xmm9,0x90(%rsp)
+	movaps	%xmm10,0xa0(%rsp)
+	movaps	%xmm11,0xb0(%rsp)
+	movaps	%xmm12,0xc0(%rsp)
+	movaps	%xmm13,0xd0(%rsp)
+	movaps	%xmm14,0xe0(%rsp)
+	movaps	%xmm15,0xf0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+	movups	($ivp),@tweak[5]		# load clear-text tweak
+	mov	240($key2),$rounds		# key2->rounds
+	mov	240($key),$rnds_		# key1->rounds
+___
+	# generate the tweak
+	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+	xor	%eax,%eax			# if ($len%16) len-=16;
+	test	\$15,$len
+	setnz	%al
+	shl	\$4,%rax
+	sub	%rax,$len
+
+	mov	$key,$key_			# backup $key
+	mov	$rnds_,$rounds			# backup $rounds
+	mov	$len,$len_			# backup $len
+	and	\$-16,$len
+
+	movdqa	.Lxts_magic(%rip),$twmask
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+___
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[$i]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+___
+    }
+$code.=<<___;
+	sub	\$16*6,$len
+	jc	.Lxts_dec_short
+
+	shr	\$1,$rounds
+	sub	\$1,$rounds
+	mov	$rounds,$rnds_
+	jmp	.Lxts_dec_grandloop
+
+.align	16
+.Lxts_dec_grandloop:
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	movdqu	`16*0`($inp),$inout0		# load input
+	pand	$twmask,$twres			# isolate carry and residue
+	movdqu	`16*1`($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	`16*2`($inp),$inout2
+	pxor	@tweak[0],$inout0		# input^=tweak
+	movdqu	`16*3`($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	`16*4`($inp),$inout4
+	pxor	@tweak[2],$inout2
+	movdqu	`16*5`($inp),$inout5
+	lea	`16*6`($inp),$inp
+	pxor	@tweak[3],$inout3
+	$movkey		($key_),$rndkey0
+	pxor	@tweak[4],$inout4
+	pxor	@tweak[5],$inout5
+
+	# inline _aesni_decrypt6 and interleave first and last rounds
+	# with own code...
+	$movkey		16($key_),$rndkey1
+	pxor		$rndkey0,$inout0
+	pxor		$rndkey0,$inout1
+	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
+	aesdec		$rndkey1,$inout0
+	lea		32($key_),$key
+	pxor		$rndkey0,$inout2
+	 movdqa	@tweak[1],`16*1`(%rsp)
+	aesdec		$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	 movdqa	@tweak[2],`16*2`(%rsp)
+	aesdec		$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	 movdqa	@tweak[3],`16*3`(%rsp)
+	aesdec		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	dec		$rounds
+	 movdqa	@tweak[4],`16*4`(%rsp)
+	aesdec		$rndkey1,$inout4
+	 movdqa	@tweak[5],`16*5`(%rsp)
+	aesdec		$rndkey1,$inout5
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp
+	jmp		.Lxts_dec_loop6_enter
+
+.align	16
+.Lxts_dec_loop6:
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	dec		$rounds
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	aesdec		$rndkey1,$inout4
+	aesdec		$rndkey1,$inout5
+.Lxts_dec_loop6_enter:
+	$movkey		16($key),$rndkey1
+	aesdec		$rndkey0,$inout0
+	aesdec		$rndkey0,$inout1
+	lea		32($key),$key
+	aesdec		$rndkey0,$inout2
+	aesdec		$rndkey0,$inout3
+	aesdec		$rndkey0,$inout4
+	aesdec		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.Lxts_dec_loop6
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdec		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdec		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	 aesdec		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdec		$rndkey1,$inout3
+	 aesdec		$rndkey1,$inout4
+	 aesdec		$rndkey1,$inout5
+	 $movkey	16($key),$rndkey1
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[0]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdec		$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdec		$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesdec		$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdec		$rndkey0,$inout3
+	 aesdec		$rndkey0,$inout4
+	 aesdec		$rndkey0,$inout5
+	 $movkey	32($key),$rndkey0
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[1]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdec		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdec		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesdec		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdec		$rndkey1,$inout3
+	 aesdec		$rndkey1,$inout4
+	 aesdec		$rndkey1,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[2]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdeclast	$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdeclast	$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesdeclast	$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdeclast	$rndkey0,$inout3
+	 aesdeclast	$rndkey0,$inout4
+	 aesdeclast	$rndkey0,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[3]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	 xorps	`16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+
+	xorps	`16*2`(%rsp),$inout2
+	movups	$inout0,`16*0`($out)		# write output
+	xorps	`16*3`(%rsp),$inout3
+	movups	$inout1,`16*1`($out)
+	xorps	`16*4`(%rsp),$inout4
+	movups	$inout2,`16*2`($out)
+	xorps	`16*5`(%rsp),$inout5
+	movups	$inout3,`16*3`($out)
+	mov	$rnds_,$rounds			# restore $rounds
+	movups	$inout4,`16*4`($out)
+	movups	$inout5,`16*5`($out)
+	lea	`16*6`($out),$out
+	sub	\$16*6,$len
+	jnc	.Lxts_dec_grandloop
+
+	lea	3($rounds,$rounds),$rounds	# restore original value
+	mov	$key_,$key			# restore $key
+	mov	$rounds,$rnds_			# backup $rounds
+
+.Lxts_dec_short:
+	add	\$16*6,$len
+	jz	.Lxts_dec_done
+
+	cmp	\$0x20,$len
+	jb	.Lxts_dec_one
+	je	.Lxts_dec_two
+
+	cmp	\$0x40,$len
+	jb	.Lxts_dec_three
+	je	.Lxts_dec_four
+
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	 movdqu	($inp),$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 movdqu	16*1($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	16*2($inp),$inout2
+	pxor	@tweak[0],$inout0
+	movdqu	16*3($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	16*4($inp),$inout4
+	lea	16*5($inp),$inp
+	pxor	@tweak[2],$inout2
+	pxor	@tweak[3],$inout3
+	pxor	@tweak[4],$inout4
+
+	call	_aesni_decrypt6
+
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movdqu	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movdqu	$inout1,16*1($out)
+	xorps	@tweak[4],$inout4
+	movdqu	$inout2,16*2($out)
+	 pxor		$twtmp,$twtmp
+	movdqu	$inout3,16*3($out)
+	 pcmpgtd	@tweak[5],$twtmp
+	movdqu	$inout4,16*4($out)
+	lea	16*5($out),$out
+	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
+	and	\$15,$len_
+	jz	.Lxts_dec_ret
+
+	movdqa	@tweak[5],@tweak[0]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	pand	$twmask,@tweak[1]		# isolate carry and residue
+	pxor	@tweak[5],@tweak[1]
+	jmp	.Lxts_dec_done2
+
+.align	16
+.Lxts_dec_one:
+	movups	($inp),$inout0
+	lea	16*1($inp),$inp
+	xorps	@tweak[0],$inout0
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[1],@tweak[0]
+	movups	$inout0,($out)
+	movdqa	@tweak[2],@tweak[1]
+	lea	16*1($out),$out
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_two:
+	movups	($inp),$inout0
+	movups	16($inp),$inout1
+	lea	32($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+
+	call	_aesni_decrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[2],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movdqa	@tweak[3],@tweak[1]
+	movups	$inout0,($out)
+	movups	$inout1,16*1($out)
+	lea	16*2($out),$out
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_three:
+	movups	($inp),$inout0
+	movups	16*1($inp),$inout1
+	movups	16*2($inp),$inout2
+	lea	16*3($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+
+	call	_aesni_decrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[3],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movdqa	@tweak[5],@tweak[1]
+	xorps	@tweak[2],$inout2
+	movups	$inout0,($out)
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	lea	16*3($out),$out
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_four:
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	 movups	($inp),$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 movups	16*1($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movups	16*2($inp),$inout2
+	xorps	@tweak[0],$inout0
+	movups	16*3($inp),$inout3
+	lea	16*4($inp),$inp
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	xorps	@tweak[3],$inout3
+
+	call	_aesni_decrypt4
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[4],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movdqa	@tweak[5],@tweak[1]
+	xorps	@tweak[2],$inout2
+	movups	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	movups	$inout3,16*3($out)
+	lea	16*4($out),$out
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_done:
+	and	\$15,$len_
+	jz	.Lxts_dec_ret
+.Lxts_dec_done2:
+	mov	$len_,$len
+	mov	$key_,$key			# restore $key
+	mov	$rnds_,$rounds			# restore $rounds
+
+	movups	($inp),$inout0
+	xorps	@tweak[1],$inout0
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[1],$inout0
+	movups	$inout0,($out)
+
+.Lxts_dec_steal:
+	movzb	16($inp),%eax			# borrow $rounds ...
+	movzb	($out),%ecx			# ... and $key
+	lea	1($inp),$inp
+	mov	%al,($out)
+	mov	%cl,16($out)
+	lea	1($out),$out
+	sub	\$1,$len
+	jnz	.Lxts_dec_steal
+
+	sub	$len_,$out			# rewind $out
+	mov	$key_,$key			# restore $key
+	mov	$rnds_,$rounds			# restore $rounds
+
+	movups	($out),$inout0
+	xorps	@tweak[0],$inout0
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[0],$inout0
+	movups	$inout0,($out)
+
+.Lxts_dec_ret:
+___
+$code.=<<___ if ($win64);
+	movaps	0x60(%rsp),%xmm6
+	movaps	0x70(%rsp),%xmm7
+	movaps	0x80(%rsp),%xmm8
+	movaps	0x90(%rsp),%xmm9
+	movaps	0xa0(%rsp),%xmm10
+	movaps	0xb0(%rsp),%xmm11
+	movaps	0xc0(%rsp),%xmm12
+	movaps	0xd0(%rsp),%xmm13
+	movaps	0xe0(%rsp),%xmm14
+	movaps	0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	$frame_size(%rsp),%rsp
+.Lxts_dec_epilogue:
+	ret
+.size	aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+} }}
+
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+#			    size_t length, const AES_KEY *key,
+#			    unsigned char *ivp,const int enc);
+{
+my $reserved = $win64?0x40:-0x18;	# used in decrypt
+$code.=<<___;
+.globl	${PREFIX}_cbc_encrypt
+.type	${PREFIX}_cbc_encrypt,\@function,6
+.align	16
+${PREFIX}_cbc_encrypt:
+	test	$len,$len		# check length
+	jz	.Lcbc_ret
+
+	mov	240($key),$rnds_	# key->rounds
+	mov	$key,$key_		# backup $key
+	test	%r9d,%r9d		# 6th argument
+	jz	.Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+	movups	($ivp),$inout0		# load iv as initial state
+	mov	$rnds_,$rounds
+	cmp	\$16,$len
+	jb	.Lcbc_enc_tail
+	sub	\$16,$len
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movups	($inp),$inout1		# load input
+	lea	16($inp),$inp
+	#xorps	$inout1,$inout0
+___
+	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+	mov	$rnds_,$rounds		# restore $rounds
+	mov	$key_,$key		# restore $key
+	movups	$inout0,0($out)		# store output
+	lea	16($out),$out
+	sub	\$16,$len
+	jnc	.Lcbc_enc_loop
+	add	\$16,$len
+	jnz	.Lcbc_enc_tail
+	movups	$inout0,($ivp)
+	jmp	.Lcbc_ret
+
+.Lcbc_enc_tail:
+	mov	$len,%rcx	# zaps $key
+	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
+	.long	0x9066A4F3	# rep movsb
+	mov	\$16,%ecx	# zero tail
+	sub	$len,%rcx
+	xor	%eax,%eax
+	.long	0x9066AAF3	# rep stosb
+	lea	-16(%rdi),%rdi	# rewind $out by 1 block
+	mov	$rnds_,$rounds	# restore $rounds
+	mov	%rdi,%rsi	# $inp and $out are the same
+	mov	$key_,$key	# restore $key
+	xor	$len,$len	# len=16
+	jmp	.Lcbc_enc_loop	# one more spin
+#--------------------------- CBC DECRYPT ------------------------------#
+.align	16
+.Lcbc_decrypt:
+___
+$code.=<<___ if ($win64);
+	lea	-0x58(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+.Lcbc_decrypt_body:
+___
+$code.=<<___;
+	movups	($ivp),$iv
+	mov	$rnds_,$rounds
+	cmp	\$0x70,$len
+	jbe	.Lcbc_dec_tail
+	shr	\$1,$rnds_
+	sub	\$0x70,$len
+	mov	$rnds_,$rounds
+	movaps	$iv,$reserved(%rsp)
+	jmp	.Lcbc_dec_loop8_enter
+.align	16
+.Lcbc_dec_loop8:
+	movaps	$rndkey0,$reserved(%rsp)	# save IV
+	movups	$inout7,($out)
+	lea	0x10($out),$out
+.Lcbc_dec_loop8_enter:
+	$movkey		($key),$rndkey0
+	movups	($inp),$inout0			# load input
+	movups	0x10($inp),$inout1
+	$movkey		16($key),$rndkey1
+
+	lea		32($key),$key
+	movdqu	0x20($inp),$inout2
+	xorps		$rndkey0,$inout0
+	movdqu	0x30($inp),$inout3
+	xorps		$rndkey0,$inout1
+	movdqu	0x40($inp),$inout4
+	aesdec		$rndkey1,$inout0
+	pxor		$rndkey0,$inout2
+	movdqu	0x50($inp),$inout5
+	aesdec		$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	movdqu	0x60($inp),$inout6
+	aesdec		$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	movdqu	0x70($inp),$inout7
+	aesdec		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	dec		$rounds
+	aesdec		$rndkey1,$inout4
+	pxor		$rndkey0,$inout6
+	aesdec		$rndkey1,$inout5
+	pxor		$rndkey0,$inout7
+	$movkey		($key),$rndkey0
+	aesdec		$rndkey1,$inout6
+	aesdec		$rndkey1,$inout7
+	$movkey		16($key),$rndkey1
+
+	call		.Ldec_loop8_enter
+
+	movups	($inp),$rndkey1		# re-load input
+	movups	0x10($inp),$rndkey0
+	xorps	$reserved(%rsp),$inout0	# ^= IV
+	xorps	$rndkey1,$inout1
+	movups	0x20($inp),$rndkey1
+	xorps	$rndkey0,$inout2
+	movups	0x30($inp),$rndkey0
+	xorps	$rndkey1,$inout3
+	movups	0x40($inp),$rndkey1
+	xorps	$rndkey0,$inout4
+	movups	0x50($inp),$rndkey0
+	xorps	$rndkey1,$inout5
+	movups	0x60($inp),$rndkey1
+	xorps	$rndkey0,$inout6
+	movups	0x70($inp),$rndkey0	# IV
+	xorps	$rndkey1,$inout7
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout4,0x40($out)
+	mov	$key_,$key		# restore $key
+	movups	$inout5,0x50($out)
+	lea	0x80($inp),$inp
+	movups	$inout6,0x60($out)
+	lea	0x70($out),$out
+	sub	\$0x80,$len
+	ja	.Lcbc_dec_loop8
+
+	movaps	$inout7,$inout0
+	movaps	$rndkey0,$iv
+	add	\$0x70,$len
+	jle	.Lcbc_dec_tail_collected
+	movups	$inout0,($out)
+	lea	1($rnds_,$rnds_),$rounds
+	lea	0x10($out),$out
+.Lcbc_dec_tail:
+	movups	($inp),$inout0
+	movaps	$inout0,$in0
+	cmp	\$0x10,$len
+	jbe	.Lcbc_dec_one
+
+	movups	0x10($inp),$inout1
+	movaps	$inout1,$in1
+	cmp	\$0x20,$len
+	jbe	.Lcbc_dec_two
+
+	movups	0x20($inp),$inout2
+	movaps	$inout2,$in2
+	cmp	\$0x30,$len
+	jbe	.Lcbc_dec_three
+
+	movups	0x30($inp),$inout3
+	cmp	\$0x40,$len
+	jbe	.Lcbc_dec_four
+
+	movups	0x40($inp),$inout4
+	cmp	\$0x50,$len
+	jbe	.Lcbc_dec_five
+
+	movups	0x50($inp),$inout5
+	cmp	\$0x60,$len
+	jbe	.Lcbc_dec_six
+
+	movups	0x60($inp),$inout6
+	movaps	$iv,$reserved(%rsp)	# save IV
+	call	_aesni_decrypt8
+	movups	($inp),$rndkey1
+	movups	0x10($inp),$rndkey0
+	xorps	$reserved(%rsp),$inout0	# ^= IV
+	xorps	$rndkey1,$inout1
+	movups	0x20($inp),$rndkey1
+	xorps	$rndkey0,$inout2
+	movups	0x30($inp),$rndkey0
+	xorps	$rndkey1,$inout3
+	movups	0x40($inp),$rndkey1
+	xorps	$rndkey0,$inout4
+	movups	0x50($inp),$rndkey0
+	xorps	$rndkey1,$inout5
+	movups	0x60($inp),$iv		# IV
+	xorps	$rndkey0,$inout6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	lea	0x60($out),$out
+	movaps	$inout6,$inout0
+	sub	\$0x70,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_one:
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	xorps	$iv,$inout0
+	movaps	$in0,$iv
+	sub	\$0x10,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_two:
+	xorps	$inout2,$inout2
+	call	_aesni_decrypt3
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
+	movups	$inout0,($out)
+	movaps	$in1,$iv
+	movaps	$inout1,$inout0
+	lea	0x10($out),$out
+	sub	\$0x20,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_three:
+	call	_aesni_decrypt3
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
+	movups	$inout0,($out)
+	xorps	$in1,$inout2
+	movups	$inout1,0x10($out)
+	movaps	$in2,$iv
+	movaps	$inout2,$inout0
+	lea	0x20($out),$out
+	sub	\$0x30,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_four:
+	call	_aesni_decrypt4
+	xorps	$iv,$inout0
+	movups	0x30($inp),$iv
+	xorps	$in0,$inout1
+	movups	$inout0,($out)
+	xorps	$in1,$inout2
+	movups	$inout1,0x10($out)
+	xorps	$in2,$inout3
+	movups	$inout2,0x20($out)
+	movaps	$inout3,$inout0
+	lea	0x30($out),$out
+	sub	\$0x40,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_five:
+	xorps	$inout5,$inout5
+	call	_aesni_decrypt6
+	movups	0x10($inp),$rndkey1
+	movups	0x20($inp),$rndkey0
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
+	xorps	$rndkey1,$inout2
+	movups	0x30($inp),$rndkey1
+	xorps	$rndkey0,$inout3
+	movups	0x40($inp),$iv
+	xorps	$rndkey1,$inout4
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	lea	0x40($out),$out
+	movaps	$inout4,$inout0
+	sub	\$0x50,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_six:
+	call	_aesni_decrypt6
+	movups	0x10($inp),$rndkey1
+	movups	0x20($inp),$rndkey0
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
+	xorps	$rndkey1,$inout2
+	movups	0x30($inp),$rndkey1
+	xorps	$rndkey0,$inout3
+	movups	0x40($inp),$rndkey0
+	xorps	$rndkey1,$inout4
+	movups	0x50($inp),$iv
+	xorps	$rndkey0,$inout5
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	lea	0x50($out),$out
+	movaps	$inout5,$inout0
+	sub	\$0x60,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_tail_collected:
+	and	\$15,$len
+	movups	$iv,($ivp)
+	jnz	.Lcbc_dec_tail_partial
+	movups	$inout0,($out)
+	jmp	.Lcbc_dec_ret
+.align	16
+.Lcbc_dec_tail_partial:
+	movaps	$inout0,$reserved(%rsp)
+	mov	\$16,%rcx
+	mov	$out,%rdi
+	sub	$len,%rcx
+	lea	$reserved(%rsp),%rsi
+	.long	0x9066A4F3	# rep movsb
+
+.Lcbc_dec_ret:
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	lea	0x58(%rsp),%rsp
+___
+$code.=<<___;
+.Lcbc_ret:
+	ret
+.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+} 
+# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+#				int bits, AES_KEY *key)
+{ my ($inp,$bits,$key) = @_4args;
+  $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl	${PREFIX}_set_decrypt_key
+.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align	16
+${PREFIX}_set_decrypt_key:
+	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
+	call	__aesni_set_encrypt_key
+	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
+	test	%eax,%eax
+	jnz	.Ldec_key_ret
+	lea	16($key,$bits),$inp	# points at the end of key schedule
+
+	$movkey	($key),%xmm0		# just swap
+	$movkey	($inp),%xmm1
+	$movkey	%xmm0,($inp)
+	$movkey	%xmm1,($key)
+	lea	16($key),$key
+	lea	-16($inp),$inp
+
+.Ldec_key_inverse:
+	$movkey	($key),%xmm0		# swap and inverse
+	$movkey	($inp),%xmm1
+	aesimc	%xmm0,%xmm0
+	aesimc	%xmm1,%xmm1
+	lea	16($key),$key
+	lea	-16($inp),$inp
+	$movkey	%xmm0,16($inp)
+	$movkey	%xmm1,-16($key)
+	cmp	$key,$inp
+	ja	.Ldec_key_inverse
+
+	$movkey	($key),%xmm0		# inverse middle
+	aesimc	%xmm0,%xmm0
+	$movkey	%xmm0,($inp)
+.Ldec_key_ret:
+	add	\$8,%rsp
+	ret
+.LSEH_end_set_decrypt_key:
+.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+
+# This is based on submission by
+#
+#	Huang Ying <[email protected]>
+#	Vinodh Gopal <[email protected]>
+#	Kahraman Akdemir
+#
+# Agressively optimized in respect to aeskeygenassist's critical path
+# and is contained in %xmm0-5 to meet Win64 ABI requirement.
+#
+$code.=<<___;
+.globl	${PREFIX}_set_encrypt_key
+.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align	16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
+	mov	\$-1,%rax
+	test	$inp,$inp
+	jz	.Lenc_key_ret
+	test	$key,$key
+	jz	.Lenc_key_ret
+
+	movups	($inp),%xmm0		# pull first 128 bits of *userKey
+	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
+	lea	16($key),%rax
+	cmp	\$256,$bits
+	je	.L14rounds
+	cmp	\$192,$bits
+	je	.L12rounds
+	cmp	\$128,$bits
+	jne	.Lbad_keybits
+
+.L10rounds:
+	mov	\$9,$bits			# 10 rounds for 128-bit key
+	$movkey	%xmm0,($key)			# round 0
+	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
+	call		.Lkey_expansion_128_cold
+	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
+	call		.Lkey_expansion_128
+	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
+	call		.Lkey_expansion_128
+	$movkey	%xmm0,(%rax)
+	mov	$bits,80(%rax)	# 240(%rdx)
+	xor	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L12rounds:
+	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
+	mov	\$11,$bits			# 12 rounds for 192
+	$movkey	%xmm0,($key)			# round 0
+	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
+	call		.Lkey_expansion_192a_cold
+	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
+	call		.Lkey_expansion_192b
+	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
+	call		.Lkey_expansion_192a
+	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
+	call		.Lkey_expansion_192b
+	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
+	call		.Lkey_expansion_192a
+	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
+	call		.Lkey_expansion_192b
+	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
+	call		.Lkey_expansion_192a
+	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
+	call		.Lkey_expansion_192b
+	$movkey	%xmm0,(%rax)
+	mov	$bits,48(%rax)	# 240(%rdx)
+	xor	%rax, %rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L14rounds:
+	movups	16($inp),%xmm2			# remaning half of *userKey
+	mov	\$13,$bits			# 14 rounds for 256
+	lea	16(%rax),%rax
+	$movkey	%xmm0,($key)			# round 0
+	$movkey	%xmm2,16($key)			# round 1
+	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
+	call		.Lkey_expansion_256a_cold
+	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
+	call		.Lkey_expansion_256b
+	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
+	call		.Lkey_expansion_256a
+	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
+	call		.Lkey_expansion_256b
+	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
+	call		.Lkey_expansion_256a
+	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
+	call		.Lkey_expansion_256b
+	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
+	call		.Lkey_expansion_256a
+	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
+	call		.Lkey_expansion_256b
+	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
+	call		.Lkey_expansion_256a
+	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
+	call		.Lkey_expansion_256b
+	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
+	call		.Lkey_expansion_256a
+	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
+	call		.Lkey_expansion_256b
+	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
+	call		.Lkey_expansion_256a
+	$movkey	%xmm0,(%rax)
+	mov	$bits,16(%rax)	# 240(%rdx)
+	xor	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.Lbad_keybits:
+	mov	\$-2,%rax
+.Lenc_key_ret:
+	add	\$8,%rsp
+	ret
+.LSEH_end_set_encrypt_key:
+
+.align	16
+.Lkey_expansion_128:
+	$movkey	%xmm0,(%rax)
+	lea	16(%rax),%rax
+.Lkey_expansion_128_cold:
+	shufps	\$0b00010000,%xmm0,%xmm4
+	xorps	%xmm4, %xmm0
+	shufps	\$0b10001100,%xmm0,%xmm4
+	xorps	%xmm4, %xmm0
+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm0
+	ret
+
+.align 16
+.Lkey_expansion_192a:
+	$movkey	%xmm0,(%rax)
+	lea	16(%rax),%rax
+.Lkey_expansion_192a_cold:
+	movaps	%xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+	shufps	\$0b00010000,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	\$0b10001100,%xmm0,%xmm4
+	pslldq	\$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	\$0b11111111,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+
+.align 16
+.Lkey_expansion_192b:
+	movaps	%xmm0,%xmm3
+	shufps	\$0b01000100,%xmm0,%xmm5
+	$movkey	%xmm5,(%rax)
+	shufps	\$0b01001110,%xmm2,%xmm3
+	$movkey	%xmm3,16(%rax)
+	lea	32(%rax),%rax
+	jmp	.Lkey_expansion_192b_warm
+
+.align	16
+.Lkey_expansion_256a:
+	$movkey	%xmm2,(%rax)
+	lea	16(%rax),%rax
+.Lkey_expansion_256a_cold:
+	shufps	\$0b00010000,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	\$0b10001100,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm0
+	ret
+
+.align 16
+.Lkey_expansion_256b:
+	$movkey	%xmm0,(%rax)
+	lea	16(%rax),%rax
+
+	shufps	\$0b00010000,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	\$0b10001100,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm2
+	ret
+.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+
+$code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+	.long	6,6,6,0
+.Lincrement64:
+	.long	1,0,0,0
+.Lxts_magic:
+	.long	0x87,0,1,0
+
+.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type	ecb_se_handler,\@abi-omnipotent
+.align	16
+ecb_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	jmp	.Lcommon_seh_tail
+.size	ecb_se_handler,.-ecb_se_handler
+
+.type	ccm64_se_handler,\@abi-omnipotent
+.align	16
+ccm64_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	0(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x58(%rax),%rax		# adjust stack pointer
+
+	jmp	.Lcommon_seh_tail
+.size	ccm64_se_handler,.-ccm64_se_handler
+
+.type	ctr32_se_handler,\@abi-omnipotent
+.align	16
+ctr32_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lctr32_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lctr32_ret(%rip),%r10
+	cmp	%r10,%rbx
+	jae	.Lcommon_seh_tail
+
+	lea	0x20(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0xc8(%rax),%rax		# adjust stack pointer
+
+	jmp	.Lcommon_seh_tail
+.size	ctr32_se_handler,.-ctr32_se_handler
+
+.type	xts_se_handler,\@abi-omnipotent
+.align	16
+xts_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue lable
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	0x60(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# & context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x68+160(%rax),%rax	# adjust stack pointer
+
+	jmp	.Lcommon_seh_tail
+.size	xts_se_handler,.-xts_se_handler
+___
+$code.=<<___;
+.type	cbc_se_handler,\@abi-omnipotent
+.align	16
+cbc_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	152($context),%rax	# pull context->Rsp
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lcbc_decrypt(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lcommon_seh_tail
+
+	lea	.Lcbc_decrypt_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
+	jb	.Lrestore_cbc_rax
+
+	lea	.Lcbc_ret(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>="epilogue" label
+	jae	.Lcommon_seh_tail
+
+	lea	0(%rax),%rsi		# top of stack
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x58(%rax),%rax		# adjust stack pointer
+	jmp	.Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+	mov	120($context),%rax
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	cbc_se_handler,.-cbc_se_handler
+
+.section	.pdata
+.align	4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+	.rva	.LSEH_begin_aesni_ecb_encrypt
+	.rva	.LSEH_end_aesni_ecb_encrypt
+	.rva	.LSEH_info_ecb
+
+	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
+	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
+	.rva	.LSEH_info_ccm64_enc
+
+	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
+	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
+	.rva	.LSEH_info_ccm64_dec
+
+	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
+	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
+	.rva	.LSEH_info_ctr32
+
+	.rva	.LSEH_begin_aesni_xts_encrypt
+	.rva	.LSEH_end_aesni_xts_encrypt
+	.rva	.LSEH_info_xts_enc
+
+	.rva	.LSEH_begin_aesni_xts_decrypt
+	.rva	.LSEH_end_aesni_xts_decrypt
+	.rva	.LSEH_info_xts_dec
+___
+$code.=<<___;
+	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_info_cbc
+
+	.rva	${PREFIX}_set_decrypt_key
+	.rva	.LSEH_end_set_decrypt_key
+	.rva	.LSEH_info_key
+
+	.rva	${PREFIX}_set_encrypt_key
+	.rva	.LSEH_end_set_encrypt_key
+	.rva	.LSEH_info_key
+.section	.xdata
+.align	8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+	.byte	9,0,0,0
+	.rva	ecb_se_handler
+.LSEH_info_ccm64_enc:
+	.byte	9,0,0,0
+	.rva	ccm64_se_handler
+	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
+.LSEH_info_ccm64_dec:
+	.byte	9,0,0,0
+	.rva	ccm64_se_handler
+	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
+.LSEH_info_ctr32:
+	.byte	9,0,0,0
+	.rva	ctr32_se_handler
+.LSEH_info_xts_enc:
+	.byte	9,0,0,0
+	.rva	xts_se_handler
+	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
+.LSEH_info_xts_dec:
+	.byte	9,0,0,0
+	.rva	xts_se_handler
+	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
+___
+$code.=<<___;
+.LSEH_info_cbc:
+	.byte	9,0,0,0
+	.rva	cbc_se_handler
+.LSEH_info_key:
+	.byte	0x01,0x04,0x01,0x00
+	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
+___
+}
+
+sub rex {
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04			if($dst>=8);
+    $rex|=0x01			if($src>=8);
+    push @opcode,$rex|0x40	if($rex);
+}
+
+sub aesni {
+  my $line=shift;
+  my @opcode=(0x66);
+
+    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+	rex(\@opcode,$4,$3);
+	push @opcode,0x0f,0x3a,0xdf;
+	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
+	my $c=$2;
+	push @opcode,$c=~/^0/?oct($c):$c;
+	return ".byte\t".join(',',@opcode);
+    }
+    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+	my %opcodelet = (
+		"aesimc" => 0xdb,
+		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
+		"aesdec" => 0xde,	"aesdeclast" => 0xdf
+	);
+	return undef if (!defined($opcodelet{$1}));
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x38,$opcodelet{$1};
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
+	return ".byte\t".join(',',@opcode);
+    }
+    return $line;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+
+close STDOUT;

diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 0000000..ff7e3af
--- /dev/null
+++ b/crypto/aes/asm/bsaes-x86_64.pl

@@ -0,0 +1,3004 @@
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode]				###
+### bitsliced implementation for Intel Core 2 processors	###
+### requires support of SSE extensions up to SSSE3		###
+### Author: Emilia Käsper and Peter Schwabe			###
+### Date: 2009-03-19						###
+### Public domain						###
+###								###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
+### further information.					###
+###################################################################
+#
+# September 2011.
+#
+# Started as transliteration to "perlasm" the original code has
+# undergone following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop resulting in >5x size reduction
+#   from 12.5KB to 2.2KB;
+# - above was possibile thanks to mixcolumns() modification that
+#   allowed to feed its output back to aesenc[last], this was
+#   achieved at cost of two additional inter-registers moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement key setup subroutine, instead it
+#   relies on conversion of "conventional" key schedule as returned
+#   by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which allowed
+#   to skip one shiftrows(), reduce bit-sliced key schedule and
+#   speed-up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# Resulting performance in CPU cycles spent to encrypt one byte out
+# of 4096-byte buffer with 128-bit key is:
+#
+#		Emilia's	this(*)		difference
+#
+# Core 2    	9.30		8.69		+7%
+# Nehalem(**) 	7.63		6.98		+9%
+# Atom	    	17.1		17.4		-2%(***)
+#
+# (*)	Comparison is not completely fair, because "this" is ECB,
+#	i.e. no extra processing such as counter values calculation
+#	and xor-ing input as in Emilia's CTR implementation is
+#	performed. However, the CTR calculations stand for not more
+#	than 1% of total time, so comparison is *rather* fair.
+#
+# (**)	Results were collected on Westmere, which is considered to
+#	be equivalent to Nehalem for this code.
+#
+# (***)	Slowdown on Atom is rather strange per se, because original
+#	implementation has a number of 9+-bytes instructions, which
+#	are bad for Atom front-end, and which I eliminated completely.
+#	In attempt to address deterioration sbox() was tested in FP
+#	SIMD "domain" (movaps instead of movdqa, xorps instead of
+#	pxor, etc.). While it resulted in nominal 4% improvement on
+#	Atom, it hurted Westmere by more than 2x factor.
+#
+# As for key schedule conversion subroutine. Interface to OpenSSL
+# relies on per-invocation on-the-fly conversion. This naturally
+# has impact on performance, especially for short inputs. Conversion
+# time in CPU cycles and its ratio to CPU cycles spent in 8x block
+# function is:
+#
+# 		conversion	conversion/8x block
+# Core 2	410		0.37
+# Nehalem	310		0.35
+# Atom		570		0.26
+#
+# The ratio values mean that 128-byte blocks will be processed
+# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
+# etc. Then keep in mind that input sizes not divisible by 128 are
+# *effectively* slower, especially shortest ones, e.g. consecutive
+# 144-byte blocks are processed 44% slower than one would expect,
+# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
+# it's still faster than ["hyper-threading-safe" code path in]
+# aes-x86_64.pl on all lengths above 64 bytes...
+#
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte out of 4096-byte buffer with 128-bit key is:
+#
+# Core 2	11.0
+# Nehalem	9.16
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
+# suboptimal, but XTS is meant to be used with larger blocks...
+#
+#						<[email protected]>
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
+my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
+my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InBasisChange	(@b);
+	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
+	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+my @b=@_[0..7];
+$code.=<<___;
+	pxor	@b[6], @b[5]
+	pxor	@b[1], @b[2]
+	pxor	@b[0], @b[3]
+	pxor	@b[2], @b[6]
+	pxor 	@b[0], @b[5]
+
+	pxor	@b[3], @b[6]
+	pxor	@b[7], @b[3]
+	pxor	@b[5], @b[7]
+	pxor	@b[4], @b[3]
+	pxor	@b[5], @b[4]
+	pxor	@b[1], @b[3]
+
+	pxor	@b[7], @b[2]
+	pxor	@b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+	pxor	@b[6], @b[0]
+	pxor	@b[4], @b[1]
+	pxor	@b[0], @b[2]
+	pxor	@b[6], @b[4]
+	pxor	@b[1], @b[6]
+
+	pxor	@b[5], @b[1]
+	pxor	@b[3], @b[5]
+	pxor	@b[7], @b[3]
+	pxor	@b[5], @b[7]
+	pxor	@b[5], @b[2]
+
+	pxor	@b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InvInBasisChange	(@b);
+	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
+	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange {		# OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+	pxor	@b[7], @b[4]
+
+	pxor	@b[5], @b[7]
+	pxor	@b[5], @b[2]
+	pxor	@b[7], @b[3]
+	pxor	@b[3], @b[5]
+	pxor	@b[5], @b[1]
+
+	pxor	@b[1], @b[6]
+	pxor	@b[0], @b[2]
+	pxor	@b[6], @b[4]
+	pxor	@b[6], @b[0]
+	pxor	@b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange {		# InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+	pxor	@b[5], @b[1]
+	pxor	@b[7], @b[2]
+
+	pxor	@b[1], @b[3]
+	pxor	@b[5], @b[4]
+	pxor	@b[5], @b[7]
+	pxor	@b[4], @b[3]
+	 pxor 	@b[0], @b[5]
+	pxor	@b[7], @b[3]
+	 pxor	@b[2], @b[6]
+	 pxor	@b[1], @b[2]
+	pxor	@b[3], @b[6]
+
+	pxor	@b[0], @b[3]
+	pxor	@b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+	movdqa	$y0, $t0
+	pxor 	$y1, $t0
+	pand	$x0, $t0
+	pxor	$x1, $x0
+	pand	$y0, $x1
+	pand	$y1, $x0
+	pxor	$x1, $x0
+	pxor	$t0, $x1
+___
+}
+
+sub Mul_GF4_N {				# not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+	movdqa	$y0, $t0
+	pxor	$y1, $t0
+	pand	$x0, $t0
+	pxor	$x1, $x0
+	pand	$y0, $x1
+	pand	$y1, $x0
+	pxor	$x0, $x1
+	pxor	$t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+    $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+	movdqa	$y0, $t0
+	 movdqa	$y2, $t1
+	pxor	$y1, $t0
+	 pxor 	$y3, $t1
+	pand	$x0, $t0
+	 pand	$x2, $t1
+	pxor	$x1, $x0
+	 pxor	$x3, $x2
+	pand	$y0, $x1
+	 pand	$y2, $x3
+	pand	$y1, $x0
+	 pand	$y3, $x2
+	pxor	$x0, $x1
+	 pxor	$x3, $x2
+	pxor	$t0, $x0
+	 pxor	$t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+	movdqa	@x[0], @t[0]
+	movdqa	@x[1], @t[1]
+___
+	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+	pxor	@x[2], @t[0]
+	pxor	@x[3], @t[1]
+	pxor	@y[2], @y[0]
+	pxor	@y[3], @y[1]
+___
+	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	pxor	@t[0], @x[0]
+	pxor	@t[0], @x[2]
+	pxor	@t[1], @x[1]
+	pxor	@t[1], @x[3]
+
+	movdqa	@x[4], @t[0]
+	movdqa	@x[5], @t[1]
+	pxor	@x[6], @t[0]
+	pxor	@x[7], @t[1]
+___
+	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	pxor	@y[2], @y[0]
+	pxor	@y[3], @y[1]
+___
+	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+	pxor	@t[0], @x[4]
+	pxor	@t[0], @x[6]
+	pxor	@t[1], @x[5]
+	pxor	@t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+	movdqa	@x[4], @t[3]
+	movdqa	@x[5], @t[2]
+	movdqa	@x[1], @t[1]
+	movdqa	@x[7], @s[1]
+	movdqa	@x[0], @s[0]
+
+	pxor	@x[6], @t[3]
+	pxor	@x[7], @t[2]
+	pxor	@x[3], @t[1]
+	 movdqa	@t[3], @s[2]
+	pxor	@x[6], @s[1]
+	 movdqa	@t[2], @t[0]
+	pxor	@x[2], @s[0]
+	 movdqa	@t[3], @s[3]
+
+	por	@t[1], @t[2]
+	por	@s[0], @t[3]
+	pxor	@t[0], @s[3]
+	pand	@s[0], @s[2]
+	pxor	@t[1], @s[0]
+	pand	@t[1], @t[0]
+	pand	@s[0], @s[3]
+	movdqa	@x[3], @s[0]
+	pxor	@x[2], @s[0]
+	pand	@s[0], @s[1]
+	pxor	@s[1], @t[3]
+	pxor	@s[1], @t[2]
+	movdqa	@x[4], @s[1]
+	movdqa	@x[1], @s[0]
+	pxor	@x[5], @s[1]
+	pxor	@x[0], @s[0]
+	movdqa	@s[1], @t[1]
+	pand	@s[0], @s[1]
+	por	@s[0], @t[1]
+	pxor	@s[1], @t[0]
+	pxor	@s[3], @t[3]
+	pxor	@s[2], @t[2]
+	pxor	@s[3], @t[1]
+	movdqa	@x[7], @s[0]
+	pxor	@s[2], @t[0]
+	movdqa	@x[6], @s[1]
+	pxor	@s[2], @t[1]
+	movdqa	@x[5], @s[2]
+	pand	@x[3], @s[0]
+	movdqa	@x[4], @s[3]
+	pand	@x[2], @s[1]
+	pand	@x[1], @s[2]
+	por	@x[0], @s[3]
+	pxor	@s[0], @t[3]
+	pxor	@s[1], @t[2]
+	pxor	@s[2], @t[1]
+	pxor	@s[3], @t[0] 
+
+	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+	# new smaller inversion
+
+	movdqa	@t[3], @s[0]
+	pand	@t[1], @t[3]
+	pxor	@t[2], @s[0]
+
+	movdqa	@t[0], @s[2]
+	movdqa	@s[0], @s[3]
+	pxor	@t[3], @s[2]
+	pand	@s[2], @s[3]
+
+	movdqa	@t[1], @s[1]
+	pxor	@t[2], @s[3]
+	pxor	@t[0], @s[1]
+
+	pxor	@t[2], @t[3]
+
+	pand	@t[3], @s[1]
+
+	movdqa	@s[2], @t[2]
+	pxor	@t[0], @s[1]
+
+	pxor	@s[1], @t[2]
+	pxor	@s[1], @t[1]
+
+	pand	@t[0], @t[2]
+
+	pxor	@t[2], @s[2]
+	pxor	@t[2], @t[1]
+
+	pand	@s[3], @s[2]
+
+	pxor	@s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+	pxor	0x00($key),@x[0]
+	pxor	0x10($key),@x[1]
+	pshufb	$mask,@x[0]
+	pxor	0x20($key),@x[2]
+	pshufb	$mask,@x[1]
+	pxor	0x30($key),@x[3]
+	pshufb	$mask,@x[2]
+	pxor	0x40($key),@x[4]
+	pshufb	$mask,@x[3]
+	pxor	0x50($key),@x[5]
+	pshufb	$mask,@x[4]
+	pxor	0x60($key),@x[6]
+	pshufb	$mask,@x[5]
+	pxor	0x70($key),@x[7]
+	pshufb	$mask,@x[6]
+	lea	0x80($key),$key
+	pshufb	$mask,@x[7]
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
+	pshufd	\$0x93, @x[1], @t[1]
+	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
+	pshufd	\$0x93, @x[2], @t[2]
+	 pxor	@t[1], @x[1]
+	pshufd	\$0x93, @x[3], @t[3]
+	 pxor	@t[2], @x[2]
+	pshufd	\$0x93, @x[4], @t[4]
+	 pxor	@t[3], @x[3]
+	pshufd	\$0x93, @x[5], @t[5]
+	 pxor	@t[4], @x[4]
+	pshufd	\$0x93, @x[6], @t[6]
+	 pxor	@t[5], @x[5]
+	pshufd	\$0x93, @x[7], @t[7]
+	 pxor	@t[6], @x[6]
+	 pxor	@t[7], @x[7]
+
+	pxor	@x[0], @t[1]
+	pxor	@x[7], @t[0]
+	pxor	@x[7], @t[1]
+	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
+	pxor	@x[1], @t[2]
+	 pshufd	\$0x4E, @x[1], @x[1]
+	pxor	@x[4], @t[5]
+	 pxor	@t[0], @x[0]
+	pxor	@x[5], @t[6]
+	 pxor	@t[1], @x[1]
+	pxor	@x[3], @t[4]
+	 pshufd	\$0x4E, @x[4], @t[0]
+	pxor	@x[6], @t[7]
+	 pshufd	\$0x4E, @x[5], @t[1]
+	pxor	@x[2], @t[3]
+	 pshufd	\$0x4E, @x[3], @x[4]
+	pxor	@x[7], @t[3]
+	 pshufd	\$0x4E, @x[7], @x[5]
+	pxor	@x[7], @t[4]
+	 pshufd	\$0x4E, @x[6], @x[3]
+	pxor	@t[4], @t[0]
+	 pshufd	\$0x4E, @x[2], @x[6]
+	pxor	@t[5], @t[1]
+
+	pxor	@t[3], @x[4]
+	pxor	@t[7], @x[5]
+	pxor	@t[6], @x[3]
+	 movdqa	@t[0], @x[2]
+	pxor	@t[2], @x[6]
+	 movdqa	@t[1], @x[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+	# multiplication by 0x0e
+	pshufd	\$0x93, @x[7], @t[7]
+	movdqa	@x[2], @t[2]
+	pxor	@x[5], @x[7]		# 7 5
+	pxor	@x[5], @x[2]		# 2 5
+	pshufd	\$0x93, @x[0], @t[0]
+	movdqa	@x[5], @t[5]
+	pxor	@x[0], @x[5]		# 5 0		[1]
+	pxor	@x[1], @x[0]		# 0 1
+	pshufd	\$0x93, @x[1], @t[1]
+	pxor	@x[2], @x[1]		# 1 25
+	pxor	@x[6], @x[0]		# 01 6		[2]
+	pxor	@x[3], @x[1]		# 125 3		[4]
+	pshufd	\$0x93, @x[3], @t[3]
+	pxor	@x[0], @x[2]		# 25 016	[3]
+	pxor	@x[7], @x[3]		# 3 75
+	pxor	@x[6], @x[7]		# 75 6		[0]
+	pshufd	\$0x93, @x[6], @t[6]
+	movdqa	@x[4], @t[4]
+	pxor	@x[4], @x[6]		# 6 4
+	pxor	@x[3], @x[4]		# 4 375		[6]
+	pxor	@x[7], @x[3]		# 375 756=36
+	pxor	@t[5], @x[6]		# 64 5		[7]
+	pxor	@t[2], @x[3]		# 36 2
+	pxor	@t[4], @x[3]		# 362 4		[5]
+	pshufd	\$0x93, @t[5], @t[5]
+___
+					my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+	# multiplication by 0x0b
+	pxor	@y[0], @y[1]
+	pxor	@t[0], @y[0]
+	pxor	@t[1], @y[1]
+	pshufd	\$0x93, @t[2], @t[2]
+	pxor	@t[5], @y[0]
+	pxor	@t[6], @y[1]
+	pxor	@t[7], @y[0]
+	pshufd	\$0x93, @t[4], @t[4]
+	pxor	@t[6], @t[7]		# clobber t[7]
+	pxor	@y[0], @y[1]
+
+	pxor	@t[0], @y[3]
+	pshufd	\$0x93, @t[0], @t[0]
+	pxor	@t[1], @y[2]
+	pxor	@t[1], @y[4]
+	pxor	@t[2], @y[2]
+	pshufd	\$0x93, @t[1], @t[1]
+	pxor	@t[2], @y[3]
+	pxor	@t[2], @y[5]
+	pxor	@t[7], @y[2]
+	pshufd	\$0x93, @t[2], @t[2]
+	pxor	@t[3], @y[3]
+	pxor	@t[3], @y[6]
+	pxor	@t[3], @y[4]
+	pshufd	\$0x93, @t[3], @t[3]
+	pxor	@t[4], @y[7]
+	pxor	@t[4], @y[5]
+	pxor	@t[7], @y[7]
+	pxor	@t[5], @y[3]
+	pxor	@t[4], @y[4]
+	pxor	@t[5], @t[7]		# clobber t[7] even more
+
+	pxor	@t[7], @y[5]
+	pshufd	\$0x93, @t[4], @t[4]
+	pxor	@t[7], @y[6]
+	pxor	@t[7], @y[4]
+
+	pxor	@t[5], @t[7]
+	pshufd	\$0x93, @t[5], @t[5]
+	pxor	@t[6], @t[7]		# restore t[7]
+
+	# multiplication by 0x0d
+	pxor	@y[7], @y[4]
+	pxor	@t[4], @y[7]
+	pshufd	\$0x93, @t[6], @t[6]
+	pxor	@t[0], @y[2]
+	pxor	@t[5], @y[7]
+	pxor	@t[2], @y[2]
+	pshufd	\$0x93, @t[7], @t[7]
+
+	pxor	@y[1], @y[3]
+	pxor	@t[1], @y[1]
+	pxor	@t[0], @y[0]
+	pxor	@t[0], @y[3]
+	pxor	@t[5], @y[1]
+	pxor	@t[5], @y[0]
+	pxor	@t[7], @y[1]
+	pshufd	\$0x93, @t[0], @t[0]
+	pxor	@t[6], @y[0]
+	pxor	@y[1], @y[3]
+	pxor	@t[1], @y[4]
+	pshufd	\$0x93, @t[1], @t[1]
+
+	pxor	@t[7], @y[7]
+	pxor	@t[2], @y[4]
+	pxor	@t[2], @y[5]
+	pshufd	\$0x93, @t[2], @t[2]
+	pxor	@t[6], @y[2]
+	pxor	@t[3], @t[6]		# clobber t[6]
+	pxor	@y[7], @y[4]
+	pxor	@t[6], @y[3]
+
+	pxor	@t[6], @y[6]
+	pxor	@t[5], @y[5]
+	pxor	@t[4], @y[6]
+	pshufd	\$0x93, @t[4], @t[4]
+	pxor	@t[6], @y[5]
+	pxor	@t[7], @y[6]
+	pxor	@t[3], @t[6]		# restore t[6]
+
+	pshufd	\$0x93, @t[5], @t[5]
+	pshufd	\$0x93, @t[6], @t[6]
+	pshufd	\$0x93, @t[7], @t[7]
+	pshufd	\$0x93, @t[3], @t[3]
+
+	# multiplication by 0x09
+	pxor	@y[1], @y[4]
+	pxor	@y[1], @t[1]		# t[1]=y[1]
+	pxor	@t[5], @t[0]		# clobber t[0]
+	pxor	@t[5], @t[1]
+	pxor	@t[0], @y[3]
+	pxor	@y[0], @t[0]		# t[0]=y[0]
+	pxor	@t[6], @t[1]
+	pxor	@t[7], @t[6]		# clobber t[6]
+	pxor	@t[1], @y[4]
+	pxor	@t[4], @y[7]
+	pxor	@y[4], @t[4]		# t[4]=y[4]
+	pxor	@t[3], @y[6]
+	pxor	@y[3], @t[3]		# t[3]=y[3]
+	pxor	@t[2], @y[5]
+	pxor	@y[2], @t[2]		# t[2]=y[2]
+	pxor	@t[7], @t[3]
+	pxor	@y[5], @t[5]		# t[5]=y[5]
+	pxor	@t[6], @t[2]
+	pxor	@t[6], @t[5]
+	pxor	@y[6], @t[6]		# t[6]=y[6]
+	pxor	@y[7], @t[7]		# t[7]=y[7]
+
+	movdqa	@t[0],@XMM[0]
+	movdqa	@t[1],@XMM[1]
+	movdqa	@t[2],@XMM[2]
+	movdqa	@t[3],@XMM[3]
+	movdqa	@t[4],@XMM[4]
+	movdqa	@t[5],@XMM[5]
+	movdqa	@t[6],@XMM[6]
+	movdqa	@t[7],@XMM[7]
+___
+}
+
+sub aesenc {				# not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+	movdqa	0x30($const),@t[0]	# .LSR
+___
+	&ShiftRows	(@b,@t[0]);
+	&Sbox		(@b,@t);
+	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast {			# not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+	movdqa	0x40($const),@t[0]	# .LSRM0
+___
+	&ShiftRows	(@b,@t[0]);
+	&Sbox		(@b,@t);
+$code.=<<___
+	pxor	0x00($key),@b[0]
+	pxor	0x10($key),@b[1]
+	pxor	0x20($key),@b[4]
+	pxor	0x30($key),@b[6]
+	pxor	0x40($key),@b[3]
+	pxor	0x50($key),@b[7]
+	pxor	0x60($key),@b[2]
+	pxor	0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+	movdqa	$b,$t
+	psrlq	\$$n,$b
+	pxor  	$a,$b
+	pand	$mask,$b
+	pxor	$b,$a
+	psllq	\$$n,$b
+	pxor	$t,$b
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+	movdqa	$b0,$t0
+	psrlq	\$$n,$b0
+	 movdqa	$b1,$t1
+	 psrlq	\$$n,$b1
+	pxor  	$a0,$b0
+	 pxor  	$a1,$b1
+	pand	$mask,$b0
+	 pand	$mask,$b1
+	pxor	$b0,$a0
+	psllq	\$$n,$b0
+	 pxor	$b1,$a1
+	 psllq	\$$n,$b1
+	pxor	$t0,$b0
+	 pxor	$t1,$b1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+	movdqa	0x00($const),$t0	# .LBS0
+	movdqa	0x10($const),$t1	# .LBS1
+___
+	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+	movdqa	0x20($const),$t0	# .LBS2
+___
+	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+.text
+
+.extern	asm_AES_encrypt
+.extern	asm_AES_decrypt
+
+.type	_bsaes_encrypt8,\@abi-omnipotent
+.align	64
+_bsaes_encrypt8:
+	lea	.LBS0(%rip), $const	# constants table
+
+	movdqa	($key), @XMM[9]		# round 0 key
+	lea	0x10($key), $key
+	movdqa	0x60($const), @XMM[8]	# .LM0SR
+	pxor	@XMM[9], @XMM[0]	# xor with round0 key
+	pxor	@XMM[9], @XMM[1]
+	 pshufb	@XMM[8], @XMM[0]
+	pxor	@XMM[9], @XMM[2]
+	 pshufb	@XMM[8], @XMM[1]
+	pxor	@XMM[9], @XMM[3]
+	 pshufb	@XMM[8], @XMM[2]
+	pxor	@XMM[9], @XMM[4]
+	 pshufb	@XMM[8], @XMM[3]
+	pxor	@XMM[9], @XMM[5]
+	 pshufb	@XMM[8], @XMM[4]
+	pxor	@XMM[9], @XMM[6]
+	 pshufb	@XMM[8], @XMM[5]
+	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[6]
+	 pshufb	@XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	dec	$rounds
+	jmp	.Lenc_sbox
+.align	16
+.Lenc_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+	&Sbox		(@XMM[0..7, 8..15]);
+$code.=<<___;
+	dec	$rounds
+	jl	.Lenc_done
+___
+	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+	movdqa	0x30($const), @XMM[8]	# .LSR
+	jnz	.Lenc_loop
+	movdqa	0x40($const), @XMM[8]	# .LSRM0
+	jmp	.Lenc_loop
+.align	16
+.Lenc_done:
+___
+	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+	movdqa	($key), @XMM[8]		# last round key
+	pxor	@XMM[8], @XMM[4]
+	pxor	@XMM[8], @XMM[6]
+	pxor	@XMM[8], @XMM[3]
+	pxor	@XMM[8], @XMM[7]
+	pxor	@XMM[8], @XMM[2]
+	pxor	@XMM[8], @XMM[5]
+	pxor	@XMM[8], @XMM[0]
+	pxor	@XMM[8], @XMM[1]
+	ret
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type	_bsaes_decrypt8,\@abi-omnipotent
+.align	64
+_bsaes_decrypt8:
+	lea	.LBS0(%rip), $const	# constants table
+
+	movdqa	($key), @XMM[9]		# round 0 key
+	lea	0x10($key), $key
+	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
+	pxor	@XMM[9], @XMM[0]	# xor with round0 key
+	pxor	@XMM[9], @XMM[1]
+	 pshufb	@XMM[8], @XMM[0]
+	pxor	@XMM[9], @XMM[2]
+	 pshufb	@XMM[8], @XMM[1]
+	pxor	@XMM[9], @XMM[3]
+	 pshufb	@XMM[8], @XMM[2]
+	pxor	@XMM[9], @XMM[4]
+	 pshufb	@XMM[8], @XMM[3]
+	pxor	@XMM[9], @XMM[5]
+	 pshufb	@XMM[8], @XMM[4]
+	pxor	@XMM[9], @XMM[6]
+	 pshufb	@XMM[8], @XMM[5]
+	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[6]
+	 pshufb	@XMM[8], @XMM[7]
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	dec	$rounds
+	jmp	.Ldec_sbox
+.align	16
+.Ldec_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+	&InvSbox	(@XMM[0..7, 8..15]);
+$code.=<<___;
+	dec	$rounds
+	jl	.Ldec_done
+___
+	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+	movdqa	-0x10($const), @XMM[8]	# .LISR
+	jnz	.Ldec_loop
+	movdqa	-0x20($const), @XMM[8]	# .LISRM0
+	jmp	.Ldec_loop
+.align	16
+.Ldec_done:
+___
+	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+	movdqa	($key), @XMM[8]		# last round key
+	pxor	@XMM[8], @XMM[6]
+	pxor	@XMM[8], @XMM[4]
+	pxor	@XMM[8], @XMM[2]
+	pxor	@XMM[8], @XMM[7]
+	pxor	@XMM[8], @XMM[3]
+	pxor	@XMM[8], @XMM[5]
+	pxor	@XMM[8], @XMM[0]
+	pxor	@XMM[8], @XMM[1]
+	ret
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
+	movdqa	@x[0], @x[2]
+	movdqa	@x[1], @x[3]
+___
+	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+	movdqa	@x[0], @x[4]
+	movdqa	@x[2], @x[6]
+	movdqa	@x[1], @x[5]
+	movdqa	@x[3], @x[7]
+___
+	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
+	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type	_bsaes_key_convert,\@abi-omnipotent
+.align	16
+_bsaes_key_convert:
+	lea	.LBS1(%rip), $const
+	movdqu	($inp), %xmm7		# load round 0 key
+	movdqa	-0x10($const), %xmm8	# .LBS0
+	movdqa	0x00($const), %xmm9	# .LBS1
+	movdqa	0x10($const), %xmm10	# .LBS2
+	movdqa	0x40($const), %xmm13	# .LM0
+	movdqa	0x60($const), %xmm14	# .LNOT
+
+	movdqu	0x10($inp), %xmm6	# load round 1 key
+	lea	0x10($inp), $inp
+	movdqa	%xmm7, ($out)		# save round 0 key
+	lea	0x10($out), $out
+	dec	$rounds
+	jmp	.Lkey_loop
+.align	16
+.Lkey_loop:
+	pshufb	%xmm13, %xmm6		# .LM0
+	movdqa	%xmm6, %xmm7
+___
+	&bitslice_key	(map("%xmm$_",(0..7, 8..12)));
+$code.=<<___;
+	pxor	%xmm14, %xmm5		# "pnot"
+	pxor	%xmm14, %xmm6
+	pxor	%xmm14, %xmm0
+	pxor	%xmm14, %xmm1
+	lea	0x10($inp), $inp
+	movdqa	%xmm0, 0x00($out)	# write bit-sliced round key
+	movdqa	%xmm1, 0x10($out)
+	movdqa	%xmm2, 0x20($out)
+	movdqa	%xmm3, 0x30($out)
+	movdqa	%xmm4, 0x40($out)
+	movdqa	%xmm5, 0x50($out)
+	movdqa	%xmm6, 0x60($out)
+	movdqa	%xmm7, 0x70($out)
+	lea	0x80($out),$out
+	movdqu	($inp), %xmm6		# load next round key
+	dec	$rounds
+	jnz	.Lkey_loop
+
+	movdqa	0x70($const), %xmm7	# .L63
+	#movdqa	%xmm6, ($out)		# don't save last round key
+	ret
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0 && !$win64) {	# following four functions are unsupported interface
+			# used for benchmarking...
+$code.=<<___;
+.globl	bsaes_enc_key_convert
+.type	bsaes_enc_key_convert,\@function,2
+.align	16
+bsaes_enc_key_convert:
+	mov	240($inp),%r10d		# pass rounds
+	mov	$inp,%rcx		# pass key
+	mov	$out,%rax		# pass key schedule
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7		# fix up last round key
+	movdqa	%xmm7,(%rax)		# save last round key
+	ret
+.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl	bsaes_encrypt_128
+.type	bsaes_encrypt_128,\@function,4
+.align	16
+bsaes_encrypt_128:
+.Lenc128_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	movdqu	0x60($inp), @XMM[6]
+	movdqu	0x70($inp), @XMM[7]
+	mov	$key, %rax		# pass the $key
+	lea	0x80($inp), $inp
+	mov	\$10,%r10d
+
+	call	_bsaes_encrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$0x80,$len
+	ja	.Lenc128_loop
+	ret
+.size	bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl	bsaes_dec_key_convert
+.type	bsaes_dec_key_convert,\@function,2
+.align	16
+bsaes_dec_key_convert:
+	mov	240($inp),%r10d		# pass rounds
+	mov	$inp,%rcx		# pass key
+	mov	$out,%rax		# pass key schedule
+	call	_bsaes_key_convert
+	pxor	($out),%xmm7		# fix up round 0 key
+	movdqa	%xmm6,(%rax)		# save last round key
+	movdqa	%xmm7,($out)
+	ret
+.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl	bsaes_decrypt_128
+.type	bsaes_decrypt_128,\@function,4
+.align	16
+bsaes_decrypt_128:
+.Ldec128_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	movdqu	0x60($inp), @XMM[6]
+	movdqu	0x70($inp), @XMM[7]
+	mov	$key, %rax		# pass the $key
+	lea	0x80($inp), $inp
+	mov	\$10,%r10d
+
+	call	_bsaes_decrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$0x80,$len
+	ja	.Ldec128_loop
+	ret
+.size	bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+
+if ($ecb) {
+$code.=<<___;
+.globl	bsaes_ecb_encrypt_blocks
+.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align	16
+bsaes_ecb_encrypt_blocks:
+	mov	%rsp, %rax
+.Lecb_enc_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+	mov	%rsp,%rbp		# backup %rsp
+	mov	240($arg4),%eax		# rounds
+	mov	$arg1,$inp		# backup arguments
+	mov	$arg2,$out
+	mov	$arg3,$len
+	mov	$arg4,$key
+	cmp	\$8,$arg3
+	jb	.Lecb_enc_short
+
+	mov	%eax,%ebx		# backup rounds
+	shl	\$7,%rax		# 128 bytes per inner round key
+	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
+	sub	%rax,%rsp
+	mov	%rsp,%rax		# pass key schedule
+	mov	$key,%rcx		# pass key
+	mov	%ebx,%r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7		# fix up last round key
+	movdqa	%xmm7,(%rax)		# save last round key
+
+	sub	\$8,$len
+.Lecb_enc_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	mov	%rsp, %rax		# pass key schedule
+	movdqu	0x60($inp), @XMM[6]
+	mov	%ebx,%r10d		# pass rounds
+	movdqu	0x70($inp), @XMM[7]
+	lea	0x80($inp), $inp
+
+	call	_bsaes_encrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$8,$len
+	jnc	.Lecb_enc_loop
+
+	add	\$8,$len
+	jz	.Lecb_enc_done
+
+	movdqu	0x00($inp), @XMM[0]	# load input
+	mov	%rsp, %rax		# pass key schedule
+	mov	%ebx,%r10d		# pass rounds
+	cmp	\$2,$len
+	jb	.Lecb_enc_one
+	movdqu	0x10($inp), @XMM[1]
+	je	.Lecb_enc_two
+	movdqu	0x20($inp), @XMM[2]
+	cmp	\$4,$len
+	jb	.Lecb_enc_three
+	movdqu	0x30($inp), @XMM[3]
+	je	.Lecb_enc_four
+	movdqu	0x40($inp), @XMM[4]
+	cmp	\$6,$len
+	jb	.Lecb_enc_five
+	movdqu	0x50($inp), @XMM[5]
+	je	.Lecb_enc_six
+	movdqu	0x60($inp), @XMM[6]
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_six:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_five:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_four:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_three:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_two:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_one:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_short:
+	lea	($inp), $arg1
+	lea	($out), $arg2
+	lea	($key), $arg3
+	call	asm_AES_encrypt
+	lea	16($inp), $inp
+	lea	16($out), $out
+	dec	$len
+	jnz	.Lecb_enc_short
+
+.Lecb_enc_done:
+	lea	(%rsp),%rax
+	pxor	%xmm0, %xmm0
+.Lecb_enc_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	jb	.Lecb_enc_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lecb_enc_epilogue:
+	ret
+.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
+
+.globl	bsaes_ecb_decrypt_blocks
+.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align	16
+bsaes_ecb_decrypt_blocks:
+	mov	%rsp, %rax
+.Lecb_dec_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+	mov	%rsp,%rbp		# backup %rsp
+	mov	240($arg4),%eax		# rounds
+	mov	$arg1,$inp		# backup arguments
+	mov	$arg2,$out
+	mov	$arg3,$len
+	mov	$arg4,$key
+	cmp	\$8,$arg3
+	jb	.Lecb_dec_short
+
+	mov	%eax,%ebx		# backup rounds
+	shl	\$7,%rax		# 128 bytes per inner round key
+	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
+	sub	%rax,%rsp
+	mov	%rsp,%rax		# pass key schedule
+	mov	$key,%rcx		# pass key
+	mov	%ebx,%r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up 0 round key
+	movdqa	%xmm6,(%rax)		# save last round key
+	movdqa	%xmm7,(%rsp)
+
+	sub	\$8,$len
+.Lecb_dec_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	mov	%rsp, %rax		# pass key schedule
+	movdqu	0x60($inp), @XMM[6]
+	mov	%ebx,%r10d		# pass rounds
+	movdqu	0x70($inp), @XMM[7]
+	lea	0x80($inp), $inp
+
+	call	_bsaes_decrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$8,$len
+	jnc	.Lecb_dec_loop
+
+	add	\$8,$len
+	jz	.Lecb_dec_done
+
+	movdqu	0x00($inp), @XMM[0]	# load input
+	mov	%rsp, %rax		# pass key schedule
+	mov	%ebx,%r10d		# pass rounds
+	cmp	\$2,$len
+	jb	.Lecb_dec_one
+	movdqu	0x10($inp), @XMM[1]
+	je	.Lecb_dec_two
+	movdqu	0x20($inp), @XMM[2]
+	cmp	\$4,$len
+	jb	.Lecb_dec_three
+	movdqu	0x30($inp), @XMM[3]
+	je	.Lecb_dec_four
+	movdqu	0x40($inp), @XMM[4]
+	cmp	\$6,$len
+	jb	.Lecb_dec_five
+	movdqu	0x50($inp), @XMM[5]
+	je	.Lecb_dec_six
+	movdqu	0x60($inp), @XMM[6]
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_six:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_five:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_four:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_three:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_two:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_one:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_short:
+	lea	($inp), $arg1
+	lea	($out), $arg2
+	lea	($key), $arg3
+	call	asm_AES_decrypt
+	lea	16($inp), $inp
+	lea	16($out), $out
+	dec	$len
+	jnz	.Lecb_dec_short
+
+.Lecb_dec_done:
+	lea	(%rsp),%rax
+	pxor	%xmm0, %xmm0
+.Lecb_dec_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	jb	.Lecb_dec_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lecb_dec_epilogue:
+	ret
+.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern	asm_AES_cbc_encrypt
+.globl	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,\@abi-omnipotent
+.align	16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+	mov	48(%rsp),$arg6		# pull direction flag
+___
+$code.=<<___;
+	cmp	\$0,$arg6
+	jne	asm_AES_cbc_encrypt
+	cmp	\$128,$arg3
+	jb	asm_AES_cbc_encrypt
+
+	mov	%rsp, %rax
+.Lcbc_dec_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	mov	240($arg4), %eax	# rounds
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+	mov	$arg5, %rbx
+	shr	\$4, $len		# bytes to blocks
+
+	mov	%eax, %edx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%edx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up 0 round key
+	movdqa	%xmm6,(%rax)		# save last round key
+	movdqa	%xmm7,(%rsp)
+
+	movdqu	(%rbx), @XMM[15]	# load IV
+	sub	\$8,$len
+.Lcbc_dec_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	mov	%rsp, %rax		# pass key schedule
+	movdqu	0x60($inp), @XMM[6]
+	mov	%edx,%r10d		# pass rounds
+	movdqu	0x70($inp), @XMM[7]
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+
+	call	_bsaes_decrypt8
+
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[11], @XMM[2]
+	movdqu	0x50($inp), @XMM[13]
+	pxor	@XMM[12], @XMM[7]
+	movdqu	0x60($inp), @XMM[14]
+	pxor	@XMM[13], @XMM[3]
+	movdqu	0x70($inp), @XMM[15]	# IV
+	pxor	@XMM[14], @XMM[5]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	lea	0x80($inp), $inp
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$8,$len
+	jnc	.Lcbc_dec_loop
+
+	add	\$8,$len
+	jz	.Lcbc_dec_done
+
+	movdqu	0x00($inp), @XMM[0]	# load input
+	mov	%rsp, %rax		# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+	cmp	\$2,$len
+	jb	.Lcbc_dec_one
+	movdqu	0x10($inp), @XMM[1]
+	je	.Lcbc_dec_two
+	movdqu	0x20($inp), @XMM[2]
+	cmp	\$4,$len
+	jb	.Lcbc_dec_three
+	movdqu	0x30($inp), @XMM[3]
+	je	.Lcbc_dec_four
+	movdqu	0x40($inp), @XMM[4]
+	cmp	\$6,$len
+	jb	.Lcbc_dec_five
+	movdqu	0x50($inp), @XMM[5]
+	je	.Lcbc_dec_six
+	movdqu	0x60($inp), @XMM[6]
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[11], @XMM[2]
+	movdqu	0x50($inp), @XMM[13]
+	pxor	@XMM[12], @XMM[7]
+	movdqu	0x60($inp), @XMM[15]	# IV
+	pxor	@XMM[13], @XMM[3]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_six:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[11], @XMM[2]
+	movdqu	0x50($inp), @XMM[15]	# IV
+	pxor	@XMM[12], @XMM[7]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_five:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[15]	# IV
+	pxor	@XMM[11], @XMM[2]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_four:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[15]	# IV
+	pxor	@XMM[10], @XMM[4]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_three:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[15]	# IV
+	pxor	@XMM[9], @XMM[6]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_two:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[15]	# IV
+	pxor	@XMM[8], @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_one:
+	lea	($inp), $arg1
+	lea	0x20(%rbp), $arg2	# buffer output
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[15]	# ^= IV
+	movdqu	@XMM[15], ($out)	# write output
+	movdqa	@XMM[0], @XMM[15]	# IV
+
+.Lcbc_dec_done:
+	movdqu	@XMM[15], (%rbx)	# return IV
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lcbc_dec_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lcbc_dec_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lcbc_dec_epilogue:
+	ret
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align	16
+bsaes_ctr32_encrypt_blocks:
+	mov	%rsp, %rax
+.Lctr_enc_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	movdqu	($arg5), %xmm0		# load counter
+	mov	240($arg4), %eax	# rounds
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+	movdqa	%xmm0, 0x20(%rbp)	# copy counter
+	cmp	\$8, $arg3
+	jb	.Lctr_enc_short
+
+	mov	%eax, %ebx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%ebx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7		# fix up last round key
+	movdqa	%xmm7,(%rax)		# save last round key
+
+	movdqa	(%rsp), @XMM[9]		# load round0 key
+	lea	.LADD1(%rip), %r11
+	movdqa	0x20(%rbp), @XMM[0]	# counter copy
+	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
+	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
+	pshufb	@XMM[8], @XMM[0]
+	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
+	jmp	.Lctr_enc_loop
+.align	16
+.Lctr_enc_loop:
+	movdqa	@XMM[0], 0x20(%rbp)	# save counter
+	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
+	movdqa	@XMM[0], @XMM[2]
+	paddd	0x00(%r11), @XMM[1]	# .LADD1
+	movdqa	@XMM[0], @XMM[3]
+	paddd	0x10(%r11), @XMM[2]	# .LADD2
+	movdqa	@XMM[0], @XMM[4]
+	paddd	0x20(%r11), @XMM[3]	# .LADD3
+	movdqa	@XMM[0], @XMM[5]
+	paddd	0x30(%r11), @XMM[4]	# .LADD4
+	movdqa	@XMM[0], @XMM[6]
+	paddd	0x40(%r11), @XMM[5]	# .LADD5
+	movdqa	@XMM[0], @XMM[7]
+	paddd	0x50(%r11), @XMM[6]	# .LADD6
+	paddd	0x60(%r11), @XMM[7]	# .LADD7
+
+	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	# to flip byte order in 32-bit counter
+	movdqa	(%rsp), @XMM[9]		# round 0 key
+	lea	0x10(%rsp), %rax	# pass key schedule
+	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
+	pxor	@XMM[9], @XMM[0]	# xor with round0 key
+	pxor	@XMM[9], @XMM[1]
+	 pshufb	@XMM[8], @XMM[0]
+	pxor	@XMM[9], @XMM[2]
+	 pshufb	@XMM[8], @XMM[1]
+	pxor	@XMM[9], @XMM[3]
+	 pshufb	@XMM[8], @XMM[2]
+	pxor	@XMM[9], @XMM[4]
+	 pshufb	@XMM[8], @XMM[3]
+	pxor	@XMM[9], @XMM[5]
+	 pshufb	@XMM[8], @XMM[4]
+	pxor	@XMM[9], @XMM[6]
+	 pshufb	@XMM[8], @XMM[5]
+	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[6]
+	lea	.LBS0(%rip), %r11	# constants table
+	 pshufb	@XMM[8], @XMM[7]
+	mov	%ebx,%r10d		# pass rounds
+
+	call	_bsaes_encrypt8_bitslice
+
+	sub	\$8,$len
+	jc	.Lctr_enc_loop_done
+
+	movdqu	0x00($inp), @XMM[8]	# load input
+	movdqu	0x10($inp), @XMM[9]
+	movdqu	0x20($inp), @XMM[10]
+	movdqu	0x30($inp), @XMM[11]
+	movdqu	0x40($inp), @XMM[12]
+	movdqu	0x50($inp), @XMM[13]
+	movdqu	0x60($inp), @XMM[14]
+	movdqu	0x70($inp), @XMM[15]
+	lea	0x80($inp),$inp
+	pxor	@XMM[0], @XMM[8]
+	movdqa	0x20(%rbp), @XMM[0]	# load counter
+	pxor	@XMM[9], @XMM[1]
+	movdqu	@XMM[8], 0x00($out)	# write output
+	pxor	@XMM[10], @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	@XMM[11], @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	@XMM[12], @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	@XMM[13], @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	pxor	@XMM[14], @XMM[2]
+	movdqu	@XMM[7], 0x50($out)
+	pxor	@XMM[15], @XMM[5]
+	movdqu	@XMM[2], 0x60($out)
+	lea	.LADD1(%rip), %r11
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	paddd	0x70(%r11), @XMM[0]	# .LADD8
+	jnz	.Lctr_enc_loop
+
+	jmp	.Lctr_enc_done
+.align	16
+.Lctr_enc_loop_done:
+	add	\$8, $len
+	movdqu	0x00($inp), @XMM[8]	# load input
+	pxor	@XMM[8], @XMM[0]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	cmp	\$2,$len
+	jb	.Lctr_enc_done
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[9], @XMM[1]
+	movdqu	@XMM[1], 0x10($out)
+	je	.Lctr_enc_done
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	@XMM[4], 0x20($out)
+	cmp	\$4,$len
+	jb	.Lctr_enc_done
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[11], @XMM[6]
+	movdqu	@XMM[6], 0x30($out)
+	je	.Lctr_enc_done
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[12], @XMM[3]
+	movdqu	@XMM[3], 0x40($out)
+	cmp	\$6,$len
+	jb	.Lctr_enc_done
+	movdqu	0x50($inp), @XMM[13]
+	pxor	@XMM[13], @XMM[7]
+	movdqu	@XMM[7], 0x50($out)
+	je	.Lctr_enc_done
+	movdqu	0x60($inp), @XMM[14]
+	pxor	@XMM[14], @XMM[2]
+	movdqu	@XMM[2], 0x60($out)
+	jmp	.Lctr_enc_done
+
+.align	16
+.Lctr_enc_short:
+	lea	0x20(%rbp), $arg1
+	lea	0x30(%rbp), $arg2
+	lea	($key), $arg3
+	call	asm_AES_encrypt
+	movdqu	($inp), @XMM[1]
+	lea	16($inp), $inp
+	mov	0x2c(%rbp), %eax	# load 32-bit counter
+	bswap	%eax
+	pxor	0x30(%rbp), @XMM[1]
+	inc	%eax			# increment
+	movdqu	@XMM[1], ($out)
+	bswap	%eax
+	lea	16($out), $out
+	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
+	dec	$len
+	jnz	.Lctr_enc_short
+
+.Lctr_enc_done:
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lctr_enc_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lctr_enc_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lctr_enc_epilogue:
+	ret
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+#	const unsigned char iv[16]);
+#
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$code.=<<___;
+.globl	bsaes_xts_encrypt
+.type	bsaes_xts_encrypt,\@abi-omnipotent
+.align	16
+bsaes_xts_encrypt:
+	mov	%rsp, %rax
+.Lxts_enc_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull key2
+	mov	0xa8(%rsp),$arg6	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+
+	lea	($arg6), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($arg5), $arg3
+	call	asm_AES_encrypt		# generate initial tweak
+
+	mov	240($key), %eax		# rounds
+	mov	$len, %rbx		# backup $len
+
+	mov	%eax, %edx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%edx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	%xmm6, %xmm7		# fix up last round key
+	movdqa	%xmm7, (%rax)		# save last round key
+
+	and	\$-16, $len
+	sub	\$0x80, %rsp		# place for tweak[8]
+	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
+
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+
+	sub	\$0x80, $len
+	jc	.Lxts_enc_short
+	jmp	.Lxts_enc_loop
+
+.align	16
+.Lxts_enc_loop:
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqu	0x70($inp), @XMM[8+7]
+	lea	0x80($inp), $inp
+	movdqa	@XMM[7], 0x70(%rsp)
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	pxor	@XMM[8+7], @XMM[7]
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	pxor	0x60(%rsp), @XMM[2]
+	movdqu	@XMM[7], 0x50($out)
+	pxor	0x70(%rsp), @XMM[5]
+	movdqu	@XMM[2], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+
+	sub	\$0x80,$len
+	jnc	.Lxts_enc_loop
+
+.Lxts_enc_short:
+	add	\$0x80, $len
+	jz	.Lxts_enc_done
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+	cmp	\$`0x10*$i`,$len
+	je	.Lxts_enc_$i
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqa	@XMM[7], 0x70(%rsp)
+	lea	0x70($inp), $inp
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	pxor	0x60(%rsp), @XMM[2]
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	lea	0x70($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_6:
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x60($inp), $inp
+	pxor	@XMM[8+5], @XMM[5]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	lea	0x60($out), $out
+
+	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_5:
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x50($inp), $inp
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	lea	0x50($out), $out
+
+	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_4:
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x40($inp), $inp
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	lea	0x40($out), $out
+
+	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_3:
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x30($inp), $inp
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	lea	0x30($out), $out
+
+	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_2:
+	pxor	@XMM[8+0], @XMM[0]
+	lea	0x20($inp), $inp
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	lea	0x20($out), $out
+
+	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_1:
+	pxor	@XMM[0], @XMM[8]
+	lea	0x10($inp), $inp
+	movdqa	@XMM[8], 0x20(%rbp)
+	lea	0x20(%rbp), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($key), $arg3
+	call	asm_AES_encrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
+	#pxor	@XMM[8], @XMM[0]
+	#lea	0x80(%rsp), %rax	# pass key schedule
+	#mov	%edx, %r10d		# pass rounds
+	#call	_bsaes_encrypt8
+	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	lea	0x10($out), $out
+
+	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
+
+.Lxts_enc_done:
+	and	\$15, %ebx
+	jz	.Lxts_enc_ret
+	mov	$out, %rdx
+
+.Lxts_enc_steal:
+	movzb	($inp), %eax
+	movzb	-16(%rdx), %ecx
+	lea	1($inp), $inp
+	mov	%al, -16(%rdx)
+	mov	%cl, 0(%rdx)
+	lea	1(%rdx), %rdx
+	sub	\$1,%ebx
+	jnz	.Lxts_enc_steal
+
+	movdqu	-16($out), @XMM[0]
+	lea	0x20(%rbp), $arg1
+	pxor	@XMM[7], @XMM[0]
+	lea	0x20(%rbp), $arg2
+	movdqa	@XMM[0], 0x20(%rbp)
+	lea	($key), $arg3
+	call	asm_AES_encrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[7]
+	movdqu	@XMM[7], -16($out)
+
+.Lxts_enc_ret:
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lxts_enc_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lxts_enc_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lxts_enc_epilogue:
+	ret
+.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl	bsaes_xts_decrypt
+.type	bsaes_xts_decrypt,\@abi-omnipotent
+.align	16
+bsaes_xts_decrypt:
+	mov	%rsp, %rax
+.Lxts_dec_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull key2
+	mov	0xa8(%rsp),$arg6	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+
+	lea	($arg6), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($arg5), $arg3
+	call	asm_AES_encrypt		# generate initial tweak
+
+	mov	240($key), %eax		# rounds
+	mov	$len, %rbx		# backup $len
+
+	mov	%eax, %edx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%edx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	(%rsp), %xmm7		# fix up round 0 key
+	movdqa	%xmm6, (%rax)		# save last round key
+	movdqa	%xmm7, (%rsp)
+
+	xor	%eax, %eax		# if ($len%16) len-=16;
+	and	\$-16, $len
+	test	\$15, %ebx
+	setnz	%al
+	shl	\$4, %rax
+	sub	%rax, $len
+
+	sub	\$0x80, %rsp		# place for tweak[8]
+	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
+
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+
+	sub	\$0x80, $len
+	jc	.Lxts_dec_short
+	jmp	.Lxts_dec_loop
+
+.align	16
+.Lxts_dec_loop:
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqu	0x70($inp), @XMM[8+7]
+	lea	0x80($inp), $inp
+	movdqa	@XMM[7], 0x70(%rsp)
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	pxor	@XMM[8+7], @XMM[7]
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[2], 0x40($out)
+	pxor	0x60(%rsp), @XMM[3]
+	movdqu	@XMM[7], 0x50($out)
+	pxor	0x70(%rsp), @XMM[5]
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+
+	sub	\$0x80,$len
+	jnc	.Lxts_dec_loop
+
+.Lxts_dec_short:
+	add	\$0x80, $len
+	jz	.Lxts_dec_done
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+	cmp	\$`0x10*$i`,$len
+	je	.Lxts_dec_$i
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqa	@XMM[7], 0x70(%rsp)
+	lea	0x70($inp), $inp
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[2], 0x40($out)
+	pxor	0x60(%rsp), @XMM[3]
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	lea	0x70($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_6:
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x60($inp), $inp
+	pxor	@XMM[8+5], @XMM[5]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	lea	0x60($out), $out
+
+	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_5:
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x50($inp), $inp
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	lea	0x50($out), $out
+
+	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_4:
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x40($inp), $inp
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	lea	0x40($out), $out
+
+	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_3:
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x30($inp), $inp
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	lea	0x30($out), $out
+
+	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_2:
+	pxor	@XMM[8+0], @XMM[0]
+	lea	0x20($inp), $inp
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	lea	0x20($out), $out
+
+	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_1:
+	pxor	@XMM[0], @XMM[8]
+	lea	0x10($inp), $inp
+	movdqa	@XMM[8], 0x20(%rbp)
+	lea	0x20(%rbp), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
+	#pxor	@XMM[8], @XMM[0]
+	#lea	0x80(%rsp), %rax	# pass key schedule
+	#mov	%edx, %r10d		# pass rounds
+	#call	_bsaes_decrypt8
+	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	lea	0x10($out), $out
+
+	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
+
+.Lxts_dec_done:
+	and	\$15, %ebx
+	jz	.Lxts_dec_ret
+
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp
+	pshufd	\$0x13, $twtmp, $twres
+	movdqa	@XMM[7], @XMM[6]
+	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	movdqu	($inp), @XMM[0]
+	pxor	$twres, @XMM[7]
+
+	lea	0x20(%rbp), $arg1
+	pxor	@XMM[7], @XMM[0]
+	lea	0x20(%rbp), $arg2
+	movdqa	@XMM[0], 0x20(%rbp)
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[7]
+	mov	$out, %rdx
+	movdqu	@XMM[7], ($out)
+
+.Lxts_dec_steal:
+	movzb	16($inp), %eax
+	movzb	(%rdx), %ecx
+	lea	1($inp), $inp
+	mov	%al, (%rdx)
+	mov	%cl, 16(%rdx)
+	lea	1(%rdx), %rdx
+	sub	\$1,%ebx
+	jnz	.Lxts_dec_steal
+
+	movdqu	($out), @XMM[0]
+	lea	0x20(%rbp), $arg1
+	pxor	@XMM[6], @XMM[0]
+	lea	0x20(%rbp), $arg2
+	movdqa	@XMM[0], 0x20(%rbp)
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[6]
+	movdqu	@XMM[6], ($out)
+
+.Lxts_dec_ret:
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lxts_dec_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lxts_dec_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lxts_dec_epilogue:
+	ret
+.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type	_bsaes_const,\@object
+.align	64
+_bsaes_const:
+.LM0ISR:	# InvShiftRows constants
+	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:		# bit-slice constants
+	.quad	0x5555555555555555, 0x5555555555555555
+.LBS1:
+	.quad	0x3333333333333333, 0x3333333333333333
+.LBS2:
+	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:		# shiftrows constants
+	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LM0SR:
+	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LNOT:		# magic constants
+	.quad	0xffffffffffffffff, 0xffffffffffffffff
+.L63:
+	.quad	0x6363636363636363, 0x6363636363636363
+.LSWPUP:	# byte-swap upper dword
+	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:		# counter increment constants
+	.quad	0x0000000000000000, 0x0000000100000000
+.LADD2:
+	.quad	0x0000000000000000, 0x0000000200000000
+.LADD3:
+	.quad	0x0000000000000000, 0x0000000300000000
+.LADD4:
+	.quad	0x0000000000000000, 0x0000000400000000
+.LADD5:
+	.quad	0x0000000000000000, 0x0000000500000000
+.LADD6:
+	.quad	0x0000000000000000, 0x0000000600000000
+.LADD7:
+	.quad	0x0000000000000000, 0x0000000700000000
+.LADD8:
+	.quad	0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+	.long	0x87,0,1,0
+.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align	64
+.size	_bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	mov	160($context),%rax	# pull context->Rbp
+
+	lea	0x40(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0xa0(%rax),%rax		# adjust stack pointer
+
+	mov	0x70(%rax),%rbp
+	mov	0x68(%rax),%rbx
+	mov	0x60(%rax),%r12
+	mov	0x58(%rax),%r13
+	mov	0x50(%rax),%r14
+	mov	0x48(%rax),%r15
+	lea	0x78(%rax),%rax		# adjust stack pointer
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	%rax,152($context)	# restore context->Rsp
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+___
+$code.=<<___ if ($ecb);
+	.rva	.Lecb_enc_prologue
+	.rva	.Lecb_enc_epilogue
+	.rva	.Lecb_enc_info
+
+	.rva	.Lecb_dec_prologue
+	.rva	.Lecb_dec_epilogue
+	.rva	.Lecb_dec_info
+___
+$code.=<<___;
+	.rva	.Lcbc_dec_prologue
+	.rva	.Lcbc_dec_epilogue
+	.rva	.Lcbc_dec_info
+
+	.rva	.Lctr_enc_prologue
+	.rva	.Lctr_enc_epilogue
+	.rva	.Lctr_enc_info
+
+	.rva	.Lxts_enc_prologue
+	.rva	.Lxts_enc_epilogue
+	.rva	.Lxts_enc_info
+
+	.rva	.Lxts_dec_prologue
+	.rva	.Lxts_dec_epilogue
+	.rva	.Lxts_dec_info
+
+.section	.xdata
+.align	8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
+.Lecb_dec_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
+.Lctr_enc_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
+.Lxts_enc_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
+.Lxts_dec_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;

diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 0000000..84a6f6d
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86.pl

@@ -0,0 +1,901 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
+# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-586.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86.pl column - [also
+# large-block CBC] encrypt/decrypt.
+#
+#		aes-586.pl		vpaes-x86.pl
+#
+# Core 2(**)	29.1/42.3/18.3		22.0/25.6(***)
+# Nehalem	27.9/40.4/18.1		10.3/12.0
+# Atom		102./119./60.1		64.5/85.3(***)
+#
+# (*)	"Hyper-threading" in the context refers rather to cache shared
+#	among multiple cores, than to specifically Intel HTT. As vast
+#	majority of contemporary cores share cache, slower code path
+#	is common place. In other words "with-hyper-threading-off"
+#	results are presented mostly for reference purposes.
+#
+# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	Less impressive improvement on Core 2 and Atom is due to slow
+#	pshufb,	yet it's respectable +32%/65%  improvement on Core 2
+#	and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+#	code path).
+#
+#						<[email protected]>
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$PREFIX="vpaes";
+
+my  ($round, $base, $magic, $key, $const, $inp, $out)=
+    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
+
+&static_label("_vpaes_consts");
+&static_label("_vpaes_schedule_low_round");
+
+&set_label("_vpaes_consts",64);
+$k_inv=-0x30;		# inv, inva
+	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
+	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
+
+$k_s0F=-0x10;		# s0F
+	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
+
+$k_ipt=0x00;		# input transform (lo, hi)
+	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
+	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
+
+$k_sb1=0x20;		# sb1u, sb1t
+	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
+	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
+$k_sb2=0x40;		# sb2u, sb2t
+	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
+	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
+$k_sbo=0x60;		# sbou, sbot
+	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
+	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
+
+$k_mc_forward=0x80;	# mc_forward
+	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
+	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
+	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
+	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
+
+$k_mc_backward=0xc0;	# mc_backward
+	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
+	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
+	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
+	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
+
+$k_sr=0x100;		# sr
+	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
+	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
+	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
+	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
+
+$k_rcon=0x140;		# rcon
+	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
+
+$k_s63=0x150;		# s63: all equal to 0x63 transformed
+	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
+
+$k_opt=0x160;		# output transform
+	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
+	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
+
+$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
+	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
+	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
+##
+##  Decryption stuff
+##  Key schedule constants
+##
+$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
+	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
+	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
+$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
+	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
+	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
+$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
+	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
+	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
+$k_dks9=0x200;		# decryption key schedule: invskew x*9
+	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
+	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
+
+##
+##  Decryption stuff
+##  Round function constants
+##
+$k_dipt=0x220;		# decryption input transform
+	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
+	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
+
+$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
+	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
+	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
+$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
+	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
+	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
+$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
+	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
+	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
+$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
+	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
+	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
+$k_dsbo=0x2c0;		# decryption sbox final output
+	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
+	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
+&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
+&align	(64);
+
+&function_begin_B("_vpaes_preheat");
+	&add	($const,&DWP(0,"esp"));
+	&movdqa	("xmm7",&QWP($k_inv,$const));
+	&movdqa	("xmm6",&QWP($k_s0F,$const));
+	&ret	();
+&function_end_B("_vpaes_preheat");
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm6-%xmm7 as in _vpaes_preheat
+##    (%edx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
+##
+##
+&function_begin_B("_vpaes_encrypt_core");
+	&mov	($magic,16);
+	&mov	($round,&DWP(240,$key));
+	&movdqa	("xmm1","xmm6")
+	&movdqa	("xmm2",&QWP($k_ipt,$const));
+	&pandn	("xmm1","xmm0");
+	&movdqu	("xmm5",&QWP(0,$key));
+	&psrld	("xmm1",4);
+	&pand	("xmm0","xmm6");
+	&pshufb	("xmm2","xmm0");
+	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
+	&pshufb	("xmm0","xmm1");
+	&pxor	("xmm2","xmm5");
+	&pxor	("xmm0","xmm2");
+	&add	($key,16);
+	&lea	($base,&DWP($k_mc_backward,$const));
+	&jmp	(&label("enc_entry"));
+
+
+&set_label("enc_loop",16);
+	# middle of middle round
+	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
+	&pshufb	("xmm4","xmm2");		# 4 = sb1u
+	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
+	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&pxor	("xmm0","xmm4");		# 0 = A
+	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 4 : sb2u
+	&pshufb	("xmm5","xmm2");		# 4 = sb2u
+	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
+	&pshufb	("xmm2","xmm3");		# 2 = sb2t
+	&pxor	("xmm2","xmm5");		# 2 = 2A
+	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
+	&movdqa	("xmm3","xmm0");		# 3 = A
+	&pshufb	("xmm0","xmm1");		# 0 = B
+	&add	($key,16);			# next key
+	&pxor	("xmm0","xmm2");		# 0 = 2A+B
+	&pshufb	("xmm3","xmm4");		# 3 = D
+	&add	($magic,16);			# next mc
+	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
+	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
+	&and	($magic,0x30);			# ... mod 4
+	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
+	&sub	($round,1);			# nr--
+
+&set_label("enc_entry");
+	# top of round
+	&movdqa	("xmm1","xmm6");		# 1 : i
+	&pandn	("xmm1","xmm0");		# 1 = i<<4
+	&psrld	("xmm1",4);			# 1 = i
+	&pand	("xmm0","xmm6");		# 0 = k
+	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
+	&pshufb	("xmm5","xmm0");		# 2 = a/k
+	&pxor	("xmm0","xmm1");		# 0 = j
+	&movdqa	("xmm3","xmm7");		# 3 : 1/i
+	&pshufb	("xmm3","xmm1");		# 3 = 1/i
+	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
+	&movdqa	("xmm4","xmm7");		# 4 : 1/j
+	&pshufb	("xmm4","xmm0");		# 4 = 1/j
+	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
+	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
+	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
+	&pxor	("xmm2","xmm0");		# 2 = io
+	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
+	&movdqu	("xmm5",&QWP(0,$key));
+	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
+	&pxor	("xmm3","xmm1");		# 3 = jo
+	&jnz	(&label("enc_loop"));
+
+	# middle of last round
+	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 3 : sbou      .Lk_sbo
+	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot      .Lk_sbo+16
+	&pshufb	("xmm4","xmm2");		# 4 = sbou
+	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
+	&pxor	("xmm0","xmm4");		# 0 = A
+	&pshufb	("xmm0","xmm1");
+	&ret	();
+&function_end_B("_vpaes_encrypt_core");
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+&function_begin_B("_vpaes_decrypt_core");
+	&mov	($round,&DWP(240,$key));
+	&lea	($base,&DWP($k_dsbd,$const));
+	&movdqa	("xmm1","xmm6");
+	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
+	&pandn	("xmm1","xmm0");
+	&mov	($magic,$round);
+	&psrld	("xmm1",4)
+	&movdqu	("xmm5",&QWP(0,$key));
+	&shl	($magic,4);
+	&pand	("xmm0","xmm6");
+	&pshufb	("xmm2","xmm0");
+	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
+	&xor	($magic,0x30);
+	&pshufb	("xmm0","xmm1");
+	&and	($magic,0x30);
+	&pxor	("xmm2","xmm5");
+	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
+	&pxor	("xmm0","xmm2");
+	&add	($key,16);
+	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
+	&jmp	(&label("dec_entry"));
+
+&set_label("dec_loop",16);
+##
+##  Inverse mix columns
+##
+	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
+	&pshufb	("xmm4","xmm2");		# 4 = sb9u
+	&pxor	("xmm4","xmm0");
+	&movdqa	("xmm0",&QWP(-0x10,$base));	# 0 : sb9t
+	&pshufb	("xmm0","xmm3");		# 0 = sb9t
+	&pxor	("xmm0","xmm4");		# 0 = ch
+	&add	($key,16);			# next round key
+
+	&pshufb	("xmm0","xmm5");		# MC ch
+	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
+	&pshufb	("xmm4","xmm2");		# 4 = sbdu
+	&pxor	("xmm4","xmm0");		# 4 = ch
+	&movdqa	("xmm0",&QWP(0x10,$base));	# 0 : sbdt
+	&pshufb	("xmm0","xmm3");		# 0 = sbdt
+	&pxor	("xmm0","xmm4");		# 0 = ch
+	&sub	($round,1);			# nr--
+
+	&pshufb	("xmm0","xmm5");		# MC ch
+	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
+	&pshufb	("xmm4","xmm2");		# 4 = sbbu
+	&pxor	("xmm4","xmm0");		# 4 = ch
+	&movdqa	("xmm0",&QWP(0x30,$base));	# 0 : sbbt
+	&pshufb	("xmm0","xmm3");		# 0 = sbbt
+	&pxor	("xmm0","xmm4");		# 0 = ch
+
+	&pshufb	("xmm0","xmm5");		# MC ch
+	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
+	&pshufb	("xmm4","xmm2");		# 4 = sbeu
+	&pxor	("xmm4","xmm0");		# 4 = ch
+	&movdqa	("xmm0",&QWP(0x50,$base));	# 0 : sbet
+	&pshufb	("xmm0","xmm3");		# 0 = sbet
+	&pxor	("xmm0","xmm4");		# 0 = ch
+
+	&palignr("xmm5","xmm5",12);
+
+&set_label("dec_entry");
+	# top of round
+	&movdqa	("xmm1","xmm6");		# 1 : i
+	&pandn	("xmm1","xmm0");		# 1 = i<<4
+	&psrld	("xmm1",4);			# 1 = i
+	&pand	("xmm0","xmm6");		# 0 = k
+	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+	&pshufb	("xmm2","xmm0");		# 2 = a/k
+	&pxor	("xmm0","xmm1");		# 0 = j
+	&movdqa	("xmm3","xmm7");		# 3 : 1/i
+	&pshufb	("xmm3","xmm1");		# 3 = 1/i
+	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
+	&movdqa	("xmm4","xmm7");		# 4 : 1/j
+	&pshufb	("xmm4","xmm0");		# 4 = 1/j
+	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
+	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
+	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
+	&pxor	("xmm2","xmm0");		# 2 = io
+	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
+	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
+	&pxor	("xmm3","xmm1");		# 3 = jo
+	&movdqu	("xmm0",&QWP(0,$key));
+	&jnz	(&label("dec_loop"));
+
+	# middle of last round
+	&movdqa	("xmm4",&QWP(0x60,$base));	# 3 : sbou
+	&pshufb	("xmm4","xmm2");		# 4 = sbou
+	&pxor	("xmm4","xmm0");		# 4 = sb1u + k
+	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
+	&movdqa	("xmm2",&QWP(0,$magic));
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&pxor	("xmm0","xmm4");		# 0 = A
+	&pshufb	("xmm0","xmm2");
+	&ret	();
+&function_end_B("_vpaes_decrypt_core");
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+&function_begin_B("_vpaes_schedule_core");
+	&add	($const,&DWP(0,"esp"));
+	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
+	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon
+
+	# input transform
+	&movdqa	("xmm3","xmm0");
+	&lea	($base,&DWP($k_ipt,$const));
+	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
+	&call	("_vpaes_schedule_transform");
+	&movdqa	("xmm7","xmm0");
+
+	&test	($out,$out);
+	&jnz	(&label("schedule_am_decrypting"));
+
+	# encrypting, output zeroth round key after transform
+	&movdqu	(&QWP(0,$key),"xmm0");
+	&jmp	(&label("schedule_go"));
+
+&set_label("schedule_am_decrypting");
+	# decrypting, output zeroth round key after shiftrows
+	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
+	&pshufb	("xmm3","xmm1");
+	&movdqu	(&QWP(0,$key),"xmm3");
+	&xor	($magic,0x30);
+
+&set_label("schedule_go");
+	&cmp	($round,192);
+	&ja	(&label("schedule_256"));
+	&je	(&label("schedule_192"));
+	# 128: fall though
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+&set_label("schedule_128");
+	&mov	($round,10);
+
+&set_label("loop_schedule_128");
+	&call	("_vpaes_schedule_round");
+	&dec	($round);
+	&jz	(&label("schedule_mangle_last"));
+	&call	("_vpaes_schedule_mangle");	# write output
+	&jmp	(&label("loop_schedule_128"));
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+&set_label("schedule_192",16);
+	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
+	&call	("_vpaes_schedule_transform");	# input transform	
+	&movdqa	("xmm6","xmm0");		# save short part
+	&pxor	("xmm4","xmm4");		# clear 4
+	&movhlps("xmm6","xmm4");		# clobber low side with zeros
+	&mov	($round,4);
+
+&set_label("loop_schedule_192");
+	&call	("_vpaes_schedule_round");
+	&palignr("xmm0","xmm6",8);
+	&call	("_vpaes_schedule_mangle");	# save key n
+	&call	("_vpaes_schedule_192_smear");
+	&call	("_vpaes_schedule_mangle");	# save key n+1
+	&call	("_vpaes_schedule_round");
+	&dec	($round);
+	&jz	(&label("schedule_mangle_last"));
+	&call	("_vpaes_schedule_mangle");	# save key n+2
+	&call	("_vpaes_schedule_192_smear");
+	&jmp	(&label("loop_schedule_192"));
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+&set_label("schedule_256",16);
+	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
+	&call	("_vpaes_schedule_transform");	# input transform	
+	&mov	($round,7);
+
+&set_label("loop_schedule_256");
+	&call	("_vpaes_schedule_mangle");	# output low result
+	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6
+
+	# high round
+	&call	("_vpaes_schedule_round");
+	&dec	($round);
+	&jz	(&label("schedule_mangle_last"));
+	&call	("_vpaes_schedule_mangle");	
+
+	# low round. swap xmm7 and xmm6
+	&pshufd	("xmm0","xmm0",0xFF);
+	&movdqa	(&QWP(20,"esp"),"xmm7");
+	&movdqa	("xmm7","xmm6");
+	&call	("_vpaes_schedule_low_round");
+	&movdqa	("xmm7",&QWP(20,"esp"));
+
+	&jmp	(&label("loop_schedule_256"));
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+&set_label("schedule_mangle_last",16);
+	# schedule last round key from xmm0
+	&lea	($base,&DWP($k_deskew,$const));
+	&test	($out,$out);
+	&jnz	(&label("schedule_mangle_last_dec"));
+
+	# encrypting
+	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
+	&pshufb	("xmm0","xmm1");		# output permute
+	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
+	&add	($key,32);
+
+&set_label("schedule_mangle_last_dec");
+	&add	($key,-16);
+	&pxor	("xmm0",&QWP($k_s63,$const));
+	&call	("_vpaes_schedule_transform");	# output transform
+	&movdqu	(&QWP(0,$key),"xmm0");		# save last key
+
+	# cleanup
+	&pxor	("xmm0","xmm0");
+	&pxor	("xmm1","xmm1");
+	&pxor	("xmm2","xmm2");
+	&pxor	("xmm3","xmm3");
+	&pxor	("xmm4","xmm4");
+	&pxor	("xmm5","xmm5");
+	&pxor	("xmm6","xmm6");
+	&pxor	("xmm7","xmm7");
+	&ret	();
+&function_end_B("_vpaes_schedule_core");
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+&function_begin_B("_vpaes_schedule_192_smear");
+	&pshufd	("xmm0","xmm6",0x80);		# d c 0 0 -> c 0 0 0
+	&pxor	("xmm6","xmm0");		# -> c+d c 0 0
+	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
+	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
+	&movdqa	("xmm0","xmm6");
+	&pxor	("xmm1","xmm1");
+	&movhlps("xmm6","xmm1");		# clobber low side with zeros
+	&ret	();
+&function_end_B("_vpaes_schedule_192_smear");
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm5.
+##
+&function_begin_B("_vpaes_schedule_round");
+	# extract rcon from xmm8
+	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
+	&pxor	("xmm1","xmm1");
+	&palignr("xmm1","xmm2",15);
+	&palignr("xmm2","xmm2",15);
+	&pxor	("xmm7","xmm1");
+
+	# rotate
+	&pshufd	("xmm0","xmm0",0xFF);
+	&palignr("xmm0","xmm0",1);
+
+	# fall through...
+	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8
+
+	# low round: same as high round, but no rotation and no rcon.
+&set_label("_vpaes_schedule_low_round");
+	# smear xmm7
+	&movdqa	("xmm1","xmm7");
+	&pslldq	("xmm7",4);
+	&pxor	("xmm7","xmm1");
+	&movdqa	("xmm1","xmm7");
+	&pslldq	("xmm7",8);
+	&pxor	("xmm7","xmm1");
+	&pxor	("xmm7",&QWP($k_s63,$const));
+
+	# subbyte
+	&movdqa	("xmm4",&QWP($k_s0F,$const));
+	&movdqa	("xmm5",&QWP($k_inv,$const));	# 4 : 1/j
+	&movdqa	("xmm1","xmm4");	
+	&pandn	("xmm1","xmm0");
+	&psrld	("xmm1",4);			# 1 = i
+	&pand	("xmm0","xmm4");		# 0 = k
+	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+	&pshufb	("xmm2","xmm0");		# 2 = a/k
+	&pxor	("xmm0","xmm1");		# 0 = j
+	&movdqa	("xmm3","xmm5");		# 3 : 1/i
+	&pshufb	("xmm3","xmm1");		# 3 = 1/i
+	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
+	&movdqa	("xmm4","xmm5");		# 4 : 1/j
+	&pshufb	("xmm4","xmm0");		# 4 = 1/j
+	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
+	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
+	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
+	&pxor	("xmm2","xmm0");		# 2 = io
+	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
+	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
+	&pxor	("xmm3","xmm1");		# 3 = jo
+	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sbou
+	&pshufb	("xmm4","xmm2");		# 4 = sbou
+	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&pxor	("xmm0","xmm4");		# 0 = sbox output
+
+	# add in smeared stuff
+	&pxor	("xmm0","xmm7");
+	&movdqa	("xmm7","xmm0");
+	&ret	();
+&function_end_B("_vpaes_schedule_round");
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%ebx)
+##
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+&function_begin_B("_vpaes_schedule_transform");
+	&movdqa	("xmm2",&QWP($k_s0F,$const));
+	&movdqa	("xmm1","xmm2");
+	&pandn	("xmm1","xmm0");
+	&psrld	("xmm1",4);
+	&pand	("xmm0","xmm2");
+	&movdqa	("xmm2",&QWP(0,$base));
+	&pshufb	("xmm2","xmm0");
+	&movdqa	("xmm0",&QWP(16,$base));
+	&pshufb	("xmm0","xmm1");
+	&pxor	("xmm0","xmm2");
+	&ret	();
+&function_end_B("_vpaes_schedule_transform");
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%edx), and increments or decrements it
+##  Keeps track of round number mod 4 in %ecx
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+&function_begin_B("_vpaes_schedule_mangle");
+	&movdqa	("xmm4","xmm0");	# save xmm0 for later
+	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
+	&test	($out,$out);
+	&jnz	(&label("schedule_mangle_dec"));
+
+	# encrypting
+	&add	($key,16);
+	&pxor	("xmm4",&QWP($k_s63,$const));
+	&pshufb	("xmm4","xmm5");
+	&movdqa	("xmm3","xmm4");
+	&pshufb	("xmm4","xmm5");
+	&pxor	("xmm3","xmm4");
+	&pshufb	("xmm4","xmm5");
+	&pxor	("xmm3","xmm4");
+
+	&jmp	(&label("schedule_mangle_both"));
+
+&set_label("schedule_mangle_dec",16);
+	# inverse mix columns
+	&movdqa	("xmm2",&QWP($k_s0F,$const));
+	&lea	($inp,&DWP($k_dksd,$const));
+	&movdqa	("xmm1","xmm2");
+	&pandn	("xmm1","xmm4");
+	&psrld	("xmm1",4);			# 1 = hi
+	&pand	("xmm4","xmm2");		# 4 = lo
+
+	&movdqa	("xmm2",&QWP(0,$inp));
+	&pshufb	("xmm2","xmm4");
+	&movdqa	("xmm3",&QWP(0x10,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+	&pshufb	("xmm3","xmm5");
+
+	&movdqa	("xmm2",&QWP(0x20,$inp));
+	&pshufb	("xmm2","xmm4");
+	&pxor	("xmm2","xmm3");
+	&movdqa	("xmm3",&QWP(0x30,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+	&pshufb	("xmm3","xmm5");
+
+	&movdqa	("xmm2",&QWP(0x40,$inp));
+	&pshufb	("xmm2","xmm4");
+	&pxor	("xmm2","xmm3");
+	&movdqa	("xmm3",&QWP(0x50,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+	&pshufb	("xmm3","xmm5");
+
+	&movdqa	("xmm2",&QWP(0x60,$inp));
+	&pshufb	("xmm2","xmm4");
+	&pxor	("xmm2","xmm3");
+	&movdqa	("xmm3",&QWP(0x70,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+
+	&add	($key,-16);
+
+&set_label("schedule_mangle_both");
+	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
+	&pshufb	("xmm3","xmm1");
+	&add	($magic,-16);
+	&and	($magic,0x30);
+	&movdqu	(&QWP(0,$key),"xmm3");
+	&ret	();
+&function_end_B("_vpaes_schedule_mangle");
+
+#
+# Interface to OpenSSL
+#
+&function_begin("${PREFIX}_set_encrypt_key");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($round,&wparam(1));		# bits
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&mov	($base,$round);
+	&shr	($base,5);
+	&add	($base,5);
+	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
+	&mov	($magic,0x30);
+	&mov	($out,0);
+
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_schedule_core");
+&set_label("pic_point");
+
+	&mov	("esp",&DWP(48,"esp"));
+	&xor	("eax","eax");
+&function_end("${PREFIX}_set_encrypt_key");
+
+&function_begin("${PREFIX}_set_decrypt_key");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($round,&wparam(1));		# bits
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&mov	($base,$round);
+	&shr	($base,5);
+	&add	($base,5);
+	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
+	&shl	($base,4);
+	&lea	($key,&DWP(16,$key,$base));
+
+	&mov	($out,1);
+	&mov	($magic,$round);
+	&shr	($magic,1);
+	&and	($magic,32);
+	&xor	($magic,32);			# nbist==192?0:32;
+
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_schedule_core");
+&set_label("pic_point");
+
+	&mov	("esp",&DWP(48,"esp"));
+	&xor	("eax","eax");
+&function_end("${PREFIX}_set_decrypt_key");
+
+&function_begin("${PREFIX}_encrypt");
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_preheat");
+&set_label("pic_point");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($out,&wparam(1));		# out
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&movdqu	("xmm0",&QWP(0,$inp));
+	&call	("_vpaes_encrypt_core");
+	&movdqu	(&QWP(0,$out),"xmm0");
+
+	&mov	("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_encrypt");
+
+&function_begin("${PREFIX}_decrypt");
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_preheat");
+&set_label("pic_point");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($out,&wparam(1));		# out
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&movdqu	("xmm0",&QWP(0,$inp));
+	&call	("_vpaes_decrypt_core");
+	&movdqu	(&QWP(0,$out),"xmm0");
+
+	&mov	("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_decrypt");
+
+&function_begin("${PREFIX}_cbc_encrypt");
+	&mov	($inp,&wparam(0));		# inp
+	&mov	($out,&wparam(1));		# out
+	&mov	($round,&wparam(2));		# len
+	&mov	($key,&wparam(3));		# key
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($const,&wparam(4));		# ivp
+	&and	($base,-16);
+	&mov	($magic,&wparam(5));		# enc
+	&xchg	($base,"esp");			# alloca
+	&movdqu	("xmm1",&QWP(0,$const));	# load IV
+	&sub	($out,$inp);
+	&mov	(&DWP(48,"esp"),$base);
+
+	&mov	(&DWP(0,"esp"),$out);		# save out
+	&sub	($round,16);
+	&mov	(&DWP(4,"esp"),$key)		# save key
+	&mov	(&DWP(8,"esp"),$const);		# save ivp
+	&mov	($out,$round);			# $out works as $len
+
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_preheat");
+&set_label("pic_point");
+	&cmp	($magic,0);
+	&je	(&label("cbc_dec_loop"));
+	&jmp	(&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+	&movdqu	("xmm0",&QWP(0,$inp));		# load input
+	&pxor	("xmm0","xmm1");		# inp^=iv
+	&call	("_vpaes_encrypt_core");
+	&mov	($base,&DWP(0,"esp"));		# restore out
+	&mov	($key,&DWP(4,"esp"));		# restore key
+	&movdqa	("xmm1","xmm0");
+	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
+	&lea	($inp,&DWP(16,$inp));
+	&sub	($out,16);
+	&jnc	(&label("cbc_enc_loop"));
+	&jmp	(&label("cbc_done"));
+
+&set_label("cbc_dec_loop",16);
+	&movdqu	("xmm0",&QWP(0,$inp));		# load input
+	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
+	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
+	&call	("_vpaes_decrypt_core");
+	&mov	($base,&DWP(0,"esp"));		# restore out
+	&mov	($key,&DWP(4,"esp"));		# restore key
+	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
+	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
+	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
+	&lea	($inp,&DWP(16,$inp));
+	&sub	($out,16);
+	&jnc	(&label("cbc_dec_loop"));
+
+&set_label("cbc_done");
+	&mov	($base,&DWP(8,"esp"));		# restore ivp
+	&mov	("esp",&DWP(48,"esp"));
+	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
+&function_end("${PREFIX}_cbc_encrypt");
+
+&asm_finish();

diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 0000000..0254702
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86_64.pl

@@ -0,0 +1,1204 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86_64.pl column -
+# [also large-block CBC] encrypt/decrypt.
+#
+#		aes-x86_64.pl		vpaes-x86_64.pl
+#
+# Core 2(**)	30.5/43.7/14.3		21.8/25.7(***)
+# Nehalem	30.5/42.2/14.6		 9.8/11.8
+# Atom		63.9/79.0/32.1		64.0/84.8(***)
+#
+# (*)	"Hyper-threading" in the context refers rather to cache shared
+#	among multiple cores, than to specifically Intel HTT. As vast
+#	majority of contemporary cores share cache, slower code path
+#	is common place. In other words "with-hyper-threading-off"
+#	results are presented mostly for reference purposes.
+#
+# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	Less impressive improvement on Core 2 and Atom is due to slow
+#	pshufb,	yet it's respectable +40%/78% improvement on Core 2
+#	(as implied, over "hyper-threading-safe" code path).
+#
+#						<[email protected]>
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type	_vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+	mov	%rdx,	%r9
+	mov	\$16,	%r11
+	mov	240(%rdx),%eax
+	movdqa	%xmm9,	%xmm1
+	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
+	pandn	%xmm0,	%xmm1
+	movdqu	(%r9),	%xmm5		# round0 key
+	psrld	\$4,	%xmm1
+	pand	%xmm9,	%xmm0
+	pshufb	%xmm0,	%xmm2
+	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
+	pshufb	%xmm1,	%xmm0
+	pxor	%xmm5,	%xmm2
+	pxor	%xmm2,	%xmm0
+	add	\$16,	%r9
+	lea	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align 16
+.Lenc_loop:
+	# middle of middle round
+	movdqa  %xmm13,	%xmm4	# 4 : sb1u
+	pshufb  %xmm2,	%xmm4	# 4 = sb1u
+	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
+	movdqa  %xmm12,	%xmm0	# 0 : sb1t
+	pshufb  %xmm3,	%xmm0	# 0 = sb1t
+	pxor	%xmm4,	%xmm0	# 0 = A
+	movdqa  %xmm15,	%xmm5	# 4 : sb2u
+	pshufb	%xmm2,	%xmm5	# 4 = sb2u
+	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
+	movdqa	%xmm14, %xmm2	# 2 : sb2t
+	pshufb	%xmm3,  %xmm2	# 2 = sb2t
+	pxor	%xmm5,	%xmm2	# 2 = 2A
+	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
+	movdqa	%xmm0,  %xmm3	# 3 = A
+	pshufb  %xmm1,  %xmm0	# 0 = B
+	add	\$16,	%r9	# next key
+	pxor	%xmm2,  %xmm0	# 0 = 2A+B
+	pshufb	%xmm4,	%xmm3	# 3 = D
+	add	\$16,	%r11	# next mc
+	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
+	pshufb  %xmm1,	%xmm0	# 0 = 2B+C
+	and	\$0x30,	%r11	# ... mod 4
+	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D
+	sub	\$1,%rax	# nr--
+
+.Lenc_entry:
+	# top of round
+	movdqa  %xmm9, 	%xmm1	# 1 : i
+	pandn	%xmm0, 	%xmm1	# 1 = i<<4
+	psrld	\$4,   	%xmm1   # 1 = i
+	pand	%xmm9, 	%xmm0   # 0 = k
+	movdqa	%xmm11, %xmm5	# 2 : a/k
+	pshufb  %xmm0,  %xmm5	# 2 = a/k
+	pxor	%xmm1,	%xmm0	# 0 = j
+	movdqa	%xmm10,	%xmm3  	# 3 : 1/i
+	pshufb  %xmm1, 	%xmm3  	# 3 = 1/i
+	pxor	%xmm5, 	%xmm3  	# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4  	# 4 : 1/j
+	pshufb	%xmm0, 	%xmm4  	# 4 = 1/j
+	pxor	%xmm5, 	%xmm4  	# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2  	# 2 : 1/iak
+	pshufb  %xmm3,	%xmm2  	# 2 = 1/iak
+	pxor	%xmm0, 	%xmm2  	# 2 = io
+	movdqa	%xmm10, %xmm3   # 3 : 1/jak
+	movdqu	(%r9),	%xmm5
+	pshufb  %xmm4,  %xmm3   # 3 = 1/jak
+	pxor	%xmm1,  %xmm3   # 3 = jo
+	jnz	.Lenc_loop
+
+	# middle of last round
+	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	pshufb  %xmm2,  %xmm4	# 4 = sbou
+	pxor	%xmm5,  %xmm4	# 4 = sb1u + k
+	pshufb  %xmm3,	%xmm0	# 0 = sb1t
+	movdqa	0x40(%r11,%r10), %xmm1		# .Lk_sr[]
+	pxor	%xmm4,	%xmm0	# 0 = A
+	pshufb	%xmm1,	%xmm0
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+	
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+.type	_vpaes_decrypt_core,\@abi-omnipotent
+.align	16
+_vpaes_decrypt_core:
+	mov	%rdx,	%r9		# load key
+	mov	240(%rdx),%eax
+	movdqa	%xmm9,	%xmm1
+	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	pandn	%xmm0,	%xmm1
+	mov	%rax,	%r11
+	psrld	\$4,	%xmm1
+	movdqu	(%r9),	%xmm5		# round0 key
+	shl	\$4,	%r11
+	pand	%xmm9,	%xmm0
+	pshufb	%xmm0,	%xmm2
+	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
+	xor	\$0x30,	%r11
+	lea	.Lk_dsbd(%rip),%r10
+	pshufb	%xmm1,	%xmm0
+	and	\$0x30,	%r11
+	pxor	%xmm5,	%xmm2
+	movdqa	.Lk_mc_forward+48(%rip), %xmm5
+	pxor	%xmm2,	%xmm0
+	add	\$16,	%r9
+	add	%r10,	%r11
+	jmp	.Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+##  Inverse mix columns
+##
+	movdqa  -0x20(%r10),%xmm4	# 4 : sb9u
+	pshufb	%xmm2,	%xmm4		# 4 = sb9u
+	pxor	%xmm0,	%xmm4
+	movdqa  -0x10(%r10),%xmm0	# 0 : sb9t
+	pshufb	%xmm3,	%xmm0		# 0 = sb9t
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	add	\$16, %r9		# next round key
+
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa  0x00(%r10),%xmm4	# 4 : sbdu
+	pshufb	%xmm2,	%xmm4		# 4 = sbdu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa  0x10(%r10),%xmm0	# 0 : sbdt
+	pshufb	%xmm3,	%xmm0		# 0 = sbdt
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	sub	\$1,%rax		# nr--
+	
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
+	pshufb	%xmm2,	%xmm4		# 4 = sbbu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa  0x30(%r10),%xmm0	# 0 : sbbt
+	pshufb	%xmm3,	%xmm0		# 0 = sbbt
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa  0x40(%r10),%xmm4	# 4 : sbeu
+	pshufb	%xmm2,	%xmm4		# 4 = sbeu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa  0x50(%r10),%xmm0	# 0 : sbet
+	pshufb	%xmm3,	%xmm0		# 0 = sbet
+	pxor	%xmm4,	%xmm0		# 0 = ch
+
+	palignr	\$12,	%xmm5,	%xmm5
+	
+.Ldec_entry:
+	# top of round
+	movdqa  %xmm9, 	%xmm1	# 1 : i
+	pandn	%xmm0, 	%xmm1	# 1 = i<<4
+	psrld	\$4,    %xmm1	# 1 = i
+	pand	%xmm9, 	%xmm0	# 0 = k
+	movdqa	%xmm11, %xmm2	# 2 : a/k
+	pshufb  %xmm0,  %xmm2	# 2 = a/k
+	pxor	%xmm1,	%xmm0	# 0 = j
+	movdqa	%xmm10,	%xmm3	# 3 : 1/i
+	pshufb  %xmm1, 	%xmm3	# 3 = 1/i
+	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4	# 4 : 1/j
+	pshufb	%xmm0, 	%xmm4	# 4 = 1/j
+	pxor	%xmm2, 	%xmm4	# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
+	pshufb  %xmm3,	%xmm2	# 2 = 1/iak
+	pxor	%xmm0, 	%xmm2	# 2 = io
+	movdqa	%xmm10, %xmm3	# 3 : 1/jak
+	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
+	pxor	%xmm1,  %xmm3	# 3 = jo
+	movdqu	(%r9),	%xmm0
+	jnz	.Ldec_loop
+
+	# middle of last round
+	movdqa	0x60(%r10), %xmm4	# 3 : sbou
+	pshufb  %xmm2,  %xmm4	# 4 = sbou
+	pxor	%xmm0,  %xmm4	# 4 = sb1u + k
+	movdqa	0x70(%r10), %xmm0	# 0 : sbot
+	movdqa	.Lk_sr-.Lk_dsbd(%r11), %xmm2
+	pshufb  %xmm3,	%xmm0	# 0 = sb1t
+	pxor	%xmm4,	%xmm0	# 0 = A
+	pshufb	%xmm2,	%xmm0
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_schedule_core,\@abi-omnipotent
+.align	16
+_vpaes_schedule_core:
+	# rdi = key
+	# rsi = size in bits
+	# rdx = buffer
+	# rcx = direction.  0=encrypt, 1=decrypt
+
+	call	_vpaes_preheat		# load the tables
+	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
+	movdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	# input transform
+	movdqa	%xmm0,	%xmm3
+	lea	.Lk_ipt(%rip), %r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,	%xmm7
+
+	lea	.Lk_sr(%rip),%r10
+	test	%rcx,	%rcx
+	jnz	.Lschedule_am_decrypting
+
+	# encrypting, output zeroth round key after transform
+	movdqu	%xmm0,	(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	# decrypting, output zeroth round key after shiftrows
+	movdqa	(%r8,%r10),%xmm1
+	pshufb  %xmm1,	%xmm3
+	movdqu	%xmm3,	(%rdx)
+	xor	\$0x30, %r8
+
+.Lschedule_go:
+	cmp	\$192,	%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+	# 128: fall though
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	\$10, %esi
+	
+.Loop_schedule_128:
+	call 	_vpaes_schedule_round
+	dec	%rsi
+	jz 	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	# write output
+	jmp 	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	call	_vpaes_schedule_transform	# input transform
+	movdqa	%xmm0,	%xmm6		# save short part
+	pxor	%xmm4,	%xmm4		# clear 4
+	movhlps	%xmm4,	%xmm6		# clobber low side with zeros
+	mov	\$4,	%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+	palignr	\$8,%xmm6,%xmm0	
+	call	_vpaes_schedule_mangle	# save key n
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle	# save key n+1
+	call	_vpaes_schedule_round
+	dec	%rsi
+	jz 	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	# save key n+2
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	call	_vpaes_schedule_transform	# input transform
+	mov	\$7, %esi
+	
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle	# output low result
+	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	# high round
+	call	_vpaes_schedule_round
+	dec	%rsi
+	jz 	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	
+
+	# low round. swap xmm7 and xmm6
+	pshufd	\$0xFF,	%xmm0,	%xmm0
+	movdqa	%xmm7,	%xmm5
+	movdqa	%xmm6,	%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,	%xmm7
+	
+	jmp	.Loop_schedule_256
+
+	
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	16
+.Lschedule_mangle_last:
+	# schedule last round key from xmm0
+	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	test	%rcx, 	%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+	# encrypting
+	movdqa	(%r8,%r10),%xmm1
+	pshufb	%xmm1,	%xmm0		# output permute
+	lea	.Lk_opt(%rip),	%r11	# prepare to output transform
+	add	\$32,	%rdx
+
+.Lschedule_mangle_last_dec:
+	add	\$-16,	%rdx
+	pxor	.Lk_s63(%rip),	%xmm0
+	call	_vpaes_schedule_transform # output transform
+	movdqu	%xmm0,	(%rdx)		# save last key
+
+	# cleanup
+	pxor	%xmm0,  %xmm0
+	pxor	%xmm1,  %xmm1
+	pxor	%xmm2,  %xmm2
+	pxor	%xmm3,  %xmm3
+	pxor	%xmm4,  %xmm4
+	pxor	%xmm5,  %xmm5
+	pxor	%xmm6,  %xmm6
+	pxor	%xmm7,  %xmm7
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,\@abi-omnipotent
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	\$0x80,	%xmm6,	%xmm0	# d c 0 0 -> c 0 0 0
+	pxor	%xmm0,	%xmm6		# -> c+d c 0 0
+	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
+	movdqa	%xmm6,	%xmm0
+	pxor	%xmm1,	%xmm1
+	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,\@abi-omnipotent
+.align	16
+_vpaes_schedule_round:
+	# extract rcon from xmm8
+	pxor	%xmm1,	%xmm1
+	palignr	\$15,	%xmm8,	%xmm1
+	palignr	\$15,	%xmm8,	%xmm8
+	pxor	%xmm1,	%xmm7
+
+	# rotate
+	pshufd	\$0xFF,	%xmm0,	%xmm0
+	palignr	\$1,	%xmm0,	%xmm0
+	
+	# fall through...
+	
+	# low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	# smear xmm7
+	movdqa	%xmm7,	%xmm1
+	pslldq	\$4,	%xmm7
+	pxor	%xmm1,	%xmm7
+	movdqa	%xmm7,	%xmm1
+	pslldq	\$8,	%xmm7
+	pxor	%xmm1,	%xmm7
+	pxor	.Lk_s63(%rip), %xmm7
+
+	# subbytes
+	movdqa  %xmm9, 	%xmm1
+	pandn	%xmm0, 	%xmm1
+	psrld	\$4,    %xmm1		# 1 = i
+	pand	%xmm9, 	%xmm0		# 0 = k
+	movdqa	%xmm11, %xmm2		# 2 : a/k
+	pshufb  %xmm0,  %xmm2		# 2 = a/k
+	pxor	%xmm1,	%xmm0		# 0 = j
+	movdqa	%xmm10,	%xmm3		# 3 : 1/i
+	pshufb  %xmm1, 	%xmm3		# 3 = 1/i
+	pxor	%xmm2, 	%xmm3		# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4		# 4 : 1/j
+	pshufb	%xmm0, 	%xmm4		# 4 = 1/j
+	pxor	%xmm2, 	%xmm4		# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
+	pshufb  %xmm3,	%xmm2		# 2 = 1/iak
+	pxor	%xmm0, 	%xmm2		# 2 = io
+	movdqa	%xmm10, %xmm3		# 3 : 1/jak
+	pshufb  %xmm4,  %xmm3		# 3 = 1/jak
+	pxor	%xmm1,  %xmm3		# 3 = jo
+	movdqa	%xmm13, %xmm4		# 4 : sbou
+	pshufb  %xmm2,  %xmm4		# 4 = sbou
+	movdqa	%xmm12, %xmm0		# 0 : sbot
+	pshufb  %xmm3,	%xmm0		# 0 = sb1t
+	pxor	%xmm4, 	%xmm0		# 0 = sbox output
+
+	# add in smeared stuff
+	pxor	%xmm7,	%xmm0	
+	movdqa	%xmm0,	%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,\@abi-omnipotent
+.align	16
+_vpaes_schedule_transform:
+	movdqa	%xmm9,	%xmm1
+	pandn	%xmm0,	%xmm1
+	psrld	\$4,	%xmm1
+	pand	%xmm9,	%xmm0
+	movdqa	(%r11), %xmm2 	# lo
+	pshufb	%xmm0,	%xmm2
+	movdqa	16(%r11), %xmm0 # hi
+	pshufb	%xmm1,	%xmm0
+	pxor	%xmm2,	%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,\@abi-omnipotent
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,	%xmm4	# save xmm0 for later
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	test	%rcx, 	%rcx
+	jnz	.Lschedule_mangle_dec
+
+	# encrypting
+	add	\$16,	%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+	pshufb	%xmm5,	%xmm4
+	movdqa	%xmm4,	%xmm3
+	pshufb	%xmm5,	%xmm4
+	pxor	%xmm4,	%xmm3
+	pshufb	%xmm5,	%xmm4
+	pxor	%xmm4,	%xmm3
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+	# inverse mix columns
+	lea	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,	%xmm1
+	pandn	%xmm4,	%xmm1
+	psrld	\$4,	%xmm1	# 1 = hi
+	pand	%xmm9,	%xmm4	# 4 = lo
+
+	movdqa	0x00(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	movdqa	0x10(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x20(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x30(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x40(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x50(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x60(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x70(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+
+	add	\$-16,	%rdx
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10),%xmm1
+	pshufb	%xmm1,%xmm3
+	add	\$-16,	%r8
+	and	\$0x30,	%r8
+	movdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+#
+.globl	${PREFIX}_set_encrypt_key
+.type	${PREFIX}_set_encrypt_key,\@function,3
+.align	16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+	mov	%esi,%eax
+	shr	\$5,%eax
+	add	\$5,%eax
+	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	\$0,%ecx
+	mov	\$0x30,%r8d
+	call	_vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+	xor	%eax,%eax
+	ret
+.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl	${PREFIX}_set_decrypt_key
+.type	${PREFIX}_set_decrypt_key,\@function,3
+.align	16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+	mov	%esi,%eax
+	shr	\$5,%eax
+	add	\$5,%eax
+	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	shl	\$4,%eax
+	lea	16(%rdx,%rax),%rdx
+
+	mov	\$1,%ecx
+	mov	%esi,%r8d
+	shr	\$1,%r8d
+	and	\$32,%r8d
+	xor	\$32,%r8d	# nbits==192?0:32
+	call	_vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+	xor	%eax,%eax
+	ret
+.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl	${PREFIX}_encrypt
+.type	${PREFIX}_encrypt,\@function,3
+.align	16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+	ret
+.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl	${PREFIX}_decrypt
+.type	${PREFIX}_decrypt,\@function,3
+.align	16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+	ret
+.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const void char *inp, unsigned char *out,
+#                       size_t length, const AES_KEY *key,
+#                       unsigned char *ivp,const int enc);
+$code.=<<___;
+.globl	${PREFIX}_cbc_encrypt
+.type	${PREFIX}_cbc_encrypt,\@function,6
+.align	16
+${PREFIX}_cbc_encrypt:
+	xchg	$key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+	movdqu	($ivp),%xmm6		# load IV
+	sub	$inp,$out
+	sub	\$16,$len
+	call	_vpaes_preheat
+	cmp	\$0,${enc}d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	($inp),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,($out,$inp)
+	lea	16($inp),$inp
+	sub	\$16,$len
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	($inp),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,($out,$inp)
+	lea	16($inp),$inp
+	sub	\$16,$len
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,($ivp)		# save IV
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+	ret
+.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+$code.=<<___;
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+.type	_vpaes_preheat,\@abi-omnipotent
+.align	16
+_vpaes_preheat:
+	lea	.Lk_s0F(%rip), %r10
+	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
+	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
+	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
+	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
+	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
+	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
+	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
+	ret
+.size	_vpaes_preheat,.-_vpaes_preheat
+########################################################
+##                                                    ##
+##                     Constants                      ##
+##                                                    ##
+########################################################
+.type	_vpaes_consts,\@object
+.align	64
+_vpaes_consts:
+.Lk_inv:	# inv, inva
+	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:	# s0F
+	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:	# input transform (lo, hi)
+	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:	# sb1u, sb1t
+	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:	# sb2u, sb2t
+	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:	# sbou, sbot
+	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:	# mc_forward
+	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:		# sr
+	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+	.quad	0x0F060D040B020900, 0x070E050C030A0108
+	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:	# rcon
+	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:	# s63: all equal to 0x63 transformed
+	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:	# output transform
+	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:	# deskew tables: inverts the sbox's "skew"
+	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+##  Decryption stuff
+##  Key schedule constants
+##
+.Lk_dksd:	# decryption key schedule: invskew x*D
+	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:	# decryption key schedule: invskew x*B
+	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
+	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:	# decryption key schedule: invskew x*9
+	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+##  Decryption stuff
+##  Round function constants
+##
+.Lk_dipt:	# decryption input transform
+	.quad	0x0F505B040B545F00, 0x154A411E114E451A
+	.quad	0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:	# decryption sbox output *9*u, *9*t
+	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:	# decryption sbox output *D*u, *D*t
+	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:	# decryption sbox output *B*u, *B*t
+	.quad	0xD022649296B44200, 0x602646F6B0F2D404
+	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:	# decryption sbox output *E*u, *E*t
+	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:	# decryption sbox final output
+	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz	"Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align	64
+.size	_vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	lea	16(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0xb8(%rax),%rax		# adjust stack pointer
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
+	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
+	.rva	.LSEH_info_${PREFIX}_set_encrypt_key
+
+	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
+	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
+	.rva	.LSEH_info_${PREFIX}_set_decrypt_key
+
+	.rva	.LSEH_begin_${PREFIX}_encrypt
+	.rva	.LSEH_end_${PREFIX}_encrypt
+	.rva	.LSEH_info_${PREFIX}_encrypt
+
+	.rva	.LSEH_begin_${PREFIX}_decrypt
+	.rva	.LSEH_end_${PREFIX}_decrypt
+	.rva	.LSEH_info_${PREFIX}_decrypt
+
+	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_info_${PREFIX}_cbc_encrypt
+
+.section	.xdata
+.align	8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;

diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
new file mode 100644
index 0000000..5a83107
--- /dev/null
+++ b/crypto/arm_arch.h

@@ -0,0 +1,51 @@
+#ifndef __ARM_ARCH_H__
+#define __ARM_ARCH_H__
+
+#if !defined(__ARM_ARCH__)
+# if defined(__CC_ARM)
+#  define __ARM_ARCH__ __TARGET_ARCH_ARM
+#  if defined(__BIG_ENDIAN)
+#   define __ARMEB__
+#  else
+#   define __ARMEL__
+#  endif
+# elif defined(__GNUC__)
+  /*
+   * Why doesn't gcc define __ARM_ARCH__? Instead it defines
+   * bunch of below macros. See all_architectires[] table in
+   * gcc/config/arm/arm.c. On a side note it defines
+   * __ARMEL__/__ARMEB__ for little-/big-endian.
+   */
+#  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
+	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \
+	defined(__ARM_ARCH_7EM__)
+#   define __ARM_ARCH__ 7
+#  elif	defined(__ARM_ARCH_6__)	|| defined(__ARM_ARCH_6J__)	|| \
+	defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__)	|| \
+	defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__)	|| \
+	defined(__ARM_ARCH_6T2__)
+#   define __ARM_ARCH__ 6
+#  elif	defined(__ARM_ARCH_5__)	|| defined(__ARM_ARCH_5T__)	|| \
+	defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__)	|| \
+	defined(__ARM_ARCH_5TEJ__)
+#   define __ARM_ARCH__ 5
+#  elif	defined(__ARM_ARCH_4__)	|| defined(__ARM_ARCH_4T__)
+#   define __ARM_ARCH__ 4
+#  else
+#   error "unsupported ARM architecture"
+#  endif
+# endif
+#endif
+
+#ifdef OPENSSL_FIPSCANISTER
+#include <openssl/fipssyms.h>
+#endif
+
+#if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+                                     
+#define ARMV7_NEON      (1<<0)
+#define ARMV7_TICK      (1<<1)
+#endif
+
+#endif

diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
new file mode 100644
index 0000000..2d618de
--- /dev/null
+++ b/crypto/armv4cpuid.S

@@ -0,0 +1,154 @@
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.align	5
+.global	_armv7_neon_probe
+.type	_armv7_neon_probe,%function
+_armv7_neon_probe:
+	.word	0xf26ee1fe	@ vorr	q15,q15,q15
+	.word	0xe12fff1e	@ bx	lr
+.size	_armv7_neon_probe,.-_armv7_neon_probe
+
+.global	_armv7_tick
+.type	_armv7_tick,%function
+_armv7_tick:
+	mrc	p15,0,r0,c9,c13,0
+	.word	0xe12fff1e	@ bx	lr
+.size	_armv7_tick,.-_armv7_tick
+
+.global	OPENSSL_atomic_add
+.type	OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd:	ldrex	r2,[r0]
+	add	r3,r2,r1
+	strex	r2,r3,[r0]
+	cmp	r2,#0
+	bne	.Ladd
+	mov	r0,r3
+	.word	0xe12fff1e	@ bx	lr
+#else
+	stmdb	sp!,{r4-r6,lr}
+	ldr	r2,.Lspinlock
+	adr	r3,.Lspinlock
+	mov	r4,r0
+	mov	r5,r1
+	add	r6,r3,r2	@ &spinlock
+	b	.+8
+.Lspin:	bl	sched_yield
+	mov	r0,#-1
+	swp	r0,r0,[r6]
+	cmp	r0,#0
+	bne	.Lspin
+
+	ldr	r2,[r4]
+	add	r2,r2,r5
+	str	r2,[r4]
+	str	r0,[r6]		@ release spinlock
+	ldmia	sp!,{r4-r6,lr}
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+#endif
+.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.global	OPENSSL_cleanse
+.type	OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+	eor	ip,ip,ip
+	cmp	r1,#7
+	subhs	r1,r1,#4
+	bhs	.Lot
+	cmp	r1,#0
+	beq	.Lcleanse_done
+.Little:
+	strb	ip,[r0],#1
+	subs	r1,r1,#1
+	bhi	.Little
+	b	.Lcleanse_done
+
+.Lot:	tst	r0,#3
+	beq	.Laligned
+	strb	ip,[r0],#1
+	sub	r1,r1,#1
+	b	.Lot
+.Laligned:
+	str	ip,[r0],#4
+	subs	r1,r1,#4
+	bhs	.Laligned
+	adds	r1,r1,#4
+	bne	.Little
+.Lcleanse_done:
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
+
+.global	OPENSSL_wipe_cpu
+.type	OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+	ldr	r0,.LOPENSSL_armcap
+	adr	r1,.LOPENSSL_armcap
+	ldr	r0,[r1,r0]
+	eor	r2,r2,r2
+	eor	r3,r3,r3
+	eor	ip,ip,ip
+	tst	r0,#1
+	beq	.Lwipe_done
+	.word	0xf3000150	@ veor    q0, q0, q0
+	.word	0xf3022152	@ veor    q1, q1, q1
+	.word	0xf3044154	@ veor    q2, q2, q2
+	.word	0xf3066156	@ veor    q3, q3, q3
+	.word	0xf34001f0	@ veor    q8, q8, q8
+	.word	0xf34221f2	@ veor    q9, q9, q9
+	.word	0xf34441f4	@ veor    q10, q10, q10
+	.word	0xf34661f6	@ veor    q11, q11, q11
+	.word	0xf34881f8	@ veor    q12, q12, q12
+	.word	0xf34aa1fa	@ veor    q13, q13, q13
+	.word	0xf34cc1fc	@ veor    q14, q14, q14
+	.word	0xf34ee1fe	@ veor    q15, q15, q15
+.Lwipe_done:
+	mov	r0,sp
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global	OPENSSL_instrument_bus
+.type	OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+	eor	r0,r0,r0
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global	OPENSSL_instrument_bus2
+.type	OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+	eor	r0,r0,r0
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.LOPENSSL_armcap
+#if __ARM_ARCH__>=6
+.align	5
+#else
+.Lspinlock:
+.word	atomic_add_spinlock-.Lspinlock
+.align	5
+
+.data
+.align	2
+atomic_add_spinlock:
+.word	0
+#endif
+
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P

diff --git a/crypto/asn1/a_digest.c b/crypto/asn1/a_digest.c
index d00d9e2..cbdeea6 100644
--- a/crypto/asn1/a_digest.c
+++ b/crypto/asn1/a_digest.c

@@ -87,7 +87,8 @@
 	p=str;
 	i2d(data,&p);
 
-	EVP_Digest(str, i, md, len, type, NULL);
+	if (!EVP_Digest(str, i, md, len, type, NULL))
+		return 0;
 	OPENSSL_free(str);
 	return(1);
 	}
@@ -104,7 +105,8 @@
 	i=ASN1_item_i2d(asn,&str, it);
 	if (!str) return(0);
 
-	EVP_Digest(str, i, md, len, type, NULL);
+	if (!EVP_Digest(str, i, md, len, type, NULL))
+		return 0;
 	OPENSSL_free(str);
 	return(1);
 	}

diff --git a/crypto/asn1/a_sign.c b/crypto/asn1/a_sign.c
index ff63bfc..7b4a193 100644
--- a/crypto/asn1/a_sign.c
+++ b/crypto/asn1/a_sign.c

@@ -184,9 +184,9 @@
 	p=buf_in;
 
 	i2d(data,&p);
-	EVP_SignInit_ex(&ctx,type, NULL);
-	EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
-	if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
+	if (!EVP_SignInit_ex(&ctx,type, NULL)
+		|| !EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl)
+		|| !EVP_SignFinal(&ctx,(unsigned char *)buf_out,
 			(unsigned int *)&outl,pkey))
 		{
 		outl=0;
@@ -218,65 +218,100 @@
 	     const EVP_MD *type)
 	{
 	EVP_MD_CTX ctx;
-	unsigned char *buf_in=NULL,*buf_out=NULL;
-	int inl=0,outl=0,outll=0;
-	int signid, paramtype;
-
-	if (type == NULL)
+	EVP_MD_CTX_init(&ctx);
+	if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey))
 		{
-		int def_nid;
-		if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
-			type = EVP_get_digestbynid(def_nid);
+		EVP_MD_CTX_cleanup(&ctx);
+		return 0;
 		}
+	return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx);
+	}
+		
 
-	if (type == NULL)
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+		X509_ALGOR *algor1, X509_ALGOR *algor2,
+	     	ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx)
+	{
+	const EVP_MD *type;
+	EVP_PKEY *pkey;
+	unsigned char *buf_in=NULL,*buf_out=NULL;
+	size_t inl=0,outl=0,outll=0;
+	int signid, paramtype;
+	int rv;
+
+	type = EVP_MD_CTX_md(ctx);
+	pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx);
+
+	if (!type || !pkey)
 		{
-		ASN1err(ASN1_F_ASN1_ITEM_SIGN, ASN1_R_NO_DEFAULT_DIGEST);
+		ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED);
 		return 0;
 		}
 
-	if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
+	if (pkey->ameth->item_sign)
 		{
-		if (!pkey->ameth ||
-			!OBJ_find_sigid_by_algs(&signid, EVP_MD_nid(type),
-						pkey->ameth->pkey_id))
-			{
-			ASN1err(ASN1_F_ASN1_ITEM_SIGN,
-				ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
-			return 0;
-			}
+		rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2,
+						signature);
+		if (rv == 1)
+			outl = signature->length;
+		/* Return value meanings:
+		 * <=0: error.
+		 *   1: method does everything.
+		 *   2: carry on as normal.
+		 *   3: ASN1 method sets algorithm identifiers: just sign.
+		 */
+		if (rv <= 0)
+			ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+		if (rv <= 1)
+			goto err;
 		}
 	else
-		signid = type->pkey_type;
+		rv = 2;
 
-	if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
-		paramtype = V_ASN1_NULL;
-	else
-		paramtype = V_ASN1_UNDEF;
+	if (rv == 2)
+		{
+		if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
+			{
+			if (!pkey->ameth ||
+				!OBJ_find_sigid_by_algs(&signid,
+							EVP_MD_nid(type),
+							pkey->ameth->pkey_id))
+				{
+				ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,
+					ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+				return 0;
+				}
+			}
+		else
+			signid = type->pkey_type;
 
-	if (algor1)
-		X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
-	if (algor2)
-		X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+		if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
+			paramtype = V_ASN1_NULL;
+		else
+			paramtype = V_ASN1_UNDEF;
 
-	EVP_MD_CTX_init(&ctx);
+		if (algor1)
+			X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
+		if (algor2)
+			X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+
+		}
+
 	inl=ASN1_item_i2d(asn,&buf_in, it);
 	outll=outl=EVP_PKEY_size(pkey);
-	buf_out=(unsigned char *)OPENSSL_malloc((unsigned int)outl);
+	buf_out=OPENSSL_malloc((unsigned int)outl);
 	if ((buf_in == NULL) || (buf_out == NULL))
 		{
 		outl=0;
-		ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_MALLOC_FAILURE);
+		ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_MALLOC_FAILURE);
 		goto err;
 		}
 
-	EVP_SignInit_ex(&ctx,type, NULL);
-	EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
-	if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
-			(unsigned int *)&outl,pkey))
+	if (!EVP_DigestSignUpdate(ctx, buf_in, inl)
+		|| !EVP_DigestSignFinal(ctx, buf_out, &outl))
 		{
 		outl=0;
-		ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_EVP_LIB);
+		ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_EVP_LIB);
 		goto err;
 		}
 	if (signature->data != NULL) OPENSSL_free(signature->data);
@@ -289,7 +324,7 @@
 	signature->flags&= ~(ASN1_STRING_FLAG_BITS_LEFT|0x07);
 	signature->flags|=ASN1_STRING_FLAG_BITS_LEFT;
 err:
-	EVP_MD_CTX_cleanup(&ctx);
+	EVP_MD_CTX_cleanup(ctx);
 	if (buf_in != NULL)
 		{ OPENSSL_cleanse((char *)buf_in,(unsigned int)inl); OPENSSL_free(buf_in); }
 	if (buf_out != NULL)

diff --git a/crypto/asn1/a_verify.c b/crypto/asn1/a_verify.c
index cecdb13..432722e 100644
--- a/crypto/asn1/a_verify.c
+++ b/crypto/asn1/a_verify.c

@@ -101,8 +101,13 @@
 	p=buf_in;
 
 	i2d(data,&p);
-	EVP_VerifyInit_ex(&ctx,type, NULL);
-	EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+	if (!EVP_VerifyInit_ex(&ctx,type, NULL)
+		|| !EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl))
+		{
+		ASN1err(ASN1_F_ASN1_VERIFY,ERR_R_EVP_LIB);
+		ret=0;
+		goto err;
+		}
 
 	OPENSSL_cleanse(buf_in,(unsigned int)inl);
 	OPENSSL_free(buf_in);
@@ -126,11 +131,10 @@
 #endif
 
 
-int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signature,
-	     void *asn, EVP_PKEY *pkey)
+int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a,
+		ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey)
 	{
 	EVP_MD_CTX ctx;
-	const EVP_MD *type = NULL;
 	unsigned char *buf_in=NULL;
 	int ret= -1,inl;
 
@@ -144,25 +148,47 @@
 		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
 		goto err;
 		}
-	type=EVP_get_digestbynid(mdnid);
-	if (type == NULL)
+	if (mdnid == NID_undef)
 		{
-		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
-		goto err;
+		if (!pkey->ameth || !pkey->ameth->item_verify)
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+			goto err;
+			}
+		ret = pkey->ameth->item_verify(&ctx, it, asn, a,
+							signature, pkey);
+		/* Return value of 2 means carry on, anything else means we
+		 * exit straight away: either a fatal error of the underlying
+		 * verification routine handles all verification.
+		 */
+		if (ret != 2)
+			goto err;
+		ret = -1;
 		}
-
-	/* Check public key OID matches public key type */
-	if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+	else
 		{
-		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
-		goto err;
-		}
+		const EVP_MD *type;
+		type=EVP_get_digestbynid(mdnid);
+		if (type == NULL)
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+			goto err;
+			}
 
-	if (!EVP_VerifyInit_ex(&ctx,type, NULL))
-		{
-		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
-		ret=0;
-		goto err;
+		/* Check public key OID matches public key type */
+		if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
+			goto err;
+			}
+
+		if (!EVP_DigestVerifyInit(&ctx, NULL, type, NULL, pkey))
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+			ret=0;
+			goto err;
+			}
+
 		}
 
 	inl = ASN1_item_i2d(asn, &buf_in, it);
@@ -173,13 +199,18 @@
 		goto err;
 		}
 
-	EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+	if (!EVP_DigestVerifyUpdate(&ctx,buf_in,inl))
+		{
+		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+		ret=0;
+		goto err;
+		}
 
 	OPENSSL_cleanse(buf_in,(unsigned int)inl);
 	OPENSSL_free(buf_in);
 
-	if (EVP_VerifyFinal(&ctx,(unsigned char *)signature->data,
-			(unsigned int)signature->length,pkey) <= 0)
+	if (EVP_DigestVerifyFinal(&ctx,signature->data,
+			(size_t)signature->length) <= 0)
 		{
 		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
 		ret=0;

diff --git a/crypto/asn1/ameth_lib.c b/crypto/asn1/ameth_lib.c
index 5a581b9..a19e058 100644
--- a/crypto/asn1/ameth_lib.c
+++ b/crypto/asn1/ameth_lib.c

@@ -69,6 +69,7 @@
 extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
 
 /* Keep this sorted in type order !! */
 static const EVP_PKEY_ASN1_METHOD *standard_methods[] = 
@@ -90,7 +91,8 @@
 #ifndef OPENSSL_NO_EC
 	&eckey_asn1_meth,
 #endif
-	&hmac_asn1_meth
+	&hmac_asn1_meth,
+	&cmac_asn1_meth
 	};
 
 typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
@@ -291,6 +293,8 @@
 	if (!ameth)
 		return NULL;
 
+	memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD));
+
 	ameth->pkey_id = id;
 	ameth->pkey_base_id = id;
 	ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
@@ -325,6 +329,9 @@
 	ameth->old_priv_encode = 0;
 	ameth->old_priv_decode = 0;
 
+	ameth->item_verify = 0;
+	ameth->item_sign = 0;
+
 	ameth->pkey_size = 0;
 	ameth->pkey_bits = 0;
 
@@ -376,6 +383,9 @@
 	dst->pkey_free = src->pkey_free;
 	dst->pkey_ctrl = src->pkey_ctrl;
 
+	dst->item_sign = src->item_sign;
+	dst->item_verify = src->item_verify;
+
 	}
 
 void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)

diff --git a/crypto/asn1/asn1.h b/crypto/asn1/asn1.h
index 59540e4..220a0c8 100644
--- a/crypto/asn1/asn1.h
+++ b/crypto/asn1/asn1.h

@@ -235,7 +235,7 @@
  */
 #define ASN1_STRING_FLAG_MSTRING 0x040 
 /* This is the base type that holds just about everything :-) */
-typedef struct asn1_string_st
+struct asn1_string_st
 	{
 	int length;
 	int type;
@@ -245,7 +245,7 @@
 	 * input data has a non-zero 'unused bits' value, it will be
 	 * handled correctly */
 	long flags;
-	} ASN1_STRING;
+	};
 
 /* ASN1_ENCODING structure: this is used to save the received
  * encoding of an ASN1 type. This is useful to get round
@@ -293,7 +293,6 @@
  * see asn1t.h
  */
 typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE;
-typedef struct ASN1_ITEM_st ASN1_ITEM;
 typedef struct ASN1_TLC_st ASN1_TLC;
 /* This is just an opaque pointer */
 typedef struct ASN1_VALUE_st ASN1_VALUE;
@@ -1194,6 +1193,7 @@
 #define ASN1_F_ASN1_ITEM_I2D_FP				 193
 #define ASN1_F_ASN1_ITEM_PACK				 198
 #define ASN1_F_ASN1_ITEM_SIGN				 195
+#define ASN1_F_ASN1_ITEM_SIGN_CTX			 220
 #define ASN1_F_ASN1_ITEM_UNPACK				 199
 #define ASN1_F_ASN1_ITEM_VERIFY				 197
 #define ASN1_F_ASN1_MBSTRING_NCOPY			 122
@@ -1266,6 +1266,7 @@
 #define ASN1_F_PKCS5_PBE2_SET_IV			 167
 #define ASN1_F_PKCS5_PBE_SET				 202
 #define ASN1_F_PKCS5_PBE_SET0_ALGOR			 215
+#define ASN1_F_PKCS5_PBKDF2_SET				 219
 #define ASN1_F_SMIME_READ_ASN1				 212
 #define ASN1_F_SMIME_TEXT				 213
 #define ASN1_F_X509_CINF_NEW				 168
@@ -1291,6 +1292,7 @@
 #define ASN1_R_BOOLEAN_IS_WRONG_LENGTH			 106
 #define ASN1_R_BUFFER_TOO_SMALL				 107
 #define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER		 108
+#define ASN1_R_CONTEXT_NOT_INITIALISED			 217
 #define ASN1_R_DATA_IS_WRONG				 109
 #define ASN1_R_DECODE_ERROR				 110
 #define ASN1_R_DECODING_ERROR				 111

diff --git a/crypto/asn1/asn1_err.c b/crypto/asn1/asn1_err.c
index 6e04d08..1a30bf1 100644
--- a/crypto/asn1/asn1_err.c
+++ b/crypto/asn1/asn1_err.c

@@ -1,6 +1,6 @@
 /* crypto/asn1/asn1_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2009 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -107,6 +107,7 @@
 {ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP),	"ASN1_item_i2d_fp"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_PACK),	"ASN1_item_pack"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN),	"ASN1_item_sign"},
+{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX),	"ASN1_item_sign_ctx"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK),	"ASN1_item_unpack"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY),	"ASN1_item_verify"},
 {ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY),	"ASN1_mbstring_ncopy"},
@@ -179,6 +180,7 @@
 {ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV),	"PKCS5_pbe2_set_iv"},
 {ERR_FUNC(ASN1_F_PKCS5_PBE_SET),	"PKCS5_pbe_set"},
 {ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR),	"PKCS5_pbe_set0_algor"},
+{ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET),	"PKCS5_pbkdf2_set"},
 {ERR_FUNC(ASN1_F_SMIME_READ_ASN1),	"SMIME_read_ASN1"},
 {ERR_FUNC(ASN1_F_SMIME_TEXT),	"SMIME_text"},
 {ERR_FUNC(ASN1_F_X509_CINF_NEW),	"X509_CINF_NEW"},
@@ -207,6 +209,7 @@
 {ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"},
 {ERR_REASON(ASN1_R_BUFFER_TOO_SMALL)     ,"buffer too small"},
 {ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),"cipher has no object identifier"},
+{ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED),"context not initialised"},
 {ERR_REASON(ASN1_R_DATA_IS_WRONG)        ,"data is wrong"},
 {ERR_REASON(ASN1_R_DECODE_ERROR)         ,"decode error"},
 {ERR_REASON(ASN1_R_DECODING_ERROR)       ,"decoding error"},

diff --git a/crypto/asn1/asn1_locl.h b/crypto/asn1/asn1_locl.h
index 5aa65e2..9fcf0d9 100644
--- a/crypto/asn1/asn1_locl.h
+++ b/crypto/asn1/asn1_locl.h

@@ -102,6 +102,10 @@
 	int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
 	int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
 							ASN1_PCTX *pctx);
+	int (*sig_print)(BIO *out,
+			 const X509_ALGOR *sigalg, const ASN1_STRING *sig,
+					 int indent, ASN1_PCTX *pctx);
+
 
 	void (*pkey_free)(EVP_PKEY *pkey);
 	int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
@@ -111,6 +115,13 @@
 	int (*old_priv_decode)(EVP_PKEY *pkey,
 				const unsigned char **pder, int derlen);
 	int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+	/* Custom ASN1 signature verification */
+	int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+				X509_ALGOR *a, ASN1_BIT_STRING *sig,
+				EVP_PKEY *pkey);
+	int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+				X509_ALGOR *alg1, X509_ALGOR *alg2, 
+				ASN1_BIT_STRING *sig);
 
 	} /* EVP_PKEY_ASN1_METHOD */;
 

diff --git a/crypto/asn1/n_pkey.c b/crypto/asn1/n_pkey.c
index e7d0439..e251739 100644
--- a/crypto/asn1/n_pkey.c
+++ b/crypto/asn1/n_pkey.c

@@ -129,6 +129,7 @@
 	unsigned char buf[256],*zz;
 	unsigned char key[EVP_MAX_KEY_LENGTH];
 	EVP_CIPHER_CTX ctx;
+	EVP_CIPHER_CTX_init(&ctx);
 
 	if (a == NULL) return(0);
 
@@ -206,24 +207,28 @@
 	i = strlen((char *)buf);
 	/* If the key is used for SGC the algorithm is modified a little. */
 	if(sgckey) {
-		EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+		if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+			goto err;
 		memcpy(buf + 16, "SGCKEYSALT", 10);
 		i = 26;
 	}
 
-	EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+	if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+		goto err;
 	OPENSSL_cleanse(buf,256);
 
 	/* Encrypt private key in place */
 	zz = enckey->enckey->digest->data;
-	EVP_CIPHER_CTX_init(&ctx);
-	EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL);
-	EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen);
-	EVP_EncryptFinal_ex(&ctx,zz + i,&j);
-	EVP_CIPHER_CTX_cleanup(&ctx);
+	if (!EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL))
+		goto err;
+	if (!EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen))
+		goto err;
+	if (!EVP_EncryptFinal_ex(&ctx,zz + i,&j))
+		goto err;
 
 	ret = i2d_NETSCAPE_ENCRYPTED_PKEY(enckey, pp);
 err:
+	EVP_CIPHER_CTX_cleanup(&ctx);
 	NETSCAPE_ENCRYPTED_PKEY_free(enckey);
 	NETSCAPE_PKEY_free(pkey);
 	return(ret);
@@ -288,6 +293,7 @@
 	const unsigned char *zz;
 	unsigned char key[EVP_MAX_KEY_LENGTH];
 	EVP_CIPHER_CTX ctx;
+	EVP_CIPHER_CTX_init(&ctx);
 
 	i=cb((char *)buf,256,"Enter Private Key password:",0);
 	if (i != 0)
@@ -298,19 +304,22 @@
 
 	i = strlen((char *)buf);
 	if(sgckey){
-		EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+		if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+			goto err;
 		memcpy(buf + 16, "SGCKEYSALT", 10);
 		i = 26;
 	}
 		
-	EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+	if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+		goto err;
 	OPENSSL_cleanse(buf,256);
 
-	EVP_CIPHER_CTX_init(&ctx);
-	EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL);
-	EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length);
-	EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j);
-	EVP_CIPHER_CTX_cleanup(&ctx);
+	if (!EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL))
+		goto err;
+	if (!EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length))
+		goto err;
+	if (!EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j))
+		goto err;
 	os->length=i+j;
 
 	zz=os->data;
@@ -328,6 +337,7 @@
 		goto err;
 		}
 err:
+	EVP_CIPHER_CTX_cleanup(&ctx);
 	NETSCAPE_PKEY_free(pkey);
 	return(ret);
 	}

diff --git a/crypto/asn1/p5_pbev2.c b/crypto/asn1/p5_pbev2.c
index cb49b66..4ea6830 100644
--- a/crypto/asn1/p5_pbev2.c
+++ b/crypto/asn1/p5_pbev2.c

@@ -91,12 +91,10 @@
 				 unsigned char *aiv, int prf_nid)
 {
 	X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL;
-	int alg_nid;
+	int alg_nid, keylen;
 	EVP_CIPHER_CTX ctx;
 	unsigned char iv[EVP_MAX_IV_LENGTH];
-	PBKDF2PARAM *kdf = NULL;
 	PBE2PARAM *pbe2 = NULL;
-	ASN1_OCTET_STRING *osalt = NULL;
 	ASN1_OBJECT *obj;
 
 	alg_nid = EVP_CIPHER_type(cipher);
@@ -127,7 +125,8 @@
 	EVP_CIPHER_CTX_init(&ctx);
 
 	/* Dummy cipherinit to just setup the IV, and PRF */
-	EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0);
+	if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0))
+		goto err;
 	if(EVP_CIPHER_param_to_asn1(&ctx, scheme->parameter) < 0) {
 		ASN1err(ASN1_F_PKCS5_PBE2_SET_IV,
 					ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
@@ -145,55 +144,21 @@
 		}
 	EVP_CIPHER_CTX_cleanup(&ctx);
 
-	if(!(kdf = PBKDF2PARAM_new())) goto merr;
-	if(!(osalt = M_ASN1_OCTET_STRING_new())) goto merr;
-
-	if (!saltlen) saltlen = PKCS5_SALT_LEN;
-	if (!(osalt->data = OPENSSL_malloc (saltlen))) goto merr;
-	osalt->length = saltlen;
-	if (salt) memcpy (osalt->data, salt, saltlen);
-	else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) goto merr;
-
-	if(iter <= 0) iter = PKCS5_DEFAULT_ITER;
-	if(!ASN1_INTEGER_set(kdf->iter, iter)) goto merr;
-
-	/* Now include salt in kdf structure */
-	kdf->salt->value.octet_string = osalt;
-	kdf->salt->type = V_ASN1_OCTET_STRING;
-	osalt = NULL;
-
 	/* If its RC2 then we'd better setup the key length */
 
-	if(alg_nid == NID_rc2_cbc) {
-		if(!(kdf->keylength = M_ASN1_INTEGER_new())) goto merr;
-		if(!ASN1_INTEGER_set (kdf->keylength,
-				 EVP_CIPHER_key_length(cipher))) goto merr;
-	}
+	if(alg_nid == NID_rc2_cbc)
+		keylen = EVP_CIPHER_key_length(cipher);
+	else
+		keylen = -1;
 
-	/* prf can stay NULL if we are using hmacWithSHA1 */
-	if (prf_nid != NID_hmacWithSHA1)
-		{
-		kdf->prf = X509_ALGOR_new();
-		if (!kdf->prf)
-			goto merr;
-		X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
-					V_ASN1_NULL, NULL);
-		}
+	/* Setup keyfunc */
 
-	/* Now setup the PBE2PARAM keyfunc structure */
+	X509_ALGOR_free(pbe2->keyfunc);
 
-	pbe2->keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+	pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
 
-	/* Encode PBKDF2PARAM into parameter of pbe2 */
-
-	if(!(pbe2->keyfunc->parameter = ASN1_TYPE_new())) goto merr;
-
-	if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
-			 &pbe2->keyfunc->parameter->value.sequence)) goto merr;
-	pbe2->keyfunc->parameter->type = V_ASN1_SEQUENCE;
-
-	PBKDF2PARAM_free(kdf);
-	kdf = NULL;
+	if (!pbe2->keyfunc)
+		goto merr;
 
 	/* Now set up top level AlgorithmIdentifier */
 
@@ -219,8 +184,6 @@
 	err:
 	PBE2PARAM_free(pbe2);
 	/* Note 'scheme' is freed as part of pbe2 */
-	M_ASN1_OCTET_STRING_free(osalt);
-	PBKDF2PARAM_free(kdf);
 	X509_ALGOR_free(kalg);
 	X509_ALGOR_free(ret);
 
@@ -233,3 +196,85 @@
 	{
 	return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
 	}
+
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+				int prf_nid, int keylen)
+	{
+	X509_ALGOR *keyfunc = NULL;
+	PBKDF2PARAM *kdf = NULL;
+	ASN1_OCTET_STRING *osalt = NULL;
+
+	if(!(kdf = PBKDF2PARAM_new()))
+		goto merr;
+	if(!(osalt = M_ASN1_OCTET_STRING_new()))
+		goto merr;
+
+	kdf->salt->value.octet_string = osalt;
+	kdf->salt->type = V_ASN1_OCTET_STRING;
+
+	if (!saltlen)
+		saltlen = PKCS5_SALT_LEN;
+	if (!(osalt->data = OPENSSL_malloc (saltlen)))
+		goto merr;
+
+	osalt->length = saltlen;
+
+	if (salt)
+		memcpy (osalt->data, salt, saltlen);
+	else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0)
+		goto merr;
+
+	if(iter <= 0)
+		iter = PKCS5_DEFAULT_ITER;
+
+	if(!ASN1_INTEGER_set(kdf->iter, iter))
+		goto merr;
+
+	/* If have a key len set it up */
+
+	if(keylen > 0) 
+		{
+		if(!(kdf->keylength = M_ASN1_INTEGER_new()))
+			goto merr;
+		if(!ASN1_INTEGER_set (kdf->keylength, keylen))
+			goto merr;
+		}
+
+	/* prf can stay NULL if we are using hmacWithSHA1 */
+	if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1)
+		{
+		kdf->prf = X509_ALGOR_new();
+		if (!kdf->prf)
+			goto merr;
+		X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
+					V_ASN1_NULL, NULL);
+		}
+
+	/* Finally setup the keyfunc structure */
+
+	keyfunc = X509_ALGOR_new();
+	if (!keyfunc)
+		goto merr;
+
+	keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+
+	/* Encode PBKDF2PARAM into parameter of pbe2 */
+
+	if(!(keyfunc->parameter = ASN1_TYPE_new()))
+		goto merr;
+
+	if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
+			 &keyfunc->parameter->value.sequence))
+		goto merr;
+	keyfunc->parameter->type = V_ASN1_SEQUENCE;
+
+	PBKDF2PARAM_free(kdf);
+	return keyfunc;
+
+	merr:
+	ASN1err(ASN1_F_PKCS5_PBKDF2_SET,ERR_R_MALLOC_FAILURE);
+	PBKDF2PARAM_free(kdf);
+	X509_ALGOR_free(keyfunc);
+	return NULL;
+	}
+

diff --git a/crypto/asn1/t_crl.c b/crypto/asn1/t_crl.c
index ee5a687..c611692 100644
--- a/crypto/asn1/t_crl.c
+++ b/crypto/asn1/t_crl.c

@@ -94,8 +94,7 @@
 	l = X509_CRL_get_version(x);
 	BIO_printf(out, "%8sVersion %lu (0x%lx)\n", "", l+1, l);
 	i = OBJ_obj2nid(x->sig_alg->algorithm);
-	BIO_printf(out, "%8sSignature Algorithm: %s\n", "",
-				 (i == NID_undef) ? "NONE" : OBJ_nid2ln(i));
+	X509_signature_print(out, x->sig_alg, NULL);
 	p=X509_NAME_oneline(X509_CRL_get_issuer(x),NULL,0);
 	BIO_printf(out,"%8sIssuer: %s\n","",p);
 	OPENSSL_free(p);

diff --git a/crypto/asn1/t_x509.c b/crypto/asn1/t_x509.c
index 89e7a7f..edbb39a 100644
--- a/crypto/asn1/t_x509.c
+++ b/crypto/asn1/t_x509.c

@@ -72,6 +72,7 @@
 #include <openssl/objects.h>
 #include <openssl/x509.h>
 #include <openssl/x509v3.h>
+#include "asn1_locl.h"
 
 #ifndef OPENSSL_NO_FP_API
 int X509_print_fp(FILE *fp, X509 *x)
@@ -137,7 +138,7 @@
 		if (BIO_write(bp,"        Serial Number:",22) <= 0) goto err;
 
 		bs=X509_get_serialNumber(x);
-		if (bs->length <= 4)
+		if (bs->length <= (int)sizeof(long))
 			{
 			l=ASN1_INTEGER_get(bs);
 			if (bs->type == V_ASN1_NEG_INTEGER)
@@ -167,12 +168,16 @@
 
 	if(!(cflag & X509_FLAG_NO_SIGNAME))
 		{
+		if(X509_signature_print(bp, x->sig_alg, NULL) <= 0)
+			goto err;
+#if 0
 		if (BIO_printf(bp,"%8sSignature Algorithm: ","") <= 0) 
 			goto err;
 		if (i2a_ASN1_OBJECT(bp, ci->signature->algorithm) <= 0)
 			goto err;
 		if (BIO_puts(bp, "\n") <= 0)
 			goto err;
+#endif
 		}
 
 	if(!(cflag & X509_FLAG_NO_ISSUER))
@@ -255,7 +260,8 @@
 		goto err;
 	i2d_X509_NAME(x->cert_info->subject, &dertmp);
 
-	EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL))
+		goto err;
 	for (i=0; i < SHA_DIGEST_LENGTH; i++)
 		{
 		if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) goto err;
@@ -268,8 +274,10 @@
 	if (BIO_printf(bp,"\n        Public key OCSP hash: ") <= 0)
 		goto err;
 
-	EVP_Digest(x->cert_info->key->public_key->data,
-		x->cert_info->key->public_key->length, SHA1md, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest(x->cert_info->key->public_key->data,
+			x->cert_info->key->public_key->length,
+			SHA1md, NULL, EVP_sha1(), NULL))
+		goto err;
 	for (i=0; i < SHA_DIGEST_LENGTH; i++)
 		{
 		if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0)
@@ -283,23 +291,50 @@
 	return(0);
 	}
 
-int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+int X509_signature_dump(BIO *bp, const ASN1_STRING *sig, int indent)
 {
-	unsigned char *s;
+	const unsigned char *s;
 	int i, n;
-	if (BIO_puts(bp,"    Signature Algorithm: ") <= 0) return 0;
-	if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
 
 	n=sig->length;
 	s=sig->data;
 	for (i=0; i<n; i++)
 		{
 		if ((i%18) == 0)
-			if (BIO_write(bp,"\n        ",9) <= 0) return 0;
+			{
+			if (BIO_write(bp,"\n",1) <= 0) return 0;
+			if (BIO_indent(bp, indent, indent) <= 0) return 0;
+			}
 			if (BIO_printf(bp,"%02x%s",s[i],
 				((i+1) == n)?"":":") <= 0) return 0;
 		}
 	if (BIO_write(bp,"\n",1) != 1) return 0;
+
+	return 1;
+}
+
+int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+{
+	int sig_nid;
+	if (BIO_puts(bp,"    Signature Algorithm: ") <= 0) return 0;
+	if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
+
+	sig_nid = OBJ_obj2nid(sigalg->algorithm);
+	if (sig_nid != NID_undef)
+		{
+		int pkey_nid, dig_nid;
+		const EVP_PKEY_ASN1_METHOD *ameth;
+		if (OBJ_find_sigid_algs(sig_nid, &dig_nid, &pkey_nid))
+			{
+			ameth = EVP_PKEY_asn1_find(NULL, pkey_nid);
+			if (ameth && ameth->sig_print)
+				return ameth->sig_print(bp, sigalg, sig, 9, 0);
+			}
+		}
+	if (sig)
+		return X509_signature_dump(bp, sig, 9);
+	else if (BIO_puts(bp, "\n") <= 0)
+		return 0;
 	return 1;
 }
 

diff --git a/crypto/asn1/x_algor.c b/crypto/asn1/x_algor.c
index 99e5342..274e456 100644
--- a/crypto/asn1/x_algor.c
+++ b/crypto/asn1/x_algor.c

@@ -128,3 +128,17 @@
 		}
 	}
 
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md)
+	{
+	int param_type;
+
+	if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+		param_type = V_ASN1_UNDEF;
+	else
+		param_type = V_ASN1_NULL;
+
+	X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+	}

diff --git a/crypto/bf/bf_skey.c b/crypto/bf/bf_skey.c
index 3673cde..3b0bca4 100644
--- a/crypto/bf/bf_skey.c
+++ b/crypto/bf/bf_skey.c

@@ -58,11 +58,19 @@
 
 #include <stdio.h>
 #include <string.h>
+#include <openssl/crypto.h>
 #include <openssl/blowfish.h>
 #include "bf_locl.h"
 #include "bf_pi.h"
 
 void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(BLOWFISH);
+	private_BF_set_key(key, len, data);
+	}
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#endif
 	{
 	int i;
 	BF_LONG *p,ri,in[2];

diff --git a/crypto/bf/blowfish.h b/crypto/bf/blowfish.h
index b97e76f..4b6c892 100644
--- a/crypto/bf/blowfish.h
+++ b/crypto/bf/blowfish.h

@@ -104,7 +104,9 @@
 	BF_LONG S[4*256];
 	} BF_KEY;
 
- 
+#ifdef OPENSSL_FIPS 
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data);
+#endif
 void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
 
 void BF_encrypt(BF_LONG *data,const BF_KEY *key);

diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h
index 3c39d18..05699ab 100644
--- a/crypto/bio/bio.h
+++ b/crypto/bio/bio.h

@@ -68,6 +68,14 @@
 
 #include <openssl/crypto.h>
 
+#ifndef OPENSSL_NO_SCTP
+# ifndef OPENSSL_SYS_VMS
+# include <stdint.h>
+# else
+# include <inttypes.h>
+# endif
+#endif
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -95,6 +103,9 @@
 #define BIO_TYPE_BIO		(19|0x0400)		/* (half a) BIO pair */
 #define BIO_TYPE_LINEBUFFER	(20|0x0200)		/* filter */
 #define BIO_TYPE_DGRAM		(21|0x0400|0x0100)
+#ifndef OPENSSL_NO_SCTP
+#define BIO_TYPE_DGRAM_SCTP	(24|0x0400|0x0100)
+#endif
 #define BIO_TYPE_ASN1 		(22|0x0200)		/* filter */
 #define BIO_TYPE_COMP 		(23|0x0200)		/* filter */
 
@@ -162,7 +173,22 @@
 #define BIO_CTRL_DGRAM_SET_PEER           44 /* Destination for the data */
 
 #define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT   45 /* Next DTLS handshake timeout to
-											  * adjust socket timeouts */
+                                              * adjust socket timeouts */
+
+#ifndef OPENSSL_NO_SCTP
+/* SCTP stuff */
+#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE	50
+#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY		51
+#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY		52
+#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD		53
+#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO		60
+#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO		61
+#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO		62
+#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO		63
+#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO			64
+#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO			65
+#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN		70
+#endif
 
 /* modifiers */
 #define BIO_FP_READ		0x02
@@ -332,6 +358,34 @@
 /* Prefix and suffix callback in ASN1 BIO */
 typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg);
 
+#ifndef OPENSSL_NO_SCTP
+/* SCTP parameter structs */
+struct bio_dgram_sctp_sndinfo
+	{
+	uint16_t snd_sid;
+	uint16_t snd_flags;
+	uint32_t snd_ppid;
+	uint32_t snd_context;
+	};
+
+struct bio_dgram_sctp_rcvinfo
+	{
+	uint16_t rcv_sid;
+	uint16_t rcv_ssn;
+	uint16_t rcv_flags;
+	uint32_t rcv_ppid;
+	uint32_t rcv_tsn;
+	uint32_t rcv_cumtsn;
+	uint32_t rcv_context;
+	};
+
+struct bio_dgram_sctp_prinfo
+	{
+	uint16_t pr_policy;
+	uint32_t pr_value;
+	};
+#endif
+
 /* connect BIO stuff */
 #define BIO_CONN_S_BEFORE		1
 #define BIO_CONN_S_GET_IP		2
@@ -629,6 +683,9 @@
 BIO_METHOD *BIO_f_nbio_test(void);
 #ifndef OPENSSL_NO_DGRAM
 BIO_METHOD *BIO_s_datagram(void);
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void);
+#endif
 #endif
 
 /* BIO_METHOD *BIO_f_ber(void); */
@@ -671,6 +728,15 @@
 
 BIO *BIO_new_socket(int sock, int close_flag);
 BIO *BIO_new_dgram(int fd, int close_flag);
+#ifndef OPENSSL_NO_SCTP
+BIO *BIO_new_dgram_sctp(int fd, int close_flag);
+int BIO_dgram_is_sctp(BIO *bio);
+int BIO_dgram_sctp_notification_cb(BIO *b,
+                                   void (*handle_notifications)(BIO *bio, void *context, void *buf),
+                                   void *context);
+int BIO_dgram_sctp_wait_for_dry(BIO *b);
+int BIO_dgram_sctp_msg_waiting(BIO *b);
+#endif
 BIO *BIO_new_fd(int fd, int close_flag);
 BIO *BIO_new_connect(char *host_port);
 BIO *BIO_new_accept(char *host_port);
@@ -735,6 +801,7 @@
 #define BIO_F_BUFFER_CTRL				 114
 #define BIO_F_CONN_CTRL					 127
 #define BIO_F_CONN_STATE				 115
+#define BIO_F_DGRAM_SCTP_READ				 132
 #define BIO_F_FILE_CTRL					 116
 #define BIO_F_FILE_READ					 130
 #define BIO_F_LINEBUFFER_CTRL				 129

diff --git a/crypto/bio/bio_err.c b/crypto/bio/bio_err.c
index a224edd..0dbfbd8 100644
--- a/crypto/bio/bio_err.c
+++ b/crypto/bio/bio_err.c

@@ -1,6 +1,6 @@
 /* crypto/bio/bio_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -95,6 +95,7 @@
 {ERR_FUNC(BIO_F_BUFFER_CTRL),	"BUFFER_CTRL"},
 {ERR_FUNC(BIO_F_CONN_CTRL),	"CONN_CTRL"},
 {ERR_FUNC(BIO_F_CONN_STATE),	"CONN_STATE"},
+{ERR_FUNC(BIO_F_DGRAM_SCTP_READ),	"DGRAM_SCTP_READ"},
 {ERR_FUNC(BIO_F_FILE_CTRL),	"FILE_CTRL"},
 {ERR_FUNC(BIO_F_FILE_READ),	"FILE_READ"},
 {ERR_FUNC(BIO_F_LINEBUFFER_CTRL),	"LINEBUFFER_CTRL"},

diff --git a/crypto/bio/bss_bio.c b/crypto/bio/bss_bio.c
index 76bd48e..52ef0eb 100644
--- a/crypto/bio/bss_bio.c
+++ b/crypto/bio/bss_bio.c

@@ -277,10 +277,10 @@
  */
 /* WARNING: The non-copying interface is largely untested as of yet
  * and may contain bugs. */
-static ssize_t bio_nread0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nread0(BIO *bio, char **buf)
 	{
 	struct bio_bio_st *b, *peer_b;
-	ssize_t num;
+	ossl_ssize_t num;
 	
 	BIO_clear_retry_flags(bio);
 
@@ -315,15 +315,15 @@
 	return num;
 	}
 
-static ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
 	{
 	struct bio_bio_st *b, *peer_b;
-	ssize_t num, available;
+	ossl_ssize_t num, available;
 
 	if (num_ > SSIZE_MAX)
 		num = SSIZE_MAX;
 	else
-		num = (ssize_t)num_;
+		num = (ossl_ssize_t)num_;
 
 	available = bio_nread0(bio, buf);
 	if (num > available)
@@ -428,7 +428,7 @@
  * (example usage:  bio_nwrite0(), write to buffer, bio_nwrite()
  *  or just         bio_nwrite(), write to buffer)
  */
-static ssize_t bio_nwrite0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf)
 	{
 	struct bio_bio_st *b;
 	size_t num;
@@ -476,15 +476,15 @@
 	return num;
 	}
 
-static ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
 	{
 	struct bio_bio_st *b;
-	ssize_t num, space;
+	ossl_ssize_t num, space;
 
 	if (num_ > SSIZE_MAX)
 		num = SSIZE_MAX;
 	else
-		num = (ssize_t)num_;
+		num = (ossl_ssize_t)num_;
 
 	space = bio_nwrite0(bio, buf);
 	if (num > space)

diff --git a/crypto/bio/bss_dgram.c b/crypto/bio/bss_dgram.c
index ad9c372..1b1e4be 100644
--- a/crypto/bio/bss_dgram.c
+++ b/crypto/bio/bss_dgram.c

@@ -70,6 +70,13 @@
 #include <sys/timeb.h>
 #endif
 
+#ifndef OPENSSL_NO_SCTP
+#include <netinet/sctp.h>
+#include <fcntl.h>
+#define OPENSSL_SCTP_DATA_CHUNK_TYPE            0x00
+#define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0
+#endif
+
 #ifdef OPENSSL_SYS_LINUX
 #define IP_MTU      14 /* linux is lame */
 #endif
@@ -88,6 +95,18 @@
 static int dgram_free(BIO *data);
 static int dgram_clear(BIO *bio);
 
+#ifndef OPENSSL_NO_SCTP
+static int dgram_sctp_write(BIO *h, const char *buf, int num);
+static int dgram_sctp_read(BIO *h, char *buf, int size);
+static int dgram_sctp_puts(BIO *h, const char *str);
+static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_sctp_new(BIO *h);
+static int dgram_sctp_free(BIO *data);
+#ifdef SCTP_AUTHENTICATION_EVENT
+static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp);
+#endif
+#endif
+
 static int BIO_dgram_should_retry(int s);
 
 static void get_current_time(struct timeval *t);
@@ -106,6 +125,22 @@
 	NULL,
 	};
 
+#ifndef OPENSSL_NO_SCTP
+static BIO_METHOD methods_dgramp_sctp=
+	{
+	BIO_TYPE_DGRAM_SCTP,
+	"datagram sctp socket",
+	dgram_sctp_write,
+	dgram_sctp_read,
+	dgram_sctp_puts,
+	NULL, /* dgram_gets, */
+	dgram_sctp_ctrl,
+	dgram_sctp_new,
+	dgram_sctp_free,
+	NULL,
+	};
+#endif
+
 typedef struct bio_dgram_data_st
 	{
 	union {
@@ -122,6 +157,40 @@
 	struct timeval socket_timeout;
 	} bio_dgram_data;
 
+#ifndef OPENSSL_NO_SCTP
+typedef struct bio_dgram_sctp_save_message_st
+	{
+        BIO *bio;
+        char *data;
+        int length;
+	} bio_dgram_sctp_save_message;
+
+typedef struct bio_dgram_sctp_data_st
+	{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in sa_in;
+#if OPENSSL_USE_IPV6
+		struct sockaddr_in6 sa_in6;
+#endif
+	} peer;
+	unsigned int connected;
+	unsigned int _errno;
+	unsigned int mtu;
+	struct bio_dgram_sctp_sndinfo sndinfo;
+	struct bio_dgram_sctp_rcvinfo rcvinfo;
+	struct bio_dgram_sctp_prinfo prinfo;
+	void (*handle_notifications)(BIO *bio, void *context, void *buf);
+	void* notification_context;
+	int in_handshake;
+	int ccs_rcvd;
+	int ccs_sent;
+	int save_shutdown;
+	int peer_auth_tested;
+	bio_dgram_sctp_save_message saved_message;
+	} bio_dgram_sctp_data;
+#endif
+
 BIO_METHOD *BIO_s_datagram(void)
 	{
 	return(&methods_dgramp);
@@ -759,6 +828,912 @@
 	return(ret);
 	}
 
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void)
+	{
+	return(&methods_dgramp_sctp);
+	}
+
+BIO *BIO_new_dgram_sctp(int fd, int close_flag)
+	{
+	BIO *bio;
+	int ret, optval = 20000;
+	int auth_data = 0, auth_forward = 0;
+	unsigned char *p;
+	struct sctp_authchunk auth;
+	struct sctp_authchunks *authchunks;
+	socklen_t sockopt_len;
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+	struct sctp_event event;
+#else
+	struct sctp_event_subscribe event;
+#endif
+#endif
+
+	bio=BIO_new(BIO_s_datagram_sctp());
+	if (bio == NULL) return(NULL);
+	BIO_set_fd(bio,fd,close_flag);
+
+	/* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */
+	auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE;
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+	OPENSSL_assert(ret >= 0);
+	auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE;
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+	OPENSSL_assert(ret >= 0);
+
+	/* Test if activation was successful. When using accept(),
+	 * SCTP-AUTH has to be activated for the listening socket
+	 * already, otherwise the connected socket won't use it. */
+	sockopt_len = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+	authchunks = OPENSSL_malloc(sockopt_len);
+	memset(authchunks, 0, sizeof(sockopt_len));
+	ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks, &sockopt_len);
+	OPENSSL_assert(ret >= 0);
+	
+	for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+	     p < (unsigned char*) authchunks + sockopt_len;
+	     p += sizeof(uint8_t))
+		{
+		if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+		if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+		}
+		
+	OPENSSL_free(authchunks);
+
+	OPENSSL_assert(auth_data);
+	OPENSSL_assert(auth_forward);
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+	memset(&event, 0, sizeof(struct sctp_event));
+	event.se_assoc_id = 0;
+	event.se_type = SCTP_AUTHENTICATION_EVENT;
+	event.se_on = 1;
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+	OPENSSL_assert(ret >= 0);
+#else
+	sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe);
+	ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len);
+	OPENSSL_assert(ret >= 0);
+
+	event.sctp_authentication_event = 1;
+
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+	OPENSSL_assert(ret >= 0);
+#endif
+#endif
+
+	/* Disable partial delivery by setting the min size
+	 * larger than the max record size of 2^14 + 2048 + 13
+	 */
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval, sizeof(optval));
+	OPENSSL_assert(ret >= 0);
+
+	return(bio);
+	}
+
+int BIO_dgram_is_sctp(BIO *bio)
+	{
+	return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP);
+	}
+
+static int dgram_sctp_new(BIO *bi)
+	{
+	bio_dgram_sctp_data *data = NULL;
+
+	bi->init=0;
+	bi->num=0;
+	data = OPENSSL_malloc(sizeof(bio_dgram_sctp_data));
+	if (data == NULL)
+		return 0;
+	memset(data, 0x00, sizeof(bio_dgram_sctp_data));
+#ifdef SCTP_PR_SCTP_NONE
+	data->prinfo.pr_policy = SCTP_PR_SCTP_NONE;
+#endif
+    bi->ptr = data;
+
+	bi->flags=0;
+	return(1);
+	}
+
+static int dgram_sctp_free(BIO *a)
+	{
+	bio_dgram_sctp_data *data;
+
+	if (a == NULL) return(0);
+	if ( ! dgram_clear(a))
+		return 0;
+
+	data = (bio_dgram_sctp_data *)a->ptr;
+	if(data != NULL) OPENSSL_free(data);
+
+	return(1);
+	}
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp)
+	{
+	unsigned int sockopt_len = 0;
+	int ret;
+	struct sctp_authkey_event* authkeyevent = &snp->sn_auth_event;
+
+	if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY)
+		{
+		struct sctp_authkeyid authkeyid;
+
+		/* delete key */
+		authkeyid.scact_keynumber = authkeyevent->auth_keynumber;
+		sockopt_len = sizeof(struct sctp_authkeyid);
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+		      &authkeyid, sockopt_len);
+		}
+	}
+#endif
+
+static int dgram_sctp_read(BIO *b, char *out, int outl)
+	{
+	int ret = 0, n = 0, i, optval;
+	socklen_t optlen;
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+	union sctp_notification *snp;
+	struct msghdr msg;
+	struct iovec iov;
+	struct cmsghdr *cmsg;
+	char cmsgbuf[512];
+
+	if (out != NULL)
+		{
+		clear_socket_error();
+
+		do
+			{
+			memset(&data->rcvinfo, 0x00, sizeof(struct bio_dgram_sctp_rcvinfo));
+			iov.iov_base = out;
+			iov.iov_len = outl;
+			msg.msg_name = NULL;
+			msg.msg_namelen = 0;
+			msg.msg_iov = &iov;
+			msg.msg_iovlen = 1;
+			msg.msg_control = cmsgbuf;
+			msg.msg_controllen = 512;
+			msg.msg_flags = 0;
+			n = recvmsg(b->num, &msg, 0);
+
+			if (msg.msg_controllen > 0)
+				{
+				for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+					{
+					if (cmsg->cmsg_level != IPPROTO_SCTP)
+						continue;
+#ifdef SCTP_RCVINFO
+					if (cmsg->cmsg_type == SCTP_RCVINFO)
+						{
+						struct sctp_rcvinfo *rcvinfo;
+
+						rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg);
+						data->rcvinfo.rcv_sid = rcvinfo->rcv_sid;
+						data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn;
+						data->rcvinfo.rcv_flags = rcvinfo->rcv_flags;
+						data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid;
+						data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn;
+						data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn;
+						data->rcvinfo.rcv_context = rcvinfo->rcv_context;
+						}
+#endif
+#ifdef SCTP_SNDRCV
+					if (cmsg->cmsg_type == SCTP_SNDRCV)
+						{
+						struct sctp_sndrcvinfo *sndrcvinfo;
+
+						sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+						data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream;
+						data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn;
+						data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags;
+						data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid;
+						data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn;
+						data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn;
+						data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context;
+						}
+#endif
+					}
+				}
+
+			if (n <= 0)
+				{
+				if (n < 0)
+					ret = n;
+				break;
+				}
+
+			if (msg.msg_flags & MSG_NOTIFICATION)
+				{
+				snp = (union sctp_notification*) out;
+				if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+					{
+#ifdef SCTP_EVENT
+					struct sctp_event event;
+#else
+					struct sctp_event_subscribe event;
+					socklen_t eventsize;
+#endif
+					/* If a message has been delayed until the socket
+					 * is dry, it can be sent now.
+					 */
+					if (data->saved_message.length > 0)
+						{
+						dgram_sctp_write(data->saved_message.bio, data->saved_message.data,
+						                 data->saved_message.length);
+						OPENSSL_free(data->saved_message.data);
+						data->saved_message.length = 0;
+						}
+
+					/* disable sender dry event */
+#ifdef SCTP_EVENT
+					memset(&event, 0, sizeof(struct sctp_event));
+					event.se_assoc_id = 0;
+					event.se_type = SCTP_SENDER_DRY_EVENT;
+					event.se_on = 0;
+					i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+					OPENSSL_assert(i >= 0);
+#else
+					eventsize = sizeof(struct sctp_event_subscribe);
+					i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+					OPENSSL_assert(i >= 0);
+
+					event.sctp_sender_dry_event = 0;
+
+					i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+					OPENSSL_assert(i >= 0);
+#endif
+					}
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+				if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+					dgram_sctp_handle_auth_free_key_event(b, snp);
+#endif
+
+				if (data->handle_notifications != NULL)
+					data->handle_notifications(b, data->notification_context, (void*) out);
+
+				memset(out, 0, outl);
+				}
+			else
+				ret += n;
+			}
+		while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR) && (ret < outl));
+
+		if (ret > 0 && !(msg.msg_flags & MSG_EOR))
+			{
+			/* Partial message read, this should never happen! */
+
+			/* The buffer was too small, this means the peer sent
+			 * a message that was larger than allowed. */
+			if (ret == outl)
+				return -1;
+
+			/* Test if socket buffer can handle max record
+			 * size (2^14 + 2048 + 13)
+			 */
+			optlen = (socklen_t) sizeof(int);
+			ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen);
+			OPENSSL_assert(ret >= 0);
+			OPENSSL_assert(optval >= 18445);
+
+			/* Test if SCTP doesn't partially deliver below
+			 * max record size (2^14 + 2048 + 13)
+			 */
+			optlen = (socklen_t) sizeof(int);
+			ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+			                 &optval, &optlen);
+			OPENSSL_assert(ret >= 0);
+			OPENSSL_assert(optval >= 18445);
+
+			/* Partially delivered notification??? Probably a bug.... */
+			OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION));
+
+			/* Everything seems ok till now, so it's most likely
+			 * a message dropped by PR-SCTP.
+			 */
+			memset(out, 0, outl);
+			BIO_set_retry_read(b);
+			return -1;
+			}
+
+		BIO_clear_retry_flags(b);
+		if (ret < 0)
+			{
+			if (BIO_dgram_should_retry(ret))
+				{
+				BIO_set_retry_read(b);
+				data->_errno = get_last_socket_error();
+				}
+			}
+
+		/* Test if peer uses SCTP-AUTH before continuing */
+		if (!data->peer_auth_tested)
+			{
+			int ii, auth_data = 0, auth_forward = 0;
+			unsigned char *p;
+			struct sctp_authchunks *authchunks;
+
+			optlen = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+			authchunks = OPENSSL_malloc(optlen);
+			memset(authchunks, 0, sizeof(optlen));
+			ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS, authchunks, &optlen);
+			OPENSSL_assert(ii >= 0);
+
+			for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+				 p < (unsigned char*) authchunks + optlen;
+				 p += sizeof(uint8_t))
+				{
+				if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+				if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+				}
+
+			OPENSSL_free(authchunks);
+
+			if (!auth_data || !auth_forward)
+				{
+				BIOerr(BIO_F_DGRAM_SCTP_READ,BIO_R_CONNECT_ERROR);
+				return -1;
+				}
+
+			data->peer_auth_tested = 1;
+			}
+		}
+	return(ret);
+	}
+
+static int dgram_sctp_write(BIO *b, const char *in, int inl)
+	{
+	int ret;
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+	struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo);
+	struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo);
+	struct bio_dgram_sctp_sndinfo handshake_sinfo;
+	struct iovec iov[1];
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+	char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) + CMSG_SPACE(sizeof(struct sctp_prinfo))];
+	struct sctp_sndinfo *sndinfo;
+	struct sctp_prinfo *prinfo;
+#else
+	char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct sctp_sndrcvinfo *sndrcvinfo;
+#endif
+
+	clear_socket_error();
+
+	/* If we're send anything else than application data,
+	 * disable all user parameters and flags.
+	 */
+	if (in[0] != 23) {
+		memset(&handshake_sinfo, 0x00, sizeof(struct bio_dgram_sctp_sndinfo));
+#ifdef SCTP_SACK_IMMEDIATELY
+		handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY;
+#endif
+		sinfo = &handshake_sinfo;
+	}
+
+	/* If we have to send a shutdown alert message and the
+	 * socket is not dry yet, we have to save it and send it
+	 * as soon as the socket gets dry.
+	 */
+	if (data->save_shutdown && !BIO_dgram_sctp_wait_for_dry(b))
+	{
+		data->saved_message.bio = b;
+		data->saved_message.length = inl;
+		data->saved_message.data = OPENSSL_malloc(inl);
+		memcpy(data->saved_message.data, in, inl);
+		return inl;
+	}
+
+	iov[0].iov_base = (char *)in;
+	iov[0].iov_len = inl;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = (caddr_t)cmsgbuf;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+	cmsg = (struct cmsghdr *)cmsgbuf;
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDINFO;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+	sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+	memset(sndinfo, 0, sizeof(struct sctp_sndinfo));
+	sndinfo->snd_sid = sinfo->snd_sid;
+	sndinfo->snd_flags = sinfo->snd_flags;
+	sndinfo->snd_ppid = sinfo->snd_ppid;
+	sndinfo->snd_context = sinfo->snd_context;
+	msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo));
+
+	cmsg = (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_PRINFO;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo));
+	prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg);
+	memset(prinfo, 0, sizeof(struct sctp_prinfo));
+	prinfo->pr_policy = pinfo->pr_policy;
+	prinfo->pr_value = pinfo->pr_value;
+	msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo));
+#else
+	cmsg = (struct cmsghdr *)cmsgbuf;
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo));
+	sndrcvinfo->sinfo_stream = sinfo->snd_sid;
+	sndrcvinfo->sinfo_flags = sinfo->snd_flags;
+#ifdef __FreeBSD__
+	sndrcvinfo->sinfo_flags |= pinfo->pr_policy;
+#endif
+	sndrcvinfo->sinfo_ppid = sinfo->snd_ppid;
+	sndrcvinfo->sinfo_context = sinfo->snd_context;
+	sndrcvinfo->sinfo_timetolive = pinfo->pr_value;
+	msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo));
+#endif
+
+	ret = sendmsg(b->num, &msg, 0);
+
+	BIO_clear_retry_flags(b);
+	if (ret <= 0)
+		{
+		if (BIO_dgram_should_retry(ret))
+			{
+			BIO_set_retry_write(b);  
+			data->_errno = get_last_socket_error();
+			}
+		}
+	return(ret);
+	}
+
+static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr)
+	{
+	long ret=1;
+	bio_dgram_sctp_data *data = NULL;
+	unsigned int sockopt_len = 0;
+	struct sctp_authkeyid authkeyid;
+	struct sctp_authkey *authkey;
+
+	data = (bio_dgram_sctp_data *)b->ptr;
+
+	switch (cmd)
+		{
+	case BIO_CTRL_DGRAM_QUERY_MTU:
+		/* Set to maximum (2^14)
+		 * and ignore user input to enable transport
+		 * protocol fragmentation.
+		 * Returns always 2^14.
+		 */
+		data->mtu = 16384;
+		ret = data->mtu;
+		break;
+	case BIO_CTRL_DGRAM_SET_MTU:
+		/* Set to maximum (2^14)
+		 * and ignore input to enable transport
+		 * protocol fragmentation.
+		 * Returns always 2^14.
+		 */
+		data->mtu = 16384;
+		ret = data->mtu;
+		break;
+	case BIO_CTRL_DGRAM_SET_CONNECTED:
+	case BIO_CTRL_DGRAM_CONNECT:
+		/* Returns always -1. */
+		ret = -1;
+		break;
+	case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+		/* SCTP doesn't need the DTLS timer
+		 * Returns always 1.
+		 */
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+		if (num > 0)
+			data->in_handshake = 1;
+		else
+			data->in_handshake = 0;
+
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY, &data->in_handshake, sizeof(int));
+		break;
+	case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY:
+		/* New shared key for SCTP AUTH.
+		 * Returns 0 on success, -1 otherwise.
+		 */
+
+		/* Get active key */
+		sockopt_len = sizeof(struct sctp_authkeyid);
+		ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+		if (ret < 0) break;
+
+		/* Add new key */
+		sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t);
+		authkey = OPENSSL_malloc(sockopt_len);
+		memset(authkey, 0x00, sockopt_len);
+		authkey->sca_keynumber = authkeyid.scact_keynumber + 1;
+#ifndef __FreeBSD__
+		/* This field is missing in FreeBSD 8.2 and earlier,
+		 * and FreeBSD 8.3 and higher work without it.
+		 */
+		authkey->sca_keylength = 64;
+#endif
+		memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t));
+
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len);
+		if (ret < 0) break;
+
+		/* Reset active key */
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+		      &authkeyid, sizeof(struct sctp_authkeyid));
+		if (ret < 0) break;
+
+		break;
+	case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY:
+		/* Returns 0 on success, -1 otherwise. */
+
+		/* Get active key */
+		sockopt_len = sizeof(struct sctp_authkeyid);
+		ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+		if (ret < 0) break;
+
+		/* Set active key */
+		authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1;
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+		      &authkeyid, sizeof(struct sctp_authkeyid));
+		if (ret < 0) break;
+
+		/* CCS has been sent, so remember that and fall through
+		 * to check if we need to deactivate an old key
+		 */
+		data->ccs_sent = 1;
+
+	case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD:
+		/* Returns 0 on success, -1 otherwise. */
+
+		/* Has this command really been called or is this just a fall-through? */
+		if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD)
+			data->ccs_rcvd = 1;
+
+		/* CSS has been both, received and sent, so deactivate an old key */
+		if (data->ccs_rcvd == 1 && data->ccs_sent == 1)
+			{
+			/* Get active key */
+			sockopt_len = sizeof(struct sctp_authkeyid);
+			ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+			if (ret < 0) break;
+
+			/* Deactivate key or delete second last key if
+			 * SCTP_AUTHENTICATION_EVENT is not available.
+			 */
+			authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+#ifdef SCTP_AUTH_DEACTIVATE_KEY
+			sockopt_len = sizeof(struct sctp_authkeyid);
+			ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
+			      &authkeyid, sockopt_len);
+			if (ret < 0) break;
+#endif
+#ifndef SCTP_AUTHENTICATION_EVENT
+			if (authkeyid.scact_keynumber > 0)
+				{
+				authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+				ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+					  &authkeyid, sizeof(struct sctp_authkeyid));
+				if (ret < 0) break;
+				}
+#endif
+
+			data->ccs_rcvd = 0;
+			data->ccs_sent = 0;
+			}
+		break;
+	case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+			num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+		memcpy(ptr, &(data->sndinfo), num);
+		ret = num;
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+			num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+		memcpy(&(data->sndinfo), ptr, num);
+		break;
+	case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+			num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+		memcpy(ptr, &data->rcvinfo, num);
+
+		ret = num;
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+			num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+		memcpy(&(data->rcvinfo), ptr, num);
+		break;
+	case BIO_CTRL_DGRAM_SCTP_GET_PRINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+			num = sizeof(struct bio_dgram_sctp_prinfo);
+
+		memcpy(ptr, &(data->prinfo), num);
+		ret = num;
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_PRINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+			num = sizeof(struct bio_dgram_sctp_prinfo);
+
+		memcpy(&(data->prinfo), ptr, num);
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN:
+		/* Returns always 1. */
+		if (num > 0)
+			data->save_shutdown = 1;
+		else
+			data->save_shutdown = 0;
+		break;
+
+	default:
+		/* Pass to default ctrl function to
+		 * process SCTP unspecific commands
+		 */
+		ret=dgram_ctrl(b, cmd, num, ptr);
+		break;
+		}
+	return(ret);
+	}
+
+int BIO_dgram_sctp_notification_cb(BIO *b,
+                                   void (*handle_notifications)(BIO *bio, void *context, void *buf),
+                                   void *context)
+	{
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+	if (handle_notifications != NULL)
+		{
+		data->handle_notifications = handle_notifications;
+		data->notification_context = context;
+		}
+	else
+		return -1;
+
+	return 0;
+	}
+
+int BIO_dgram_sctp_wait_for_dry(BIO *b)
+{
+	int is_dry = 0;
+	int n, sockflags, ret;
+	union sctp_notification snp;
+	struct msghdr msg;
+	struct iovec iov;
+#ifdef SCTP_EVENT
+	struct sctp_event event;
+#else
+	struct sctp_event_subscribe event;
+	socklen_t eventsize;
+#endif
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+	/* set sender dry event */
+#ifdef SCTP_EVENT
+	memset(&event, 0, sizeof(struct sctp_event));
+	event.se_assoc_id = 0;
+	event.se_type = SCTP_SENDER_DRY_EVENT;
+	event.se_on = 1;
+	ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+	eventsize = sizeof(struct sctp_event_subscribe);
+	ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+	if (ret < 0)
+		return -1;
+	
+	event.sctp_sender_dry_event = 1;
+	
+	ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+	if (ret < 0)
+		return -1;
+
+	/* peek for notification */
+	memset(&snp, 0x00, sizeof(union sctp_notification));
+	iov.iov_base = (char *)&snp;
+	iov.iov_len = sizeof(union sctp_notification);
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+
+	n = recvmsg(b->num, &msg, MSG_PEEK);
+	if (n <= 0)
+		{
+		if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+			return -1;
+		else
+			return 0;
+		}
+
+	/* if we find a notification, process it and try again if necessary */
+	while (msg.msg_flags & MSG_NOTIFICATION)
+		{
+		memset(&snp, 0x00, sizeof(union sctp_notification));
+		iov.iov_base = (char *)&snp;
+		iov.iov_len = sizeof(union sctp_notification);
+		msg.msg_name = NULL;
+		msg.msg_namelen = 0;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = 0;
+
+		n = recvmsg(b->num, &msg, 0);
+		if (n <= 0)
+			{
+			if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+				return -1;
+			else
+				return is_dry;
+			}
+		
+		if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+			{
+			is_dry = 1;
+
+			/* disable sender dry event */
+#ifdef SCTP_EVENT
+			memset(&event, 0, sizeof(struct sctp_event));
+			event.se_assoc_id = 0;
+			event.se_type = SCTP_SENDER_DRY_EVENT;
+			event.se_on = 0;
+			ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+			eventsize = (socklen_t) sizeof(struct sctp_event_subscribe);
+			ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+			if (ret < 0)
+				return -1;
+
+			event.sctp_sender_dry_event = 0;
+
+			ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+			if (ret < 0)
+				return -1;
+			}
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+		if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+			dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+		if (data->handle_notifications != NULL)
+			data->handle_notifications(b, data->notification_context, (void*) &snp);
+
+		/* found notification, peek again */
+		memset(&snp, 0x00, sizeof(union sctp_notification));
+		iov.iov_base = (char *)&snp;
+		iov.iov_len = sizeof(union sctp_notification);
+		msg.msg_name = NULL;
+		msg.msg_namelen = 0;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = 0;
+
+		/* if we have seen the dry already, don't wait */
+		if (is_dry)
+			{
+			sockflags = fcntl(b->num, F_GETFL, 0);
+			fcntl(b->num, F_SETFL, O_NONBLOCK);
+			}
+
+		n = recvmsg(b->num, &msg, MSG_PEEK);
+
+		if (is_dry)
+			{
+			fcntl(b->num, F_SETFL, sockflags);
+			}
+
+		if (n <= 0)
+			{
+			if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+				return -1;
+			else
+				return is_dry;
+			}
+		}
+
+	/* read anything else */
+	return is_dry;
+}
+
+int BIO_dgram_sctp_msg_waiting(BIO *b)
+	{
+	int n, sockflags;
+	union sctp_notification snp;
+	struct msghdr msg;
+	struct iovec iov;
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+	/* Check if there are any messages waiting to be read */
+	do
+		{
+		memset(&snp, 0x00, sizeof(union sctp_notification));
+		iov.iov_base = (char *)&snp;
+		iov.iov_len = sizeof(union sctp_notification);
+		msg.msg_name = NULL;
+		msg.msg_namelen = 0;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = 0;
+
+		sockflags = fcntl(b->num, F_GETFL, 0);
+		fcntl(b->num, F_SETFL, O_NONBLOCK);
+		n = recvmsg(b->num, &msg, MSG_PEEK);
+		fcntl(b->num, F_SETFL, sockflags);
+
+		/* if notification, process and try again */
+		if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION))
+			{
+#ifdef SCTP_AUTHENTICATION_EVENT
+			if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+				dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+			memset(&snp, 0x00, sizeof(union sctp_notification));
+			iov.iov_base = (char *)&snp;
+			iov.iov_len = sizeof(union sctp_notification);
+			msg.msg_name = NULL;
+			msg.msg_namelen = 0;
+			msg.msg_iov = &iov;
+			msg.msg_iovlen = 1;
+			msg.msg_control = NULL;
+			msg.msg_controllen = 0;
+			msg.msg_flags = 0;
+			n = recvmsg(b->num, &msg, 0);
+
+			if (data->handle_notifications != NULL)
+				data->handle_notifications(b, data->notification_context, (void*) &snp);
+			}
+
+		} while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION));
+
+	/* Return 1 if there is a message to be read, return 0 otherwise. */
+	if (n > 0)
+		return 1;
+	else
+		return 0;
+	}
+
+static int dgram_sctp_puts(BIO *bp, const char *str)
+	{
+	int n,ret;
+
+	n=strlen(str);
+	ret=dgram_sctp_write(bp,str,n);
+	return(ret);
+	}
+#endif
+
 static int BIO_dgram_should_retry(int i)
 	{
 	int err;

diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 0000000..c52e0b7
--- /dev/null
+++ b/crypto/bn/asm/armv4-gf2m.pl

@@ -0,0 +1,278 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... Except that it has two code paths: pure
+# integer code suitable for any ARMv4 and later CPU and NEON code
+# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
+# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
+# faster than compiler-generated code. For ECDH and ECDSA verify (but
+# not for ECDSA sign) it means 25%-45% improvement depending on key
+# length, more for longer keys. Even though NEON 1x1 multiplication
+# runs in even less cycles, ~30, improvement is measurable only on
+# longer keys. One has to optimize code elsewhere to get NEON glow...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.type	mul_1x1_neon,%function
+.align	5
+mul_1x1_neon:
+	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a
+	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb
+	vshl.u64	`&Dlo("q2")`,d16,#16
+	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb
+	vshl.u64	`&Dlo("q3")`,d16,#24
+	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb
+	vshr.u64	`&Dlo("q1")`,#8
+	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb
+	vshl.u64	`&Dhi("q1")`,#24
+	veor		d0,`&Dlo("q1")`
+	vshr.u64	`&Dlo("q2")`,#16
+	veor		d0,`&Dhi("q1")`
+	vshl.u64	`&Dhi("q2")`,#16
+	veor		d0,`&Dlo("q2")`
+	vshr.u64	`&Dlo("q3")`,#24
+	veor		d0,`&Dhi("q2")`
+	vshl.u64	`&Dhi("q3")`,#8
+	veor		d0,`&Dlo("q3")`
+	veor		d0,`&Dhi("q3")`
+	bx	lr
+.size	mul_1x1_neon,.-mul_1x1_neon
+#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
+
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type	mul_1x1_ialu,%function
+.align	5
+mul_1x1_ialu:
+	mov	$a0,#0
+	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
+	str	$a0,[sp,#0]		@ tab[0]=0
+	add	$a2,$a1,$a1		@ a2=a1<<1
+	str	$a1,[sp,#4]		@ tab[1]=a1
+	eor	$a12,$a1,$a2		@ a1^a2
+	str	$a2,[sp,#8]		@ tab[2]=a2
+	mov	$a4,$a1,lsl#2		@ a4=a1<<2
+	str	$a12,[sp,#12]		@ tab[3]=a1^a2
+	eor	$a14,$a1,$a4		@ a1^a4
+	str	$a4,[sp,#16]		@ tab[4]=a4
+	eor	$a0,$a2,$a4		@ a2^a4
+	str	$a14,[sp,#20]		@ tab[5]=a1^a4
+	eor	$a12,$a12,$a4		@ a1^a2^a4
+	str	$a0,[sp,#24]		@ tab[6]=a2^a4
+	and	$i0,$mask,$b,lsl#2
+	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4
+
+	and	$i1,$mask,$b,lsr#1
+	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
+	and	$i0,$mask,$b,lsr#4
+	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
+	and	$i1,$mask,$b,lsr#7
+	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
+	eor	$lo,$lo,$t1,lsl#3	@ stall
+	mov	$hi,$t1,lsr#29
+	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]
+
+	and	$i0,$mask,$b,lsr#10
+	eor	$lo,$lo,$t0,lsl#6
+	eor	$hi,$hi,$t0,lsr#26
+	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]
+
+	and	$i1,$mask,$b,lsr#13
+	eor	$lo,$lo,$t1,lsl#9
+	eor	$hi,$hi,$t1,lsr#23
+	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]
+
+	and	$i0,$mask,$b,lsr#16
+	eor	$lo,$lo,$t0,lsl#12
+	eor	$hi,$hi,$t0,lsr#20
+	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]
+
+	and	$i1,$mask,$b,lsr#19
+	eor	$lo,$lo,$t1,lsl#15
+	eor	$hi,$hi,$t1,lsr#17
+	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]
+
+	and	$i0,$mask,$b,lsr#22
+	eor	$lo,$lo,$t0,lsl#18
+	eor	$hi,$hi,$t0,lsr#14
+	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]
+
+	and	$i1,$mask,$b,lsr#25
+	eor	$lo,$lo,$t1,lsl#21
+	eor	$hi,$hi,$t1,lsr#11
+	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]
+
+	tst	$a,#1<<30
+	and	$i0,$mask,$b,lsr#28
+	eor	$lo,$lo,$t0,lsl#24
+	eor	$hi,$hi,$t0,lsr#8
+	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]
+
+	eorne	$lo,$lo,$b,lsl#30
+	eorne	$hi,$hi,$b,lsr#2
+	tst	$a,#1<<31
+	eor	$lo,$lo,$t1,lsl#27
+	eor	$hi,$hi,$t1,lsr#5
+	eorne	$lo,$lo,$b,lsl#31
+	eorne	$hi,$hi,$b,lsr#1
+	eor	$lo,$lo,$t0,lsl#30
+	eor	$hi,$hi,$t0,lsr#2
+
+	mov	pc,lr
+.size	mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void	bn_GF2m_mul_2x2(BN_ULONG *r,
+#	BN_ULONG a1,BN_ULONG a0,
+#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+
+$code.=<<___;
+.global	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,%function
+.align	5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+.Lpic:	ldr	r12,[pc,r12]
+	tst	r12,#1
+	beq	.Lialu
+
+	veor	$A1,$A1
+	vmov.32	$B1,r3,r3		@ two copies of b1
+	vmov.32	${A1}[0],r1		@ a1
+
+	veor	$A0,$A0
+	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
+	vmov.32	${A0}[0],r2		@ a0
+	mov	r12,lr
+
+	vmov	d16,$A1
+	vmov	d17,$B1
+	bl	mul_1x1_neon		@ a1·b1
+	vmov	$A1B1,d0
+
+	vmov	d16,$A0
+	vmov	d17,$B0
+	bl	mul_1x1_neon		@ a0·b0
+	vmov	$A0B0,d0
+
+	veor	d16,$A0,$A1
+	veor	d17,$B0,$B1
+	veor	$A0,$A0B0,$A1B1
+	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)
+
+	veor	d0,$A0			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	vshl.u64 d1,d0,#32
+	vshr.u64 d0,d0,#32
+	veor	$A0B0,d1
+	veor	$A1B1,d0
+	vst1.32	{${A0B0}[0]},[r0,:32]!
+	vst1.32	{${A0B0}[1]},[r0,:32]!
+	vst1.32	{${A1B1}[0]},[r0,:32]!
+	vst1.32	{${A1B1}[1]},[r0,:32]
+	bx	r12
+.align	4
+.Lialu:
+#endif
+___
+$ret="r10";	# reassigned 1st argument
+$code.=<<___;
+	stmdb	sp!,{r4-r10,lr}
+	mov	$ret,r0			@ reassign 1st argument
+	mov	$b,r3			@ $b=b1
+	ldr	r3,[sp,#32]		@ load b0
+	mov	$mask,#7<<2
+	sub	sp,sp,#32		@ allocate tab[8]
+
+	bl	mul_1x1_ialu		@ a1·b1
+	str	$lo,[$ret,#8]
+	str	$hi,[$ret,#12]
+
+	eor	$b,$b,r3		@ flip b0 and b1
+	 eor	$a,$a,r2		@ flip a0 and a1
+	eor	r3,r3,$b
+	 eor	r2,r2,$a
+	eor	$b,$b,r3
+	 eor	$a,$a,r2
+	bl	mul_1x1_ialu		@ a0·b0
+	str	$lo,[$ret]
+	str	$hi,[$ret,#4]
+
+	eor	$a,$a,r2
+	eor	$b,$b,r3
+	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+	ldmia	$ret,{@r[0]-@r[3]}
+	eor	$lo,$lo,$hi
+	eor	$hi,$hi,@r[1]
+	eor	$lo,$lo,@r[0]
+	eor	$hi,$hi,@r[2]
+	eor	$lo,$lo,@r[3]
+	eor	$hi,$hi,@r[3]
+	str	$hi,[$ret,#8]
+	eor	$lo,$lo,$hi
+	add	sp,sp,#32		@ destroy tab[8]
+	str	$lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r10,pc}
+#else
+	ldmia	sp!,{r4-r10,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	5
+
+.comm	OPENSSL_armcap_P,4,4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
+print $code;
+close STDOUT;   # enforce flush

diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index 14e0d2d..f78a8b5 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl

@@ -23,6 +23,9 @@
 # than 1/2KB. Windows CE port would be trivial, as it's exclusively
 # about decorations, ABI and instruction syntax are identical.
 
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
 $num="r0";	# starts as num argument, but holds &tp[num-1]
 $ap="r1";
 $bp="r2"; $bi="r2"; $rp="r2";
@@ -89,9 +92,9 @@
 .L1st:
 	ldr	$aj,[$ap],#4		@ ap[j],ap++
 	mov	$alo,$ahi
+	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$ahi,#0
 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
-	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$nhi,#0
 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
 	adds	$nlo,$nlo,$alo
@@ -101,21 +104,21 @@
 	bne	.L1st
 
 	adds	$nlo,$nlo,$ahi
-	mov	$nhi,#0
-	adc	$nhi,$nhi,#0
 	ldr	$tp,[$_bp]		@ restore bp
-	str	$nlo,[$num]		@ tp[num-1]=
+	mov	$nhi,#0
 	ldr	$n0,[$_n0]		@ restore n0
+	adc	$nhi,$nhi,#0
+	str	$nlo,[$num]		@ tp[num-1]=
 	str	$nhi,[$num,#4]		@ tp[num]=
 
 .Louter:
 	sub	$tj,$num,sp		@ "original" $num-1 value
 	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
-	sub	$np,$np,$tj		@ "rewind" np to &np[1]
 	ldr	$bi,[$tp,#4]!		@ *(++bp)
+	sub	$np,$np,$tj		@ "rewind" np to &np[1]
 	ldr	$aj,[$ap,#-4]		@ ap[0]
-	ldr	$nj,[$np,#-4]		@ np[0]
 	ldr	$alo,[sp]		@ tp[0]
+	ldr	$nj,[$np,#-4]		@ np[0]
 	ldr	$tj,[sp,#4]		@ tp[1]
 
 	mov	$ahi,#0
@@ -129,13 +132,13 @@
 .Linner:
 	ldr	$aj,[$ap],#4		@ ap[j],ap++
 	adds	$alo,$ahi,$tj		@ +=tp[j]
+	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$ahi,#0
 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
-	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$nhi,#0
 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
-	ldr	$tj,[$tp,#8]		@ tp[j+1]
 	adc	$ahi,$ahi,#0
+	ldr	$tj,[$tp,#8]		@ tp[j+1]
 	adds	$nlo,$nlo,$alo
 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
 	adc	$nlo,$nhi,#0
@@ -144,13 +147,13 @@
 
 	adds	$nlo,$nlo,$ahi
 	mov	$nhi,#0
-	adc	$nhi,$nhi,#0
-	adds	$nlo,$nlo,$tj
-	adc	$nhi,$nhi,#0
 	ldr	$tp,[$_bp]		@ restore bp
-	ldr	$tj,[$_bpend]		@ restore &bp[num]
-	str	$nlo,[$num]		@ tp[num-1]=
+	adc	$nhi,$nhi,#0
 	ldr	$n0,[$_n0]		@ restore n0
+	adds	$nlo,$nlo,$tj
+	ldr	$tj,[$_bpend]		@ restore &bp[num]
+	adc	$nhi,$nhi,#0
+	str	$nlo,[$num]		@ tp[num-1]=
 	str	$nhi,[$num,#4]		@ tp[num]=
 
 	cmp	$tp,$tj

diff --git a/crypto/bn/asm/armv4-mont.s b/crypto/bn/asm/armv4-mont.s
index 0488455..64c220b 100644
--- a/crypto/bn/asm/armv4-mont.s
+++ b/crypto/bn/asm/armv4-mont.s

@@ -38,9 +38,9 @@
 .L1st:
 	ldr	r5,[r1],#4		@ ap[j],ap++
 	mov	r10,r11
+	ldr	r6,[r3],#4		@ np[j],np++
 	mov	r11,#0
 	umlal	r10,r11,r5,r2	@ ap[j]*bp[0]
-	ldr	r6,[r3],#4		@ np[j],np++
 	mov	r14,#0
 	umlal	r12,r14,r6,r8	@ np[j]*n0
 	adds	r12,r12,r10
@@ -50,21 +50,21 @@
 	bne	.L1st
 
 	adds	r12,r12,r11
-	mov	r14,#0
-	adc	r14,r14,#0
 	ldr	r4,[r0,#13*4]		@ restore bp
-	str	r12,[r0]		@ tp[num-1]=
+	mov	r14,#0
 	ldr	r8,[r0,#14*4]		@ restore n0
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
 	str	r14,[r0,#4]		@ tp[num]=
 
 .Louter:
 	sub	r7,r0,sp		@ "original" r0-1 value
 	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
-	sub	r3,r3,r7		@ "rewind" np to &np[1]
 	ldr	r2,[r4,#4]!		@ *(++bp)
+	sub	r3,r3,r7		@ "rewind" np to &np[1]
 	ldr	r5,[r1,#-4]		@ ap[0]
-	ldr	r6,[r3,#-4]		@ np[0]
 	ldr	r10,[sp]		@ tp[0]
+	ldr	r6,[r3,#-4]		@ np[0]
 	ldr	r7,[sp,#4]		@ tp[1]
 
 	mov	r11,#0
@@ -78,13 +78,13 @@
 .Linner:
 	ldr	r5,[r1],#4		@ ap[j],ap++
 	adds	r10,r11,r7		@ +=tp[j]
+	ldr	r6,[r3],#4		@ np[j],np++
 	mov	r11,#0
 	umlal	r10,r11,r5,r2	@ ap[j]*bp[i]
-	ldr	r6,[r3],#4		@ np[j],np++
 	mov	r14,#0
 	umlal	r12,r14,r6,r8	@ np[j]*n0
-	ldr	r7,[r4,#8]		@ tp[j+1]
 	adc	r11,r11,#0
+	ldr	r7,[r4,#8]		@ tp[j+1]
 	adds	r12,r12,r10
 	str	r12,[r4],#4		@ tp[j-1]=,tp++
 	adc	r12,r14,#0
@@ -93,13 +93,13 @@
 
 	adds	r12,r12,r11
 	mov	r14,#0
-	adc	r14,r14,#0
-	adds	r12,r12,r7
-	adc	r14,r14,#0
 	ldr	r4,[r0,#13*4]		@ restore bp
-	ldr	r7,[r0,#15*4]		@ restore &bp[num]
-	str	r12,[r0]		@ tp[num-1]=
+	adc	r14,r14,#0
 	ldr	r8,[r0,#14*4]		@ restore n0
+	adds	r12,r12,r7
+	ldr	r7,[r0,#15*4]		@ restore &bp[num]
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
 	str	r14,[r0,#4]		@ tp[num]=
 
 	cmp	r4,r7

diff --git a/crypto/bn/asm/bn-mips.s b/crypto/bn/asm/bn-mips.s
index d1535b1..02097fa 100644
--- a/crypto/bn/asm/bn-mips.s
+++ b/crypto/bn/asm/bn-mips.s

@@ -394,14 +394,14 @@
 	sltu	$2,$14,$10
 	sw	$14,-2*4($4)
 	addu	$2,$24
-
+	
 	addu	$11,$15
 	sltu	$25,$11,$15
 	addu	$15,$11,$2
 	sltu	$2,$15,$11
 	sw	$15,-4($4)
 	addu	$2,$25
-
+	
 	.set	noreorder
 	bgtzl	$1,.L_bn_add_words_loop
 	lw	$12,0($5)
@@ -567,7 +567,7 @@
 				# so that we can save two arguments
 				# and return address in registers
 				# instead of stack:-)
-
+				
 	lw	$4,($7)
 	move	$10,$5
 	bne	$4,$6,bn_div_3_words_internal

diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 0000000..e258658
--- /dev/null
+++ b/crypto/bn/asm/ia64-mont.pl

@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+#   stalls after ldf8, xma and getf.sig outside inner loop and
+#   improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+#   once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+#   acute interest, because upcoming Tukwila's individual cores are
+#   reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
+# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
+# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
+# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
+# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
+# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
+# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
+# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
+# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
+# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
+# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
+# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
+# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
+#
+# As it can be seen, RSA sign performance improves by 130-30%,
+# hereafter less for longer keys, while verify - by 74-13%.
+# DSA performance improves by 115-30%.
+
+if ($^O eq "hpux") {
+    $ADDP="addp4";
+    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+//		    const BN_ULONG *bp,const BN_ULONG *np,
+//		    const BN_ULONG *n0p,int num);			
+.align	64
+.global	bn_mul_mont#
+.proc	bn_mul_mont#
+bn_mul_mont:
+	.prologue
+	.body
+{ .mmi;	cmp4.le		p6,p7=2,r37;;
+(p6)	cmp4.lt.unc	p8,p9=8,r37
+	mov		ret0=r0		};;
+{ .bbb;
+(p9)	br.cond.dptk.many	bn_mul_mont_8
+(p8)	br.cond.dpnt.many	bn_mul_mont_general
+(p7)	br.ret.spnt.many	b0	};;
+.endp	bn_mul_mont#
+
+prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
+
+rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
+tptr=r16;	// &tp[0]
+tp_1=r17;	// &tp[-1]
+num=r18;	len=r19;	lc=r20;
+topbit=r21;	// carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align	64
+.local	bn_mul_mont_general#
+.proc	bn_mul_mont_general#
+bn_mul_mont_general:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,6,2,0,8
+	$ADDP	aptr=0,in1
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc		}
+{ .mmi;	.vframe	prevsp
+	mov	prevsp=sp
+	$ADDP	bptr=0,in2
+	.save	pr,prevpr
+	mov	prevpr=pr		};;
+
+	.body
+	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
+	.rotr		a[3],n[3],t[2]
+
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		alo[4]=[aptr],16	// ap[0]
+	$ADDP		r30=8,in1	};;
+{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
+	ldf8		alo[2]=[aptr],16	// ap[2]
+	$ADDP		in4=0,in4	};;
+{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
+	ldf8		n0=[in4]		// n0
+	$ADDP		rptr=0,in0		}
+{ .mmi;	$ADDP		nptr=0,in3
+	mov		r31=16
+	zxt4		num=in5		};;
+{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
+	shladd		len=num,3,r0
+	shladd		r31=num,3,r31	};;
+{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
+	add		lc=-5,num
+	sub		r31=sp,r31	};;
+{ .mfb;	and		sp=-16,r31		// alloca
+	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
+	nop.b		0		}
+{ .mfb;	nop.m		0
+	xmpy.lu		alo[4]=alo[4],bi
+	brp.loop.imp	.L1st_ctop,.L1st_cend-16
+					};;
+{ .mfi;	nop.m		0
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
+	add		tp_1=8,sp	}
+{ .mfi;	nop.m		0
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20001f<<16
+			// ------^----- (p40) at first (p23)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	nop.m		0
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;	nop.m		0
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+.align	32
+.L1st_ctop:
+.pred.rel	"mutex",p40,p42
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
+{ .mfi;	(p23)	st8		[tp_1]=n[2],8
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	br.ctop.sptk	.L1st_ctop			};;
+.L1st_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	add		num=-1,num	};;	// num--
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+	sub		nptr=nptr,len	};;
+{ .mmi;	.pred.rel	"mutex",p39,p41
+(p39)	add		topbit=r0,r0
+(p41)	add		topbit=r0,r0,1
+	nop.i		0		}	
+{ .mmi;	st8		[tp_1]=n[0]
+	add		tptr=16,sp
+	add		tp_1=8,sp	};;
+
+.Louter:
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		ahi[3]=[tptr]		// tp[0]
+	add		r30=8,aptr	};;
+{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
+	ldf8		alo[3]=[r30],16		// ap[1]
+	add		r31=8,nptr	};;
+{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
+	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
+	brp.loop.imp	.Linner_ctop,.Linner_cend-16
+					}
+{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
+	xma.lu		alo[4]=alo[4],bi,ahi[3]
+	clrrrb.pr			};;
+{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
+	nop.i		0		}
+{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20101f<<16
+			// ------^----- (p40) at first (p23)
+			// --------^--- (p30) at first (p22)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
+// in latter case accounts for two-tick pipeline stall, which means
+// that its performance would be ~20% lower than optimal one. No
+// attempt was made to address this, because original Itanium is
+// hardly represented out in the wild...
+.align	32
+.Linner_ctop:
+.pred.rel	"mutex",p40,p42
+.pred.rel	"mutex",p30,p32
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
+{ .mfi;	(p16)	nop.m		0
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p16)	nop.f		0
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mfi;	(p21)	ld8		t[0]=[tptr],8
+	(p16)	nop.f		0
+	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
+{ .mfi;	(p16)	nop.m		0
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
+{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
+{ .mmb;	(p23)	st8		[tp_1]=n[2],8
+	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
+	br.ctop.sptk	.Linner_ctop			};;
+.Linner_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	nop.i		0		};;
+
+{ .mmi;	.pred.rel	"mutex",p31,p33
+(p31)	add		a[0]=a[0],topbit
+(p33)	add		a[0]=a[0],topbit,1
+	mov		topbit=r0	};;
+{ .mfi; .pred.rel	"mutex",p31,p33
+(p31)	cmp.ltu		p32,p30=a[0],topbit
+(p33)	cmp.leu		p32,p30=a[0],topbit
+					}
+{ .mfi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+					};;
+{ .mmi;	.pred.rel	"mutex",p44,p46
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+(p32)	add		topbit=r0,r0,1	}
+
+{ .mmi;	st8		[tp_1]=n[0],8
+	cmp4.ne		p6,p0=1,num
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	sub		nptr=nptr,len
+(p41)	add		topbit=r0,r0,1
+	add		tptr=16,sp	}
+{ .mmb;	add		tp_1=8,sp
+	add		num=-1,num		// num--
+(p6)	br.cond.sptk.many	.Louter	};;
+
+{ .mbb;	add		lc=4,lc
+	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
+	clrrrb.pr			};;
+{ .mii;	nop.m		0
+	mov		pr.rot=0x10001<<16
+			// ------^---- (p33) at first (p17)
+	mov		ar.lc=lc	}
+{ .mii;	nop.m		0
+	mov		ar.ec=3
+	nop.i		0		};;
+
+.Lsub_ctop:
+.pred.rel	"mutex",p33,p35
+{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
+	(p16)	nop.f		0
+	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
+{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
+	(p16)	nop.f		0
+	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
+{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
+	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
+	(p18)	nop.b		0			}
+{ .mib;	(p18)	nop.m		0
+	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
+	br.ctop.sptk	.Lsub_ctop			};;
+.Lsub_cend:
+
+{ .mmb;	.pred.rel	"mutex",p34,p36
+(p34)	sub	topbit=topbit,r0	// (p19)
+(p36)	sub	topbit=topbit,r0,1
+	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
+					}
+{ .mmb;	sub	rptr=rptr,len		// rewind
+	sub	tptr=tptr,len
+	clrrrb.pr			};;
+{ .mmi;	and	aptr=tptr,topbit
+	andcm	bptr=rptr,topbit
+	mov	pr.rot=1<<16		};;
+{ .mii;	or	nptr=aptr,bptr
+	mov	ar.lc=lc
+	mov	ar.ec=3			};;
+
+.Lcopy_ctop:
+{ .mmb;	(p16)	ld8	n[0]=[nptr],8
+	(p18)	st8	[tptr]=r0,8
+	(p16)	nop.b	0		}
+{ .mmb;	(p16)	nop.m	0
+	(p18)	st8	[rptr]=n[2],8
+	br.ctop.sptk	.Lcopy_ctop	};;
+.Lcopy_cend:
+
+{ .mmi;	mov		ret0=1			// signal "handled"
+	rum		1<<5			// clear um.mfh
+	mov		ar.lc=prevlc	}
+{ .mib;	.restore	sp
+	mov		sp=prevsp
+	mov		pr=prevpr,0x1ffff
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_general#
+
+a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
+n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
+t0=r15;
+
+ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.align	64
+.skip	48		// aligns loop body
+.local	bn_mul_mont_8#
+.proc	bn_mul_mont_8#
+bn_mul_mont_8:
+	.prologue
+{ .mmi;	.save		ar.pfs,prevfs
+	alloc		prevfs=ar.pfs,6,2,0,8
+	.vframe		prevsp
+	mov		prevsp=sp
+	.save		ar.lc,prevlc
+	mov		prevlc=ar.lc	}
+{ .mmi;	add		r17=-6*16,sp
+	add		sp=-7*16,sp
+	.save		pr,prevpr
+	mov		prevpr=pr	};;
+
+{ .mmi;	.save.gf	0,0x10
+	stf.spill	[sp]=f16,-16
+	.save.gf	0,0x20
+	stf.spill	[r17]=f17,32
+	add		r16=-5*16,prevsp};;
+{ .mmi;	.save.gf	0,0x40
+	stf.spill	[r16]=f18,32
+	.save.gf	0,0x80
+	stf.spill	[r17]=f19,32
+	$ADDP		aptr=0,in1	};;
+{ .mmi;	.save.gf	0,0x100
+	stf.spill	[r16]=f20,32
+	.save.gf	0,0x200
+	stf.spill	[r17]=f21,32
+	$ADDP		r29=8,in1	};;
+{ .mmi;	.save.gf	0,0x400
+	stf.spill	[r16]=f22
+	.save.gf	0,0x800
+	stf.spill	[r17]=f23
+	$ADDP		rptr=0,in0	};;
+
+	.body
+	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+	.rotr		t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
+	ldf8		ai1=[r29],16		// ap[1]
+	$ADDP		bptr=0,in2	}
+{ .mmi;	$ADDP		r30=8,in2
+	$ADDP		nptr=0,in3
+	$ADDP		r31=8,in3	};;
+{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
+	ldf8		bj[6]=[r30],16		// bp[1]
+	cmp4.le		p4,p5=3,in5	}
+{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
+	ldf8		ni1=[r31],16		// np[1]
+	cmp4.le		p6,p7=4,in5	};;
+
+{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
+	(p5)fcvt.fxu	ai2=f0
+	cmp4.le		p8,p9=5,in5	}
+{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
+	(p7)fcvt.fxu	ai3=f0
+	cmp4.le		p10,p11=6,in5	}
+{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
+	(p5)fcvt.fxu	bj[5]=f0
+	cmp4.le		p12,p13=7,in5	}
+{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
+	(p7)fcvt.fxu	bj[4]=f0
+	cmp4.le		p14,p15=8,in5	}
+{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
+	(p5)fcvt.fxu	ni2=f0
+	addp4		r28=-1,in5	}
+{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
+	(p7)fcvt.fxu	ni3=f0
+	$ADDP		in4=0,in4	};;
+
+{ .mfi;	ldf8		n0=[in4]
+	fcvt.fxu	tf[1]=f0
+	nop.i		0		}
+
+{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
+	(p9)fcvt.fxu	ai4=f0
+	mov		t[0]=r0		}
+{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
+	(p11)fcvt.fxu	ai5=f0
+	mov		t[1]=r0		}
+{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
+	(p9)fcvt.fxu	bj[3]=f0
+	mov		t[2]=r0		}
+{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
+	(p11)fcvt.fxu	bj[2]=f0
+	mov		t[3]=r0		}
+{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
+	(p9)fcvt.fxu	ni4=f0
+	mov		t[4]=r0		}
+{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
+	(p11)fcvt.fxu	ni5=f0
+	mov		t[5]=r0		};;
+
+{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
+	(p13)fcvt.fxu	ai6=f0
+	mov		t[6]=r0		}
+{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
+	(p15)fcvt.fxu	ai7=f0
+	mov		t[7]=r0		}
+{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
+	(p13)fcvt.fxu	bj[1]=f0
+	mov		ar.lc=r28	}
+{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
+	(p15)fcvt.fxu	bj[0]=f0
+	mov		ar.ec=1		}
+{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
+	(p13)fcvt.fxu	ni6=f0
+	mov		pr.rot=1<<16	}
+{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
+	(p15)fcvt.fxu	ni7=f0
+	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
+					};;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
+// to measure with help of Interval Time Counter indicated that the
+// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
+// addressing the issue is problematic, because I don't have access
+// to platform-specific instruction-level profiler. On Itanium it
+// should run in 56*n ticks, because of higher xma latency...
+.Louter_8_ctop:
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 0:
+	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
+	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
+{ .mfi;	(p42)	add		a3=a3,n3,1
+	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
+	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
+	(p50)	add		t[6]=t[6],a3,1	};;
+{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
+	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
+	(p40)	cmp.ltu		p43,p41=a3,n3	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
+	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
+	(p48)	cmp.ltu		p51,p49=t[6],a3
+	(p50)	cmp.leu		p51,p49=t[6],a3	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 4:
+	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
+	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
+{ .mfi;	(p43)	add		a4=a4,n4,1
+	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
+	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
+	(p51)	add		t[5]=t[5],a4,1	};;
+{ .mfi;	(p16)	nop.m		0			// 6:
+	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
+	(p41)	cmp.ltu		p42,p40=a4,n4	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
+	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
+	(p49)	cmp.ltu		p50,p48=t[5],a4
+	(p51)	cmp.leu		p50,p48=t[5],a4	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 8:
+	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
+	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
+{ .mfi;	(p42)	add		a5=a5,n5,1
+	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
+	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
+	(p50)	add		t[4]=t[4],a5,1	};;
+{ .mfi;	(p16)	nop.m		0			// 10:
+	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
+	(p40)	cmp.ltu		p43,p41=a5,n5	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
+	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
+	(p48)	cmp.ltu		p51,p49=t[4],a5
+	(p50)	cmp.leu		p51,p49=t[4],a5	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
+	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
+	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
+{ .mfi;	(p43)	add		a6=a6,n6,1
+	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
+	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
+	(p51)	add		t[3]=t[3],a6,1	};;
+{ .mfi;	(p16)	nop.m		0			// 14:
+	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
+	(p41)	cmp.ltu		p42,p40=a6,n6	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
+	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	nop.m		0			// 15:
+	(p49)	cmp.ltu		p50,p48=t[3],a6
+	(p51)	cmp.leu		p50,p48=t[3],a6	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 16:
+	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
+	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
+{ .mfi;	(p42)	add		a7=a7,n7,1
+	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
+	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
+	(p50)	add		t[2]=t[2],a7,1	};;
+{ .mfi;	(p16)	nop.m		0			// 18:
+	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
+	(p40)	cmp.ltu		p43,p41=a7,n7	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
+	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
+	(p48)	cmp.ltu		p51,p49=t[2],a7
+	(p50)	cmp.leu		p51,p49=t[2],a7	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 20:
+	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
+	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
+{ .mfi;	(p43)	add		a8=a8,n8,1
+	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
+	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
+	(p51)	add		t[1]=t[1],a8,1	};;
+{ .mfi;	(p16)	nop.m		0			// 22:
+	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
+	(p41)	cmp.ltu		p42,p40=a8,n8	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
+	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
+	(p49)	cmp.ltu		p50,p48=t[1],a8
+	(p51)	cmp.leu		p50,p48=t[1],a8	};;
+{ .mfi;	(p16)	nop.m		0			// 24:
+	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
+	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
+{ .mfi;	(p16)	nop.m		0
+	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
+	(p17)	mov		t[0]=r0		};;
+{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
+	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
+	(p42)	add		t[0]=t[0],r0,1	};;
+{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
+	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
+	(p50)	add		t[0]=t[0],r0,1	}
+{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
+	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
+	(p16)	cmp.ltu.unc	p50,p48=t0,a1
+	(p16)	nop.i		0		};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 28:
+	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
+	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
+{ .mfi;	(p42)	add		a2=a2,n2,1
+	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
+	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
+	(p50)	add		t[6]=t[6],a2,1	};;
+{ .mfi;	(p16)	nop.m		0			// 30:
+	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
+	(p40)	cmp.ltu		p41,p39=a2,n2	}
+{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
+	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
+	(p16)	nop.f		0
+	(p48)	cmp.ltu		p49,p47=t[6],a2	}
+{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
+	(p16)	nop.f		0
+	br.ctop.sptk.many	.Louter_8_ctop	};;
+.Louter_8_cend:
+
+// above loop has to execute one more time, without (p16), which is
+// replaced with merged move of np[8] to GPR bank
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mmi;	(p0)	getf.sig	n1=ni0			// 0:
+	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
+	(p42)	add		a3=a3,n3,1	};;
+{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
+	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
+	(p50)	add		t[6]=t[6],a3,1	};;
+{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
+	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
+	(p40)	cmp.ltu		p43,p41=a3,n3	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
+	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
+	(p0)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
+	(p48)	cmp.ltu		p51,p49=t[6],a3
+	(p50)	cmp.leu		p51,p49=t[6],a3	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mmi;	(p0)	getf.sig	n2=ni1			// 4:
+	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
+	(p43)	add		a4=a4,n4,1	};;
+{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
+	(p0)	nop.f		0
+	(p51)	add		t[5]=t[5],a4,1	};;
+{ .mfi;	(p0)	getf.sig	n3=ni2			// 6:
+	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
+	(p41)	cmp.ltu		p42,p40=a4,n4	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
+	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
+	(p0)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
+	(p49)	cmp.ltu		p50,p48=t[5],a4
+	(p51)	cmp.leu		p50,p48=t[5],a4	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mii;	(p0)	getf.sig	n4=ni3			// 8:
+	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
+	(p42)	add		a5=a5,n5,1	};;
+{ .mii;	(p0)	nop.m		0			// 9:
+	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
+	(p50)	add		t[4]=t[4],a5,1	};;
+{ .mii;	(p0)	nop.m		0			// 10:
+	(p40)	cmp.ltu		p43,p41=a5,n5
+	(p42)	cmp.leu		p43,p41=a5,n5	};;
+{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
+	(p48)	cmp.ltu		p51,p49=t[4],a5
+	(p50)	cmp.leu		p51,p49=t[4],a5	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
+	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
+	(p43)	add		a6=a6,n6,1	};;
+{ .mii;	(p0)	getf.sig	n5=ni4			// 13:
+	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
+	(p51)	add		t[3]=t[3],a6,1	};;
+{ .mii;	(p0)	nop.m		0			// 14:
+	(p41)	cmp.ltu		p42,p40=a6,n6
+	(p43)	cmp.leu		p42,p40=a6,n6	};;
+{ .mii;	(p0)	getf.sig	n6=ni5			// 15:
+	(p49)	cmp.ltu		p50,p48=t[3],a6
+	(p51)	cmp.leu		p50,p48=t[3],a6	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mii;	(p0)	nop.m		0			// 16:
+	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
+	(p42)	add		a7=a7,n7,1	};;
+{ .mii;	(p0)	nop.m		0			// 17:
+	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
+	(p50)	add		t[2]=t[2],a7,1	};;
+{ .mii;	(p0)	nop.m		0			// 18:
+	(p40)	cmp.ltu		p43,p41=a7,n7
+	(p42)	cmp.leu		p43,p41=a7,n7	};;
+{ .mii;	(p0)	getf.sig	n7=ni6			// 19:
+	(p48)	cmp.ltu		p51,p49=t[2],a7
+	(p50)	cmp.leu		p51,p49=t[2],a7	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mii;	(p0)	nop.m		0			// 20:
+	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
+	(p43)	add		a8=a8,n8,1	};;
+{ .mmi;	(p0)	nop.m		0			// 21:
+	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
+	(p51)	add		t[1]=t[1],a8,1	}
+{ .mmi;	(p17)	mov		t[0]=r0
+	(p41)	cmp.ltu		p42,p40=a8,n8
+	(p43)	cmp.leu		p42,p40=a8,n8	};;
+{ .mmi;	(p0)	getf.sig	n8=ni7			// 22:
+	(p49)	cmp.ltu		p50,p48=t[1],a8
+	(p51)	cmp.leu		p50,p48=t[1],a8	}
+{ .mmi;	(p42)	add		t[0]=t[0],r0,1
+	(p0)	add		r16=-7*16,prevsp
+	(p0)	add		r17=-6*16,prevsp	};;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
+{ .mmi;	(p50)add	t[0]=t[0],r0,1
+	add		r18=-5*16,prevsp
+	sub		n1=t0,n1	};;
+{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n2=t[7],n2
+	(p34)sub	n2=t[7],n2,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
+	(p34)cmp.geu	p35,p33=n2,t[7];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n3=t[6],n3	}
+{ .mmi;	(p35)sub	n3=t[6],n3,1;;
+	(p33)cmp.gtu	p34,p32=n3,t[6]
+	(p35)cmp.geu	p34,p32=n3,t[6]	};;
+	.pred.rel	"mutex",p32,p34
+{ .mii;	(p32)sub	n4=t[5],n4
+	(p34)sub	n4=t[5],n4,1;;
+	(p32)cmp.gtu	p35,p33=n4,t[5]	}
+{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n5=t[4],n5
+	(p35)sub	n5=t[4],n5,1	};;
+{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
+	(p35)cmp.geu	p34,p32=n5,t[4];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n6=t[3],n6	}
+{ .mmi;	(p34)sub	n6=t[3],n6,1;;
+	(p32)cmp.gtu	p35,p33=n6,t[3]
+	(p34)cmp.geu	p35,p33=n6,t[3]	};;
+	.pred.rel	"mutex",p33,p35
+{ .mii;	(p33)sub	n7=t[2],n7
+	(p35)sub	n7=t[2],n7,1;;
+	(p33)cmp.gtu	p34,p32=n7,t[2]	}
+{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n8=t[1],n8
+	(p34)sub	n8=t[1],n8,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
+	(p34)cmp.geu	p35,p33=n8,t[1];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	a8=t[0],r0	}
+{ .mmi;	(p35)sub	a8=t[0],r0,1;;
+	(p33)cmp.gtu	p34,p32=a8,t[0]
+	(p35)cmp.geu	p34,p32=a8,t[0]	};;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+	.pred.rel	"mutex",p32,p34
+{ .mmi;	(p32)st8	[rptr]=n1,8
+	(p34)st8	[rptr]=t0,8
+	add		r19=-4*16,prevsp};;
+{ .mmb;	(p32)st8	[rptr]=n2,8
+	(p34)st8	[rptr]=t[7],8
+	(p5)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n3,8
+	(p34)st8	[rptr]=t[6],8
+	(p7)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n4,8
+	(p34)st8	[rptr]=t[5],8
+	(p9)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n5,8
+	(p34)st8	[rptr]=t[4],8
+	(p11)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n6,8
+	(p34)st8	[rptr]=t[3],8
+	(p13)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n7,8
+	(p34)st8	[rptr]=t[2],8
+	(p15)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n8,8
+	(p34)st8	[rptr]=t[1],8
+	nop.b		0		};;
+.Ldone:						// epilogue
+{ .mmi;	ldf.fill	f16=[r16],64
+	ldf.fill	f17=[r17],64
+	nop.i		0		}
+{ .mmi;	ldf.fill	f18=[r18],64
+	ldf.fill	f19=[r19],64
+	mov		pr=prevpr,0x1ffff	};;
+{ .mmi;	ldf.fill	f20=[r16]
+	ldf.fill	f21=[r17]
+	mov		ar.lc=prevlc	}
+{ .mmi;	ldf.fill	f22=[r18]
+	ldf.fill	f23=[r19]
+	mov		ret0=1		}	// signal "handled"
+{ .mib;	rum		1<<5
+	.restore	sp
+	mov		sp=prevsp
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_8#
+
+.type	copyright#,\@object
+copyright:
+stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;

diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
index f04b3b9..c162a3e 100644
--- a/crypto/bn/asm/mips.pl
+++ b/crypto/bn/asm/mips.pl

@@ -583,14 +583,14 @@
 	sltu	$v0,$t2,$ta2
 	$ST	$t2,-2*$BNSZ($a0)
 	$ADDU	$v0,$t8
-
+	
 	$ADDU	$ta3,$t3
 	sltu	$t9,$ta3,$t3
 	$ADDU	$t3,$ta3,$v0
 	sltu	$v0,$t3,$ta3
 	$ST	$t3,-$BNSZ($a0)
 	$ADDU	$v0,$t9
-
+	
 	.set	noreorder
 	bgtzl	$at,.L_bn_add_words_loop
 	$LD	$t0,0($a1)
@@ -790,7 +790,7 @@
 				# so that we can save two arguments
 				# and return address in registers
 				# instead of stack:-)
-
+				
 	$LD	$a0,($a3)
 	move	$ta2,$a1
 	bne	$a0,$a2,bn_div_3_words_internal

diff --git a/crypto/bn/asm/modexp512-x86_64.pl b/crypto/bn/asm/modexp512-x86_64.pl
new file mode 100644
index 0000000..54aeb01
--- /dev/null
+++ b/crypto/bn/asm/modexp512-x86_64.pl

@@ -0,0 +1,1496 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2010-2011 Intel Corp.
+#   Author: [email protected]
+#           Jim Guilford
+#           [email protected]
+#           [email protected]
+#
+# More information about algorithm used can be found at:
+#   http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
+#
+# ====================================================================
+# Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+#
+# 3. All advertising materials mentioning features or use of this
+#    software must display the following acknowledgment:
+#    "This product includes software developed by the OpenSSL Project
+#    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+#
+# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+#    endorse or promote products derived from this software without
+#    prior written permission. For written permission, please contact
+#    [email protected].
+#
+# 5. Products derived from this software may not be called "OpenSSL"
+#    nor may "OpenSSL" appear in their names without prior written
+#    permission of the OpenSSL Project.
+#
+# 6. Redistributions of any form whatsoever must retain the following
+#    acknowledgment:
+#    "This product includes software developed by the OpenSSL Project
+#    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+#
+# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ====================================================================
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+use strict;
+my $code=".text\n\n";
+my $m=0;
+
+#
+# Define x512 macros
+#
+
+#MULSTEP_512_ADD	MACRO	x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
+#
+# uses rax, rdx, and args
+sub MULSTEP_512_ADD
+{
+ my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	 mov	(+8*0)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 mov	($ASRC), $X[0]
+	 add	%rax, $X[0]
+	 adc	\$0, %rdx
+	 mov	$X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $TMP
+
+	 mov	(+8*$i)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 mov	(+8*$i)($ASRC), $X[$i]
+	 add	%rax, $X[$i]
+	 adc	\$0, %rdx
+	 add	$TMP, $X[$i]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $X[0]
+___
+}
+
+#MULSTEP_512	MACRO	x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
+#
+# uses rax, rdx, and args
+sub MULSTEP_512
+{
+ my ($x, $DST, $SRC2, $OP, $TMP)=@_;
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	 mov	(+8*0)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 add	%rax, $X[0]
+	 adc	\$0, %rdx
+	 mov	$X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $TMP
+
+	 mov	(+8*$i)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 add	%rax, $X[$i]
+	 adc	\$0, %rdx
+	 add	$TMP, $X[$i]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $X[0]
+___
+}
+
+#
+# Swizzle Macros
+#
+
+# macro to copy data from flat space to swizzled table
+#MACRO swizzle	pDst, pSrc, tmp1, tmp2
+# pDst and pSrc are modified
+sub swizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0)=@_;
+$code.=<<___;
+	 mov	\$8, $cnt
+loop_$m:
+	 mov	($pSrc), $d0
+	 mov	$d0#w, ($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*1)($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*2)($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*3)($pDst)
+	 lea	8($pSrc), $pSrc
+	 lea	64*4($pDst), $pDst
+	 dec	$cnt
+	 jnz	loop_$m
+___
+
+ $m++;
+}
+
+# macro to copy data from swizzled table to  flat space
+#MACRO unswizzle	pDst, pSrc, tmp*3
+sub unswizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
+$code.=<<___;
+	 mov	\$4, $cnt
+loop_$m:
+	 movzxw	(+64*3+256*0)($pSrc), $d0
+	 movzxw	(+64*3+256*1)($pSrc), $d1
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*2+256*0)($pSrc), $d0#w
+	 mov	(+64*2+256*1)($pSrc), $d1#w
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*1+256*0)($pSrc), $d0#w
+	 mov	(+64*1+256*1)($pSrc), $d1#w
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*0+256*0)($pSrc), $d0#w
+	 mov	(+64*0+256*1)($pSrc), $d1#w
+	 mov	$d0, (+8*0)($pDst)
+	 mov	$d1, (+8*1)($pDst)
+	 lea	256*2($pSrc), $pSrc
+	 lea	8*2($pDst), $pDst
+	 sub	\$1, $cnt
+	 jnz	loop_$m
+___
+
+ $m++;
+}
+
+#
+# Data Structures
+#
+
+# Reduce Data
+#
+#
+# Offset  Value
+# 0C0     Carries
+# 0B8     X2[10]
+# 0B0     X2[9]
+# 0A8     X2[8]
+# 0A0     X2[7]
+# 098     X2[6]
+# 090     X2[5]
+# 088     X2[4]
+# 080     X2[3]
+# 078     X2[2]
+# 070     X2[1]
+# 068     X2[0]
+# 060     X1[12]  P[10]
+# 058     X1[11]  P[9]  Z[8]
+# 050     X1[10]  P[8]  Z[7]
+# 048     X1[9]   P[7]  Z[6]
+# 040     X1[8]   P[6]  Z[5]
+# 038     X1[7]   P[5]  Z[4]
+# 030     X1[6]   P[4]  Z[3]
+# 028     X1[5]   P[3]  Z[2]
+# 020     X1[4]   P[2]  Z[1]
+# 018     X1[3]   P[1]  Z[0]
+# 010     X1[2]   P[0]  Y[2]
+# 008     X1[1]   Q[1]  Y[1]
+# 000     X1[0]   Q[0]  Y[0]
+
+my $X1_offset           =  0;			# 13 qwords
+my $X2_offset           =  $X1_offset + 13*8;			# 11 qwords
+my $Carries_offset      =  $X2_offset + 11*8;			# 1 qword
+my $Q_offset            =  0;			# 2 qwords
+my $P_offset            =  $Q_offset + 2*8;			# 11 qwords
+my $Y_offset            =  0;			# 3 qwords
+my $Z_offset            =  $Y_offset + 3*8;			# 9 qwords
+
+my $Red_Data_Size       =  $Carries_offset + 1*8;			# (25 qwords)
+
+#
+# Stack Frame
+#
+#
+# offset	value
+# ...		<old stack contents>
+# ...
+# 280		Garray
+
+# 278		tmp16[15]
+# ...		...
+# 200		tmp16[0]
+
+# 1F8		tmp[7]
+# ...		...
+# 1C0		tmp[0]
+
+# 1B8		GT[7]
+# ...		...
+# 180		GT[0]
+
+# 178		Reduce Data
+# ...		...
+# 0B8		Reduce Data
+# 0B0		reserved
+# 0A8		reserved
+# 0A0		reserved
+# 098		reserved
+# 090		reserved
+# 088		reduce result addr
+# 080		exp[8]
+
+# ...
+# 048		exp[1]
+# 040		exp[0]
+
+# 038		reserved
+# 030		loop_idx
+# 028		pg
+# 020		i
+# 018		pData	; arg 4
+# 010		pG	; arg 2
+# 008		pResult	; arg 1
+# 000		rsp	; stack pointer before subtract
+
+my $rsp_offset          =  0;
+my $pResult_offset      =  8*1 + $rsp_offset;
+my $pG_offset           =  8*1 + $pResult_offset;
+my $pData_offset        =  8*1 + $pG_offset;
+my $i_offset            =  8*1 + $pData_offset;
+my $pg_offset           =  8*1 + $i_offset;
+my $loop_idx_offset     =  8*1 + $pg_offset;
+my $reserved1_offset    =  8*1 + $loop_idx_offset;
+my $exp_offset          =  8*1 + $reserved1_offset;
+my $red_result_addr_offset=  8*9 + $exp_offset;
+my $reserved2_offset    =  8*1 + $red_result_addr_offset;
+my $Reduce_Data_offset  =  8*5 + $reserved2_offset;
+my $GT_offset           =  $Red_Data_Size + $Reduce_Data_offset;
+my $tmp_offset          =  8*8 + $GT_offset;
+my $tmp16_offset        =  8*8 + $tmp_offset;
+my $garray_offset       =  8*16 + $tmp16_offset;
+my $mem_size            =  8*8*32 + $garray_offset;
+
+#
+# Offsets within Reduce Data
+#
+#
+#	struct MODF_2FOLD_MONT_512_C1_DATA {
+#	UINT64 t[8][8];
+#	UINT64 m[8];
+#	UINT64 m1[8]; /* 2^768 % m */
+#	UINT64 m2[8]; /* 2^640 % m */
+#	UINT64 k1[2]; /* (- 1/m) % 2^128 */
+#	};
+
+my $T                   =  0;
+my $M                   =  512;			# = 8 * 8 * 8
+my $M1                  =  576;			# = 8 * 8 * 9 /* += 8 * 8 */
+my $M2                  =  640;			# = 8 * 8 * 10 /* += 8 * 8 */
+my $K1                  =  704;			# = 8 * 8 * 11 /* += 8 * 8 */
+
+#
+#   FUNCTIONS
+#
+
+{{{
+#
+# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
+#                       and add 512-bits (8 qwords)
+#                       to get 640 bits (10 qwords)
+# Input: 128-bit mul source: [rdi+8*1], rbp
+#        512-bit mul source: [rsi+8*n]
+#        512-bit add source: r15, r14, ..., r9, r8
+# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
+# Clobbers all regs except: rcx, rsi, rdi
+$code.=<<___;
+.type	MULADD_128x512,\@abi-omnipotent
+.align	16
+MULADD_128x512:
+___
+	&MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+	 mov	(+8*1)(%rdi), %rbp
+___
+	&MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+	 ret
+.size	MULADD_128x512,.-MULADD_128x512
+___
+}}}
+
+{{{
+#MULADD_256x512	MACRO	pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
+#
+# Inputs: pDst: Destination  (768 bits, 12 qwords)
+#         pA:   Multiplicand (1024 bits, 16 qwords)
+#         pB:   Multiplicand (512 bits, 8 qwords)
+# Dst = Ah * B + Al
+# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
+# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
+# Uses registers: arguments, RAX, RDX
+sub MULADD_256x512
+{
+ my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
+$code.=<<___;
+	mov	(+8*12)($pA), $OP
+___
+	&MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*13)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*14)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*15)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+}
+
+#
+# mont_reduce(UINT64 *x,  /* 1024 bits, 16 qwords */
+#	       UINT64 *m,  /*  512 bits,  8 qwords */
+#	       MODF_2FOLD_MONT_512_C1_DATA *data,
+#             UINT64 *r)  /*  512 bits,  8 qwords */
+# Input:  x (number to be reduced): tmp16 (Implicit)
+#         m (modulus):              [pM]  (Implicit)
+#         data (reduce data):       [pData] (Implicit)
+# Output: r (result):		     Address in [red_res_addr]
+#         result also in: r9, r8, r15, r14, r13, r12, r11, r10
+
+my @X=map("%r$_",(8..15));
+
+$code.=<<___;
+.type	mont_reduce,\@abi-omnipotent
+.align	16
+mont_reduce:
+___
+
+my $STACK_DEPTH         =  8;
+	#
+	# X1 = Xh * M1 + Xl
+$code.=<<___;
+	 lea	(+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi			# pX1 (Dst) 769 bits, 13 qwords
+	 mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rsi			# pM1 (Bsrc) 512 bits, 8 qwords
+	 add	\$$M1, %rsi
+	 lea	(+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx			# X (Asrc) 1024 bits, 16 qwords
+
+___
+
+	&MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X);	# rotates @X 4 times
+	# results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
+
+$code.=<<___;
+	 xor	%rax, %rax
+	# X1 += xl
+	 add	(+8*8)(%rcx), $X[4]
+	 adc	(+8*9)(%rcx), $X[5]
+	 adc	(+8*10)(%rcx), $X[6]
+	 adc	(+8*11)(%rcx), $X[7]
+	 adc	\$0, %rax
+	# X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
+
+	#
+	# check for carry ;; carry stored in rax
+	 mov	$X[4], (+8*8)(%rdi)			# rdi points to X1
+	 mov	$X[5], (+8*9)(%rdi)
+	 mov	$X[6], %rbp
+	 mov	$X[7], (+8*11)(%rdi)
+
+	 mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+	 mov	(+8*0)(%rdi), $X[4]
+	 mov	(+8*1)(%rdi), $X[5]
+	 mov	(+8*2)(%rdi), $X[6]
+	 mov	(+8*3)(%rdi), $X[7]
+
+	# X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
+	# rdi -> X1
+	# rsi -> M1
+
+	#
+	# X2 = Xh * M2 + Xl
+	# do first part (X2 = Xh * M2)
+	 add	\$8*10, %rdi			# rdi -> pXh ; 128 bits, 2 qwords
+				#        Xh is actually { [rdi+8*1], rbp }
+	 add	\$`$M2-$M1`, %rsi			# rsi -> M2
+	 lea	(+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx			# rcx -> pX2 ; 641 bits, 11 qwords
+___
+	unshift(@X,pop(@X));	unshift(@X,pop(@X));
+$code.=<<___;
+
+	 call	MULADD_128x512			# args in rcx, rdi / rbp, rsi, r15-r8
+	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+	 mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
+
+	# X2 += Xl
+	 add	(+8*8-8*10)(%rdi), $X[6]		# (-8*10) is to adjust rdi -> Xh to Xl
+	 adc	(+8*9-8*10)(%rdi), $X[7]
+	 mov	$X[6], (+8*8)(%rcx)
+	 mov	$X[7], (+8*9)(%rcx)
+
+	 adc	%rax, %rax
+	 mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+	 lea	(+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi			# rdi -> pQ ; 128 bits, 2 qwords
+	 add	\$`$K1-$M2`, %rsi			# rsi -> pK1 ; 128 bits, 2 qwords
+
+	# MUL_128x128t128	rdi, rcx, rsi	; Q = X2 * K1 (bottom half)
+	# B1:B0 = rsi[1:0] = K1[1:0]
+	# A1:A0 = rcx[1:0] = X2[1:0]
+	# Result = rdi[1],rbp = Q[1],rbp
+	 mov	(%rsi), %r8			# B0
+	 mov	(+8*1)(%rsi), %rbx			# B1
+
+	 mov	(%rcx), %rax			# A0
+	 mul	%r8			# B0
+	 mov	%rax, %rbp
+	 mov	%rdx, %r9
+
+	 mov	(+8*1)(%rcx), %rax			# A1
+	 mul	%r8			# B0
+	 add	%rax, %r9
+
+	 mov	(%rcx), %rax			# A0
+	 mul	%rbx			# B1
+	 add	%rax, %r9
+
+	 mov	%r9, (+8*1)(%rdi)
+	# end MUL_128x128t128
+
+	 sub	\$`$K1-$M`, %rsi
+
+	 mov	(%rcx), $X[6]
+	 mov	(+8*1)(%rcx), $X[7]			# r9:r8 = X2[1:0]
+
+	 call	MULADD_128x512			# args in rcx, rdi / rbp, rsi, r15-r8
+	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+
+	# load first half of m to rdx, rdi, rbx, rax
+	# moved this here for efficiency
+	 mov	(+8*0)(%rsi), %rax
+	 mov	(+8*1)(%rsi), %rbx
+	 mov	(+8*2)(%rsi), %rdi
+	 mov	(+8*3)(%rsi), %rdx
+
+	# continue with reduction
+	 mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
+
+	 add	(+8*8)(%rcx), $X[6]
+	 adc	(+8*9)(%rcx), $X[7]
+
+	#accumulate the final carry to rbp
+	 adc	%rbp, %rbp
+
+	# Add in overflow corrections: R = (X2>>128) += T[overflow]
+	# R = {r9, r8, r15, r14, ..., r10}
+	 shl	\$3, %rbp
+	 mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rcx			# rsi -> Data (and points to T)
+	 add	%rcx, %rbp			# pT ; 512 bits, 8 qwords, spread out
+
+	# rsi will be used to generate a mask after the addition
+	 xor	%rsi, %rsi
+
+	 add	(+8*8*0)(%rbp), $X[0]
+	 adc	(+8*8*1)(%rbp), $X[1]
+	 adc	(+8*8*2)(%rbp), $X[2]
+	 adc	(+8*8*3)(%rbp), $X[3]
+	 adc	(+8*8*4)(%rbp), $X[4]
+	 adc	(+8*8*5)(%rbp), $X[5]
+	 adc	(+8*8*6)(%rbp), $X[6]
+	 adc	(+8*8*7)(%rbp), $X[7]
+
+	# if there is a carry:	rsi = 0xFFFFFFFFFFFFFFFF
+	# if carry is clear:	rsi = 0x0000000000000000
+	 sbb	\$0, %rsi
+
+	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+	 and	%rsi, %rax
+	 and	%rsi, %rbx
+	 and	%rsi, %rdi
+	 and	%rsi, %rdx
+
+	 mov	\$1, %rbp
+	 sub	%rax, $X[0]
+	 sbb	%rbx, $X[1]
+	 sbb	%rdi, $X[2]
+	 sbb	%rdx, $X[3]
+
+	# if there is a borrow:		rbp = 0
+	# if there is no borrow:	rbp = 1
+	# this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
+	 sbb	\$0, %rbp
+
+	#load second half of m to rdx, rdi, rbx, rax
+
+	 add	\$$M, %rcx
+	 mov	(+8*4)(%rcx), %rax
+	 mov	(+8*5)(%rcx), %rbx
+	 mov	(+8*6)(%rcx), %rdi
+	 mov	(+8*7)(%rcx), %rdx
+
+	# use the rsi mask as before
+	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+	 and	%rsi, %rax
+	 and	%rsi, %rbx
+	 and	%rsi, %rdi
+	 and	%rsi, %rdx
+
+	# if rbp = 0, there was a borrow before, it is moved to the carry flag
+	# if rbp = 1, there was not a borrow before, carry flag is cleared
+	 sub	\$1, %rbp
+
+	 sbb	%rax, $X[4]
+	 sbb	%rbx, $X[5]
+	 sbb	%rdi, $X[6]
+	 sbb	%rdx, $X[7]
+
+	# write R back to memory
+
+	 mov	(+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
+	 mov	$X[0], (+8*0)(%rsi)
+	 mov	$X[1], (+8*1)(%rsi)
+	 mov	$X[2], (+8*2)(%rsi)
+	 mov	$X[3], (+8*3)(%rsi)
+	 mov	$X[4], (+8*4)(%rsi)
+	 mov	$X[5], (+8*5)(%rsi)
+	 mov	$X[6], (+8*6)(%rsi)
+	 mov	$X[7], (+8*7)(%rsi)
+
+	 ret
+.size	mont_reduce,.-mont_reduce
+___
+}}}
+
+{{{
+#MUL_512x512	MACRO	pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
+#
+# Inputs: pDst: Destination  (1024 bits, 16 qwords)
+#         pA:   Multiplicand (512 bits, 8 qwords)
+#         pB:   Multiplicand (512 bits, 8 qwords)
+# Uses registers rax, rdx, args
+#   B operand in [pB] and also in x7...x0
+sub MUL_512x512
+{
+ my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
+ my ($pDst,  $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x;	# make a copy
+
+$code.=<<___;
+	 mov	(+8*0)($pA), $OP
+
+	 mov	$X[0], %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 mov	%rax, (+$pDst_o+8*0)($pDst)
+	 mov	%rdx, $X[0]
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	$X[$i], %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 add	%rax, $X[$i-1]
+	 adc	\$0, %rdx
+	 mov	%rdx, $X[$i]
+___
+}
+
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	(+8*$i)($pA), $OP
+___
+
+	&MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
+	push(@X,shift(@X));
+}
+
+$code.=<<___;
+	 mov	$X[0], (+$pDst_o+8*8)($pDst)
+	 mov	$X[1], (+$pDst_o+8*9)($pDst)
+	 mov	$X[2], (+$pDst_o+8*10)($pDst)
+	 mov	$X[3], (+$pDst_o+8*11)($pDst)
+	 mov	$X[4], (+$pDst_o+8*12)($pDst)
+	 mov	$X[5], (+$pDst_o+8*13)($pDst)
+	 mov	$X[6], (+$pDst_o+8*14)($pDst)
+	 mov	$X[7], (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
+# Input:  src1: Address of source 1: rdi
+#         src2: Address of source 2: rsi
+# Output: dst:  Address of destination: [red_res_addr]
+#    src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+# Temp:   Clobbers [tmp16], all registers
+$code.=<<___;
+.type	mont_mul_a3b,\@abi-omnipotent
+.align	16
+mont_mul_a3b:
+	#
+	# multiply tmp = src1 * src2
+	# For multiply: dst = rcx, src1 = rdi, src2 = rsi
+	# stack depth is extra 8 from call
+___
+	&MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
+$code.=<<___;
+	#
+	# Dst = tmp % m
+	# Call reduce(tmp, m, data, dst)
+
+	# tail recursion optimization: jmp to mont_reduce and return from there
+	 jmp	mont_reduce
+	# call	mont_reduce
+	# ret
+.size	mont_mul_a3b,.-mont_mul_a3b
+___
+}}}
+
+{{{
+#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
+#
+# Input in memory [pA] and also in x7...x0
+# Uses all argument registers plus rax and rdx
+#
+# This version computes all of the off-diagonal terms into memory,
+# and then it adds in the diagonal terms
+
+sub SQR_512
+{
+ my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
+ my ($pDst,  $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	# ------------------
+	# first pass 01...07
+	# ------------------
+	 mov	$X[0], $A
+
+	 mov	$X[1],%rax
+	 mul	$A
+	 mov	%rax, (+$pDst_o+8*1)($pDst)
+___
+for(my $i=2;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $X[$i-2]
+	 mov	$X[$i],%rax
+	 mul	$A
+	 add	%rax, $X[$i-2]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $x7
+
+	 mov	$X[0], (+$pDst_o+8*2)($pDst)
+
+	# ------------------
+	# second pass 12...17
+	# ------------------
+
+	 mov	(+8*1)($pA), $A
+
+	 mov	(+8*2)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*3)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*3)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*4)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[3]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[4]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[4]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[1]
+
+	# ------------------
+	# third pass 23...27
+	# ------------------
+	 mov	(+8*2)($pA), $A
+
+	 mov	(+8*3)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[3]
+	 adc	\$0, %rdx
+	 mov	$X[3], (+$pDst_o+8*5)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[4]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[4]
+	 adc	\$0, %rdx
+	 mov	$X[4], (+$pDst_o+8*6)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[2]
+
+	# ------------------
+	# fourth pass 34...37
+	# ------------------
+
+	 mov	(+8*3)($pA), $A
+
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 mov	$X[5], (+$pDst_o+8*7)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+	 mov	$x7, (+$pDst_o+8*8)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[5]
+
+	# ------------------
+	# fifth pass 45...47
+	# ------------------
+	 mov	(+8*4)($pA), $A
+
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*9)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*10)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[1]
+
+	# ------------------
+	# sixth pass 56...57
+	# ------------------
+	 mov	(+8*5)($pA), $A
+
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 mov	$X[5], (+$pDst_o+8*11)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*12)($pDst)
+
+	 mov	%rdx, $X[2]
+
+	# ------------------
+	# seventh pass 67
+	# ------------------
+	 mov	$X[6], $A
+
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*13)($pDst)
+
+	 mov	%rdx, (+$pDst_o+8*14)($pDst)
+
+	# start finalize (add	in squares, and double off-terms)
+	 mov	(+$pDst_o+8*1)($pDst), $X[0]
+	 mov	(+$pDst_o+8*2)($pDst), $X[1]
+	 mov	(+$pDst_o+8*3)($pDst), $X[2]
+	 mov	(+$pDst_o+8*4)($pDst), $X[3]
+	 mov	(+$pDst_o+8*5)($pDst), $X[4]
+	 mov	(+$pDst_o+8*6)($pDst), $X[5]
+
+	 mov	(+8*3)($pA), %rax
+	 mul	%rax
+	 mov	%rax, $x6
+	 mov	%rdx, $X[6]
+
+	 add	$X[0], $X[0]
+	 adc	$X[1], $X[1]
+	 adc	$X[2], $X[2]
+	 adc	$X[3], $X[3]
+	 adc	$X[4], $X[4]
+	 adc	$X[5], $X[5]
+	 adc	\$0, $X[6]
+
+	 mov	(+8*0)($pA), %rax
+	 mul	%rax
+	 mov	%rax, (+$pDst_o+8*0)($pDst)
+	 mov	%rdx, $A
+
+	 mov	(+8*1)($pA), %rax
+	 mul	%rax
+
+	 add	$A, $X[0]
+	 adc	%rax, $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $A
+	 mov	$X[0], (+$pDst_o+8*1)($pDst)
+	 mov	$X[1], (+$pDst_o+8*2)($pDst)
+
+	 mov	(+8*2)($pA), %rax
+	 mul	%rax
+
+	 add	$A, $X[2]
+	 adc	%rax, $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $A
+
+	 mov	$X[2], (+$pDst_o+8*3)($pDst)
+	 mov	$X[3], (+$pDst_o+8*4)($pDst)
+
+	 xor	$tmp, $tmp
+	 add	$A, $X[4]
+	 adc	$x6, $X[5]
+	 adc	\$0, $tmp
+
+	 mov	$X[4], (+$pDst_o+8*5)($pDst)
+	 mov	$X[5], (+$pDst_o+8*6)($pDst)
+
+	# %%tmp has 0/1 in column 7
+	# %%A6 has a full value in column 7
+
+	 mov	(+$pDst_o+8*7)($pDst), $X[0]
+	 mov	(+$pDst_o+8*8)($pDst), $X[1]
+	 mov	(+$pDst_o+8*9)($pDst), $X[2]
+	 mov	(+$pDst_o+8*10)($pDst), $X[3]
+	 mov	(+$pDst_o+8*11)($pDst), $X[4]
+	 mov	(+$pDst_o+8*12)($pDst), $X[5]
+	 mov	(+$pDst_o+8*13)($pDst), $x6
+	 mov	(+$pDst_o+8*14)($pDst), $x7
+
+	 mov	$X[7], %rax
+	 mul	%rax
+	 mov	%rax, $X[7]
+	 mov	%rdx, $A
+
+	 add	$X[0], $X[0]
+	 adc	$X[1], $X[1]
+	 adc	$X[2], $X[2]
+	 adc	$X[3], $X[3]
+	 adc	$X[4], $X[4]
+	 adc	$X[5], $X[5]
+	 adc	$x6, $x6
+	 adc	$x7, $x7
+	 adc	\$0, $A
+
+	 add	$tmp, $X[0]
+
+	 mov	(+8*4)($pA), %rax
+	 mul	%rax
+
+	 add	$X[6], $X[0]
+	 adc	%rax, $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $tmp
+
+	 mov	$X[0], (+$pDst_o+8*7)($pDst)
+	 mov	$X[1], (+$pDst_o+8*8)($pDst)
+
+	 mov	(+8*5)($pA), %rax
+	 mul	%rax
+
+	 add	$tmp, $X[2]
+	 adc	%rax, $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $tmp
+
+	 mov	$X[2], (+$pDst_o+8*9)($pDst)
+	 mov	$X[3], (+$pDst_o+8*10)($pDst)
+
+	 mov	(+8*6)($pA), %rax
+	 mul	%rax
+
+	 add	$tmp, $X[4]
+	 adc	%rax, $X[5]
+	 adc	\$0, %rdx
+
+	 mov	$X[4], (+$pDst_o+8*11)($pDst)
+	 mov	$X[5], (+$pDst_o+8*12)($pDst)
+
+	 add	%rdx, $x6
+	 adc	$X[7], $x7
+	 adc	\$0, $A
+
+	 mov	$x6, (+$pDst_o+8*13)($pDst)
+	 mov	$x7, (+$pDst_o+8*14)($pDst)
+	 mov	$A, (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
+#
+# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+#
+$code.=<<___;
+.type	sqr_reduce,\@abi-omnipotent
+.align	16
+sqr_reduce:
+	 mov	(+$pResult_offset+8)(%rsp), %rcx
+___
+	&SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
+$code.=<<___;
+	# tail recursion optimization: jmp to mont_reduce and return from there
+	 jmp	mont_reduce
+	# call	mont_reduce
+	# ret
+.size	sqr_reduce,.-sqr_reduce
+___
+}}}
+
+#
+# MAIN FUNCTION
+#
+
+#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
+#           UINT64 *g,   /* 512 bits, 8 qwords */
+#           UINT64 *exp, /* 512 bits, 8 qwords */
+#           struct mod_ctx_512 *data)
+
+# window size = 5
+# table size = 2^5 = 32
+#table_entries	equ	32
+#table_size	equ	table_entries * 8
+$code.=<<___;
+.globl	mod_exp_512
+.type	mod_exp_512,\@function,4
+mod_exp_512:
+	 push	%rbp
+	 push	%rbx
+	 push	%r12
+	 push	%r13
+	 push	%r14
+	 push	%r15
+
+	# adjust stack down and then align it with cache boundary
+	 mov	%rsp, %r8
+	 sub	\$$mem_size, %rsp
+	 and	\$-64, %rsp
+
+	# store previous stack pointer and arguments
+	 mov	%r8, (+$rsp_offset)(%rsp)
+	 mov	%rdi, (+$pResult_offset)(%rsp)
+	 mov	%rsi, (+$pG_offset)(%rsp)
+	 mov	%rcx, (+$pData_offset)(%rsp)
+.Lbody:
+	# transform g into montgomery space
+	# GT = reduce(g * C2) = reduce(g * (2^256))
+	# reduce expects to have the input in [tmp16]
+	 pxor	%xmm4, %xmm4
+	 movdqu	(+16*0)(%rsi), %xmm0
+	 movdqu	(+16*1)(%rsi), %xmm1
+	 movdqu	(+16*2)(%rsi), %xmm2
+	 movdqu	(+16*3)(%rsi), %xmm3
+	 movdqa	%xmm4, (+$tmp16_offset+16*0)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*1)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
+	 movdqa	%xmm0, (+$tmp16_offset+16*2)(%rsp)
+	 movdqa	%xmm1, (+$tmp16_offset+16*3)(%rsp)
+	 movdqa	%xmm2, (+$tmp16_offset+16*4)(%rsp)
+	 movdqa	%xmm3, (+$tmp16_offset+16*5)(%rsp)
+
+	# load pExp before rdx gets blown away
+	 movdqu	(+16*0)(%rdx), %xmm0
+	 movdqu	(+16*1)(%rdx), %xmm1
+	 movdqu	(+16*2)(%rdx), %xmm2
+	 movdqu	(+16*3)(%rdx), %xmm3
+
+	 lea	(+$GT_offset)(%rsp), %rbx
+	 mov	%rbx, (+$red_result_addr_offset)(%rsp)
+	 call	mont_reduce
+
+	# Initialize tmp = C
+	 lea	(+$tmp_offset)(%rsp), %rcx
+	 xor	%rax, %rax
+	 mov	%rax, (+8*0)(%rcx)
+	 mov	%rax, (+8*1)(%rcx)
+	 mov	%rax, (+8*3)(%rcx)
+	 mov	%rax, (+8*4)(%rcx)
+	 mov	%rax, (+8*5)(%rcx)
+	 mov	%rax, (+8*6)(%rcx)
+	 mov	%rax, (+8*7)(%rcx)
+	 mov	%rax, (+$exp_offset+8*8)(%rsp)
+	 movq	\$1, (+8*2)(%rcx)
+
+	 lea	(+$garray_offset)(%rsp), %rbp
+	 mov	%rcx, %rsi			# pTmp
+	 mov	%rbp, %rdi			# Garray[][0]
+___
+
+	&swizzle("%rdi", "%rcx", "%rax", "%rbx");
+
+	# for (rax = 31; rax != 0; rax--) {
+	#     tmp = reduce(tmp * G)
+	#     swizzle(pg, tmp);
+	#     pg += 2; }
+$code.=<<___;
+	 mov	\$31, %rax
+	 mov	%rax, (+$i_offset)(%rsp)
+	 mov	%rbp, (+$pg_offset)(%rsp)
+	# rsi -> pTmp
+	 mov	%rsi, (+$red_result_addr_offset)(%rsp)
+	 mov	(+8*0)(%rsi), %r10
+	 mov	(+8*1)(%rsi), %r11
+	 mov	(+8*2)(%rsi), %r12
+	 mov	(+8*3)(%rsi), %r13
+	 mov	(+8*4)(%rsi), %r14
+	 mov	(+8*5)(%rsi), %r15
+	 mov	(+8*6)(%rsi), %r8
+	 mov	(+8*7)(%rsi), %r9
+init_loop:
+	 lea	(+$GT_offset)(%rsp), %rdi
+	 call	mont_mul_a3b
+	 lea	(+$tmp_offset)(%rsp), %rsi
+	 mov	(+$pg_offset)(%rsp), %rbp
+	 add	\$2, %rbp
+	 mov	%rbp, (+$pg_offset)(%rsp)
+	 mov	%rsi, %rcx			# rcx = rsi = addr of tmp
+___
+
+	&swizzle("%rbp", "%rcx", "%rax", "%rbx");
+$code.=<<___;
+	 mov	(+$i_offset)(%rsp), %rax
+	 sub	\$1, %rax
+	 mov	%rax, (+$i_offset)(%rsp)
+	 jne	init_loop
+
+	#
+	# Copy exponent onto stack
+	 movdqa	%xmm0, (+$exp_offset+16*0)(%rsp)
+	 movdqa	%xmm1, (+$exp_offset+16*1)(%rsp)
+	 movdqa	%xmm2, (+$exp_offset+16*2)(%rsp)
+	 movdqa	%xmm3, (+$exp_offset+16*3)(%rsp)
+
+
+	#
+	# Do exponentiation
+	# Initialize result to G[exp{511:507}]
+	 mov	(+$exp_offset+62)(%rsp), %eax
+	 mov	%rax, %rdx
+	 shr	\$11, %rax
+	 and	\$0x07FF, %edx
+	 mov	%edx, (+$exp_offset+62)(%rsp)
+	 lea	(+$garray_offset)(%rsp,%rax,2), %rsi
+	 mov	(+$pResult_offset)(%rsp), %rdx
+___
+
+	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+
+	#
+	# Loop variables
+	# rcx = [loop_idx] = index: 510-5 to 0 by 5
+$code.=<<___;
+	 movq	\$505, (+$loop_idx_offset)(%rsp)
+
+	 mov	(+$pResult_offset)(%rsp), %rcx
+	 mov	%rcx, (+$red_result_addr_offset)(%rsp)
+	 mov	(+8*0)(%rcx), %r10
+	 mov	(+8*1)(%rcx), %r11
+	 mov	(+8*2)(%rcx), %r12
+	 mov	(+8*3)(%rcx), %r13
+	 mov	(+8*4)(%rcx), %r14
+	 mov	(+8*5)(%rcx), %r15
+	 mov	(+8*6)(%rcx), %r8
+	 mov	(+8*7)(%rcx), %r9
+	 jmp	sqr_2
+
+main_loop_a3b:
+	 call	sqr_reduce
+	 call	sqr_reduce
+	 call	sqr_reduce
+sqr_2:
+	 call	sqr_reduce
+	 call	sqr_reduce
+
+	#
+	# Do multiply, first look up proper value in Garray
+	 mov	(+$loop_idx_offset)(%rsp), %rcx			# bit index
+	 mov	%rcx, %rax
+	 shr	\$4, %rax			# rax is word pointer
+	 mov	(+$exp_offset)(%rsp,%rax,2), %edx
+	 and	\$15, %rcx
+	 shrq	%cl, %rdx
+	 and	\$0x1F, %rdx
+
+	 lea	(+$garray_offset)(%rsp,%rdx,2), %rsi
+	 lea	(+$tmp_offset)(%rsp), %rdx
+	 mov	%rdx, %rdi
+___
+
+	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+	# rdi = tmp = pG
+
+	#
+	# Call mod_mul_a1(pDst,  pSrc1, pSrc2, pM, pData)
+	#                 result result pG     M   Data
+$code.=<<___;
+	 mov	(+$pResult_offset)(%rsp), %rsi
+	 call	mont_mul_a3b
+
+	#
+	# finish loop
+	 mov	(+$loop_idx_offset)(%rsp), %rcx
+	 sub	\$5, %rcx
+	 mov	%rcx, (+$loop_idx_offset)(%rsp)
+	 jge	main_loop_a3b
+
+	#
+
+end_main_loop_a3b:
+	# transform result out of Montgomery space
+	# result = reduce(result)
+	 mov	(+$pResult_offset)(%rsp), %rdx
+	 pxor	%xmm4, %xmm4
+	 movdqu	(+16*0)(%rdx), %xmm0
+	 movdqu	(+16*1)(%rdx), %xmm1
+	 movdqu	(+16*2)(%rdx), %xmm2
+	 movdqu	(+16*3)(%rdx), %xmm3
+	 movdqa	%xmm4, (+$tmp16_offset+16*4)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*5)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
+	 movdqa	%xmm0, (+$tmp16_offset+16*0)(%rsp)
+	 movdqa	%xmm1, (+$tmp16_offset+16*1)(%rsp)
+	 movdqa	%xmm2, (+$tmp16_offset+16*2)(%rsp)
+	 movdqa	%xmm3, (+$tmp16_offset+16*3)(%rsp)
+	 call	mont_reduce
+
+	# If result > m, subract m
+	# load result into r15:r8
+	 mov	(+$pResult_offset)(%rsp), %rax
+	 mov	(+8*0)(%rax), %r8
+	 mov	(+8*1)(%rax), %r9
+	 mov	(+8*2)(%rax), %r10
+	 mov	(+8*3)(%rax), %r11
+	 mov	(+8*4)(%rax), %r12
+	 mov	(+8*5)(%rax), %r13
+	 mov	(+8*6)(%rax), %r14
+	 mov	(+8*7)(%rax), %r15
+
+	# subtract m
+	 mov	(+$pData_offset)(%rsp), %rbx
+	 add	\$$M, %rbx
+
+	 sub	(+8*0)(%rbx), %r8
+	 sbb	(+8*1)(%rbx), %r9
+	 sbb	(+8*2)(%rbx), %r10
+	 sbb	(+8*3)(%rbx), %r11
+	 sbb	(+8*4)(%rbx), %r12
+	 sbb	(+8*5)(%rbx), %r13
+	 sbb	(+8*6)(%rbx), %r14
+	 sbb	(+8*7)(%rbx), %r15
+
+	# if Carry is clear, replace result with difference
+	 mov	(+8*0)(%rax), %rsi
+	 mov	(+8*1)(%rax), %rdi
+	 mov	(+8*2)(%rax), %rcx
+	 mov	(+8*3)(%rax), %rdx
+	 cmovnc	%r8, %rsi
+	 cmovnc	%r9, %rdi
+	 cmovnc	%r10, %rcx
+	 cmovnc	%r11, %rdx
+	 mov	%rsi, (+8*0)(%rax)
+	 mov	%rdi, (+8*1)(%rax)
+	 mov	%rcx, (+8*2)(%rax)
+	 mov	%rdx, (+8*3)(%rax)
+
+	 mov	(+8*4)(%rax), %rsi
+	 mov	(+8*5)(%rax), %rdi
+	 mov	(+8*6)(%rax), %rcx
+	 mov	(+8*7)(%rax), %rdx
+	 cmovnc	%r12, %rsi
+	 cmovnc	%r13, %rdi
+	 cmovnc	%r14, %rcx
+	 cmovnc	%r15, %rdx
+	 mov	%rsi, (+8*4)(%rax)
+	 mov	%rdi, (+8*5)(%rax)
+	 mov	%rcx, (+8*6)(%rax)
+	 mov	%rdx, (+8*7)(%rax)
+
+	 mov	(+$rsp_offset)(%rsp), %rsi
+	 mov	0(%rsi),%r15
+	 mov	8(%rsi),%r14
+	 mov	16(%rsi),%r13
+	 mov	24(%rsi),%r12
+	 mov	32(%rsi),%rbx
+	 mov	40(%rsi),%rbp
+	 lea	48(%rsi),%rsp
+.Lepilogue:
+	 ret
+.size mod_exp_512, . - mod_exp_512
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	mod_exp_512_se_handler,\@abi-omnipotent
+.align	16
+mod_exp_512_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	mov	$rsp_offset(%rax),%rax	# pull saved Rsp
+
+	mov	32(%rax),%rbx
+	mov	40(%rax),%rbp
+	mov	24(%rax),%r12
+	mov	16(%rax),%r13
+	mov	8(%rax),%r14
+	mov	0(%rax),%r15
+	lea	48(%rax),%rax
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	mod_exp_512_se_handler,.-mod_exp_512_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_mod_exp_512
+	.rva	.LSEH_end_mod_exp_512
+	.rva	.LSEH_info_mod_exp_512
+
+.section	.xdata
+.align	8
+.LSEH_info_mod_exp_512:
+	.byte	9,0,0,0
+	.rva	mod_exp_512_se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)	{ $reg .= $conv; }
+    elsif ($conv eq "b")	{ $reg =~ s/%[er]([^x]+)x?/%$1l/;	}
+    elsif ($conv eq "w")	{ $reg =~ s/%[er](.+)/%$1/;		}
+    elsif ($conv eq "d")	{ $reg =~ s/%[er](.+)/%e$1/;		}
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/(\(\+[^)]+\))/eval $1/gem;
+print $code;
+close STDOUT;

diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 0000000..4a766a8
--- /dev/null
+++ b/crypto/bn/asm/parisc-mont.pl

@@ -0,0 +1,993 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
+# that compiler utilized xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplicatons [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# couple of limitations: vector lengths has to be even and vector
+# addresses has to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+	$BN_SZ		=$SIZE_T;
+} else {
+	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+	$BN_SZ		=$SIZE_T;
+	if (open CONF,"<${dir}../../opensslconf.h") {
+	    while(<CONF>) {
+		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+		    $BN_SZ=8;
+		    $LEVEL="2.0";
+		    last;
+		}
+	    }
+	    close CONF;
+	}
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
+				#                [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32;			# local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22";	# passed through stack in 32-bit
+$num="%r21";	# passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0;	# accomodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4";	$fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
+$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+	.ALIGN	64
+bn_mul_mont
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	ldo	-$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+	ldw	`-$FRAME_MARKER-4`($fp),$n0
+	ldw	`-$FRAME_MARKER-8`($fp),$num
+	nop
+	nop					; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+	comiclr,<=	6,$num,%r0		; are vectors long enough?
+	b		L\$abort
+	ldi		0,%r28			; signal "unhandled"
+	add,ev		%r0,$num,$num		; is $num even?
+	b		L\$abort
+	nop
+	or		$ap,$np,$ti1
+	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
+	b		L\$abort
+	nop
+	nop					; alignment
+	nop
+
+	fldws		0($n0),${fn0}
+	fldws,ma	4($bp),${fbi}		; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+	comib,>		3,$num,L\$abort		; are vectors long enough?
+	ldi		0,%r28			; signal "unhandled"
+	addl		$num,$num,$num		; I operate on 32-bit values
+
+	fldws		4($n0),${fn0}		; only low part of n0
+	fldws		4($bp),${fbi}		; bp[0] in flipped word order
+___
+$code.=<<___;
+	fldds		0($ap),${fai}		; ap[0,1]
+	fldds		0($np),${fni}		; np[0,1]
+
+	sh2addl		$num,%r0,$arrsz
+	ldi		31,$hi0
+	ldo		36($arrsz),$hi1		; space for tp[num+1]
+	andcm		$hi1,$hi0,$hi1		; align
+	addl		$hi1,%sp,%sp
+	$PUSH		$fp,-$SIZE_T(%sp)
+
+	ldo		`$LOCALS+16`($fp),$xfer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
+	xmpyu		${fn0},${fab0}R,${fm0}
+
+	addl		$arrsz,$ap,$ap		; point at the end
+	addl		$arrsz,$np,$np
+	subi		0,$arrsz,$idx		; j=0
+	ldo		8($idx),$idx		; j++++
+
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	fstds		${fab1},0($xfer)
+	fstds		${fnm1},8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+	mtctl		$hi0,%cr11		; $hi0 still holds 31
+	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
+	b		L\$parisc11
+	nop
+___
+$code.=<<___;					# PA-RISC 2.0 code-path
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+
+	extrd,u		$ab0,31,32,$hi0
+	extrd,u		$ab0,63,32,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 ldo		8($idx),$idx		; j++++
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	 extrd,u	$nm0,31,32,$hi1
+
+L\$1st
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$ab1,$nm1,$nm1
+	 extrd,u	$nm1,31,32,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	 addl		$ab1,$nm1,$nm1
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+	fldws,ma	4($bp),${fbi}		; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+	fldws		0($bp),${fbi}		; bp[1] in flipped word order
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	addl		$hi1,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2]
+	 flddx		$idx($np),${fni}	; np[2]
+	 ldo		8($idx),$idx		; j++++
+	ldd		-16($xfer),$ab0		; 33-bit value
+	ldd		-8($xfer),$nm0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	 extrd,u	$ab0,31,32,$ti0		; carry bit
+	 extrd,u	$ab0,63,32,$ab0
+	fstds		${fab1},0($xfer)
+	 addl		$ti0,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 extrd,u	$nm0,31,32,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+L\$inner
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldw		4($tp),$ti0		; tp[j]
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ti0,$ti0
+	 addl		$ti0,$ab0,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,31,32,$hi0
+	 extrd,u	$nm1,31,32,$hi1
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	ldw		4($tp),$ti0		; tp[j]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	addl		$hi0,$ab0,$ab0
+	 addl		$ti0,$ab0,$ab0
+	 stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,31,32,$hi0
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone	; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+	fldws,ma	4($bp),${fbi}		; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+	ldi		12,$ti0			; bp[i] in flipped word order
+	addl,ev		%r0,$num,$num
+	ldi		-4,$ti0
+	addl		$ti0,$bp,$bp
+	fldws		0($bp),${fbi}
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0]
+	addl		$hi0,$ab1,$ab1
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	addl		$hi1,$hi0,$hi0
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+	addl		$hi0,$ab1,$ab1
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	addl		$hi1,$hi0,$hi0
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+	ldws,ma		4($tp),$ti0
+	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
+	b		L\$sub_pa11
+	addl		$tp,$arrsz,$tp
+L\$sub
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+	ldd,ma		8($tp),$ti0
+L\$sub
+	ldd		$idx($np),$hi0
+	shrpd		$ti0,$ti0,32,$ti0	; flip word order
+	std		$ti0,-8($tp)		; save flipped value
+	sub,db		$ti0,$hi0,$hi1
+	ldd,ma		8($tp),$ti0
+	addib,<>	8,$idx,L\$sub
+	std,ma		$hi1,8($rp)
+
+	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
+	sub,db		$ti0,%r0,$hi1
+	ldo		-8($tp),$tp
+___
+$code.=<<___;
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy
+	ldd		$idx($np),$hi0
+	std,ma		%r0,8($tp)
+	addib,<>	8,$idx,.-8		; L\$copy
+	std,ma		$hi0,8($rp)	
+___
+
+if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+	b		L\$done
+	nop
+
+	.ALIGN		8
+L\$parisc11
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-12($xfer),$ablo
+	ldw		-16($xfer),$hi0
+	ldw		-4($xfer),$nmlo0
+	ldw		-8($xfer),$nmhi0
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+	 ldo		8($idx),$idx		; j++++
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	 addc		%r0,$nmhi0,$hi1
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+	nop
+
+L\$1st_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		12($xfer),$nmlo1
+	 addc		%r0,$abhi,$hi0
+	ldw		8($xfer),$nmhi1
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	fstds		${fnm1},8($xfer)
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-16($xfer),$abhi
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	ldw		-4($xfer),$nmlo0
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-8($xfer),$nmhi0
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab0},-16($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	fstds		${fnm0},-8($xfer)
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	 ldw		8($xfer),$nmhi1
+	 ldw		12($xfer),$nmlo1
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	 add		$hi0,$ablo,$ablo
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-4($xfer),$nmlo0
+
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	stws,ma		$nmlo0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[1]
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+	ldw		-16($xfer),$abhi	; carry bit actually
+	 ldo		8($idx),$idx		; j++++
+	ldw		-12($xfer),$ablo
+	ldw		-8($xfer),$nmhi0
+	ldw		-4($xfer),$nmlo0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	fstds		${fab1},0($xfer)
+	 addl		$abhi,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 addc		%r0,$nmhi0,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+
+L\$inner_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	ldw		12($xfer),$nmlo1
+	 add		$ti1,$ablo,$ablo
+	ldw		8($xfer),$nmhi1
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab1},0($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	ldw		8($tp),$ti1		; tp[j]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-4($xfer),$nmlo0
+	 add		$hi0,$ablo,$ablo
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$abhi,$abhi
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 add		$ti0,$ablo,$ablo
+	fstds		${fab0},-16($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm0},-8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	ldw		12($xfer),$nmlo1
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldw		8($xfer),$nmhi1
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	fstds		${fab1},0($xfer)
+	 add		$ti1,$ablo,$ablo
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$abhi,$hi0
+	ldw		-16($xfer),$abhi
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-8($xfer),$nmhi0
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-4($xfer),$nmlo0
+	 addc		%r0,$nmhi1,$hi1
+
+	add		$hi0,$ablo,$ablo
+	 stw		$nmlo1,-4($tp)		; tp[j-1]
+	addc		%r0,$abhi,$abhi
+	 add		$ti0,$ablo,$ablo
+	ldw		8($tp),$ti1		; tp[j]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone_pa11; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[i]
+	 flddx		$idx($ap),${fai}	; ap[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer_pa11
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32+4`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+	ldw		-4($tp),$ti0
+	addl		$tp,$arrsz,$tp
+L\$sub_pa11
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub_pa11
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy_pa11
+	ldwx		$idx($np),$hi0
+	stws,ma		%r0,4($tp)
+	addib,<>	4,$idx,L\$copy_pa11
+	stws,ma		$hi0,4($rp)	
+
+	nop					; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+	ldi		1,%r28			; signal "handled"
+	ldo		$FRAME($fp),%sp		; destroy tp[num+1]
+
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
+    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
+    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
+    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
+    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+	my $len=32-$3;
+	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
+    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+	my $len=32-$2;
+	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
+	$opcode |= (1<<13) if ($mod =~ /,\**=/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
+    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+	my $cpos=63-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $sub = sub {
+  my ($mod,$args) = @_;
+  my $orig = "sub$mod\t$args";
+
+    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+	$opcode|=(1<<10);	# e1
+	$opcode|=(1<<8);	# e2
+	$opcode|=(1<<5);	# d
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+    }
+    else { "\t".$orig; }
+};
+
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $opcode = eval("\$$mnemonic");
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	# flip word order in 64-bit mode...
+	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+	# assemble 2.0 instructions in 32-bit mode...
+	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+	print $_,"\n";
+}
+close STDOUT;

diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index 7849eae..f9b6992 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl

@@ -31,7 +31,6 @@
 	$BNSZ=	$BITS/8;
 	$SIZE_T=4;
 	$RZONE=	224;
-	$FRAME=	$SIZE_T*16;
 
 	$LD=	"lwz";		# load
 	$LDU=	"lwzu";		# load and update
@@ -51,7 +50,6 @@
 	$BNSZ=	$BITS/8;
 	$SIZE_T=8;
 	$RZONE=	288;
-	$FRAME=	$SIZE_T*16;
 
 	# same as above, but 64-bit mnemonics...
 	$LD=	"ld";		# load
@@ -69,6 +67,9 @@
 	$POP=	$LD;
 } else { die "nonsense $flavour"; }
 
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@
 $nj="r11";
 $tj="r12";
 # non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
 #
 $nhi="r0";
 
@@ -108,42 +109,48 @@
 .machine "any"
 .text
 
-.globl	.bn_mul_mont
+.globl	.bn_mul_mont_int
 .align	4
-.bn_mul_mont:
+.bn_mul_mont_int:
 	cmpwi	$num,4
 	mr	$rp,r3		; $rp is reassigned
 	li	r3,0
 	bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+	cmpwi	$num,32		; longer key performance is not better
+	bgelr
+___
+$code.=<<___;
 	slwi	$num,$num,`log($BNSZ)/log(2)`
 	li	$tj,-4096
-	addi	$ovf,$num,`$FRAME+$RZONE`
+	addi	$ovf,$num,$FRAME
 	subf	$ovf,$ovf,$sp	; $sp-$ovf
 	and	$ovf,$ovf,$tj	; minimize TLB usage
 	subf	$ovf,$sp,$ovf	; $ovf-$sp
+	mr	$tj,$sp
 	srwi	$num,$num,`log($BNSZ)/log(2)`
 	$STUX	$sp,$sp,$ovf
 
-	$PUSH	r14,`4*$SIZE_T`($sp)
-	$PUSH	r15,`5*$SIZE_T`($sp)
-	$PUSH	r16,`6*$SIZE_T`($sp)
-	$PUSH	r17,`7*$SIZE_T`($sp)
-	$PUSH	r18,`8*$SIZE_T`($sp)
-	$PUSH	r19,`9*$SIZE_T`($sp)
-	$PUSH	r20,`10*$SIZE_T`($sp)
-	$PUSH	r21,`11*$SIZE_T`($sp)
-	$PUSH	r22,`12*$SIZE_T`($sp)
-	$PUSH	r23,`13*$SIZE_T`($sp)
-	$PUSH	r24,`14*$SIZE_T`($sp)
-	$PUSH	r25,`15*$SIZE_T`($sp)
+	$PUSH	r20,`-12*$SIZE_T`($tj)
+	$PUSH	r21,`-11*$SIZE_T`($tj)
+	$PUSH	r22,`-10*$SIZE_T`($tj)
+	$PUSH	r23,`-9*$SIZE_T`($tj)
+	$PUSH	r24,`-8*$SIZE_T`($tj)
+	$PUSH	r25,`-7*$SIZE_T`($tj)
+	$PUSH	r26,`-6*$SIZE_T`($tj)
+	$PUSH	r27,`-5*$SIZE_T`($tj)
+	$PUSH	r28,`-4*$SIZE_T`($tj)
+	$PUSH	r29,`-3*$SIZE_T`($tj)
+	$PUSH	r30,`-2*$SIZE_T`($tj)
+	$PUSH	r31,`-1*$SIZE_T`($tj)
 
 	$LD	$n0,0($n0)	; pull n0[0] value
 	addi	$num,$num,-2	; adjust $num for counter register
 
 	$LD	$m0,0($bp)	; m0=bp[0]
 	$LD	$aj,0($ap)	; ap[0]
-	addi	$tp,$sp,$FRAME
+	addi	$tp,$sp,$LOCALS
 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
 	$UMULH	$hi0,$aj,$m0
 
@@ -205,8 +212,8 @@
 Louter:
 	$LDX	$m0,$bp,$i	; m0=bp[i]
 	$LD	$aj,0($ap)	; ap[0]
-	addi	$tp,$sp,$FRAME
-	$LD	$tj,$FRAME($sp)	; tp[0]
+	addi	$tp,$sp,$LOCALS
+	$LD	$tj,$LOCALS($sp); tp[0]
 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
 	$UMULH	$hi0,$aj,$m0
 	$LD	$aj,$BNSZ($ap)	; ap[1]
@@ -273,7 +280,7 @@
 
 	addi	$num,$num,2	; restore $num
 	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
-	addi	$tp,$sp,$FRAME
+	addi	$tp,$sp,$LOCALS
 	mtctr	$num
 
 .align	4
@@ -299,23 +306,27 @@
 	addi	$j,$j,$BNSZ
 	bdnz-	Lcopy
 
-	$POP	r14,`4*$SIZE_T`($sp)
-	$POP	r15,`5*$SIZE_T`($sp)
-	$POP	r16,`6*$SIZE_T`($sp)
-	$POP	r17,`7*$SIZE_T`($sp)
-	$POP	r18,`8*$SIZE_T`($sp)
-	$POP	r19,`9*$SIZE_T`($sp)
-	$POP	r20,`10*$SIZE_T`($sp)
-	$POP	r21,`11*$SIZE_T`($sp)
-	$POP	r22,`12*$SIZE_T`($sp)
-	$POP	r23,`13*$SIZE_T`($sp)
-	$POP	r24,`14*$SIZE_T`($sp)
-	$POP	r25,`15*$SIZE_T`($sp)
-	$POP	$sp,0($sp)
+	$POP	$tj,0($sp)
 	li	r3,1
+	$POP	r20,`-12*$SIZE_T`($tj)
+	$POP	r21,`-11*$SIZE_T`($tj)
+	$POP	r22,`-10*$SIZE_T`($tj)
+	$POP	r23,`-9*$SIZE_T`($tj)
+	$POP	r24,`-8*$SIZE_T`($tj)
+	$POP	r25,`-7*$SIZE_T`($tj)
+	$POP	r26,`-6*$SIZE_T`($tj)
+	$POP	r27,`-5*$SIZE_T`($tj)
+	$POP	r28,`-4*$SIZE_T`($tj)
+	$POP	r29,`-3*$SIZE_T`($tj)
+	$POP	r30,`-2*$SIZE_T`($tj)
+	$POP	r31,`-1*$SIZE_T`($tj)
+	mr	$sp,$tj
 	blr
 	.long	0
-.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+	.byte	0,12,4,0,0x80,12,6,0
+	.long	0
+
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
index f409317..1249ce2 100644
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl

@@ -389,7 +389,9 @@
 	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -814,8 +816,9 @@
 
 
 	blr
-
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -966,7 +969,9 @@
 	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1502,7 +1507,9 @@
 	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
 	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1550,8 +1557,9 @@
 	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
 	andi.	r3,r3,1         # keep only last bit.
 	blr
-	.long	0x00000000
-
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1594,7 +1602,9 @@
 Lppcasm_add_adios:	
 	addze	r3,r0			#return carry bit.
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1707,7 +1717,9 @@
 Lppcasm_div9:
 	or	r3,r8,r0
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1746,8 +1758,9 @@
 	bdnz-	Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:	
 	blr
-	.long	0x00000000
-
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1850,7 +1863,9 @@
 Lppcasm_mw_OVER:	
 	addi	r3,r12,0
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1973,7 +1988,9 @@
 Lppcasm_maw_adios:	
 	addi	r3,r12,0
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 	.align	4
 EOF
 $data =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 3449b35..a14e769 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl

@@ -45,23 +45,40 @@
 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
 # in absolute terms, but it's apparently the way Power 6 is...
 
+# December 2009
+
+# Adapted for 32-bit build this module delivers 25-120%, yes, more
+# than *twice* for longer keys, performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# even 64-bit integer operations and the trouble is that most PPC
+# operating systems don't preserve upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of expected one.
+# This is why this routine can be engaged for longer key operations
+# only on these OSes, see crypto/ppccap.c for further details. MacOS X
+# is an exception from this and doesn't require signal masking, and
+# that's where above improvement coefficients were collected. For
+# others alternative would be to break dependence on upper halves of
+# GPRs by sticking to 32-bit integer operations...
+
 $flavour = shift;
 
 if ($flavour =~ /32/) {
 	$SIZE_T=4;
 	$RZONE=	224;
-	$FRAME=	$SIZE_T*12+8*12;
-	$fname=	"bn_mul_mont_ppc64";
+	$fname=	"bn_mul_mont_fpu64";
 
 	$STUX=	"stwux";	# store indexed and update
 	$PUSH=	"stw";
 	$POP=	"lwz";
-	die "not implemented yet";
 } elsif ($flavour =~ /64/) {
 	$SIZE_T=8;
 	$RZONE=	288;
-	$FRAME=	$SIZE_T*12+8*12;
-	$fname=	"bn_mul_mont";
+	$fname=	"bn_mul_mont_fpu64";
 
 	# same as above, but 64-bit mnemonics...
 	$STUX=	"stdux";	# store indexed and update
@@ -76,7 +93,7 @@
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=($FRAME+63)&~63;
+$FRAME=64;	# padded frame header
 $TRANSFER=16*8;
 
 $carry="r0";
@@ -93,16 +110,16 @@
 $j="r11";
 $i="r12";
 # non-volatile registers
-$nap_d="r14";	# interleaved ap and np in double format
-$a0="r15";	# ap[0]
-$t0="r16";	# temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22";	# interleaved ap and np in double format
+$a0="r23";	# ap[0]
+$t0="r24";	# temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
 
 # PPC offers enough register bank capacity to unroll inner loops twice
 #
@@ -132,28 +149,17 @@
 $na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
 $dota="f8";	$dotb="f9";
 $A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
-$N0="f14";	$N1="f15";	$N2="f16";	$N3="f17";
-$T0a="f18";	$T0b="f19";
-$T1a="f20";	$T1b="f21";
-$T2a="f22";	$T2b="f23";
-$T3a="f24";	$T3b="f25";
+$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
+$T0a="f24";	$T0b="f25";
+$T1a="f26";	$T1b="f27";
+$T2a="f28";	$T2b="f29";
+$T3a="f30";	$T3b="f31";
 
 # sp----------->+-------------------------------+
 #		| saved sp			|
 #		+-------------------------------+
-#		|				|
-#		+-------------------------------+
-#		| 10 saved gpr, r14-r23		|
 #		.				.
-#		.				.
-#   +12*size_t	+-------------------------------+
-#		| 12 saved fpr, f14-f25		|
-#		.				.
-#		.				.
-#   +12*8	+-------------------------------+
-#		| padding to 64 byte boundary	|
-#		.				.
-#   +X		+-------------------------------+
+#   +64		+-------------------------------+
 #		| 16 gpr<->fpr transfer zone	|
 #		.				.
 #		.				.
@@ -173,6 +179,16 @@
 #		.				.
 #		.				.
 #		+-------------------------------+
+#		.				.
+#   -12*size_t	+-------------------------------+
+#		| 10 saved gpr, r22-r31		|
+#		.				.
+#		.				.
+#   -12*8	+-------------------------------+
+#		| 12 saved fpr, f20-f31		|
+#		.				.
+#		.				.
+#		+-------------------------------+
 
 $code=<<___;
 .machine "any"
@@ -181,14 +197,14 @@
 .globl	.$fname
 .align	5
 .$fname:
-	cmpwi	$num,4
+	cmpwi	$num,`3*8/$SIZE_T`
 	mr	$rp,r3		; $rp is reassigned
 	li	r3,0		; possible "not handled" return code
 	bltlr-
-	andi.	r0,$num,1	; $num has to be even
+	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
 	bnelr-
 
-	slwi	$num,$num,3	; num*=8
+	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
 	li	$i,-4096
 	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
 	add	$tp,$tp,$num	; place for tp[num+1]
@@ -196,35 +212,50 @@
 	subf	$tp,$tp,$sp	; $sp-$tp
 	and	$tp,$tp,$i	; minimize TLB usage
 	subf	$tp,$sp,$tp	; $tp-$sp
+	mr	$i,$sp
 	$STUX	$sp,$sp,$tp	; alloca
 
-	$PUSH	r14,`2*$SIZE_T`($sp)
-	$PUSH	r15,`3*$SIZE_T`($sp)
-	$PUSH	r16,`4*$SIZE_T`($sp)
-	$PUSH	r17,`5*$SIZE_T`($sp)
-	$PUSH	r18,`6*$SIZE_T`($sp)
-	$PUSH	r19,`7*$SIZE_T`($sp)
-	$PUSH	r20,`8*$SIZE_T`($sp)
-	$PUSH	r21,`9*$SIZE_T`($sp)
-	$PUSH	r22,`10*$SIZE_T`($sp)
-	$PUSH	r23,`11*$SIZE_T`($sp)
-	stfd	f14,`12*$SIZE_T+0`($sp)
-	stfd	f15,`12*$SIZE_T+8`($sp)
-	stfd	f16,`12*$SIZE_T+16`($sp)
-	stfd	f17,`12*$SIZE_T+24`($sp)
-	stfd	f18,`12*$SIZE_T+32`($sp)
-	stfd	f19,`12*$SIZE_T+40`($sp)
-	stfd	f20,`12*$SIZE_T+48`($sp)
-	stfd	f21,`12*$SIZE_T+56`($sp)
-	stfd	f22,`12*$SIZE_T+64`($sp)
-	stfd	f23,`12*$SIZE_T+72`($sp)
-	stfd	f24,`12*$SIZE_T+80`($sp)
-	stfd	f25,`12*$SIZE_T+88`($sp)
-
+	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
+	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
+	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
+	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
+	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
+	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
+	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
+	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
+	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
+	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
+	stfd	f20,`-12*8`($i)
+	stfd	f21,`-11*8`($i)
+	stfd	f22,`-10*8`($i)
+	stfd	f23,`-9*8`($i)
+	stfd	f24,`-8*8`($i)
+	stfd	f25,`-7*8`($i)
+	stfd	f26,`-6*8`($i)
+	stfd	f27,`-5*8`($i)
+	stfd	f28,`-4*8`($i)
+	stfd	f29,`-3*8`($i)
+	stfd	f30,`-2*8`($i)
+	stfd	f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
 	ld	$a0,0($ap)	; pull ap[0] value
 	ld	$n0,0($n0)	; pull n0[0] value
 	ld	$t3,0($bp)	; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+	mr	$t1,$n0
+	lwz	$a0,0($ap)	; pull ap[0,1] value
+	lwz	$t0,4($ap)
+	lwz	$n0,0($t1)	; pull n0[0,1] value
+	lwz	$t1,4($t1)
+	lwz	$t3,0($bp)	; bp[0,1]
+	lwz	$t2,4($bp)
+	insrdi	$a0,$t0,32,0
+	insrdi	$n0,$t1,32,0
+	insrdi	$t3,$t2,32,0
+___
+$code.=<<___;
 	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
 	li	$i,-64
 	add	$nap_d,$tp,$num
@@ -258,6 +289,8 @@
 	std	$t5,`$FRAME+40`($sp)
 	std	$t6,`$FRAME+48`($sp)
 	std	$t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
 	lwz	$t1,0($ap)
 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@
 	lwz	$t5,0($np)
 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
 	lwz	$t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
+	lwz	$t1,4($ap)
+	lwz	$t2,8($ap)
+	lwz	$t3,12($ap)
+	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
+	lwz	$t5,4($np)
+	lwz	$t6,8($np)
+	lwz	$t7,12($np)
+___
+$code.=<<___;
 	lfd	$ba,`$FRAME+0`($sp)
 	lfd	$bb,`$FRAME+8`($sp)
 	lfd	$bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@
 
 .align	5
 L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
 	lwz	$t1,0($ap)
 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@
 	lwz	$t5,0($np)
 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
 	lwz	$t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
+	lwz	$t1,4($ap)
+	lwz	$t2,8($ap)
+	lwz	$t3,12($ap)
+	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
+	lwz	$t5,4($np)
+	lwz	$t6,8($np)
+	lwz	$t7,12($np)
+___
+$code.=<<___;
 	std	$t0,`$FRAME+64`($sp)
 	std	$t1,`$FRAME+72`($sp)
 	std	$t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@
 	li	$i,8			; i=1
 .align	5
 Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
 	ldx	$t3,$bp,$i	; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+	add	$t0,$bp,$i
+	lwz	$t3,0($t0)		; bp[i,i+1]
+	lwz	$t0,4($t0)
+	insrdi	$t3,$t0,32,0
+___
+$code.=<<___;
 	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
 	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
 
@@ -761,6 +830,13 @@
 	stfd	$T0b,`$FRAME+8`($sp)
 	 add	$t7,$t7,$carry
 	 addc	$t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t0,$t0,32,0
+	extrdi	$t1,$t1,32,0
+	adde	$t0,$t0,$t1
+___
+$code.=<<___;
 	stfd	$T1a,`$FRAME+16`($sp)
 	stfd	$T1b,`$FRAME+24`($sp)
 	 insrdi	$t4,$t7,16,0		; 64..127 bits
@@ -768,6 +844,13 @@
 	stfd	$T2a,`$FRAME+32`($sp)
 	stfd	$T2b,`$FRAME+40`($sp)
 	 adde	$t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t4,$t4,32,0
+	extrdi	$t2,$t2,32,0
+	adde	$t4,$t4,$t2
+___
+$code.=<<___;
 	stfd	$T3a,`$FRAME+48`($sp)
 	stfd	$T3b,`$FRAME+56`($sp)
 	 addze	$carry,$carry
@@ -816,7 +899,21 @@
 	ld	$t7,`$FRAME+72`($sp)
 
 	addc	$t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t0,$t0,32,0
+	extrdi	$t1,$t1,32,0
+	adde	$t0,$t0,$t1
+___
+$code.=<<___;
 	adde	$t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t4,$t4,32,0
+	extrdi	$t2,$t2,32,0
+	adde	$t4,$t4,$t2
+___
+$code.=<<___;
 	addze	$carry,$carry
 
 	std	$t3,-16($tp)		; tp[j-1]
@@ -835,7 +932,9 @@
 	subf	$nap_d,$t7,$nap_d	; rewind pointer
 	cmpw	$i,$num
 	blt-	Louter
+___
 
+$code.=<<___ if ($SIZE_T==8);
 	subf	$np,$num,$np	; rewind np
 	addi	$j,$j,1		; restore counter
 	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@
 	stdx	$i,$t4,$i
 	addi	$i,$i,16
 	bdnz-	Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+	subf	$np,$num,$np	; rewind np
+	addi	$j,$j,1		; restore counter
+	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	addi	$np,$np,-4
+	addi	$rp,$rp,-4
+	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
+	mtctr	$j
+
+.align	4
+Lsub:	ld	$t0,8($tp)	; load tp[j..j+3] in 64-bit word order
+	ldu	$t2,16($tp)
+	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
+	lwz	$t5,8($np)
+	lwz	$t6,12($np)
+	lwzu	$t7,16($np)
+	extrdi	$t1,$t0,32,0
+	extrdi	$t3,$t2,32,0
+	subfe	$t4,$t4,$t0	; tp[j]-np[j]
+	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
+	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
+	 stw	$t1,8($ap)
+	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
+	 stw	$t2,12($ap)
+	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
+	 stwu	$t3,16($ap)
+	stw	$t4,4($rp)
+	stw	$t5,8($rp)
+	stw	$t6,12($rp)
+	stwu	$t7,16($rp)
+	bdnz-	Lsub
+
+	li	$i,0
+	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
+	addi	$tp,$sp,`$FRAME+$TRANSFER+4`
+	subf	$rp,$num,$rp	; rewind rp
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	mtctr	$j
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	lwz	$t0,4($ap)
+	lwz	$t1,8($ap)
+	lwz	$t2,12($ap)
+	lwzu	$t3,16($ap)
+	std	$i,8($nap_d)	; zap nap_d
+	std	$i,16($nap_d)
+	std	$i,24($nap_d)
+	std	$i,32($nap_d)
+	std	$i,40($nap_d)
+	std	$i,48($nap_d)
+	std	$i,56($nap_d)
+	stdu	$i,64($nap_d)
+	stw	$t0,4($rp)
+	stw	$t1,8($rp)
+	stw	$t2,12($rp)
+	stwu	$t3,16($rp)
+	std	$i,8($tp)	; zap tp at once
+	stdu	$i,16($tp)
+	bdnz-	Lcopy
+___
 
-	$POP	r14,`2*$SIZE_T`($sp)
-	$POP	r15,`3*$SIZE_T`($sp)
-	$POP	r16,`4*$SIZE_T`($sp)
-	$POP	r17,`5*$SIZE_T`($sp)
-	$POP	r18,`6*$SIZE_T`($sp)
-	$POP	r19,`7*$SIZE_T`($sp)
-	$POP	r20,`8*$SIZE_T`($sp)
-	$POP	r21,`9*$SIZE_T`($sp)
-	$POP	r22,`10*$SIZE_T`($sp)
-	$POP	r23,`11*$SIZE_T`($sp)
-	lfd	f14,`12*$SIZE_T+0`($sp)
-	lfd	f15,`12*$SIZE_T+8`($sp)
-	lfd	f16,`12*$SIZE_T+16`($sp)
-	lfd	f17,`12*$SIZE_T+24`($sp)
-	lfd	f18,`12*$SIZE_T+32`($sp)
-	lfd	f19,`12*$SIZE_T+40`($sp)
-	lfd	f20,`12*$SIZE_T+48`($sp)
-	lfd	f21,`12*$SIZE_T+56`($sp)
-	lfd	f22,`12*$SIZE_T+64`($sp)
-	lfd	f23,`12*$SIZE_T+72`($sp)
-	lfd	f24,`12*$SIZE_T+80`($sp)
-	lfd	f25,`12*$SIZE_T+88`($sp)
-	$POP	$sp,0($sp)
+$code.=<<___;
+	$POP	$i,0($sp)
 	li	r3,1	; signal "handled"
+	$POP	r22,`-12*8-10*$SIZE_T`($i)
+	$POP	r23,`-12*8-9*$SIZE_T`($i)
+	$POP	r24,`-12*8-8*$SIZE_T`($i)
+	$POP	r25,`-12*8-7*$SIZE_T`($i)
+	$POP	r26,`-12*8-6*$SIZE_T`($i)
+	$POP	r27,`-12*8-5*$SIZE_T`($i)
+	$POP	r28,`-12*8-4*$SIZE_T`($i)
+	$POP	r29,`-12*8-3*$SIZE_T`($i)
+	$POP	r30,`-12*8-2*$SIZE_T`($i)
+	$POP	r31,`-12*8-1*$SIZE_T`($i)
+	lfd	f20,`-12*8`($i)
+	lfd	f21,`-11*8`($i)
+	lfd	f22,`-10*8`($i)
+	lfd	f23,`-9*8`($i)
+	lfd	f24,`-8*8`($i)
+	lfd	f25,`-7*8`($i)
+	lfd	f26,`-6*8`($i)
+	lfd	f27,`-5*8`($i)
+	lfd	f28,`-4*8`($i)
+	lfd	f29,`-3*8`($i)
+	lfd	f30,`-2*8`($i)
+	lfd	f31,`-1*8`($i)
+	mr	$sp,$i
 	blr
 	.long	0
-.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+	.byte	0,12,4,0,0x8c,10,6,0
+	.long	0
+
+.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl
new file mode 100644
index 0000000..cd9f13e
--- /dev/null
+++ b/crypto/bn/asm/s390x-gf2m.pl

@@ -0,0 +1,221 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... gcc 4.3 appeared to generate poor code, therefore
+# the effort. And indeed, the module delivers 55%-90%(*) improvement
+# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
+# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
+# This is for 64-bit build. In 32-bit "highgprs" case improvement is
+# even higher, for example on z990 it was measured 80%-150%. ECDSA
+# sign is modest 9%-12% faster. Keep in mind that these coefficients
+# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
+# burnt in it...
+#
+# (*)	gcc 4.1 was observed to deliver better results than gcc 4.3,
+#	so that improvement coefficients can vary from one specific
+#	setup to another.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+        $SIZE_T=4;
+        $g="";
+} else {
+        $SIZE_T=8;
+        $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+$rp="%r2";
+$a1="%r3";
+$a0="%r4";
+$b1="%r5";
+$b0="%r6";
+
+$ra="%r14";
+$sp="%r15";
+
+@T=("%r0","%r1");
+@i=("%r12","%r13");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
+($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
+
+$code.=<<___;
+.text
+
+.type	_mul_1x1,\@function
+.align	16
+_mul_1x1:
+	lgr	$a1,$a
+	sllg	$a2,$a,1
+	sllg	$a4,$a,2
+	sllg	$a8,$a,3
+
+	srag	$lo,$a1,63			# broadcast 63rd bit
+	nihh	$a1,0x1fff
+	srag	@i[0],$a2,63			# broadcast 62nd bit
+	nihh	$a2,0x3fff
+	srag	@i[1],$a4,63			# broadcast 61st bit
+	nihh	$a4,0x7fff
+	ngr	$lo,$b
+	ngr	@i[0],$b
+	ngr	@i[1],$b
+
+	lghi	@T[0],0
+	lgr	$a12,$a1
+	stg	@T[0],`$stdframe+0*8`($sp)	# tab[0]=0
+	xgr	$a12,$a2
+	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
+	 lgr	$a48,$a4
+	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
+	 xgr	$a48,$a8
+	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
+	 xgr	$a1,$a4
+
+	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
+	xgr	$a2,$a4
+	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
+	xgr	$a12,$a4
+	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
+	 xgr	$a1,$a48
+	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
+	 xgr	$a2,$a48
+
+	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
+	xgr	$a12,$a48
+	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
+	 xgr	$a1,$a4
+	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
+	 xgr	$a2,$a4
+	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8
+
+	xgr	$a12,$a4
+	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
+	 srlg	$hi,$lo,1
+	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
+	 sllg	$lo,$lo,63
+	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
+	 srlg	@T[0],@i[0],2
+	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8
+
+	lghi	$mask,`0xf<<3`
+	sllg	$a1,@i[0],62
+	 sllg	@i[0],$b,3
+	srlg	@T[1],@i[1],3
+	 ngr	@i[0],$mask
+	sllg	$a2,@i[1],61
+	 srlg	@i[1],$b,4-3
+	xgr	$hi,@T[0]
+	 ngr	@i[1],$mask
+	xgr	$lo,$a1
+	xgr	$hi,@T[1]
+	xgr	$lo,$a2
+
+	xg	$lo,$stdframe(@i[0],$sp)
+	srlg	@i[0],$b,8-3
+	ngr	@i[0],$mask
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+	lg	@T[1],$stdframe(@i[1],$sp)
+	srlg	@i[1],$b,`($n+2)*4`-3
+	sllg	@T[0],@T[1],`$n*4`
+	ngr	@i[1],$mask
+	srlg	@T[1],@T[1],`64-$n*4`
+	xgr	$lo,@T[0]
+	xgr	$hi,@T[1]
+___
+	push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+	lg	@T[1],$stdframe(@i[1],$sp)
+	sllg	@T[0],@T[1],`$n*4`
+	srlg	@T[1],@T[1],`64-$n*4`
+	xgr	$lo,@T[0]
+	xgr	$hi,@T[1]
+
+	lg	@T[0],$stdframe(@i[0],$sp)
+	sllg	@T[1],@T[0],`($n+1)*4`
+	srlg	@T[0],@T[0],`64-($n+1)*4`
+	xgr	$lo,@T[1]
+	xgr	$hi,@T[0]
+
+	br	$ra
+.size	_mul_1x1,.-_mul_1x1
+
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,\@function
+.align	16
+bn_GF2m_mul_2x2:
+	stm${g}	%r3,%r15,3*$SIZE_T($sp)
+
+	lghi	%r1,-$stdframe-128
+	la	%r0,0($sp)
+	la	$sp,0(%r1,$sp)			# alloca
+	st${g}	%r0,0($sp)			# back chain
+___
+if ($SIZE_T==8) {
+my @r=map("%r$_",(6..9));
+$code.=<<___;
+	bras	$ra,_mul_1x1			# a1·b1
+	stmg	$lo,$hi,16($rp)
+
+	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
+	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
+	bras	$ra,_mul_1x1			# a0·b0
+	stmg	$lo,$hi,0($rp)
+
+	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
+	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
+	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
+	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
+	bras	$ra,_mul_1x1			# (a0+a1)·(b0+b1)
+	lmg	@r[0],@r[3],0($rp)
+
+	xgr	$lo,$hi
+	xgr	$hi,@r[1]
+	xgr	$lo,@r[0]
+	xgr	$hi,@r[2]
+	xgr	$lo,@r[3]	
+	xgr	$hi,@r[3]
+	xgr	$lo,$hi
+	stg	$hi,16($rp)
+	stg	$lo,8($rp)
+___
+} else {
+$code.=<<___;
+	sllg	%r3,%r3,32
+	sllg	%r5,%r5,32
+	or	%r3,%r4
+	or	%r5,%r6
+	bras	$ra,_mul_1x1
+	rllg	$lo,$lo,32
+	rllg	$hi,$hi,32
+	stmg	$lo,$hi,0($rp)
+___
+}
+$code.=<<___;
+	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
+	br	$ra
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;

diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
index f61246f..9fd64e8 100644
--- a/crypto/bn/asm/s390x-mont.pl
+++ b/crypto/bn/asm/s390x-mont.pl

@@ -32,6 +32,33 @@
 # Reschedule to minimize/avoid Address Generation Interlock hazard,
 # make inner loops counter-based.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
+# is achieved by swapping words after 64-bit loads, follow _dswap-s.
+# On z990 it was measured to perform 2.6-2.2 times better than
+# compiler-generated code, less for longer keys...
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
 $mn0="%r0";
 $num="%r1";
 
@@ -60,34 +87,44 @@
 .globl	bn_mul_mont
 .type	bn_mul_mont,\@function
 bn_mul_mont:
-	lgf	$num,164($sp)	# pull $num
-	sla	$num,3		# $num to enumerate bytes
+	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
+	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
 	la	$bp,0($num,$bp)
 
-	stg	%r2,16($sp)
+	st${g}	%r2,2*$SIZE_T($sp)
 
 	cghi	$num,16		#
 	lghi	%r2,0		#
 	blr	%r14		# if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+	tmll	$num,4
+	bnzr	%r14		# if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
 	cghi	$num,96		#
 	bhr	%r14		# if($num>96) return 0;
+___
+$code.=<<___;
+	stm${g}	%r3,%r15,3*$SIZE_T($sp)
 
-	stmg	%r3,%r15,24($sp)
-
-	lghi	$rp,-160-8	# leave room for carry bit
+	lghi	$rp,-$stdframe-8	# leave room for carry bit
 	lcgr	$j,$num		# -$num
 	lgr	%r0,$sp
 	la	$rp,0($rp,$sp)
 	la	$sp,0($j,$rp)	# alloca
-	stg	%r0,0($sp)	# back chain
+	st${g}	%r0,0($sp)	# back chain
 
 	sra	$num,3		# restore $num
 	la	$bp,0($j,$bp)	# restore $bp
 	ahi	$num,-1		# adjust $num for inner loop
 	lg	$n0,0($n0)	# pull n0
+	_dswap	$n0
 
 	lg	$bi,0($bp)
+	_dswap	$bi
 	lg	$alo,0($ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[0]
 	lgr	$AHI,$ahi
 
@@ -95,6 +132,7 @@
 	msgr	$mn0,$n0
 
 	lg	$nlo,0($np)	#
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
@@ -106,12 +144,14 @@
 .align	16
 .L1st:
 	lg	$alo,0($j,$ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[0]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi
 
 	lg	$nlo,0($j,$np)
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
@@ -119,22 +159,24 @@
 	algr	$nlo,$alo
 	alcgr	$NHI,$nhi
 
-	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.L1st
 
 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI	# upmost overflow bit
-	stg	$NHI,160-8($j,$sp)
-	stg	$AHI,160($j,$sp)
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)
 	la	$bp,8($bp)	# bp++
 
 .Louter:
 	lg	$bi,0($bp)	# bp[i]
+	_dswap	$bi
 	lg	$alo,0($ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[i]
-	alg	$alo,160($sp)	# +=tp[0]
+	alg	$alo,$stdframe($sp)	# +=tp[0]
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi
 
@@ -142,6 +184,7 @@
 	msgr	$mn0,$n0	# tp[0]*n0
 
 	lg	$nlo,0($np)	# np[0]
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
@@ -153,14 +196,16 @@
 .align	16
 .Linner:
 	lg	$alo,0($j,$ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[i]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$ahi,$AHI
-	alg	$alo,160($j,$sp)# +=tp[j]
+	alg	$alo,$stdframe($j,$sp)# +=tp[j]
 	alcgr	$AHI,$ahi
 
 	lg	$nlo,0($j,$np)
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
@@ -168,31 +213,33 @@
 	algr	$nlo,$alo	# +="tp[j]"
 	alcgr	$NHI,$nhi
 
-	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.Linner
 
 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI
-	alg	$NHI,160($j,$sp)# accumulate previous upmost overflow bit
+	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
 	lghi	$ahi,0
 	alcgr	$AHI,$ahi	# new upmost overflow bit
-	stg	$NHI,160-8($j,$sp)
-	stg	$AHI,160($j,$sp)
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)
 
 	la	$bp,8($bp)	# bp++
-	clg	$bp,160+8+32($j,$sp)	# compare to &bp[num]
+	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
 	jne	.Louter
 
-	lg	$rp,160+8+16($j,$sp)	# reincarnate rp
-	la	$ap,160($sp)
+	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
+	la	$ap,$stdframe($sp)
 	ahi	$num,1		# restore $num, incidentally clears "borrow"
 
 	la	$j,0(%r0)
 	lr	$count,$num
 .Lsub:	lg	$alo,0($j,$ap)
-	slbg	$alo,0($j,$np)
+	lg	$nlo,0($j,$np)
+	_dswap	$nlo
+	slbgr	$alo,$nlo
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lsub
@@ -207,19 +254,24 @@
 
 	la	$j,0(%r0)
 	lgr	$count,$num
-.Lcopy:	lg	$alo,0($j,$ap)	# copy or in-place refresh
-	stg	$j,160($j,$sp)	# zap tp
+.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
+	_dswap	$alo
+	stg	$j,$stdframe($j,$sp)	# zap tp
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lcopy
 
-	la	%r1,160+8+48($j,$sp)
-	lmg	%r6,%r15,0(%r1)
+	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+	lm${g}	%r6,%r15,0(%r1)
 	lghi	%r2,1		# signal "processed"
 	br	%r14
 .size	bn_mul_mont,.-bn_mul_mont
 .string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
-print $code;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+	print $_,"\n";
+}
 close STDOUT;

diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl
new file mode 100644
index 0000000..808a1e5
--- /dev/null
+++ b/crypto/bn/asm/x86-gf2m.pl

@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has three code paths: pure integer
+# code suitable for any x86 CPU, MMX code suitable for PIII and later
+# and PCLMULQDQ suitable for Westmere and later. Improvement varies
+# from one benchmark and µ-arch to another. Below are interval values
+# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
+# code:
+#
+# PIII		16%-30%
+# P4		12%-12%
+# Opteron	18%-40%
+# Core2		19%-44%
+# Atom		38%-64%
+# Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
+# Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
+#
+# Note that above improvement coefficients are not coefficients for
+# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
+# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
+# is more and more dominated by other subroutines, most notably by
+# BN_GF2m_mod[_mul]_arr...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+$a="eax";
+$b="ebx";
+($a1,$a2,$a4)=("ecx","edx","ebp");
+
+$R="mm0";
+@T=("mm1","mm2");
+($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
+@i=("esi","edi");
+
+					if (!$x86only) {
+&function_begin_B("_mul_1x1_mmx");
+	&sub	("esp",32+4);
+	 &mov	($a1,$a);
+	 &lea	($a2,&DWP(0,$a,$a));
+	 &and	($a1,0x3fffffff);
+	 &lea	($a4,&DWP(0,$a2,$a2));
+	 &mov	(&DWP(0*4,"esp"),0);
+	 &and	($a2,0x7fffffff);
+	&movd	($A,$a);
+	&movd	($B,$b);
+	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
+	 &xor	($a1,$a2);		# a1^a2
+	&pxor	($B31,$B31);
+	&pxor	($B30,$B30);
+	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
+	 &xor	($a2,$a4);		# a2^a4
+	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
+	&pcmpgtd($B31,$A);		# broadcast 31st bit
+	&paddd	($A,$A);		# $A<<=1
+	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
+	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
+	 &xor	($a4,$a2);		# a2=a4^a2^a4
+	&pand	($B31,$B);
+	&pcmpgtd($B30,$A);		# broadcast 30th bit
+	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
+	 &xor	($a4,$a1);		# a1^a2^a4
+	&psllq	($B31,31);
+	&pand	($B30,$B);
+	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
+	&mov	(@i[0],0x7);
+	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
+	 &mov	($a4,@i[0]);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	&mov	(@i[1],$a4);
+	&psllq	($B30,30);
+	&and	(@i[1],$b);
+	&shr	($b,3);
+	&movd	($R,&DWP(0,"esp",@i[0],4));
+	&mov	(@i[0],$a4);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	for($n=1;$n<9;$n++) {
+		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
+		&mov	(@i[1],$a4);
+		&psllq	(@T[1],3*$n);
+		&and	(@i[1],$b);
+		&shr	($b,3);
+		&pxor	($R,@T[1]);
+
+		push(@i,shift(@i)); push(@T,shift(@T));
+	}
+	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
+	&pxor	($R,$B30);
+	&psllq	(@T[1],3*$n++);
+	&pxor	($R,@T[1]);
+
+	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
+	&pxor	($R,$B31);
+	&psllq	(@T[0],3*$n);
+	&add	("esp",32+4);
+	&pxor	($R,@T[0]);
+	&ret	();
+&function_end_B("_mul_1x1_mmx");
+					}
+
+($lo,$hi)=("eax","edx");
+@T=("ecx","ebp");
+
+&function_begin_B("_mul_1x1_ialu");
+	&sub	("esp",32+4);
+	 &mov	($a1,$a);
+	 &lea	($a2,&DWP(0,$a,$a));
+	 &lea	($a4,&DWP(0,"",$a,4));
+	 &and	($a1,0x3fffffff);
+	&lea	(@i[1],&DWP(0,$lo,$lo));
+	&sar	($lo,31);		# broadcast 31st bit
+	 &mov	(&DWP(0*4,"esp"),0);
+	 &and	($a2,0x7fffffff);
+	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
+	 &xor	($a1,$a2);		# a1^a2
+	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
+	 &xor	($a2,$a4);		# a2^a4
+	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
+	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
+	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
+	 &xor	($a4,$a2);		# a2=a4^a2^a4
+	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
+	 &xor	($a4,$a1);		# a1^a2^a4
+	&sar	(@i[1],31);		# broardcast 30th bit
+	&and	($lo,$b);
+	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
+	&and	(@i[1],$b);
+	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
+	&mov	($hi,$lo);
+	&shl	($lo,31);
+	&mov	(@T[0],@i[1]);
+	&shr	($hi,1);
+
+	 &mov	(@i[0],0x7);
+	&shl	(@i[1],30);
+	 &and	(@i[0],$b);
+	&shr	(@T[0],2);
+	&xor	($lo,@i[1]);
+
+	&shr	($b,3);
+	&mov	(@i[1],0x7);		# 5-byte instruction!?
+	&and	(@i[1],$b);
+	&shr	($b,3);
+	 &xor	($hi,@T[0]);
+	&xor	($lo,&DWP(0,"esp",@i[0],4));
+	&mov	(@i[0],0x7);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	for($n=1;$n<9;$n++) {
+		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
+		&mov	(@i[1],0x7);
+		&mov	(@T[0],@T[1]);
+		&shl	(@T[1],3*$n);
+		&and	(@i[1],$b);
+		&shr	(@T[0],32-3*$n);
+		&xor	($lo,@T[1]);
+		&shr	($b,3);
+		&xor	($hi,@T[0]);
+
+		push(@i,shift(@i)); push(@T,shift(@T));
+	}
+	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
+	&mov	(@T[0],@T[1]);
+	&shl	(@T[1],3*$n);
+	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
+	&shr	(@T[0],32-3*$n);	$n++;
+	&mov	(@i[0],@i[1]);
+	&xor	($lo,@T[1]);
+	&shl	(@i[1],3*$n);
+	&xor	($hi,@T[0]);
+	&shr	(@i[0],32-3*$n);
+	&xor	($lo,@i[1]);
+	&xor	($hi,@i[0]);
+
+	&add	("esp",32+4);
+	&ret	();
+&function_end_B("_mul_1x1_ialu");
+
+# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+&function_begin_B("bn_GF2m_mul_2x2");
+if (!$x86only) {
+	&picmeup("edx","OPENSSL_ia32cap_P");
+	&mov	("eax",&DWP(0,"edx"));
+	&mov	("edx",&DWP(4,"edx"));
+	&test	("eax",1<<23);		# check MMX bit
+	&jz	(&label("ialu"));
+if ($sse2) {
+	&test	("eax",1<<24);		# check FXSR bit
+	&jz	(&label("mmx"));
+	&test	("edx",1<<1);		# check PCLMULQDQ bit
+	&jz	(&label("mmx"));
+
+	&movups		("xmm0",&QWP(8,"esp"));
+	&shufps		("xmm0","xmm0",0b10110001);
+	&pclmulqdq	("xmm0","xmm0",1);
+	&mov		("eax",&DWP(4,"esp"));
+	&movups		(&QWP(0,"eax"),"xmm0");
+	&ret	();
+
+&set_label("mmx",16);
+}
+	&push	("ebp");
+	&push	("ebx");
+	&push	("esi");
+	&push	("edi");
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&call	("_mul_1x1_mmx");	# a1·b1
+	&movq	("mm7",$R);
+
+	&mov	($a,&wparam(2));
+	&mov	($b,&wparam(4));
+	&call	("_mul_1x1_mmx");	# a0·b0
+	&movq	("mm6",$R);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&xor	($a,&wparam(2));
+	&xor	($b,&wparam(4));
+	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
+	&pxor	($R,"mm7");
+	&mov	($a,&wparam(0));
+	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0
+
+	&movq	($A,$R);
+	&psllq	($R,32);
+	&pop	("edi");
+	&psrlq	($A,32);
+	&pop	("esi");
+	&pxor	($R,"mm6");
+	&pop	("ebx");
+	&pxor	($A,"mm7");
+	&movq	(&QWP(0,$a),$R);
+	&pop	("ebp");
+	&movq	(&QWP(8,$a),$A);
+	&emms	();
+	&ret	();
+&set_label("ialu",16);
+}
+	&push	("ebp");
+	&push	("ebx");
+	&push	("esi");
+	&push	("edi");
+	&stack_push(4+1);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&call	("_mul_1x1_ialu");	# a1·b1
+	&mov	(&DWP(8,"esp"),$lo);
+	&mov	(&DWP(12,"esp"),$hi);
+
+	&mov	($a,&wparam(2));
+	&mov	($b,&wparam(4));
+	&call	("_mul_1x1_ialu");	# a0·b0
+	&mov	(&DWP(0,"esp"),$lo);
+	&mov	(&DWP(4,"esp"),$hi);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&xor	($a,&wparam(2));
+	&xor	($b,&wparam(4));
+	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)
+
+	&mov	("ebp",&wparam(0));
+		 @r=("ebx","ecx","edi","esi");
+	&mov	(@r[0],&DWP(0,"esp"));
+	&mov	(@r[1],&DWP(4,"esp"));
+	&mov	(@r[2],&DWP(8,"esp"));
+	&mov	(@r[3],&DWP(12,"esp"));
+
+	&xor	($lo,$hi);
+	&xor	($hi,@r[1]);
+	&xor	($lo,@r[0]);
+	&mov	(&DWP(0,"ebp"),@r[0]);
+	&xor	($hi,@r[2]);
+	&mov	(&DWP(12,"ebp"),@r[3]);
+	&xor	($lo,@r[3]);
+	&stack_pop(4+1);
+	&xor	($hi,@r[3]);
+	&pop	("edi");
+	&xor	($lo,$hi);
+	&pop	("esi");
+	&mov	(&DWP(8,"ebp"),$hi);
+	&pop	("ebx");
+	&mov	(&DWP(4,"ebp"),$lo);
+	&pop	("ebp");
+	&ret	();
+&function_end_B("bn_GF2m_mul_2x2");
+
+&asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();

diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
new file mode 100644
index 0000000..1658acb
--- /dev/null
+++ b/crypto/bn/asm/x86_64-gf2m.pl

@@ -0,0 +1,389 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: code suitable
+# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# Vanilla code path is at most 20% faster than compiler-generated code
+# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
+# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
+# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
+# all CPU time is burnt in it...
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+($lo,$hi)=("%rax","%rdx");	$a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+$code.=<<___;
+.text
+
+.type	_mul_1x1,\@abi-omnipotent
+.align	16
+_mul_1x1:
+	sub	\$128+8,%rsp
+	mov	\$-1,$a1
+	lea	($a,$a),$i0
+	shr	\$3,$a1
+	lea	(,$a,4),$i1
+	and	$a,$a1			# a1=a&0x1fffffffffffffff
+	lea	(,$a,8),$a8
+	sar	\$63,$a			# broadcast 63rd bit
+	lea	($a1,$a1),$a2
+	sar	\$63,$i0		# broadcast 62nd bit
+	lea	(,$a1,4),$a4
+	and	$b,$a
+	sar	\$63,$i1		# boardcast 61st bit
+	mov	$a,$hi			# $a is $lo
+	shl	\$63,$lo
+	and	$b,$i0
+	shr	\$1,$hi
+	mov	$i0,$t1
+	shl	\$62,$i0
+	and	$b,$i1
+	shr	\$2,$t1
+	xor	$i0,$lo
+	mov	$i1,$t0
+	shl	\$61,$i1
+	xor	$t1,$hi
+	shr	\$3,$t0
+	xor	$i1,$lo
+	xor	$t0,$hi
+
+	mov	$a1,$a12
+	movq	\$0,0(%rsp)		# tab[0]=0
+	xor	$a2,$a12		# a1^a2
+	mov	$a1,8(%rsp)		# tab[1]=a1
+	 mov	$a4,$a48
+	mov	$a2,16(%rsp)		# tab[2]=a2
+	 xor	$a8,$a48		# a4^a8
+	mov	$a12,24(%rsp)		# tab[3]=a1^a2
+
+	xor	$a4,$a1
+	mov	$a4,32(%rsp)		# tab[4]=a4
+	xor	$a4,$a2
+	mov	$a1,40(%rsp)		# tab[5]=a1^a4
+	xor	$a4,$a12
+	mov	$a2,48(%rsp)		# tab[6]=a2^a4
+	 xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
+	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
+	 xor	$a48,$a2		# a2^a4^a4^a8=a1^a8
+
+	mov	$a8,64(%rsp)		# tab[8]=a8
+	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
+	mov	$a1,72(%rsp)		# tab[9]=a1^a8
+	 xor	$a4,$a1			# a1^a8^a4
+	mov	$a2,80(%rsp)		# tab[10]=a2^a8
+	 xor	$a4,$a2			# a2^a8^a4
+	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8
+
+	xor	$a4,$a12		# a1^a2^a8^a4
+	mov	$a48,96(%rsp)		# tab[12]=a4^a8
+	 mov	$mask,$i0
+	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
+	 and	$b,$i0
+	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
+	 shr	\$4,$b
+	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
+	 mov	$mask,$i1
+	 and	$b,$i1
+	 shr	\$4,$b
+
+	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
+	mov	$mask,$i0
+	and	$b,$i0
+	shr	\$4,$b
+___
+    for ($n=1;$n<8;$n++) {
+	$code.=<<___;
+	mov	(%rsp,$i1,8),$t1
+	mov	$mask,$i1
+	mov	$t1,$t0
+	shl	\$`8*$n-4`,$t1
+	and	$b,$i1
+	 movq	(%rsp,$i0,8),$Tx
+	shr	\$`64-(8*$n-4)`,$t0
+	xor	$t1,$lo
+	 pslldq	\$$n,$Tx
+	 mov	$mask,$i0
+	shr	\$4,$b
+	xor	$t0,$hi
+	 and	$b,$i0
+	 shr	\$4,$b
+	 pxor	$Tx,$R
+___
+    }
+$code.=<<___;
+	mov	(%rsp,$i1,8),$t1
+	mov	$t1,$t0
+	shl	\$`8*$n-4`,$t1
+	movq	$R,$i0
+	shr	\$`64-(8*$n-4)`,$t0
+	xor	$t1,$lo
+	psrldq	\$8,$R
+	xor	$t0,$hi
+	movq	$R,$i1
+	xor	$i0,$lo
+	xor	$i1,$hi
+
+	add	\$128+8,%rsp
+	ret
+.Lend_mul_1x1:
+.size	_mul_1x1,.-_mul_1x1
+___
+
+($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
+				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,\@abi-omnipotent
+.align	16
+bn_GF2m_mul_2x2:
+	mov	OPENSSL_ia32cap_P(%rip),%rax
+	bt	\$33,%rax
+	jnc	.Lvanilla_mul_2x2
+
+	movq		$a1,%xmm0
+	movq		$b1,%xmm1
+	movq		$a0,%xmm2
+___
+$code.=<<___ if ($win64);
+	movq		40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+	movq		$b0,%xmm3
+___
+$code.=<<___;
+	movdqa		%xmm0,%xmm4
+	movdqa		%xmm1,%xmm5
+	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
+	pxor		%xmm2,%xmm4
+	pxor		%xmm3,%xmm5
+	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
+	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
+	xorps		%xmm0,%xmm4
+	xorps		%xmm2,%xmm4	# (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	movdqa		%xmm4,%xmm5
+	pslldq		\$8,%xmm4
+	psrldq		\$8,%xmm5
+	pxor		%xmm4,%xmm2
+	pxor		%xmm5,%xmm0
+	movdqu		%xmm2,0($rp)
+	movdqu		%xmm0,16($rp)
+	ret
+
+.align	16
+.Lvanilla_mul_2x2:
+	lea	-8*17(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	mov	`8*17+40`(%rsp),$b0
+	mov	%rdi,8*15(%rsp)
+	mov	%rsi,8*16(%rsp)
+___
+$code.=<<___;
+	mov	%r14,8*10(%rsp)
+	mov	%r13,8*11(%rsp)
+	mov	%r12,8*12(%rsp)
+	mov	%rbp,8*13(%rsp)
+	mov	%rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+	mov	$rp,32(%rsp)		# save the arguments
+	mov	$a1,40(%rsp)
+	mov	$a0,48(%rsp)
+	mov	$b1,56(%rsp)
+	mov	$b0,64(%rsp)
+
+	mov	\$0xf,$mask
+	mov	$a1,$a
+	mov	$b1,$b
+	call	_mul_1x1		# a1·b1
+	mov	$lo,16(%rsp)
+	mov	$hi,24(%rsp)
+
+	mov	48(%rsp),$a
+	mov	64(%rsp),$b
+	call	_mul_1x1		# a0·b0
+	mov	$lo,0(%rsp)
+	mov	$hi,8(%rsp)
+
+	mov	40(%rsp),$a
+	mov	56(%rsp),$b
+	xor	48(%rsp),$a
+	xor	64(%rsp),$b
+	call	_mul_1x1		# (a0+a1)·(b0+b1)
+___
+	@r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+	mov	0(%rsp),@r[0]
+	mov	8(%rsp),@r[1]
+	mov	16(%rsp),@r[2]
+	mov	24(%rsp),@r[3]
+	mov	32(%rsp),%rbp
+
+	xor	$hi,$lo
+	xor	@r[1],$hi
+	xor	@r[0],$lo
+	mov	@r[0],0(%rbp)
+	xor	@r[2],$hi
+	mov	@r[3],24(%rbp)
+	xor	@r[3],$lo
+	xor	@r[3],$hi
+	xor	$hi,$lo
+	mov	$hi,16(%rbp)
+	mov	$lo,8(%rbp)
+
+	mov	8*10(%rsp),%r14
+	mov	8*11(%rsp),%r13
+	mov	8*12(%rsp),%r12
+	mov	8*13(%rsp),%rbp
+	mov	8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+	mov	8*15(%rsp),%rdi
+	mov	8*16(%rsp),%rsi
+___
+$code.=<<___;
+	lea	8*17(%rsp),%rsp
+	ret
+.Lend_mul_2x2:
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	152($context),%rax	# pull context->Rsp
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody_mul_2x2(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lin_prologue
+
+	mov	8*10(%rax),%r14		# mimic epilogue
+	mov	8*11(%rax),%r13
+	mov	8*12(%rax),%r12
+	mov	8*13(%rax),%rbp
+	mov	8*14(%rax),%rbx
+	mov	8*15(%rax),%rdi
+	mov	8*16(%rax),%rsi
+
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+
+.Lin_prologue:
+	lea	8*17(%rax),%rax
+	mov	%rax,152($context)	# restore context->Rsp
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	_mul_1x1
+	.rva	.Lend_mul_1x1
+	.rva	.LSEH_info_1x1
+
+	.rva	.Lvanilla_mul_2x2
+	.rva	.Lend_mul_2x2
+	.rva	.LSEH_info_2x2
+.section	.xdata
+.align	8
+.LSEH_info_1x1:
+	.byte	0x01,0x07,0x02,0x00
+	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
+.LSEH_info_2x2:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;

diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 3b7a6f2..5d79b35 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl

@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 # ====================================================================
-# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -15,6 +15,20 @@
 # respectful 50%. It remains to be seen if loop unrolling and
 # dedicated squaring routine can provide further improvement...
 
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but in average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -37,7 +51,6 @@
 $num="%r9";	# int num);
 $lo0="%r10";
 $hi0="%r11";
-$bp="%r12";	# reassign $bp
 $hi1="%r13";
 $i="%r14";
 $j="%r15";
@@ -51,6 +64,16 @@
 .type	bn_mul_mont,\@function,6
 .align	16
 bn_mul_mont:
+	test	\$3,${num}d
+	jnz	.Lmul_enter
+	cmp	\$8,${num}d
+	jb	.Lmul_enter
+	cmp	$ap,$bp
+	jne	.Lmul4x_enter
+	jmp	.Lsqr4x_enter
+
+.align	16
+.Lmul_enter:
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -66,48 +89,66 @@
 	and	\$-1024,%rsp		# minimize TLB usage
 
 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
-.Lprologue:
-	mov	%rdx,$bp		# $bp reassigned, remember?
-
+.Lmul_body:
+	mov	$bp,%r12		# reassign $bp
+___
+		$bp="%r12";
+$code.=<<___;
 	mov	($n0),$n0		# pull n0[0] value
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
 
 	xor	$i,$i			# i=0
 	xor	$j,$j			# j=0
 
-	mov	($bp),$m0		# m0=bp[0]
-	mov	($ap),%rax
+	mov	$n0,$m1
 	mulq	$m0			# ap[0]*bp[0]
 	mov	%rax,$lo0
+	mov	($np),%rax
+
+	imulq	$lo0,$m1		# "tp[0]"*n0
 	mov	%rdx,$hi0
 
-	imulq	$n0,%rax		# "tp[0]"*n0
-	mov	%rax,$m1
-
-	mulq	($np)			# np[0]*m1
-	add	$lo0,%rax		# discarded
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
 	adc	\$0,%rdx
 	mov	%rdx,$hi1
 
 	lea	1($j),$j		# j++
+	jmp	.L1st_enter
+
+.align	16
 .L1st:
+	add	%rax,$hi1
 	mov	($ap,$j,8),%rax
-	mulq	$m0			# ap[j]*bp[0]
-	add	$hi0,%rax
 	adc	\$0,%rdx
-	mov	%rax,$lo0
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	mov	$lo0,$hi0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.L1st_enter:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$hi0
 	mov	($np,$j,8),%rax
-	mov	%rdx,$hi0
+	adc	\$0,%rdx
+	lea	1($j),$j		# j++
+	mov	%rdx,$lo0
 
 	mulq	$m1			# np[j]*m1
-	add	$hi1,%rax
-	lea	1($j),$j		# j++
-	adc	\$0,%rdx
-	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
-	adc	\$0,%rdx
-	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
 	cmp	$num,$j
+	jne	.L1st
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
 	mov	%rdx,$hi1
-	jl	.L1st
+	mov	$lo0,$hi0
 
 	xor	%rdx,%rdx
 	add	$hi0,$hi1
@@ -116,50 +157,64 @@
 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
 
 	lea	1($i),$i		# i++
-.align	4
+	jmp	.Louter
+.align	16
 .Louter:
-	xor	$j,$j			# j=0
-
 	mov	($bp,$i,8),$m0		# m0=bp[i]
-	mov	($ap),%rax		# ap[0]
+	xor	$j,$j			# j=0
+	mov	$n0,$m1
+	mov	(%rsp),$lo0
 	mulq	$m0			# ap[0]*bp[i]
-	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
+	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
 	adc	\$0,%rdx
-	mov	%rax,$lo0
+
+	imulq	$lo0,$m1		# tp[0]*n0
 	mov	%rdx,$hi0
 
-	imulq	$n0,%rax		# tp[0]*n0
-	mov	%rax,$m1
-
-	mulq	($np,$j,8)		# np[0]*m1
-	add	$lo0,%rax		# discarded
-	mov	8(%rsp),$lo0		# tp[1]
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
 	adc	\$0,%rdx
+	mov	8(%rsp),$lo0		# tp[1]
 	mov	%rdx,$hi1
 
 	lea	1($j),$j		# j++
-.align	4
+	jmp	.Linner_enter
+
+.align	16
 .Linner:
+	add	%rax,$hi1
 	mov	($ap,$j,8),%rax
-	mulq	$m0			# ap[j]*bp[i]
-	add	$hi0,%rax
 	adc	\$0,%rdx
-	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.Linner_enter:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$hi0
 	mov	($np,$j,8),%rax
 	adc	\$0,%rdx
+	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
 	mov	%rdx,$hi0
+	adc	\$0,$hi0
+	lea	1($j),$j		# j++
 
 	mulq	$m1			# np[j]*m1
-	add	$hi1,%rax
-	lea	1($j),$j		# j++
-	adc	\$0,%rdx
-	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
-	adc	\$0,%rdx
-	mov	(%rsp,$j,8),$lo0
 	cmp	$num,$j
-	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	jne	.Linner
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
 	mov	%rdx,$hi1
-	jl	.Linner
 
 	xor	%rdx,%rdx
 	add	$hi0,$hi1
@@ -173,34 +228,35 @@
 	cmp	$num,$i
 	jl	.Louter
 
-	lea	(%rsp),$ap		# borrow ap for tp
-	lea	-1($num),$j		# j=num-1
-
-	mov	($ap),%rax		# tp[0]
 	xor	$i,$i			# i=0 and clear CF!
+	mov	(%rsp),%rax		# tp[0]
+	lea	(%rsp),$ap		# borrow ap for tp
+	mov	$num,$j			# j=num
 	jmp	.Lsub
 .align	16
 .Lsub:	sbb	($np,$i,8),%rax
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
-	dec	$j			# doesn't affect CF!
 	mov	8($ap,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
-	jge	.Lsub
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
+	xor	$i,$i
 	and	%rax,$ap
 	not	%rax
 	mov	$rp,$np
 	and	%rax,$np
-	lea	-1($num),$j
+	mov	$num,$j			# j=num
 	or	$np,$ap			# ap=borrow?tp:rp
 .align	16
 .Lcopy:					# copy or in-place refresh
-	mov	($ap,$j,8),%rax
-	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
-	mov	$i,(%rsp,$j,8)		# zap temporary vector
-	dec	$j
-	jge	.Lcopy
+	mov	($ap,$i,8),%rax
+	mov	$i,(%rsp,$i,8)		# zap temporary vector
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	lea	1($i),$i
+	sub	\$1,$j
+	jnz	.Lcopy
 
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
 	mov	\$1,%rax
@@ -211,9 +267,1236 @@
 	mov	32(%rsi),%rbp
 	mov	40(%rsi),%rbx
 	lea	48(%rsi),%rsp
-.Lepilogue:
+.Lmul_epilogue:
 	ret
 .size	bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type	bn_mul4x_mont,\@function,6
+.align	16
+bn_mul4x_mont:
+.Lmul4x_enter:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,${num}d
+	lea	4($num),%r10
+	mov	%rsp,%r11
+	neg	%r10
+	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul4x_body:
+	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
+	mov	%rdx,%r12		# reassign $bp
+___
+		$bp="%r12";
+$code.=<<___;
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$A[0]
+	mov	($np),%rax
+
+	imulq	$A[0],$m1		# "tp[0]"*n0
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	lea	4($j),$j		# j++
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)
+	mov	%rdx,$N[0]
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.L1st4x
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter4x:
+	mov	($bp,$i,8),$m0		# m0=bp[i]
+	xor	$j,$j			# j=0
+	mov	(%rsp),$A[0]
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	imulq	$A[0],$m1		# tp[0]*n0
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# "$N[0]", discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	add	8(%rsp),$A[1]		# +tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4($j),$j		# j+=2
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)		# tp[j-1]
+	mov	%rdx,$N[0]
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.Linner4x
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	1($i),$i		# i++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	cmp	$num,$i
+	jl	.Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+	mov	16(%rsp,$num,8),$rp	# restore $rp
+	mov	0(%rsp),@ri[0]		# tp[0]
+	pxor	%xmm0,%xmm0
+	mov	8(%rsp),@ri[1]		# tp[1]
+	shr	\$2,$num		# num/=4
+	lea	(%rsp),$ap		# borrow ap for tp
+	xor	$i,$i			# i=0 and clear CF!
+
+	sub	0($np),@ri[0]
+	mov	16($ap),@ri[2]		# tp[2]
+	mov	24($ap),@ri[3]		# tp[3]
+	sbb	8($np),@ri[1]
+	lea	-1($num),$j		# j=num/4-1
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	16($np,$i,8),@ri[2]
+	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
+	mov	40($ap,$i,8),@ri[1]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	32($np,$i,8),@ri[0]
+	mov	48($ap,$i,8),@ri[2]
+	mov	56($ap,$i,8),@ri[3]
+	sbb	40($np,$i,8),@ri[1]
+	lea	4($i),$i		# i++
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub4x
+
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	32($ap,$i,8),@ri[0]	# load overflow bit
+	sbb	16($np,$i,8),@ri[2]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+
+	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	xor	$i,$i			# i=0
+	and	@ri[0],$ap
+	not	@ri[0]
+	mov	$rp,$np
+	and	@ri[0],$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+
+	movdqu	($ap),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,($rp)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:					# copy or in-place refresh
+	movdqu	16($ap,$i),%xmm2
+	movdqu	32($ap,$i),%xmm1
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+	movdqa	%xmm0,32(%rsp,$i)
+	movdqu	%xmm1,32($rp,$i)
+	lea	32($i),$i
+	dec	$j
+	jnz	.Lcopy4x
+
+	shl	\$2,$num
+	movdqu	16($ap,$i),%xmm2
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	ret
+.size	bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+{{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi";	# const BN_ULONG *rptr,
+my $aptr="%rsi";	# const BN_ULONG *aptr,
+my $bptr="%rdx";	# not used
+my $nptr="%rcx";	# const BN_ULONG *nptr,
+my $n0  ="%r8";		# const BN_ULONG *n0);
+my $num ="%r9";		# int num, has to be divisible by 4 and
+			# not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type	bn_sqr4x_mont,\@function,6
+.align	16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	shl	\$3,${num}d		# convert $num to bytes
+	xor	%r10,%r10
+	mov	%rsp,%r11		# put aside %rsp
+	sub	$num,%r10		# -$num
+	mov	($n0),$n0		# *n0
+	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
+	and	\$-1024,%rsp		# minimize TLB usage
+	##############################################################
+	# Stack layout
+	#
+	# +0	saved $num, used in reduction section
+	# +8	&t[2*$num], used in reduction section
+	# +32	saved $rptr
+	# +40	saved $nptr
+	# +48	saved *n0
+	# +56	saved %rsp
+	# +64	t[2*$num]
+	#
+	mov	$rptr,32(%rsp)		# save $rptr
+	mov	$nptr,40(%rsp)
+	mov	$n0,  48(%rsp)
+	mov	%r11, 56(%rsp)		# save original %rsp
+.Lsqr4x_body:
+	##############################################################
+	# Squaring part:
+	#
+	# a) multiply-n-add everything but a[i]*a[i];
+	# b) shift result of a) by 1 to the left and accumulate
+	#    a[i]*a[i] products;
+	#
+	lea	32(%r10),$i		# $i=-($num-32)
+	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
+
+	mov	$num,$j			# $j=$num
+
+					# comments apply to $num==8 case
+	mov	-32($aptr,$i),$a0	# a[0]
+	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr,$i),%rax	# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr,$i),$ai	# a[2]
+	mov	%rax,$a1
+
+	mul	$a0			# a[1]*a[0]
+	mov	%rax,$A0[0]		# a[1]*a[0]
+	 mov	$ai,%rax		# a[2]
+	mov	%rdx,$A0[1]
+	mov	$A0[0],-24($tptr,$i)	# t[1]
+
+	xor	$A0[0],$A0[0]
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[0]
+	mov	$A0[1],-16($tptr,$i)	# t[2]
+
+	lea	-16($i),$j		# j=-16
+
+
+	 mov	8($aptr,$j),$ai		# a[3]
+	mul	$a1			# a[2]*a[1]
+	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	mov	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	 lea	16($j),$j
+	adc	\$0,$A0[1]
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[3]
+	jmp	.Lsqr4x_1st
+
+.align	16
+.Lsqr4x_1st:
+	 mov	($aptr,$j),$ai		# a[4]
+	xor	$A1[0],$A1[0]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[0]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[4]*a[0]
+	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
+	 mov	$ai,%rax		# a[3]
+	adc	%rdx,$A0[0]
+	mov	$A0[1],($tptr,$j)	# t[4]
+
+
+	 mov	8($aptr,$j),$ai		# a[5]
+	xor	$A1[1],$A1[1]
+	mul	$a1			# a[4]*a[3]
+	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+	mul	$a0			# a[5]*a[2]
+	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],8($tptr,$j)	# t[5]
+
+	 mov	16($aptr,$j),$ai	# a[6]
+	xor	$A1[0],$A1[0]
+	mul	$a1			# a[5]*a[3]
+	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[0]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[6]*a[2]
+	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
+	 mov	$ai,%rax		# a[3]
+	adc	%rdx,$A0[0]
+	mov	$A0[1],16($tptr,$j)	# t[6]
+
+
+	 mov	24($aptr,$j),$ai	# a[7]
+	xor	$A1[1],$A1[1]
+	mul	$a1			# a[6]*a[5]
+	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	 lea	32($j),$j
+	adc	\$0,$A0[1]
+	mul	$a0			# a[7]*a[4]
+	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[7]
+
+	cmp	\$0,$j
+	jne	.Lsqr4x_1st
+
+	xor	$A1[0],$A1[0]
+	add	$A0[1],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[7]*a[5]
+	add	%rax,$A1[1]
+	adc	%rdx,$A1[0]
+
+	mov	$A1[1],($tptr)		# t[8]
+	lea	16($i),$i
+	mov	$A1[0],8($tptr)		# t[9]
+	jmp	.Lsqr4x_outer
+
+.align	16
+.Lsqr4x_outer:				# comments apply to $num==6 case
+	mov	-32($aptr,$i),$a0	# a[0]
+	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr,$i),%rax	# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr,$i),$ai	# a[2]
+	mov	%rax,$a1
+
+	mov	-24($tptr,$i),$A0[0]	# t[1]
+	xor	$A0[1],$A0[1]
+	mul	$a0			# a[1]*a[0]
+	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
+	 mov	$ai,%rax		# a[2]
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-24($tptr,$i)	# t[1]
+
+	xor	$A0[0],$A0[0]
+	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[0]
+	mov	$A0[1],-16($tptr,$i)	# t[2]
+
+	lea	-16($i),$j		# j=-16
+	xor	$A1[0],$A1[0]
+
+
+	 mov	8($aptr,$j),$ai		# a[3]
+	xor	$A1[1],$A1[1]
+	add	8($tptr,$j),$A1[0]
+	adc	\$0,$A1[1]
+	mul	$a1			# a[2]*a[1]
+	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],8($tptr,$j)	# t[3]
+
+	lea	16($j),$j
+	jmp	.Lsqr4x_inner
+
+.align	16
+.Lsqr4x_inner:
+	 mov	($aptr,$j),$ai		# a[4]
+	xor	$A1[0],$A1[0]
+	add	($tptr,$j),$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[0]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[4]*a[0]
+	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
+	 mov	$ai,%rax		# a[3]
+	adc	%rdx,$A0[0]
+	mov	$A0[1],($tptr,$j)	# t[4]
+
+	 mov	8($aptr,$j),$ai		# a[5]
+	xor	$A1[1],$A1[1]
+	add	8($tptr,$j),$A1[0]
+	adc	\$0,$A1[1]
+	mul	$a1			# a[4]*a[3]
+	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	lea	16($j),$j		# j++
+	adc	\$0,$A0[1]
+	mul	$a0			# a[5]*a[2]
+	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
+
+	cmp	\$0,$j
+	jne	.Lsqr4x_inner
+
+	xor	$A1[0],$A1[0]
+	add	$A0[1],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[5]*a[3]
+	add	%rax,$A1[1]
+	adc	%rdx,$A1[0]
+
+	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
+	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
+
+	add	\$16,$i
+	jnz	.Lsqr4x_outer
+
+					# comments apply to $num==4 case
+	mov	-32($aptr),$a0		# a[0]
+	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr),%rax		# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr),$ai		# a[2]
+	mov	%rax,$a1
+
+	xor	$A0[1],$A0[1]
+	mul	$a0			# a[1]*a[0]
+	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
+	 mov	$ai,%rax		# a[2]
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-24($tptr)	# t[1]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[0]
+	mov	$A0[1],-16($tptr)	# t[2]
+
+	 mov	-8($aptr),$ai		# a[3]
+	mul	$a1			# a[2]*a[1]
+	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
+	 mov	$ai,%rax
+	adc	\$0,%rdx
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	 mov	%rdx,$A1[1]
+	adc	\$0,$A0[1]
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr)	# t[3]
+
+	xor	$A1[0],$A1[0]
+	add	$A0[1],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]
+	 mov	-16($aptr),%rax		# a[2]
+	adc	%rdx,$A1[0]
+
+	mov	$A1[1],($tptr)		# t[4]
+	mov	$A1[0],8($tptr)		# t[5]
+
+	mul	$ai			# a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+	 add	\$16,$i
+	 xor	$shift,$shift
+	 sub	$num,$i			# $i=16-$num
+	 xor	$carry,$carry
+
+	add	$A1[0],%rax		# t[5]
+	adc	\$0,%rdx
+	mov	%rax,8($tptr)		# t[5]
+	mov	%rdx,16($tptr)		# t[6]
+	mov	$carry,24($tptr)	# t[7]
+
+	 mov	-16($aptr,$i),%rax	# a[0]
+	lea	64(%rsp,$num,2),$tptr
+	 xor	$A0[0],$A0[0]		# t[0]
+	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr,$i,2)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],-24($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],-16($tptr,$i,2)
+	adc	%rdx,$S[3]
+	lea	16($i),$i
+	mov	$S[3],-40($tptr,$i,2)
+	sbb	$carry,$carry		# mov cf,$carry
+	jmp	.Lsqr4x_shift_n_add
+
+.align	16
+.Lsqr4x_shift_n_add:
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr,$i,2)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],-24($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],-16($tptr,$i,2)
+	adc	%rdx,$S[3]
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	 mov	$S[3],-8($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],0($tptr,$i,2)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],8($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],16($tptr,$i,2)
+	adc	%rdx,$S[3]
+	mov	$S[3],24($tptr,$i,2)
+	sbb	$carry,$carry		# mov cf,$carry
+	add	\$32,$i
+	jnz	.Lsqr4x_shift_n_add
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
+	 mov	$S[1],-24($tptr)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	adc	%rax,$S[2]
+	adc	%rdx,$S[3]
+	mov	$S[2],-16($tptr)
+	mov	$S[3],-8($tptr)
+___
+}
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+	mov	40(%rsp),$nptr		# restore $nptr
+	mov	48(%rsp),$n0		# restore *n0
+	xor	$j,$j
+	mov	$num,0(%rsp)		# save $num
+	sub	$num,$j			# $j=-$num
+	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
+	 mov	$n0,$m0			#		# modsched #
+	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
+	lea	64(%rsp,$num),$tptr	# end of t[] window
+	mov	%rax,8(%rsp)		# save end of t[] buffer
+	lea	($nptr,$num),$nptr	# end of n[] buffer
+	xor	$topbit,$topbit		# $topbit=0
+
+	mov	0($nptr,$j),%rax	# n[0]		# modsched #
+	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
+	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
+	 mov	%rax,$Ni[0]		#		# modsched #
+	jmp	.Lsqr4x_mont_outer
+
+.align	16
+.Lsqr4x_mont_outer:
+	xor	$A0[1],$A0[1]
+	mul	$m0			# n[0]*m0
+	add	%rax,$A0[0]		# n[0]*m0+t[0]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+	mov	$n0,$m1
+
+	xor	$A0[0],$A0[0]
+	add	8($tptr,$j),$A0[1]
+	adc	\$0,$A0[0]
+	mul	$m0			# n[1]*m0
+	add	%rax,$A0[1]		# n[1]*m0+t[1]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+
+	imulq	$A0[1],$m1
+
+	mov	16($nptr,$j),$Ni[0]	# n[2]
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[0]*m1
+	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],8($tptr,$j)	# "t[1]"
+
+	xor	$A0[1],$A0[1]
+	add	16($tptr,$j),$A0[0]
+	adc	\$0,$A0[1]
+	mul	$m0			# n[2]*m0
+	add	%rax,$A0[0]		# n[2]*m0+t[2]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+
+	mov	24($nptr,$j),$Ni[1]	# n[3]
+	xor	$A1[0],$A1[0]
+	add	$A0[0],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$m1			# n[1]*m1
+	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A1[0]
+	mov	$A1[1],16($tptr,$j)	# "t[2]"
+
+	xor	$A0[0],$A0[0]
+	add	24($tptr,$j),$A0[1]
+	lea	32($j),$j
+	adc	\$0,$A0[0]
+	mul	$m0			# n[3]*m0
+	add	%rax,$A0[1]		# n[3]*m0+t[3]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+	jmp	.Lsqr4x_mont_inner
+
+.align	16
+.Lsqr4x_mont_inner:
+	mov	($nptr,$j),$Ni[0]	# n[4]
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[2]*m1
+	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],-8($tptr,$j)	# "t[3]"
+
+	xor	$A0[1],$A0[1]
+	add	($tptr,$j),$A0[0]
+	adc	\$0,$A0[1]
+	mul	$m0			# n[4]*m0
+	add	%rax,$A0[0]		# n[4]*m0+t[4]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+
+	mov	8($nptr,$j),$Ni[1]	# n[5]
+	xor	$A1[0],$A1[0]
+	add	$A0[0],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$m1			# n[3]*m1
+	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A1[0]
+	mov	$A1[1],($tptr,$j)	# "t[4]"
+
+	xor	$A0[0],$A0[0]
+	add	8($tptr,$j),$A0[1]
+	adc	\$0,$A0[0]
+	mul	$m0			# n[5]*m0
+	add	%rax,$A0[1]		# n[5]*m0+t[5]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+
+
+	mov	16($nptr,$j),$Ni[0]	# n[6]
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[4]*m1
+	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],8($tptr,$j)	# "t[5]"
+
+	xor	$A0[1],$A0[1]
+	add	16($tptr,$j),$A0[0]
+	adc	\$0,$A0[1]
+	mul	$m0			# n[6]*m0
+	add	%rax,$A0[0]		# n[6]*m0+t[6]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+
+	mov	24($nptr,$j),$Ni[1]	# n[7]
+	xor	$A1[0],$A1[0]
+	add	$A0[0],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$m1			# n[5]*m1
+	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A1[0]
+	mov	$A1[1],16($tptr,$j)	# "t[6]"
+
+	xor	$A0[0],$A0[0]
+	add	24($tptr,$j),$A0[1]
+	lea	32($j),$j
+	adc	\$0,$A0[0]
+	mul	$m0			# n[7]*m0
+	add	%rax,$A0[1]		# n[7]*m0+t[7]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+	cmp	\$0,$j
+	jne	.Lsqr4x_mont_inner
+
+	 sub	0(%rsp),$j		# $j=-$num	# modsched #
+	 mov	$n0,$m0			#		# modsched #
+
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[6]*m1
+	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
+	mov	$Ni[1],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],-8($tptr)	# "t[7]"
+
+	xor	$A0[1],$A0[1]
+	add	($tptr),$A0[0]		# +t[8]
+	adc	\$0,$A0[1]
+	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
+	add	$topbit,$A0[0]
+	adc	\$0,$A0[1]
+
+	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
+	xor	$A1[0],$A1[0]
+	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
+	add	$A0[0],$A1[1]
+	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
+	adc	\$0,$A1[0]
+	mul	$m1			# n[7]*m1
+	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
+	 mov	$Ni[0],%rax		#		# modsched #
+	adc	%rdx,$A1[0]
+	mov	$A1[1],($tptr)		# "t[8]"
+
+	xor	$topbit,$topbit
+	add	8($tptr),$A1[0]		# +t[9]
+	adc	$topbit,$topbit
+	add	$A0[1],$A1[0]
+	lea	16($tptr),$tptr		# "t[$num]>>128"
+	adc	\$0,$topbit
+	mov	$A1[0],-8($tptr)	# "t[9]"
+	cmp	8(%rsp),$tptr		# are we done?
+	jb	.Lsqr4x_mont_outer
+
+	mov	0(%rsp),$num		# restore $num
+	mov	$topbit,($tptr)		# save $topbit
+___
+}
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+	mov	64(%rsp,$num),@ri[0]	# tp[0]
+	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
+	mov	40(%rsp),$nptr		# restore $nptr
+	shr	\$5,$num		# num/4
+	mov	8($tptr),@ri[1]		# t[1]
+	xor	$i,$i			# i=0 and clear CF!
+
+	mov	32(%rsp),$rptr		# restore $rptr
+	sub	0($nptr),@ri[0]
+	mov	16($tptr),@ri[2]	# t[2]
+	mov	24($tptr),@ri[3]	# t[3]
+	sbb	8($nptr),@ri[1]
+	lea	-1($num),$j		# j=num/4-1
+	jmp	.Lsqr4x_sub
+.align	16
+.Lsqr4x_sub:
+	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	16($nptr,$i,8),@ri[2]
+	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
+	mov	40($tptr,$i,8),@ri[1]
+	sbb	24($nptr,$i,8),@ri[3]
+	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	32($nptr,$i,8),@ri[0]
+	mov	48($tptr,$i,8),@ri[2]
+	mov	56($tptr,$i,8),@ri[3]
+	sbb	40($nptr,$i,8),@ri[1]
+	lea	4($i),$i		# i++
+	dec	$j			# doesn't affect CF!
+	jnz	.Lsqr4x_sub
+
+	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
+	sbb	16($nptr,$i,8),@ri[2]
+	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	24($nptr,$i,8),@ri[3]
+	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+
+	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	xor	$i,$i			# i=0
+	and	@ri[0],$tptr
+	not	@ri[0]
+	mov	$rptr,$nptr
+	and	@ri[0],$nptr
+	lea	-1($num),$j
+	or	$nptr,$tptr		# tp=borrow?tp:rp
+
+	pxor	%xmm0,%xmm0
+	lea	64(%rsp,$num,8),$nptr
+	movdqu	($tptr),%xmm1
+	lea	($nptr,$num,8),$nptr
+	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
+	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
+	movdqu	%xmm1,($rptr)
+	jmp	.Lsqr4x_copy
+.align	16
+.Lsqr4x_copy:				# copy or in-place refresh
+	movdqu	16($tptr,$i),%xmm2
+	movdqu	32($tptr,$i),%xmm1
+	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
+	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
+	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
+	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
+	movdqu	%xmm2,16($rptr,$i)
+	movdqu	%xmm1,32($rptr,$i)
+	lea	32($i),$i
+	dec	$j
+	jnz	.Lsqr4x_copy
+
+	movdqu	16($tptr,$i),%xmm2
+	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
+	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
+	movdqu	%xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+	mov	56(%rsp),%rsi		# restore %rsp
+	mov	\$1,%rax
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lsqr4x_epilogue:
+	ret
+.size	bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
 .asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align	16
 ___
@@ -228,9 +1511,9 @@
 
 $code.=<<___;
 .extern	__imp_RtlVirtualUnwind
-.type	se_handler,\@abi-omnipotent
+.type	mul_handler,\@abi-omnipotent
 .align	16
-se_handler:
+mul_handler:
 	push	%rsi
 	push	%rdi
 	push	%rbx
@@ -245,15 +1528,20 @@
 	mov	120($context),%rax	# pull context->Rax
 	mov	248($context),%rbx	# pull context->Rip
 
-	lea	.Lprologue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lprologue
-	jb	.Lin_prologue
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
 
 	mov	152($context),%rax	# pull context->Rsp
 
-	lea	.Lepilogue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
-	jae	.Lin_prologue
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
 
 	mov	192($context),%r10	# pull $num
 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
@@ -272,7 +1560,53 @@
 	mov	%r14,232($context)	# restore context->R14
 	mov	%r15,240($context)	# restore context->R15
 
-.Lin_prologue:
+	jmp	.Lcommon_seh_tail
+.size	mul_handler,.-mul_handler
+
+.type	sqr_handler,\@abi-omnipotent
+.align	16
+sqr_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lsqr4x_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lsqr4x_epilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
+	jae	.Lcommon_seh_tail
+
+	mov	56(%rax),%rax		# pull saved stack pointer
+	lea	48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
 	mov	8(%rax),%rdi
 	mov	16(%rax),%rsi
 	mov	%rax,152($context)	# restore context->Rsp
@@ -310,7 +1644,7 @@
 	pop	%rdi
 	pop	%rsi
 	ret
-.size	se_handler,.-se_handler
+.size	sqr_handler,.-sqr_handler
 
 .section	.pdata
 .align	4
@@ -318,11 +1652,27 @@
 	.rva	.LSEH_end_bn_mul_mont
 	.rva	.LSEH_info_bn_mul_mont
 
+	.rva	.LSEH_begin_bn_mul4x_mont
+	.rva	.LSEH_end_bn_mul4x_mont
+	.rva	.LSEH_info_bn_mul4x_mont
+
+	.rva	.LSEH_begin_bn_sqr4x_mont
+	.rva	.LSEH_end_bn_sqr4x_mont
+	.rva	.LSEH_info_bn_sqr4x_mont
+
 .section	.xdata
 .align	8
 .LSEH_info_bn_mul_mont:
 	.byte	9,0,0,0
-	.rva	se_handler
+	.rva	mul_handler
+	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+	.byte	9,0,0,0
+	.rva	sqr_handler
 ___
 }
 

diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
new file mode 100755
index 0000000..057cda2
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont5.pl

@@ -0,0 +1,1070 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to powers table computed in BN_mod_exp_mont_consttime.
+# In addition subroutine that scatters elements of the powers table
+# is implemented, so that scatter-/gathering can be tuned without
+# bn_exp.c modifications.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# int bn_mul_mont_gather5(
+$rp="%rdi";	# BN_ULONG *rp,
+$ap="%rsi";	# const BN_ULONG *ap,
+$bp="%rdx";	# const BN_ULONG *bp,
+$np="%rcx";	# const BN_ULONG *np,
+$n0="%r8";	# const BN_ULONG *n0,
+$num="%r9";	# int num,
+		# int idx);	# 0 to 2^5-1, "index" in $bp holding
+				# pre-computed powers of a', interlaced
+				# in such manner that b[0] is $bp[idx],
+				# b[1] is [2^5+idx], etc.
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl	bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,\@function,6
+.align	64
+bn_mul_mont_gather5:
+	test	\$3,${num}d
+	jnz	.Lmul_enter
+	cmp	\$8,${num}d
+	jb	.Lmul_enter
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	mov	${num}d,${num}d
+	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+	mov	%rsp,%rax
+	lea	2($num),%r11
+	neg	%r11
+	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul_body:
+	mov	$bp,%r12		# reassign $bp
+___
+		$bp="%r12";
+		$STRIDE=2**5*8;		# 5 is "window size"
+		$N=$STRIDE/4;		# should match cache line size
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$`log($N/8)/log(2)`,%r10
+	and	\$`$N/8-1`,%r11
+	not	%r10
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
+	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
+	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
+	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,$m0		# m0=bp[0]
+
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$lo0
+	mov	($np),%rax
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$lo0,$m1		# "tp[0]"*n0
+	mov	%rdx,$hi0
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	mov	$lo0,$hi0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.L1st_enter:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	1($j),$j		# j++
+	mov	%rdx,$lo0
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.L1st
+
+	movq	%xmm0,$m0		# bp[1]
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+	mov	$lo0,$hi0
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	jmp	.Louter
+.align	16
+.Louter:
+	xor	$j,$j			# j=0
+	mov	$n0,$m1
+	mov	(%rsp),$lo0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$lo0,$m1		# tp[0]*n0
+	mov	%rdx,$hi0
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	8(%rsp),$lo0		# tp[1]
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.Linner_enter:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
+	mov	%rdx,$hi0
+	adc	\$0,$hi0
+	lea	1($j),$j		# j++
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.Linner
+
+	movq	%xmm0,$m0		# bp[i+1]
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# pull upmost overflow bit
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	cmp	$num,$i
+	jl	.Louter
+
+	xor	$i,$i			# i=0 and clear CF!
+	mov	(%rsp),%rax		# tp[0]
+	lea	(%rsp),$ap		# borrow ap for tp
+	mov	$num,$j			# j=num
+	jmp	.Lsub
+.align	16
+.Lsub:	sbb	($np,$i,8),%rax
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
+	mov	8($ap,$i,8),%rax	# tp[i+1]
+	lea	1($i),$i		# i++
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub
+
+	sbb	\$0,%rax		# handle upmost overflow bit
+	xor	$i,$i
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
+	mov	$num,$j			# j=num
+	or	$np,$ap			# ap=borrow?tp:rp
+.align	16
+.Lcopy:					# copy or in-place refresh
+	mov	($ap,$i,8),%rax
+	mov	$i,(%rsp,$i,8)		# zap temporary vector
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	lea	1($i),$i
+	sub	\$1,$j
+	jnz	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsi),%xmm6
+	movaps	0x10(%rsi),%xmm7
+	lea	0x28(%rsi),%rsi
+___
+$code.=<<___;
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul_epilogue:
+	ret
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type	bn_mul4x_mont_gather5,\@function,6
+.align	16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+	mov	${num}d,${num}d
+	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+	mov	%rsp,%rax
+	lea	4($num),%r11
+	neg	%r11
+	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul4x_body:
+	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
+	mov	%rdx,%r12		# reassign $bp
+___
+		$bp="%r12";
+		$STRIDE=2**5*8;		# 5 is "window size"
+		$N=$STRIDE/4;		# should match cache line size
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$`log($N/8)/log(2)`,%r10
+	and	\$`$N/8-1`,%r11
+	not	%r10
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
+	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
+	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
+	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,$m0		# m0=bp[0]
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$A[0]
+	mov	($np),%rax
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$A[0],$m1		# "tp[0]"*n0
+	mov	%rdx,$A[1]
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	lea	4($j),$j		# j++
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)
+	mov	%rdx,$N[0]
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.L1st4x
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	movq	%xmm0,$m0		# bp[1]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter4x:
+	xor	$j,$j			# j=0
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mov	(%rsp),$A[0]
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$A[0],$m1		# tp[0]*n0
+	mov	%rdx,$A[1]
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# "$N[0]", discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	add	8(%rsp),$A[1]		# +tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4($j),$j		# j+=2
+	adc	\$0,%rdx
+	mov	%rdx,$N[0]
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.Linner4x
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	1($i),$i		# i++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	movq	%xmm0,$m0		# bp[i+1]
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	cmp	$num,$i
+	jl	.Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+	mov	16(%rsp,$num,8),$rp	# restore $rp
+	mov	0(%rsp),@ri[0]		# tp[0]
+	pxor	%xmm0,%xmm0
+	mov	8(%rsp),@ri[1]		# tp[1]
+	shr	\$2,$num		# num/=4
+	lea	(%rsp),$ap		# borrow ap for tp
+	xor	$i,$i			# i=0 and clear CF!
+
+	sub	0($np),@ri[0]
+	mov	16($ap),@ri[2]		# tp[2]
+	mov	24($ap),@ri[3]		# tp[3]
+	sbb	8($np),@ri[1]
+	lea	-1($num),$j		# j=num/4-1
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	16($np,$i,8),@ri[2]
+	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
+	mov	40($ap,$i,8),@ri[1]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	32($np,$i,8),@ri[0]
+	mov	48($ap,$i,8),@ri[2]
+	mov	56($ap,$i,8),@ri[3]
+	sbb	40($np,$i,8),@ri[1]
+	lea	4($i),$i		# i++
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub4x
+
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	32($ap,$i,8),@ri[0]	# load overflow bit
+	sbb	16($np,$i,8),@ri[2]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+
+	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	xor	$i,$i			# i=0
+	and	@ri[0],$ap
+	not	@ri[0]
+	mov	$rp,$np
+	and	@ri[0],$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+
+	movdqu	($ap),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,($rp)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:					# copy or in-place refresh
+	movdqu	16($ap,$i),%xmm2
+	movdqu	32($ap,$i),%xmm1
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+	movdqa	%xmm0,32(%rsp,$i)
+	movdqu	%xmm1,32($rp,$i)
+	lea	32($i),$i
+	dec	$j
+	jnz	.Lcopy4x
+
+	shl	\$2,$num
+	movdqu	16($ap,$i),%xmm2
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsi),%xmm6
+	movaps	0x10(%rsi),%xmm7
+	lea	0x28(%rsi),%rsi
+___
+$code.=<<___;
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	ret
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+				("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl	bn_scatter5
+.type	bn_scatter5,\@abi-omnipotent
+.align	16
+bn_scatter5:
+	cmp	\$0, $num
+	jz	.Lscatter_epilogue
+	lea	($tbl,$idx,8),$tbl
+.Lscatter:
+	mov	($inp),%rax
+	lea	8($inp),$inp
+	mov	%rax,($tbl)
+	lea	32*8($tbl),$tbl
+	sub	\$1,$num
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	ret
+.size	bn_scatter5,.-bn_scatter5
+
+.globl	bn_gather5
+.type	bn_gather5,\@abi-omnipotent
+.align	16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	mov	$idx,%r11
+	shr	\$`log($N/8)/log(2)`,$idx
+	and	\$`$N/8-1`,%r11
+	not	$idx
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
+	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
+	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
+	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,$idx,8),%xmm7
+	jmp	.Lgather
+.align	16
+.Lgather:
+	movq	`0*$STRIDE/4-96`($tbl),%xmm0
+	movq	`1*$STRIDE/4-96`($tbl),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($tbl),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($tbl),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	$STRIDE($tbl),$tbl
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,($out)		# m0=bp[0]
+	lea	8($out),$out
+	sub	\$1,$num
+	jnz	.Lgather
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	lea	0x28(%rsp),%rsp
+___
+$code.=<<___;
+	ret
+.LSEH_end_bn_gather5:
+.size	bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align	64
+.Lmagic_masks:
+	.long	0,0, 0,0, 0,0, -1,-1
+	.long	0,0, 0,0, 0,0,  0,0
+.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	mul_handler,\@abi-omnipotent
+.align	16
+mul_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	lea	`40+48`(%rax),%rax
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# end of alloca label
+	cmp	%r10,%rbx		# context->Rip<end of alloca label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	8(%r11),%r10d		# HandlerData[2]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	192($context),%r10	# pull $num
+	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
+
+	movaps	(%rax),%xmm0
+	movaps	16(%rax),%xmm1
+	lea	`40+48`(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+	movups	%xmm0,512($context)	# restore context->Xmm6
+	movups	%xmm1,528($context)	# restore context->Xmm7
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	mul_handler,.-mul_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_bn_mul_mont_gather5
+	.rva	.LSEH_end_bn_mul_mont_gather5
+	.rva	.LSEH_info_bn_mul_mont_gather5
+
+	.rva	.LSEH_begin_bn_mul4x_mont_gather5
+	.rva	.LSEH_end_bn_mul4x_mont_gather5
+	.rva	.LSEH_info_bn_mul4x_mont_gather5
+
+	.rva	.LSEH_begin_bn_gather5
+	.rva	.LSEH_end_bn_gather5
+	.rva	.LSEH_info_bn_gather5
+
+.section	.xdata
+.align	8
+.LSEH_info_bn_mul_mont_gather5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
+.align	8
+.LSEH_info_bn_mul4x_mont_gather5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
+.align	8
+.LSEH_info_bn_gather5:
+        .byte   0x01,0x0d,0x05,0x00
+        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
+        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
+        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
+.align	8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;

diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index a0bc478..f34248e 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h

@@ -558,6 +558,17 @@
 int	BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
 		int do_trial_division, BN_GENCB *cb);
 
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+			const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
+			const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+			BIGNUM *Xp1, BIGNUM *Xp2,
+			const BIGNUM *Xp,
+			const BIGNUM *e, BN_CTX *ctx,
+			BN_GENCB *cb);
+
 BN_MONT_CTX *BN_MONT_CTX_new(void );
 void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
 int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
@@ -612,6 +623,8 @@
 int	BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
 	BN_RECP_CTX *recp, BN_CTX *ctx);
 
+#ifndef OPENSSL_NO_EC2M
+
 /* Functions for arithmetic over binary polynomials represented by BIGNUMs. 
  *
  * The BIGNUM::neg property of BIGNUMs representing binary polynomials is
@@ -663,6 +676,8 @@
 int	BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
 int	BN_GF2m_arr2poly(const int p[], BIGNUM *a);
 
+#endif
+
 /* faster mod functions for the 'NIST primes' 
  * 0 <= a < p^2 */
 int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);

diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 802a43d..52b3304 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c

@@ -169,15 +169,13 @@
 #endif /* OPENSSL_NO_ASM */
 
 
-/* BN_div[_no_branch] computes  dv := num / divisor,  rounding towards
+/* BN_div computes  dv := num / divisor,  rounding towards
  * zero, and sets up rm  such that  dv*divisor + rm = num  holds.
  * Thus:
  *     dv->neg == num->neg ^ divisor->neg  (unless the result is zero)
  *     rm->neg == num->neg                 (unless the remainder is zero)
  * If 'dv' or 'rm' is NULL, the respective value is not returned.
  */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
-        const BIGNUM *divisor, BN_CTX *ctx);
 int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	   BN_CTX *ctx)
 	{
@@ -186,6 +184,7 @@
 	BN_ULONG *resp,*wnump;
 	BN_ULONG d0,d1;
 	int num_n,div_n;
+	int no_branch=0;
 
 	/* Invalid zero-padding would have particularly bad consequences
 	 * in the case of 'num', so don't just rely on bn_check_top() for this one
@@ -200,7 +199,7 @@
 
 	if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
 		{
-		return BN_div_no_branch(dv, rm, num, divisor, ctx);
+		no_branch=1;
 		}
 
 	bn_check_top(dv);
@@ -214,7 +213,7 @@
 		return(0);
 		}
 
-	if (BN_ucmp(num,divisor) < 0)
+	if (!no_branch && BN_ucmp(num,divisor) < 0)
 		{
 		if (rm != NULL)
 			{ if (BN_copy(rm,num) == NULL) return(0); }
@@ -239,242 +238,25 @@
 	norm_shift+=BN_BITS2;
 	if (!(BN_lshift(snum,num,norm_shift))) goto err;
 	snum->neg=0;
-	div_n=sdiv->top;
-	num_n=snum->top;
-	loop=num_n-div_n;
-	/* Lets setup a 'window' into snum
-	 * This is the part that corresponds to the current
-	 * 'area' being divided */
-	wnum.neg   = 0;
-	wnum.d     = &(snum->d[loop]);
-	wnum.top   = div_n;
-	/* only needed when BN_ucmp messes up the values between top and max */
-	wnum.dmax  = snum->dmax - loop; /* so we don't step out of bounds */
 
-	/* Get the top 2 words of sdiv */
-	/* div_n=sdiv->top; */
-	d0=sdiv->d[div_n-1];
-	d1=(div_n == 1)?0:sdiv->d[div_n-2];
-
-	/* pointer to the 'top' of snum */
-	wnump= &(snum->d[num_n-1]);
-
-	/* Setup to 'res' */
-	res->neg= (num->neg^divisor->neg);
-	if (!bn_wexpand(res,(loop+1))) goto err;
-	res->top=loop;
-	resp= &(res->d[loop-1]);
-
-	/* space for temp */
-	if (!bn_wexpand(tmp,(div_n+1))) goto err;
-
-	if (BN_ucmp(&wnum,sdiv) >= 0)
+	if (no_branch)
 		{
-		/* If BN_DEBUG_RAND is defined BN_ucmp changes (via
-		 * bn_pollute) the const bignum arguments =>
-		 * clean the values between top and max again */
-		bn_clear_top2max(&wnum);
-		bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
-		*resp=1;
-		}
-	else
-		res->top--;
-	/* if res->top == 0 then clear the neg value otherwise decrease
-	 * the resp pointer */
-	if (res->top == 0)
-		res->neg = 0;
-	else
-		resp--;
-
-	for (i=0; i<loop-1; i++, wnump--, resp--)
-		{
-		BN_ULONG q,l0;
-		/* the first part of the loop uses the top two words of
-		 * snum and sdiv to calculate a BN_ULONG q such that
-		 * | wnum - sdiv * q | < sdiv */
-#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
-		BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
-		q=bn_div_3_words(wnump,d1,d0);
-#else
-		BN_ULONG n0,n1,rem=0;
-
-		n0=wnump[0];
-		n1=wnump[-1];
-		if (n0 == d0)
-			q=BN_MASK2;
-		else 			/* n0 < d0 */
-			{
-#ifdef BN_LLONG
-			BN_ULLONG t2;
-
-#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
-			q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
-#else
-			q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
-			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
-				n0, n1, d0, q);
-#endif
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
-			/*
-			 * rem doesn't have to be BN_ULLONG. The least we
-			 * know it's less that d0, isn't it?
-			 */
-			rem=(n1-q*d0)&BN_MASK2;
-#endif
-			t2=(BN_ULLONG)d1*q;
-
-			for (;;)
-				{
-				if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
-					break;
-				q--;
-				rem += d0;
-				if (rem < d0) break; /* don't let rem overflow */
-				t2 -= d1;
-				}
-#else /* !BN_LLONG */
-			BN_ULONG t2l,t2h;
-
-			q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
-			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
-				n0, n1, d0, q);
-#endif
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
-			rem=(n1-q*d0)&BN_MASK2;
-#endif
-
-#if defined(BN_UMULT_LOHI)
-			BN_UMULT_LOHI(t2l,t2h,d1,q);
-#elif defined(BN_UMULT_HIGH)
-			t2l = d1 * q;
-			t2h = BN_UMULT_HIGH(d1,q);
-#else
-			{
-			BN_ULONG ql, qh;
-			t2l=LBITS(d1); t2h=HBITS(d1);
-			ql =LBITS(q);  qh =HBITS(q);
-			mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
-			}
-#endif
-
-			for (;;)
-				{
-				if ((t2h < rem) ||
-					((t2h == rem) && (t2l <= wnump[-2])))
-					break;
-				q--;
-				rem += d0;
-				if (rem < d0) break; /* don't let rem overflow */
-				if (t2l < d1) t2h--; t2l -= d1;
-				}
-#endif /* !BN_LLONG */
-			}
-#endif /* !BN_DIV3W */
-
-		l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
-		tmp->d[div_n]=l0;
-		wnum.d--;
-		/* ingore top values of the bignums just sub the two 
-		 * BN_ULONG arrays with bn_sub_words */
-		if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
-			{
-			/* Note: As we have considered only the leading
-			 * two BN_ULONGs in the calculation of q, sdiv * q
-			 * might be greater than wnum (but then (q-1) * sdiv
-			 * is less or equal than wnum)
-			 */
-			q--;
-			if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
-				/* we can't have an overflow here (assuming
-				 * that q != 0, but if q == 0 then tmp is
-				 * zero anyway) */
-				(*wnump)++;
-			}
-		/* store part of the result */
-		*resp = q;
-		}
-	bn_correct_top(snum);
-	if (rm != NULL)
-		{
-		/* Keep a copy of the neg flag in num because if rm==num
-		 * BN_rshift() will overwrite it.
+		/* Since we don't know whether snum is larger than sdiv,
+		 * we pad snum with enough zeroes without changing its
+		 * value. 
 		 */
-		int neg = num->neg;
-		BN_rshift(rm,snum,norm_shift);
-		if (!BN_is_zero(rm))
-			rm->neg = neg;
-		bn_check_top(rm);
-		}
-	BN_CTX_end(ctx);
-	return(1);
-err:
-	bn_check_top(rm);
-	BN_CTX_end(ctx);
-	return(0);
-	}
-
-
-/* BN_div_no_branch is a special version of BN_div. It does not contain
- * branches that may leak sensitive information.
- */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, 
-	const BIGNUM *divisor, BN_CTX *ctx)
-	{
-	int norm_shift,i,loop;
-	BIGNUM *tmp,wnum,*snum,*sdiv,*res;
-	BN_ULONG *resp,*wnump;
-	BN_ULONG d0,d1;
-	int num_n,div_n;
-
-	bn_check_top(dv);
-	bn_check_top(rm);
-	/* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
-	bn_check_top(divisor);
-
-	if (BN_is_zero(divisor))
-		{
-		BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
-		return(0);
-		}
-
-	BN_CTX_start(ctx);
-	tmp=BN_CTX_get(ctx);
-	snum=BN_CTX_get(ctx);
-	sdiv=BN_CTX_get(ctx);
-	if (dv == NULL)
-		res=BN_CTX_get(ctx);
-	else	res=dv;
-	if (sdiv == NULL || res == NULL) goto err;
-
-	/* First we normalise the numbers */
-	norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
-	if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
-	sdiv->neg=0;
-	norm_shift+=BN_BITS2;
-	if (!(BN_lshift(snum,num,norm_shift))) goto err;
-	snum->neg=0;
-
-	/* Since we don't know whether snum is larger than sdiv,
-	 * we pad snum with enough zeroes without changing its
-	 * value. 
-	 */
-	if (snum->top <= sdiv->top+1) 
-		{
-		if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
-		for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
-		snum->top = sdiv->top + 2;
-		}
-	else
-		{
-		if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
-		snum->d[snum->top] = 0;
-		snum->top ++;
+		if (snum->top <= sdiv->top+1) 
+			{
+			if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+			for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+			snum->top = sdiv->top + 2;
+			}
+		else
+			{
+			if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+			snum->d[snum->top] = 0;
+			snum->top ++;
+			}
 		}
 
 	div_n=sdiv->top;
@@ -500,12 +282,27 @@
 	/* Setup to 'res' */
 	res->neg= (num->neg^divisor->neg);
 	if (!bn_wexpand(res,(loop+1))) goto err;
-	res->top=loop-1;
+	res->top=loop-no_branch;
 	resp= &(res->d[loop-1]);
 
 	/* space for temp */
 	if (!bn_wexpand(tmp,(div_n+1))) goto err;
 
+	if (!no_branch)
+		{
+		if (BN_ucmp(&wnum,sdiv) >= 0)
+			{
+			/* If BN_DEBUG_RAND is defined BN_ucmp changes (via
+			 * bn_pollute) the const bignum arguments =>
+			 * clean the values between top and max again */
+			bn_clear_top2max(&wnum);
+			bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+			*resp=1;
+			}
+		else
+			res->top--;
+		}
+
 	/* if res->top == 0 then clear the neg value otherwise decrease
 	 * the resp pointer */
 	if (res->top == 0)
@@ -638,7 +435,7 @@
 			rm->neg = neg;
 		bn_check_top(rm);
 		}
-	bn_correct_top(res);
+	if (no_branch)	bn_correct_top(res);
 	BN_CTX_end(ctx);
 	return(1);
 err:
@@ -646,5 +443,4 @@
 	BN_CTX_end(ctx);
 	return(0);
 	}
-
 #endif

diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index d9b6c73..2abf6fd 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c

@@ -113,6 +113,18 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+#  define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+#  define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
 /* maximum precomputation table size for *variable* sliding windows */
 #define TABLE_SIZE	32
 
@@ -522,23 +534,17 @@
  * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
  * from/to that table. */
 
-static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
 	{
 	size_t i, j;
 
-	if (bn_wexpand(b, top) == NULL)
-		return 0;
-	while (b->top < top)
-		{
-		b->d[b->top++] = 0;
-		}
-	
+	if (top > b->top)
+		top = b->top; /* this works because 'buf' is explicitly zeroed */
 	for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
 		{
 		buf[j] = ((unsigned char*)b->d)[i];
 		}
 
-	bn_correct_top(b);
 	return 1;
 	}
 
@@ -561,7 +567,7 @@
 
 /* Given a pointer value, compute the next address that is a cache line multiple. */
 #define MOD_EXP_CTIME_ALIGN(x_) \
-	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
 
 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
  * precomputation memory layout to limit data-dependency to a minimum
@@ -572,17 +578,15 @@
 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 		    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
 	{
-	int i,bits,ret=0,idx,window,wvalue;
+	int i,bits,ret=0,window,wvalue;
 	int top;
- 	BIGNUM *r;
-	const BIGNUM *aa;
 	BN_MONT_CTX *mont=NULL;
 
 	int numPowers;
 	unsigned char *powerbufFree=NULL;
 	int powerbufLen = 0;
 	unsigned char *powerbuf=NULL;
-	BIGNUM *computeTemp=NULL, *am=NULL;
+	BIGNUM tmp, am;
 
 	bn_check_top(a);
 	bn_check_top(p);
@@ -602,10 +606,7 @@
 		return ret;
 		}
 
- 	/* Initialize BIGNUM context and allocate intermediate result */
 	BN_CTX_start(ctx);
-	r = BN_CTX_get(ctx);
-	if (r == NULL) goto err;
 
 	/* Allocate a montgomery context if it was not supplied by the caller.
 	 * If this is not done, things will break in the montgomery part.
@@ -620,40 +621,154 @@
 
 	/* Get the window size to use with size of p. */
 	window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+	if (window==6 && bits<=1024) window=5;	/* ~5% improvement of 2048-bit RSA sign */
+#endif
 
 	/* Allocate a buffer large enough to hold all of the pre-computed
-	 * powers of a.
+	 * powers of am, am itself and tmp.
 	 */
 	numPowers = 1 << window;
-	powerbufLen = sizeof(m->d[0])*top*numPowers;
+	powerbufLen = sizeof(m->d[0])*(top*numPowers +
+				((2*top)>numPowers?(2*top):numPowers));
+#ifdef alloca
+	if (powerbufLen < 3072)
+		powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+	else
+#endif
 	if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
 		goto err;
 		
 	powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
 	memset(powerbuf, 0, powerbufLen);
 
- 	/* Initialize the intermediate result. Do this early to save double conversion,
-	 * once each for a^0 and intermediate result.
-	 */
- 	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
-	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+#ifdef alloca
+	if (powerbufLen < 3072)
+		powerbufFree = NULL;
+#endif
 
-	/* Initialize computeTemp as a^1 with montgomery precalcs */
-	computeTemp = BN_CTX_get(ctx);
-	am = BN_CTX_get(ctx);
-	if (computeTemp==NULL || am==NULL) goto err;
+	/* lay down tmp and am right after powers table */
+	tmp.d     = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+	am.d      = tmp.d + top;
+	tmp.top   = am.top  = 0;
+	tmp.dmax  = am.dmax = top;
+	tmp.neg   = am.neg  = 0;
+	tmp.flags = am.flags = BN_FLG_STATIC_DATA;
 
+	/* prepare a^0 in Montgomery domain */
+#if 1
+ 	if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx))	goto err;
+#else
+	tmp.d[0] = (0-m->d[0])&BN_MASK2;	/* 2^(top*BN_BITS2) - m */
+	for (i=1;i<top;i++)
+		tmp.d[i] = (~m->d[i])&BN_MASK2;
+	tmp.top = top;
+#endif
+
+	/* prepare a^1 in Montgomery domain */
 	if (a->neg || BN_ucmp(a,m) >= 0)
 		{
-		if (!BN_mod(am,a,m,ctx))
-			goto err;
-		aa= am;
+		if (!BN_mod(&am,a,m,ctx))			goto err;
+		if (!BN_to_montgomery(&am,&am,mont,ctx))	goto err;
 		}
-	else
-		aa=a;
-	if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
-	if (!BN_copy(computeTemp, am)) goto err;
-	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+	else	if (!BN_to_montgomery(&am,a,mont,ctx))		goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+    /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+     * specifically optimization of cache-timing attack countermeasures
+     * and pre-computation optimization. */
+
+    /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+     * 512-bit RSA is hardly relevant, we omit it to spare size... */ 
+    if (window==5)
+	{
+	void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+			const void *table,const BN_ULONG *np,
+			const BN_ULONG *n0,int num,int power);
+	void bn_scatter5(const BN_ULONG *inp,size_t num,
+			void *table,size_t power);
+	void bn_gather5(BN_ULONG *out,size_t num,
+			void *table,size_t power);
+
+	BN_ULONG *np=mont->N.d, *n0=mont->n0;
+
+	/* BN_to_montgomery can contaminate words above .top
+	 * [in BN_DEBUG[_DEBUG] build]... */
+	for (i=am.top; i<top; i++)	am.d[i]=0;
+	for (i=tmp.top; i<top; i++)	tmp.d[i]=0;
+
+	bn_scatter5(tmp.d,top,powerbuf,0);
+	bn_scatter5(am.d,am.top,powerbuf,1);
+	bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
+	bn_scatter5(tmp.d,top,powerbuf,2);
+
+#if 0
+	for (i=3; i<32; i++)
+		{
+		/* Calculate a^i = a^(i-1) * a */
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		}
+#else
+	/* same as above, but uses squaring for 1/2 of operations */
+	for (i=4; i<32; i*=2)
+		{
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		}
+	for (i=3; i<8; i+=2)
+		{
+		int j;
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		for (j=2*i; j<32; j*=2)
+			{
+			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+			bn_scatter5(tmp.d,top,powerbuf,j);
+			}
+		}
+	for (; i<16; i+=2)
+		{
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_scatter5(tmp.d,top,powerbuf,2*i);
+		}
+	for (; i<32; i+=2)
+		{
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		}
+#endif
+	bits--;
+	for (wvalue=0, i=bits%5; i>=0; i--,bits--)
+		wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+	bn_gather5(tmp.d,top,powerbuf,wvalue);
+
+	/* Scan the exponent one window at a time starting from the most
+	 * significant bits.
+	 */
+	while (bits >= 0)
+		{
+		for (wvalue=0, i=0; i<5; i++,bits--)
+			wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
+		}
+
+	tmp.top=top;
+	bn_correct_top(&tmp);
+	}
+    else
+#endif
+	{
+	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
+	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am,  top, powerbuf, 1, numPowers)) goto err;
 
 	/* If the window size is greater than 1, then calculate
 	 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
@@ -662,62 +777,54 @@
 	 */
 	if (window > 1)
 		{
-		for (i=2; i<numPowers; i++)
+		if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx))	goto err;
+		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err;
+		for (i=3; i<numPowers; i++)
 			{
 			/* Calculate a^i = a^(i-1) * a */
-			if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+			if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
 				goto err;
-			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
 			}
 		}
 
- 	/* Adjust the number of bits up to a multiple of the window size.
- 	 * If the exponent length is not a multiple of the window size, then
- 	 * this pads the most significant bits with zeros to normalize the
- 	 * scanning loop to there's no special cases.
- 	 *
- 	 * * NOTE: Making the window size a power of two less than the native
-	 * * word size ensures that the padded bits won't go past the last
- 	 * * word in the internal BIGNUM structure. Going past the end will
- 	 * * still produce the correct result, but causes a different branch
- 	 * * to be taken in the BN_is_bit_set function.
- 	 */
- 	bits = ((bits+window-1)/window)*window;
- 	idx=bits-1;	/* The top bit of the window */
-
- 	/* Scan the exponent one window at a time starting from the most
- 	 * significant bits.
- 	 */
- 	while (idx >= 0)
+	bits--;
+	for (wvalue=0, i=bits%window; i>=0; i--,bits--)
+		wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+	if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
+ 
+	/* Scan the exponent one window at a time starting from the most
+	 * significant bits.
+	 */
+ 	while (bits >= 0)
   		{
  		wvalue=0; /* The 'value' of the window */
  		
  		/* Scan the window, squaring the result as we go */
- 		for (i=0; i<window; i++,idx--)
+ 		for (i=0; i<window; i++,bits--)
  			{
-			if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))	goto err;
-			wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+			if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx))	goto err;
+			wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
   			}
  		
 		/* Fetch the appropriate pre-computed value from the pre-buf */
-		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err;
 
  		/* Multiply the result into the intermediate result */
- 		if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+ 		if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err;
   		}
+	}
 
  	/* Convert the final result from montgomery to standard format */
-	if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+	if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
 	ret=1;
 err:
 	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
 	if (powerbuf!=NULL)
 		{
 		OPENSSL_cleanse(powerbuf,powerbufLen);
-		OPENSSL_free(powerbufFree);
+		if (powerbufFree) OPENSSL_free(powerbufFree);
 		}
- 	if (am!=NULL) BN_clear(am);
- 	if (computeTemp!=NULL) BN_clear(computeTemp);
 	BN_CTX_end(ctx);
 	return(ret);
 	}
@@ -988,4 +1095,3 @@
 	bn_check_top(r);
 	return(ret);
 	}
-

diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index 432a3aa..08ab9fd 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c

@@ -94,6 +94,8 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
+#ifndef OPENSSL_NO_EC2M
+
 /* Maximum number of iterations before BN_GF2m_mod_solve_quad_arr should fail. */
 #define MAX_ITERATIONS 50
 
@@ -122,6 +124,7 @@
     SQR_tb[(w) >>  4 & 0xF] <<  8 | SQR_tb[(w)       & 0xF]
 #endif
 
+#if !defined(OPENSSL_BN_ASM_GF2m)
 /* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
  * result is a polynomial r with degree < 2 * BN_BITS - 1
  * The caller MUST ensure that the variables have the right amount
@@ -216,7 +219,9 @@
 	r[2] ^= m1 ^ r[1] ^ r[3];  /* h0 ^= m1 ^ l1 ^ h1; */
 	r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0;  /* l1 ^= l0 ^ h0 ^ m0; */
 	}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif 
 
 /* Add polynomials a and b and store result in r; r could be a or b, a and b 
  * could be equal; r is the bitwise XOR of a and b.
@@ -360,21 +365,17 @@
 int	BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p) + 1;
-	int *arr=NULL;
+	int arr[6];
 	bn_check_top(a);
 	bn_check_top(p);
-	if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
-	ret = BN_GF2m_poly2arr(p, arr, max);
-	if (!ret || ret > max)
+	ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+	if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
 		{
 		BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
-		goto err;
+		return 0;
 		}
 	ret = BN_GF2m_mod_arr(r, a, arr);
 	bn_check_top(r);
-err:
-	if (arr) OPENSSL_free(arr);
 	return ret;
 	}
 
@@ -521,7 +522,7 @@
  */
 int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	{
-	BIGNUM *b, *c, *u, *v, *tmp;
+	BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
 	int ret = 0;
 
 	bn_check_top(a);
@@ -529,18 +530,18 @@
 
 	BN_CTX_start(ctx);
 	
-	b = BN_CTX_get(ctx);
-	c = BN_CTX_get(ctx);
-	u = BN_CTX_get(ctx);
-	v = BN_CTX_get(ctx);
-	if (v == NULL) goto err;
+	if ((b = BN_CTX_get(ctx))==NULL) goto err;
+	if ((c = BN_CTX_get(ctx))==NULL) goto err;
+	if ((u = BN_CTX_get(ctx))==NULL) goto err;
+	if ((v = BN_CTX_get(ctx))==NULL) goto err;
 
-	if (!BN_one(b)) goto err;
 	if (!BN_GF2m_mod(u, a, p)) goto err;
-	if (!BN_copy(v, p)) goto err;
-
 	if (BN_is_zero(u)) goto err;
 
+	if (!BN_copy(v, p)) goto err;
+#if 0
+	if (!BN_one(b)) goto err;
+
 	while (1)
 		{
 		while (!BN_is_odd(u))
@@ -565,13 +566,86 @@
 		if (!BN_GF2m_add(u, u, v)) goto err;
 		if (!BN_GF2m_add(b, b, c)) goto err;
 		}
+#else
+	{
+	int i,	ubits = BN_num_bits(u),
+		vbits = BN_num_bits(v),	/* v is copy of p */
+		top = p->top;
+	BN_ULONG *udp,*bdp,*vdp,*cdp;
 
+	bn_wexpand(u,top);	udp = u->d;
+				for (i=u->top;i<top;i++) udp[i] = 0;
+				u->top = top;
+	bn_wexpand(b,top);	bdp = b->d;
+				bdp[0] = 1;
+				for (i=1;i<top;i++) bdp[i] = 0;
+				b->top = top;
+	bn_wexpand(c,top);	cdp = c->d;
+				for (i=0;i<top;i++) cdp[i] = 0;
+				c->top = top;
+	vdp = v->d;	/* It pays off to "cache" *->d pointers, because
+			 * it allows optimizer to be more aggressive.
+			 * But we don't have to "cache" p->d, because *p
+			 * is declared 'const'... */
+	while (1)
+		{
+		while (ubits && !(udp[0]&1))
+			{
+			BN_ULONG u0,u1,b0,b1,mask;
+
+			u0   = udp[0];
+			b0   = bdp[0];
+			mask = (BN_ULONG)0-(b0&1);
+			b0  ^= p->d[0]&mask;
+			for (i=0;i<top-1;i++)
+				{
+				u1 = udp[i+1];
+				udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+				u0 = u1;
+				b1 = bdp[i+1]^(p->d[i+1]&mask);
+				bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+				b0 = b1;
+				}
+			udp[i] = u0>>1;
+			bdp[i] = b0>>1;
+			ubits--;
+			}
+
+		if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+		if (ubits<vbits)
+			{
+			i = ubits; ubits = vbits; vbits = i;
+			tmp = u; u = v; v = tmp;
+			tmp = b; b = c; c = tmp;
+			udp = vdp; vdp = v->d;
+			bdp = cdp; cdp = c->d;
+			}
+		for(i=0;i<top;i++)
+			{
+			udp[i] ^= vdp[i];
+			bdp[i] ^= cdp[i];
+			}
+		if (ubits==vbits)
+			{
+			bn_correct_top(u);
+			ubits = BN_num_bits(u);
+			}
+		}
+	bn_correct_top(b);
+	}
+#endif
 
 	if (!BN_copy(r, b)) goto err;
 	bn_check_top(r);
 	ret = 1;
 
 err:
+#ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */
+        bn_correct_top(c);
+        bn_correct_top(u);
+        bn_correct_top(v);
+#endif
   	BN_CTX_end(ctx);
 	return ret;
 	}
@@ -1033,3 +1107,4 @@
 	return 1;
 	}
 
+#endif

diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
index 8e5e98e..eecfd8c 100644
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h

@@ -238,7 +238,7 @@
 #  if defined(__DECC)
 #   include <c_asm.h>
 #   define BN_UMULT_HIGH(a,b)	(BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-#  elif defined(__GNUC__)
+#  elif defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret;		\
 	asm ("umulh	%1,%2,%0"	\
@@ -247,7 +247,7 @@
 	ret;			})
 #  endif	/* compiler */
 # elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-#  if defined(__GNUC__)
+#  if defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret;		\
 	asm ("mulhdu	%0,%1,%2"	\
@@ -257,7 +257,7 @@
 #  endif	/* compiler */
 # elif (defined(__x86_64) || defined(__x86_64__)) && \
        (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
-#  if defined(__GNUC__)
+#  if defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret,discard;	\
 	asm ("mulq	%3"		\
@@ -280,6 +280,19 @@
 #   define BN_UMULT_HIGH(a,b)		__umulh((a),(b))
 #   define BN_UMULT_LOHI(low,high,a,b)	((low)=_umul128((a),(b),&(high)))
 #  endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+#  if defined(__GNUC__) && __GNUC__>=2
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	asm ("dmultu	%1,%2"		\
+	     : "=h"(ret)		\
+	     : "r"(a), "r"(b) : "l");	\
+	ret;			})
+#   define BN_UMULT_LOHI(low,high,a,b)	\
+	asm ("dmultu	%2,%3"		\
+	     : "=l"(low),"=h"(high)	\
+	     : "r"(a), "r"(b));
+#  endif
 # endif		/* cpu */
 #endif		/* OPENSSL_NO_ASM */
 
@@ -459,6 +472,10 @@
 	}
 #endif /* !BN_LLONG */
 
+#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
+#undef bn_div_words
+#endif
+
 void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
 void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
 void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);

diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index 5470fbe..7a5676d 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c

@@ -139,25 +139,6 @@
 	return(&const_one);
 	}
 
-char *BN_options(void)
-	{
-	static int init=0;
-	static char data[16];
-
-	if (!init)
-		{
-		init++;
-#ifdef BN_LLONG
-		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
-			     (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
-#else
-		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
-			     (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
-#endif
-		}
-	return(data);
-	}
-
 int BN_num_bits_word(BN_ULONG l)
 	{
 	static const unsigned char bits[256]={

diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 1a86688..427b5cf 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c

@@ -177,31 +177,26 @@
 static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
 	{
 	BIGNUM *n;
-	BN_ULONG *ap,*np,*rp,n0,v,*nrp;
-	int al,nl,max,i,x,ri;
+	BN_ULONG *ap,*np,*rp,n0,v,carry;
+	int nl,max,i;
 
 	n= &(mont->N);
-	/* mont->ri is the size of mont->N in bits (rounded up
-	   to the word size) */
-	al=ri=mont->ri/BN_BITS2;
-
 	nl=n->top;
-	if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+	if (nl == 0) { ret->top=0; return(1); }
 
-	max=(nl+al+1); /* allow for overflow (no?) XXX */
+	max=(2*nl); /* carry is stored separately */
 	if (bn_wexpand(r,max) == NULL) return(0);
 
 	r->neg^=n->neg;
 	np=n->d;
 	rp=r->d;
-	nrp= &(r->d[nl]);
 
 	/* clear the top words of T */
 #if 1
 	for (i=r->top; i<max; i++) /* memset? XXX */
-		r->d[i]=0;
+		rp[i]=0;
 #else
-	memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); 
+	memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); 
 #endif
 
 	r->top=max;
@@ -210,7 +205,7 @@
 #ifdef BN_COUNT
 	fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
 #endif
-	for (i=0; i<nl; i++)
+	for (carry=0, i=0; i<nl; i++, rp++)
 		{
 #ifdef __TANDEM
                 {
@@ -228,61 +223,33 @@
 #else
 		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
 #endif
-		nrp++;
-		rp++;
-		if (((nrp[-1]+=v)&BN_MASK2) >= v)
-			continue;
-		else
-			{
-			if (((++nrp[0])&BN_MASK2) != 0) continue;
-			if (((++nrp[1])&BN_MASK2) != 0) continue;
-			for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
-			}
+		v = (v+carry+rp[nl])&BN_MASK2;
+		carry |= (v != rp[nl]);
+		carry &= (v <= rp[nl]);
+		rp[nl]=v;
 		}
-	bn_correct_top(r);
 
-	/* mont->ri will be a multiple of the word size and below code
-	 * is kind of BN_rshift(ret,r,mont->ri) equivalent */
-	if (r->top <= ri)
-		{
-		ret->top=0;
-		return(1);
-		}
-	al=r->top-ri;
-
-#define BRANCH_FREE 1
-#if BRANCH_FREE
-	if (bn_wexpand(ret,ri) == NULL) return(0);
-	x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
-	ret->top=x=(ri&~x)|(al&x);	/* min(ri,al) */
+	if (bn_wexpand(ret,nl) == NULL) return(0);
+	ret->top=nl;
 	ret->neg=r->neg;
 
 	rp=ret->d;
-	ap=&(r->d[ri]);
+	ap=&(r->d[nl]);
 
+#define BRANCH_FREE 1
+#if BRANCH_FREE
 	{
-	size_t m1,m2;
+	BN_ULONG *nrp;
+	size_t m;
 
-	v=bn_sub_words(rp,ap,np,ri);
-	/* this ----------------^^ works even in al<ri case
-	 * thanks to zealous zeroing of top of the vector in the
-	 * beginning. */
-
-	/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
-	/* in other words if subtraction result is real, then
+	v=bn_sub_words(rp,ap,np,nl)-carry;
+	/* if subtraction result is real, then
 	 * trick unconditional memcpy below to perform in-place
 	 * "refresh" instead of actual copy. */
-	m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
-	m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
-	m1|=m2;			/* (al!=ri) */
-	m1|=(0-(size_t)v);	/* (al!=ri || v) */
-	m1&=~m2;		/* (al!=ri || v) && !al>ri */
-	nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1));
-	}
+	m=(0-(size_t)v);
+	nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
 
-	/* 'i<ri' is chosen to eliminate dependency on input data, even
-	 * though it results in redundant copy in al<ri case. */
-	for (i=0,ri-=4; i<ri; i+=4)
+	for (i=0,nl-=4; i<nl; i+=4)
 		{
 		BN_ULONG t1,t2,t3,t4;
 		
@@ -295,40 +262,15 @@
 		rp[i+2]=t3;
 		rp[i+3]=t4;
 		}
-	for (ri+=4; i<ri; i++)
+	for (nl+=4; i<nl; i++)
 		rp[i]=nrp[i], ap[i]=0;
+	}
+#else
+	if (bn_sub_words (rp,ap,np,nl)-carry)
+		memcpy(rp,ap,nl*sizeof(BN_ULONG));
+#endif
 	bn_correct_top(r);
 	bn_correct_top(ret);
-#else
-	if (bn_wexpand(ret,al) == NULL) return(0);
-	ret->top=al;
-	ret->neg=r->neg;
-
-	rp=ret->d;
-	ap=&(r->d[ri]);
-	al-=4;
-	for (i=0; i<al; i+=4)
-		{
-		BN_ULONG t1,t2,t3,t4;
-		
-		t1=ap[i+0];
-		t2=ap[i+1];
-		t3=ap[i+2];
-		t4=ap[i+3];
-		rp[i+0]=t1;
-		rp[i+1]=t2;
-		rp[i+2]=t3;
-		rp[i+3]=t4;
-		}
-	al+=4;
-	for (; i<al; i++)
-		rp[i]=ap[i];
-
-	if (BN_ucmp(ret, &(mont->N)) >= 0)
-		{
-		if (!BN_usub(ret,ret,&(mont->N))) return(0);
-		}
-#endif
 	bn_check_top(ret);
 
 	return(1);

diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c
index c6de032..43caee4 100644
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c

@@ -319,6 +319,13 @@
 						:(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
 #define bn_32_set_0(to, n)		(((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
 #define bn_cp_32(to,n,from,m)		((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
+# if defined(L_ENDIAN)
+#  if defined(__arch64__)
+#   define NIST_INT64 long
+#  else
+#   define NIST_INT64 long long
+#  endif
+# endif
 #else
 #define bn_cp_64(to, n, from, m) \
 	{ \
@@ -330,13 +337,15 @@
 	bn_32_set_0(to, (n)*2); \
 	bn_32_set_0(to, (n)*2+1); \
 	}
-#if BN_BITS2 == 32
 #define bn_cp_32(to, n, from, m)	(to)[n] = (m>=0)?((from)[m]):0;
 #define bn_32_set_0(to, n)		(to)[n] = (BN_ULONG)0;
-#endif
+# if defined(_WIN32) && !defined(__GNUC__)
+#  define NIST_INT64 __int64
+# elif defined(BN_LLONG)
+#  define NIST_INT64 long long
+# endif
 #endif /* BN_BITS2 != 64 */
 
-
 #define nist_set_192(to, from, a1, a2, a3) \
 	{ \
 	bn_cp_64(to, 0, from, (a3) - 3) \
@@ -350,9 +359,11 @@
 	int      top = a->top, i;
 	int      carry;
 	register BN_ULONG *r_d, *a_d = a->d;
-	BN_ULONG t_d[BN_NIST_192_TOP],
-	         buf[BN_NIST_192_TOP],
-		 c_d[BN_NIST_192_TOP],
+	union	{
+		BN_ULONG	bn[BN_NIST_192_TOP];
+		unsigned int	ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+		} buf;
+	BN_ULONG c_d[BN_NIST_192_TOP],
 		*res;
 	PTR_SIZE_INT mask;
 	static const BIGNUM _bignum_nist_p_192_sqr = {
@@ -385,15 +396,48 @@
 	else
 		r_d = a_d;
 
-	nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+	nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
 
-	nist_set_192(t_d, buf, 0, 3, 3);
+#if defined(NIST_INT64)
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf.ui;
+
+	acc  = rp[0];	acc += bp[3*2-6];
+			acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc += bp[3*2-5];
+			acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[2];	acc += bp[3*2-6];
+			acc += bp[4*2-6];
+			acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[3*2-5];
+			acc += bp[4*2-5];
+			acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[4];	acc += bp[4*2-6];
+			acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[4*2-5];
+			acc += bp[5*2-5]; rp[5] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+	}
+#else
+	{
+	BN_ULONG t_d[BN_NIST_192_TOP];
+
+	nist_set_192(t_d, buf.bn, 0, 3, 3);
 	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-	nist_set_192(t_d, buf, 4, 4, 0);
+	nist_set_192(t_d, buf.bn, 4, 4, 0);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-	nist_set_192(t_d, buf, 5, 5, 5)
+	nist_set_192(t_d, buf.bn, 5, 5, 5)
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-
+	}
+#endif
 	if (carry > 0)
 		carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
 	else
@@ -435,8 +479,7 @@
 	int	top = a->top, i;
 	int	carry;
 	BN_ULONG *r_d, *a_d = a->d;
-	BN_ULONG t_d[BN_NIST_224_TOP],
-	         buf[BN_NIST_224_TOP],
+	BN_ULONG buf[BN_NIST_224_TOP],
 		 c_d[BN_NIST_224_TOP],
 		*res;
 	PTR_SIZE_INT mask;
@@ -474,14 +517,54 @@
 
 #if BN_BITS2==64
 	/* copy upper 256 bits of 448 bit number ... */
-	nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+	nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
 	/* ... and right shift by 32 to obtain upper 224 bits */
-	nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+	nist_set_224(buf, c_d, 14, 13, 12, 11, 10, 9, 8);
 	/* truncate lower part to 224 bits too */
 	r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
 #else
 	nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
 #endif
+
+#if defined(NIST_INT64) && BN_BITS2!=64
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf;
+
+	acc  = rp[0];	acc -= bp[7-7];
+			acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc -= bp[8-7];
+			acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[2];	acc -= bp[9-7];
+			acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[7-7];
+			acc += bp[11-7];
+			acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32;
+
+	acc += rp[4];	acc += bp[8-7];
+			acc += bp[12-7];
+			acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[9-7];
+			acc += bp[13-7];
+			acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[6];	acc += bp[10-7];
+			acc -= bp[13-7]; rp[6] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+# if BN_BITS2==64
+	rp[7] = carry;
+# endif
+	}	
+#else
+	{
+	BN_ULONG t_d[BN_NIST_224_TOP];
+
 	nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
 	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
 	nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
@@ -494,6 +577,8 @@
 #if BN_BITS2==64
 	carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
 #endif
+	}
+#endif
 	u.f = bn_sub_words;
 	if (carry > 0)
 		{
@@ -548,9 +633,11 @@
 	int	i, top = a->top;
 	int	carry = 0;
 	register BN_ULONG *a_d = a->d, *r_d;
-	BN_ULONG t_d[BN_NIST_256_TOP],
-	         buf[BN_NIST_256_TOP],
-		 c_d[BN_NIST_256_TOP],
+	union	{
+		BN_ULONG bn[BN_NIST_256_TOP];
+		unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+		} buf;
+	BN_ULONG c_d[BN_NIST_256_TOP],
 		*res;
 	PTR_SIZE_INT mask;
 	union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -584,12 +671,87 @@
 	else
 		r_d = a_d;
 
-	nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+	nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+
+#if defined(NIST_INT64)
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf.ui;
+
+	acc = rp[0];	acc += bp[8-8];
+			acc += bp[9-8];
+			acc -= bp[11-8];
+			acc -= bp[12-8];
+			acc -= bp[13-8];
+			acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc += bp[9-8];
+			acc += bp[10-8];
+			acc -= bp[12-8];
+			acc -= bp[13-8];
+			acc -= bp[14-8];
+			acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[2];	acc += bp[10-8];
+			acc += bp[11-8];
+			acc -= bp[13-8];
+			acc -= bp[14-8];
+			acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[11-8];
+			acc += bp[11-8];
+			acc += bp[12-8];
+			acc += bp[12-8];
+			acc += bp[13-8];
+			acc -= bp[15-8];
+			acc -= bp[8-8];
+			acc -= bp[9-8];  rp[3] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[4];	acc += bp[12-8];
+			acc += bp[12-8];
+			acc += bp[13-8];
+			acc += bp[13-8];
+			acc += bp[14-8];
+			acc -= bp[9-8];
+			acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[13-8];
+			acc += bp[13-8];
+			acc += bp[14-8];
+			acc += bp[14-8];
+			acc += bp[15-8];
+			acc -= bp[10-8];
+			acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[6];	acc += bp[14-8];
+			acc += bp[14-8];
+			acc += bp[15-8];
+			acc += bp[15-8];
+			acc += bp[14-8];
+			acc += bp[13-8];
+			acc -= bp[8-8];
+			acc -= bp[9-8];  rp[6] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[7];	acc += bp[15-8];
+			acc += bp[15-8];
+			acc += bp[15-8];
+			acc += bp[8 -8];
+			acc -= bp[10-8];
+			acc -= bp[11-8];
+			acc -= bp[12-8];
+			acc -= bp[13-8]; rp[7] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+	}
+#else
+	{
+	BN_ULONG t_d[BN_NIST_256_TOP];
 
 	/*S1*/
-	nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
+	nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
 	/*S2*/
-	nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+	nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
 	carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
 	/* left shift */
 		{
@@ -607,24 +769,26 @@
 		}
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*S3*/
-	nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+	nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*S4*/
-	nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+	nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D1*/
-	nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+	nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D2*/
-	nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+	nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D3*/
-	nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+	nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D4*/
-	nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+	nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 
+	}
+#endif
 	/* see BN_nist_mod_224 for explanation */
 	u.f = bn_sub_words;
 	if (carry > 0)
@@ -672,9 +836,11 @@
 	int	i, top = a->top;
 	int	carry = 0;
 	register BN_ULONG *r_d, *a_d = a->d;
-	BN_ULONG t_d[BN_NIST_384_TOP],
-	         buf[BN_NIST_384_TOP],
-		 c_d[BN_NIST_384_TOP],
+	union	{
+		BN_ULONG bn[BN_NIST_384_TOP];
+		unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+		} buf;
+	BN_ULONG c_d[BN_NIST_384_TOP],
 		*res;
 	PTR_SIZE_INT mask;
 	union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -709,10 +875,100 @@
 	else
 		r_d = a_d;
 
-	nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+	nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+
+#if defined(NIST_INT64)
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf.ui;
+
+	acc = rp[0];	acc += bp[12-12];
+			acc += bp[21-12];
+			acc += bp[20-12];
+			acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc += bp[13-12];
+			acc += bp[22-12];
+			acc += bp[23-12];
+			acc -= bp[12-12];
+			acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[2];	acc += bp[14-12];
+			acc += bp[23-12];
+			acc -= bp[13-12];
+			acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[15-12];
+			acc += bp[12-12];
+			acc += bp[20-12];
+			acc += bp[21-12];
+			acc -= bp[14-12];
+			acc -= bp[22-12];
+			acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[4];	acc += bp[21-12];
+			acc += bp[21-12];
+			acc += bp[16-12];
+			acc += bp[13-12];
+			acc += bp[12-12];
+			acc += bp[20-12];
+			acc += bp[22-12];
+			acc -= bp[15-12];
+			acc -= bp[23-12];
+			acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[22-12];
+			acc += bp[22-12];
+			acc += bp[17-12];
+			acc += bp[14-12];
+			acc += bp[13-12];
+			acc += bp[21-12];
+			acc += bp[23-12];
+			acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[6];	acc += bp[23-12];
+			acc += bp[23-12];
+			acc += bp[18-12];
+			acc += bp[15-12];
+			acc += bp[14-12];
+			acc += bp[22-12];
+			acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[7];	acc += bp[19-12];
+			acc += bp[16-12];
+			acc += bp[15-12];
+			acc += bp[23-12];
+			acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[8];	acc += bp[20-12];
+			acc += bp[17-12];
+			acc += bp[16-12];
+			acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[9];	acc += bp[21-12];
+			acc += bp[18-12];
+			acc += bp[17-12];
+			acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[10];	acc += bp[22-12];
+			acc += bp[19-12];
+			acc += bp[18-12];
+			acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[11];	acc += bp[23-12];
+			acc += bp[20-12];
+			acc += bp[19-12];
+			acc -= bp[22-12]; rp[11] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+	}
+#else
+	{
+	BN_ULONG t_d[BN_NIST_384_TOP];
 
 	/*S1*/
-	nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
+	nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
 		/* left shift */
 		{
 		register BN_ULONG *ap,t,c;
@@ -729,29 +985,31 @@
 	carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
 		t_d, BN_NIST_256_TOP);
 	/*S2 */
-	carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
+	carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
 	/*S3*/
-	nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+	nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S4*/
-	nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+	nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S5*/
-	nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+	nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S6*/
-	nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D1*/
-	nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+	nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D2*/
-	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D3*/
-	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 
+	}
+#endif
 	/* see BN_nist_mod_224 for explanation */
 	u.f = bn_sub_words;
 	if (carry > 0)

diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index bebb466..1743b6a 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c

@@ -357,3 +357,22 @@
 	return(ret);
 	}
 #endif
+
+char *BN_options(void)
+	{
+	static int init=0;
+	static char data[16];
+
+	if (!init)
+		{
+		init++;
+#ifdef BN_LLONG
+		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+			     (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
+#else
+		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+			     (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
+#endif
+		}
+	return(data);
+	}

diff --git a/crypto/bn/bn_shift.c b/crypto/bn/bn_shift.c
index c4d301a..a6fca2c 100644
--- a/crypto/bn/bn_shift.c
+++ b/crypto/bn/bn_shift.c

@@ -99,7 +99,7 @@
 int BN_rshift1(BIGNUM *r, const BIGNUM *a)
 	{
 	BN_ULONG *ap,*rp,t,c;
-	int i;
+	int i,j;
 
 	bn_check_top(r);
 	bn_check_top(a);
@@ -109,22 +109,25 @@
 		BN_zero(r);
 		return(1);
 		}
+	i = a->top;
+	ap= a->d;
+	j = i-(ap[i-1]==1);
 	if (a != r)
 		{
-		if (bn_wexpand(r,a->top) == NULL) return(0);
-		r->top=a->top;
+		if (bn_wexpand(r,j) == NULL) return(0);
 		r->neg=a->neg;
 		}
-	ap=a->d;
 	rp=r->d;
-	c=0;
-	for (i=a->top-1; i>=0; i--)
+	t=ap[--i];
+	c=(t&1)?BN_TBIT:0;
+	if (t>>=1) rp[i]=t;
+	while (i>0)
 		{
-		t=ap[i];
+		t=ap[--i];
 		rp[i]=((t>>1)&BN_MASK2)|c;
 		c=(t&1)?BN_TBIT:0;
 		}
-	bn_correct_top(r);
+	r->top=j;
 	bn_check_top(r);
 	return(1);
 	}
@@ -182,10 +185,11 @@
 		BN_zero(r);
 		return(1);
 		}
+	i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2;
 	if (r != a)
 		{
 		r->neg=a->neg;
-		if (bn_wexpand(r,a->top-nw+1) == NULL) return(0);
+		if (bn_wexpand(r,i) == NULL) return(0);
 		}
 	else
 		{
@@ -196,7 +200,7 @@
 	f= &(a->d[nw]);
 	t=r->d;
 	j=a->top-nw;
-	r->top=j;
+	r->top=i;
 
 	if (rb == 0)
 		{
@@ -212,9 +216,8 @@
 			l= *(f++);
 			*(t++) =(tmp|(l<<lb))&BN_MASK2;
 			}
-		*(t++) =(l>>rb)&BN_MASK2;
+		if ((l = (l>>rb)&BN_MASK2)) *(t) = l;
 		}
-	bn_correct_top(r);
 	bn_check_top(r);
 	return(1);
 	}

diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 0cd99c5..06f5954 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c

@@ -262,7 +262,7 @@
 	message(out,"BN_mod_sqrt");
 	if (!test_sqrt(out,ctx)) goto err;
 	(void)BIO_flush(out);
-
+#ifndef OPENSSL_NO_EC2M
 	message(out,"BN_GF2m_add");
 	if (!test_gf2m_add(out)) goto err;
 	(void)BIO_flush(out);
@@ -298,7 +298,7 @@
 	message(out,"BN_GF2m_mod_solve_quad");
 	if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
 	(void)BIO_flush(out);
-
+#endif
 	BN_CTX_free(ctx);
 	BIO_free(out);
 
@@ -1061,7 +1061,7 @@
 	BN_free(one);
 	return(1);
 	}
-
+#ifndef OPENSSL_NO_EC2M
 int test_gf2m_add(BIO *bp)
 	{
 	BIGNUM a,b,c;
@@ -1636,7 +1636,7 @@
 	BN_free(e);
 	return ret;
 	}
-
+#endif
 static int genprime_cb(int p, int n, BN_GENCB *arg)
 	{
 	char c='*';

diff --git a/crypto/buffer/buf_str.c b/crypto/buffer/buf_str.c
new file mode 100644
index 0000000..151f5ea
--- /dev/null
+++ b/crypto/buffer/buf_str.c

@@ -0,0 +1,119 @@
+/* crypto/buffer/buffer.c */
+/* Copyright (C) 1995-1998 Eric Young ([email protected])
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young ([email protected]).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson ([email protected]).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young ([email protected])"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson ([email protected])"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/buffer.h>
+
+char *BUF_strdup(const char *str)
+	{
+	if (str == NULL) return(NULL);
+	return BUF_strndup(str, strlen(str));
+	}
+
+char *BUF_strndup(const char *str, size_t siz)
+	{
+	char *ret;
+
+	if (str == NULL) return(NULL);
+
+	ret=OPENSSL_malloc(siz+1);
+	if (ret == NULL) 
+		{
+		BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
+		return(NULL);
+		}
+	BUF_strlcpy(ret,str,siz+1);
+	return(ret);
+	}
+
+void *BUF_memdup(const void *data, size_t siz)
+	{
+	void *ret;
+
+	if (data == NULL) return(NULL);
+
+	ret=OPENSSL_malloc(siz);
+	if (ret == NULL) 
+		{
+		BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
+		return(NULL);
+		}
+	return memcpy(ret, data, siz);
+	}	
+
+size_t BUF_strlcpy(char *dst, const char *src, size_t size)
+	{
+	size_t l = 0;
+	for(; size > 1 && *src; size--)
+		{
+		*dst++ = *src++;
+		l++;
+		}
+	if (size)
+		*dst = '\0';
+	return l + strlen(src);
+	}
+
+size_t BUF_strlcat(char *dst, const char *src, size_t size)
+	{
+	size_t l = 0;
+	for(; size > 0 && *dst; size--, dst++)
+		l++;
+	return l + BUF_strlcpy(dst, src, size);
+	}

diff --git a/crypto/buffer/buffer.c b/crypto/buffer/buffer.c
index 620ea8d..f4b358b 100644
--- a/crypto/buffer/buffer.c
+++ b/crypto/buffer/buffer.c

@@ -162,64 +162,6 @@
 	return(len);
 	}
 
-char *BUF_strdup(const char *str)
-	{
-	if (str == NULL) return(NULL);
-	return BUF_strndup(str, strlen(str));
-	}
-
-char *BUF_strndup(const char *str, size_t siz)
-	{
-	char *ret;
-
-	if (str == NULL) return(NULL);
-
-	ret=OPENSSL_malloc(siz+1);
-	if (ret == NULL) 
-		{
-		BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
-		return(NULL);
-		}
-	BUF_strlcpy(ret,str,siz+1);
-	return(ret);
-	}
-
-void *BUF_memdup(const void *data, size_t siz)
-	{
-	void *ret;
-
-	if (data == NULL) return(NULL);
-
-	ret=OPENSSL_malloc(siz);
-	if (ret == NULL) 
-		{
-		BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
-		return(NULL);
-		}
-	return memcpy(ret, data, siz);
-	}	
-
-size_t BUF_strlcpy(char *dst, const char *src, size_t size)
-	{
-	size_t l = 0;
-	for(; size > 1 && *src; size--)
-		{
-		*dst++ = *src++;
-		l++;
-		}
-	if (size)
-		*dst = '\0';
-	return l + strlen(src);
-	}
-
-size_t BUF_strlcat(char *dst, const char *src, size_t size)
-	{
-	size_t l = 0;
-	for(; size > 0 && *dst; size--, dst++)
-		l++;
-	return l + BUF_strlcpy(dst, src, size);
-	}
-
 void BUF_reverse(unsigned char *out, unsigned char *in, size_t size)
 	{
 	size_t i;

diff --git a/crypto/cmac/cm_ameth.c b/crypto/cmac/cm_ameth.c
new file mode 100644
index 0000000..0b8e567
--- /dev/null
+++ b/crypto/cmac/cm_ameth.c

@@ -0,0 +1,97 @@
+/* Written by Dr Stephen N Henson ([email protected]) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "asn1_locl.h"
+
+/* CMAC "ASN1" method. This is just here to indicate the
+ * maximum CMAC output length and to free up a CMAC
+ * key.
+ */
+
+static int cmac_size(const EVP_PKEY *pkey)
+	{
+	return EVP_MAX_BLOCK_LENGTH;
+	}
+
+static void cmac_key_free(EVP_PKEY *pkey)
+	{
+	CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr;
+	if (cmctx)
+		CMAC_CTX_free(cmctx);
+	}
+
+const EVP_PKEY_ASN1_METHOD cmac_asn1_meth = 
+	{
+	EVP_PKEY_CMAC,
+	EVP_PKEY_CMAC,
+	0,
+
+	"CMAC",
+	"OpenSSL CMAC method",
+
+	0,0,0,0,
+
+	0,0,0,
+
+	cmac_size,
+	0,
+	0,0,0,0,0,0,0,
+
+	cmac_key_free,
+	0,
+	0,0
+	};
+

diff --git a/crypto/cmac/cm_pmeth.c b/crypto/cmac/cm_pmeth.c
new file mode 100644
index 0000000..072228e
--- /dev/null
+++ b/crypto/cmac/cm_pmeth.c

@@ -0,0 +1,224 @@
+/* Written by Dr Stephen N Henson ([email protected]) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "evp_locl.h"
+
+/* The context structure and "key" is simply a CMAC_CTX */
+
+static int pkey_cmac_init(EVP_PKEY_CTX *ctx)
+	{
+	ctx->data = CMAC_CTX_new();
+	if (!ctx->data)
+		return 0;
+	ctx->keygen_info_count = 0;
+	return 1;
+	}
+
+static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
+	{
+	if (!pkey_cmac_init(dst))
+		return 0;
+	if (!CMAC_CTX_copy(dst->data, src->data))
+		return 0;
+	return 1;
+	}
+
+static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
+	{
+	CMAC_CTX_free(ctx->data);
+	}
+
+static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
+	{
+	CMAC_CTX *cmkey = CMAC_CTX_new();
+	CMAC_CTX *cmctx = ctx->data;
+	if (!cmkey)
+		return 0;
+	if (!CMAC_CTX_copy(cmkey, cmctx))
+		{
+		CMAC_CTX_free(cmkey);
+		return 0;
+		}
+	EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey);
+	
+	return 1;
+	}
+
+static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
+	{
+	if (!CMAC_Update(ctx->pctx->data, data, count))
+		return 0;
+	return 1;
+	}
+
+static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx)
+	{
+	EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT);
+	mctx->update = int_update;
+	return 1;
+	}
+
+static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+					EVP_MD_CTX *mctx)
+	{
+	return CMAC_Final(ctx->data, sig, siglen);
+	}
+
+static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
+	{
+	CMAC_CTX *cmctx = ctx->data;
+	switch (type)
+		{
+
+		case EVP_PKEY_CTRL_SET_MAC_KEY:
+		if (!p2 || p1 < 0)
+			return 0;
+		if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
+			return 0;
+		break;
+
+		case EVP_PKEY_CTRL_CIPHER:
+		if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
+			return 0;
+		break;
+
+		case EVP_PKEY_CTRL_MD:
+		if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
+					(CMAC_CTX *)ctx->pkey->pkey.ptr))
+			return 0;
+		if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
+			return 0;
+		break;
+
+		default:
+		return -2;
+
+		}
+	return 1;
+	}
+
+static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx,
+			const char *type, const char *value)
+	{
+	if (!value)
+		{
+		return 0;
+		}
+	if (!strcmp(type, "key"))
+		{
+		void *p = (void *)value;
+		return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY,
+								strlen(p), p);
+		}
+	if (!strcmp(type, "cipher"))
+		{
+		const EVP_CIPHER *c;
+		c = EVP_get_cipherbyname(value);
+		if (!c)
+			return 0;
+		return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c);
+		}
+	if (!strcmp(type, "hexkey"))
+		{
+		unsigned char *key;
+		int r;
+		long keylen;
+		key = string_to_hex(value, &keylen);
+		if (!key)
+			return 0;
+		r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key);
+		OPENSSL_free(key);
+		return r;
+		}
+	return -2;
+	}
+
+const EVP_PKEY_METHOD cmac_pkey_meth = 
+	{
+	EVP_PKEY_CMAC,
+	EVP_PKEY_FLAG_SIGCTX_CUSTOM,
+	pkey_cmac_init,
+	pkey_cmac_copy,
+	pkey_cmac_cleanup,
+
+	0, 0,
+
+	0,
+	pkey_cmac_keygen,
+
+	0, 0,
+
+	0, 0,
+
+	0,0,
+
+	cmac_signctx_init,
+	cmac_signctx,
+
+	0,0,
+
+	0,0,
+
+	0,0,
+
+	0,0,
+
+	pkey_cmac_ctrl,
+	pkey_cmac_ctrl_str
+
+	};

diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c
new file mode 100644
index 0000000..b586026
--- /dev/null
+++ b/crypto/cmac/cmac.c

@@ -0,0 +1,306 @@
+/* crypto/cmac/cmac.c */
+/* Written by Dr Stephen N Henson ([email protected]) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cryptlib.h"
+#include <openssl/cmac.h>
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+struct CMAC_CTX_st
+	{
+	/* Cipher context to use */
+	EVP_CIPHER_CTX cctx;
+	/* Keys k1 and k2 */
+	unsigned char k1[EVP_MAX_BLOCK_LENGTH];
+	unsigned char k2[EVP_MAX_BLOCK_LENGTH];
+	/* Temporary block */
+	unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
+	/* Last (possibly partial) block */
+	unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
+	/* Number of bytes in last block: -1 means context not initialised */
+	int nlast_block;
+	};
+
+
+/* Make temporary keys K1 and K2 */
+
+static void make_kn(unsigned char *k1, unsigned char *l, int bl)
+	{
+	int i;
+	/* Shift block to left, including carry */
+	for (i = 0; i < bl; i++)
+		{
+		k1[i] = l[i] << 1;
+		if (i < bl - 1 && l[i + 1] & 0x80)
+			k1[i] |= 1;
+		}
+	/* If MSB set fixup with R */
+	if (l[0] & 0x80)
+		k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b;
+	}
+
+CMAC_CTX *CMAC_CTX_new(void)
+	{
+	CMAC_CTX *ctx;
+	ctx = OPENSSL_malloc(sizeof(CMAC_CTX));
+	if (!ctx)
+		return NULL;
+	EVP_CIPHER_CTX_init(&ctx->cctx);
+	ctx->nlast_block = -1;
+	return ctx;
+	}
+
+void CMAC_CTX_cleanup(CMAC_CTX *ctx)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->cctx.engine)
+		{
+		FIPS_cmac_ctx_cleanup(ctx);
+		return;
+		}
+#endif
+	EVP_CIPHER_CTX_cleanup(&ctx->cctx);
+	OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH);
+	OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH);
+	OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH);
+	OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH);
+	ctx->nlast_block = -1;
+	}
+
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
+	{
+	return &ctx->cctx;
+	}
+
+void CMAC_CTX_free(CMAC_CTX *ctx)
+	{
+	CMAC_CTX_cleanup(ctx);
+	OPENSSL_free(ctx);
+	}
+
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
+	{
+	int bl;
+	if (in->nlast_block == -1)
+		return 0;
+	if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
+		return 0;
+	bl = EVP_CIPHER_CTX_block_size(&in->cctx);
+	memcpy(out->k1, in->k1, bl);
+	memcpy(out->k2, in->k2, bl);
+	memcpy(out->tbl, in->tbl, bl);
+	memcpy(out->last_block, in->last_block, bl);
+	out->nlast_block = in->nlast_block;
+	return 1;
+	}
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, 
+			const EVP_CIPHER *cipher, ENGINE *impl)
+	{
+	static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		{
+		/* If we have an ENGINE need to allow non FIPS */
+		if ((impl || ctx->cctx.engine)
+			&& !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+
+			{
+			EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
+			return 0;
+			}
+		/* Other algorithm blocking will be done in FIPS_cmac_init,
+		 * via FIPS_cipherinit().
+		 */
+		if (!impl && !ctx->cctx.engine)
+			return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
+		}
+#endif
+	/* All zeros means restart */
+	if (!key && !cipher && !impl && keylen == 0)
+		{
+		/* Not initialised */
+		if (ctx->nlast_block == -1)
+			return 0;
+		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+			return 0;
+		return 1;
+		}
+	/* Initialiase context */
+	if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
+		return 0;
+	/* Non-NULL key means initialisation complete */
+	if (key)
+		{
+		int bl;
+		if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
+			return 0;
+		if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
+			return 0;
+		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
+			return 0;
+		bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
+			return 0;
+		make_kn(ctx->k1, ctx->tbl, bl);
+		make_kn(ctx->k2, ctx->k1, bl);
+		OPENSSL_cleanse(ctx->tbl, bl);
+		/* Reset context again ready for first data block */
+		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+			return 0;
+		/* Zero tbl so resume works */
+		memset(ctx->tbl, 0, bl);
+		ctx->nlast_block = 0;
+		}
+	return 1;
+	}
+
+int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
+	{
+	const unsigned char *data = in;
+	size_t bl;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->cctx.engine)
+		return FIPS_cmac_update(ctx, in, dlen);
+#endif
+	if (ctx->nlast_block == -1)
+		return 0;
+	if (dlen == 0)
+		return 1;
+	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+	/* Copy into partial block if we need to */
+	if (ctx->nlast_block > 0)
+		{
+		size_t nleft;
+		nleft = bl - ctx->nlast_block;
+		if (dlen < nleft)
+			nleft = dlen;
+		memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
+		dlen -= nleft;
+		ctx->nlast_block += nleft;
+		/* If no more to process return */
+		if (dlen == 0)
+			return 1;
+		data += nleft;
+		/* Else not final block so encrypt it */
+		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
+			return 0;
+		}
+	/* Encrypt all but one of the complete blocks left */
+	while(dlen > bl)
+		{
+		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
+			return 0;
+		dlen -= bl;
+		data += bl;
+		}
+	/* Copy any data left to last block buffer */
+	memcpy(ctx->last_block, data, dlen);
+	ctx->nlast_block = dlen;
+	return 1;
+
+	}
+
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
+	{
+	int i, bl, lb;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->cctx.engine)
+		return FIPS_cmac_final(ctx, out, poutlen);
+#endif
+	if (ctx->nlast_block == -1)
+		return 0;
+	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+	*poutlen = (size_t)bl;
+	if (!out)
+		return 1;
+	lb = ctx->nlast_block;
+	/* Is last block complete? */
+	if (lb == bl)
+		{
+		for (i = 0; i < bl; i++)
+			out[i] = ctx->last_block[i] ^ ctx->k1[i];
+		}
+	else
+		{
+		ctx->last_block[lb] = 0x80;
+		if (bl - lb > 1)
+			memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
+		for (i = 0; i < bl; i++)
+			out[i] = ctx->last_block[i] ^ ctx->k2[i];
+		}
+	if (!EVP_Cipher(&ctx->cctx, out, out, bl))
+		{
+		OPENSSL_cleanse(out, bl);	
+		return 0;
+		}
+	return 1;
+	}
+
+int CMAC_resume(CMAC_CTX *ctx)
+	{
+	if (ctx->nlast_block == -1)
+		return 0;
+	/* The buffer "tbl" containes the last fully encrypted block
+	 * which is the last IV (or all zeroes if no last encrypted block).
+	 * The last block has not been modified since CMAC_final().
+	 * So reinitliasing using the last decrypted block will allow
+	 * CMAC to continue after calling CMAC_Final(). 
+	 */
+	return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
+	}

diff --git a/crypto/cmac/cmac.h b/crypto/cmac/cmac.h
new file mode 100644
index 0000000..712e92d
--- /dev/null
+++ b/crypto/cmac/cmac.h

@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson ([email protected]) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMAC_H
+#define HEADER_CMAC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/evp.h>
+
+/* Opaque */
+typedef struct CMAC_CTX_st CMAC_CTX;
+
+CMAC_CTX *CMAC_CTX_new(void);
+void CMAC_CTX_cleanup(CMAC_CTX *ctx);
+void CMAC_CTX_free(CMAC_CTX *ctx);
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, 
+			const EVP_CIPHER *cipher, ENGINE *impl);
+int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
+int CMAC_resume(CMAC_CTX *ctx);
+
+#ifdef  __cplusplus
+}
+#endif
+#endif

diff --git a/crypto/comp/c_rle.c b/crypto/comp/c_rle.c
index 18bceae..47dfb67 100644
--- a/crypto/comp/c_rle.c
+++ b/crypto/comp/c_rle.c

@@ -30,7 +30,7 @@
 	{
 	/* int i; */
 
-	if (olen < (ilen+1))
+	if (ilen == 0 || olen < (ilen-1))
 		{
 		/* ZZZZZZZZZZZZZZZZZZZZZZ */
 		return(-1);
@@ -46,7 +46,7 @@
 	{
 	int i;
 
-	if (ilen == 0 || olen < (ilen-1))
+	if (olen < (ilen-1))
 		{
 		/* ZZZZZZZZZZZZZZZZZZZZZZ */
 		return(-1);

diff --git a/crypto/cpt_err.c b/crypto/cpt_err.c
index 139b928..289005f 100644
--- a/crypto/cpt_err.c
+++ b/crypto/cpt_err.c

@@ -1,6 +1,6 @@
 /* crypto/cpt_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -76,6 +76,7 @@
 {ERR_FUNC(CRYPTO_F_CRYPTO_SET_EX_DATA),	"CRYPTO_set_ex_data"},
 {ERR_FUNC(CRYPTO_F_DEF_ADD_INDEX),	"DEF_ADD_INDEX"},
 {ERR_FUNC(CRYPTO_F_DEF_GET_CLASS),	"DEF_GET_CLASS"},
+{ERR_FUNC(CRYPTO_F_FIPS_MODE_SET),	"FIPS_mode_set"},
 {ERR_FUNC(CRYPTO_F_INT_DUP_EX_DATA),	"INT_DUP_EX_DATA"},
 {ERR_FUNC(CRYPTO_F_INT_FREE_EX_DATA),	"INT_FREE_EX_DATA"},
 {ERR_FUNC(CRYPTO_F_INT_NEW_EX_DATA),	"INT_NEW_EX_DATA"},
@@ -84,6 +85,7 @@
 
 static ERR_STRING_DATA CRYPTO_str_reasons[]=
 	{
+{ERR_REASON(CRYPTO_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
 {ERR_REASON(CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK),"no dynlock create callback"},
 {0,NULL}
 	};

diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index 24fe123..766ea8c 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c

@@ -409,6 +409,10 @@
 void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
 					      const char *file,int line))
 	{
+	/* Calling this here ensures initialisation before any threads
+	 * are started.
+	 */
+	OPENSSL_init();
 	locking_callback=func;
 	}
 
@@ -661,28 +665,52 @@
 	defined(__INTEL__) || \
 	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
 
-unsigned long  OPENSSL_ia32cap_P=0;
-unsigned long *OPENSSL_ia32cap_loc(void) { return &OPENSSL_ia32cap_P; }
+unsigned int  OPENSSL_ia32cap_P[2];
+unsigned long *OPENSSL_ia32cap_loc(void)
+{   if (sizeof(long)==4)
+	/*
+	 * If 32-bit application pulls address of OPENSSL_ia32cap_P[0]
+	 * clear second element to maintain the illusion that vector
+	 * is 32-bit.
+	 */
+	OPENSSL_ia32cap_P[1]=0;
+    return (unsigned long *)OPENSSL_ia32cap_P;
+}
 
 #if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY)
 #define OPENSSL_CPUID_SETUP
+#if defined(_WIN32)
+typedef unsigned __int64 IA32CAP;
+#else
+typedef unsigned long long IA32CAP;
+#endif
 void OPENSSL_cpuid_setup(void)
 { static int trigger=0;
-  unsigned long OPENSSL_ia32_cpuid(void);
+  IA32CAP OPENSSL_ia32_cpuid(void);
+  IA32CAP vec;
   char *env;
 
     if (trigger)	return;
 
     trigger=1;
-    if ((env=getenv("OPENSSL_ia32cap")))
-	OPENSSL_ia32cap_P = strtoul(env,NULL,0)|(1<<10);
+    if ((env=getenv("OPENSSL_ia32cap"))) {
+	int off = (env[0]=='~')?1:0;
+#if defined(_WIN32)
+	if (!sscanf(env+off,"%I64i",&vec)) vec = strtoul(env+off,NULL,0);
+#else
+	if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0);
+#endif
+	if (off) vec = OPENSSL_ia32_cpuid()&~vec;
+    }
     else
-	OPENSSL_ia32cap_P = OPENSSL_ia32_cpuid()|(1<<10);
+	vec = OPENSSL_ia32_cpuid();
     /*
      * |(1<<10) sets a reserved bit to signal that variable
      * was initialized already... This is to avoid interference
      * with cpuid snippets in ELF .init segment.
      */
+    OPENSSL_ia32cap_P[0] = (unsigned int)vec|(1<<10);
+    OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32);
 }
 #endif
 

diff --git a/crypto/cryptlib.h b/crypto/cryptlib.h
index fc249c5..1761f6b 100644
--- a/crypto/cryptlib.h
+++ b/crypto/cryptlib.h

@@ -99,7 +99,7 @@
 #define HEX_SIZE(type)		(sizeof(type)*2)
 
 void OPENSSL_cpuid_setup(void);
-extern unsigned long OPENSSL_ia32cap_P;
+extern unsigned int OPENSSL_ia32cap_P[];
 void OPENSSL_showfatal(const char *,...);
 void *OPENSSL_stderr(void);
 extern int OPENSSL_NONPIC_relocated;

diff --git a/crypto/crypto.h b/crypto/crypto.h
index b0360ce..6aeda0a 100644
--- a/crypto/crypto.h
+++ b/crypto/crypto.h

@@ -547,6 +547,33 @@
 #define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
 int OPENSSL_isservice(void);
 
+int FIPS_mode(void);
+int FIPS_mode_set(int r);
+
+void OPENSSL_init(void);
+
+#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
+
+#ifdef OPENSSL_FIPS
+#define fips_md_init_ctx(alg, cx) \
+	int alg##_Init(cx##_CTX *c) \
+	{ \
+	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+		"Low level API call to digest " #alg " forbidden in FIPS mode!"); \
+	return private_##alg##_Init(c); \
+	} \
+	int private_##alg##_Init(cx##_CTX *c)
+
+#define fips_cipher_abort(alg) \
+	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+		"Low level API call to cipher " #alg " forbidden in FIPS mode!")
+
+#else
+#define fips_md_init_ctx(alg, cx) \
+	int alg##_Init(cx##_CTX *c)
+#define fips_cipher_abort(alg) while(0)
+#endif
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -562,11 +589,13 @@
 #define CRYPTO_F_CRYPTO_SET_EX_DATA			 102
 #define CRYPTO_F_DEF_ADD_INDEX				 104
 #define CRYPTO_F_DEF_GET_CLASS				 105
+#define CRYPTO_F_FIPS_MODE_SET				 109
 #define CRYPTO_F_INT_DUP_EX_DATA			 106
 #define CRYPTO_F_INT_FREE_EX_DATA			 107
 #define CRYPTO_F_INT_NEW_EX_DATA			 108
 
 /* Reason codes. */
+#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED		 101
 #define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK		 100
 
 #ifdef  __cplusplus

diff --git a/crypto/des/des.h b/crypto/des/des.h
index 92b6663..1eaedcb 100644
--- a/crypto/des/des.h
+++ b/crypto/des/des.h

@@ -224,6 +224,9 @@
 int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule);
 int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule);
 void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#ifdef OPENSSL_FIPS
+void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#endif
 void DES_string_to_key(const char *str,DES_cblock *key);
 void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2);
 void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length,

diff --git a/crypto/des/set_key.c b/crypto/des/set_key.c
index 3004cc3..d3e69ca 100644
--- a/crypto/des/set_key.c
+++ b/crypto/des/set_key.c

@@ -65,6 +65,8 @@
  */
 #include "des_locl.h"
 
+#include <openssl/crypto.h>
+
 OPENSSL_IMPLEMENT_GLOBAL(int,DES_check_key,0)	/* defaults to false */
 
 static const unsigned char odd_parity[256]={
@@ -335,6 +337,13 @@
 	}
 
 void DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(DES);
+	private_DES_set_key_unchecked(key, schedule);
+	}
+void private_DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#endif
 	{
 	static const int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0};
 	register DES_LONG c,d,t,s,t2;

diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h
index 849309a..ea59e61 100644
--- a/crypto/dh/dh.h
+++ b/crypto/dh/dh.h

@@ -86,6 +86,21 @@
                                        * be used for all exponents.
                                        */
 
+/* If this flag is set the DH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its reposibility
+ * to ensure the result is compliant.
+ */
+
+#define DH_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DH_FLAG_NON_FIPS_ALLOW			0x0400
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -230,6 +245,9 @@
 #define DH_F_COMPUTE_KEY				 102
 #define DH_F_DHPARAMS_PRINT_FP				 101
 #define DH_F_DH_BUILTIN_GENPARAMS			 106
+#define DH_F_DH_COMPUTE_KEY				 114
+#define DH_F_DH_GENERATE_KEY				 115
+#define DH_F_DH_GENERATE_PARAMETERS_EX			 116
 #define DH_F_DH_NEW_METHOD				 105
 #define DH_F_DH_PARAM_DECODE				 107
 #define DH_F_DH_PRIV_DECODE				 110
@@ -249,7 +267,9 @@
 #define DH_R_DECODE_ERROR				 104
 #define DH_R_INVALID_PUBKEY				 102
 #define DH_R_KEYS_NOT_SET				 108
+#define DH_R_KEY_SIZE_TOO_SMALL				 110
 #define DH_R_MODULUS_TOO_LARGE				 103
+#define DH_R_NON_FIPS_METHOD				 111
 #define DH_R_NO_PARAMETERS_SET				 107
 #define DH_R_NO_PRIVATE_VALUE				 100
 #define DH_R_PARAMETER_ENCODING_ERROR			 105

diff --git a/crypto/dh/dh_ameth.c b/crypto/dh/dh_ameth.c
index 377caf9..02ec2d4 100644
--- a/crypto/dh/dh_ameth.c
+++ b/crypto/dh/dh_ameth.c

@@ -493,6 +493,7 @@
 	dh_copy_parameters,
 	dh_cmp_parameters,
 	dh_param_print,
+	0,
 
 	int_dh_free,
 	0

diff --git a/crypto/dh/dh_err.c b/crypto/dh/dh_err.c
index d5cf0c2..56d3df7 100644
--- a/crypto/dh/dh_err.c
+++ b/crypto/dh/dh_err.c

@@ -1,6 +1,6 @@
 /* crypto/dh/dh_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -73,6 +73,9 @@
 {ERR_FUNC(DH_F_COMPUTE_KEY),	"COMPUTE_KEY"},
 {ERR_FUNC(DH_F_DHPARAMS_PRINT_FP),	"DHparams_print_fp"},
 {ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS),	"DH_BUILTIN_GENPARAMS"},
+{ERR_FUNC(DH_F_DH_COMPUTE_KEY),	"DH_compute_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_KEY),	"DH_generate_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX),	"DH_generate_parameters_ex"},
 {ERR_FUNC(DH_F_DH_NEW_METHOD),	"DH_new_method"},
 {ERR_FUNC(DH_F_DH_PARAM_DECODE),	"DH_PARAM_DECODE"},
 {ERR_FUNC(DH_F_DH_PRIV_DECODE),	"DH_PRIV_DECODE"},
@@ -95,7 +98,9 @@
 {ERR_REASON(DH_R_DECODE_ERROR)           ,"decode error"},
 {ERR_REASON(DH_R_INVALID_PUBKEY)         ,"invalid public key"},
 {ERR_REASON(DH_R_KEYS_NOT_SET)           ,"keys not set"},
+{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL)     ,"key size too small"},
 {ERR_REASON(DH_R_MODULUS_TOO_LARGE)      ,"modulus too large"},
+{ERR_REASON(DH_R_NON_FIPS_METHOD)        ,"non fips method"},
 {ERR_REASON(DH_R_NO_PARAMETERS_SET)      ,"no parameters set"},
 {ERR_REASON(DH_R_NO_PRIVATE_VALUE)       ,"no private value"},
 {ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},

diff --git a/crypto/dh/dh_gen.c b/crypto/dh/dh_gen.c
index cfd5b11..7b1fe9c 100644
--- a/crypto/dh/dh_gen.c
+++ b/crypto/dh/dh_gen.c

@@ -66,12 +66,29 @@
 #include <openssl/bn.h>
 #include <openssl/dh.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 static int dh_builtin_genparams(DH *ret, int prime_len, int generator, BN_GENCB *cb);
 
 int DH_generate_parameters_ex(DH *ret, int prime_len, int generator, BN_GENCB *cb)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ret->meth->flags & DH_FLAG_FIPS_METHOD)
+			&& !(ret->flags & DH_FLAG_NON_FIPS_ALLOW))
+		{
+		DHerr(DH_F_DH_GENERATE_PARAMETERS_EX, DH_R_NON_FIPS_METHOD);
+		return 0;
+		}
+#endif
 	if(ret->meth->generate_params)
 		return ret->meth->generate_params(ret, prime_len, generator, cb);
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_dh_generate_parameters_ex(ret, prime_len,
+							generator, cb);
+#endif
 	return dh_builtin_genparams(ret, prime_len, generator, cb);
 	}
 

diff --git a/crypto/dh/dh_key.c b/crypto/dh/dh_key.c
index e7db440..89a74db 100644
--- a/crypto/dh/dh_key.c
+++ b/crypto/dh/dh_key.c

@@ -73,11 +73,27 @@
 
 int DH_generate_key(DH *dh)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+			&& !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+		{
+		DHerr(DH_F_DH_GENERATE_KEY, DH_R_NON_FIPS_METHOD);
+		return 0;
+		}
+#endif
 	return dh->meth->generate_key(dh);
 	}
 
 int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+			&& !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+		{
+		DHerr(DH_F_DH_COMPUTE_KEY, DH_R_NON_FIPS_METHOD);
+		return 0;
+		}
+#endif
 	return dh->meth->compute_key(key, pub_key, dh);
 	}
 
@@ -138,8 +154,21 @@
 
 	if (generate_new_key)
 		{
-		l = dh->length ? dh->length : BN_num_bits(dh->p)-1; /* secret exponent length */
-		if (!BN_rand(priv_key, l, 0, 0)) goto err;
+		if (dh->q)
+			{
+			do
+				{
+				if (!BN_rand_range(priv_key, dh->q))
+					goto err;
+				}
+			while (BN_is_zero(priv_key) || BN_is_one(priv_key));
+			}
+		else
+			{
+			/* secret exponent length */
+			l = dh->length ? dh->length : BN_num_bits(dh->p)-1;
+			if (!BN_rand(priv_key, l, 0, 0)) goto err;
+			}
 		}
 
 	{

diff --git a/crypto/dh/dh_lib.c b/crypto/dh/dh_lib.c
index 7aef080..00218f2 100644
--- a/crypto/dh/dh_lib.c
+++ b/crypto/dh/dh_lib.c

@@ -64,6 +64,10 @@
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const char DH_version[]="Diffie-Hellman" OPENSSL_VERSION_PTEXT;
 
 static const DH_METHOD *default_DH_method = NULL;
@@ -76,7 +80,16 @@
 const DH_METHOD *DH_get_default_method(void)
 	{
 	if(!default_DH_method)
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_dh_openssl();
+		else
+			return DH_OpenSSL();
+#else
 		default_DH_method = DH_OpenSSL();
+#endif
+		}
 	return default_DH_method;
 	}
 
@@ -156,7 +169,7 @@
 	ret->counter = NULL;
 	ret->method_mont_p=NULL;
 	ret->references = 1;
-	ret->flags=ret->meth->flags;
+	ret->flags=ret->meth->flags & ~DH_FLAG_NON_FIPS_ALLOW;
 	CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DH, ret, &ret->ex_data);
 	if ((ret->meth->init != NULL) && !ret->meth->init(ret))
 		{

diff --git a/crypto/dsa/dsa.h b/crypto/dsa/dsa.h
index ac50a5c..a6f6d0b 100644
--- a/crypto/dsa/dsa.h
+++ b/crypto/dsa/dsa.h

@@ -97,6 +97,21 @@
                                               * be used for all exponents.
                                               */
 
+/* If this flag is set the DSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its reposibility
+ * to ensure the result is compliant.
+ */
+
+#define DSA_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DSA_FLAG_NON_FIPS_ALLOW			0x0400
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -272,6 +287,8 @@
 #define DSA_F_DSAPARAMS_PRINT_FP			 101
 #define DSA_F_DSA_DO_SIGN				 112
 #define DSA_F_DSA_DO_VERIFY				 113
+#define DSA_F_DSA_GENERATE_KEY				 124
+#define DSA_F_DSA_GENERATE_PARAMETERS_EX		 123
 #define DSA_F_DSA_NEW_METHOD				 103
 #define DSA_F_DSA_PARAM_DECODE				 119
 #define DSA_F_DSA_PRINT_FP				 105
@@ -282,6 +299,7 @@
 #define DSA_F_DSA_SIGN					 106
 #define DSA_F_DSA_SIGN_SETUP				 107
 #define DSA_F_DSA_SIG_NEW				 109
+#define DSA_F_DSA_SIG_PRINT				 125
 #define DSA_F_DSA_VERIFY				 108
 #define DSA_F_I2D_DSA_SIG				 111
 #define DSA_F_OLD_DSA_PRIV_DECODE			 122
@@ -298,6 +316,8 @@
 #define DSA_R_INVALID_DIGEST_TYPE			 106
 #define DSA_R_MISSING_PARAMETERS			 101
 #define DSA_R_MODULUS_TOO_LARGE				 103
+#define DSA_R_NEED_NEW_SETUP_VALUES			 110
+#define DSA_R_NON_FIPS_DSA_METHOD			 111
 #define DSA_R_NO_PARAMETERS_SET				 107
 #define DSA_R_PARAMETER_ENCODING_ERROR			 105
 

diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c
index 6413aae..376156e 100644
--- a/crypto/dsa/dsa_ameth.c
+++ b/crypto/dsa/dsa_ameth.c

@@ -542,6 +542,52 @@
 	return i2d_DSAPrivateKey(pkey->pkey.dsa, pder);
 	}
 
+static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+					const ASN1_STRING *sig,
+					int indent, ASN1_PCTX *pctx)
+	{
+	DSA_SIG *dsa_sig;
+	const unsigned char *p;
+	if (!sig)
+		{
+		if (BIO_puts(bp, "\n") <= 0)
+			return 0;
+		else
+			return 1;
+		}
+	p = sig->data;
+	dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length);
+	if (dsa_sig)
+		{
+		int rv = 0;
+		size_t buf_len = 0;
+		unsigned char *m=NULL;
+		update_buflen(dsa_sig->r, &buf_len);
+		update_buflen(dsa_sig->s, &buf_len);
+		m = OPENSSL_malloc(buf_len+10);
+		if (m == NULL)
+			{
+			DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+
+		if (BIO_write(bp, "\n", 1) != 1)
+			goto err;
+
+		if (!ASN1_bn_print(bp,"r:   ",dsa_sig->r,m,indent))
+			goto err;
+		if (!ASN1_bn_print(bp,"s:   ",dsa_sig->s,m,indent))
+			goto err;
+		rv = 1;
+		err:
+		if (m)
+			OPENSSL_free(m);
+		DSA_SIG_free(dsa_sig);
+		return rv;
+		}
+	return X509_signature_dump(bp, sig, indent);
+	}
+
 static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
 	{
 	switch (op)
@@ -647,6 +693,7 @@
 		dsa_copy_parameters,
 		dsa_cmp_parameters,
 		dsa_param_print,
+		dsa_sig_print,
 
 		int_dsa_free,
 		dsa_pkey_ctrl,

diff --git a/crypto/dsa/dsa_asn1.c b/crypto/dsa/dsa_asn1.c
index c37460b..6058534 100644
--- a/crypto/dsa/dsa_asn1.c
+++ b/crypto/dsa/dsa_asn1.c

@@ -61,6 +61,7 @@
 #include <openssl/dsa.h>
 #include <openssl/asn1.h>
 #include <openssl/asn1t.h>
+#include <openssl/rand.h>
 
 /* Override the default new methods */
 static int sig_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -87,7 +88,7 @@
 	ASN1_SIMPLE(DSA_SIG, s, CBIGNUM)
 } ASN1_SEQUENCE_END_cb(DSA_SIG, DSA_SIG)
 
-IMPLEMENT_ASN1_FUNCTIONS_const(DSA_SIG)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DSA_SIG, DSA_SIG, DSA_SIG)
 
 /* Override the default free and new methods */
 static int dsa_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -148,3 +149,40 @@
 	{
 	return ASN1_item_dup(ASN1_ITEM_rptr(DSAparams), dsa);
 	}
+
+int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
+	     unsigned int *siglen, DSA *dsa)
+	{
+	DSA_SIG *s;
+	RAND_seed(dgst, dlen);
+	s=DSA_do_sign(dgst,dlen,dsa);
+	if (s == NULL)
+		{
+		*siglen=0;
+		return(0);
+		}
+	*siglen=i2d_DSA_SIG(s,&sig);
+	DSA_SIG_free(s);
+	return(1);
+	}
+
+/* data has already been hashed (probably with SHA or SHA-1). */
+/* returns
+ *      1: correct signature
+ *      0: incorrect signature
+ *     -1: error
+ */
+int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
+	     const unsigned char *sigbuf, int siglen, DSA *dsa)
+	{
+	DSA_SIG *s;
+	int ret=-1;
+
+	s = DSA_SIG_new();
+	if (s == NULL) return(ret);
+	if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
+	ret=DSA_do_verify(dgst,dgst_len,s,dsa);
+err:
+	DSA_SIG_free(s);
+	return(ret);
+	}

diff --git a/crypto/dsa/dsa_err.c b/crypto/dsa/dsa_err.c
index bba984e..00545b7 100644
--- a/crypto/dsa/dsa_err.c
+++ b/crypto/dsa/dsa_err.c

@@ -1,6 +1,6 @@
 /* crypto/dsa/dsa_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -76,6 +76,8 @@
 {ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP),	"DSAparams_print_fp"},
 {ERR_FUNC(DSA_F_DSA_DO_SIGN),	"DSA_do_sign"},
 {ERR_FUNC(DSA_F_DSA_DO_VERIFY),	"DSA_do_verify"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_KEY),	"DSA_generate_key"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_PARAMETERS_EX),	"DSA_generate_parameters_ex"},
 {ERR_FUNC(DSA_F_DSA_NEW_METHOD),	"DSA_new_method"},
 {ERR_FUNC(DSA_F_DSA_PARAM_DECODE),	"DSA_PARAM_DECODE"},
 {ERR_FUNC(DSA_F_DSA_PRINT_FP),	"DSA_print_fp"},
@@ -86,6 +88,7 @@
 {ERR_FUNC(DSA_F_DSA_SIGN),	"DSA_sign"},
 {ERR_FUNC(DSA_F_DSA_SIGN_SETUP),	"DSA_sign_setup"},
 {ERR_FUNC(DSA_F_DSA_SIG_NEW),	"DSA_SIG_new"},
+{ERR_FUNC(DSA_F_DSA_SIG_PRINT),	"DSA_SIG_PRINT"},
 {ERR_FUNC(DSA_F_DSA_VERIFY),	"DSA_verify"},
 {ERR_FUNC(DSA_F_I2D_DSA_SIG),	"i2d_DSA_SIG"},
 {ERR_FUNC(DSA_F_OLD_DSA_PRIV_DECODE),	"OLD_DSA_PRIV_DECODE"},
@@ -105,6 +108,8 @@
 {ERR_REASON(DSA_R_INVALID_DIGEST_TYPE)   ,"invalid digest type"},
 {ERR_REASON(DSA_R_MISSING_PARAMETERS)    ,"missing parameters"},
 {ERR_REASON(DSA_R_MODULUS_TOO_LARGE)     ,"modulus too large"},
+{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES) ,"need new setup values"},
+{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD)   ,"non fips dsa method"},
 {ERR_REASON(DSA_R_NO_PARAMETERS_SET)     ,"no parameters set"},
 {ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
 {0,NULL}

diff --git a/crypto/dsa/dsa_gen.c b/crypto/dsa/dsa_gen.c
index cb0b453..c398761 100644
--- a/crypto/dsa/dsa_gen.c
+++ b/crypto/dsa/dsa_gen.c

@@ -81,13 +81,33 @@
 #include <openssl/sha.h>
 #include "dsa_locl.h"
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 int DSA_generate_parameters_ex(DSA *ret, int bits,
 		const unsigned char *seed_in, int seed_len,
 		int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ret->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(ret->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_GENERATE_PARAMETERS_EX, DSA_R_NON_FIPS_DSA_METHOD);
+		return 0;
+		}
+#endif
 	if(ret->meth->dsa_paramgen)
 		return ret->meth->dsa_paramgen(ret, bits, seed_in, seed_len,
 				counter_ret, h_ret, cb);
+#ifdef OPENSSL_FIPS
+	else if (FIPS_mode())
+		{
+		return FIPS_dsa_generate_parameters_ex(ret, bits, 
+							seed_in, seed_len,
+							counter_ret, h_ret, cb);
+		}
+#endif
 	else
 		{
 		const EVP_MD *evpmd;
@@ -105,12 +125,13 @@
 			}
 
 		return dsa_builtin_paramgen(ret, bits, qbits, evpmd,
-				seed_in, seed_len, counter_ret, h_ret, cb);
+			seed_in, seed_len, NULL, counter_ret, h_ret, cb);
 		}
 	}
 
 int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
 	const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+	unsigned char *seed_out,
 	int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
 	{
 	int ok=0;
@@ -201,8 +222,10 @@
 				}
 
 			/* step 2 */
-			EVP_Digest(seed, qsize, md,   NULL, evpmd, NULL);
-			EVP_Digest(buf,  qsize, buf2, NULL, evpmd, NULL);
+			if (!EVP_Digest(seed, qsize, md,   NULL, evpmd, NULL))
+				goto err;
+			if (!EVP_Digest(buf,  qsize, buf2, NULL, evpmd, NULL))
+				goto err;
 			for (i = 0; i < qsize; i++)
 				md[i]^=buf2[i];
 
@@ -251,7 +274,9 @@
 						break;
 					}
 
-				EVP_Digest(buf, qsize, md ,NULL, evpmd, NULL);
+				if (!EVP_Digest(buf, qsize, md ,NULL, evpmd,
+									NULL))
+					goto err;
 
 				/* step 8 */
 				if (!BN_bin2bn(md, qsize, r0))
@@ -332,6 +357,8 @@
 			}
 		if (counter_ret != NULL) *counter_ret=counter;
 		if (h_ret != NULL) *h_ret=h;
+		if (seed_out)
+			memcpy(seed_out, seed, qsize);
 		}
 	if(ctx)
 		{

diff --git a/crypto/dsa/dsa_key.c b/crypto/dsa/dsa_key.c
index c4aa86b..9cf669b 100644
--- a/crypto/dsa/dsa_key.c
+++ b/crypto/dsa/dsa_key.c

@@ -64,12 +64,28 @@
 #include <openssl/dsa.h>
 #include <openssl/rand.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 static int dsa_builtin_keygen(DSA *dsa);
 
 int DSA_generate_key(DSA *dsa)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_GENERATE_KEY, DSA_R_NON_FIPS_DSA_METHOD);
+		return 0;
+		}
+#endif
 	if(dsa->meth->dsa_keygen)
 		return dsa->meth->dsa_keygen(dsa);
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_dsa_generate_key(dsa);
+#endif
 	return dsa_builtin_keygen(dsa);
 	}
 

diff --git a/crypto/dsa/dsa_lib.c b/crypto/dsa/dsa_lib.c
index e9b7590..96d8d0c 100644
--- a/crypto/dsa/dsa_lib.c
+++ b/crypto/dsa/dsa_lib.c

@@ -70,6 +70,10 @@
 #include <openssl/dh.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const char DSA_version[]="DSA" OPENSSL_VERSION_PTEXT;
 
 static const DSA_METHOD *default_DSA_method = NULL;
@@ -82,7 +86,16 @@
 const DSA_METHOD *DSA_get_default_method(void)
 	{
 	if(!default_DSA_method)
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_dsa_openssl();
+		else
+			return DSA_OpenSSL();
+#else
 		default_DSA_method = DSA_OpenSSL();
+#endif
+		}
 	return default_DSA_method;
 	}
 
@@ -163,7 +176,7 @@
 	ret->method_mont_p=NULL;
 
 	ret->references=1;
-	ret->flags=ret->meth->flags;
+	ret->flags=ret->meth->flags & ~DSA_FLAG_NON_FIPS_ALLOW;
 	CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DSA, ret, &ret->ex_data);
 	if ((ret->meth->init != NULL) && !ret->meth->init(ret))
 		{
@@ -276,7 +289,8 @@
 DH *DSA_dup_DH(const DSA *r)
 	{
 	/* DSA has p, q, g, optional pub_key, optional priv_key.
-	 * DH has p, optional length, g, optional pub_key, optional priv_key.
+	 * DH has p, optional length, g, optional pub_key, optional priv_key,
+	 * optional q.
 	 */ 
 
 	DH *ret = NULL;
@@ -290,7 +304,11 @@
 		if ((ret->p = BN_dup(r->p)) == NULL)
 			goto err;
 	if (r->q != NULL)
+		{
 		ret->length = BN_num_bits(r->q);
+		if ((ret->q = BN_dup(r->q)) == NULL)
+			goto err;
+		}
 	if (r->g != NULL)
 		if ((ret->g = BN_dup(r->g)) == NULL)
 			goto err;

diff --git a/crypto/dsa/dsa_locl.h b/crypto/dsa/dsa_locl.h
index 2b8cfee..21e2e45 100644
--- a/crypto/dsa/dsa_locl.h
+++ b/crypto/dsa/dsa_locl.h

@@ -56,4 +56,5 @@
 
 int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
 	const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+	unsigned char *seed_out,
 	int *counter_ret, unsigned long *h_ret, BN_GENCB *cb);

diff --git a/crypto/dsa/dsa_ossl.c b/crypto/dsa/dsa_ossl.c
index a3ddd7d..b3d78e5 100644
--- a/crypto/dsa/dsa_ossl.c
+++ b/crypto/dsa/dsa_ossl.c

@@ -136,6 +136,7 @@
 	BN_CTX *ctx=NULL;
 	int reason=ERR_R_BN_LIB;
 	DSA_SIG *ret=NULL;
+	int noredo = 0;
 
 	BN_init(&m);
 	BN_init(&xr);
@@ -150,7 +151,7 @@
 	if (s == NULL) goto err;
 	ctx=BN_CTX_new();
 	if (ctx == NULL) goto err;
-
+redo:
 	if ((dsa->kinv == NULL) || (dsa->r == NULL))
 		{
 		if (!DSA_sign_setup(dsa,ctx,&kinv,&r)) goto err;
@@ -161,6 +162,7 @@
 		dsa->kinv=NULL;
 		r=dsa->r;
 		dsa->r=NULL;
+		noredo = 1;
 		}
 
 	
@@ -181,6 +183,18 @@
 
 	ret=DSA_SIG_new();
 	if (ret == NULL) goto err;
+	/* Redo if r or s is zero as required by FIPS 186-3: this is
+	 * very unlikely.
+	 */
+	if (BN_is_zero(r) || BN_is_zero(s))
+		{
+		if (noredo)
+			{
+			reason = DSA_R_NEED_NEW_SETUP_VALUES;
+			goto err;
+			}
+		goto redo;
+		}
 	ret->r = r;
 	ret->s = s;
 	

diff --git a/crypto/dsa/dsa_pmeth.c b/crypto/dsa/dsa_pmeth.c
index e2df54f..715d8d6 100644
--- a/crypto/dsa/dsa_pmeth.c
+++ b/crypto/dsa/dsa_pmeth.c

@@ -189,7 +189,9 @@
 		    EVP_MD_type((const EVP_MD *)p2) != NID_dsa    &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA    &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
-		    EVP_MD_type((const EVP_MD *)p2) != NID_sha256)
+		    EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
+		    EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
+		    EVP_MD_type((const EVP_MD *)p2) != NID_sha512)
 			{
 			DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE);
 			return 0;
@@ -253,7 +255,7 @@
 	if (!dsa)
 		return 0;
 	ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd,
-	                           NULL, 0, NULL, NULL, pcb);
+	                           NULL, 0, NULL, NULL, NULL, pcb);
 	if (ret)
 		EVP_PKEY_assign_DSA(pkey, dsa);
 	else

diff --git a/crypto/dsa/dsa_sign.c b/crypto/dsa/dsa_sign.c
index 17555e5..c3cc364 100644
--- a/crypto/dsa/dsa_sign.c
+++ b/crypto/dsa/dsa_sign.c

@@ -61,30 +61,54 @@
 #include "cryptlib.h"
 #include <openssl/dsa.h>
 #include <openssl/rand.h>
+#include <openssl/bn.h>
 
 DSA_SIG * DSA_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
 	{
-	return dsa->meth->dsa_do_sign(dgst, dlen, dsa);
-	}
-
-int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
-	     unsigned int *siglen, DSA *dsa)
-	{
-	DSA_SIG *s;
-	RAND_seed(dgst, dlen);
-	s=DSA_do_sign(dgst,dlen,dsa);
-	if (s == NULL)
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
 		{
-		*siglen=0;
-		return(0);
+		DSAerr(DSA_F_DSA_DO_SIGN, DSA_R_NON_FIPS_DSA_METHOD);
+		return NULL;
 		}
-	*siglen=i2d_DSA_SIG(s,&sig);
-	DSA_SIG_free(s);
-	return(1);
+#endif
+	return dsa->meth->dsa_do_sign(dgst, dlen, dsa);
 	}
 
 int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NON_FIPS_DSA_METHOD);
+		return 0;
+		}
+#endif
 	return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
 	}
 
+DSA_SIG *DSA_SIG_new(void)
+	{
+	DSA_SIG *sig;
+	sig = OPENSSL_malloc(sizeof(DSA_SIG));
+	if (!sig)
+		return NULL;
+	sig->r = NULL;
+	sig->s = NULL;
+	return sig;
+	}
+
+void DSA_SIG_free(DSA_SIG *sig)
+	{
+	if (sig)
+		{
+		if (sig->r)
+			BN_free(sig->r);
+		if (sig->s)
+			BN_free(sig->s);
+		OPENSSL_free(sig);
+		}
+	}
+

diff --git a/crypto/dsa/dsa_vrf.c b/crypto/dsa/dsa_vrf.c
index 226a75f..674cb5f 100644
--- a/crypto/dsa/dsa_vrf.c
+++ b/crypto/dsa/dsa_vrf.c

@@ -64,26 +64,13 @@
 int DSA_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
 		  DSA *dsa)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_DO_VERIFY, DSA_R_NON_FIPS_DSA_METHOD);
+		return -1;
+		}
+#endif
 	return dsa->meth->dsa_do_verify(dgst, dgst_len, sig, dsa);
 	}
-
-/* data has already been hashed (probably with SHA or SHA-1). */
-/* returns
- *      1: correct signature
- *      0: incorrect signature
- *     -1: error
- */
-int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
-	     const unsigned char *sigbuf, int siglen, DSA *dsa)
-	{
-	DSA_SIG *s;
-	int ret=-1;
-
-	s = DSA_SIG_new();
-	if (s == NULL) return(ret);
-	if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
-	ret=DSA_do_verify(dgst,dgst_len,s,dsa);
-err:
-	DSA_SIG_free(s);
-	return(ret);
-	}

diff --git a/crypto/dso/dso_dlfcn.c b/crypto/dso/dso_dlfcn.c
index c2bc617..5f22548 100644
--- a/crypto/dso/dso_dlfcn.c
+++ b/crypto/dso/dso_dlfcn.c

@@ -86,7 +86,8 @@
 # if defined(_AIX) || defined(__CYGWIN__) || \
      defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
      (defined(__osf__) && !defined(RTLD_NEXT))     || \
-     (defined(__OpenBSD__) && !defined(RTLD_SELF))
+     (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \
+	defined(__ANDROID__)
 #  undef HAVE_DLINFO
 # endif
 #endif

diff --git a/crypto/ec/ec.h b/crypto/ec/ec.h
index ee70781..9d01325 100644
--- a/crypto/ec/ec.h
+++ b/crypto/ec/ec.h

@@ -151,7 +151,24 @@
  */
 const EC_METHOD *EC_GFp_nist_method(void);
 
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/** Returns 64-bit optimized methods for nistp224
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp224_method(void);
 
+/** Returns 64-bit optimized methods for nistp256
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp256_method(void);
+
+/** Returns 64-bit optimized methods for nistp521
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp521_method(void);
+#endif
+
+#ifndef OPENSSL_NO_EC2M
 /********************************************************************/ 
 /*           EC_METHOD for curves over GF(2^m)                      */
 /********************************************************************/
@@ -161,6 +178,8 @@
  */
 const EC_METHOD *EC_GF2m_simple_method(void);
 
+#endif
+
 
 /********************************************************************/
 /*                   EC_GROUP functions                             */
@@ -282,6 +301,7 @@
  */
 int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
 
+#ifndef OPENSSL_NO_EC2M
 /** Sets the parameter of a ec over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b
  *  \param  group  EC_GROUP object
  *  \param  p      BIGNUM with the polynomial defining the underlying field
@@ -301,7 +321,7 @@
  *  \return 1 on success and 0 if an error occured
  */
 int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
-
+#endif
 /** Returns the number of bits needed to represent a field element 
  *  \param  group  EC_GROUP object
  *  \return number of bits needed to represent a field element
@@ -342,7 +362,7 @@
  *  \return newly created EC_GROUP object with the specified parameters
  */
 EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
 /** Creates a new EC_GROUP object with the specified parameters defined
  *  over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b)
  *  \param  p    BIGNUM with the polynomial defining the underlying field
@@ -352,7 +372,7 @@
  *  \return newly created EC_GROUP object with the specified parameters
  */
 EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#endif
 /** Creates a EC_GROUP object with a curve specified by a NID
  *  \param  nid  NID of the OID of the curve name
  *  \return newly created EC_GROUP object with specified curve or NULL
@@ -481,7 +501,7 @@
  */
 int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
 	const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
 /** Sets the affine coordinates of a EC_POINT over GF2m
  *  \param  group  underlying EC_GROUP object
  *  \param  p      EC_POINT object
@@ -514,7 +534,7 @@
  */
 int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
 	const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#endif
 /** Encodes a EC_POINT object to a octet string
  *  \param  group  underlying EC_GROUP object
  *  \param  p      EC_POINT object
@@ -653,9 +673,11 @@
 /* EC_GROUP_get_basis_type() returns the NID of the basis type
  * used to represent the field elements */
 int EC_GROUP_get_basis_type(const EC_GROUP *);
+#ifndef OPENSSL_NO_EC2M
 int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
 int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1, 
 	unsigned int *k2, unsigned int *k3);
+#endif
 
 #define OPENSSL_EC_NAMED_CURVE	0x001
 
@@ -689,11 +711,21 @@
 #define EC_PKEY_NO_PARAMETERS	0x001
 #define EC_PKEY_NO_PUBKEY	0x002
 
+/* some values for the flags field */
+#define EC_FLAG_NON_FIPS_ALLOW	0x1
+#define EC_FLAG_FIPS_CHECKED	0x2
+
 /** Creates a new EC_KEY object.
  *  \return EC_KEY object or NULL if an error occurred.
  */
 EC_KEY *EC_KEY_new(void);
 
+int EC_KEY_get_flags(const EC_KEY *key);
+
+void EC_KEY_set_flags(EC_KEY *key, int flags);
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags);
+
 /** Creates a new EC_KEY object using a named curve as underlying
  *  EC_GROUP object.
  *  \param  nid  NID of the named curve.
@@ -799,6 +831,15 @@
  */
 int EC_KEY_check_key(const EC_KEY *key);
 
+/** Sets a public key from affine coordindates performing
+ *  neccessary NIST PKV tests.
+ *  \param  key  the EC_KEY object
+ *  \param  x    public key x coordinate
+ *  \param  y    public key y coordinate
+ *  \return 1 on success and 0 otherwise.
+ */
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+
 
 /********************************************************************/
 /*        de- and encoding functions for SEC1 ECPrivateKey          */
@@ -926,6 +967,7 @@
 /* Error codes for the EC functions. */
 
 /* Function codes. */
+#define EC_F_BN_TO_FELEM				 224
 #define EC_F_COMPUTE_WNAF				 143
 #define EC_F_D2I_ECPARAMETERS				 144
 #define EC_F_D2I_ECPKPARAMETERS				 145
@@ -968,6 +1010,15 @@
 #define EC_F_EC_GFP_MONT_FIELD_SQR			 132
 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE		 189
 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP		 135
+#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE		 225
+#define EC_F_EC_GFP_NISTP224_POINTS_MUL			 228
+#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226
+#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE		 230
+#define EC_F_EC_GFP_NISTP256_POINTS_MUL			 231
+#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232
+#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE		 233
+#define EC_F_EC_GFP_NISTP521_POINTS_MUL			 234
+#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235
 #define EC_F_EC_GFP_NIST_FIELD_MUL			 200
 #define EC_F_EC_GFP_NIST_FIELD_SQR			 201
 #define EC_F_EC_GFP_NIST_GROUP_SET_CURVE		 202
@@ -1010,6 +1061,7 @@
 #define EC_F_EC_KEY_NEW					 182
 #define EC_F_EC_KEY_PRINT				 180
 #define EC_F_EC_KEY_PRINT_FP				 181
+#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES	 229
 #define EC_F_EC_POINTS_MAKE_AFFINE			 136
 #define EC_F_EC_POINT_ADD				 112
 #define EC_F_EC_POINT_CMP				 113
@@ -1040,6 +1092,9 @@
 #define EC_F_I2D_ECPKPARAMETERS				 191
 #define EC_F_I2D_ECPRIVATEKEY				 192
 #define EC_F_I2O_ECPUBLICKEY				 151
+#define EC_F_NISTP224_PRE_COMP_NEW			 227
+#define EC_F_NISTP256_PRE_COMP_NEW			 236
+#define EC_F_NISTP521_PRE_COMP_NEW			 237
 #define EC_F_O2I_ECPUBLICKEY				 152
 #define EC_F_OLD_EC_PRIV_DECODE				 222
 #define EC_F_PKEY_EC_CTRL				 197
@@ -1052,12 +1107,15 @@
 /* Reason codes. */
 #define EC_R_ASN1_ERROR					 115
 #define EC_R_ASN1_UNKNOWN_FIELD				 116
+#define EC_R_BIGNUM_OUT_OF_RANGE			 144
 #define EC_R_BUFFER_TOO_SMALL				 100
+#define EC_R_COORDINATES_OUT_OF_RANGE			 146
 #define EC_R_D2I_ECPKPARAMETERS_FAILURE			 117
 #define EC_R_DECODE_ERROR				 142
 #define EC_R_DISCRIMINANT_IS_ZERO			 118
 #define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE		 119
 #define EC_R_FIELD_TOO_LARGE				 143
+#define EC_R_GF2M_NOT_SUPPORTED				 147
 #define EC_R_GROUP2PKPARAMETERS_FAILURE			 120
 #define EC_R_I2D_ECPKPARAMETERS_FAILURE			 121
 #define EC_R_INCOMPATIBLE_OBJECTS			 101
@@ -1092,6 +1150,7 @@
 #define EC_R_UNKNOWN_GROUP				 129
 #define EC_R_UNKNOWN_ORDER				 114
 #define EC_R_UNSUPPORTED_FIELD				 131
+#define EC_R_WRONG_CURVE_PARAMETERS			 145
 #define EC_R_WRONG_ORDER				 130
 
 #ifdef  __cplusplus

diff --git a/crypto/ec/ec2_mult.c b/crypto/ec/ec2_mult.c
index e12b9b2..26f4a78 100644
--- a/crypto/ec/ec2_mult.c
+++ b/crypto/ec/ec2_mult.c

@@ -71,6 +71,8 @@
 
 #include "ec_lcl.h"
 
+#ifndef OPENSSL_NO_EC2M
+
 
 /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective 
  * coordinates.
@@ -384,3 +386,5 @@
 	{
 	return ec_wNAF_have_precompute_mult(group);
  	}
+
+#endif

diff --git a/crypto/ec/ec2_oct.c b/crypto/ec/ec2_oct.c
new file mode 100644
index 0000000..f1d75e5
--- /dev/null
+++ b/crypto/ec/ec2_oct.c

@@ -0,0 +1,407 @@
+/* crypto/ec/ec2_oct.c */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
+ * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
+ * to the OpenSSL project.
+ *
+ * The ECC Code is licensed pursuant to the OpenSSL open source
+ * license provided below.
+ *
+ * The software is originally written by Sheueling Chang Shantz and
+ * Douglas Stebila of Sun Microsystems Laboratories.
+ *
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+
+#include <openssl/err.h>
+
+#include "ec_lcl.h"
+
+#ifndef OPENSSL_NO_EC2M
+
+/* Calculates and sets the affine coordinates of an EC_POINT from the given
+ * compressed coordinates.  Uses algorithm 2.3.4 of SEC 1. 
+ * Note that the simple implementation only uses affine coordinates.
+ *
+ * The method is from the following publication:
+ * 
+ *     Harper, Menezes, Vanstone:
+ *     "Public-Key Cryptosystems with Very Small Key Lengths",
+ *     EUROCRYPT '92, Springer-Verlag LNCS 658,
+ *     published February 1993
+ *
+ * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
+ * the same method, but claim no priority date earlier than July 29, 1994
+ * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
+ */
+int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+	{
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *tmp, *x, *y, *z;
+	int ret = 0, z0;
+
+	/* clear error queue */
+	ERR_clear_error();
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	y_bit = (y_bit != 0) ? 1 : 0;
+
+	BN_CTX_start(ctx);
+	tmp = BN_CTX_get(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	z = BN_CTX_get(ctx);
+	if (z == NULL) goto err;
+
+	if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
+	if (BN_is_zero(x))
+		{
+		if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
+		}
+	else
+		{
+		if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
+		if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
+		if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
+		if (!BN_GF2m_add(tmp, x, tmp)) goto err;
+		if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
+			{
+			unsigned long err = ERR_peek_last_error();
+			
+			if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
+				{
+				ERR_clear_error();
+				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+				}
+			else
+				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+			goto err;
+			}
+		z0 = (BN_is_odd(z)) ? 1 : 0;
+		if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
+		if (z0 != y_bit)
+			{
+			if (!BN_GF2m_add(y, y, x)) goto err;
+			}
+		}
+
+	if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+	ret = 1;
+
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
+
+/* Converts an EC_POINT to an octet string.  
+ * If buf is NULL, the encoded length will be returned.
+ * If the length len of buf is smaller than required an error will be returned.
+ */
+size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+	unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	size_t ret;
+	BN_CTX *new_ctx = NULL;
+	int used_ctx = 0;
+	BIGNUM *x, *y, *yxi;
+	size_t field_len, i, skip;
+
+	if ((form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+		goto err;
+		}
+
+	if (EC_POINT_is_at_infinity(group, point))
+		{
+		/* encodes to a single 0 octet */
+		if (buf != NULL)
+			{
+			if (len < 1)
+				{
+				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+				return 0;
+				}
+			buf[0] = 0;
+			}
+		return 1;
+		}
+
+
+	/* ret := required output buffer length */
+	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	/* if 'buf' is NULL, just return required length */
+	if (buf != NULL)
+		{
+		if (len < ret)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+			goto err;
+			}
+
+		if (ctx == NULL)
+			{
+			ctx = new_ctx = BN_CTX_new();
+			if (ctx == NULL)
+				return 0;
+			}
+
+		BN_CTX_start(ctx);
+		used_ctx = 1;
+		x = BN_CTX_get(ctx);
+		y = BN_CTX_get(ctx);
+		yxi = BN_CTX_get(ctx);
+		if (yxi == NULL) goto err;
+
+		if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+		buf[0] = form;
+		if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
+			{
+			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+			if (BN_is_odd(yxi)) buf[0]++;
+			}
+
+		i = 1;
+		
+		skip = field_len - BN_num_bytes(x);
+		if (skip > field_len)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		while (skip > 0)
+			{
+			buf[i++] = 0;
+			skip--;
+			}
+		skip = BN_bn2bin(x, buf + i);
+		i += skip;
+		if (i != 1 + field_len)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+
+		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+			{
+			skip = field_len - BN_num_bytes(y);
+			if (skip > field_len)
+				{
+				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			while (skip > 0)
+				{
+				buf[i++] = 0;
+				skip--;
+				}
+			skip = BN_bn2bin(y, buf + i);
+			i += skip;
+			}
+
+		if (i != ret)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		}
+	
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+
+ err:
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return 0;
+	}
+
+
+/* Converts an octet string representation to an EC_POINT. 
+ * Note that the simple implementation only uses affine coordinates.
+ */
+int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+	const unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	point_conversion_form_t form;
+	int y_bit;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y, *yxi;
+	size_t field_len, enc_len;
+	int ret = 0;
+
+	if (len == 0)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+		return 0;
+		}
+	form = buf[0];
+	y_bit = form & 1;
+	form = form & ~1U;
+	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (form == 0)
+		{
+		if (len != 1)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			return 0;
+			}
+
+		return EC_POINT_set_to_infinity(group, point);
+		}
+	
+	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	if (len != enc_len)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	BN_CTX_start(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	yxi = BN_CTX_get(ctx);
+	if (yxi == NULL) goto err;
+
+	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+	if (BN_ucmp(x, &group->field) >= 0)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		goto err;
+		}
+
+	if (form == POINT_CONVERSION_COMPRESSED)
+		{
+		if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
+		}
+	else
+		{
+		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+		if (BN_ucmp(y, &group->field) >= 0)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			goto err;
+			}
+		if (form == POINT_CONVERSION_HYBRID)
+			{
+			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+			if (y_bit != BN_is_odd(yxi))
+				{
+				ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+				goto err;
+				}
+			}
+
+		if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+		}
+	
+	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+		goto err;
+		}
+
+	ret = 1;
+	
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+#endif

diff --git a/crypto/ec/ec2_smpl.c b/crypto/ec/ec2_smpl.c
index 03deae6..e0e59c7 100644
--- a/crypto/ec/ec2_smpl.c
+++ b/crypto/ec/ec2_smpl.c

@@ -71,10 +71,20 @@
 
 #include "ec_lcl.h"
 
+#ifndef OPENSSL_NO_EC2M
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 
 const EC_METHOD *EC_GF2m_simple_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gf2m_simple_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_characteristic_two_field,
 		ec_GF2m_simple_group_init,
 		ec_GF2m_simple_group_finish,
@@ -93,9 +103,7 @@
 		0 /* get_Jprojective_coordinates_GFp */,
 		ec_GF2m_simple_point_set_affine_coordinates,
 		ec_GF2m_simple_point_get_affine_coordinates,
-		ec_GF2m_simple_set_compressed_coordinates,
-		ec_GF2m_simple_point2oct,
-		ec_GF2m_simple_oct2point,
+		0,0,0,
 		ec_GF2m_simple_add,
 		ec_GF2m_simple_dbl,
 		ec_GF2m_simple_invert,
@@ -118,6 +126,7 @@
 		0 /* field_set_to_one */ };
 
 	return &ret;
+#endif
 	}
 
 
@@ -405,340 +414,6 @@
 	return ret;
 	}
 
-
-/* Calculates and sets the affine coordinates of an EC_POINT from the given
- * compressed coordinates.  Uses algorithm 2.3.4 of SEC 1. 
- * Note that the simple implementation only uses affine coordinates.
- *
- * The method is from the following publication:
- * 
- *     Harper, Menezes, Vanstone:
- *     "Public-Key Cryptosystems with Very Small Key Lengths",
- *     EUROCRYPT '92, Springer-Verlag LNCS 658,
- *     published February 1993
- *
- * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
- * the same method, but claim no priority date earlier than July 29, 1994
- * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
- */
-int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
-	{
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *tmp, *x, *y, *z;
-	int ret = 0, z0;
-
-	/* clear error queue */
-	ERR_clear_error();
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	y_bit = (y_bit != 0) ? 1 : 0;
-
-	BN_CTX_start(ctx);
-	tmp = BN_CTX_get(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	z = BN_CTX_get(ctx);
-	if (z == NULL) goto err;
-
-	if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
-	if (BN_is_zero(x))
-		{
-		if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
-		}
-	else
-		{
-		if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
-		if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
-		if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
-		if (!BN_GF2m_add(tmp, x, tmp)) goto err;
-		if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
-			{
-			unsigned long err = ERR_peek_last_error();
-			
-			if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
-				{
-				ERR_clear_error();
-				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
-				}
-			else
-				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
-			goto err;
-			}
-		z0 = (BN_is_odd(z)) ? 1 : 0;
-		if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
-		if (z0 != y_bit)
-			{
-			if (!BN_GF2m_add(y, y, x)) goto err;
-			}
-		}
-
-	if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
-	ret = 1;
-
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
-/* Converts an EC_POINT to an octet string.  
- * If buf is NULL, the encoded length will be returned.
- * If the length len of buf is smaller than required an error will be returned.
- */
-size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
-	unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	size_t ret;
-	BN_CTX *new_ctx = NULL;
-	int used_ctx = 0;
-	BIGNUM *x, *y, *yxi;
-	size_t field_len, i, skip;
-
-	if ((form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
-		goto err;
-		}
-
-	if (EC_POINT_is_at_infinity(group, point))
-		{
-		/* encodes to a single 0 octet */
-		if (buf != NULL)
-			{
-			if (len < 1)
-				{
-				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-				return 0;
-				}
-			buf[0] = 0;
-			}
-		return 1;
-		}
-
-
-	/* ret := required output buffer length */
-	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
-	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	/* if 'buf' is NULL, just return required length */
-	if (buf != NULL)
-		{
-		if (len < ret)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-			goto err;
-			}
-
-		if (ctx == NULL)
-			{
-			ctx = new_ctx = BN_CTX_new();
-			if (ctx == NULL)
-				return 0;
-			}
-
-		BN_CTX_start(ctx);
-		used_ctx = 1;
-		x = BN_CTX_get(ctx);
-		y = BN_CTX_get(ctx);
-		yxi = BN_CTX_get(ctx);
-		if (yxi == NULL) goto err;
-
-		if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
-		buf[0] = form;
-		if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
-			{
-			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
-			if (BN_is_odd(yxi)) buf[0]++;
-			}
-
-		i = 1;
-		
-		skip = field_len - BN_num_bytes(x);
-		if (skip > field_len)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		while (skip > 0)
-			{
-			buf[i++] = 0;
-			skip--;
-			}
-		skip = BN_bn2bin(x, buf + i);
-		i += skip;
-		if (i != 1 + field_len)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-
-		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
-			{
-			skip = field_len - BN_num_bytes(y);
-			if (skip > field_len)
-				{
-				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-				goto err;
-				}
-			while (skip > 0)
-				{
-				buf[i++] = 0;
-				skip--;
-				}
-			skip = BN_bn2bin(y, buf + i);
-			i += skip;
-			}
-
-		if (i != ret)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		}
-	
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-
- err:
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return 0;
-	}
-
-
-/* Converts an octet string representation to an EC_POINT. 
- * Note that the simple implementation only uses affine coordinates.
- */
-int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
-	const unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	point_conversion_form_t form;
-	int y_bit;
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *x, *y, *yxi;
-	size_t field_len, enc_len;
-	int ret = 0;
-
-	if (len == 0)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
-		return 0;
-		}
-	form = buf[0];
-	y_bit = form & 1;
-	form = form & ~1U;
-	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (form == 0)
-		{
-		if (len != 1)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			return 0;
-			}
-
-		return EC_POINT_set_to_infinity(group, point);
-		}
-	
-	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
-	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	if (len != enc_len)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	BN_CTX_start(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	yxi = BN_CTX_get(ctx);
-	if (yxi == NULL) goto err;
-
-	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
-	if (BN_ucmp(x, &group->field) >= 0)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		goto err;
-		}
-
-	if (form == POINT_CONVERSION_COMPRESSED)
-		{
-		if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
-		}
-	else
-		{
-		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
-		if (BN_ucmp(y, &group->field) >= 0)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			goto err;
-			}
-		if (form == POINT_CONVERSION_HYBRID)
-			{
-			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
-			if (y_bit != BN_is_odd(yxi))
-				{
-				ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-				goto err;
-				}
-			}
-
-		if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-		}
-	
-	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
-		goto err;
-		}
-
-	ret = 1;
-	
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
 /* Computes a + b and stores the result in r.  r could be a or b, a could be b.
  * Uses algorithm A.10.2 of IEEE P1363.
  */
@@ -1040,3 +715,5 @@
 	{
 	return BN_GF2m_mod_div(r, a, b, &group->field, ctx);
 	}
+
+#endif

diff --git a/crypto/ec/ec_ameth.c b/crypto/ec/ec_ameth.c
index c00f7d7..83909c1 100644
--- a/crypto/ec/ec_ameth.c
+++ b/crypto/ec/ec_ameth.c

@@ -651,6 +651,7 @@
 	ec_copy_parameters,
 	ec_cmp_parameters,
 	eckey_param_print,
+	0,
 
 	int_ec_free,
 	ec_pkey_ctrl,

diff --git a/crypto/ec/ec_asn1.c b/crypto/ec/ec_asn1.c
index ae55539..175eec5 100644
--- a/crypto/ec/ec_asn1.c
+++ b/crypto/ec/ec_asn1.c

@@ -83,7 +83,7 @@
 		/* everything else is currently not supported */
 		return 0;
 	}
-
+#ifndef OPENSSL_NO_EC2M
 int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
 	{
 	if (group == NULL)
@@ -101,7 +101,6 @@
 
 	return 1;
 	}
-
 int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
 	unsigned int *k2, unsigned int *k3)
 	{
@@ -124,7 +123,7 @@
 
 	return 1;
 	}
-
+#endif
 
 
 /* some structures needed for the asn1 encoding */
@@ -340,6 +339,12 @@
 			}
 		}
 	else	/* nid == NID_X9_62_characteristic_two_field */
+#ifdef OPENSSL_NO_EC2M
+		{
+		ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED);
+		goto err;
+		}
+#else
 		{
 		int		field_type;
 		X9_62_CHARACTERISTIC_TWO *char_two;
@@ -419,6 +424,7 @@
 				}
 			}
 		}
+#endif
 
 	ok = 1;
 
@@ -456,6 +462,7 @@
 			goto err;
 			}
 		}
+#ifndef OPENSSL_NO_EC2M
 	else	/* nid == NID_X9_62_characteristic_two_field */
 		{
 		if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL))
@@ -464,7 +471,7 @@
 			goto err;
 			}
 		}
-
+#endif
 	len_1 = (size_t)BN_num_bytes(tmp_1);
 	len_2 = (size_t)BN_num_bytes(tmp_2);
 
@@ -775,8 +782,13 @@
 
 	/* get the field parameters */
 	tmp = OBJ_obj2nid(params->fieldID->fieldType);
-
 	if (tmp == NID_X9_62_characteristic_two_field)
+#ifdef OPENSSL_NO_EC2M
+		{
+		ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED);
+		goto err;
+		}
+#else
 		{
 		X9_62_CHARACTERISTIC_TWO *char_two;
 
@@ -862,6 +874,7 @@
 		/* create the EC_GROUP structure */
 		ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL);
 		}
+#endif
 	else if (tmp == NID_X9_62_prime_field)
 		{
 		/* we have a curve over a prime field */
@@ -1065,6 +1078,7 @@
 	if ((group = ec_asn1_pkparameters2group(params)) == NULL)
 		{
 		ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE);
+		ECPKPARAMETERS_free(params);
 		return NULL; 
 		}
 

diff --git a/crypto/ec/ec_curve.c b/crypto/ec/ec_curve.c
index 23274e4..c72fb26 100644
--- a/crypto/ec/ec_curve.c
+++ b/crypto/ec/ec_curve.c

@@ -3,7 +3,7 @@
  * Written by Nils Larsch for the OpenSSL project.
  */
 /* ====================================================================
- * Copyright (c) 1998-2004 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -72,6 +72,7 @@
 #include "ec_lcl.h"
 #include <openssl/err.h>
 #include <openssl/obj_mac.h>
+#include <openssl/opensslconf.h>
 
 typedef struct {
 	int	field_type,	/* either NID_X9_62_prime_field or
@@ -703,6 +704,8 @@
 	  0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D }
 	};
 
+#ifndef OPENSSL_NO_EC2M
+
 /* characteristic two curves */
 static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; }
 	_EC_SECG_CHAR2_113R1 = {
@@ -1300,7 +1303,7 @@
 	{ 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76,	/* seed */
 	  0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD,
 
- 	  0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,	/* p */
+	  0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,	/* p */
 	  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
 	  0x07,
 	  0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9,	/* a */
@@ -1817,103 +1820,128 @@
 	  0xBA,0xFC,0xA7,0x5E }
 	};
 
+#endif
+
 typedef struct _ec_list_element_st {
 	int	nid;
 	const EC_CURVE_DATA *data;
+	const EC_METHOD *(*meth)(void);
 	const char *comment;
 	} ec_list_element;
 
 static const ec_list_element curve_list[] = {
-	/* prime field curves */	
+	/* prime field curves */
 	/* secg curves */
-	{ NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
-	{ NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"},
-	{ NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"},
-	{ NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"},
-	{ NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"},
-	{ NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"},
-	{ NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
+	{ NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+	{ NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" },
+	{ NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" },
+	{ NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" },
+	{ NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" },
+	{ NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" },
+	{ NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
 	/* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */
-	{ NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"},
-	{ NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"},
-	{ NID_secp224r1, &_EC_NIST_PRIME_224.h,   "NIST/SECG curve over a 224 bit prime field"},
-	{ NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"},
+	{ NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" },
+	{ NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+	{ NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" },
+#else
+	{ NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" },
+#endif
+	{ NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" },
 	/* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */
-	{ NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"},
-	{ NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"},
+	{ NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+	{ NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" },
+#else
+	{ NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" },
+#endif
 	/* X9.62 curves */
-	{ NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"},
-	{ NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"},
-	{ NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"},
-	{ NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"},
-	{ NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"},
-	{ NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"},
-	{ NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"},
+	{ NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" },
+	{ NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" },
+	{ NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" },
+	{ NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" },
+	{ NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" },
+	{ NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+	{ NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" },
+#else
+	{ NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" },
+#endif
+#ifndef OPENSSL_NO_EC2M
 	/* characteristic two field curves */
 	/* NIST/SECG curves */
-	{ NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
-	{ NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"},
-	{ NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"},
-	{ NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"},
-	{ NID_sect163k1, &_EC_NIST_CHAR2_163K.h,  "NIST/SECG/WTLS curve over a 163 bit binary field" },
-	{ NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"},
-	{ NID_sect163r2, &_EC_NIST_CHAR2_163B.h,  "NIST/SECG curve over a 163 bit binary field" },
-	{ NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"},
-	{ NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"},
-	{ NID_sect233k1, &_EC_NIST_CHAR2_233K.h,  "NIST/SECG/WTLS curve over a 233 bit binary field" },
-	{ NID_sect233r1, &_EC_NIST_CHAR2_233B.h,  "NIST/SECG/WTLS curve over a 233 bit binary field" },
-	{ NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"},
-	{ NID_sect283k1, &_EC_NIST_CHAR2_283K.h,  "NIST/SECG curve over a 283 bit binary field" },
-	{ NID_sect283r1, &_EC_NIST_CHAR2_283B.h,  "NIST/SECG curve over a 283 bit binary field" },
-	{ NID_sect409k1, &_EC_NIST_CHAR2_409K.h,  "NIST/SECG curve over a 409 bit binary field" },
-	{ NID_sect409r1, &_EC_NIST_CHAR2_409B.h,  "NIST/SECG curve over a 409 bit binary field" },
-	{ NID_sect571k1, &_EC_NIST_CHAR2_571K.h,  "NIST/SECG curve over a 571 bit binary field" },
-	{ NID_sect571r1, &_EC_NIST_CHAR2_571B.h,  "NIST/SECG curve over a 571 bit binary field" },
+	{ NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+	{ NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" },
+	{ NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" },
+	{ NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" },
+	{ NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+	{ NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" },
+	{ NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" },
+	{ NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" },
+	{ NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" },
+	{ NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+	{ NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+	{ NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" },
+	{ NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+	{ NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+	{ NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+	{ NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+	{ NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" },
+	{ NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" },
 	/* X9.62 curves */
-	{ NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"},
-	{ NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"},
-	{ NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"},
-	{ NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"},
-	{ NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"},
-	{ NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"},
-	{ NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"},
-	{ NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"},
-	{ NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"},
-	{ NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"},
-	{ NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"},
-	{ NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"},
-	{ NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"},
+	{ NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+	{ NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" },
+	{ NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" },
+	{ NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" },
+	{ NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" },
+	{ NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" },
+	{ NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" },
+	{ NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" },
+	{ NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" },
+	{ NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" },
+	{ NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" },
+	{ NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" },
+	{ NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" },
+	{ NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" },
+	{ NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" },
+	{ NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" },
 	/* the WAP/WTLS curves
 	 * [unlike SECG, spec has its own OIDs for curves from X9.62] */
-	{ NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h,   "NIST/SECG/WTLS curve over a 163 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h,  "SECG curve over a 113 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h,  "SECG/WTLS curve over a 112 bit prime field"},
-	{ NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h,  "SECG/WTLS curve over a 160 bit prime field"},
-	{ NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"},
-	{ NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" },
-	{ NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"},
+	{ NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+#endif
+	{ NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+	{ NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
+	{ NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" },
+	{ NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
+	{ NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+#endif
+	{ NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curvs over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
 	/* IPSec curves */
-	{ NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
-	{ NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
+	{ NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n"
+	  "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+	{ NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n"
+	  "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+#endif
 };
 
 #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element))
 
-static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
+static EC_GROUP *ec_group_new_from_data(const ec_list_element curve)
 	{
 	EC_GROUP *group=NULL;
 	EC_POINT *P=NULL;
 	BN_CTX	 *ctx=NULL;
-	BIGNUM 	 *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
+	BIGNUM	 *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
 	int	 ok=0;
 	int	 seed_len,param_len;
+	const EC_METHOD *meth;
+	const EC_CURVE_DATA *data;
 	const unsigned char *params;
 
 	if ((ctx = BN_CTX_new()) == NULL)
@@ -1922,10 +1950,11 @@
 		goto err;
 		}
 
+	data = curve.data;
 	seed_len  = data->seed_len;
 	param_len = data->param_len;
-	params    = (const unsigned char *)(data+1);	/* skip header */
-	params   += seed_len;				/* skip seed   */
+	params	  = (const unsigned char *)(data+1);	/* skip header */
+	params	 += seed_len;				/* skip seed   */
 
 	if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL))
 		|| !(a = BN_bin2bn(params+1*param_len, param_len, NULL))
@@ -1935,7 +1964,17 @@
 		goto err;
 		}
 
-	if (data->field_type == NID_X9_62_prime_field)
+	if (curve.meth != 0)
+		{
+		meth = curve.meth();
+		if (((group = EC_GROUP_new(meth)) == NULL) ||
+			(!(group->meth->group_set_curve(group, p, a, b, ctx))))
+			{
+			ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
+			goto err;
+			}
+		}
+	else if (data->field_type == NID_X9_62_prime_field)
 		{
 		if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL)
 			{
@@ -1943,6 +1982,7 @@
 			goto err;
 			}
 		}
+#ifndef OPENSSL_NO_EC2M
 	else	/* field_type == NID_X9_62_characteristic_two_field */
 		{
 		if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL)
@@ -1951,20 +1991,21 @@
 			goto err;
 			}
 		}
+#endif
 
 	if ((P = EC_POINT_new(group)) == NULL)
 		{
 		ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
 		goto err;
 		}
-	
+
 	if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL))
 		|| !(y = BN_bin2bn(params+4*param_len, param_len, NULL)))
 		{
 		ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB);
 		goto err;
 		}
-	if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx))
+	if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx))
 		{
 		ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
 		goto err;
@@ -2025,7 +2066,7 @@
 	for (i=0; i<curve_list_length; i++)
 		if (curve_list[i].nid == nid)
 			{
-			ret = ec_group_new_from_data(curve_list[i].data);
+			ret = ec_group_new_from_data(curve_list[i]);
 			break;
 			}
 

diff --git a/crypto/ec/ec_cvt.c b/crypto/ec/ec_cvt.c
index d45640b..bfcbab3 100644
--- a/crypto/ec/ec_cvt.c
+++ b/crypto/ec/ec_cvt.c

@@ -78,7 +78,32 @@
 	const EC_METHOD *meth;
 	EC_GROUP *ret;
 
+#if defined(OPENSSL_BN_ASM_MONT)
+	/*
+	 * This might appear controversial, but the fact is that generic
+	 * prime method was observed to deliver better performance even
+	 * for NIST primes on a range of platforms, e.g.: 60%-15%
+	 * improvement on IA-64, ~25% on ARM, 30%-90% on P4, 20%-25%
+	 * in 32-bit build and 35%--12% in 64-bit build on Core2...
+	 * Coefficients are relative to optimized bn_nist.c for most
+	 * intensive ECDSA verify and ECDH operations for 192- and 521-
+	 * bit keys respectively. Choice of these boundary values is
+	 * arguable, because the dependency of improvement coefficient
+	 * from key length is not a "monotone" curve. For example while
+	 * 571-bit result is 23% on ARM, 384-bit one is -1%. But it's
+	 * generally faster, sometimes "respectfully" faster, sometimes
+	 * "tolerably" slower... What effectively happens is that loop
+	 * with bn_mul_add_words is put against bn_mul_mont, and the
+	 * latter "wins" on short vectors. Correct solution should be
+	 * implementing dedicated NxN multiplication subroutines for
+	 * small N. But till it materializes, let's stick to generic
+	 * prime method...
+	 *						<appro>
+	 */
+	meth = EC_GFp_mont_method();
+#else
 	meth = EC_GFp_nist_method();
+#endif
 	
 	ret = EC_GROUP_new(meth);
 	if (ret == NULL)
@@ -122,7 +147,7 @@
 	return ret;
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 	{
 	const EC_METHOD *meth;
@@ -142,3 +167,4 @@
 
 	return ret;
 	}
+#endif

diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c
index 84b4833..0d19398 100644
--- a/crypto/ec/ec_err.c
+++ b/crypto/ec/ec_err.c

@@ -1,6 +1,6 @@
 /* crypto/ec/ec_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
 
 static ERR_STRING_DATA EC_str_functs[]=
 	{
+{ERR_FUNC(EC_F_BN_TO_FELEM),	"BN_TO_FELEM"},
 {ERR_FUNC(EC_F_COMPUTE_WNAF),	"COMPUTE_WNAF"},
 {ERR_FUNC(EC_F_D2I_ECPARAMETERS),	"d2i_ECParameters"},
 {ERR_FUNC(EC_F_D2I_ECPKPARAMETERS),	"d2i_ECPKParameters"},
@@ -112,6 +113,15 @@
 {ERR_FUNC(EC_F_EC_GFP_MONT_FIELD_SQR),	"ec_GFp_mont_field_sqr"},
 {ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE),	"ec_GFp_mont_group_set_curve"},
 {ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP),	"EC_GFP_MONT_GROUP_SET_CURVE_GFP"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE),	"ec_GFp_nistp224_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINTS_MUL),	"ec_GFp_nistp224_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES),	"ec_GFp_nistp224_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE),	"ec_GFp_nistp256_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINTS_MUL),	"ec_GFp_nistp256_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES),	"ec_GFp_nistp256_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE),	"ec_GFp_nistp521_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINTS_MUL),	"ec_GFp_nistp521_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES),	"ec_GFp_nistp521_point_get_affine_coordinates"},
 {ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_MUL),	"ec_GFp_nist_field_mul"},
 {ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_SQR),	"ec_GFp_nist_field_sqr"},
 {ERR_FUNC(EC_F_EC_GFP_NIST_GROUP_SET_CURVE),	"ec_GFp_nist_group_set_curve"},
@@ -154,6 +164,7 @@
 {ERR_FUNC(EC_F_EC_KEY_NEW),	"EC_KEY_new"},
 {ERR_FUNC(EC_F_EC_KEY_PRINT),	"EC_KEY_print"},
 {ERR_FUNC(EC_F_EC_KEY_PRINT_FP),	"EC_KEY_print_fp"},
+{ERR_FUNC(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES),	"EC_KEY_set_public_key_affine_coordinates"},
 {ERR_FUNC(EC_F_EC_POINTS_MAKE_AFFINE),	"EC_POINTs_make_affine"},
 {ERR_FUNC(EC_F_EC_POINT_ADD),	"EC_POINT_add"},
 {ERR_FUNC(EC_F_EC_POINT_CMP),	"EC_POINT_cmp"},
@@ -184,6 +195,9 @@
 {ERR_FUNC(EC_F_I2D_ECPKPARAMETERS),	"i2d_ECPKParameters"},
 {ERR_FUNC(EC_F_I2D_ECPRIVATEKEY),	"i2d_ECPrivateKey"},
 {ERR_FUNC(EC_F_I2O_ECPUBLICKEY),	"i2o_ECPublicKey"},
+{ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW),	"NISTP224_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW),	"NISTP256_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW),	"NISTP521_PRE_COMP_NEW"},
 {ERR_FUNC(EC_F_O2I_ECPUBLICKEY),	"o2i_ECPublicKey"},
 {ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE),	"OLD_EC_PRIV_DECODE"},
 {ERR_FUNC(EC_F_PKEY_EC_CTRL),	"PKEY_EC_CTRL"},
@@ -199,12 +213,15 @@
 	{
 {ERR_REASON(EC_R_ASN1_ERROR)             ,"asn1 error"},
 {ERR_REASON(EC_R_ASN1_UNKNOWN_FIELD)     ,"asn1 unknown field"},
+{ERR_REASON(EC_R_BIGNUM_OUT_OF_RANGE)    ,"bignum out of range"},
 {ERR_REASON(EC_R_BUFFER_TOO_SMALL)       ,"buffer too small"},
+{ERR_REASON(EC_R_COORDINATES_OUT_OF_RANGE),"coordinates out of range"},
 {ERR_REASON(EC_R_D2I_ECPKPARAMETERS_FAILURE),"d2i ecpkparameters failure"},
 {ERR_REASON(EC_R_DECODE_ERROR)           ,"decode error"},
 {ERR_REASON(EC_R_DISCRIMINANT_IS_ZERO)   ,"discriminant is zero"},
 {ERR_REASON(EC_R_EC_GROUP_NEW_BY_NAME_FAILURE),"ec group new by name failure"},
 {ERR_REASON(EC_R_FIELD_TOO_LARGE)        ,"field too large"},
+{ERR_REASON(EC_R_GF2M_NOT_SUPPORTED)     ,"gf2m not supported"},
 {ERR_REASON(EC_R_GROUP2PKPARAMETERS_FAILURE),"group2pkparameters failure"},
 {ERR_REASON(EC_R_I2D_ECPKPARAMETERS_FAILURE),"i2d ecpkparameters failure"},
 {ERR_REASON(EC_R_INCOMPATIBLE_OBJECTS)   ,"incompatible objects"},
@@ -239,6 +256,7 @@
 {ERR_REASON(EC_R_UNKNOWN_GROUP)          ,"unknown group"},
 {ERR_REASON(EC_R_UNKNOWN_ORDER)          ,"unknown order"},
 {ERR_REASON(EC_R_UNSUPPORTED_FIELD)      ,"unsupported field"},
+{ERR_REASON(EC_R_WRONG_CURVE_PARAMETERS) ,"wrong curve parameters"},
 {ERR_REASON(EC_R_WRONG_ORDER)            ,"wrong order"},
 {0,NULL}
 	};

diff --git a/crypto/ec/ec_key.c b/crypto/ec/ec_key.c
index 522802c..bf9fd2d 100644
--- a/crypto/ec/ec_key.c
+++ b/crypto/ec/ec_key.c

@@ -64,7 +64,9 @@
 #include <string.h>
 #include "ec_lcl.h"
 #include <openssl/err.h>
-#include <string.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 EC_KEY *EC_KEY_new(void)
 	{
@@ -78,6 +80,7 @@
 		}
 
 	ret->version = 1;	
+	ret->flags = 0;
 	ret->group   = NULL;
 	ret->pub_key = NULL;
 	ret->priv_key= NULL;
@@ -197,6 +200,7 @@
 	dest->enc_flag  = src->enc_flag;
 	dest->conv_form = src->conv_form;
 	dest->version   = src->version;
+	dest->flags = src->flags;
 
 	return dest;
 	}
@@ -237,6 +241,11 @@
 	BIGNUM	*priv_key = NULL, *order = NULL;
 	EC_POINT *pub_key = NULL;
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_ec_key_generate_key(eckey);
+#endif
+
 	if (!eckey || !eckey->group)
 		{
 		ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER);
@@ -371,6 +380,82 @@
 	return(ok);
 	}
 
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
+	{
+	BN_CTX *ctx = NULL;
+	BIGNUM *tx, *ty;
+	EC_POINT *point = NULL;
+	int ok = 0, tmp_nid, is_char_two = 0;
+
+	if (!key || !key->group || !x || !y)
+		{
+		ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+						ERR_R_PASSED_NULL_PARAMETER);
+		return 0;
+		}
+	ctx = BN_CTX_new();
+	if (!ctx)
+		goto err;
+
+	point = EC_POINT_new(key->group);
+
+	if (!point)
+		goto err;
+
+	tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group));
+
+        if (tmp_nid == NID_X9_62_characteristic_two_field)
+		is_char_two = 1;
+
+	tx = BN_CTX_get(ctx);
+	ty = BN_CTX_get(ctx);
+#ifndef OPENSSL_NO_EC2M
+	if (is_char_two)
+		{
+		if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point,
+								x, y, ctx))
+			goto err;
+		if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point,
+								tx, ty, ctx))
+			goto err;
+		}
+	else
+#endif
+		{
+		if (!EC_POINT_set_affine_coordinates_GFp(key->group, point,
+								x, y, ctx))
+			goto err;
+		if (!EC_POINT_get_affine_coordinates_GFp(key->group, point,
+								tx, ty, ctx))
+			goto err;
+		}
+	/* Check if retrieved coordinates match originals: if not values
+	 * are out of range.
+	 */
+	if (BN_cmp(x, tx) || BN_cmp(y, ty))
+		{
+		ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+			EC_R_COORDINATES_OUT_OF_RANGE);
+		goto err;
+		}
+
+	if (!EC_KEY_set_public_key(key, point))
+		goto err;
+
+	if (EC_KEY_check_key(key) == 0)
+		goto err;
+
+	ok = 1;
+
+	err:
+	if (ctx)
+		BN_CTX_free(ctx);
+	if (point)
+		EC_POINT_free(point);
+	return ok;
+
+	}
+
 const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key)
 	{
 	return key->group;
@@ -461,3 +546,18 @@
 		return 0;
 	return EC_GROUP_precompute_mult(key->group, ctx);
 	}
+
+int EC_KEY_get_flags(const EC_KEY *key)
+	{
+	return key->flags;
+	}
+
+void EC_KEY_set_flags(EC_KEY *key, int flags)
+	{
+	key->flags |= flags;
+	}
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags)
+	{
+	key->flags &= ~flags;
+	}

diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index 3e2c34b..da7967d 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h

@@ -3,7 +3,7 @@
  * Originally written by Bodo Moeller for the OpenSSL project.
  */
 /* ====================================================================
- * Copyright (c) 1998-2003 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -82,10 +82,15 @@
 # endif
 #endif
 
+/* Use default functions for poin2oct, oct2point and compressed coordinates */
+#define EC_FLAGS_DEFAULT_OCT	0x1
+
 /* Structure details are not part of the exported interface,
  * so all this may change in future versions. */
 
 struct ec_method_st {
+	/* Various method flags */
+	int flags;
 	/* used by EC_METHOD_get_field_type: */
 	int field_type; /* a NID */
 
@@ -244,6 +249,7 @@
 	point_conversion_form_t conv_form;
 
 	int 	references;
+	int	flags;
 
 	EC_EXTRA_DATA *method_data;
 } /* EC_KEY */;
@@ -391,3 +397,50 @@
 	size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
 int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
 int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ec2_mult.c */
+int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
+	size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+#ifndef OPENSSL_EC_NISTP_64_GCC_128
+/* method functions in ecp_nistp224.c */
+int ec_GFp_nistp224_group_init(EC_GROUP *group);
+int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp224_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp256.c */
+int ec_GFp_nistp256_group_init(EC_GROUP *group);
+int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp256_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp521.c */
+int ec_GFp_nistp521_group_init(EC_GROUP *group);
+int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp521_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group);
+
+/* utility functions in ecp_nistputil.c */
+void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
+	size_t felem_size, void *tmp_felems,
+	void (*felem_one)(void *out),
+	int (*felem_is_zero)(const void *in),
+	void (*felem_assign)(void *out, const void *in),
+	void (*felem_square)(void *out, const void *in),
+	void (*felem_mul)(void *out, const void *in1, const void *in2),
+	void (*felem_inv)(void *out, const void *in),
+	void (*felem_contract)(void *out, const void *in));
+void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in);
+#endif

diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index dd7da0f..25247b5 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c

@@ -425,7 +425,7 @@
 	return group->meth->group_get_curve(group, p, a, b, ctx);
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 	{
 	if (group->meth->group_set_curve == 0)
@@ -446,7 +446,7 @@
 		}
 	return group->meth->group_get_curve(group, p, a, b, ctx);
 	}
-
+#endif
 
 int EC_GROUP_get_degree(const EC_GROUP *group)
 	{
@@ -856,7 +856,7 @@
 	return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
 	const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx)
 	{
@@ -872,7 +872,7 @@
 		}
 	return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
 	}
-
+#endif
 
 int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *point,
 	BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
@@ -890,7 +890,7 @@
 	return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *point,
 	BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
 	{
@@ -906,75 +906,7 @@
 		}
 	return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
 	}
-
-
-int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x, int y_bit, BN_CTX *ctx)
-	{
-	if (group->meth->point_set_compressed_coordinates == 0)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
-	}
-
-
-int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x, int y_bit, BN_CTX *ctx)
-	{
-	if (group->meth->point_set_compressed_coordinates == 0)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
-	}
-
-
-size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
-        unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	if (group->meth->point2oct == 0)
-		{
-		ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->point2oct(group, point, form, buf, len, ctx);
-	}
-
-
-int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
-        const unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	if (group->meth->oct2point == 0)
-		{
-		ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->oct2point(group, point, buf, len, ctx);
-	}
-
+#endif
 
 int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
 	{

diff --git a/crypto/ec/ec_oct.c b/crypto/ec/ec_oct.c
new file mode 100644
index 0000000..fd9db07
--- /dev/null
+++ b/crypto/ec/ec_oct.c

@@ -0,0 +1,199 @@
+/* crypto/ec/ec_lib.c */
+/*
+ * Originally written by Bodo Moeller for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2003 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Binary polynomial ECC support in OpenSSL originally developed by 
+ * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project.
+ */
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/opensslv.h>
+
+#include "ec_lcl.h"
+
+int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x, int y_bit, BN_CTX *ctx)
+	{
+	if (group->meth->point_set_compressed_coordinates == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+		else
+#ifdef OPENSSL_NO_EC2M
+			{
+			ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED);
+			return 0;
+			}
+#else
+			return ec_GF2m_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+#endif
+		}
+	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+	}
+
+#ifndef OPENSSL_NO_EC2M
+int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x, int y_bit, BN_CTX *ctx)
+	{
+	if (group->meth->point_set_compressed_coordinates == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+		else
+			return ec_GF2m_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+		}
+	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+	}
+#endif
+
+size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+        unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	if (group->meth->point2oct == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_point2oct(group, point,
+							form, buf, len, ctx);
+		else
+#ifdef OPENSSL_NO_EC2M
+			{
+			ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED);
+			return 0;
+			}
+#else
+			return ec_GF2m_simple_point2oct(group, point,
+							form, buf, len, ctx);
+#endif
+		}
+			
+	return group->meth->point2oct(group, point, form, buf, len, ctx);
+	}
+
+
+int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
+        const unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	if (group->meth->oct2point == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_oct2point(group, point,
+							buf, len, ctx);
+		else
+#ifdef OPENSSL_NO_EC2M
+			{
+			ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED);
+			return 0;
+			}
+#else
+			return ec_GF2m_simple_oct2point(group, point,
+							buf, len, ctx);
+#endif
+		}
+	return group->meth->oct2point(group, point, buf, len, ctx);
+	}
+

diff --git a/crypto/ec/ec_pmeth.c b/crypto/ec/ec_pmeth.c
index f433076..d1ed66c 100644
--- a/crypto/ec/ec_pmeth.c
+++ b/crypto/ec/ec_pmeth.c

@@ -221,6 +221,7 @@
 
 		case EVP_PKEY_CTRL_MD:
 		if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 &&
+		    EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&

diff --git a/crypto/ec/eck_prn.c b/crypto/ec/eck_prn.c
index 7d3e175..06de8f3 100644
--- a/crypto/ec/eck_prn.c
+++ b/crypto/ec/eck_prn.c

@@ -207,7 +207,7 @@
 			reason = ERR_R_MALLOC_FAILURE;
 			goto err;
 			}
-
+#ifndef OPENSSL_NO_EC2M
 		if (is_char_two)
 			{
 			if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx))
@@ -217,6 +217,7 @@
 				}
 			}
 		else /* prime field */
+#endif
 			{
 			if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx))
 				{

diff --git a/crypto/ec/ecp_mont.c b/crypto/ec/ecp_mont.c
index 9fc4a46..079e474 100644
--- a/crypto/ec/ecp_mont.c
+++ b/crypto/ec/ecp_mont.c

@@ -63,12 +63,20 @@
 
 #include <openssl/err.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 #include "ec_lcl.h"
 
 
 const EC_METHOD *EC_GFp_mont_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gfp_mont_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_prime_field,
 		ec_GFp_mont_group_init,
 		ec_GFp_mont_group_finish,
@@ -87,9 +95,7 @@
 		ec_GFp_simple_get_Jprojective_coordinates_GFp,
 		ec_GFp_simple_point_set_affine_coordinates,
 		ec_GFp_simple_point_get_affine_coordinates,
-		ec_GFp_simple_set_compressed_coordinates,
-		ec_GFp_simple_point2oct,
-		ec_GFp_simple_oct2point,
+		0,0,0,
 		ec_GFp_simple_add,
 		ec_GFp_simple_dbl,
 		ec_GFp_simple_invert,
@@ -108,7 +114,9 @@
 		ec_GFp_mont_field_decode,
 		ec_GFp_mont_field_set_to_one };
 
+
 	return &ret;
+#endif
 	}
 
 

diff --git a/crypto/ec/ecp_nist.c b/crypto/ec/ecp_nist.c
index 2a5682e..aad2d5f 100644
--- a/crypto/ec/ecp_nist.c
+++ b/crypto/ec/ecp_nist.c

@@ -67,9 +67,17 @@
 #include <openssl/obj_mac.h>
 #include "ec_lcl.h"
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const EC_METHOD *EC_GFp_nist_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gfp_nist_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_prime_field,
 		ec_GFp_simple_group_init,
 		ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@
 		ec_GFp_simple_get_Jprojective_coordinates_GFp,
 		ec_GFp_simple_point_set_affine_coordinates,
 		ec_GFp_simple_point_get_affine_coordinates,
-		ec_GFp_simple_set_compressed_coordinates,
-		ec_GFp_simple_point2oct,
-		ec_GFp_simple_oct2point,
+		0,0,0,
 		ec_GFp_simple_add,
 		ec_GFp_simple_dbl,
 		ec_GFp_simple_invert,
@@ -110,6 +116,7 @@
 		0 /* field_set_to_one */ };
 
 	return &ret;
+#endif
 	}
 
 int ec_GFp_nist_group_copy(EC_GROUP *dest, const EC_GROUP *src)

diff --git a/crypto/ec/ecp_oct.c b/crypto/ec/ecp_oct.c
new file mode 100644
index 0000000..374a0ee
--- /dev/null
+++ b/crypto/ec/ecp_oct.c

@@ -0,0 +1,433 @@
+/* crypto/ec/ecp_oct.c */
+/* Includes code written by Lenka Fibikova <[email protected]>
+ * for the OpenSSL project. 
+ * Includes code written by Bodo Moeller for the OpenSSL project.
+*/
+/* ====================================================================
+ * Copyright (c) 1998-2002 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Portions of this software developed by SUN MICROSYSTEMS, INC.,
+ * and contributed to the OpenSSL project.
+ */
+
+#include <openssl/err.h>
+#include <openssl/symhacks.h>
+
+#include "ec_lcl.h"
+
+int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+	{
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *tmp1, *tmp2, *x, *y;
+	int ret = 0;
+
+	/* clear error queue*/
+	ERR_clear_error();
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	y_bit = (y_bit != 0);
+
+	BN_CTX_start(ctx);
+	tmp1 = BN_CTX_get(ctx);
+	tmp2 = BN_CTX_get(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	if (y == NULL) goto err;
+
+	/* Recover y.  We have a Weierstrass equation
+	 *     y^2 = x^3 + a*x + b,
+	 * so  y  is one of the square roots of  x^3 + a*x + b.
+	 */
+
+	/* tmp1 := x^3 */
+	if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
+	if (group->meth->field_decode == 0)
+		{
+		/* field_{sqr,mul} work on standard representation */
+		if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
+		if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
+		}
+	else
+		{
+		if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
+		if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
+		}
+	
+	/* tmp1 := tmp1 + a*x */
+	if (group->a_is_minus3)
+		{
+		if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
+		if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
+		if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+		}
+	else
+		{
+		if (group->meth->field_decode)
+			{
+			if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
+			if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
+			}
+		else
+			{
+			/* field_mul works on standard representation */
+			if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
+			}
+		
+		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+		}
+	
+	/* tmp1 := tmp1 + b */
+	if (group->meth->field_decode)
+		{
+		if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
+		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+		}
+	else
+		{
+		if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
+		}
+	
+	if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
+		{
+		unsigned long err = ERR_peek_last_error();
+		
+		if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
+			{
+			ERR_clear_error();
+			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+			}
+		else
+			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+		goto err;
+		}
+
+	if (y_bit != BN_is_odd(y))
+		{
+		if (BN_is_zero(y))
+			{
+			int kron;
+
+			kron = BN_kronecker(x, &group->field, ctx);
+			if (kron == -2) goto err;
+
+			if (kron == 1)
+				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
+			else
+				/* BN_mod_sqrt() should have cought this error (not a square) */
+				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+			goto err;
+			}
+		if (!BN_usub(y, &group->field, y)) goto err;
+		}
+	if (y_bit != BN_is_odd(y))
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
+		goto err;
+		}
+
+	if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+	ret = 1;
+
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
+
+size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+	unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	size_t ret;
+	BN_CTX *new_ctx = NULL;
+	int used_ctx = 0;
+	BIGNUM *x, *y;
+	size_t field_len, i, skip;
+
+	if ((form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+		goto err;
+		}
+
+	if (EC_POINT_is_at_infinity(group, point))
+		{
+		/* encodes to a single 0 octet */
+		if (buf != NULL)
+			{
+			if (len < 1)
+				{
+				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+				return 0;
+				}
+			buf[0] = 0;
+			}
+		return 1;
+		}
+
+
+	/* ret := required output buffer length */
+	field_len = BN_num_bytes(&group->field);
+	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	/* if 'buf' is NULL, just return required length */
+	if (buf != NULL)
+		{
+		if (len < ret)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+			goto err;
+			}
+
+		if (ctx == NULL)
+			{
+			ctx = new_ctx = BN_CTX_new();
+			if (ctx == NULL)
+				return 0;
+			}
+
+		BN_CTX_start(ctx);
+		used_ctx = 1;
+		x = BN_CTX_get(ctx);
+		y = BN_CTX_get(ctx);
+		if (y == NULL) goto err;
+
+		if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+		if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
+			buf[0] = form + 1;
+		else
+			buf[0] = form;
+	
+		i = 1;
+		
+		skip = field_len - BN_num_bytes(x);
+		if (skip > field_len)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		while (skip > 0)
+			{
+			buf[i++] = 0;
+			skip--;
+			}
+		skip = BN_bn2bin(x, buf + i);
+		i += skip;
+		if (i != 1 + field_len)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+
+		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+			{
+			skip = field_len - BN_num_bytes(y);
+			if (skip > field_len)
+				{
+				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			while (skip > 0)
+				{
+				buf[i++] = 0;
+				skip--;
+				}
+			skip = BN_bn2bin(y, buf + i);
+			i += skip;
+			}
+
+		if (i != ret)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		}
+	
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+
+ err:
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return 0;
+	}
+
+
+int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+	const unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	point_conversion_form_t form;
+	int y_bit;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y;
+	size_t field_len, enc_len;
+	int ret = 0;
+
+	if (len == 0)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+		return 0;
+		}
+	form = buf[0];
+	y_bit = form & 1;
+	form = form & ~1U;
+	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (form == 0)
+		{
+		if (len != 1)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			return 0;
+			}
+
+		return EC_POINT_set_to_infinity(group, point);
+		}
+	
+	field_len = BN_num_bytes(&group->field);
+	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	if (len != enc_len)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	BN_CTX_start(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	if (y == NULL) goto err;
+
+	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+	if (BN_ucmp(x, &group->field) >= 0)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		goto err;
+		}
+
+	if (form == POINT_CONVERSION_COMPRESSED)
+		{
+		if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
+		}
+	else
+		{
+		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+		if (BN_ucmp(y, &group->field) >= 0)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			goto err;
+			}
+		if (form == POINT_CONVERSION_HYBRID)
+			{
+			if (y_bit != BN_is_odd(y))
+				{
+				ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+				goto err;
+				}
+			}
+
+		if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+		}
+	
+	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+		goto err;
+		}
+
+	ret = 1;
+	
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+

diff --git a/crypto/ec/ecp_smpl.c b/crypto/ec/ecp_smpl.c
index 66a92e2..7cbb321 100644
--- a/crypto/ec/ecp_smpl.c
+++ b/crypto/ec/ecp_smpl.c

@@ -65,11 +65,19 @@
 #include <openssl/err.h>
 #include <openssl/symhacks.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 #include "ec_lcl.h"
 
 const EC_METHOD *EC_GFp_simple_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gfp_simple_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_prime_field,
 		ec_GFp_simple_group_init,
 		ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@
 		ec_GFp_simple_get_Jprojective_coordinates_GFp,
 		ec_GFp_simple_point_set_affine_coordinates,
 		ec_GFp_simple_point_get_affine_coordinates,
-		ec_GFp_simple_set_compressed_coordinates,
-		ec_GFp_simple_point2oct,
-		ec_GFp_simple_oct2point,
+		0,0,0,
 		ec_GFp_simple_add,
 		ec_GFp_simple_dbl,
 		ec_GFp_simple_invert,
@@ -110,6 +116,7 @@
 		0 /* field_set_to_one */ };
 
 	return &ret;
+#endif
 	}
 
 
@@ -633,372 +640,6 @@
 	return ret;
 	}
 
-
-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
-	{
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *tmp1, *tmp2, *x, *y;
-	int ret = 0;
-
-	/* clear error queue*/
-	ERR_clear_error();
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	y_bit = (y_bit != 0);
-
-	BN_CTX_start(ctx);
-	tmp1 = BN_CTX_get(ctx);
-	tmp2 = BN_CTX_get(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	if (y == NULL) goto err;
-
-	/* Recover y.  We have a Weierstrass equation
-	 *     y^2 = x^3 + a*x + b,
-	 * so  y  is one of the square roots of  x^3 + a*x + b.
-	 */
-
-	/* tmp1 := x^3 */
-	if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
-	if (group->meth->field_decode == 0)
-		{
-		/* field_{sqr,mul} work on standard representation */
-		if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
-		if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
-		}
-	else
-		{
-		if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
-		if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
-		}
-	
-	/* tmp1 := tmp1 + a*x */
-	if (group->a_is_minus3)
-		{
-		if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
-		if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
-		if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
-		}
-	else
-		{
-		if (group->meth->field_decode)
-			{
-			if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
-			if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
-			}
-		else
-			{
-			/* field_mul works on standard representation */
-			if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
-			}
-		
-		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
-		}
-	
-	/* tmp1 := tmp1 + b */
-	if (group->meth->field_decode)
-		{
-		if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
-		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
-		}
-	else
-		{
-		if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
-		}
-	
-	if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
-		{
-		unsigned long err = ERR_peek_last_error();
-		
-		if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
-			{
-			ERR_clear_error();
-			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
-			}
-		else
-			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
-		goto err;
-		}
-
-	if (y_bit != BN_is_odd(y))
-		{
-		if (BN_is_zero(y))
-			{
-			int kron;
-
-			kron = BN_kronecker(x, &group->field, ctx);
-			if (kron == -2) goto err;
-
-			if (kron == 1)
-				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
-			else
-				/* BN_mod_sqrt() should have cought this error (not a square) */
-				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
-			goto err;
-			}
-		if (!BN_usub(y, &group->field, y)) goto err;
-		}
-	if (y_bit != BN_is_odd(y))
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
-		goto err;
-		}
-
-	if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
-	ret = 1;
-
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
-size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
-	unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	size_t ret;
-	BN_CTX *new_ctx = NULL;
-	int used_ctx = 0;
-	BIGNUM *x, *y;
-	size_t field_len, i, skip;
-
-	if ((form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
-		goto err;
-		}
-
-	if (EC_POINT_is_at_infinity(group, point))
-		{
-		/* encodes to a single 0 octet */
-		if (buf != NULL)
-			{
-			if (len < 1)
-				{
-				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-				return 0;
-				}
-			buf[0] = 0;
-			}
-		return 1;
-		}
-
-
-	/* ret := required output buffer length */
-	field_len = BN_num_bytes(&group->field);
-	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	/* if 'buf' is NULL, just return required length */
-	if (buf != NULL)
-		{
-		if (len < ret)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-			goto err;
-			}
-
-		if (ctx == NULL)
-			{
-			ctx = new_ctx = BN_CTX_new();
-			if (ctx == NULL)
-				return 0;
-			}
-
-		BN_CTX_start(ctx);
-		used_ctx = 1;
-		x = BN_CTX_get(ctx);
-		y = BN_CTX_get(ctx);
-		if (y == NULL) goto err;
-
-		if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
-		if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
-			buf[0] = form + 1;
-		else
-			buf[0] = form;
-	
-		i = 1;
-		
-		skip = field_len - BN_num_bytes(x);
-		if (skip > field_len)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		while (skip > 0)
-			{
-			buf[i++] = 0;
-			skip--;
-			}
-		skip = BN_bn2bin(x, buf + i);
-		i += skip;
-		if (i != 1 + field_len)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-
-		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
-			{
-			skip = field_len - BN_num_bytes(y);
-			if (skip > field_len)
-				{
-				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-				goto err;
-				}
-			while (skip > 0)
-				{
-				buf[i++] = 0;
-				skip--;
-				}
-			skip = BN_bn2bin(y, buf + i);
-			i += skip;
-			}
-
-		if (i != ret)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		}
-	
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-
- err:
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return 0;
-	}
-
-
-int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
-	const unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	point_conversion_form_t form;
-	int y_bit;
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *x, *y;
-	size_t field_len, enc_len;
-	int ret = 0;
-
-	if (len == 0)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
-		return 0;
-		}
-	form = buf[0];
-	y_bit = form & 1;
-	form = form & ~1U;
-	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (form == 0)
-		{
-		if (len != 1)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			return 0;
-			}
-
-		return EC_POINT_set_to_infinity(group, point);
-		}
-	
-	field_len = BN_num_bytes(&group->field);
-	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	if (len != enc_len)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	BN_CTX_start(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	if (y == NULL) goto err;
-
-	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
-	if (BN_ucmp(x, &group->field) >= 0)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		goto err;
-		}
-
-	if (form == POINT_CONVERSION_COMPRESSED)
-		{
-		if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
-		}
-	else
-		{
-		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
-		if (BN_ucmp(y, &group->field) >= 0)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			goto err;
-			}
-		if (form == POINT_CONVERSION_HYBRID)
-			{
-			if (y_bit != BN_is_odd(y))
-				{
-				ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-				goto err;
-				}
-			}
-
-		if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-		}
-	
-	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
-		goto err;
-		}
-
-	ret = 1;
-	
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
 int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
 	{
 	int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *, BN_CTX *);

diff --git a/crypto/ec/ectest.c b/crypto/ec/ectest.c
index 7509cb9..f107782 100644
--- a/crypto/ec/ectest.c
+++ b/crypto/ec/ectest.c

@@ -94,6 +94,7 @@
 #include <openssl/objects.h>
 #include <openssl/rand.h>
 #include <openssl/bn.h>
+#include <openssl/opensslconf.h>
 
 #if defined(_MSC_VER) && defined(_MIPS_) && (_MSC_VER/100==12)
 /* suppress "too big too optimize" warning */
@@ -107,10 +108,6 @@
 	EXIT(1); \
 } while (0)
 
-void prime_field_tests(void);
-void char2_field_tests(void);
-void internal_curve_test(void);
-
 #define TIMING_BASE_PT 0
 #define TIMING_RAND_PT 1
 #define TIMING_SIMUL 2
@@ -195,7 +192,50 @@
 	}
 #endif
 
-void prime_field_tests()
+/* test multiplication with group order, long and negative scalars */
+static void group_order_tests(EC_GROUP *group)
+	{
+	BIGNUM *n1, *n2, *order;
+	EC_POINT *P = EC_POINT_new(group);
+	EC_POINT *Q = EC_POINT_new(group);
+	BN_CTX *ctx = BN_CTX_new();
+
+	n1 = BN_new(); n2 = BN_new(); order = BN_new();
+	fprintf(stdout, "verify group order ...");
+	fflush(stdout);
+	if (!EC_GROUP_get_order(group, order, ctx)) ABORT;
+	if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+	fprintf(stdout, ".");
+	fflush(stdout);
+	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
+	if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+	fprintf(stdout, " ok\n");
+	fprintf(stdout, "long/negative scalar tests ... ");
+	if (!BN_one(n1)) ABORT;
+	/* n1 = 1 - order */
+	if (!BN_sub(n1, n1, order)) ABORT;
+	if(!EC_POINT_mul(group, Q, NULL, P, n1, ctx)) ABORT;
+	if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+	/* n2 = 1 + order */
+	if (!BN_add(n2, order, BN_value_one())) ABORT;
+	if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+	if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+	/* n2 = (1 - order) * (1 + order) */
+	if (!BN_mul(n2, n1, n2, ctx)) ABORT;
+	if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+	if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+	fprintf(stdout, "ok\n");
+	EC_POINT_free(P);
+	EC_POINT_free(Q);
+	BN_free(n1);
+	BN_free(n2);
+	BN_free(order);
+	BN_CTX_free(ctx);
+	}
+
+static void prime_field_tests(void)
 	{	
 	BN_CTX *ctx = NULL;
 	BIGNUM *p, *a, *b;
@@ -321,21 +361,21 @@
 	if (len == 0) ABORT;
 	if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
 	if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
-	fprintf(stdout, "Generator as octect string, compressed form:\n     ");
+	fprintf(stdout, "Generator as octet string, compressed form:\n     ");
 	for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
 	
 	len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_UNCOMPRESSED, buf, sizeof buf, ctx);
 	if (len == 0) ABORT;
 	if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
 	if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
-	fprintf(stdout, "\nGenerator as octect string, uncompressed form:\n     ");
+	fprintf(stdout, "\nGenerator as octet string, uncompressed form:\n     ");
 	for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
 	
 	len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_HYBRID, buf, sizeof buf, ctx);
 	if (len == 0) ABORT;
 	if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
 	if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
-	fprintf(stdout, "\nGenerator as octect string, hybrid form:\n     ");
+	fprintf(stdout, "\nGenerator as octet string, hybrid form:\n     ");
 	for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
 	
 	if (!EC_POINT_get_Jprojective_coordinates_GFp(group, R, x, y, z, ctx)) ABORT;
@@ -381,17 +421,7 @@
 	if (EC_GROUP_get_degree(group) != 160) ABORT;
 	fprintf(stdout, " ok\n");
 	
-	fprintf(stdout, "verify group order ...");
-	fflush(stdout);
-	if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, ".");
-	fflush(stdout);
-	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, " ok\n");
+	group_order_tests(group);
 
 	if (!(P_160 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
 	if (!EC_GROUP_copy(P_160, group)) ABORT;
@@ -425,17 +455,7 @@
 	if (EC_GROUP_get_degree(group) != 192) ABORT;
 	fprintf(stdout, " ok\n");
 	
-	fprintf(stdout, "verify group order ...");
-	fflush(stdout);
-	if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, ".");
-	fflush(stdout);
-	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, " ok\n");
+	group_order_tests(group);
 
 	if (!(P_192 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
 	if (!EC_GROUP_copy(P_192, group)) ABORT;
@@ -469,17 +489,7 @@
 	if (EC_GROUP_get_degree(group) != 224) ABORT;
 	fprintf(stdout, " ok\n");
 	
-	fprintf(stdout, "verify group order ...");
-	fflush(stdout);
-	if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, ".");
-	fflush(stdout);
-	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, " ok\n");
+	group_order_tests(group);
 
 	if (!(P_224 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
 	if (!EC_GROUP_copy(P_224, group)) ABORT;
@@ -514,17 +524,7 @@
 	if (EC_GROUP_get_degree(group) != 256) ABORT;
 	fprintf(stdout, " ok\n");
 	
-	fprintf(stdout, "verify group order ...");
-	fflush(stdout);
-	if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, ".");
-	fflush(stdout);
-	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, " ok\n");
+	group_order_tests(group);
 
 	if (!(P_256 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
 	if (!EC_GROUP_copy(P_256, group)) ABORT;
@@ -563,18 +563,8 @@
 	fprintf(stdout, "verify degree ...");
 	if (EC_GROUP_get_degree(group) != 384) ABORT;
 	fprintf(stdout, " ok\n");
-	
-	fprintf(stdout, "verify group order ...");
-	fflush(stdout);
-	if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, ".");
-	fflush(stdout);
-	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, " ok\n");
+
+	group_order_tests(group);
 
 	if (!(P_384 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
 	if (!EC_GROUP_copy(P_384, group)) ABORT;
@@ -619,18 +609,8 @@
 	fprintf(stdout, "verify degree ...");
 	if (EC_GROUP_get_degree(group) != 521) ABORT;
 	fprintf(stdout, " ok\n");
-	
-	fprintf(stdout, "verify group order ...");
-	fflush(stdout);
-	if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, ".");
-	fflush(stdout);
-	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
-	fprintf(stdout, " ok\n");
+
+ 	group_order_tests(group);
 
 	if (!(P_521 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
 	if (!EC_GROUP_copy(P_521, group)) ABORT;
@@ -659,6 +639,7 @@
 		points[2] = Q;
 		points[3] = Q;
 
+		if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
 		if (!BN_add(y, z, BN_value_one())) ABORT;
 		if (BN_is_odd(y)) ABORT;
 		if (!BN_rshift1(y, y)) ABORT;
@@ -792,22 +773,14 @@
 	fprintf(stdout, "verify degree ..."); \
 	if (EC_GROUP_get_degree(group) != _degree) ABORT; \
 	fprintf(stdout, " ok\n"); \
-	fprintf(stdout, "verify group order ..."); \
-	fflush(stdout); \
-	if (!EC_GROUP_get_order(group, z, ctx)) ABORT; \
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
-	fprintf(stdout, "."); \
-	fflush(stdout); \
-	if (!EC_GROUP_precompute_mult(group, ctx)) ABORT; \
-	if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
-	if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
-	fprintf(stdout, " ok\n"); \
+	group_order_tests(group); \
 	if (!(_variable = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT; \
-	if (!EC_GROUP_copy(_variable, group)) ABORT;
+	if (!EC_GROUP_copy(_variable, group)) ABORT; \
 
-void char2_field_tests()
-	{	
+#ifndef OPENSSL_NO_EC2M
+
+static void char2_field_tests(void)
+	{
 	BN_CTX *ctx = NULL;
 	BIGNUM *p, *a, *b;
 	EC_GROUP *group;
@@ -1239,8 +1212,9 @@
 	if (C2_B571) EC_GROUP_free(C2_B571);
 
 	}
+#endif
 
-void internal_curve_test(void)
+static void internal_curve_test(void)
 	{
 	EC_builtin_curve *curves = NULL;
 	size_t crv_len = 0, n = 0;
@@ -1287,13 +1261,189 @@
 		EC_GROUP_free(group);
 		}
 	if (ok)
-		fprintf(stdout, " ok\n");
+		fprintf(stdout, " ok\n\n");
 	else
-		fprintf(stdout, " failed\n");
+		{
+		fprintf(stdout, " failed\n\n");
+		ABORT;
+		}
 	OPENSSL_free(curves);
 	return;
 	}
 
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* nistp_test_params contains magic numbers for testing our optimized
+ * implementations of several NIST curves with characteristic > 3. */
+struct nistp_test_params
+	{
+	const EC_METHOD* (*meth) ();
+	int degree;
+	/* Qx, Qy and D are taken from
+	 * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/ECDSA_Prime.pdf
+	 * Otherwise, values are standard curve parameters from FIPS 180-3 */
+	const char *p, *a, *b, *Qx, *Qy, *Gx, *Gy, *order, *d;
+	};
+
+static const struct nistp_test_params nistp_tests_params[] =
+	{
+		{
+		/* P-224 */
+		EC_GFp_nistp224_method,
+		224,
+		"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000000000000000001", /* p */
+		"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFE", /* a */
+		"B4050A850C04B3ABF54132565044B0B7D7BFD8BA270B39432355FFB4", /* b */
+		"E84FB0B8E7000CB657D7973CF6B42ED78B301674276DF744AF130B3E", /* Qx */
+		"4376675C6FC5612C21A0FF2D2A89D2987DF7A2BC52183B5982298555", /* Qy */
+		"B70E0CBD6BB4BF7F321390B94A03C1D356C21122343280D6115C1D21", /* Gx */
+		"BD376388B5F723FB4C22DFE6CD4375A05A07476444D5819985007E34", /* Gy */
+		"FFFFFFFFFFFFFFFFFFFFFFFFFFFF16A2E0B8F03E13DD29455C5C2A3D", /* order */
+		"3F0C488E987C80BE0FEE521F8D90BE6034EC69AE11CA72AA777481E8", /* d */
+		},
+		{
+		/* P-256 */
+		EC_GFp_nistp256_method,
+		256,
+		"ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", /* p */
+		"ffffffff00000001000000000000000000000000fffffffffffffffffffffffc", /* a */
+		"5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", /* b */
+		"b7e08afdfe94bad3f1dc8c734798ba1c62b3a0ad1e9ea2a38201cd0889bc7a19", /* Qx */
+		"3603f747959dbf7a4bb226e41928729063adc7ae43529e61b563bbc606cc5e09", /* Qy */
+		"6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", /* Gx */
+		"4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", /* Gy */
+		"ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", /* order */
+		"c477f9f65c22cce20657faa5b2d1d8122336f851a508a1ed04e479c34985bf96", /* d */
+		},
+		{
+		/* P-521 */
+		EC_GFp_nistp521_method,
+		521,
+		"1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", /* p */
+		"1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffc", /* a */
+		"051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00", /* b */
+		"0098e91eef9a68452822309c52fab453f5f117c1da8ed796b255e9ab8f6410cca16e59df403a6bdc6ca467a37056b1e54b3005d8ac030decfeb68df18b171885d5c4", /* Qx */
+		"0164350c321aecfc1cca1ba4364c9b15656150b4b78d6a48d7d28e7f31985ef17be8554376b72900712c4b83ad668327231526e313f5f092999a4632fd50d946bc2e", /* Qy */
+		"c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66", /* Gx */
+		"11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650", /* Gy */
+		"1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409", /* order */
+		"0100085f47b8e1b8b11b7eb33028c0b2888e304bfc98501955b45bba1478dc184eeedf09b86a5f7c21994406072787205e69a63709fe35aa93ba333514b24f961722", /* d */
+		},
+	};
+
+void nistp_single_test(const struct nistp_test_params *test)
+	{
+	BN_CTX *ctx;
+	BIGNUM *p, *a, *b, *x, *y, *n, *m, *order;
+	EC_GROUP *NISTP;
+	EC_POINT *G, *P, *Q, *Q_CHECK;
+
+	fprintf(stdout, "\nNIST curve P-%d (optimised implementation):\n", test->degree);
+	ctx = BN_CTX_new();
+	p = BN_new();
+	a = BN_new();
+	b = BN_new();
+	x = BN_new(); y = BN_new();
+	m = BN_new(); n = BN_new(); order = BN_new();
+
+	NISTP = EC_GROUP_new(test->meth());
+	if(!NISTP) ABORT;
+	if (!BN_hex2bn(&p, test->p)) ABORT;
+	if (1 != BN_is_prime_ex(p, BN_prime_checks, ctx, NULL)) ABORT;
+	if (!BN_hex2bn(&a, test->a)) ABORT;
+	if (!BN_hex2bn(&b, test->b)) ABORT;
+	if (!EC_GROUP_set_curve_GFp(NISTP, p, a, b, ctx)) ABORT;
+	G = EC_POINT_new(NISTP);
+	P = EC_POINT_new(NISTP);
+	Q = EC_POINT_new(NISTP);
+	Q_CHECK = EC_POINT_new(NISTP);
+	if(!BN_hex2bn(&x, test->Qx)) ABORT;
+	if(!BN_hex2bn(&y, test->Qy)) ABORT;
+	if(!EC_POINT_set_affine_coordinates_GFp(NISTP, Q_CHECK, x, y, ctx)) ABORT;
+	if (!BN_hex2bn(&x, test->Gx)) ABORT;
+	if (!BN_hex2bn(&y, test->Gy)) ABORT;
+	if (!EC_POINT_set_affine_coordinates_GFp(NISTP, G, x, y, ctx)) ABORT;
+	if (!BN_hex2bn(&order, test->order)) ABORT;
+	if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+
+	fprintf(stdout, "verify degree ... ");
+	if (EC_GROUP_get_degree(NISTP) != test->degree) ABORT;
+	fprintf(stdout, "ok\n");
+
+	fprintf(stdout, "NIST test vectors ... ");
+	if (!BN_hex2bn(&n, test->d)) ABORT;
+	/* fixed point multiplication */
+	EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+	/* random point multiplication */
+	EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+	/* set generator to P = 2*G, where G is the standard generator */
+	if (!EC_POINT_dbl(NISTP, P, G, ctx)) ABORT;
+	if (!EC_GROUP_set_generator(NISTP, P, order, BN_value_one())) ABORT;
+	/* set the scalar to m=n/2, where n is the NIST test scalar */
+	if (!BN_rshift(m, n, 1)) ABORT;
+
+	/* test the non-standard generator */
+	/* fixed point multiplication */
+	EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+	/* random point multiplication */
+	EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+	/* now repeat all tests with precomputation */
+	if (!EC_GROUP_precompute_mult(NISTP, ctx)) ABORT;
+
+	/* fixed point multiplication */
+	EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+	/* random point multiplication */
+	EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+	/* reset generator */
+	if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+	/* fixed point multiplication */
+	EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+	/* random point multiplication */
+	EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+	if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+	fprintf(stdout, "ok\n");
+	group_order_tests(NISTP);
+#if 0
+	timings(NISTP, TIMING_BASE_PT, ctx);
+	timings(NISTP, TIMING_RAND_PT, ctx);
+#endif
+	EC_GROUP_free(NISTP);
+	EC_POINT_free(G);
+	EC_POINT_free(P);
+	EC_POINT_free(Q);
+	EC_POINT_free(Q_CHECK);
+	BN_free(n);
+	BN_free(m);
+	BN_free(p);
+	BN_free(a);
+	BN_free(b);
+	BN_free(x);
+	BN_free(y);
+	BN_free(order);
+	BN_CTX_free(ctx);
+	}
+
+void nistp_tests()
+	{
+	unsigned i;
+
+	for (i = 0; i < sizeof(nistp_tests_params) / sizeof(struct nistp_test_params); i++)
+		{
+		nistp_single_test(&nistp_tests_params[i]);
+		}
+	}
+#endif
+
 static const char rnd_seed[] = "string to make the random number generator think it has entropy";
 
 int main(int argc, char *argv[])
@@ -1317,7 +1467,12 @@
 
 	prime_field_tests();
 	puts("");
+#ifndef OPENSSL_NO_EC2M
 	char2_field_tests();
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+	nistp_tests();
+#endif
 	/* test the internal curves */
 	internal_curve_test();
 

diff --git a/crypto/ecdh/ecdh.h b/crypto/ecdh/ecdh.h
index b4b58ee..8887102 100644
--- a/crypto/ecdh/ecdh.h
+++ b/crypto/ecdh/ecdh.h

@@ -109,11 +109,13 @@
 /* Error codes for the ECDH functions. */
 
 /* Function codes. */
+#define ECDH_F_ECDH_CHECK				 102
 #define ECDH_F_ECDH_COMPUTE_KEY				 100
 #define ECDH_F_ECDH_DATA_NEW_METHOD			 101
 
 /* Reason codes. */
 #define ECDH_R_KDF_FAILED				 102
+#define ECDH_R_NON_FIPS_METHOD				 103
 #define ECDH_R_NO_PRIVATE_VALUE				 100
 #define ECDH_R_POINT_ARITHMETIC_FAILURE			 101
 

diff --git a/crypto/ecdh/ecdhtest.c b/crypto/ecdh/ecdhtest.c
index 212a87e..823d7ba 100644
--- a/crypto/ecdh/ecdhtest.c
+++ b/crypto/ecdh/ecdhtest.c

@@ -158,11 +158,13 @@
 		if (!EC_POINT_get_affine_coordinates_GFp(group,
 			EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
 		}
+#ifndef OPENSSL_NO_EC2M
 	else
 		{
 		if (!EC_POINT_get_affine_coordinates_GF2m(group,
 			EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
 		}
+#endif
 #ifdef NOISY
 	BIO_puts(out,"  pri 1=");
 	BN_print(out,a->priv_key);
@@ -183,11 +185,13 @@
 		if (!EC_POINT_get_affine_coordinates_GFp(group, 
 			EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
 		}
+#ifndef OPENSSL_NO_EC2M
 	else
 		{
 		if (!EC_POINT_get_affine_coordinates_GF2m(group, 
 			EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
 		}
+#endif
 
 #ifdef NOISY
 	BIO_puts(out,"  pri 2=");
@@ -324,6 +328,7 @@
 	if (!test_ecdh_curve(NID_X9_62_prime256v1, "NIST Prime-Curve P-256", ctx, out)) goto err;
 	if (!test_ecdh_curve(NID_secp384r1, "NIST Prime-Curve P-384", ctx, out)) goto err;
 	if (!test_ecdh_curve(NID_secp521r1, "NIST Prime-Curve P-521", ctx, out)) goto err;
+#ifndef OPENSSL_NO_EC2M
 	/* NIST BINARY CURVES TESTS */
 	if (!test_ecdh_curve(NID_sect163k1, "NIST Binary-Curve K-163", ctx, out)) goto err;
 	if (!test_ecdh_curve(NID_sect163r2, "NIST Binary-Curve B-163", ctx, out)) goto err;
@@ -335,6 +340,7 @@
 	if (!test_ecdh_curve(NID_sect409r1, "NIST Binary-Curve B-409", ctx, out)) goto err;
 	if (!test_ecdh_curve(NID_sect571k1, "NIST Binary-Curve K-571", ctx, out)) goto err;
 	if (!test_ecdh_curve(NID_sect571r1, "NIST Binary-Curve B-571", ctx, out)) goto err;
+#endif
 
 	ret = 0;
 

diff --git a/crypto/ecdh/ech_err.c b/crypto/ecdh/ech_err.c
index 6f4b0c9..3bd2473 100644
--- a/crypto/ecdh/ech_err.c
+++ b/crypto/ecdh/ech_err.c

@@ -1,6 +1,6 @@
 /* crypto/ecdh/ech_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
 
 static ERR_STRING_DATA ECDH_str_functs[]=
 	{
+{ERR_FUNC(ECDH_F_ECDH_CHECK),	"ECDH_CHECK"},
 {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY),	"ECDH_compute_key"},
 {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD),	"ECDH_DATA_new_method"},
 {0,NULL}
@@ -78,6 +79,7 @@
 static ERR_STRING_DATA ECDH_str_reasons[]=
 	{
 {ERR_REASON(ECDH_R_KDF_FAILED)           ,"KDF failed"},
+{ERR_REASON(ECDH_R_NON_FIPS_METHOD)      ,"non fips method"},
 {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE)     ,"no private value"},
 {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"},
 {0,NULL}

diff --git a/crypto/ecdh/ech_lib.c b/crypto/ecdh/ech_lib.c
index 4d8ea03..dadbfd3 100644
--- a/crypto/ecdh/ech_lib.c
+++ b/crypto/ecdh/ech_lib.c

@@ -73,6 +73,9 @@
 #include <openssl/engine.h>
 #endif
 #include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT;
 
@@ -90,7 +93,16 @@
 const ECDH_METHOD *ECDH_get_default_method(void)
 	{
 	if(!default_ECDH_method) 
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_ecdh_openssl();
+		else
+			return ECDH_OpenSSL();
+#else
 		default_ECDH_method = ECDH_OpenSSL();
+#endif
+		}
 	return default_ECDH_method;
 	}
 
@@ -215,6 +227,14 @@
 	}
 	else
 		ecdh_data = (ECDH_DATA *)data;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD)
+			&& !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+		{
+		ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD);
+		return NULL;
+		}
+#endif
 	
 
 	return ecdh_data;

diff --git a/crypto/ecdh/ech_locl.h b/crypto/ecdh/ech_locl.h
index f658526..f6cad6a 100644
--- a/crypto/ecdh/ech_locl.h
+++ b/crypto/ecdh/ech_locl.h

@@ -75,6 +75,14 @@
 	char *app_data;
 	};
 
+/* If this flag is set the ECDH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDH_FLAG_FIPS_METHOD	0x1
+
 typedef struct ecdh_data_st {
 	/* EC_KEY_METH_DATA part */
 	int (*init)(EC_KEY *);

diff --git a/crypto/ecdh/ech_ossl.c b/crypto/ecdh/ech_ossl.c
index 2a40ff1..4a30628 100644
--- a/crypto/ecdh/ech_ossl.c
+++ b/crypto/ecdh/ech_ossl.c

@@ -157,6 +157,7 @@
 			goto err;
 			}
 		}
+#ifndef OPENSSL_NO_EC2M
 	else
 		{
 		if (!EC_POINT_get_affine_coordinates_GF2m(group, tmp, x, y, ctx)) 
@@ -165,6 +166,7 @@
 			goto err;
 			}
 		}
+#endif
 
 	buflen = (EC_GROUP_get_degree(group) + 7)/8;
 	len = BN_num_bytes(x);

diff --git a/crypto/ecdsa/ecdsa.h b/crypto/ecdsa/ecdsa.h
index e61c539..7fb5254 100644
--- a/crypto/ecdsa/ecdsa.h
+++ b/crypto/ecdsa/ecdsa.h

@@ -238,6 +238,7 @@
 /* Error codes for the ECDSA functions. */
 
 /* Function codes. */
+#define ECDSA_F_ECDSA_CHECK				 104
 #define ECDSA_F_ECDSA_DATA_NEW_METHOD			 100
 #define ECDSA_F_ECDSA_DO_SIGN				 101
 #define ECDSA_F_ECDSA_DO_VERIFY				 102
@@ -249,6 +250,7 @@
 #define ECDSA_R_ERR_EC_LIB				 102
 #define ECDSA_R_MISSING_PARAMETERS			 103
 #define ECDSA_R_NEED_NEW_SETUP_VALUES			 106
+#define ECDSA_R_NON_FIPS_METHOD				 107
 #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED		 104
 #define ECDSA_R_SIGNATURE_MALLOC_FAILED			 105
 

diff --git a/crypto/ecdsa/ecdsatest.c b/crypto/ecdsa/ecdsatest.c
index c3c20c3..537bb30 100644
--- a/crypto/ecdsa/ecdsatest.c
+++ b/crypto/ecdsa/ecdsatest.c

@@ -262,6 +262,7 @@
 		"3238135532097973577080787768312505059318910517550078427819"
 		"78505179448783"))
 		goto x962_err;
+#ifndef OPENSSL_NO_EC2M
 	if (!x9_62_test_internal(out, NID_X9_62_c2tnb191v1,
 		"87194383164871543355722284926904419997237591535066528048",
 		"308992691965804947361541664549085895292153777025772063598"))
@@ -272,7 +273,7 @@
 		"1970303740007316867383349976549972270528498040721988191026"
 		"49413465737174"))
 		goto x962_err;
-
+#endif
 	ret = 1;
 x962_err:
 	if (!restore_rand())

diff --git a/crypto/ecdsa/ecs_err.c b/crypto/ecdsa/ecs_err.c
index 98e38d5..81542e6 100644
--- a/crypto/ecdsa/ecs_err.c
+++ b/crypto/ecdsa/ecs_err.c

@@ -1,6 +1,6 @@
 /* crypto/ecdsa/ecs_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
 
 static ERR_STRING_DATA ECDSA_str_functs[]=
 	{
+{ERR_FUNC(ECDSA_F_ECDSA_CHECK),	"ECDSA_CHECK"},
 {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD),	"ECDSA_DATA_NEW_METHOD"},
 {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN),	"ECDSA_do_sign"},
 {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY),	"ECDSA_do_verify"},
@@ -84,6 +85,7 @@
 {ERR_REASON(ECDSA_R_ERR_EC_LIB)          ,"err ec lib"},
 {ERR_REASON(ECDSA_R_MISSING_PARAMETERS)  ,"missing parameters"},
 {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"},
+{ERR_REASON(ECDSA_R_NON_FIPS_METHOD)     ,"non fips method"},
 {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"},
 {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"},
 {0,NULL}

diff --git a/crypto/ecdsa/ecs_lib.c b/crypto/ecdsa/ecs_lib.c
index 2ebae3a..e477da4 100644
--- a/crypto/ecdsa/ecs_lib.c
+++ b/crypto/ecdsa/ecs_lib.c

@@ -60,6 +60,9 @@
 #endif
 #include <openssl/err.h>
 #include <openssl/bn.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT;
 
@@ -77,7 +80,16 @@
 const ECDSA_METHOD *ECDSA_get_default_method(void)
 {
 	if(!default_ECDSA_method) 
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_ecdsa_openssl();
+		else
+			return ECDSA_OpenSSL();
+#else
 		default_ECDSA_method = ECDSA_OpenSSL();
+#endif
+		}
 	return default_ECDSA_method;
 }
 
@@ -193,7 +205,14 @@
 	}
 	else
 		ecdsa_data = (ECDSA_DATA *)data;
-	
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD)
+			&& !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+		{
+		ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD);
+		return NULL;
+		}
+#endif
 
 	return ecdsa_data;
 }

diff --git a/crypto/ecdsa/ecs_locl.h b/crypto/ecdsa/ecs_locl.h
index 3a69a84..cb3be13 100644
--- a/crypto/ecdsa/ecs_locl.h
+++ b/crypto/ecdsa/ecs_locl.h

@@ -82,6 +82,14 @@
 	char *app_data;
 	};
 
+/* If this flag is set the ECDSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDSA_FLAG_FIPS_METHOD	0x1
+
 typedef struct ecdsa_data_st {
 	/* EC_KEY_METH_DATA part */
 	int (*init)(EC_KEY *);

diff --git a/crypto/ecdsa/ecs_ossl.c b/crypto/ecdsa/ecs_ossl.c
index 1bbf328..7725935 100644
--- a/crypto/ecdsa/ecs_ossl.c
+++ b/crypto/ecdsa/ecs_ossl.c

@@ -167,6 +167,7 @@
 				goto err;
 			}
 		}
+#ifndef OPENSSL_NO_EC2M
 		else /* NID_X9_62_characteristic_two_field */
 		{
 			if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -176,6 +177,7 @@
 				goto err;
 			}
 		}
+#endif
 		if (!BN_nnmod(r, X, order, ctx))
 		{
 			ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
@@ -454,6 +456,7 @@
 			goto err;
 		}
 	}
+#ifndef OPENSSL_NO_EC2M
 	else /* NID_X9_62_characteristic_two_field */
 	{
 		if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -463,7 +466,7 @@
 			goto err;
 		}
 	}
-	
+#endif	
 	if (!BN_nnmod(u1, X, order, ctx))
 	{
 		ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);

diff --git a/crypto/engine/eng_all.c b/crypto/engine/eng_all.c
index 22c1204..6093376 100644
--- a/crypto/engine/eng_all.c
+++ b/crypto/engine/eng_all.c

@@ -61,6 +61,8 @@
 
 void ENGINE_load_builtin_engines(void)
 	{
+	/* Some ENGINEs need this */
+	OPENSSL_cpuid_setup();
 #if 0
 	/* There's no longer any need for an "openssl" ENGINE unless, one day,
 	 * it is the *only* way for standard builtin implementations to be be
@@ -71,6 +73,12 @@
 #if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
 	ENGINE_load_cryptodev();
 #endif
+#ifndef OPENSSL_NO_RSAX
+	ENGINE_load_rsax();
+#endif
+#ifndef OPENSSL_NO_RDRAND
+	ENGINE_load_rdrand();
+#endif
 	ENGINE_load_dynamic();
 #ifndef OPENSSL_NO_STATIC_ENGINE
 #ifndef OPENSSL_NO_HW
@@ -112,6 +120,7 @@
 	ENGINE_load_capi();
 #endif
 #endif
+	ENGINE_register_all_complete();
 	}
 
 #if defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)

diff --git a/crypto/engine/eng_fat.c b/crypto/engine/eng_fat.c
index db66e62..789b8d5 100644
--- a/crypto/engine/eng_fat.c
+++ b/crypto/engine/eng_fat.c

@@ -176,6 +176,7 @@
 	ENGINE *e;
 
 	for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
-		ENGINE_register_complete(e);
+		if (!(e->flags & ENGINE_FLAGS_NO_REGISTER_ALL))
+			ENGINE_register_complete(e);
 	return 1;
 	}

diff --git a/crypto/engine/engine.h b/crypto/engine/engine.h
index 943aeae..f8be497 100644
--- a/crypto/engine/engine.h
+++ b/crypto/engine/engine.h

@@ -141,6 +141,13 @@
  * the existing ENGINE's structural reference count. */
 #define ENGINE_FLAGS_BY_ID_COPY		(int)0x0004
 
+/* This flag if for an ENGINE that does not want its methods registered as 
+ * part of ENGINE_register_all_complete() for example if the methods are
+ * not usable as default methods.
+ */
+
+#define ENGINE_FLAGS_NO_REGISTER_ALL	(int)0x0008
+
 /* ENGINEs can support their own command types, and these flags are used in
  * ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each
  * command expects. Currently only numeric and string input is supported. If a
@@ -344,6 +351,8 @@
 #endif
 #endif
 void ENGINE_load_cryptodev(void);
+void ENGINE_load_rsax(void);
+void ENGINE_load_rdrand(void);
 void ENGINE_load_builtin_engines(void);
 
 /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation

diff --git a/crypto/err/err.c b/crypto/err/err.c
index 69713a6..fcdb244 100644
--- a/crypto/err/err.c
+++ b/crypto/err/err.c

@@ -1066,6 +1066,13 @@
 void ERR_add_error_data(int num, ...)
 	{
 	va_list args;
+	va_start(args, num);
+	ERR_add_error_vdata(num, args);
+	va_end(args);
+	}
+
+void ERR_add_error_vdata(int num, va_list args)
+	{
 	int i,n,s;
 	char *str,*p,*a;
 
@@ -1074,7 +1081,6 @@
 	if (str == NULL) return;
 	str[0]='\0';
 
-	va_start(args, num);
 	n=0;
 	for (i=0; i<num; i++)
 		{
@@ -1090,7 +1096,7 @@
 				if (p == NULL)
 					{
 					OPENSSL_free(str);
-					goto err;
+					return;
 					}
 				else
 					str=p;
@@ -1099,9 +1105,6 @@
 			}
 		}
 	ERR_set_error_data(str,ERR_TXT_MALLOCED|ERR_TXT_STRING);
-
-err:
-	va_end(args);
 	}
 
 int ERR_set_mark(void)

diff --git a/crypto/err/err.h b/crypto/err/err.h
index b9f8c16..974cc9c 100644
--- a/crypto/err/err.h
+++ b/crypto/err/err.h

@@ -344,8 +344,9 @@
 #endif
 #ifndef OPENSSL_NO_BIO
 void ERR_print_errors(BIO *bp);
-void ERR_add_error_data(int num, ...);
 #endif
+void ERR_add_error_data(int num, ...);
+void ERR_add_error_vdata(int num, va_list args);
 void ERR_load_strings(int lib,ERR_STRING_DATA str[]);
 void ERR_unload_strings(int lib,ERR_STRING_DATA str[]);
 void ERR_load_ERR_strings(void);

diff --git a/crypto/err/err_all.c b/crypto/err/err_all.c
index fc049e8..bd8946d 100644
--- a/crypto/err/err_all.c
+++ b/crypto/err/err_all.c

@@ -104,6 +104,10 @@
 #endif
 #include <openssl/comp.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 void ERR_load_crypto_strings(void)
 	{
 #ifndef OPENSSL_NO_ERR
@@ -157,4 +161,7 @@
 #endif
 	ERR_load_COMP_strings();
 #endif
+#ifdef OPENSSL_FIPS
+	ERR_load_FIPS_strings();
+#endif
 	}

diff --git a/crypto/evp/bio_md.c b/crypto/evp/bio_md.c
index 9841e32..144fdfd 100644
--- a/crypto/evp/bio_md.c
+++ b/crypto/evp/bio_md.c

@@ -153,8 +153,12 @@
 		{
 		if (ret > 0)
 			{
-			EVP_DigestUpdate(ctx,(const unsigned char *)in,
-				(unsigned int)ret);
+			if (!EVP_DigestUpdate(ctx,(const unsigned char *)in,
+				(unsigned int)ret))
+				{
+				BIO_clear_retry_flags(b);
+				return 0;
+				}
 			}
 		}
 	if(b->next_bio != NULL)
@@ -220,7 +224,8 @@
 	case BIO_CTRL_DUP:
 		dbio=ptr;
 		dctx=dbio->ptr;
-		EVP_MD_CTX_copy_ex(dctx,ctx);
+		if (!EVP_MD_CTX_copy_ex(dctx,ctx))
+			return 0;
 		b->init=1;
 		break;
 	default:

diff --git a/crypto/evp/bio_ok.c b/crypto/evp/bio_ok.c
index 98bc1ab..e643353 100644
--- a/crypto/evp/bio_ok.c
+++ b/crypto/evp/bio_ok.c

@@ -133,10 +133,10 @@
 static int ok_free(BIO *data);
 static long ok_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp);
 
-static void sig_out(BIO* b);
-static void sig_in(BIO* b);
-static void block_out(BIO* b);
-static void block_in(BIO* b);
+static int sig_out(BIO* b);
+static int sig_in(BIO* b);
+static int block_out(BIO* b);
+static int block_in(BIO* b);
 #define OK_BLOCK_SIZE	(1024*4)
 #define OK_BLOCK_BLOCK	4
 #define IOBS		(OK_BLOCK_SIZE+ OK_BLOCK_BLOCK+ 3*EVP_MAX_MD_SIZE)
@@ -266,10 +266,24 @@
 		ctx->buf_len+= i;
 
 		/* no signature yet -- check if we got one */
-		if (ctx->sigio == 1) sig_in(b);
+		if (ctx->sigio == 1)
+			{
+			if (!sig_in(b))
+				{
+				BIO_clear_retry_flags(b);
+				return 0;
+				}
+			}
 
 		/* signature ok -- check if we got block */
-		if (ctx->sigio == 0) block_in(b);
+		if (ctx->sigio == 0)
+			{
+			if (!block_in(b))
+				{
+				BIO_clear_retry_flags(b);
+				return 0;
+				}
+			}
 
 		/* invalid block -- cancel */
 		if (ctx->cont <= 0) break;
@@ -293,7 +307,8 @@
 
 	if ((ctx == NULL) || (b->next_bio == NULL) || (b->init == 0)) return(0);
 
-	if(ctx->sigio) sig_out(b);
+	if(ctx->sigio && !sig_out(b))
+		return 0;
 
 	do{
 		BIO_clear_retry_flags(b);
@@ -332,7 +347,11 @@
 
 		if(ctx->buf_len >= OK_BLOCK_SIZE+ OK_BLOCK_BLOCK)
 			{
-			block_out(b);
+			if (!block_out(b))
+				{
+				BIO_clear_retry_flags(b);
+				return 0;
+				}
 			}
 	}while(inl > 0);
 
@@ -379,7 +398,8 @@
 	case BIO_CTRL_FLUSH:
 		/* do a final write */
 		if(ctx->blockout == 0)
-			block_out(b);
+			if (!block_out(b))
+				return 0;
 
 		while (ctx->blockout)
 			{
@@ -408,7 +428,8 @@
 		break;
 	case BIO_C_SET_MD:
 		md=ptr;
-		EVP_DigestInit_ex(&ctx->md, md, NULL);
+		if (!EVP_DigestInit_ex(&ctx->md, md, NULL))
+			return 0;
 		b->init=1;
 		break;
 	case BIO_C_GET_MD:
@@ -455,7 +476,7 @@
 	}
 }
 
-static void sig_out(BIO* b)
+static int sig_out(BIO* b)
 	{
 	BIO_OK_CTX *ctx;
 	EVP_MD_CTX *md;
@@ -463,9 +484,10 @@
 	ctx=b->ptr;
 	md=&ctx->md;
 
-	if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return;
+	if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return 1;
 
-	EVP_DigestInit_ex(md, md->digest, NULL);
+	if (!EVP_DigestInit_ex(md, md->digest, NULL))
+		goto berr;
 	/* FIXME: there's absolutely no guarantee this makes any sense at all,
 	 * particularly now EVP_MD_CTX has been restructured.
 	 */
@@ -474,14 +496,20 @@
 	longswap(&(ctx->buf[ctx->buf_len]), md->digest->md_size);
 	ctx->buf_len+= md->digest->md_size;
 
-	EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
-	EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+	if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+		goto berr;
+	if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+		goto berr;
 	ctx->buf_len+= md->digest->md_size;
 	ctx->blockout= 1;
 	ctx->sigio= 0;
+	return 1;
+	berr:
+	BIO_clear_retry_flags(b);
+	return 0;
 	}
 
-static void sig_in(BIO* b)
+static int sig_in(BIO* b)
 	{
 	BIO_OK_CTX *ctx;
 	EVP_MD_CTX *md;
@@ -491,15 +519,18 @@
 	ctx=b->ptr;
 	md=&ctx->md;
 
-	if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return;
+	if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return 1;
 
-	EVP_DigestInit_ex(md, md->digest, NULL);
+	if (!EVP_DigestInit_ex(md, md->digest, NULL))
+		goto berr;
 	memcpy(md->md_data, &(ctx->buf[ctx->buf_off]), md->digest->md_size);
 	longswap(md->md_data, md->digest->md_size);
 	ctx->buf_off+= md->digest->md_size;
 
-	EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
-	EVP_DigestFinal_ex(md, tmp, NULL);
+	if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+		goto berr;
+	if (!EVP_DigestFinal_ex(md, tmp, NULL))
+		goto berr;
 	ret= memcmp(&(ctx->buf[ctx->buf_off]), tmp, md->digest->md_size) == 0;
 	ctx->buf_off+= md->digest->md_size;
 	if(ret == 1)
@@ -516,9 +547,13 @@
 		{
 		ctx->cont= 0;
 		}
+	return 1;
+	berr:
+	BIO_clear_retry_flags(b);
+	return 0;
 	}
 
-static void block_out(BIO* b)
+static int block_out(BIO* b)
 	{
 	BIO_OK_CTX *ctx;
 	EVP_MD_CTX *md;
@@ -532,13 +567,20 @@
 	ctx->buf[1]=(unsigned char)(tl>>16);
 	ctx->buf[2]=(unsigned char)(tl>>8);
 	ctx->buf[3]=(unsigned char)(tl);
-	EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
-	EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+	if (!EVP_DigestUpdate(md,
+		(unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+		goto berr;
+	if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+		goto berr;
 	ctx->buf_len+= md->digest->md_size;
 	ctx->blockout= 1;
+	return 1;
+	berr:
+	BIO_clear_retry_flags(b);
+	return 0;
 	}
 
-static void block_in(BIO* b)
+static int block_in(BIO* b)
 	{
 	BIO_OK_CTX *ctx;
 	EVP_MD_CTX *md;
@@ -554,10 +596,13 @@
 	tl|=ctx->buf[2]; tl<<=8;
 	tl|=ctx->buf[3];
 
-	if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return;
+	if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return 1;
  
-	EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
-	EVP_DigestFinal_ex(md, tmp, NULL);
+	if (!EVP_DigestUpdate(md,
+			(unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+		goto berr;
+	if (!EVP_DigestFinal_ex(md, tmp, NULL))
+		goto berr;
 	if(memcmp(&(ctx->buf[tl+ OK_BLOCK_BLOCK]), tmp, md->digest->md_size) == 0)
 		{
 		/* there might be parts from next block lurking around ! */
@@ -571,5 +616,9 @@
 		{
 		ctx->cont= 0;
 		}
+	return 1;
+	berr:
+	BIO_clear_retry_flags(b);
+	return 0;
 	}
 

diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c
index c5f9268..2a45d43 100644
--- a/crypto/evp/c_allc.c
+++ b/crypto/evp/c_allc.c

@@ -98,6 +98,9 @@
 #ifndef OPENSSL_NO_RC4
 	EVP_add_cipher(EVP_rc4());
 	EVP_add_cipher(EVP_rc4_40());
+#ifndef OPENSSL_NO_MD5
+	EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
 #endif
 
 #ifndef OPENSSL_NO_IDEA
@@ -166,9 +169,9 @@
 	EVP_add_cipher(EVP_aes_128_cfb1());
 	EVP_add_cipher(EVP_aes_128_cfb8());
 	EVP_add_cipher(EVP_aes_128_ofb());
-#if 0
 	EVP_add_cipher(EVP_aes_128_ctr());
-#endif
+	EVP_add_cipher(EVP_aes_128_gcm());
+	EVP_add_cipher(EVP_aes_128_xts());
 	EVP_add_cipher_alias(SN_aes_128_cbc,"AES128");
 	EVP_add_cipher_alias(SN_aes_128_cbc,"aes128");
 	EVP_add_cipher(EVP_aes_192_ecb());
@@ -177,9 +180,8 @@
 	EVP_add_cipher(EVP_aes_192_cfb1());
 	EVP_add_cipher(EVP_aes_192_cfb8());
 	EVP_add_cipher(EVP_aes_192_ofb());
-#if 0
 	EVP_add_cipher(EVP_aes_192_ctr());
-#endif
+	EVP_add_cipher(EVP_aes_192_gcm());
 	EVP_add_cipher_alias(SN_aes_192_cbc,"AES192");
 	EVP_add_cipher_alias(SN_aes_192_cbc,"aes192");
 	EVP_add_cipher(EVP_aes_256_ecb());
@@ -188,11 +190,15 @@
 	EVP_add_cipher(EVP_aes_256_cfb1());
 	EVP_add_cipher(EVP_aes_256_cfb8());
 	EVP_add_cipher(EVP_aes_256_ofb());
-#if 0
 	EVP_add_cipher(EVP_aes_256_ctr());
-#endif
+	EVP_add_cipher(EVP_aes_256_gcm());
+	EVP_add_cipher(EVP_aes_256_xts());
 	EVP_add_cipher_alias(SN_aes_256_cbc,"AES256");
 	EVP_add_cipher_alias(SN_aes_256_cbc,"aes256");
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+	EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+	EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
 #endif
 
 #ifndef OPENSSL_NO_CAMELLIA

diff --git a/crypto/evp/digest.c b/crypto/evp/digest.c
index 982ba2b..467e6b5 100644
--- a/crypto/evp/digest.c
+++ b/crypto/evp/digest.c

@@ -117,6 +117,10 @@
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 void EVP_MD_CTX_init(EVP_MD_CTX *ctx)
 	{
 	memset(ctx,'\0',sizeof *ctx);
@@ -225,12 +229,26 @@
 		}
 	if (ctx->flags & EVP_MD_CTX_FLAG_NO_INIT)
 		return 1;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		{
+		if (FIPS_digestinit(ctx, type))
+			return 1;
+		OPENSSL_free(ctx->md_data);
+		ctx->md_data = NULL;
+		return 0;
+		}
+#endif
 	return ctx->digest->init(ctx);
 	}
 
 int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count)
 	{
+#ifdef OPENSSL_FIPS
+	return FIPS_digestupdate(ctx, data, count);
+#else
 	return ctx->update(ctx,data,count);
+#endif
 	}
 
 /* The caller can assume that this removes any secret data from the context */
@@ -245,8 +263,10 @@
 /* The caller can assume that this removes any secret data from the context */
 int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
 	{
+#ifdef OPENSSL_FIPS
+	return FIPS_digestfinal(ctx, md, size);
+#else
 	int ret;
-
 	OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
 	ret=ctx->digest->final(ctx,md);
 	if (size != NULL)
@@ -258,6 +278,7 @@
 		}
 	memset(ctx->md_data,0,ctx->digest->ctx_size);
 	return ret;
+#endif
 	}
 
 int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in)
@@ -351,6 +372,7 @@
 /* This call frees resources associated with the context */
 int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
 	{
+#ifndef OPENSSL_FIPS
 	/* Don't assume ctx->md_data was cleaned in EVP_Digest_Final,
 	 * because sometimes only copies of the context are ever finalised.
 	 */
@@ -363,6 +385,7 @@
 		OPENSSL_cleanse(ctx->md_data,ctx->digest->ctx_size);
 		OPENSSL_free(ctx->md_data);
 		}
+#endif
 	if (ctx->pctx)
 		EVP_PKEY_CTX_free(ctx->pctx);
 #ifndef OPENSSL_NO_ENGINE
@@ -371,6 +394,9 @@
 		 * functional reference we held for this reason. */
 		ENGINE_finish(ctx->engine);
 #endif
+#ifdef OPENSSL_FIPS
+	FIPS_md_ctx_cleanup(ctx);
+#endif
 	memset(ctx,'\0',sizeof *ctx);
 
 	return 1;

diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index bd6c0a3..1e4af0c 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c

@@ -1,5 +1,5 @@
 /* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 2001-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -56,57 +56,511 @@
 #include <assert.h>
 #include <openssl/aes.h>
 #include "evp_locl.h"
-
-static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-					const unsigned char *iv, int enc);
+#ifndef OPENSSL_FIPS
+#include "modes_lcl.h"
+#include <openssl/rand.h>
 
 typedef struct
 	{
 	AES_KEY ks;
+	block128_f block;
+	union {
+		cbc128_f cbc;
+		ctr128_f ctr;
+	} stream;
 	} EVP_AES_KEY;
 
-#define data(ctx)	EVP_C_DATA(EVP_AES_KEY,ctx)
+typedef struct
+	{
+	AES_KEY ks;		/* AES key schedule to use */
+	int key_set;		/* Set if key initialised */
+	int iv_set;		/* Set if an iv is set */
+	GCM128_CONTEXT gcm;
+	unsigned char *iv;	/* Temporary IV store */
+	int ivlen;		/* IV length */
+	int taglen;
+	int iv_gen;		/* It is OK to generate IVs */
+	int tls_aad_len;	/* TLS AAD length */
+	ctr128_f ctr;
+	} EVP_AES_GCM_CTX;
 
-IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
-		       NID_aes_128, 16, 16, 16, 128,
-		       0, aes_init_key, NULL, 
-		       EVP_CIPHER_set_asn1_iv,
-		       EVP_CIPHER_get_asn1_iv,
-		       NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_192, ks, AES, EVP_AES_KEY,
-		       NID_aes_192, 16, 24, 16, 128,
-		       0, aes_init_key, NULL, 
-		       EVP_CIPHER_set_asn1_iv,
-		       EVP_CIPHER_get_asn1_iv,
-		       NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_256, ks, AES, EVP_AES_KEY,
-		       NID_aes_256, 16, 32, 16, 128,
-		       0, aes_init_key, NULL, 
-		       EVP_CIPHER_set_asn1_iv,
-		       EVP_CIPHER_get_asn1_iv,
-		       NULL)
+typedef struct
+	{
+	AES_KEY ks1, ks2;	/* AES key schedules to use */
+	XTS128_CONTEXT xts;
+	void     (*stream)(const unsigned char *in,
+			unsigned char *out, size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+	} EVP_AES_XTS_CTX;
 
-#define IMPLEMENT_AES_CFBR(ksize,cbits)	IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
+typedef struct
+	{
+	AES_KEY ks;		/* AES key schedule to use */
+	int key_set;		/* Set if key initialised */
+	int iv_set;		/* Set if an iv is set */
+	int tag_set;		/* Set if tag is valid */
+	int len_set;		/* Set if message length set */
+	int L, M;		/* L and M parameters from RFC3610 */
+	CCM128_CONTEXT ccm;
+	ccm128_f str;
+	} EVP_AES_CCM_CTX;
 
-IMPLEMENT_AES_CFBR(128,1)
-IMPLEMENT_AES_CFBR(192,1)
-IMPLEMENT_AES_CFBR(256,1)
+#define MAXBITCHUNK	((size_t)1<<(sizeof(size_t)*8-4))
 
-IMPLEMENT_AES_CFBR(128,8)
-IMPLEMENT_AES_CFBR(192,8)
-IMPLEMENT_AES_CFBR(256,8)
+#ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key,
+			unsigned char *ivec, int enc);
+#endif
+#ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+			size_t length, const AES_KEY *key,
+			unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+			size_t len, const AES_KEY *key,
+			const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+			size_t len, const AES_KEY *key1,
+			const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+			size_t len, const AES_KEY *key1,
+			const AES_KEY *key2, const unsigned char iv[16]);
+#endif
+#ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+			size_t blocks, const AES_KEY *key,
+			const unsigned char ivec[AES_BLOCK_SIZE]);
+#endif
+#ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp,char *out,size_t len,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp,char *out,size_t len,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+#endif
+
+#if	defined(AES_ASM) && !defined(I386_ONLY) &&	(  \
+	((defined(__i386)	|| defined(__i386__)	|| \
+	  defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)				)
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+#ifdef VPAES_ASM
+#define VPAES_CAPABLE	(OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#endif
+#ifdef BSAES_ASM
+#define BSAES_CAPABLE	VPAES_CAPABLE
+#endif
+/*
+ * AES-NI section
+ */
+#define	AESNI_CAPABLE	(OPENSSL_ia32cap_P[1]&(1<<(57-32)))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key,
+			int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key,
+			unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+			unsigned char *out,
+			size_t blocks,
+			const void *key,
+			const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks (const unsigned char *in,
+			unsigned char *out,
+			size_t blocks,
+			const void *key,
+			const unsigned char ivec[16],
+			unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks (const unsigned char *in,
+			unsigned char *out,
+			size_t blocks,
+			const void *key,
+			const unsigned char ivec[16],
+			unsigned char cmac[16]);
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+		   const unsigned char *iv, int enc)
+	{
+	int ret, mode;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	mode = ctx->cipher->flags & EVP_CIPH_MODE;
+	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+	    && !enc)
+		{ 
+		ret = aesni_set_decrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+		dat->block	= (block128_f)aesni_decrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)aesni_cbc_encrypt :
+					NULL;
+		}
+	else	{
+		ret = aesni_set_encrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+		dat->block	= (block128_f)aesni_encrypt;
+		if (mode==EVP_CIPH_CBC_MODE)
+			dat->stream.cbc	= (cbc128_f)aesni_cbc_encrypt;
+		else if (mode==EVP_CIPH_CTR_MODE)
+			dat->stream.ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+		else
+			dat->stream.cbc = NULL;
+		}
+
+	if(ret < 0)
+		{
+		EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
+		return 0;
+		}
+
+	return 1;
+	}
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	aesni_cbc_encrypt(in,out,len,ctx->cipher_data,ctx->iv,ctx->encrypt);
+
+	return 1;
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	size_t	bl = ctx->cipher->block_size;
+
+	if (len<bl)	return 1;
+
+	aesni_ecb_encrypt(in,out,len,ctx->cipher_data,ctx->encrypt);
+
+	return 1;
+}
+
+#define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key)
+		{
+		aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+				(block128_f)aesni_encrypt);
+		gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+		/* If we have an iv can set it directly, otherwise use
+		 * saved IV.
+		 */
+		if (iv == NULL && gctx->iv_set)
+			iv = gctx->iv;
+		if (iv)
+			{
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+			gctx->iv_set = 1;
+			}
+		gctx->key_set = 1;
+		}
+	else
+		{
+		/* If key set use IV, otherwise copy */
+		if (gctx->key_set)
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+		else
+			memcpy(gctx->iv, iv, gctx->ivlen);
+		gctx->iv_set = 1;
+		gctx->iv_gen = 0;
+		}
+	return 1;
+	}
+
+#define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+
+	if (key)
+		{
+		/* key_len is two AES keys */
+		if (enc)
+			{
+			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)aesni_encrypt;
+			xctx->stream = aesni_xts_encrypt;
+			}
+		else
+			{
+			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)aesni_decrypt;
+			xctx->stream = aesni_xts_decrypt;
+			}
+
+		aesni_set_encrypt_key(key + ctx->key_len/2,
+						ctx->key_len * 4, &xctx->ks2);
+		xctx->xts.block2 = (block128_f)aesni_encrypt;
+
+		xctx->xts.key1 = &xctx->ks1;
+		}
+
+	if (iv)
+		{
+		xctx->xts.key2 = &xctx->ks2;
+		memcpy(ctx->iv, iv, 16);
+		}
+
+	return 1;
+	}
+
+#define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key)
+		{
+		aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)aesni_encrypt);
+		cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
+				(ccm128_f)aesni_ccm64_decrypt_blocks;
+		cctx->key_set = 1;
+		}
+	if (iv)
+		{
+		memcpy(ctx->iv, iv, 15 - cctx->L);
+		cctx->iv_set = 1;
+		}
+	return 1;
+	}
+
+#define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+	nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aesni_init_key,			\
+	aesni_##mode##_cipher,		\
+	NULL,				\
+	sizeof(EVP_AES_KEY),		\
+	NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##nmode,blocksize,	\
+	keylen/8,ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_init_key,			\
+	aes_##mode##_cipher,		\
+	NULL,				\
+	sizeof(EVP_AES_KEY),		\
+	NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+	nid##_##keylen##_##mode,blocksize, \
+	(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aesni_##mode##_init_key,	\
+	aesni_##mode##_cipher,		\
+	aes_##mode##_cleanup,		\
+	sizeof(EVP_AES_##MODE##_CTX),	\
+	NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##mode,blocksize, \
+	(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_##mode##_init_key,		\
+	aes_##mode##_cipher,		\
+	aes_##mode##_cleanup,		\
+	sizeof(EVP_AES_##MODE##_CTX),	\
+	NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#else
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_init_key,			\
+	aes_##mode##_cipher,		\
+	NULL,				\
+	sizeof(EVP_AES_KEY),		\
+	NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##mode,blocksize, \
+	(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_##mode##_init_key,		\
+	aes_##mode##_cipher,		\
+	aes_##mode##_cleanup,		\
+	sizeof(EVP_AES_##MODE##_CTX),	\
+	NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+#endif
+
+#define BLOCK_CIPHER_generic_pack(nid,keylen,flags)		\
+	BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
 
 static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		   const unsigned char *iv, int enc)
 	{
-	int ret;
+	int ret, mode;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
 
-	if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
-	    || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
-	    || enc) 
-		ret=AES_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+	mode = ctx->cipher->flags & EVP_CIPH_MODE;
+	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+	    && !enc)
+#ifdef BSAES_CAPABLE
+	    if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+		{
+		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_decrypt;
+		dat->stream.cbc	= (cbc128_f)bsaes_cbc_encrypt;
+		}
+	    else
+#endif
+#ifdef VPAES_CAPABLE
+	    if (VPAES_CAPABLE)
+		{
+		ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)vpaes_decrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)vpaes_cbc_encrypt :
+					NULL;
+		}
+	    else
+#endif
+		{
+		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_decrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)AES_cbc_encrypt :
+					NULL;
+		}
 	else
-		ret=AES_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+#ifdef BSAES_CAPABLE
+	    if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
+		{
+		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_encrypt;
+		dat->stream.ctr	= (ctr128_f)bsaes_ctr32_encrypt_blocks;
+		}
+	    else
+#endif
+#ifdef VPAES_CAPABLE
+	    if (VPAES_CAPABLE)
+		{
+		ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)vpaes_encrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)vpaes_cbc_encrypt :
+					NULL;
+		}
+	    else
+#endif
+		{
+		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_encrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)AES_cbc_encrypt :
+					NULL;
+#ifdef AES_CTR_ASM
+		if (mode==EVP_CIPH_CTR_MODE)
+			dat->stream.ctr = (ctr128_f)AES_ctr32_encrypt;
+#endif
+		}
 
 	if(ret < 0)
 		{
@@ -117,4 +571,743 @@
 	return 1;
 	}
 
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (dat->stream.cbc)
+		(*dat->stream.cbc)(in,out,len,&dat->ks,ctx->iv,ctx->encrypt);
+	else if (ctx->encrypt)
+		CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+	else
+		CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+
+	return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	size_t	bl = ctx->cipher->block_size;
+	size_t	i;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (len<bl)	return 1;
+
+	for (i=0,len-=bl;i<=len;i+=bl)
+		(*dat->block)(in+i,out+i,&dat->ks);
+
+	return 1;
+}
+
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	CRYPTO_ofb128_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,dat->block);
+	return 1;
+}
+
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	CRYPTO_cfb128_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+	return 1;
+}
+
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	CRYPTO_cfb128_8_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+	return 1;
+}
+
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (ctx->flags&EVP_CIPH_FLAG_LENGTH_BITS) {
+		CRYPTO_cfb128_1_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+		return 1;
+	}
+
+	while (len>=MAXBITCHUNK) {
+		CRYPTO_cfb128_1_encrypt(in,out,MAXBITCHUNK*8,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+		len-=MAXBITCHUNK;
+	}
+	if (len)
+		CRYPTO_cfb128_1_encrypt(in,out,len*8,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+	
+	return 1;
+}
+
+static int aes_ctr_cipher (EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+{
+	unsigned int num = ctx->num;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (dat->stream.ctr)
+		CRYPTO_ctr128_encrypt_ctr32(in,out,len,&dat->ks,
+			ctx->iv,ctx->buf,&num,dat->stream.ctr);
+	else
+		CRYPTO_ctr128_encrypt(in,out,len,&dat->ks,
+			ctx->iv,ctx->buf,&num,dat->block);
+	ctx->num = (size_t)num;
+	return 1;
+}
+
+BLOCK_CIPHER_generic_pack(NID_aes,128,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,192,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,256,EVP_CIPH_FLAG_FIPS)
+
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+	{
+	EVP_AES_GCM_CTX *gctx = c->cipher_data;
+	OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
+	if (gctx->iv != c->iv)
+		OPENSSL_free(gctx->iv);
+	return 1;
+	}
+
+/* increment counter (64-bit int) by 1 */
+static void ctr64_inc(unsigned char *counter) {
+	int n=8;
+	unsigned char  c;
+
+	do {
+		--n;
+		c = counter[n];
+		++c;
+		counter[n] = c;
+		if (c) return;
+	} while (n);
+}
+
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+	{
+	EVP_AES_GCM_CTX *gctx = c->cipher_data;
+	switch (type)
+		{
+	case EVP_CTRL_INIT:
+		gctx->key_set = 0;
+		gctx->iv_set = 0;
+		gctx->ivlen = c->cipher->iv_len;
+		gctx->iv = c->iv;
+		gctx->taglen = -1;
+		gctx->iv_gen = 0;
+		gctx->tls_aad_len = -1;
+		return 1;
+
+	case EVP_CTRL_GCM_SET_IVLEN:
+		if (arg <= 0)
+			return 0;
+#ifdef OPENSSL_FIPS
+		if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)
+						 && arg < 12)
+			return 0;
+#endif
+		/* Allocate memory for IV if needed */
+		if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen))
+			{
+			if (gctx->iv != c->iv)
+				OPENSSL_free(gctx->iv);
+			gctx->iv = OPENSSL_malloc(arg);
+			if (!gctx->iv)
+				return 0;
+			}
+		gctx->ivlen = arg;
+		return 1;
+
+	case EVP_CTRL_GCM_SET_TAG:
+		if (arg <= 0 || arg > 16 || c->encrypt)
+			return 0;
+		memcpy(c->buf, ptr, arg);
+		gctx->taglen = arg;
+		return 1;
+
+	case EVP_CTRL_GCM_GET_TAG:
+		if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+			return 0;
+		memcpy(ptr, c->buf, arg);
+		return 1;
+
+	case EVP_CTRL_GCM_SET_IV_FIXED:
+		/* Special case: -1 length restores whole IV */
+		if (arg == -1)
+			{
+			memcpy(gctx->iv, ptr, gctx->ivlen);
+			gctx->iv_gen = 1;
+			return 1;
+			}
+		/* Fixed field must be at least 4 bytes and invocation field
+		 * at least 8.
+		 */
+		if ((arg < 4) || (gctx->ivlen - arg) < 8)
+			return 0;
+		if (arg)
+			memcpy(gctx->iv, ptr, arg);
+		if (c->encrypt &&
+			RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+			return 0;
+		gctx->iv_gen = 1;
+		return 1;
+
+	case EVP_CTRL_GCM_IV_GEN:
+		if (gctx->iv_gen == 0 || gctx->key_set == 0)
+			return 0;
+		CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+		if (arg <= 0 || arg > gctx->ivlen)
+			arg = gctx->ivlen;
+		memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+		/* Invocation field will be at least 8 bytes in size and
+		 * so no need to check wrap around or increment more than
+		 * last 8 bytes.
+		 */
+		ctr64_inc(gctx->iv + gctx->ivlen - 8);
+		gctx->iv_set = 1;
+		return 1;
+
+	case EVP_CTRL_GCM_SET_IV_INV:
+		if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+			return 0;
+		memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+		CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+		gctx->iv_set = 1;
+		return 1;
+
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		/* Save the AAD for later use */
+		if (arg != 13)
+			return 0;
+		memcpy(c->buf, ptr, arg);
+		gctx->tls_aad_len = arg;
+			{
+			unsigned int len=c->buf[arg-2]<<8|c->buf[arg-1];
+			/* Correct length for explicit IV */
+			len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			/* If decrypting correct for tag too */
+			if (!c->encrypt)
+				len -= EVP_GCM_TLS_TAG_LEN;
+                        c->buf[arg-2] = len>>8;
+                        c->buf[arg-1] = len & 0xff;
+			}
+		/* Extra padding: tag appended to record */
+		return EVP_GCM_TLS_TAG_LEN;
+
+	default:
+		return -1;
+
+		}
+	}
+
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key)
+		{ do {
+#ifdef BSAES_CAPABLE
+		if (BSAES_CAPABLE)
+			{
+			AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+					(block128_f)AES_encrypt);
+			gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+			break;
+			}
+		else
+#endif
+#ifdef VPAES_CAPABLE
+		if (VPAES_CAPABLE)
+			{
+			vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+					(block128_f)vpaes_encrypt);
+			gctx->ctr = NULL;
+			break;
+			}
+#endif
+		AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+#ifdef AES_CTR_ASM
+		gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
+#else
+		gctx->ctr = NULL;
+#endif
+		} while (0);
+
+		/* If we have an iv can set it directly, otherwise use
+		 * saved IV.
+		 */
+		if (iv == NULL && gctx->iv_set)
+			iv = gctx->iv;
+		if (iv)
+			{
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+			gctx->iv_set = 1;
+			}
+		gctx->key_set = 1;
+		}
+	else
+		{
+		/* If key set use IV, otherwise copy */
+		if (gctx->key_set)
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+		else
+			memcpy(gctx->iv, iv, gctx->ivlen);
+		gctx->iv_set = 1;
+		gctx->iv_gen = 0;
+		}
+	return 1;
+	}
+
+/* Handle TLS GCM packet format. This consists of the last portion of the IV
+ * followed by the payload and finally the tag. On encrypt generate IV,
+ * encrypt payload and write the tag. On verify retrieve IV, decrypt payload
+ * and verify tag.
+ */
+
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	int rv = -1;
+	/* Encrypt/decrypt must be performed in place */
+	if (out != in || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN+EVP_GCM_TLS_TAG_LEN))
+		return -1;
+	/* Set IV from start of buffer or generate IV and write to start
+	 * of buffer.
+	 */
+	if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+				EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+				EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+		goto err;
+	/* Use saved AAD */
+	if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+		goto err;
+	/* Fix buffer and length to point to payload */
+	in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+	out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+	len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+	if (ctx->encrypt)
+		{
+		/* Encrypt payload */
+		if (gctx->ctr)
+			{
+			if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+				goto err;
+			}
+		else	{
+			if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+				goto err;
+			}
+		out += len;
+		/* Finally write tag */
+		CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+		rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+		}
+	else
+		{
+		/* Decrypt */
+		if (gctx->ctr)
+			{
+			if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+				goto err;
+			}
+		else	{
+			if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+				goto err;
+			}
+		/* Retrieve tag */
+		CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf,
+					EVP_GCM_TLS_TAG_LEN);
+		/* If tag mismatch wipe buffer */
+		if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN))
+			{
+			OPENSSL_cleanse(out, len);
+			goto err;
+			}
+		rv = len;
+		}
+
+	err:
+	gctx->iv_set = 0;
+	gctx->tls_aad_len = -1;
+	return rv;
+	}
+
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	/* If not set up, return error */
+	if (!gctx->key_set)
+		return -1;
+
+	if (gctx->tls_aad_len >= 0)
+		return aes_gcm_tls_cipher(ctx, out, in, len);
+
+	if (!gctx->iv_set)
+		return -1;
+	if (!ctx->encrypt && gctx->taglen < 0)
+		return -1;
+	if (in)
+		{
+		if (out == NULL)
+			{
+			if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+				return -1;
+			}
+		else if (ctx->encrypt)
+			{
+			if (gctx->ctr)
+				{
+				if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+					return -1;
+				}
+			else	{
+				if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+					return -1;
+				}
+			}
+		else
+			{
+			if (gctx->ctr)
+				{
+				if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+					return -1;
+				}
+			else	{
+				if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+					return -1;
+				}
+			}
+		return len;
+		}
+	else
+		{
+		if (!ctx->encrypt)
+			{
+			if (CRYPTO_gcm128_finish(&gctx->gcm,
+					ctx->buf, gctx->taglen) != 0)
+				return -1;
+			gctx->iv_set = 0;
+			return 0;
+			}
+		CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
+		gctx->taglen = 16;
+		/* Don't reuse the IV */
+		gctx->iv_set = 0;
+		return 0;
+		}
+
+	}
+
+#define CUSTOM_FLAGS	(EVP_CIPH_FLAG_DEFAULT_ASN1 \
+		| EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+		| EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM,
+		EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,gcm,GCM,
+		EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM,
+		EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+	{
+	EVP_AES_XTS_CTX *xctx = c->cipher_data;
+	if (type != EVP_CTRL_INIT)
+		return -1;
+	/* key1 and key2 are used as an indicator both key and IV are set */
+	xctx->xts.key1 = NULL;
+	xctx->xts.key2 = NULL;
+	return 1;
+	}
+
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+
+	if (key) do
+		{
+#ifdef AES_XTS_ASM
+		xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+#else
+		xctx->stream = NULL;
+#endif
+		/* key_len is two AES keys */
+#ifdef BSAES_CAPABLE
+		if (BSAES_CAPABLE)
+			xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+		else
+#endif
+#ifdef VPAES_CAPABLE
+		if (VPAES_CAPABLE)
+		    {
+		    if (enc)
+			{
+			vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)vpaes_encrypt;
+			}
+		    else
+			{
+			vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)vpaes_decrypt;
+			}
+
+		vpaes_set_encrypt_key(key + ctx->key_len/2,
+						ctx->key_len * 4, &xctx->ks2);
+		xctx->xts.block2 = (block128_f)vpaes_encrypt;
+
+		xctx->xts.key1 = &xctx->ks1;
+		break;
+		}
+#endif
+		if (enc)
+			{
+			AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)AES_encrypt;
+			}
+		else
+			{
+			AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)AES_decrypt;
+			}
+
+		AES_set_encrypt_key(key + ctx->key_len/2,
+						ctx->key_len * 4, &xctx->ks2);
+		xctx->xts.block2 = (block128_f)AES_encrypt;
+
+		xctx->xts.key1 = &xctx->ks1;
+		} while (0);
+
+	if (iv)
+		{
+		xctx->xts.key2 = &xctx->ks2;
+		memcpy(ctx->iv, iv, 16);
+		}
+
+	return 1;
+	}
+
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+	if (!xctx->xts.key1 || !xctx->xts.key2)
+		return 0;
+	if (!out || !in || len<AES_BLOCK_SIZE)
+		return 0;
+#ifdef OPENSSL_FIPS
+	/* Requirement of SP800-38E */
+	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
+			(len > (1UL<<20)*16))
+		{
+		EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
+		return 0;
+		}
+#endif
+	if (xctx->stream)
+		(*xctx->stream)(in, out, len,
+				xctx->xts.key1, xctx->xts.key2, ctx->iv);
+	else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+								ctx->encrypt))
+		return 0;
+	return 1;
+	}
+
+#define aes_xts_cleanup NULL
+
+#define XTS_FLAGS	(EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+			 | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+	{
+	EVP_AES_CCM_CTX *cctx = c->cipher_data;
+	switch (type)
+		{
+	case EVP_CTRL_INIT:
+		cctx->key_set = 0;
+		cctx->iv_set = 0;
+		cctx->L = 8;
+		cctx->M = 12;
+		cctx->tag_set = 0;
+		cctx->len_set = 0;
+		return 1;
+
+	case EVP_CTRL_CCM_SET_IVLEN:
+		arg = 15 - arg;
+	case EVP_CTRL_CCM_SET_L:
+		if (arg < 2 || arg > 8)
+			return 0;
+		cctx->L = arg;
+		return 1;
+
+	case EVP_CTRL_CCM_SET_TAG:
+		if ((arg & 1) || arg < 4 || arg > 16)
+			return 0;
+		if ((c->encrypt && ptr) || (!c->encrypt && !ptr))
+			return 0;
+		if (ptr)
+			{
+			cctx->tag_set = 1;
+			memcpy(c->buf, ptr, arg);
+			}
+		cctx->M = arg;
+		return 1;
+
+	case EVP_CTRL_CCM_GET_TAG:
+		if (!c->encrypt || !cctx->tag_set)
+			return 0;
+		if(!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+			return 0;
+		cctx->tag_set = 0;
+		cctx->iv_set = 0;
+		cctx->len_set = 0;
+		return 1;
+
+	default:
+		return -1;
+
+		}
+	}
+
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key) do
+		{
+#ifdef VPAES_CAPABLE
+		if (VPAES_CAPABLE)
+			{
+			vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
+			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)vpaes_encrypt);
+			cctx->key_set = 1;
+			break;
+			}
+#endif
+		AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)AES_encrypt);
+		cctx->str = NULL;
+		cctx->key_set = 1;
+		} while (0);
+	if (iv)
+		{
+		memcpy(ctx->iv, iv, 15 - cctx->L);
+		cctx->iv_set = 1;
+		}
+	return 1;
+	}
+
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+	CCM128_CONTEXT *ccm = &cctx->ccm;
+	/* If not set up, return error */
+	if (!cctx->iv_set && !cctx->key_set)
+		return -1;
+	if (!ctx->encrypt && !cctx->tag_set)
+		return -1;
+	if (!out)
+		{
+		if (!in)
+			{
+			if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L,len))
+				return -1;
+			cctx->len_set = 1;
+			return len;
+			}
+		/* If have AAD need message length */
+		if (!cctx->len_set && len)
+			return -1;
+		CRYPTO_ccm128_aad(ccm, in, len);
+		return len;
+		}
+	/* EVP_*Final() doesn't return any data */
+	if (!in)
+		return 0;
+	/* If not set length yet do it */
+	if (!cctx->len_set)
+		{
+		if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+			return -1;
+		cctx->len_set = 1;
+		}
+	if (ctx->encrypt)
+		{
+		if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+						cctx->str) :
+				CRYPTO_ccm128_encrypt(ccm, in, out, len))
+			return -1;
+		cctx->tag_set = 1;
+		return len;
+		}
+	else
+		{
+		int rv = -1;
+		if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+						cctx->str) :
+				!CRYPTO_ccm128_decrypt(ccm, in, out, len))
+			{
+			unsigned char tag[16];
+			if (CRYPTO_ccm128_tag(ccm, tag, cctx->M))
+				{
+				if (!memcmp(tag, ctx->buf, cctx->M))
+					rv = len;
+				}
+			}
+		if (rv == -1)
+			OPENSSL_cleanse(out, len);
+		cctx->iv_set = 0;
+		cctx->tag_set = 0;
+		cctx->len_set = 0;
+		return rv;
+		}
+
+	}
+
+#define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+
+#endif
 #endif

diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644
index 0000000..278c6ca
--- /dev/null
+++ b/crypto/evp/e_aes_cbc_hmac_sha1.c

@@ -0,0 +1,404 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+#include "evp_locl.h"
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD		0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+typedef struct
+    {
+    AES_KEY		ks;
+    SHA_CTX		head,tail,md;
+    size_t		payload_length;	/* AAD length in decrypt case */
+    union {
+	unsigned int	tls_ver;
+    	unsigned char	tls_aad[16];	/* 13 used */
+    } aux;
+    } EVP_AES_HMAC_SHA1;
+
+#if	defined(AES_ASM) &&	( \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)	)
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE   (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+			      AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+			      AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+			   unsigned char *out,
+			   size_t length,
+			   const AES_KEY *key,
+			   unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+		const AES_KEY *key, unsigned char iv[16],
+		SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+			const unsigned char *inkey,
+			const unsigned char *iv, int enc)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+	int ret;
+
+	if (enc)
+		ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks);
+	else
+		ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+	SHA1_Init(&key->head);	/* handy when benchmarking */
+	key->tail = key->head;
+	key->md   = key->head;
+
+	key->payload_length = 0;
+
+	return ret<0?0:1;
+	}
+
+#define	STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define	aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{	const unsigned char *ptr = data;
+	size_t res;
+
+	if ((res = c->num)) {
+		res = SHA_CBLOCK-res;
+		if (len<res) res=len;
+		SHA1_Update (c,ptr,res);
+		ptr += res;
+		len -= res;
+	}
+
+	res = len % SHA_CBLOCK;
+	len -= res;
+
+	if (len) {
+		sha1_block_data_order(c,ptr,len/SHA_CBLOCK);
+
+		ptr += len;
+		c->Nh += len>>29;
+		c->Nl += len<<=3;
+		if (c->Nl<(unsigned int)len) c->Nh++;
+	}
+
+	if (res)
+		SHA1_Update(c,ptr,res);
+}
+
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		      const unsigned char *in, size_t len)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+	unsigned int l;
+	size_t	plen = key->payload_length,
+		iv = 0,		/* explicit IV in TLS 1.1 and later */
+		sha_off = 0;
+#if defined(STITCHED_CALL)
+	size_t	aes_off = 0,
+		blocks;
+
+	sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+	if (len%AES_BLOCK_SIZE) return 0;
+
+	if (ctx->encrypt) {
+		if (plen==0)
+			plen = len;
+		else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+			return 0;
+		else if (key->aux.tls_ver >= TLS1_1_VERSION)
+			iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+		if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+			SHA1_Update(&key->md,in+iv,sha_off);
+
+			aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+				ctx->iv,&key->md,in+iv+sha_off);
+			blocks *= SHA_CBLOCK;
+			aes_off += blocks;
+			sha_off += blocks;
+			key->md.Nh += blocks>>29;
+			key->md.Nl += blocks<<=3;
+			if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+		} else {
+			sha_off = 0;
+		}
+#endif
+		sha_off += iv;
+		SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+		if (plen!=len)	{	/* "TLS" mode of operation */
+			if (in!=out)
+				memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+			/* calculate HMAC and append it to payload */
+			SHA1_Final(out+plen,&key->md);
+			key->md = key->tail;
+			SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+			SHA1_Final(out+plen,&key->md);
+
+			/* pad the payload|hmac */
+			plen += SHA_DIGEST_LENGTH;
+			for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+			/* encrypt HMAC|padding at once */
+			aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+		} else {
+			aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+		}
+	} else {
+		unsigned char mac[SHA_DIGEST_LENGTH];
+
+		/* decrypt HMAC|padding at once */
+		aesni_cbc_encrypt(in,out,len,
+				&key->ks,ctx->iv,0);
+
+		if (plen) {	/* "TLS" mode of operation */
+			/* figure out payload length */
+			if (len<(size_t)(out[len-1]+1+SHA_DIGEST_LENGTH))
+				return 0;
+
+			len -= (out[len-1]+1+SHA_DIGEST_LENGTH);
+
+			if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+			    >= TLS1_1_VERSION) {
+				len -= AES_BLOCK_SIZE;
+				iv = AES_BLOCK_SIZE;
+			}
+
+			key->aux.tls_aad[plen-2] = len>>8;
+			key->aux.tls_aad[plen-1] = len;
+
+			/* calculate HMAC and verify it */
+			key->md = key->head;
+			SHA1_Update(&key->md,key->aux.tls_aad,plen);
+			SHA1_Update(&key->md,out+iv,len);
+			SHA1_Final(mac,&key->md);
+
+			key->md = key->tail;
+			SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH);
+			SHA1_Final(mac,&key->md);
+
+			if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH))
+				return 0;
+		} else {
+			SHA1_Update(&key->md,out,len);
+		}
+	}
+
+	key->payload_length = 0;
+
+	return 1;
+	}
+
+static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+
+	switch (type)
+		{
+	case EVP_CTRL_AEAD_SET_MAC_KEY:
+		{
+		unsigned int  i;
+		unsigned char hmac_key[64];
+
+		memset (hmac_key,0,sizeof(hmac_key));
+
+		if (arg > (int)sizeof(hmac_key)) {
+			SHA1_Init(&key->head);
+			SHA1_Update(&key->head,ptr,arg);
+			SHA1_Final(hmac_key,&key->head);
+		} else {
+			memcpy(hmac_key,ptr,arg);
+		}
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36;		/* ipad */
+		SHA1_Init(&key->head);
+		SHA1_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36^0x5c;	/* opad */
+		SHA1_Init(&key->tail);
+		SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+		return 1;
+		}
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		{
+		unsigned char *p=ptr;
+		unsigned int   len=p[arg-2]<<8|p[arg-1];
+
+		if (ctx->encrypt)
+			{
+			key->payload_length = len;
+			if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) {
+				len -= AES_BLOCK_SIZE;
+				p[arg-2] = len>>8;
+				p[arg-1] = len;
+			}
+			key->md = key->head;
+			SHA1_Update(&key->md,p,arg);
+
+			return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)
+				- len);
+			}
+		else
+			{
+			if (arg>13) arg = 13;
+			memcpy(key->aux.tls_aad,ptr,arg);
+			key->payload_length = arg;
+
+			return SHA_DIGEST_LENGTH;
+			}
+		}
+	default:
+		return -1;
+		}
+	}
+
+static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher =
+	{
+#ifdef NID_aes_128_cbc_hmac_sha1
+	NID_aes_128_cbc_hmac_sha1,
+#else
+	NID_undef,
+#endif
+	16,16,16,
+	EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+	aesni_cbc_hmac_sha1_init_key,
+	aesni_cbc_hmac_sha1_cipher,
+	NULL,
+	sizeof(EVP_AES_HMAC_SHA1),
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+	aesni_cbc_hmac_sha1_ctrl,
+	NULL
+	};
+
+static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher =
+	{
+#ifdef NID_aes_256_cbc_hmac_sha1
+	NID_aes_256_cbc_hmac_sha1,
+#else
+	NID_undef,
+#endif
+	16,32,16,
+	EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+	aesni_cbc_hmac_sha1_init_key,
+	aesni_cbc_hmac_sha1_cipher,
+	NULL,
+	sizeof(EVP_AES_HMAC_SHA1),
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+	aesni_cbc_hmac_sha1_ctrl,
+	NULL
+	};
+
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+	{
+	return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+		&aesni_128_cbc_hmac_sha1_cipher:NULL);
+	}
+
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+	{
+	return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+		&aesni_256_cbc_hmac_sha1_cipher:NULL);
+	}
+#else
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+	{
+	return NULL;
+	}
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+	{
+	return NULL;
+	}
+#endif
+#endif

diff --git a/crypto/evp/e_des3.c b/crypto/evp/e_des3.c
index 3232cfe..1e69972 100644
--- a/crypto/evp/e_des3.c
+++ b/crypto/evp/e_des3.c

@@ -65,6 +65,8 @@
 #include <openssl/des.h>
 #include <openssl/rand.h>
 
+#ifndef OPENSSL_FIPS
+
 static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 			    const unsigned char *iv,int enc);
 
@@ -311,3 +313,4 @@
 	return &des_ede3_ecb;
 }
 #endif
+#endif

diff --git a/crypto/evp/e_null.c b/crypto/evp/e_null.c
index 7cf50e1..f0c1f78 100644
--- a/crypto/evp/e_null.c
+++ b/crypto/evp/e_null.c

@@ -61,6 +61,8 @@
 #include <openssl/evp.h>
 #include <openssl/objects.h>
 
+#ifndef OPENSSL_FIPS
+
 static int null_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	const unsigned char *iv,int enc);
 static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
@@ -99,4 +101,4 @@
 		memcpy((char *)out,(const char *)in,inl);
 	return 1;
 	}
-
+#endif

diff --git a/crypto/evp/e_rc2.c b/crypto/evp/e_rc2.c
index f78d781..d4c33b5 100644
--- a/crypto/evp/e_rc2.c
+++ b/crypto/evp/e_rc2.c

@@ -183,7 +183,8 @@
 		key_bits =rc2_magic_to_meth((int)num);
 		if (!key_bits)
 			return(-1);
-		if(i > 0) EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1);
+		if(i > 0 && !EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1))
+			return -1;
 		EVP_CIPHER_CTX_ctrl(c, EVP_CTRL_SET_RC2_KEY_BITS, key_bits, NULL);
 		EVP_CIPHER_CTX_set_key_length(c, key_bits / 8);
 		}

diff --git a/crypto/evp/e_rc4.c b/crypto/evp/e_rc4.c
index 8b5175e..b4f6bda 100644
--- a/crypto/evp/e_rc4.c
+++ b/crypto/evp/e_rc4.c

@@ -62,6 +62,7 @@
 #ifndef OPENSSL_NO_RC4
 
 #include <openssl/evp.h>
+#include "evp_locl.h"
 #include <openssl/objects.h>
 #include <openssl/rc4.h>
 

diff --git a/crypto/evp/e_rc4_hmac_md5.c b/crypto/evp/e_rc4_hmac_md5.c
new file mode 100644
index 0000000..eaa7a53
--- /dev/null
+++ b/crypto/evp/e_rc4_hmac_md5.c

@@ -0,0 +1,293 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/rc4.h>
+#include <openssl/md5.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD		0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+#endif
+
+/* FIXME: surely this is available elsewhere? */
+#define EVP_RC4_KEY_SIZE		16
+
+typedef struct
+    {
+    RC4_KEY		ks;
+    MD5_CTX		head,tail,md;
+    size_t		payload_length;
+    } EVP_RC4_HMAC_MD5;
+
+void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out,
+		MD5_CTX *ctx,const void *inp,size_t blocks);
+
+#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data)
+
+static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx,
+			const unsigned char *inkey,
+			const unsigned char *iv, int enc)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+	RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx),
+		    inkey);
+
+	MD5_Init(&key->head);	/* handy when benchmarking */
+	key->tail = key->head;
+	key->md   = key->head;
+
+	key->payload_length = 0;
+
+	return 1;
+	}
+
+#if	!defined(OPENSSL_NO_ASM) &&	( \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)		) && \
+	!(defined(__APPLE__) && defined(__MACH__))
+#define	STITCHED_CALL
+#endif
+
+#if !defined(STITCHED_CALL)
+#define	rc4_off 0
+#define	md5_off 0
+#endif
+
+static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		      const unsigned char *in, size_t len)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+#if defined(STITCHED_CALL)
+	size_t	rc4_off = 32-1-(key->ks.x&(32-1)),	/* 32 is $MOD from rc4_md5-x86_64.pl */
+		md5_off = MD5_CBLOCK-key->md.num,
+		blocks;
+	unsigned int l;
+#endif
+	size_t	plen = key->payload_length;
+
+	if (plen && len!=(plen+MD5_DIGEST_LENGTH)) return 0;
+
+	if (ctx->encrypt) {
+		if (plen==0) plen = len;
+#if defined(STITCHED_CALL)
+		/* cipher has to "fall behind" */
+		if (rc4_off>md5_off) md5_off+=MD5_CBLOCK;
+
+		if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK)) {
+			MD5_Update(&key->md,in,md5_off);
+			RC4(&key->ks,rc4_off,in,out);
+
+			rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+				&key->md,in+md5_off,blocks);
+			blocks *= MD5_CBLOCK;
+			rc4_off += blocks;
+			md5_off += blocks;
+			key->md.Nh += blocks>>29;
+			key->md.Nl += blocks<<=3;
+			if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+		} else {
+			rc4_off = 0;
+			md5_off = 0;
+		}
+#endif
+		MD5_Update(&key->md,in+md5_off,plen-md5_off);
+
+		if (plen!=len) {	/* "TLS" mode of operation */
+			if (in!=out)
+				memcpy(out+rc4_off,in+rc4_off,plen-rc4_off);
+
+			/* calculate HMAC and append it to payload */
+			MD5_Final(out+plen,&key->md);
+			key->md = key->tail;
+			MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH);
+			MD5_Final(out+plen,&key->md);
+			/* encrypt HMAC at once */
+			RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off);
+		} else {
+			RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+		}
+	} else {
+		unsigned char mac[MD5_DIGEST_LENGTH];
+#if defined(STITCHED_CALL)
+		/* digest has to "fall behind" */
+		if (md5_off>rc4_off)	rc4_off += 2*MD5_CBLOCK;
+		else			rc4_off += MD5_CBLOCK;
+
+		if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK)) {
+			RC4(&key->ks,rc4_off,in,out);
+			MD5_Update(&key->md,out,md5_off);
+
+			rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+				&key->md,out+md5_off,blocks);
+			blocks *= MD5_CBLOCK;
+			rc4_off += blocks;
+			md5_off += blocks;
+			l = (key->md.Nl+(blocks<<3))&0xffffffffU;
+			if (l<key->md.Nl) key->md.Nh++;
+			key->md.Nl  = l;
+			key->md.Nh += blocks>>29;
+		} else {
+			md5_off=0;
+			rc4_off=0;
+		}
+#endif
+		/* decrypt HMAC at once */
+		RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+		if (plen) {	/* "TLS" mode of operation */
+			MD5_Update(&key->md,out+md5_off,plen-md5_off);
+
+			/* calculate HMAC and verify it */
+			MD5_Final(mac,&key->md);
+			key->md = key->tail;
+			MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH);
+			MD5_Final(mac,&key->md);
+
+			if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH))
+				return 0;
+		} else {
+			MD5_Update(&key->md,out+md5_off,len-md5_off);
+		}
+	}
+
+	key->payload_length = 0;
+
+	return 1;
+	}
+
+static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+	switch (type)
+		{
+	case EVP_CTRL_AEAD_SET_MAC_KEY:
+		{
+		unsigned int  i;
+		unsigned char hmac_key[64];
+
+		memset (hmac_key,0,sizeof(hmac_key));
+
+		if (arg > (int)sizeof(hmac_key)) {
+			MD5_Init(&key->head);
+			MD5_Update(&key->head,ptr,arg);
+			MD5_Final(hmac_key,&key->head);
+		} else {
+			memcpy(hmac_key,ptr,arg);
+		}
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36;		/* ipad */
+		MD5_Init(&key->head);
+		MD5_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36^0x5c;	/* opad */
+		MD5_Init(&key->tail);
+		MD5_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+		return 1;
+		}
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		{
+		unsigned char *p=ptr;
+		unsigned int   len=p[arg-2]<<8|p[arg-1];
+
+		if (!ctx->encrypt)
+			{
+			len -= MD5_DIGEST_LENGTH;
+			p[arg-2] = len>>8;
+			p[arg-1] = len;
+			}
+		key->payload_length=len;
+		key->md = key->head;
+		MD5_Update(&key->md,p,arg);
+
+		return MD5_DIGEST_LENGTH;
+		}
+	default:
+		return -1;
+		}
+	}
+
+static EVP_CIPHER r4_hmac_md5_cipher=
+	{
+#ifdef NID_rc4_hmac_md5
+	NID_rc4_hmac_md5,
+#else
+	NID_undef,
+#endif
+	1,EVP_RC4_KEY_SIZE,0,
+	EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER,
+	rc4_hmac_md5_init_key,
+	rc4_hmac_md5_cipher,
+	NULL,
+	sizeof(EVP_RC4_HMAC_MD5),
+	NULL,
+	NULL,
+	rc4_hmac_md5_ctrl,
+	NULL
+	};
+
+const EVP_CIPHER *EVP_rc4_hmac_md5(void)
+	{
+	return(&r4_hmac_md5_cipher);
+	}
+#endif

diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h
index 9f9795e..0d1b20a 100644
--- a/crypto/evp/evp.h
+++ b/crypto/evp/evp.h

@@ -83,7 +83,7 @@
 #define EVP_RC5_32_12_16_KEY_SIZE	16
 */
 #define EVP_MAX_MD_SIZE			64	/* longest known is SHA512 */
-#define EVP_MAX_KEY_LENGTH		32
+#define EVP_MAX_KEY_LENGTH		64
 #define EVP_MAX_IV_LENGTH		16
 #define EVP_MAX_BLOCK_LENGTH		32
 
@@ -116,6 +116,7 @@
 #define EVP_PKEY_DH	NID_dhKeyAgreement
 #define EVP_PKEY_EC	NID_X9_62_id_ecPublicKey
 #define EVP_PKEY_HMAC	NID_hmac
+#define EVP_PKEY_CMAC	NID_cmac
 
 #ifdef	__cplusplus
 extern "C" {
@@ -216,6 +217,8 @@
 
 #define EVP_MD_FLAG_DIGALGID_CUSTOM		0x0018
 
+#define EVP_MD_FLAG_FIPS	0x0400 /* Note if suitable for use in FIPS mode */
+
 /* Digest ctrls */
 
 #define	EVP_MD_CTRL_DIGALGID			0x1
@@ -325,6 +328,10 @@
 #define		EVP_CIPH_CBC_MODE		0x2
 #define		EVP_CIPH_CFB_MODE		0x3
 #define		EVP_CIPH_OFB_MODE		0x4
+#define		EVP_CIPH_CTR_MODE		0x5
+#define		EVP_CIPH_GCM_MODE		0x6
+#define		EVP_CIPH_CCM_MODE		0x7
+#define		EVP_CIPH_XTS_MODE		0x10001
 #define 	EVP_CIPH_MODE			0xF0007
 /* Set if variable length cipher */
 #define 	EVP_CIPH_VARIABLE_LENGTH	0x8
@@ -346,6 +353,15 @@
 #define		EVP_CIPH_FLAG_DEFAULT_ASN1	0x1000
 /* Buffer length in bits not bytes: CFB1 mode only */
 #define		EVP_CIPH_FLAG_LENGTH_BITS	0x2000
+/* Note if suitable for use in FIPS mode */
+#define		EVP_CIPH_FLAG_FIPS		0x4000
+/* Allow non FIPS cipher in FIPS mode */
+#define		EVP_CIPH_FLAG_NON_FIPS_ALLOW	0x8000
+/* Cipher handles any and all padding logic as well
+ * as finalisation.
+ */
+#define 	EVP_CIPH_FLAG_CUSTOM_CIPHER	0x100000
+#define		EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
 
 /* ctrl() values */
 
@@ -358,6 +374,34 @@
 #define 	EVP_CTRL_RAND_KEY		0x6
 #define 	EVP_CTRL_PBE_PRF_NID		0x7
 #define 	EVP_CTRL_COPY			0x8
+#define 	EVP_CTRL_GCM_SET_IVLEN		0x9
+#define 	EVP_CTRL_GCM_GET_TAG		0x10
+#define 	EVP_CTRL_GCM_SET_TAG		0x11
+#define		EVP_CTRL_GCM_SET_IV_FIXED	0x12
+#define		EVP_CTRL_GCM_IV_GEN		0x13
+#define		EVP_CTRL_CCM_SET_IVLEN		EVP_CTRL_GCM_SET_IVLEN
+#define		EVP_CTRL_CCM_GET_TAG		EVP_CTRL_GCM_GET_TAG
+#define		EVP_CTRL_CCM_SET_TAG		EVP_CTRL_GCM_SET_TAG
+#define		EVP_CTRL_CCM_SET_L		0x14
+#define		EVP_CTRL_CCM_SET_MSGLEN		0x15
+/* AEAD cipher deduces payload length and returns number of bytes
+ * required to store MAC and eventual padding. Subsequent call to
+ * EVP_Cipher even appends/verifies MAC.
+ */
+#define		EVP_CTRL_AEAD_TLS1_AAD		0x16
+/* Used by composite AEAD ciphers, no-op in GCM, CCM... */
+#define		EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+/* Set the GCM invocation field, decrypt only */
+#define		EVP_CTRL_GCM_SET_IV_INV		0x18
+
+/* GCM TLS constants */
+/* Length of fixed part of IV derived from PRF */
+#define EVP_GCM_TLS_FIXED_IV_LEN			4
+/* Length of explicit part of IV part of TLS records */
+#define EVP_GCM_TLS_EXPLICIT_IV_LEN			8
+/* Length of tag for TLS */
+#define EVP_GCM_TLS_TAG_LEN				16
+
 
 typedef struct evp_cipher_info_st
 	{
@@ -375,7 +419,7 @@
 	unsigned char  oiv[EVP_MAX_IV_LENGTH];	/* original iv */
 	unsigned char  iv[EVP_MAX_IV_LENGTH];	/* working iv */
 	unsigned char buf[EVP_MAX_BLOCK_LENGTH];/* saved partial block */
-	int num;				/* used by cfb/ofb mode */
+	int num;				/* used by cfb/ofb/ctr mode */
 
 	void *app_data;		/* application stuff */
 	int key_len;		/* May change for variable length cipher */
@@ -695,6 +739,9 @@
 #ifndef OPENSSL_NO_RC4
 const EVP_CIPHER *EVP_rc4(void);
 const EVP_CIPHER *EVP_rc4_40(void);
+#ifndef OPENSSL_NO_MD5
+const EVP_CIPHER *EVP_rc4_hmac_md5(void);
+#endif
 #endif
 #ifndef OPENSSL_NO_IDEA
 const EVP_CIPHER *EVP_idea_ecb(void);
@@ -741,9 +788,10 @@
 const EVP_CIPHER *EVP_aes_128_cfb128(void);
 # define EVP_aes_128_cfb EVP_aes_128_cfb128
 const EVP_CIPHER *EVP_aes_128_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_128_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_128_gcm(void);
+const EVP_CIPHER *EVP_aes_128_ccm(void);
+const EVP_CIPHER *EVP_aes_128_xts(void);
 const EVP_CIPHER *EVP_aes_192_ecb(void);
 const EVP_CIPHER *EVP_aes_192_cbc(void);
 const EVP_CIPHER *EVP_aes_192_cfb1(void);
@@ -751,9 +799,9 @@
 const EVP_CIPHER *EVP_aes_192_cfb128(void);
 # define EVP_aes_192_cfb EVP_aes_192_cfb128
 const EVP_CIPHER *EVP_aes_192_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_192_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_192_gcm(void);
+const EVP_CIPHER *EVP_aes_192_ccm(void);
 const EVP_CIPHER *EVP_aes_256_ecb(void);
 const EVP_CIPHER *EVP_aes_256_cbc(void);
 const EVP_CIPHER *EVP_aes_256_cfb1(void);
@@ -761,8 +809,13 @@
 const EVP_CIPHER *EVP_aes_256_cfb128(void);
 # define EVP_aes_256_cfb EVP_aes_256_cfb128
 const EVP_CIPHER *EVP_aes_256_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_256_ctr(void);
+const EVP_CIPHER *EVP_aes_256_gcm(void);
+const EVP_CIPHER *EVP_aes_256_ccm(void);
+const EVP_CIPHER *EVP_aes_256_xts(void);
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
 #endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
@@ -1047,13 +1100,22 @@
 #define EVP_PKEY_CTRL_CMS_DECRYPT	10
 #define EVP_PKEY_CTRL_CMS_SIGN		11
 
+#define EVP_PKEY_CTRL_CIPHER		12
+
 #define EVP_PKEY_ALG_CTRL		0x1000
 
 
 #define EVP_PKEY_FLAG_AUTOARGLEN	2
+/* Method handles all operations: don't assume any digest related
+ * defaults.
+ */
+#define EVP_PKEY_FLAG_SIGCTX_CUSTOM	4
 
 const EVP_PKEY_METHOD *EVP_PKEY_meth_find(int type);
 EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags);
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+				const EVP_PKEY_METHOD *meth);
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src);
 void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth);
 int EVP_PKEY_meth_add0(const EVP_PKEY_METHOD *pmeth);
 
@@ -1071,7 +1133,7 @@
 void EVP_PKEY_CTX_set0_keygen_info(EVP_PKEY_CTX *ctx, int *dat, int datlen);
 
 EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
-				unsigned char *key, int keylen);
+				const unsigned char *key, int keylen);
 
 void EVP_PKEY_CTX_set_data(EVP_PKEY_CTX *ctx, void *data);
 void *EVP_PKEY_CTX_get_data(EVP_PKEY_CTX *ctx);
@@ -1190,8 +1252,13 @@
 /* Error codes for the EVP functions. */
 
 /* Function codes. */
+#define EVP_F_AESNI_INIT_KEY				 165
+#define EVP_F_AESNI_XTS_CIPHER				 176
 #define EVP_F_AES_INIT_KEY				 133
+#define EVP_F_AES_XTS					 172
+#define EVP_F_AES_XTS_CIPHER				 175
 #define EVP_F_CAMELLIA_INIT_KEY				 159
+#define EVP_F_CMAC_INIT					 173
 #define EVP_F_D2I_PKEY					 100
 #define EVP_F_DO_SIGVER_INIT				 161
 #define EVP_F_DSAPKEY2PKCS8				 134
@@ -1246,15 +1313,24 @@
 #define EVP_F_EVP_RIJNDAEL				 126
 #define EVP_F_EVP_SIGNFINAL				 107
 #define EVP_F_EVP_VERIFYFINAL				 108
+#define EVP_F_FIPS_CIPHERINIT				 166
+#define EVP_F_FIPS_CIPHER_CTX_COPY			 170
+#define EVP_F_FIPS_CIPHER_CTX_CTRL			 167
+#define EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH		 171
+#define EVP_F_FIPS_DIGESTINIT				 168
+#define EVP_F_FIPS_MD_CTX_COPY				 169
+#define EVP_F_HMAC_INIT_EX				 174
 #define EVP_F_INT_CTX_NEW				 157
 #define EVP_F_PKCS5_PBE_KEYIVGEN			 117
 #define EVP_F_PKCS5_V2_PBE_KEYIVGEN			 118
+#define EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN			 164
 #define EVP_F_PKCS8_SET_BROKEN				 112
 #define EVP_F_PKEY_SET_TYPE				 158
 #define EVP_F_RC2_MAGIC_TO_METH				 109
 #define EVP_F_RC5_CTRL					 125
 
 /* Reason codes. */
+#define EVP_R_AES_IV_SETUP_FAILED			 162
 #define EVP_R_AES_KEY_SETUP_FAILED			 143
 #define EVP_R_ASN1_LIB					 140
 #define EVP_R_BAD_BLOCK_LENGTH				 136
@@ -1272,6 +1348,7 @@
 #define EVP_R_DECODE_ERROR				 114
 #define EVP_R_DIFFERENT_KEY_TYPES			 101
 #define EVP_R_DIFFERENT_PARAMETERS			 153
+#define EVP_R_DISABLED_FOR_FIPS				 163
 #define EVP_R_ENCODE_ERROR				 115
 #define EVP_R_EVP_PBE_CIPHERINIT_ERROR			 119
 #define EVP_R_EXPECTING_AN_RSA_KEY			 127
@@ -1303,6 +1380,7 @@
 #define EVP_R_PRIVATE_KEY_DECODE_ERROR			 145
 #define EVP_R_PRIVATE_KEY_ENCODE_ERROR			 146
 #define EVP_R_PUBLIC_KEY_NOT_RSA			 106
+#define EVP_R_TOO_LARGE					 164
 #define EVP_R_UNKNOWN_CIPHER				 160
 #define EVP_R_UNKNOWN_DIGEST				 161
 #define EVP_R_UNKNOWN_PBE_ALGORITHM			 121

diff --git a/crypto/evp/evp_enc.c b/crypto/evp/evp_enc.c
index c268d25..6910726 100644
--- a/crypto/evp/evp_enc.c
+++ b/crypto/evp/evp_enc.c

@@ -64,8 +64,18 @@
 #ifndef OPENSSL_NO_ENGINE
 #include <openssl/engine.h>
 #endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 #include "evp_locl.h"
 
+#ifdef OPENSSL_FIPS
+#define M_do_cipher(ctx, out, in, inl) FIPS_cipher(ctx, out, in, inl)
+#else
+#define M_do_cipher(ctx, out, in, inl) ctx->cipher->do_cipher(ctx, out, in, inl)
+#endif
+
+
 const char EVP_version[]="EVP" OPENSSL_VERSION_PTEXT;
 
 void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *ctx)
@@ -115,10 +125,14 @@
 		/* Ensure a context left lying around from last time is cleared
 		 * (the previous check attempted to avoid this if the same
 		 * ENGINE and EVP_CIPHER could be used). */
-		EVP_CIPHER_CTX_cleanup(ctx);
-
-		/* Restore encrypt field: it is zeroed by cleanup */
-		ctx->encrypt = enc;
+		if (ctx->cipher)
+			{
+			unsigned long flags = ctx->flags;
+			EVP_CIPHER_CTX_cleanup(ctx);
+			/* Restore encrypt and flags */
+			ctx->encrypt = enc;
+			ctx->flags = flags;
+			}
 #ifndef OPENSSL_NO_ENGINE
 		if(impl)
 			{
@@ -155,6 +169,9 @@
 			ctx->engine = NULL;
 #endif
 
+#ifdef OPENSSL_FIPS
+		return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+#else
 		ctx->cipher=cipher;
 		if (ctx->cipher->ctx_size)
 			{
@@ -179,6 +196,7 @@
 				return 0;
 				}
 			}
+#endif
 		}
 	else if(!ctx->cipher)
 		{
@@ -188,6 +206,9 @@
 #ifndef OPENSSL_NO_ENGINE
 skip_to_init:
 #endif
+#ifdef OPENSSL_FIPS
+	return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+#else
 	/* we assume block size is a power of 2 in *cryptUpdate */
 	OPENSSL_assert(ctx->cipher->block_size == 1
 	    || ctx->cipher->block_size == 8
@@ -214,6 +235,13 @@
 			memcpy(ctx->iv, ctx->oiv, EVP_CIPHER_CTX_iv_length(ctx));
 			break;
 
+			case EVP_CIPH_CTR_MODE:
+			ctx->num = 0;
+			/* Don't reuse IV for CTR mode */
+			if(iv)
+				memcpy(ctx->iv, iv, EVP_CIPHER_CTX_iv_length(ctx));
+			break;
+
 			default:
 			return 0;
 			break;
@@ -227,6 +255,7 @@
 	ctx->final_used=0;
 	ctx->block_mask=ctx->cipher->block_size-1;
 	return 1;
+#endif
 	}
 
 int EVP_CipherUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
@@ -280,6 +309,16 @@
 	{
 	int i,j,bl;
 
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		i = M_do_cipher(ctx, out, in, inl);
+		if (i < 0)
+			return 0;
+		else
+			*outl = i;
+		return 1;
+		}
+
 	if (inl <= 0)
 		{
 		*outl = 0;
@@ -288,7 +327,7 @@
 
 	if(ctx->buf_len == 0 && (inl&(ctx->block_mask)) == 0)
 		{
-		if(ctx->cipher->do_cipher(ctx,out,in,inl))
+		if(M_do_cipher(ctx,out,in,inl))
 			{
 			*outl=inl;
 			return 1;
@@ -315,7 +354,7 @@
 			{
 			j=bl-i;
 			memcpy(&(ctx->buf[i]),in,j);
-			if(!ctx->cipher->do_cipher(ctx,out,ctx->buf,bl)) return 0;
+			if(!M_do_cipher(ctx,out,ctx->buf,bl)) return 0;
 			inl-=j;
 			in+=j;
 			out+=bl;
@@ -328,7 +367,7 @@
 	inl-=i;
 	if (inl > 0)
 		{
-		if(!ctx->cipher->do_cipher(ctx,out,in,inl)) return 0;
+		if(!M_do_cipher(ctx,out,in,inl)) return 0;
 		*outl+=inl;
 		}
 
@@ -350,6 +389,16 @@
 	int n,ret;
 	unsigned int i, b, bl;
 
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		ret = M_do_cipher(ctx, out, NULL, 0);
+		if (ret < 0)
+			return 0;
+		else 
+			*outl = ret;
+		return 1;
+		}
+
 	b=ctx->cipher->block_size;
 	OPENSSL_assert(b <= sizeof ctx->buf);
 	if (b == 1)
@@ -372,7 +421,7 @@
 	n=b-bl;
 	for (i=bl; i<b; i++)
 		ctx->buf[i]=n;
-	ret=ctx->cipher->do_cipher(ctx,out,ctx->buf,b);
+	ret=M_do_cipher(ctx,out,ctx->buf,b);
 
 
 	if(ret)
@@ -387,6 +436,19 @@
 	int fix_len;
 	unsigned int b;
 
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		fix_len = M_do_cipher(ctx, out, in, inl);
+		if (fix_len < 0)
+			{
+			*outl = 0;
+			return 0;
+			}
+		else
+			*outl = fix_len;
+		return 1;
+		}
+
 	if (inl <= 0)
 		{
 		*outl = 0;
@@ -440,8 +502,18 @@
 	{
 	int i,n;
 	unsigned int b;
-
 	*outl=0;
+
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		i = M_do_cipher(ctx, out, NULL, 0);
+		if (i < 0)
+			return 0;
+		else
+			*outl = i;
+		return 1;
+		}
+
 	b=ctx->cipher->block_size;
 	if (ctx->flags & EVP_CIPH_NO_PADDING)
 		{
@@ -496,6 +568,7 @@
 
 int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c)
 	{
+#ifndef OPENSSL_FIPS
 	if (c->cipher != NULL)
 		{
 		if(c->cipher->cleanup && !c->cipher->cleanup(c))
@@ -506,12 +579,16 @@
 		}
 	if (c->cipher_data)
 		OPENSSL_free(c->cipher_data);
+#endif
 #ifndef OPENSSL_NO_ENGINE
 	if (c->engine)
 		/* The EVP_CIPHER we used belongs to an ENGINE, release the
 		 * functional reference we held for this reason. */
 		ENGINE_finish(c->engine);
 #endif
+#ifdef OPENSSL_FIPS
+	FIPS_cipher_ctx_cleanup(c);
+#endif
 	memset(c,0,sizeof(EVP_CIPHER_CTX));
 	return 1;
 	}

diff --git a/crypto/evp/evp_err.c b/crypto/evp/evp_err.c
index d8bfec0..db0f76d 100644
--- a/crypto/evp/evp_err.c
+++ b/crypto/evp/evp_err.c

@@ -1,6 +1,6 @@
 /* crypto/evp/evp_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,8 +70,13 @@
 
 static ERR_STRING_DATA EVP_str_functs[]=
 	{
+{ERR_FUNC(EVP_F_AESNI_INIT_KEY),	"AESNI_INIT_KEY"},
+{ERR_FUNC(EVP_F_AESNI_XTS_CIPHER),	"AESNI_XTS_CIPHER"},
 {ERR_FUNC(EVP_F_AES_INIT_KEY),	"AES_INIT_KEY"},
+{ERR_FUNC(EVP_F_AES_XTS),	"AES_XTS"},
+{ERR_FUNC(EVP_F_AES_XTS_CIPHER),	"AES_XTS_CIPHER"},
 {ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY),	"CAMELLIA_INIT_KEY"},
+{ERR_FUNC(EVP_F_CMAC_INIT),	"CMAC_INIT"},
 {ERR_FUNC(EVP_F_D2I_PKEY),	"D2I_PKEY"},
 {ERR_FUNC(EVP_F_DO_SIGVER_INIT),	"DO_SIGVER_INIT"},
 {ERR_FUNC(EVP_F_DSAPKEY2PKCS8),	"DSAPKEY2PKCS8"},
@@ -86,7 +91,7 @@
 {ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX),	"EVP_DigestInit_ex"},
 {ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX),	"EVP_EncryptFinal_ex"},
 {ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX),	"EVP_MD_CTX_copy_ex"},
-{ERR_FUNC(EVP_F_EVP_MD_SIZE),	"EVP_MD_SIZE"},
+{ERR_FUNC(EVP_F_EVP_MD_SIZE),	"EVP_MD_size"},
 {ERR_FUNC(EVP_F_EVP_OPENINIT),	"EVP_OpenInit"},
 {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD),	"EVP_PBE_alg_add"},
 {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE),	"EVP_PBE_alg_add_type"},
@@ -126,9 +131,17 @@
 {ERR_FUNC(EVP_F_EVP_RIJNDAEL),	"EVP_RIJNDAEL"},
 {ERR_FUNC(EVP_F_EVP_SIGNFINAL),	"EVP_SignFinal"},
 {ERR_FUNC(EVP_F_EVP_VERIFYFINAL),	"EVP_VerifyFinal"},
+{ERR_FUNC(EVP_F_FIPS_CIPHERINIT),	"FIPS_CIPHERINIT"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_COPY),	"FIPS_CIPHER_CTX_COPY"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_CTRL),	"FIPS_CIPHER_CTX_CTRL"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH),	"FIPS_CIPHER_CTX_SET_KEY_LENGTH"},
+{ERR_FUNC(EVP_F_FIPS_DIGESTINIT),	"FIPS_DIGESTINIT"},
+{ERR_FUNC(EVP_F_FIPS_MD_CTX_COPY),	"FIPS_MD_CTX_COPY"},
+{ERR_FUNC(EVP_F_HMAC_INIT_EX),	"HMAC_Init_ex"},
 {ERR_FUNC(EVP_F_INT_CTX_NEW),	"INT_CTX_NEW"},
 {ERR_FUNC(EVP_F_PKCS5_PBE_KEYIVGEN),	"PKCS5_PBE_keyivgen"},
 {ERR_FUNC(EVP_F_PKCS5_V2_PBE_KEYIVGEN),	"PKCS5_v2_PBE_keyivgen"},
+{ERR_FUNC(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN),	"PKCS5_V2_PBKDF2_KEYIVGEN"},
 {ERR_FUNC(EVP_F_PKCS8_SET_BROKEN),	"PKCS8_set_broken"},
 {ERR_FUNC(EVP_F_PKEY_SET_TYPE),	"PKEY_SET_TYPE"},
 {ERR_FUNC(EVP_F_RC2_MAGIC_TO_METH),	"RC2_MAGIC_TO_METH"},
@@ -138,6 +151,7 @@
 
 static ERR_STRING_DATA EVP_str_reasons[]=
 	{
+{ERR_REASON(EVP_R_AES_IV_SETUP_FAILED)   ,"aes iv setup failed"},
 {ERR_REASON(EVP_R_AES_KEY_SETUP_FAILED)  ,"aes key setup failed"},
 {ERR_REASON(EVP_R_ASN1_LIB)              ,"asn1 lib"},
 {ERR_REASON(EVP_R_BAD_BLOCK_LENGTH)      ,"bad block length"},
@@ -155,6 +169,7 @@
 {ERR_REASON(EVP_R_DECODE_ERROR)          ,"decode error"},
 {ERR_REASON(EVP_R_DIFFERENT_KEY_TYPES)   ,"different key types"},
 {ERR_REASON(EVP_R_DIFFERENT_PARAMETERS)  ,"different parameters"},
+{ERR_REASON(EVP_R_DISABLED_FOR_FIPS)     ,"disabled for fips"},
 {ERR_REASON(EVP_R_ENCODE_ERROR)          ,"encode error"},
 {ERR_REASON(EVP_R_EVP_PBE_CIPHERINIT_ERROR),"evp pbe cipherinit error"},
 {ERR_REASON(EVP_R_EXPECTING_AN_RSA_KEY)  ,"expecting an rsa key"},
@@ -186,6 +201,7 @@
 {ERR_REASON(EVP_R_PRIVATE_KEY_DECODE_ERROR),"private key decode error"},
 {ERR_REASON(EVP_R_PRIVATE_KEY_ENCODE_ERROR),"private key encode error"},
 {ERR_REASON(EVP_R_PUBLIC_KEY_NOT_RSA)    ,"public key not rsa"},
+{ERR_REASON(EVP_R_TOO_LARGE)             ,"too large"},
 {ERR_REASON(EVP_R_UNKNOWN_CIPHER)        ,"unknown cipher"},
 {ERR_REASON(EVP_R_UNKNOWN_DIGEST)        ,"unknown digest"},
 {ERR_REASON(EVP_R_UNKNOWN_PBE_ALGORITHM) ,"unknown pbe algorithm"},

diff --git a/crypto/evp/evp_key.c b/crypto/evp/evp_key.c
index 839d6a3..7961fbe 100644
--- a/crypto/evp/evp_key.c
+++ b/crypto/evp/evp_key.c

@@ -120,7 +120,7 @@
 	unsigned char md_buf[EVP_MAX_MD_SIZE];
 	int niv,nkey,addmd=0;
 	unsigned int mds=0,i;
-
+	int rv = 0;
 	nkey=type->key_len;
 	niv=type->iv_len;
 	OPENSSL_assert(nkey <= EVP_MAX_KEY_LENGTH);
@@ -134,17 +134,24 @@
 		if (!EVP_DigestInit_ex(&c,md, NULL))
 			return 0;
 		if (addmd++)
-			EVP_DigestUpdate(&c,&(md_buf[0]),mds);
-		EVP_DigestUpdate(&c,data,datal);
+			if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+				goto err;
+		if (!EVP_DigestUpdate(&c,data,datal))
+			goto err;
 		if (salt != NULL)
-			EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN);
-		EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+			if (!EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN))
+				goto err;
+		if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+			goto err;
 
 		for (i=1; i<(unsigned int)count; i++)
 			{
-			EVP_DigestInit_ex(&c,md, NULL);
-			EVP_DigestUpdate(&c,&(md_buf[0]),mds);
-			EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+			if (!EVP_DigestInit_ex(&c,md, NULL))
+				goto err;
+			if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+				goto err;
+			if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+				goto err;
 			}
 		i=0;
 		if (nkey)
@@ -173,8 +180,10 @@
 			}
 		if ((nkey == 0) && (niv == 0)) break;
 		}
+	rv = type->key_len;
+	err:
 	EVP_MD_CTX_cleanup(&c);
 	OPENSSL_cleanse(&(md_buf[0]),EVP_MAX_MD_SIZE);
-	return(type->key_len);
+	return rv;
 	}
 

diff --git a/crypto/evp/evp_lib.c b/crypto/evp/evp_lib.c
index 40951a0..b180e48 100644
--- a/crypto/evp/evp_lib.c
+++ b/crypto/evp/evp_lib.c

@@ -67,6 +67,8 @@
 
 	if (c->cipher->set_asn1_parameters != NULL)
 		ret=c->cipher->set_asn1_parameters(c,type);
+	else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+		ret=EVP_CIPHER_set_asn1_iv(c, type);
 	else
 		ret=-1;
 	return(ret);
@@ -78,6 +80,8 @@
 
 	if (c->cipher->get_asn1_parameters != NULL)
 		ret=c->cipher->get_asn1_parameters(c,type);
+	else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+		ret=EVP_CIPHER_get_asn1_iv(c, type);
 	else
 		ret=-1;
 	return(ret);

diff --git a/crypto/evp/evp_locl.h b/crypto/evp/evp_locl.h
index 292d74c..08c0a66 100644
--- a/crypto/evp/evp_locl.h
+++ b/crypto/evp/evp_locl.h

@@ -343,3 +343,43 @@
 	} /* EVP_PKEY_METHOD */;
 
 void evp_pkey_set_cb_translate(BN_GENCB *cb, EVP_PKEY_CTX *ctx);
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+			     ASN1_TYPE *param,
+			     const EVP_CIPHER *c, const EVP_MD *md, int en_de);
+
+#ifdef OPENSSL_FIPS
+
+#ifdef OPENSSL_DOING_MAKEDEPEND
+#undef SHA1_Init
+#undef SHA1_Update
+#undef SHA224_Init
+#undef SHA256_Init
+#undef SHA384_Init
+#undef SHA512_Init
+#undef DES_set_key_unchecked
+#endif
+
+#define RIPEMD160_Init	private_RIPEMD160_Init
+#define WHIRLPOOL_Init	private_WHIRLPOOL_Init
+#define MD5_Init	private_MD5_Init
+#define MD4_Init	private_MD4_Init
+#define MD2_Init	private_MD2_Init
+#define MDC2_Init	private_MDC2_Init
+#define SHA_Init	private_SHA_Init
+#define SHA1_Init	private_SHA1_Init
+#define SHA224_Init	private_SHA224_Init
+#define SHA256_Init	private_SHA256_Init
+#define SHA384_Init	private_SHA384_Init
+#define SHA512_Init	private_SHA512_Init
+
+#define BF_set_key	private_BF_set_key
+#define CAST_set_key	private_CAST_set_key
+#define idea_set_encrypt_key	private_idea_set_encrypt_key
+#define SEED_set_key	private_SEED_set_key
+#define RC2_set_key	private_RC2_set_key
+#define RC4_set_key	private_RC4_set_key
+#define DES_set_key_unchecked	private_DES_set_key_unchecked
+#define Camellia_set_key	private_Camellia_set_key
+
+#endif

diff --git a/crypto/evp/evp_pbe.c b/crypto/evp/evp_pbe.c
index c9d932d..f8c32d8 100644
--- a/crypto/evp/evp_pbe.c
+++ b/crypto/evp/evp_pbe.c

@@ -61,6 +61,7 @@
 #include <openssl/evp.h>
 #include <openssl/pkcs12.h>
 #include <openssl/x509.h>
+#include "evp_locl.h"
 
 /* Password based encryption (PBE) functions */
 
@@ -87,6 +88,10 @@
 	{EVP_PBE_TYPE_OUTER, NID_pbeWithSHA1AndRC2_CBC,
 			NID_rc2_64_cbc, NID_sha1, PKCS5_PBE_keyivgen},
 
+#ifndef OPENSSL_NO_HMAC
+	{EVP_PBE_TYPE_OUTER, NID_id_pbkdf2, -1, -1, PKCS5_v2_PBKDF2_keyivgen},
+#endif
+
 	{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And128BitRC4,
 			NID_rc4, NID_sha1, PKCS12_PBE_keyivgen},
 	{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And40BitRC4,

diff --git a/crypto/evp/evptests.txt b/crypto/evp/evptests.txt
index beb1214..c273707 100644
--- a/crypto/evp/evptests.txt
+++ b/crypto/evp/evptests.txt

@@ -158,6 +158,19 @@
 AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:E1C656305ED1A7A6563805746FE03EDC:30C81C46A35CE411E5FBC1191A0A52EF:71AB47A086E86EEDF39D1C5BBA97C408:0
 AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:41635BE625B48AFC1666DD42A09D96E7:F69F2445DF4F9B17AD2B417BE66C3710:0126141D67F37BE8538F5A8BE740E484:0
 
+# AES Counter test vectors from RFC3686
+aes-128-ctr:AE6852F8121067CC4BF7A5765577F39E:00000030000000000000000000000001:53696E676C6520626C6F636B206D7367:E4095D4FB7A7B3792D6175A3261311B8:1
+aes-128-ctr:7E24067817FAE0D743D6CE1F32539163:006CB6DBC0543B59DA48D90B00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:5104A106168A72D9790D41EE8EDAD388EB2E1EFC46DA57C8FCE630DF9141BE28:1
+aes-128-ctr:7691BE035E5020A8AC6E618529F9A0DC:00E0017B27777F3F4A1786F000000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:C1CF48A89F2FFDD9CF4652E9EFDB72D74540A42BDE6D7836D59A5CEAAEF3105325B2072F:1
+
+aes-192-ctr:16AF5B145FC9F579C175F93E3BFB0EED863D06CCFDB78515:0000004836733C147D6D93CB00000001:53696E676C6520626C6F636B206D7367:4B55384FE259C9C84E7935A003CBE928:1
+aes-192-ctr:7C5CB2401B3DC33C19E7340819E0F69C678C3DB8E6F6A91A:0096B03B020C6EADC2CB500D00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:453243FC609B23327EDFAAFA7131CD9F8490701C5AD4A79CFC1FE0FF42F4FB00:1
+aes-192-ctr:02BF391EE8ECB159B959617B0965279BF59B60A786D3E0FE:0007BDFD5CBD60278DCC091200000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:96893FC55E5C722F540B7DD1DDF7E758D288BC95C69165884536C811662F2188ABEE0935:1
+
+aes-256-ctr:776BEFF2851DB06F4C8A0542C8696F6C6A81AF1EEC96B4D37FC1D689E6C1C104:00000060DB5672C97AA8F0B200000001:53696E676C6520626C6F636B206D7367:145AD01DBF824EC7560863DC71E3E0C0:1
+aes-256-ctr:F6D66D6BD52D59BB0796365879EFF886C66DD51A5B6A99744B50590C87A23884:00FAAC24C1585EF15A43D87500000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:F05E231B3894612C49EE000B804EB2A9B8306B508F839D6A5530831D9344AF1C:1
+aes-256-ctr:FF7A617CE69148E4F1726E2F43581DE2AA62D9F805532EDFF1EED687FB54153D:001CC5B751A51D70A1C1114800000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:EB6C52821D0BBBF7CE7594462ACA4FAAB407DF866569FD07F48CC0B583D6071F1EC0E6B8:1
+
 # DES ECB tests (from destest)
 
 DES-ECB:0000000000000000::0000000000000000:8CA64DE9C1B123A7

diff --git a/crypto/evp/m_dss.c b/crypto/evp/m_dss.c
index 48c2689..4ad63ad 100644
--- a/crypto/evp/m_dss.c
+++ b/crypto/evp/m_dss.c

@@ -66,6 +66,7 @@
 #endif
 
 #ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
 
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
@@ -97,3 +98,4 @@
 	return(&dsa_md);
 	}
 #endif
+#endif

diff --git a/crypto/evp/m_dss1.c b/crypto/evp/m_dss1.c
index 4f03fb7..f80170e 100644
--- a/crypto/evp/m_dss1.c
+++ b/crypto/evp/m_dss1.c

@@ -68,6 +68,8 @@
 #include <openssl/dsa.h>
 #endif
 
+#ifndef OPENSSL_FIPS 
+
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
 
@@ -98,3 +100,4 @@
 	return(&dss1_md);
 	}
 #endif
+#endif

diff --git a/crypto/evp/m_ecdsa.c b/crypto/evp/m_ecdsa.c
index 8d87a49..4b15fb0 100644
--- a/crypto/evp/m_ecdsa.c
+++ b/crypto/evp/m_ecdsa.c

@@ -116,6 +116,8 @@
 #include <openssl/x509.h>
 
 #ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
+
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
 
@@ -146,3 +148,4 @@
 	return(&ecdsa_md);
 	}
 #endif
+#endif

diff --git a/crypto/evp/m_md4.c b/crypto/evp/m_md4.c
index 1e0b7c5..6d47f61 100644
--- a/crypto/evp/m_md4.c
+++ b/crypto/evp/m_md4.c

@@ -69,6 +69,8 @@
 #include <openssl/rsa.h>
 #endif
 
+#include "evp_locl.h"
+
 static int init(EVP_MD_CTX *ctx)
 	{ return MD4_Init(ctx->md_data); }
 

diff --git a/crypto/evp/m_md5.c b/crypto/evp/m_md5.c
index 63c1421..9a8bae0 100644
--- a/crypto/evp/m_md5.c
+++ b/crypto/evp/m_md5.c

@@ -68,6 +68,7 @@
 #ifndef OPENSSL_NO_RSA
 #include <openssl/rsa.h>
 #endif
+#include "evp_locl.h"
 
 static int init(EVP_MD_CTX *ctx)
 	{ return MD5_Init(ctx->md_data); }

diff --git a/crypto/evp/m_mdc2.c b/crypto/evp/m_mdc2.c
index b08d559..3602bed 100644
--- a/crypto/evp/m_mdc2.c
+++ b/crypto/evp/m_mdc2.c

@@ -69,6 +69,8 @@
 #include <openssl/rsa.h>
 #endif
 
+#include "evp_locl.h"
+
 static int init(EVP_MD_CTX *ctx)
 	{ return MDC2_Init(ctx->md_data); }
 

diff --git a/crypto/evp/m_ripemd.c b/crypto/evp/m_ripemd.c
index a1d60ee..7bf4804 100644
--- a/crypto/evp/m_ripemd.c
+++ b/crypto/evp/m_ripemd.c

@@ -68,6 +68,7 @@
 #ifndef OPENSSL_NO_RSA
 #include <openssl/rsa.h>
 #endif
+#include "evp_locl.h"
 
 static int init(EVP_MD_CTX *ctx)
 	{ return RIPEMD160_Init(ctx->md_data); }

diff --git a/crypto/evp/m_sha1.c b/crypto/evp/m_sha1.c
index 9a2790f..3cb11f1 100644
--- a/crypto/evp/m_sha1.c
+++ b/crypto/evp/m_sha1.c

@@ -59,6 +59,8 @@
 #include <stdio.h>
 #include "cryptlib.h"
 
+#ifndef OPENSSL_FIPS
+
 #ifndef OPENSSL_NO_SHA
 
 #include <openssl/evp.h>
@@ -68,6 +70,7 @@
 #include <openssl/rsa.h>
 #endif
 
+
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
 
@@ -202,3 +205,5 @@
 const EVP_MD *EVP_sha512(void)
 	{ return(&sha512_md); }
 #endif	/* ifndef OPENSSL_NO_SHA512 */
+
+#endif

diff --git a/crypto/evp/m_wp.c b/crypto/evp/m_wp.c
index 1ce47c0..c51bc2d 100644
--- a/crypto/evp/m_wp.c
+++ b/crypto/evp/m_wp.c

@@ -9,6 +9,7 @@
 #include <openssl/objects.h>
 #include <openssl/x509.h>
 #include <openssl/whrlpool.h>
+#include "evp_locl.h"
 
 static int init(EVP_MD_CTX *ctx)
 	{ return WHIRLPOOL_Init(ctx->md_data); }

diff --git a/crypto/evp/names.c b/crypto/evp/names.c
index f2869f5..6311ad7 100644
--- a/crypto/evp/names.c
+++ b/crypto/evp/names.c

@@ -66,6 +66,10 @@
 	{
 	int r;
 
+	if (c == NULL) return 0;
+
+	OPENSSL_init();
+
 	r=OBJ_NAME_add(OBJ_nid2sn(c->nid),OBJ_NAME_TYPE_CIPHER_METH,(const char *)c);
 	if (r == 0) return(0);
 	check_defer(c->nid);
@@ -78,6 +82,7 @@
 	{
 	int r;
 	const char *name;
+	OPENSSL_init();
 
 	name=OBJ_nid2sn(md->type);
 	r=OBJ_NAME_add(name,OBJ_NAME_TYPE_MD_METH,(const char *)md);

diff --git a/crypto/evp/p5_crpt.c b/crypto/evp/p5_crpt.c
index 7ecfa8d..7d9c1f0 100644
--- a/crypto/evp/p5_crpt.c
+++ b/crypto/evp/p5_crpt.c

@@ -82,6 +82,8 @@
 	unsigned char *salt;
 	const unsigned char *pbuf;
 	int mdsize;
+	int rv = 0;
+	EVP_MD_CTX_init(&ctx);
 
 	/* Extract useful info from parameter */
 	if (param == NULL || param->type != V_ASN1_SEQUENCE ||
@@ -104,29 +106,37 @@
 	if(!pass) passlen = 0;
 	else if(passlen == -1) passlen = strlen(pass);
 
-	EVP_MD_CTX_init(&ctx);
-	EVP_DigestInit_ex(&ctx, md, NULL);
-	EVP_DigestUpdate(&ctx, pass, passlen);
-	EVP_DigestUpdate(&ctx, salt, saltlen);
+	if (!EVP_DigestInit_ex(&ctx, md, NULL))
+		goto err;
+	if (!EVP_DigestUpdate(&ctx, pass, passlen))
+		goto err;
+	if (!EVP_DigestUpdate(&ctx, salt, saltlen))
+		goto err;
 	PBEPARAM_free(pbe);
-	EVP_DigestFinal_ex(&ctx, md_tmp, NULL);
+	if (!EVP_DigestFinal_ex(&ctx, md_tmp, NULL))
+		goto err;
 	mdsize = EVP_MD_size(md);
 	if (mdsize < 0)
 	    return 0;
 	for (i = 1; i < iter; i++) {
-		EVP_DigestInit_ex(&ctx, md, NULL);
-		EVP_DigestUpdate(&ctx, md_tmp, mdsize);
-		EVP_DigestFinal_ex (&ctx, md_tmp, NULL);
+		if (!EVP_DigestInit_ex(&ctx, md, NULL))
+			goto err;
+		if (!EVP_DigestUpdate(&ctx, md_tmp, mdsize))
+			goto err;
+		if (!EVP_DigestFinal_ex (&ctx, md_tmp, NULL))
+			goto err;
 	}
-	EVP_MD_CTX_cleanup(&ctx);
 	OPENSSL_assert(EVP_CIPHER_key_length(cipher) <= (int)sizeof(md_tmp));
 	memcpy(key, md_tmp, EVP_CIPHER_key_length(cipher));
 	OPENSSL_assert(EVP_CIPHER_iv_length(cipher) <= 16);
 	memcpy(iv, md_tmp + (16 - EVP_CIPHER_iv_length(cipher)),
 						 EVP_CIPHER_iv_length(cipher));
-	EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de);
+	if (!EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de))
+		goto err;
 	OPENSSL_cleanse(md_tmp, EVP_MAX_MD_SIZE);
 	OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH);
 	OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH);
-	return 1;
+	rv = 1;
+	err:
+	return rv;
 }

diff --git a/crypto/evp/p5_crpt2.c b/crypto/evp/p5_crpt2.c
index 334379f..975d004 100644
--- a/crypto/evp/p5_crpt2.c
+++ b/crypto/evp/p5_crpt2.c

@@ -62,6 +62,7 @@
 #include <openssl/x509.h>
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
+#include "evp_locl.h"
 
 /* set this to print out info about the keygen algorithm */
 /* #define DEBUG_PKCS5V2 */
@@ -110,10 +111,14 @@
 		itmp[1] = (unsigned char)((i >> 16) & 0xff);
 		itmp[2] = (unsigned char)((i >> 8) & 0xff);
 		itmp[3] = (unsigned char)(i & 0xff);
-		HMAC_Init_ex(&hctx, pass, passlen, digest, NULL);
-		HMAC_Update(&hctx, salt, saltlen);
-		HMAC_Update(&hctx, itmp, 4);
-		HMAC_Final(&hctx, digtmp, NULL);
+		if (!HMAC_Init_ex(&hctx, pass, passlen, digest, NULL)
+			|| !HMAC_Update(&hctx, salt, saltlen)
+			|| !HMAC_Update(&hctx, itmp, 4)
+			|| !HMAC_Final(&hctx, digtmp, NULL))
+			{
+			HMAC_CTX_cleanup(&hctx);
+			return 0;
+			}
 		memcpy(p, digtmp, cplen);
 		for(j = 1; j < iter; j++)
 			{
@@ -168,27 +173,24 @@
                          ASN1_TYPE *param, const EVP_CIPHER *c, const EVP_MD *md,
                          int en_de)
 {
-	unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
 	const unsigned char *pbuf;
-	int saltlen, iter, plen;
-	unsigned int keylen;
+	int plen;
 	PBE2PARAM *pbe2 = NULL;
 	const EVP_CIPHER *cipher;
-	PBKDF2PARAM *kdf = NULL;
-	const EVP_MD *prfmd;
-	int prf_nid, hmac_md_nid;
+
+	int rv = 0;
 
 	if (param == NULL || param->type != V_ASN1_SEQUENCE ||
 	    param->value.sequence == NULL) {
 		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
-		return 0;
+		goto err;
 	}
 
 	pbuf = param->value.sequence->data;
 	plen = param->value.sequence->length;
 	if(!(pbe2 = d2i_PBE2PARAM(NULL, &pbuf, plen))) {
 		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
-		return 0;
+		goto err;
 	}
 
 	/* See if we recognise the key derivation function */
@@ -211,38 +213,63 @@
 	}
 
 	/* Fixup cipher based on AlgorithmIdentifier */
-	EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de);
+	if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de))
+		goto err;
 	if(EVP_CIPHER_asn1_to_param(ctx, pbe2->encryption->parameter) < 0) {
 		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
 					EVP_R_CIPHER_PARAMETER_ERROR);
 		goto err;
 	}
+	rv = PKCS5_v2_PBKDF2_keyivgen(ctx, pass, passlen,
+					pbe2->keyfunc->parameter, c, md, en_de);
+	err:
+	PBE2PARAM_free(pbe2);
+	return rv;
+}
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+                         ASN1_TYPE *param,
+			 const EVP_CIPHER *c, const EVP_MD *md, int en_de)
+{
+	unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
+	const unsigned char *pbuf;
+	int saltlen, iter, plen;
+	int rv = 0;
+	unsigned int keylen = 0;
+	int prf_nid, hmac_md_nid;
+	PBKDF2PARAM *kdf = NULL;
+	const EVP_MD *prfmd;
+
+	if (EVP_CIPHER_CTX_cipher(ctx) == NULL)
+		{
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_NO_CIPHER_SET);
+		goto err;
+		}
 	keylen = EVP_CIPHER_CTX_key_length(ctx);
 	OPENSSL_assert(keylen <= sizeof key);
 
-	/* Now decode key derivation function */
+	/* Decode parameter */
 
-	if(!pbe2->keyfunc->parameter ||
-		 (pbe2->keyfunc->parameter->type != V_ASN1_SEQUENCE))
+	if(!param || (param->type != V_ASN1_SEQUENCE))
 		{
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
 		goto err;
 		}
 
-	pbuf = pbe2->keyfunc->parameter->value.sequence->data;
-	plen = pbe2->keyfunc->parameter->value.sequence->length;
+	pbuf = param->value.sequence->data;
+	plen = param->value.sequence->length;
+
 	if(!(kdf = d2i_PBKDF2PARAM(NULL, &pbuf, plen)) ) {
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
 		goto err;
 	}
 
-	PBE2PARAM_free(pbe2);
-	pbe2 = NULL;
+	keylen = EVP_CIPHER_CTX_key_length(ctx);
 
 	/* Now check the parameters of the kdf */
 
 	if(kdf->keylength && (ASN1_INTEGER_get(kdf->keylength) != (int)keylen)){
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
 						EVP_R_UNSUPPORTED_KEYLENGTH);
 		goto err;
 	}
@@ -254,19 +281,19 @@
 
 	if (!EVP_PBE_find(EVP_PBE_TYPE_PRF, prf_nid, NULL, &hmac_md_nid, 0))
 		{
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
 		goto err;
 		}
 
 	prfmd = EVP_get_digestbynid(hmac_md_nid);
 	if (prfmd == NULL)
 		{
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
 		goto err;
 		}
 
 	if(kdf->salt->type != V_ASN1_OCTET_STRING) {
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
 						EVP_R_UNSUPPORTED_SALT_TYPE);
 		goto err;
 	}
@@ -278,15 +305,11 @@
 	if(!PKCS5_PBKDF2_HMAC(pass, passlen, salt, saltlen, iter, prfmd,
 						   keylen, key))
 		goto err;
-	EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
+	rv = EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
+	err:
 	OPENSSL_cleanse(key, keylen);
 	PBKDF2PARAM_free(kdf);
-	return 1;
-
-	err:
-	PBE2PARAM_free(pbe2);
-	PBKDF2PARAM_free(kdf);
-	return 0;
+	return rv;
 }
 
 #ifdef DEBUG_PKCS5V2

diff --git a/crypto/evp/p_open.c b/crypto/evp/p_open.c
index 53a59a2..c748fbe 100644
--- a/crypto/evp/p_open.c
+++ b/crypto/evp/p_open.c

@@ -115,7 +115,8 @@
 	int i;
 
 	i=EVP_DecryptFinal_ex(ctx,out,outl);
-	EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+	if (i)
+		i = EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
 	return(i);
 	}
 #else /* !OPENSSL_NO_RSA */

diff --git a/crypto/evp/p_seal.c b/crypto/evp/p_seal.c
index d832452..e5919b0 100644
--- a/crypto/evp/p_seal.c
+++ b/crypto/evp/p_seal.c

@@ -110,6 +110,7 @@
 	{
 	int i;
 	i = EVP_EncryptFinal_ex(ctx,out,outl);
-	EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+	if (i) 
+		i = EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
 	return i;
 	}

diff --git a/crypto/evp/p_sign.c b/crypto/evp/p_sign.c
index bb893f5..dfa48c1 100644
--- a/crypto/evp/p_sign.c
+++ b/crypto/evp/p_sign.c

@@ -80,18 +80,20 @@
 	{
 	unsigned char m[EVP_MAX_MD_SIZE];
 	unsigned int m_len;
-	int i,ok=0,v;
+	int i=0,ok=0,v;
 	EVP_MD_CTX tmp_ctx;
+	EVP_PKEY_CTX *pkctx = NULL;
 
 	*siglen=0;
 	EVP_MD_CTX_init(&tmp_ctx);
-	EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);   
-	EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+	if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+		goto err;  
+	if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+		goto err;
 	EVP_MD_CTX_cleanup(&tmp_ctx);
 
 	if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
 		{
-		EVP_PKEY_CTX *pkctx = NULL;
 		size_t sltmp = (size_t)EVP_PKEY_size(pkey);
 		i = 0;
 		pkctx = EVP_PKEY_CTX_new(pkey, NULL);

diff --git a/crypto/evp/p_verify.c b/crypto/evp/p_verify.c
index 41d4b67..5f5c409 100644
--- a/crypto/evp/p_verify.c
+++ b/crypto/evp/p_verify.c

@@ -67,17 +67,19 @@
 	{
 	unsigned char m[EVP_MAX_MD_SIZE];
 	unsigned int m_len;
-	int i,ok=0,v;
+	int i=-1,ok=0,v;
 	EVP_MD_CTX tmp_ctx;
+	EVP_PKEY_CTX *pkctx = NULL;
 
 	EVP_MD_CTX_init(&tmp_ctx);
-	EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);     
-	EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+	if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+		goto err;    
+	if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+		goto err;
 	EVP_MD_CTX_cleanup(&tmp_ctx);
 
 	if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
 		{
-		EVP_PKEY_CTX *pkctx = NULL;
 		i = -1;
 		pkctx = EVP_PKEY_CTX_new(pkey, NULL);
 		if (!pkctx)

diff --git a/crypto/evp/pmeth_gn.c b/crypto/evp/pmeth_gn.c
index 5d74161..4651c81 100644
--- a/crypto/evp/pmeth_gn.c
+++ b/crypto/evp/pmeth_gn.c

@@ -199,7 +199,7 @@
 	}
 
 EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
-				unsigned char *key, int keylen)
+				const unsigned char *key, int keylen)
 	{
 	EVP_PKEY_CTX *mac_ctx = NULL;
 	EVP_PKEY *mac_key = NULL;
@@ -209,7 +209,8 @@
 	if (EVP_PKEY_keygen_init(mac_ctx) <= 0)
 		goto merr;
 	if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN,
-				EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0)
+				EVP_PKEY_CTRL_SET_MAC_KEY,
+				keylen, (void *)key) <= 0)
 		goto merr;
 	if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0)
 		goto merr;

diff --git a/crypto/evp/pmeth_lib.c b/crypto/evp/pmeth_lib.c
index 5481d4b..acfa7b6 100644
--- a/crypto/evp/pmeth_lib.c
+++ b/crypto/evp/pmeth_lib.c

@@ -73,7 +73,7 @@
 STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL;
 
 extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth;
-extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth;
+extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth;
 
 static const EVP_PKEY_METHOD *standard_methods[] =
 	{
@@ -90,6 +90,7 @@
 	&ec_pkey_meth,
 #endif
 	&hmac_pkey_meth,
+	&cmac_pkey_meth
 	};
 
 DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *,
@@ -203,6 +204,8 @@
 	if (!pmeth)
 		return NULL;
 
+	memset(pmeth, 0, sizeof(EVP_PKEY_METHOD));
+
 	pmeth->pkey_id = id;
 	pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC;
 
@@ -235,6 +238,56 @@
 	return pmeth;
 	}
 
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+				const EVP_PKEY_METHOD *meth)
+	{
+	if (ppkey_id)
+		*ppkey_id = meth->pkey_id;
+	if (pflags)
+		*pflags = meth->flags;
+	}
+
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src)
+	{
+
+	dst->init = src->init;
+	dst->copy = src->copy;
+	dst->cleanup = src->cleanup;
+
+	dst->paramgen_init = src->paramgen_init;
+	dst->paramgen = src->paramgen;
+
+	dst->keygen_init = src->keygen_init;
+	dst->keygen = src->keygen;
+
+	dst->sign_init = src->sign_init;
+	dst->sign = src->sign;
+
+	dst->verify_init = src->verify_init;
+	dst->verify = src->verify;
+
+	dst->verify_recover_init = src->verify_recover_init;
+	dst->verify_recover = src->verify_recover;
+
+	dst->signctx_init = src->signctx_init;
+	dst->signctx = src->signctx;
+
+	dst->verifyctx_init = src->verifyctx_init;
+	dst->verifyctx = src->verifyctx;
+
+	dst->encrypt_init = src->encrypt_init;
+	dst->encrypt = src->encrypt;
+
+	dst->decrypt_init = src->decrypt_init;
+	dst->decrypt = src->decrypt;
+
+	dst->derive_init = src->derive_init;
+	dst->derive = src->derive;
+
+	dst->ctrl = src->ctrl;
+	dst->ctrl_str = src->ctrl_str;
+	}
+
 void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth)
 	{
 	if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC))

diff --git a/crypto/hmac/hm_ameth.c b/crypto/hmac/hm_ameth.c
index 6d8a891..e03f24a 100644
--- a/crypto/hmac/hm_ameth.c
+++ b/crypto/hmac/hm_ameth.c

@@ -153,7 +153,7 @@
 
 	hmac_size,
 	0,
-	0,0,0,0,0,0,
+	0,0,0,0,0,0,0,
 
 	hmac_key_free,
 	hmac_pkey_ctrl,

diff --git a/crypto/hmac/hm_pmeth.c b/crypto/hmac/hm_pmeth.c
index 71e8567..0daa445 100644
--- a/crypto/hmac/hm_pmeth.c
+++ b/crypto/hmac/hm_pmeth.c

@@ -100,7 +100,8 @@
 	dctx = dst->data;
 	dctx->md = sctx->md;
 	HMAC_CTX_init(&dctx->ctx);
-	HMAC_CTX_copy(&dctx->ctx, &sctx->ctx);
+	if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx))
+		return 0;
 	if (sctx->ktmp.data)
 		{
 		if (!ASN1_OCTET_STRING_set(&dctx->ktmp,
@@ -141,7 +142,8 @@
 static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
 	{
 	HMAC_PKEY_CTX *hctx = ctx->pctx->data;
-	HMAC_Update(&hctx->ctx, data, count);
+	if (!HMAC_Update(&hctx->ctx, data, count))
+		return 0;
 	return 1;
 	}
 
@@ -167,7 +169,8 @@
 	if (!sig)
 		return 1;
 
-	HMAC_Final(&hctx->ctx, sig, &hlen);
+	if (!HMAC_Final(&hctx->ctx, sig, &hlen))
+		return 0;
 	*siglen = (size_t)hlen;
 	return 1;
 	}
@@ -192,8 +195,9 @@
 
 		case EVP_PKEY_CTRL_DIGESTINIT:
 		key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr;
-		HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
-				ctx->engine);
+		if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
+				ctx->engine))
+			return 0;
 		break;
 
 		default:

diff --git a/crypto/hmac/hmac.c b/crypto/hmac/hmac.c
index 6c98fc4..ba27cbf 100644
--- a/crypto/hmac/hmac.c
+++ b/crypto/hmac/hmac.c

@@ -61,12 +61,34 @@
 #include "cryptlib.h"
 #include <openssl/hmac.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, int len,
 		  const EVP_MD *md, ENGINE *impl)
 	{
 	int i,j,reset=0;
 	unsigned char pad[HMAC_MAX_MD_CBLOCK];
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		{
+		/* If we have an ENGINE need to allow non FIPS */
+		if ((impl || ctx->i_ctx.engine)
+			&&  !(ctx->i_ctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+			{
+			EVPerr(EVP_F_HMAC_INIT_EX, EVP_R_DISABLED_FOR_FIPS);
+			return 0;
+			}
+		/* Other algorithm blocking will be done in FIPS_cmac_init,
+		 * via FIPS_hmac_init_ex().
+		 */
+		if (!impl && !ctx->i_ctx.engine)
+			return FIPS_hmac_init_ex(ctx, key, len, md, NULL);
+		}
+#endif
+
 	if (md != NULL)
 		{
 		reset=1;
@@ -133,6 +155,10 @@
 
 int HMAC_Update(HMAC_CTX *ctx, const unsigned char *data, size_t len)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->i_ctx.engine)
+		return FIPS_hmac_update(ctx, data, len);
+#endif
 	return EVP_DigestUpdate(&ctx->md_ctx,data,len);
 	}
 
@@ -140,6 +166,10 @@
 	{
 	unsigned int i;
 	unsigned char buf[EVP_MAX_MD_SIZE];
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->i_ctx.engine)
+		return FIPS_hmac_final(ctx, md, len);
+#endif
 
 	if (!EVP_DigestFinal_ex(&ctx->md_ctx,buf,&i))
 		goto err;
@@ -179,6 +209,13 @@
 
 void HMAC_CTX_cleanup(HMAC_CTX *ctx)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->i_ctx.engine)
+		{
+		FIPS_hmac_ctx_cleanup(ctx);
+		return;
+		}
+#endif
 	EVP_MD_CTX_cleanup(&ctx->i_ctx);
 	EVP_MD_CTX_cleanup(&ctx->o_ctx);
 	EVP_MD_CTX_cleanup(&ctx->md_ctx);

diff --git a/crypto/ia64cpuid.S b/crypto/ia64cpuid.S
index d705fff..7832b9b 100644
--- a/crypto/ia64cpuid.S
+++ b/crypto/ia64cpuid.S

@@ -26,7 +26,7 @@
 { .mii;	mov		ar.ccv=r2
 	add		r8=r2,r33
 	mov		r3=r2		};;
-{ .mmi;	mf
+{ .mmi;	mf;;
 	cmpxchg4.acq	r2=[r32],r8,ar.ccv
 	nop.i		0		};;
 { .mib;	cmp.ne		p6,p0=r2,r3

diff --git a/crypto/md4/md4.h b/crypto/md4/md4.h
index c3ed9b3..a55368a 100644
--- a/crypto/md4/md4.h
+++ b/crypto/md4/md4.h

@@ -105,6 +105,9 @@
 	unsigned int num;
 	} MD4_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_MD4_Init(MD4_CTX *c);
+#endif
 int MD4_Init(MD4_CTX *c);
 int MD4_Update(MD4_CTX *c, const void *data, size_t len);
 int MD4_Final(unsigned char *md, MD4_CTX *c);

diff --git a/crypto/md4/md4_dgst.c b/crypto/md4/md4_dgst.c
index e0c42e8..82c2cb2 100644
--- a/crypto/md4/md4_dgst.c
+++ b/crypto/md4/md4_dgst.c

@@ -57,8 +57,9 @@
  */
 
 #include <stdio.h>
-#include "md4_locl.h"
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include "md4_locl.h"
 
 const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT;
 
@@ -70,7 +71,7 @@
 #define INIT_DATA_C (unsigned long)0x98badcfeL
 #define INIT_DATA_D (unsigned long)0x10325476L
 
-int MD4_Init(MD4_CTX *c)
+fips_md_init(MD4)
 	{
 	memset (c,0,sizeof(*c));
 	c->A=INIT_DATA_A;

diff --git a/crypto/md5/md5.h b/crypto/md5/md5.h
index 4cbf843..541cc92 100644
--- a/crypto/md5/md5.h
+++ b/crypto/md5/md5.h

@@ -105,6 +105,9 @@
 	unsigned int num;
 	} MD5_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_MD5_Init(MD5_CTX *c);
+#endif
 int MD5_Init(MD5_CTX *c);
 int MD5_Update(MD5_CTX *c, const void *data, size_t len);
 int MD5_Final(unsigned char *md, MD5_CTX *c);

diff --git a/crypto/md5/md5_dgst.c b/crypto/md5/md5_dgst.c
index beace63..265890d 100644
--- a/crypto/md5/md5_dgst.c
+++ b/crypto/md5/md5_dgst.c

@@ -59,6 +59,7 @@
 #include <stdio.h>
 #include "md5_locl.h"
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
 
 const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT;
 
@@ -70,7 +71,7 @@
 #define INIT_DATA_C (unsigned long)0x98badcfeL
 #define INIT_DATA_D (unsigned long)0x10325476L
 
-int MD5_Init(MD5_CTX *c)
+fips_md_init(MD5)
 	{
 	memset (c,0,sizeof(*c));
 	c->A=INIT_DATA_A;

diff --git a/crypto/mdc2/mdc2.h b/crypto/mdc2/mdc2.h
index 72778a5..f3e8e57 100644
--- a/crypto/mdc2/mdc2.h
+++ b/crypto/mdc2/mdc2.h

@@ -81,6 +81,9 @@
 	} MDC2_CTX;
 
 
+#ifdef OPENSSL_FIPS
+int private_MDC2_Init(MDC2_CTX *c);
+#endif
 int MDC2_Init(MDC2_CTX *c);
 int MDC2_Update(MDC2_CTX *c, const unsigned char *data, size_t len);
 int MDC2_Final(unsigned char *md, MDC2_CTX *c);

diff --git a/crypto/mdc2/mdc2dgst.c b/crypto/mdc2/mdc2dgst.c
index 4aa406e..b74bb1a 100644
--- a/crypto/mdc2/mdc2dgst.c
+++ b/crypto/mdc2/mdc2dgst.c

@@ -61,6 +61,7 @@
 #include <string.h>
 #include <openssl/des.h>
 #include <openssl/mdc2.h>
+#include <openssl/crypto.h>
 
 #undef c2l
 #define c2l(c,l)	(l =((DES_LONG)(*((c)++)))    , \
@@ -75,7 +76,7 @@
 			*((c)++)=(unsigned char)(((l)>>24L)&0xff))
 
 static void mdc2_body(MDC2_CTX *c, const unsigned char *in, size_t len);
-int MDC2_Init(MDC2_CTX *c)
+fips_md_init(MDC2)
 	{
 	c->num=0;
 	c->pad_type=1;

diff --git a/crypto/mem.c b/crypto/mem.c
index 6f80dd3..8f736c3 100644
--- a/crypto/mem.c
+++ b/crypto/mem.c

@@ -125,6 +125,7 @@
 int CRYPTO_set_mem_functions(void *(*m)(size_t), void *(*r)(void *, size_t),
 	void (*f)(void *))
 	{
+	OPENSSL_init();
 	if (!allow_customize)
 		return 0;
 	if ((m == 0) || (r == 0) || (f == 0))
@@ -186,6 +187,7 @@
 	{
 	if (!allow_customize_debug)
 		return 0;
+	OPENSSL_init();
 	malloc_debug_func=m;
 	realloc_debug_func=r;
 	free_debug_func=f;

diff --git a/crypto/modes/asm/ghash-alpha.pl b/crypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 0000000..6358b27
--- /dev/null
+++ b/crypto/modes/asm/ghash-alpha.pl

@@ -0,0 +1,451 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% than code generated by vendor
+# compiler.
+
+$cnt="v0";	# $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3";	# $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7";	# $8
+#################
+$Xi="a0";	# $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4";	# $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10";	# $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT";	# $28
+
+{ my $N;
+  sub loop() {
+
+	$N++;
+$code.=<<___;
+.align	4
+	extbl	$Xlo,7,$nlo
+	and	$nlo,0xf0,$nhi
+	sll	$nlo,4,$nlo
+	and	$nlo,0xf0,$nlo
+
+	addq	$nlo,$Htbl,$nlo
+	ldq	$Zlo,8($nlo)
+	addq	$nhi,$Htbl,$nhi
+	ldq	$Zhi,0($nlo)
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	lda	$cnt,6(zero)
+	extbl	$Xlo,6,$nlo
+
+	ldq	$Tlo1,8($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+	ldq	$Thi1,0($nhi)
+	srl	$Zlo,4,$Zlo
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	and	$nlo,0xf0,$nhi
+
+	xor	$Tlo1,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	xor	$Thi1,$Zhi,$Zhi
+	and	$nlo,0xf0,$nlo
+
+	addq	$nlo,$Htbl,$nlo
+	ldq	$Tlo0,8($nlo)
+	addq	$nhi,$Htbl,$nhi
+	ldq	$Thi0,0($nlo)
+
+.Looplo$N:
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	subq	$cnt,1,$cnt
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xlo,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	bne	$cnt,.Looplo$N
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	lda	$cnt,7(zero)
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xhi,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	unop
+
+
+.Loophi$N:
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	subq	$cnt,1,$cnt
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xhi,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	bne	$cnt,.Loophi$N
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo0,$Zlo,$Zlo
+	xor	$Thi0,$Zhi,$Zhi
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	xor	$rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set	noat
+.set	noreorder
+.globl	gcm_gmult_4bit
+.align	4
+.ent	gcm_gmult_4bit
+gcm_gmult_4bit:
+	.frame	sp,0,ra
+	.prologue 0
+
+	ldq	$Xlo,8($Xi)
+	ldq	$Xhi,0($Xi)
+
+	br	$rem_4bit,.Lpic1
+.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
+___
+
+	&loop();
+
+$code.=<<___;
+	srl	$Zlo,24,$t0	# byte swap
+	srl	$Zlo,8,$t1
+
+	sll	$Zlo,8,$t2
+	sll	$Zlo,24,$Zlo
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+
+	zapnot	$Zlo,0x88,$Zlo
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zlo,$t0,$Zlo
+	srl	$Zhi,24,$t0
+	srl	$Zhi,8,$t1
+
+	or	$Zlo,$t2,$Zlo
+	sll	$Zhi,8,$t2
+	sll	$Zhi,24,$Zhi
+
+	srl	$Zlo,32,$Xlo
+	sll	$Zlo,32,$Zlo
+
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+	stq	$Xlo,8($Xi)
+	stq	$Xhi,0($Xi)
+
+	ret	(ra)
+.end	gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl	gcm_ghash_4bit
+.align	4
+.ent	gcm_ghash_4bit
+gcm_ghash_4bit:
+	lda	sp,-32(sp)
+	stq	ra,0(sp)
+	stq	s0,8(sp)
+	stq	s1,16(sp)
+	.mask	0x04000600,-32
+	.frame	sp,32,ra
+	.prologue 0
+
+	ldq_u	$inhi,0($inp)
+	ldq_u	$Thi0,7($inp)
+	ldq_u	$inlo,8($inp)
+	ldq_u	$Tlo0,15($inp)
+	ldq	$Xhi,0($Xi)
+	ldq	$Xlo,8($Xi)
+
+	br	$rem_4bit,.Lpic2
+.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)
+
+.Louter:
+	extql	$inhi,$inp,$inhi
+	extqh	$Thi0,$inp,$Thi0
+	or	$inhi,$Thi0,$inhi
+	lda	$inp,16($inp)
+
+	extql	$inlo,$inp,$inlo
+	extqh	$Tlo0,$inp,$Tlo0
+	or	$inlo,$Tlo0,$inlo
+	subq	$len,16,$len
+
+	xor	$Xlo,$inlo,$Xlo
+	xor	$Xhi,$inhi,$Xhi
+___
+
+	&loop();
+
+$code.=<<___;
+	srl	$Zlo,24,$t0	# byte swap
+	srl	$Zlo,8,$t1
+
+	sll	$Zlo,8,$t2
+	sll	$Zlo,24,$Zlo
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+
+	zapnot	$Zlo,0x88,$Zlo
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zlo,$t0,$Zlo
+	srl	$Zhi,24,$t0
+	srl	$Zhi,8,$t1
+
+	or	$Zlo,$t2,$Zlo
+	sll	$Zhi,8,$t2
+	sll	$Zhi,24,$Zhi
+
+	srl	$Zlo,32,$Xlo
+	sll	$Zlo,32,$Zlo
+	beq	$len,.Ldone
+
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+	ldq_u	$inhi,0($inp)
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+	ldq_u	$Thi0,7($inp)
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+	ldq_u	$inlo,8($inp)
+	ldq_u	$Tlo0,15($inp)
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+	br	zero,.Louter
+
+.Ldone:
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+
+	stq	$Xlo,8($Xi)
+	stq	$Xhi,0($Xi)
+
+	.set	noreorder
+	/*ldq	ra,0(sp)*/
+	ldq	s0,8(sp)
+	ldq	s1,16(sp)
+	lda	sp,32(sp)
+	ret	(ra)
+.end	gcm_ghash_4bit
+
+.align	4
+rem_4bit:
+	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
+

diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 0000000..d91586e
--- /dev/null
+++ b/crypto/modes/asm/ghash-armv4.pl

@@ -0,0 +1,429 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+
+# ====================================================================
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+#   bit shift operation is neatly fused with 128-bit xor here, and
+#   "538B" variant would eliminate only 4-5 instructions out of 32
+#   in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+#   consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords in turn is whatever
+# *native* byte order on current platform. See gcm128.c for working
+# example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0";	# argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4";	# variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp;	# used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+  my $i=12;
+  my @args=@_;
+  for ($Zll,$Zlh,$Zhl,$Zhh) {
+    $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	$_,$_
+	str	$_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+	str	$_,[$Xi,#$i]
+#else
+	mov	$Tlh,$_,lsr#8
+	strb	$_,[$Xi,#$i+3]
+	mov	$Thl,$_,lsr#16
+	strb	$Tlh,[$Xi,#$i+2]
+	mov	$Thh,$_,lsr#24
+	strb	$Thl,[$Xi,#$i+1]
+	strb	$Thh,[$Xi,#$i]
+#endif
+___
+    $code.="\t".shift(@args)."\n";
+    $i-=4;
+  }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.type	rem_4bit,%object
+.align	5
+rem_4bit:
+.short	0x0000,0x1C20,0x3840,0x2460
+.short	0x7080,0x6CA0,0x48C0,0x54E0
+.short	0xE100,0xFD20,0xD940,0xC560
+.short	0x9180,0x8DA0,0xA9C0,0xB5E0
+.size	rem_4bit,.-rem_4bit
+
+.type	rem_4bit_get,%function
+rem_4bit_get:
+	sub	$rem_4bit,pc,#8
+	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
+	b	.Lrem_4bit_got
+	nop
+.size	rem_4bit_get,.-rem_4bit_get
+
+.global	gcm_ghash_4bit
+.type	gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+	sub	r12,pc,#8
+	add	$len,$inp,$len		@ $len to point at the end
+	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
+	sub	r12,r12,#48		@ &rem_4bit
+
+	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
+	stmdb	sp!,{r4-r11}		@ ... to stack
+
+	ldrb	$nlo,[$inp,#15]
+	ldrb	$nhi,[$Xi,#15]
+.Louter:
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	add	$Thh,$Htbl,$nhi
+	ldrb	$nlo,[$inp,#14]
+
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	ldrb	$nhi,[$Xi,#14]
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrplb	$nlo,[$inp,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrplb	$Tll,[$Xi,$cnt]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tlh,[sp,$nhi]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eorpl	$nlo,$nlo,$Tll
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Linner
+
+	ldr	$len,[sp,#32]		@ re-load $len/end
+	add	$inp,$inp,#16
+	mov	$nhi,$Zll
+___
+	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+$code.=<<___;
+	bne	.Louter
+
+	add	sp,sp,#36
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global	gcm_gmult_4bit
+.type	gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+	stmdb	sp!,{r4-r11,lr}
+	ldrb	$nlo,[$Xi,#15]
+	b	rem_4bit_get
+.Lrem_4bit_got:
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	ldrb	$nlo,[$Xi,#14]
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	and	$nhi,$nlo,#0xf0
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+	and	$nlo,$nlo,#0x0f
+
+.Loop:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrplb	$nlo,[$Xi,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Loop
+___
+	&Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my $cnt=$Htbl;	# $Htbl is used once in the very beginning
+
+my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
+my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
+
+# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
+# in Zo. Or should I say "top bit", because GHASH is specified in
+# reverse bit order? Otherwise straightforward 128-bt H by one input
+# byte multiplication and modulo-reduction, times 16.
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.global	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	sub		$Htbl,#16		@ point at H in GCM128_CTX
+	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
+	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
+	vshr.u64	$mod,#32
+	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
+	veor		$zero,$zero
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	veor		$Qpost,$Qpost
+	veor		$R,$R
+	mov		$cnt,#16
+	veor		$Z,$Z
+	mov		$len,#16
+	veor		$Zo,$Zo
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+	b		.Linner_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.global	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
+	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
+	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
+	vshr.u64	$mod,#32
+	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
+	veor		$zero,$zero
+	nop
+#ifdef __ARMEL__
+	vrev64.8	$Z,$Z
+#endif
+.Louter_neon:
+	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
+	veor		$Qpost,$Qpost
+	vld1.64		`&Dlo($IN)`,[$inp]!
+	veor		$R,$R
+	mov		$cnt,#16
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	veor		$Zo,$Zo
+	veor		$IN,$Z			@ inp^=Xi
+	veor		$Z,$Z
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+.Linner_neon:
+	subs		$cnt,$cnt,#1
+	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo·Xi[i]
+	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi·Xi[i]
+	vext.8		$IN,$zero,#1		@ IN>>=8
+
+	veor		$Z,$Qpost		@ modulo-scheduled part
+	vshl.i64	`&Dlo("$R")`,#48
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
+
+	veor		`&Dhi("$Z")`,`&Dlo("$R")`
+	vuzp.8		$Qlo,$Qhi
+	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
+	vext.8		$Z,$zero,#1		@ Z>>=8
+
+	vmull.p8	$R,$Zo,$mod		@ "carry"·0xe1
+	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
+	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
+	veor		$Z,$Qhi
+	bne		.Linner_neon
+
+	veor		$Z,$Qpost		@ modulo-scheduled artefact
+	vshl.i64	`&Dlo("$R")`,#48
+	veor		`&Dhi("$Z")`,`&Dlo("$R")`
+
+	@ finalization, normalize Z:Zo
+	vand		$Zo,$mod		@ suffices to mask the bit
+	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
+	vshl.i64	$Z,#1
+	subs		$len,#16
+	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
+	bne		.Louter_neon
+
+#ifdef __ARMEL__
+	vrev64.8	$Z,$Z
+#endif
+	sub		$Xi,#16	
+	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
+	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
+
+	bx	lr
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush

diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 0000000..0354c95
--- /dev/null
+++ b/crypto/modes/asm/ghash-ia64.pl

@@ -0,0 +1,463 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.67 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. To anchor to something else sha1-ia64.pl module processes one
+# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
+# byte.
+
+# September 2010
+#
+# It was originally thought that it makes lesser sense to implement
+# "528B" variant on Itanium 2 for following reason. Because number of
+# functional units is naturally limited, it appeared impossible to
+# implement "528B" loop in 4 cycles, only in 5. This would mean that
+# theoretically performance improvement couldn't be more than 20%.
+# But occasionally you prove yourself wrong:-) I figured out a way to
+# fold couple of instructions and having freed yet another instruction
+# slot by unrolling the loop... Resulting performance is 4.45 cycles
+# per processed byte and 50% better than "256B" version. On original
+# Itanium performance should remain the same as the "256B" version,
+# i.e. ~8.5 cycles.
+
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+
+if ($^O eq "hpux") {
+    $ADDP="addp4";
+    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
+                $big_endian=0 if (/\-DL_ENDIAN/);  }
+if (!defined($big_endian))
+             {  $big_endian=(unpack('L',pack('N',1))==1);  }
+
+sub loop() {
+my $label=shift;
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
+
+# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
+# in scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
+# bytes boundary and lower 7 bits of its address are guaranteed to
+# be zero.
+$code.=<<___;
+$label:
+{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
+	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
+{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
+	($p17)	xor	xi[1]=xi[1],in[1]	};;
+{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
+	(p19)	shrp	Zlo=Zhi,Zlo,4		}
+{ .mfi;	(p19)	ld8	rem=[rem]
+	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
+{ .mmi;	($p16)	ld1	in[0]=[inp],-1
+	(p18)	xor	Zlo=Zlo,Hlo
+	(p19)	shr.u	Zhi=Zhi,4		}
+{ .mib;	(p19)	xor	Hhi=Hhi,rem
+	(p18)	add	Hi[1]=Htbl,Hi[1]	};;
+
+{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
+	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
+{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
+	(p18)	xor	Zhi=Zhi,Hhi		};;
+{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
+	(p18)	shrp	Zlo=Zhi,Zlo,4		}
+{ .mfi;	(p18)	ld8	rem=[rem]
+	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
+{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
+	(p18)	xor	Zlo=Zlo,Hlo
+	(p18)	shr.u	Zhi=Zhi,4		}
+{ .mib;	(p18)	xor	Hhi=Hhi,rem
+	(p17)	add	Hi[0]=Htbl,Hi[0]
+	br.ctop.sptk	$label			};;
+___
+}
+
+$code=<<___;
+.explicit
+.text
+
+prevfs=r2;	prevlc=r3;	prevpr=r8;
+mask0xf0=r21;
+rem=r22;	rem_4bitp=r23;
+Xi=r24;		Htbl=r25;
+inp=r26;	end=r27;
+Hhi=r28;	Hlo=r29;
+Zhi=r30;	Zlo=r31;
+
+.align	128
+.skip	16					// aligns loop body
+.global	gcm_gmult_4bit#
+.proc	gcm_gmult_4bit#
+gcm_gmult_4bit:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,2,6,0,8
+	$ADDP	Xi=15,in0			// &Xi[15]
+	mov	rem_4bitp=ip		}
+{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc
+	.save	pr,prevpr
+	mov	prevpr=pr		};;
+
+	.body
+	.rotr	in[3],xi[3],Hi[2]
+
+{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
+	mov	mask0xf0=0xf0
+	brp.loop.imp	.Loop1,.Lend1-16};;
+{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
+					};;
+{ .mii;	shladd	Hi[1]=xi[2],4,r0
+	mov	pr.rot=0x7<<16
+	mov	ar.lc=13		};;
+{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
+	mov	ar.ec=3
+	xor	Zlo=Zlo,Zlo		};;
+{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
+	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+	xor	Zhi=Zhi,Zhi		};;
+___
+	&loop	(".Loop1",1);
+$code.=<<___;
+.Lend1:
+{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
+{ .mib;	mux1	Zlo=Zlo,\@rev		};;
+{ .mib;	mux1	Zhi=Zhi,\@rev		};;
+{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
+	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
+{ .mib;	st8	[Hlo]=Zlo
+	mov	pr=prevpr,0x1ffff	};;
+{ .mib;	st8	[Hhi]=Zhi
+	mov	ar.lc=prevlc
+	br.ret.sptk.many	b0	};;
+.endp	gcm_gmult_4bit#
+___
+
+######################################################################
+# "528B" (well, "512B" actualy) streamed GHASH
+#
+$Xip="in0";
+$Htbl="in1";
+$inp="in2";
+$len="in3";
+$rem_8bit="loc0";
+$mask0xff="loc1";
+($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
+
+sub load_htable() {
+    for (my $i=0;$i<8;$i++) {
+	$code.=<<___;
+{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
+	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
+{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
+	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
+___
+	$code.=shift	if (($i+$#_)==7);
+	$code.="\t};;\n"
+    }
+}
+
+$code.=<<___;
+prevsp=r3;
+
+.align	32
+.skip	16					// aligns loop body
+.global	gcm_ghash_4bit#
+.proc	gcm_ghash_4bit#
+gcm_ghash_4bit:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,4,2,0,0
+	.vframe	prevsp
+	mov	prevsp=sp
+	mov	$rem_8bit=ip		};;
+	.body
+{ .mfi;	$ADDP	r8=0+0,$Htbl
+	$ADDP	r9=0+8,$Htbl		}
+{ .mfi;	$ADDP	r10=128+0,$Htbl
+	$ADDP	r11=128+8,$Htbl		};;
+___
+	&load_htable(
+	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
+	"	$ADDP	$len=$len,$inp",	# &inp[len]
+	"	$ADDP	$inp=15,$inp",		# &inp[15]
+	"	mov	$mask0xff=0xff",
+	"	add	sp=-512,sp",
+	"	andcm	sp=sp,$mask0xff",	# align stack frame
+	"	add	r14=0,sp",
+	"	add	r15=8,sp");
+$code.=<<___;
+{ .mmi;	$sum	1<<1				// go big-endian
+	add	r8=256+0,sp
+	add	r9=256+8,sp		}
+{ .mmi;	add	r10=256+128+0,sp
+	add	r11=256+128+8,sp
+	add	$len=-17,$len		};;
+___
+for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
+my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
+$code.=<<___;
+{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
+	st8	[r9]=$rhi,16			// Htable[$i].hi
+	shrp	$rlo=$rhi,$rlo,4	}//;;
+{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
+	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
+	shr.u	$rhi=$rhi,4		};;
+{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
+	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
+___
+}
+$code.=<<___;
+{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
+	ld8	r17=[r9],16		};;	// Htable[8].hi
+{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
+	ld8	r19=[r9],16		}	// Htable[9].hi
+{ .mmi;	rum	1<<5				// clear um.mfh
+	shrp	r16=r17,r16,4		};;
+___
+for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
+$code.=<<___;
+{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
+	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
+	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
+{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
+	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
+	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
+___
+}
+$code.=<<___;
+{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
+{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
+	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
+	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
+{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
+	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
+	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
+{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
+	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
+___
+
+$in="r15";
+@xi=("r16","r17");
+@rem=("r18","r19");
+($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
+($Atbl,$Btbl)=("r26","r27");
+
+$code.=<<___;	# (p16)
+{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
+	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p16),(p17)
+{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
+	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+.align	32
+.LOOP:
+{ .mmi;
+(p6)	st8	[$Xip]=$Zhi,13
+	xor	$Zlo=$Zlo,$Zlo
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p16),(p17),(p18)
+{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
+{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
+	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
+{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+	ld1	$in=[$inp],-1		}	//(p16) *inp--
+{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
+	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
+	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+for ($i=1;$i<14;$i++) {
+# Above and below fragments are derived from this one by removing
+# unsuitable (p??) instructions.
+$code.=<<___;	# (p16),(p17),(p18),(p19)
+{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
+{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
+{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
+	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
+	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+	ld1	$in=[$inp],-1			//(p16) *inp--
+	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
+{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
+	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
+	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+}
+
+$code.=<<___;	# (p17),(p18),(p19)
+{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
+{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
+{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
+	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
+	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
+{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
+	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
+	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p18),(p19)
+{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
+{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
+{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
+{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
+{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
+	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
+{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
+	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
+{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
+	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
+{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p19)
+{ .mmi;	cmp.ltu	p6,p0=$inp,$len
+	add	$inp=32,$inp
+	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
+{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
+	add	$Xip=9,$Xip		};;	//	&Xi.lo
+{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
+(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
+{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
+(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
+{ .mmi;	st8	[$Xip]=$Zlo,-8
+(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
+	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
+{ .mmi;
+(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
+	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
+(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
+{ .mib;
+(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
+(p6)	br.cond.dptk.many	.LOOP	};;
+
+{ .mib;	st8	[$Xip]=$Zhi		};;
+{ .mib;	$rum	1<<1				// return to little-endian
+	.restore	sp
+	mov	sp=prevsp
+	br.ret.sptk.many	b0	};;
+.endp	gcm_ghash_4bit#
+___
+$code.=<<___;
+.align	128
+.type	rem_4bit#,\@object
+rem_4bit:
+        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size	rem_4bit#,128
+.type	rem_8bit#,\@object
+rem_8bit:
+	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
+	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
+	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
+	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
+	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
+	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
+	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
+	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
+	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
+	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
+	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
+	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
+	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
+	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
+	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
+	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
+	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
+	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
+	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
+	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
+	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
+	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
+	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
+	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
+	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
+	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
+	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
+	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
+	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
+	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
+	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
+	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
+.size	rem_8bit#,512
+stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;

diff --git a/crypto/modes/asm/ghash-parisc.pl b/crypto/modes/asm/ghash-parisc.pl
new file mode 100644
index 0000000..8c7454e
--- /dev/null
+++ b/crypto/modes/asm/ghash-parisc.pl

@@ -0,0 +1,730 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
+# it processes one byte in 19.6 cycles, which is more than twice as
+# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
+# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
+# processed byte. This is ~2.2x faster than 64-bit code generated by
+# vendor compiler (which used to be very hard to beat:-).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+	$NREGS		=6;
+} else {
+	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+	$NREGS		=11;
+}
+
+$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
+				#                 [+ argument transfer]
+
+################# volatile registers
+$Xi="%r26";	# argument block
+$Htbl="%r25";
+$inp="%r24";
+$len="%r23";
+$Hhh=$Htbl;	# variables
+$Hll="%r22";
+$Zhh="%r21";
+$Zll="%r20";
+$cnt="%r19";
+$rem_4bit="%r28";
+$rem="%r29";
+$mask0xf0="%r31";
+
+################# preserved registers
+$Thh="%r1";
+$Tll="%r2";
+$nlo="%r3";
+$nhi="%r4";
+$byte="%r5";
+if ($SIZE_T==4) {
+	$Zhl="%r6";
+	$Zlh="%r7";
+	$Hhl="%r8";
+	$Hlh="%r9";
+	$Thl="%r10";
+	$Tlh="%r11";
+}
+$rem2="%r6";	# used in PA-RISC 2.0 code
+
+$code.=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	64
+gcm_gmult_4bit
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+	blr	%r0,$rem_4bit
+	ldi	3,$rem
+L\$pic_gmult
+	andcm	$rem_4bit,$rem,$rem_4bit
+	addl	$inp,$len,$len
+	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
+	ldi	0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+	ldi	31,$rem
+	mtctl	$rem,%cr11
+	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
+	b	L\$parisc1_gmult
+	nop
+___
+
+$code.=<<___;
+	ldb	15($Xi),$nlo
+	ldo	8($Htbl),$Hll
+
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	ldd	$nlo($Hll),$Zll
+	ldd	$nlo($Hhh),$Zhh
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldb	14($Xi),$nlo
+
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+	b	L\$oop_gmult_pa2
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_gmult_pa2
+	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
+	depd,z	$Zll,60,4,$rem
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem
+	ldbx	$cnt($Xi),$nlo
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$Tll,$Zll,$Zll
+	addib,uv -1,$cnt,L\$oop_gmult_pa2
+	xor	$Thh,$Zhh,$Zhh
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	std	$Zll,8($Xi)
+	std	$Zhh,0($Xi)
+___
+
+$code.=<<___ if ($SIZE_T==4);
+	b	L\$done_gmult
+	nop
+
+L\$parisc1_gmult
+	ldb	15($Xi),$nlo
+	ldo	12($Htbl),$Hll
+	ldo	8($Htbl),$Hlh
+	ldo	4($Htbl),$Hhl
+
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	ldwx	$nlo($Hll),$Zll
+	ldwx	$nlo($Hlh),$Zlh
+	ldwx	$nlo($Hhl),$Zhl
+	ldwx	$nlo($Hhh),$Zhh
+	zdep	$Zll,28,4,$rem
+	ldb	14($Xi),$nlo
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	extru	$Zhh,27,28,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	xor	$rem,$Zhh,$Zhh
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$Thl,$Zhl,$Zhl
+	b	L\$oop_gmult_pa1
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_gmult_pa1
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldbx	$cnt($Xi),$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$rem,$Zhh,$Zhh
+	zdep	$Zll,28,4,$rem
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	and	$mask0xf0,$nlo,$nhi
+	extru	$Zhh,27,28,$Zhh
+	zdep	$nlo,27,4,$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$rem,$Zhh,$Zhh
+	addib,uv -1,$cnt,L\$oop_gmult_pa1
+	xor	$Thl,$Zhl,$Zhl
+
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$rem,$Zhh,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	zdep	$Zll,28,4,$rem
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	extru	$Zhh,27,28,$Zhh
+	xor	$Tll,$Zll,$Zll
+	xor	$Tlh,$Zlh,$Zlh
+	xor	$rem,$Zhh,$Zhh
+	stw	$Zll,12($Xi)
+	xor	$Thl,$Zhl,$Zhl
+	stw	$Zlh,8($Xi)
+	xor	$Thh,$Zhh,$Zhh
+	stw	$Zhl,4($Xi)
+	stw	$Zhh,0($Xi)
+___
+$code.=<<___;
+L\$done_gmult
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+	.ALIGN	64
+gcm_ghash_4bit
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+	blr	%r0,$rem_4bit
+	ldi	3,$rem
+L\$pic_ghash
+	andcm	$rem_4bit,$rem,$rem_4bit
+	addl	$inp,$len,$len
+	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
+	ldi	0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+	ldi	31,$rem
+	mtctl	$rem,%cr11
+	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
+	b	L\$parisc1_ghash
+	nop
+___
+
+$code.=<<___;
+	ldb	15($Xi),$nlo
+	ldo	8($Htbl),$Hll
+
+L\$outer_ghash_pa2
+	ldb	15($inp),$nhi
+	xor	$nhi,$nlo,$nlo
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	ldd	$nlo($Hll),$Zll
+	ldd	$nlo($Hhh),$Zhh
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldb	14($Xi),$nlo
+	ldb	14($inp),$byte
+
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+	xor	$byte,$nlo,$nlo
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+	b	L\$oop_ghash_pa2
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_ghash_pa2
+	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
+	depd,z	$Zll,60,4,$rem2
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldbx	$cnt($Xi),$nlo
+	ldbx	$cnt($inp),$byte
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	ldd	$rem2($rem_4bit),$rem2
+
+	xor	$rem2,$Zhh,$Zhh
+	xor	$byte,$nlo,$nlo
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	extrd,u	$Zhh,59,60,$Zhh
+	xor	$Tll,$Zll,$Zll
+
+	ldd	$rem($rem_4bit),$rem
+	addib,uv -1,$cnt,L\$oop_ghash_pa2
+	xor	$Thh,$Zhh,$Zhh
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem2
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	ldd	$rem2($rem_4bit),$rem2
+
+	xor	$rem2,$Zhh,$Zhh
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	extrd,u	$Zhh,59,60,$Zhh
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	std	$Zll,8($Xi)
+	ldo	16($inp),$inp
+	std	$Zhh,0($Xi)
+	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
+	copy	$Zll,$nlo
+___
+
+$code.=<<___ if ($SIZE_T==4);
+	b	L\$done_ghash
+	nop
+
+L\$parisc1_ghash
+	ldb	15($Xi),$nlo
+	ldo	12($Htbl),$Hll
+	ldo	8($Htbl),$Hlh
+	ldo	4($Htbl),$Hhl
+
+L\$outer_ghash_pa1
+	ldb	15($inp),$byte
+	xor	$byte,$nlo,$nlo
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	ldwx	$nlo($Hll),$Zll
+	ldwx	$nlo($Hlh),$Zlh
+	ldwx	$nlo($Hhl),$Zhl
+	ldwx	$nlo($Hhh),$Zhh
+	zdep	$Zll,28,4,$rem
+	ldb	14($Xi),$nlo
+	ldb	14($inp),$byte
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	extru	$Zhh,27,28,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	xor	$byte,$nlo,$nlo
+	xor	$rem,$Zhh,$Zhh
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$Thl,$Zhl,$Zhl
+	b	L\$oop_ghash_pa1
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_ghash_pa1
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldbx	$cnt($Xi),$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	ldbx	$cnt($inp),$byte
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$rem,$Zhh,$Zhh
+	zdep	$Zll,28,4,$rem
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	xor	$byte,$nlo,$nlo
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	and	$mask0xf0,$nlo,$nhi
+	extru	$Zhh,27,28,$Zhh
+	zdep	$nlo,27,4,$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$rem,$Zhh,$Zhh
+	addib,uv -1,$cnt,L\$oop_ghash_pa1
+	xor	$Thl,$Zhl,$Zhl
+
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$rem,$Zhh,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	zdep	$Zll,28,4,$rem
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	extru	$Zhh,27,28,$Zhh
+	xor	$Tll,$Zll,$Zll
+	xor	$Tlh,$Zlh,$Zlh
+	xor	$rem,$Zhh,$Zhh
+	stw	$Zll,12($Xi)
+	xor	$Thl,$Zhl,$Zhl
+	stw	$Zlh,8($Xi)
+	xor	$Thh,$Zhh,$Zhh
+	stw	$Zhl,4($Xi)
+	ldo	16($inp),$inp
+	stw	$Zhh,0($Xi)
+	comb,<>	$inp,$len,L\$outer_ghash_pa1
+	copy	$Zll,$nlo
+___
+$code.=<<___;
+L\$done_ghash
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.ALIGN	64
+L\$rem_4bit
+	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
+	.ALIGN	64
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
+    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
+    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
+    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+	my $len=32-$3;
+	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
+    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+	my $len=32-$2;
+	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
+	$opcode |= (1<<13) if ($mod =~ /,\**=/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
+    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+	my $cpos=63-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
+    {	sprintf "\t.WORD\t0x%08x\t; %s",
+		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $depd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "depd$mod\t$args";
+
+    # I only have ",z" completer, it's impicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 16
+    {	my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
+    	my $cpos=63-$2;
+	my $len=32-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $opcode = eval("\$$mnemonic");
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	if ($SIZE_T==4) {
+		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
+		s/cmpb,\*/comb,/;
+		s/,\*/,/;
+	}
+	print $_,"\n";
+}
+
+close STDOUT;

diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl
new file mode 100644
index 0000000..48cb08d
--- /dev/null
+++ b/crypto/modes/asm/ghash-s390x.pl

@@ -0,0 +1,262 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2010.
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# was measured to be ~18 cycles per processed byte on z10, which is
+# almost 40% better than gcc-generated code. It should be noted that
+# 18 cycles is worse result than expected: loop is scheduled for 12
+# and the result should be close to 12. In the lack of instruction-
+# level profiling data it's impossible to tell why...
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2.8x better than 32-bit code generated by gcc 4.3.
+
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce correct
+# result and therefore is engaged. On z196 it was measured to process
+# 8KB buffer ~7 faster than software implementation. It's not as
+# impressive for smaller buffer sizes and for smallest 16-bytes buffer
+# it's actually almost 2 times slower. Which is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$softonly=0;
+
+$Zhi="%r0";
+$Zlo="%r1";
+
+$Xi="%r2";	# argument block
+$Htbl="%r3";
+$inp="%r4";
+$len="%r5";
+
+$rem0="%r6";	# variables
+$rem1="%r7";
+$nlo="%r8";
+$nhi="%r9";
+$xi="%r10";
+$cnt="%r11";
+$tmp="%r12";
+$x78="%r13";
+$rem_4bit="%r14";
+
+$sp="%r15";
+
+$code.=<<___;
+.text
+
+.globl	gcm_gmult_4bit
+.align	32
+gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
+	larl	%r1,OPENSSL_s390xcap_P
+	lg	%r0,0(%r1)
+	tmhl	%r0,0x4000	# check for message-security-assist
+	jz	.Lsoft_gmult
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb93e0004	# kimd %r0,%r4
+	lg	%r1,24($sp)
+	tmhh	%r1,0x4000	# check for function 65
+	jz	.Lsoft_gmult
+	stg	%r0,16($sp)	# arrange 16 bytes of zero input
+	stg	%r0,24($sp)
+	lghi	%r0,65		# function 65
+	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
+	la	$inp,16($sp)
+	lghi	$len,16
+	.long	0xb93e0004	# kimd %r0,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+	br	%r14
+.align	32
+.Lsoft_gmult:
+___
+$code.=<<___;
+	stm${g}	%r6,%r14,6*$SIZE_T($sp)
+
+	aghi	$Xi,-1
+	lghi	$len,1
+	lghi	$x78,`0xf<<3`
+	larl	$rem_4bit,rem_4bit
+
+	lg	$Zlo,8+1($Xi)		# Xi
+	j	.Lgmult_shortcut
+.type	gcm_gmult_4bit,\@function
+.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
+
+.globl	gcm_ghash_4bit
+.align	32
+gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+	larl	%r1,OPENSSL_s390xcap_P
+	lg	%r0,0(%r1)
+	tmhl	%r0,0x4000	# check for message-security-assist
+	jz	.Lsoft_ghash
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb93e0004	# kimd %r0,%r4
+	lg	%r1,24($sp)
+	tmhh	%r1,0x4000	# check for function 65
+	jz	.Lsoft_ghash
+	lghi	%r0,65		# function 65
+	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
+	.long	0xb93e0004	# kimd %r0,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+	br	%r14
+.align	32
+.Lsoft_ghash:
+___
+$cdoe.=<<___ if ($flavour =~ /3[12]/);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	stm${g}	%r6,%r14,6*$SIZE_T($sp)
+
+	aghi	$Xi,-1
+	srlg	$len,$len,4
+	lghi	$x78,`0xf<<3`
+	larl	$rem_4bit,rem_4bit
+
+	lg	$Zlo,8+1($Xi)		# Xi
+	lg	$Zhi,0+1($Xi)
+	lghi	$tmp,0
+.Louter:
+	xg	$Zhi,0($inp)		# Xi ^= inp 
+	xg	$Zlo,8($inp)
+	xgr	$Zhi,$tmp
+	stg	$Zlo,8+1($Xi)
+	stg	$Zhi,0+1($Xi)
+
+.Lgmult_shortcut:
+	lghi	$tmp,0xf0
+	sllg	$nlo,$Zlo,4
+	srlg	$xi,$Zlo,8		# extract second byte
+	ngr	$nlo,$tmp
+	lgr	$nhi,$Zlo
+	lghi	$cnt,14
+	ngr	$nhi,$tmp
+
+	lg	$Zlo,8($nlo,$Htbl)
+	lg	$Zhi,0($nlo,$Htbl)
+
+	sllg	$nlo,$xi,4
+	sllg	$rem0,$Zlo,3
+	ngr	$nlo,$tmp
+	ngr	$rem0,$x78
+	ngr	$xi,$tmp
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nhi,$Htbl)
+	xg	$Zhi,0($nhi,$Htbl)
+	lgr	$nhi,$xi
+	sllg	$rem1,$Zlo,3
+	xgr	$Zlo,$tmp
+	ngr	$rem1,$x78
+	j	.Lghash_inner
+.align	16
+.Lghash_inner:
+	srlg	$Zlo,$Zlo,4
+	sllg	$tmp,$Zhi,60
+	xg	$Zlo,8($nlo,$Htbl)
+	srlg	$Zhi,$Zhi,4
+	llgc	$xi,0($cnt,$Xi)
+	xg	$Zhi,0($nlo,$Htbl)
+	sllg	$nlo,$xi,4
+	xg	$Zhi,0($rem0,$rem_4bit)
+	nill	$nlo,0xf0
+	sllg	$rem0,$Zlo,3
+	xgr	$Zlo,$tmp
+	ngr	$rem0,$x78
+	nill	$xi,0xf0
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nhi,$Htbl)
+	xg	$Zhi,0($nhi,$Htbl)
+	lgr	$nhi,$xi
+	xg	$Zhi,0($rem1,$rem_4bit)
+	sllg	$rem1,$Zlo,3
+	xgr	$Zlo,$tmp
+	ngr	$rem1,$x78
+	brct	$cnt,.Lghash_inner
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nlo,$Htbl)
+	xg	$Zhi,0($nlo,$Htbl)
+	sllg	$xi,$Zlo,3
+	xg	$Zhi,0($rem0,$rem_4bit)
+	xgr	$Zlo,$tmp
+	ngr	$xi,$x78
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nhi,$Htbl)
+	xg	$Zhi,0($nhi,$Htbl)
+	xgr	$Zlo,$tmp
+	xg	$Zhi,0($rem1,$rem_4bit)
+
+	lg	$tmp,0($xi,$rem_4bit)
+	la	$inp,16($inp)
+	sllg	$tmp,$tmp,4		# correct last rem_4bit[rem]
+	brctg	$len,.Louter
+
+	xgr	$Zhi,$tmp
+	stg	$Zlo,8+1($Xi)
+	stg	$Zhi,0+1($Xi)
+	lm${g}	%r6,%r14,6*$SIZE_T($sp)
+	br	%r14
+.type	gcm_ghash_4bit,\@function
+.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
+
+.align	64
+rem_4bit:
+	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
+.type	rem_4bit,\@object
+.size	rem_4bit,(.-rem_4bit)
+.string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;

diff --git a/crypto/modes/asm/ghash-sparcv9.pl b/crypto/modes/asm/ghash-sparcv9.pl
new file mode 100644
index 0000000..70e7b04
--- /dev/null
+++ b/crypto/modes/asm/ghash-sparcv9.pl

@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
+# and are expressed in cycles per processed byte, less is better:
+#
+#		gcc 3.3.x	cc 5.2		this assembler
+#
+# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
+# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
+#
+# Here is data collected on UltraSPARC T1 system running Linux:
+#
+#		gcc 4.4.1			this assembler
+#
+# 32-bit build	566				50	(+1000%)
+# 64-bit build	56				50	(+12%)
+#
+# I don't quite understand why difference between 32-bit and 64-bit
+# compiler-generated code is so big. Compilers *were* instructed to
+# generate code for UltraSPARC and should have used 64-bit registers
+# for Z vector (see C code) even in 32-bit build... Oh well, it only
+# means more impressive improvement coefficients for this assembler
+# module;-) Loops are aggressively modulo-scheduled in respect to
+# references to input data and Z.hi updates to achieve 12 cycles
+# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
+# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+
+$bits=32;
+for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)  { $bias=2047; $frame=192; }
+else            { $bias=0;    $frame=112; }
+
+$output=shift;
+open STDOUT,">$output";
+
+$Zhi="%o0";	# 64-bit values
+$Zlo="%o1";
+$Thi="%o2";
+$Tlo="%o3";
+$rem="%o4";
+$tmp="%o5";
+
+$nhi="%l0";	# small values and pointers
+$nlo="%l1";
+$xi0="%l2";
+$xi1="%l3";
+$rem_4bit="%l4";
+$remi="%l5";
+$Htblo="%l6";
+$cnt="%l7";
+
+$Xi="%i0";	# input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
+
+$code.=<<___;
+.section	".text",#alloc,#execinstr
+
+.align	64
+rem_4bit:
+	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+.type	rem_4bit,#object
+.size	rem_4bit,(.-rem_4bit)
+
+.globl	gcm_ghash_4bit
+.align	32
+gcm_ghash_4bit:
+	save	%sp,-$frame,%sp
+	ldub	[$inp+15],$nlo
+	ldub	[$Xi+15],$xi0
+	ldub	[$Xi+14],$xi1
+	add	$len,$inp,$len
+	add	$Htbl,8,$Htblo
+
+1:	call	.+8
+	add	%o7,rem_4bit-1b,$rem_4bit
+
+.Louter:
+	xor	$xi0,$nlo,$nlo
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	sll	$nlo,4,$nlo
+	ldx	[$Htblo+$nlo],$Zlo
+	ldx	[$Htbl+$nlo],$Zhi
+
+	ldub	[$inp+14],$nlo
+
+	ldx	[$Htblo+$nhi],$Tlo
+	and	$Zlo,0xf,$remi
+	ldx	[$Htbl+$nhi],$Thi
+	sll	$remi,3,$remi
+	ldx	[$rem_4bit+$remi],$rem
+	srlx	$Zlo,4,$Zlo
+	mov	13,$cnt
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+
+	xor	$xi1,$nlo,$nlo
+	and	$Zlo,0xf,$remi
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	ba	.Lghash_inner
+	sll	$nlo,4,$nlo
+.align	32
+.Lghash_inner:
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$inp+$cnt],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	ldub	[$Xi+$cnt],$xi1
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$xi1,$nlo,$nlo
+	srlx	$Zhi,4,$Zhi
+	and	$nlo,0xf0,$nhi
+	addcc	$cnt,-1,$cnt
+	xor	$Zlo,$tmp,$Zlo
+	and	$nlo,0x0f,$nlo
+	xor	$Tlo,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	blu	.Lghash_inner
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+
+	add	$inp,16,$inp
+	cmp	$inp,$len
+	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$inp+15],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+	srl	$Zlo,8,$xi1
+	and	$Zlo,0xff,$xi0
+	ba	.Louter
+	and	$xi1,0xff,$xi1
+.align	32
+.Ldone:
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+
+	ret
+	restore
+.type	gcm_ghash_4bit,#function
+.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
+___
+
+undef $inp;
+undef $len;
+
+$code.=<<___;
+.globl	gcm_gmult_4bit
+.align	32
+gcm_gmult_4bit:
+	save	%sp,-$frame,%sp
+	ldub	[$Xi+15],$nlo
+	add	$Htbl,8,$Htblo
+
+1:	call	.+8
+	add	%o7,rem_4bit-1b,$rem_4bit
+
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	sll	$nlo,4,$nlo
+	ldx	[$Htblo+$nlo],$Zlo
+	ldx	[$Htbl+$nlo],$Zhi
+
+	ldub	[$Xi+14],$nlo
+
+	ldx	[$Htblo+$nhi],$Tlo
+	and	$Zlo,0xf,$remi
+	ldx	[$Htbl+$nhi],$Thi
+	sll	$remi,3,$remi
+	ldx	[$rem_4bit+$remi],$rem
+	srlx	$Zlo,4,$Zlo
+	mov	13,$cnt
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+
+	and	$Zlo,0xf,$remi
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	ba	.Lgmult_inner
+	sll	$nlo,4,$nlo
+.align	32
+.Lgmult_inner:
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$Xi+$cnt],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	srlx	$Zhi,4,$Zhi
+	and	$nlo,0xf0,$nhi
+	addcc	$cnt,-1,$cnt
+	xor	$Zlo,$tmp,$Zlo
+	and	$nlo,0x0f,$nlo
+	xor	$Tlo,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	blu	.Lgmult_inner
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+
+	ret
+	restore
+.type	gcm_gmult_4bit,#function
+.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
+.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;

diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
new file mode 100644
index 0000000..6b09669
--- /dev/null
+++ b/crypto/modes/asm/ghash-x86.pl

@@ -0,0 +1,1342 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
+# code paths: vanilla x86 and vanilla MMX. Former will be executed on
+# 486 and Pentium, latter on all others. MMX GHASH features so called
+# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
+# of per-key storage [+512 bytes shared table]. Performance results
+# are for streamed GHASH subroutine and are expressed in cycles per
+# processed byte, less is better:
+#
+#		gcc 2.95.3(*)	MMX assembler	x86 assembler
+#
+# Pentium	105/111(**)	-		50
+# PIII		68 /75		12.2		24
+# P4		125/125		17.8		84(***)
+# Opteron	66 /70		10.1		30
+# Core2		54 /67		8.4		18
+#
+# (*)	gcc 3.4.x was observed to generate few percent slower code,
+#	which is one of reasons why 2.95.3 results were chosen,
+#	another reason is lack of 3.4.x results for older CPUs;
+#	comparison with MMX results is not completely fair, because C
+#	results are for vanilla "256B" implementation, while
+#	assembler results are for "528B";-)
+# (**)	second number is result for code compiled with -fPIC flag,
+#	which is actually more relevant, because assembler code is
+#	position-independent;
+# (***)	see comment in non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
+# particular, see comment at the end of the file...
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close is it to theoretical limit? The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that single
+# Karatsuba multiplication would take 28 cycles *plus* few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation let's have closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in same subroutine
+# former's performance is really limited to above (Tmul + Tmod/Naggr)
+# equation. But if GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So that
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
+# where Tproc is time required for Karatsuba pre- and post-processing,
+# is more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications the performance should be betwen
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better, because larger register
+# bank allows to interleave reduction and multiplication better.
+#
+# Does it make sense to increase Naggr? To start with it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise improvement has to be weighed agiainst slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <[email protected]> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on same XMM register, PCLMULQDQ subroutine was measured to process
+# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
+# similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where original 64-bit code processes one byte in 1.95 cycles.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp  = "edi";
+$Htbl = "esi";
+
+$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
+		# than unrolled, which has to be weighted against
+		# 2.5x x86-specific code size reduction.
+
+sub x86_loop {
+    my $off = shift;
+    my $rem = "eax";
+
+	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
+	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
+	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
+	&mov	($Zll,&DWP(8,$Htbl,$Zll));
+	&xor	($rem,$rem);	# avoid partial register stalls on PIII
+
+	# shrd practically kills P4, 2.5x deterioration, but P4 has
+	# MMX code-path to execute. shrd runs tad faster [than twice
+	# the shifts, move's and or's] on pre-MMX Pentium (as well as
+	# PIII and Core2), *but* minimizes code size, spares register
+	# and thus allows to fold the loop...
+	if (!$unroll) {
+	my $cnt = $inp;
+	&mov	($cnt,15);
+	&jmp	(&label("x86_loop"));
+	&set_label("x86_loop",16);
+	    for($i=1;$i<=2;$i++) {
+		&mov	(&LB($rem),&LB($Zll));
+		&shrd	($Zll,$Zlh,4);
+		&and	(&LB($rem),0xf);
+		&shrd	($Zlh,$Zhl,4);
+		&shrd	($Zhl,$Zhh,4);
+		&shr	($Zhh,4);
+		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
+
+		&mov	(&LB($rem),&BP($off,"esp",$cnt));
+		if ($i&1) {
+			&and	(&LB($rem),0xf0);
+		} else {
+			&shl	(&LB($rem),4);
+		}
+
+		&xor	($Zll,&DWP(8,$Htbl,$rem));
+		&xor	($Zlh,&DWP(12,$Htbl,$rem));
+		&xor	($Zhl,&DWP(0,$Htbl,$rem));
+		&xor	($Zhh,&DWP(4,$Htbl,$rem));
+
+		if ($i&1) {
+			&dec	($cnt);
+			&js	(&label("x86_break"));
+		} else {
+			&jmp	(&label("x86_loop"));
+		}
+	    }
+	&set_label("x86_break",16);
+	} else {
+	    for($i=1;$i<32;$i++) {
+		&comment($i);
+		&mov	(&LB($rem),&LB($Zll));
+		&shrd	($Zll,$Zlh,4);
+		&and	(&LB($rem),0xf);
+		&shrd	($Zlh,$Zhl,4);
+		&shrd	($Zhl,$Zhh,4);
+		&shr	($Zhh,4);
+		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
+
+		if ($i&1) {
+			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
+			&and	(&LB($rem),0xf0);
+		} else {
+			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
+			&shl	(&LB($rem),4);
+		}
+
+		&xor	($Zll,&DWP(8,$Htbl,$rem));
+		&xor	($Zlh,&DWP(12,$Htbl,$rem));
+		&xor	($Zhl,&DWP(0,$Htbl,$rem));
+		&xor	($Zhh,&DWP(4,$Htbl,$rem));
+	    }
+	}
+	&bswap	($Zll);
+	&bswap	($Zlh);
+	&bswap	($Zhl);
+	if (!$x86only) {
+		&bswap	($Zhh);
+	} else {
+		&mov	("eax",$Zhh);
+		&bswap	("eax");
+		&mov	($Zhh,"eax");
+	}
+}
+
+if ($unroll) {
+    &function_begin_B("_x86_gmult_4bit_inner");
+	&x86_loop(4);
+	&ret	();
+    &function_end_B("_x86_gmult_4bit_inner");
+}
+
+sub deposit_rem_4bit {
+    my $bias = shift;
+
+	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
+	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
+	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
+	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
+	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
+	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
+	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
+	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
+	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
+	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
+	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
+	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
+	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
+	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
+	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
+	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
+}
+
+$suffix = $x86only ? "" : "_x86";
+
+&function_begin("gcm_gmult_4bit".$suffix);
+	&stack_push(16+4+1);			# +1 for stack alignment
+	&mov	($inp,&wparam(0));		# load Xi
+	&mov	($Htbl,&wparam(1));		# load Htable
+
+	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
+	&mov	($Zhl,&DWP(4,$inp));
+	&mov	($Zlh,&DWP(8,$inp));
+	&mov	($Zll,&DWP(12,$inp));
+
+	&deposit_rem_4bit(16);
+
+	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(12,"esp"),$Zll);
+	&shr	($Zll,20);
+	&and	($Zll,0xf0);
+
+	if ($unroll) {
+		&call	("_x86_gmult_4bit_inner");
+	} else {
+		&x86_loop(0);
+		&mov	($inp,&wparam(0));
+	}
+
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(0,$inp),$Zhh);
+	&stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+&function_begin("gcm_ghash_4bit".$suffix);
+	&stack_push(16+4+1);			# +1 for 64-bit alignment
+	&mov	($Zll,&wparam(0));		# load Xi
+	&mov	($Htbl,&wparam(1));		# load Htable
+	&mov	($inp,&wparam(2));		# load in
+	&mov	("ecx",&wparam(3));		# load len
+	&add	("ecx",$inp);
+	&mov	(&wparam(3),"ecx");
+
+	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
+	&mov	($Zhl,&DWP(4,$Zll));
+	&mov	($Zlh,&DWP(8,$Zll));
+	&mov	($Zll,&DWP(12,$Zll));
+
+	&deposit_rem_4bit(16);
+
+    &set_label("x86_outer_loop",16);
+	&xor	($Zll,&DWP(12,$inp));		# xor with input
+	&xor	($Zlh,&DWP(8,$inp));
+	&xor	($Zhl,&DWP(4,$inp));
+	&xor	($Zhh,&DWP(0,$inp));
+	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(0,"esp"),$Zhh);
+
+	&shr	($Zll,20);
+	&and	($Zll,0xf0);
+
+	if ($unroll) {
+		&call	("_x86_gmult_4bit_inner");
+	} else {
+		&x86_loop(0);
+		&mov	($inp,&wparam(2));
+	}
+	&lea	($inp,&DWP(16,$inp));
+	&cmp	($inp,&wparam(3));
+	&mov	(&wparam(2),$inp)	if (!$unroll);
+	&jb	(&label("x86_outer_loop"));
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(0,$inp),$Zhh);
+	&stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (!$sse2) {{	# pure-MMX "May" version...
+
+$S=12;		# shift factor for rem_4bit
+
+&function_begin_B("_mmx_gmult_4bit_inner");
+# MMX version performs 3.5 times better on P4 (see comment in non-MMX
+# routine for further details), 100% better on Opteron, ~70% better
+# on Core2 and PIII... In other words effort is considered to be well
+# spent... Since initial release the loop was unrolled in order to
+# "liberate" register previously used as loop counter. Instead it's
+# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
+# The path involves move of Z.lo from MMX to integer register,
+# effective address calculation and finally merge of value to Z.hi.
+# Reference to rem_4bit is scheduled so late that I had to >>4
+# rem_4bit elements. This resulted in 20-45% procent improvement
+# on contemporary µ-archs.
+{
+    my $cnt;
+    my $rem_4bit = "eax";
+    my @rem = ($Zhh,$Zll);
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
+	&mov	($nhi,$Zll);
+	&mov	(&LB($nlo),&LB($nhi));
+	&shl	(&LB($nlo),4);
+	&and	($nhi,0xf0);
+	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
+	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
+	&movd	($rem[0],$Zlo);
+
+	for ($cnt=28;$cnt>=-2;$cnt--) {
+	    my $odd = $cnt&1;
+	    my $nix = $odd ? $nlo : $nhi;
+
+		&shl	(&LB($nlo),4)			if ($odd);
+		&psrlq	($Zlo,4);
+		&movq	($tmp,$Zhi);
+		&psrlq	($Zhi,4);
+		&pxor	($Zlo,&QWP(8,$Htbl,$nix));
+		&mov	(&LB($nlo),&BP($cnt/2,$inp))	if (!$odd && $cnt>=0);
+		&psllq	($tmp,60);
+		&and	($nhi,0xf0)			if ($odd);
+		&pxor	($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+		&and	($rem[0],0xf);
+		&pxor	($Zhi,&QWP(0,$Htbl,$nix));
+		&mov	($nhi,$nlo)			if (!$odd && $cnt>=0);
+		&movd	($rem[1],$Zlo);
+		&pxor	($Zlo,$tmp);
+
+		push	(@rem,shift(@rem));		# "rotate" registers
+	}
+
+	&mov	($inp,&DWP(4,$rem_4bit,$rem[1],8));	# last rem_4bit[rem]
+
+	&psrlq	($Zlo,32);	# lower part of Zlo is already there
+	&movd	($Zhl,$Zhi);
+	&psrlq	($Zhi,32);
+	&movd	($Zlh,$Zlo);
+	&movd	($Zhh,$Zhi);
+	&shl	($inp,4);	# compensate for rem_4bit[i] being >>4
+
+	&bswap	($Zll);
+	&bswap	($Zhl);
+	&bswap	($Zlh);
+	&xor	($Zhh,$inp);
+	&bswap	($Zhh);
+
+	&ret	();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+&function_begin("gcm_gmult_4bit_mmx");
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&movz	($Zll,&BP(15,$inp));
+
+	&call	("_mmx_gmult_4bit_inner");
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+	&mov	($Zhh,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+	&mov	($inp,&wparam(2));	# load in
+	&mov	($Zlh,&wparam(3));	# load len
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&add	($Zlh,$inp);
+	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
+	&stack_push(4+1);		# +1 for stack alignment
+
+	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
+	&mov	($Zhl,&DWP(4,$Zhh));
+	&mov	($Zlh,&DWP(8,$Zhh));
+	&mov	($Zhh,&DWP(0,$Zhh));
+	&jmp	(&label("mmx_outer_loop"));
+
+    &set_label("mmx_outer_loop",16);
+	&xor	($Zll,&DWP(12,$inp));
+	&xor	($Zhl,&DWP(4,$inp));
+	&xor	($Zlh,&DWP(8,$inp));
+	&xor	($Zhh,&DWP(0,$inp));
+	&mov	(&wparam(2),$inp);
+	&mov	(&DWP(12,"esp"),$Zll);
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(0,"esp"),$Zhh);
+
+	&mov	($inp,"esp");
+	&shr	($Zll,24);
+
+	&call	("_mmx_gmult_4bit_inner");
+
+	&mov	($inp,&wparam(2));
+	&lea	($inp,&DWP(16,$inp));
+	&cmp	($inp,&wparam(3));
+	&jb	(&label("mmx_outer_loop"));
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+
+	&stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+
+}} else {{	# "June" MMX version...
+		# ... has slower "April" gcm_gmult_4bit_mmx with folded
+		# loop. This is done to conserve code size...
+$S=16;		# shift factor for rem_4bit
+
+sub mmx_loop() {
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron and Core2, 50%
+# better on PIII... In other words effort is considered to be well
+# spent...
+    my $inp = shift;
+    my $rem_4bit = shift;
+    my $cnt = $Zhh;
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+    my $rem = $Zll;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
+	&mov	($nhi,$Zll);
+	&mov	(&LB($nlo),&LB($nhi));
+	&mov	($cnt,14);
+	&shl	(&LB($nlo),4);
+	&and	($nhi,0xf0);
+	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
+	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
+	&movd	($rem,$Zlo);
+	&jmp	(&label("mmx_loop"));
+
+    &set_label("mmx_loop",16);
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&mov	(&LB($nlo),&BP(0,$inp,$cnt));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&dec	($cnt);
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&mov	($nhi,$nlo);
+	&pxor	($Zlo,$tmp);
+	&js	(&label("mmx_break"));
+
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+	&jmp	(&label("mmx_loop"));
+
+    &set_label("mmx_break",16);
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,32);	# lower part of Zlo is already there
+	&movd	($Zhl,$Zhi);
+	&psrlq	($Zhi,32);
+	&movd	($Zlh,$Zlo);
+	&movd	($Zhh,$Zhi);
+
+	&bswap	($Zll);
+	&bswap	($Zhl);
+	&bswap	($Zlh);
+	&bswap	($Zhh);
+}
+
+&function_begin("gcm_gmult_4bit_mmx");
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&movz	($Zll,&BP(15,$inp));
+
+	&mmx_loop($inp,"eax");
+
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+######################################################################
+# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
+# (see gcm128.c for details). It provides further 20-40% performance
+# improvement over above mentioned "May" version.
+
+&static_label("rem_8bit");
+
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+  my $rem_8bit = "esi";
+  my $Htbl = "ebx";
+
+    # parameter block
+    &mov	("eax",&wparam(0));		# Xi
+    &mov	("ebx",&wparam(1));		# Htable
+    &mov	("ecx",&wparam(2));		# inp
+    &mov	("edx",&wparam(3));		# len
+    &mov	("ebp","esp");			# original %esp
+    &call	(&label("pic_point"));
+    &set_label	("pic_point");
+    &blindpop	($rem_8bit);
+    &lea	($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+    &sub	("esp",512+16+16);		# allocate stack frame...
+    &and	("esp",-64);			# ...and align it
+    &sub	("esp",16);			# place for (u8)(H[]<<4)
+
+    &add	("edx","ecx");			# pointer to the end of input
+    &mov	(&DWP(528+16+0,"esp"),"eax");	# save Xi
+    &mov	(&DWP(528+16+8,"esp"),"edx");	# save inp+len
+    &mov	(&DWP(528+16+12,"esp"),"ebp");	# save original %esp
+
+    { my @lo  = ("mm0","mm1","mm2");
+      my @hi  = ("mm3","mm4","mm5");
+      my @tmp = ("mm6","mm7");
+      my $off1=0,$off2=0,$i;
+
+      &add	($Htbl,128);			# optimize for size
+      &lea	("edi",&DWP(16+128,"esp"));
+      &lea	("ebp",&DWP(16+256+128,"esp"));
+
+      # decompose Htable (low and high parts are kept separately),
+      # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+      for ($i=0;$i<18;$i++) {
+
+	&mov	("edx",&DWP(16*$i+8-128,$Htbl))		if ($i<16);
+	&movq	($lo[0],&QWP(16*$i+8-128,$Htbl))	if ($i<16);
+	&psllq	($tmp[1],60)				if ($i>1);
+	&movq	($hi[0],&QWP(16*$i+0-128,$Htbl))	if ($i<16);
+	&por	($lo[2],$tmp[1])			if ($i>1);
+	&movq	(&QWP($off1-128,"edi"),$lo[1])		if ($i>0 && $i<17);
+	&psrlq	($lo[1],4)				if ($i>0 && $i<17);
+	&movq	(&QWP($off1,"edi"),$hi[1])		if ($i>0 && $i<17);
+	&movq	($tmp[0],$hi[1])			if ($i>0 && $i<17);
+	&movq	(&QWP($off2-128,"ebp"),$lo[2])		if ($i>1);
+	&psrlq	($hi[1],4)				if ($i>0 && $i<17);
+	&movq	(&QWP($off2,"ebp"),$hi[2])		if ($i>1);
+	&shl	("edx",4)				if ($i<16);
+	&mov	(&BP($i,"esp"),&LB("edx"))		if ($i<16);
+
+	unshift	(@lo,pop(@lo));			# "rotate" registers
+	unshift	(@hi,pop(@hi));
+	unshift	(@tmp,pop(@tmp));
+	$off1 += 8	if ($i>0);
+	$off2 += 8	if ($i>1);
+      }
+    }
+
+    &movq	($Zhi,&QWP(0,"eax"));
+    &mov	("ebx",&DWP(8,"eax"));
+    &mov	("edx",&DWP(12,"eax"));		# load Xi
+
+&set_label("outer",16);
+  { my $nlo = "eax";
+    my $dat = "edx";
+    my @nhi = ("edi","ebp");
+    my @rem = ("ebx","ecx");
+    my @red = ("mm0","mm1","mm2");
+    my $tmp = "mm3";
+
+    &xor	($dat,&DWP(12,"ecx"));		# merge input data
+    &xor	("ebx",&DWP(8,"ecx"));
+    &pxor	($Zhi,&QWP(0,"ecx"));
+    &lea	("ecx",&DWP(16,"ecx"));		# inp+=16
+    #&mov	(&DWP(528+12,"esp"),$dat);	# save inp^Xi
+    &mov	(&DWP(528+8,"esp"),"ebx");
+    &movq	(&QWP(528+0,"esp"),$Zhi);
+    &mov	(&DWP(528+16+4,"esp"),"ecx");	# save inp
+
+    &xor	($nlo,$nlo);
+    &rol	($dat,8);
+    &mov	(&LB($nlo),&LB($dat));
+    &mov	($nhi[1],$nlo);
+    &and	(&LB($nlo),0x0f);
+    &shr	($nhi[1],4);
+    &pxor	($red[0],$red[0]);
+    &rol	($dat,8);			# next byte
+    &pxor	($red[1],$red[1]);
+    &pxor	($red[2],$red[2]);
+
+    # Just like in "May" verson modulo-schedule for critical path in
+    # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
+    # is scheduled so late that rem_8bit[] has to be shifted *right*
+    # by 16, which is why last argument to pinsrw is 2, which
+    # corresponds to <<32=<<48>>16...
+    for ($j=11,$i=0;$i<15;$i++) {
+
+      if ($i>0) {
+	&pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
+	&rol	($dat,8);				# next byte
+	&pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+	&pxor	($Zlo,$tmp);
+	&pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+	&xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
+      } else {
+	&movq	($Zlo,&QWP(16,"esp",$nlo,8));
+	&movq	($Zhi,&QWP(16+128,"esp",$nlo,8));
+      }
+
+	&mov	(&LB($nlo),&LB($dat));
+	&mov	($dat,&DWP(528+$j,"esp"))		if (--$j%4==0);
+
+	&movd	($rem[0],$Zlo);
+	&movz	($rem[1],&LB($rem[1]))			if ($i>0);
+	&psrlq	($Zlo,8);				# Z>>=8
+
+	&movq	($tmp,$Zhi);
+	&mov	($nhi[0],$nlo);
+	&psrlq	($Zhi,8);
+
+	&pxor	($Zlo,&QWP(16+256+0,"esp",$nhi[1],8));	# Z^=H[nhi]>>4
+	&and	(&LB($nlo),0x0f);
+	&psllq	($tmp,56);
+
+	&pxor	($Zhi,$red[1])				if ($i>1);
+	&shr	($nhi[0],4);
+	&pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2)	if ($i>0);
+
+	unshift	(@red,pop(@red));			# "rotate" registers
+	unshift	(@rem,pop(@rem));
+	unshift	(@nhi,pop(@nhi));
+    }
+
+    &pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
+    &pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
+    &xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
+
+    &pxor	($Zlo,$tmp);
+    &pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+    &movz	($rem[1],&LB($rem[1]));
+
+    &pxor	($red[2],$red[2]);			# clear 2nd word
+    &psllq	($red[1],4);
+
+    &movd	($rem[0],$Zlo);
+    &psrlq	($Zlo,4);				# Z>>=4
+
+    &movq	($tmp,$Zhi);
+    &psrlq	($Zhi,4);
+    &shl	($rem[0],4);				# rem<<4
+
+    &pxor	($Zlo,&QWP(16,"esp",$nhi[1],8));	# Z^=H[nhi]
+    &psllq	($tmp,60);
+    &movz	($rem[0],&LB($rem[0]));
+
+    &pxor	($Zlo,$tmp);
+    &pxor	($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+    &pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+    &pxor	($Zhi,$red[1]);
+
+    &movd	($dat,$Zlo);
+    &pinsrw	($red[2],&WP(0,$rem_8bit,$rem[0],2),3);	# last is <<48
+
+    &psllq	($red[0],12);				# correct by <<16>>4
+    &pxor	($Zhi,$red[0]);
+    &psrlq	($Zlo,32);
+    &pxor	($Zhi,$red[2]);
+
+    &mov	("ecx",&DWP(528+16+4,"esp"));	# restore inp
+    &movd	("ebx",$Zlo);
+    &movq	($tmp,$Zhi);			# 01234567
+    &psllw	($Zhi,8);			# 1.3.5.7.
+    &psrlw	($tmp,8);			# .0.2.4.6
+    &por	($Zhi,$tmp);			# 10325476
+    &bswap	($dat);
+    &pshufw	($Zhi,$Zhi,0b00011011);		# 76543210
+    &bswap	("ebx");
+    
+    &cmp	("ecx",&DWP(528+16+8,"esp"));	# are we done?
+    &jne	(&label("outer"));
+  }
+
+    &mov	("eax",&DWP(528+16+0,"esp"));	# restore Xi
+    &mov	(&DWP(12,"eax"),"edx");
+    &mov	(&DWP(8,"eax"),"ebx");
+    &movq	(&QWP(0,"eax"),$Zhi);
+
+    &mov	("esp",&DWP(528+16+12,"esp"));	# restore original %esp
+    &emms	();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+($Xi,$Xhi)=("xmm0","xmm1");	$Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+sub clmul64x64_T2 {	# minimal "register" pressure
+my ($Xhi,$Xi,$Hkey)=@_;
+
+	&movdqa		($Xhi,$Xi);		#
+	&pshufd		($T1,$Xi,0b01001110);
+	&pshufd		($T2,$Hkey,0b01001110);
+	&pxor		($T1,$Xi);		#
+	&pxor		($T2,$Hkey);
+
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T1,$T2,0x00);		#######
+	&xorps		($T1,$Xi);		#
+	&xorps		($T1,$Xhi);		#
+
+	&movdqa		($T2,$T1);		#
+	&psrldq		($T1,8);
+	&pslldq		($T2,8);		#
+	&pxor		($Xhi,$T1);
+	&pxor		($Xi,$T2);		#
+}
+
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than above version.
+# At least in gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+	&movdqa		($T1,$Xi);		#
+	&movdqa		($Xhi,$Xi);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pshufd		($T2,$T1,0b01001110);	#
+	&pshufd		($T3,$Hkey,0b01001110);
+	&pxor		($T2,$T1);		#
+	&pxor		($T3,$Hkey);
+	&pclmulqdq	($T2,$T3,0x00);		#######
+	&pxor		($T2,$Xi);		#
+	&pxor		($T2,$Xhi);		#
+
+	&movdqa		($T3,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T3,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T3);		#
+}
+
+if (1) {		# Algorithm 9 with <<1 twist.
+			# Reduction is shorter and uses only two
+			# temporary registers, which makes it better
+			# candidate for interleaving with 64x64
+			# multiplication. Pre-modulo-scheduled loop
+			# was found to be ~20% faster than Algorithm 5
+			# below. Algorithm 9 was therefore chosen for
+			# further optimization...
+
+sub reduction_alg9 {	# 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+	# 1st phase
+	&movdqa		($T1,$Xi)		#
+	&psllq		($Xi,1);
+	&pxor		($Xi,$T1);		#
+	&psllq		($Xi,5);		#
+	&pxor		($Xi,$T1);		#
+	&psllq		($Xi,57);		#
+	&movdqa		($T2,$Xi);		#
+	&pslldq		($Xi,8);
+	&psrldq		($T2,8);		#
+	&pxor		($Xi,$T1);
+	&pxor		($Xhi,$T2);		#
+
+	# 2nd phase
+	&movdqa		($T2,$Xi);
+	&psrlq		($Xi,5);
+	&pxor		($Xi,$T2);		#
+	&psrlq		($Xi,1);		#
+	&pxor		($Xi,$T2);		#
+	&pxor		($T2,$Xhi);
+	&psrlq		($Xi,1);		#
+	&pxor		($Xi,$T2);		#
+}
+
+&function_begin_B("gcm_init_clmul");
+	&mov		($Htbl,&wparam(0));
+	&mov		($Xip,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Hkey,&QWP(0,$Xip));
+	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
+
+	# <<1 twist
+	&pshufd		($T2,$Hkey,0b11111111);	# broadcast uppermost dword
+	&movdqa		($T1,$Hkey);
+	&psllq		($Hkey,1);
+	&pxor		($T3,$T3);		#
+	&psrlq		($T1,63);
+	&pcmpgtd	($T3,$T2);		# broadcast carry bit
+	&pslldq		($T1,8);
+	&por		($Hkey,$T1);		# H<<=1
+
+	# magic reduction
+	&pand		($T3,&QWP(16,$const));	# 0x1c2_polynomial
+	&pxor		($Hkey,$T3);		# if(carry) H^=0x1c2_polynomial
+
+	# calculate H^2
+	&movdqa		($Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+
+	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+
+	&ret		();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movups		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+
+	&ret	();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+	&mov		($inp,&wparam(2));
+	&mov		($len,&wparam(3));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&sub		($len,0x10);
+	&jz		(&label("odd_tail"));
+
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+	&sub		($len,0x20);
+	&jbe		(&label("even_tail"));
+
+&set_label("mod_loop");
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+
+	&movdqa		($T3,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
+	&movdqa		($Xhn,$Xn);
+	 &pxor		($Xhi,$T1);		# "Ii+Xi", consume early
+
+	  &movdqa	($T1,$Xi)		#&reduction_alg9($Xhi,$Xi); 1st phase
+	  &psllq	($Xi,1);
+	  &pxor		($Xi,$T1);		#
+	  &psllq	($Xi,5);		#
+	  &pxor		($Xi,$T1);		#
+	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	  &psllq	($Xi,57);		#
+	  &movdqa	($T2,$Xi);		#
+	  &pslldq	($Xi,8);
+	  &psrldq	($T2,8);		#	
+	  &pxor		($Xi,$T1);
+	&pshufd		($T1,$T3,0b01001110);
+	  &pxor		($Xhi,$T2);		#
+	&pxor		($T1,$T3);
+	&pshufd		($T3,$Hkey,0b01001110);
+	&pxor		($T3,$Hkey);		#
+
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	  &movdqa	($T2,$Xi);		# 2nd phase
+	  &psrlq	($Xi,5);
+	  &pxor		($Xi,$T2);		#
+	  &psrlq	($Xi,1);		#
+	  &pxor		($Xi,$T2);		#
+	  &pxor		($T2,$Xhi);
+	  &psrlq	($Xi,1);		#
+	  &pxor		($Xi,$T2);		#
+
+	&pclmulqdq	($T1,$T3,0x00);		#######
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&xorps		($T1,$Xn);		#
+	&xorps		($T1,$Xhn);		#
+
+	&movdqa		($T3,$T1);		#
+	&psrldq		($T1,8);
+	&pslldq		($T3,8);		#
+	&pxor		($Xhn,$T1);
+	&pxor		($Xn,$T3);		#
+	&movdqa		($T3,&QWP(0,$const));
+
+	&lea		($inp,&DWP(32,$inp));
+	&sub		($len,0x20);
+	&ja		(&label("mod_loop"));
+
+&set_label("even_tail");
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg9	($Xhi,$Xi);
+
+	&test		($len,$len);
+	&jnz		(&label("done"));
+
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("odd_tail");
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&pshufb		($T1,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg9	($Xhi,$Xi);
+
+&set_label("done");
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+} else {		# Algorith 5. Kept for reference purposes.
+
+sub reduction_alg5 {	# 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+	# <<1
+	&movdqa		($T1,$Xi);		#
+	&movdqa		($T2,$Xhi);
+	&pslld		($Xi,1);
+	&pslld		($Xhi,1);		#
+	&psrld		($T1,31);
+	&psrld		($T2,31);		#
+	&movdqa		($T3,$T1);
+	&pslldq		($T1,4);
+	&psrldq		($T3,12);		#
+	&pslldq		($T2,4);
+	&por		($Xhi,$T3);		#
+	&por		($Xi,$T1);
+	&por		($Xhi,$T2);		#
+
+	# 1st phase
+	&movdqa		($T1,$Xi);
+	&movdqa		($T2,$Xi);
+	&movdqa		($T3,$Xi);		#
+	&pslld		($T1,31);
+	&pslld		($T2,30);
+	&pslld		($Xi,25);		#
+	&pxor		($T1,$T2);
+	&pxor		($T1,$Xi);		#
+	&movdqa		($T2,$T1);		#
+	&pslldq		($T1,12);
+	&psrldq		($T2,4);		#
+	&pxor		($T3,$T1);
+
+	# 2nd phase
+	&pxor		($Xhi,$T3);		#
+	&movdqa		($Xi,$T3);
+	&movdqa		($T1,$T3);
+	&psrld		($Xi,1);		#
+	&psrld		($T1,2);
+	&psrld		($T3,7);		#
+	&pxor		($Xi,$T1);
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T3);		#
+	&pxor		($Xi,$Xhi);		#
+}
+
+&function_begin_B("gcm_init_clmul");
+	&mov		($Htbl,&wparam(0));
+	&mov		($Xip,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Hkey,&QWP(0,$Xip));
+	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
+
+	# calculate H^2
+	&movdqa		($Xi,$Hkey);
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+
+	&ret		();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($Xn,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$Xn);
+
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
+	&reduction_alg5	($Xhi,$Xi);
+
+	&pshufb		($Xi,$Xn);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+
+	&ret	();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+	&mov		($inp,&wparam(2));
+	&mov		($len,&wparam(3));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&sub		($len,0x10);
+	&jz		(&label("odd_tail"));
+
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&sub		($len,0x20);
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+	&jbe		(&label("even_tail"));
+
+&set_label("mod_loop");
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg5	($Xhi,$Xi);
+
+	#######
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&sub		($len,0x20);
+	&lea		($inp,&DWP(32,$inp));
+	&ja		(&label("mod_loop"));
+
+&set_label("even_tail");
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqa		($T3,&QWP(0,$const));
+	&test		($len,$len);
+	&jnz		(&label("done"));
+
+	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("odd_tail");
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&pshufb		($T1,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqa		($T3,&QWP(0,$const));
+&set_label("done");
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+
+&set_label("bswap",64);
+	&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+	&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2);	# 0x1c2_polynomial
+}}	# $sse2
+
+&set_label("rem_4bit",64);
+	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+&set_label("rem_8bit",64);
+	&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+	&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+	&data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+	&data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+	&data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+	&data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+	&data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+	&data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+	&data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+	&data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+	&data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+	&data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+	&data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+	&data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+	&data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+	&data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+	&data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+	&data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+	&data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+	&data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+	&data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+	&data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+	&data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+	&data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+	&data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+	&data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+	&data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+	&data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+	&data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+	&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+	&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+	&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}}	# !$x86only
+
+&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
+
+# A question was risen about choice of vanilla MMX. Or rather why wasn't
+# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
+# CPUs such as PIII, "4-bit" MMX version was observed to provide better
+# performance than *corresponding* SSE2 one even on contemporary CPUs.
+# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
+# implementation featuring full range of lookup-table sizes, but with
+# per-invocation lookup table setup. Latter means that table size is
+# chosen depending on how much data is to be hashed in every given call,
+# more data - larger table. Best reported result for Core2 is ~4 cycles
+# per processed byte out of 64KB block. This number accounts even for
+# 64KB table setup overhead. As discussed in gcm128.c we choose to be
+# more conservative in respect to lookup table sizes, but how do the
+# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
+# on same platform. As also discussed in gcm128.c, next in line "8-bit
+# Shoup's" or "4KB" method should deliver twice the performance of
+# "256B" one, in other words not worse than ~6 cycles per byte. It
+# should be also be noted that in SSE2 case improvement can be "super-
+# linear," i.e. more than twice, mostly because >>8 maps to single
+# instruction on SSE2 register. This is unlike "4-bit" case when >>4
+# maps to same amount of instructions in both MMX and SSE2 cases.
+# Bottom line is that switch to SSE2 is considered to be justifiable
+# only in case we choose to implement "8-bit" method...

diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
new file mode 100644
index 0000000..a5ae180
--- /dev/null
+++ b/crypto/modes/asm/ghash-x86_64.pl

@@ -0,0 +1,805 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+#		gcc 3.4.x(*)	assembler
+#
+# P4		28.6		14.0		+100%
+# Opteron	19.3		7.7		+150%
+# Core2		17.8		8.1(**)		+120%
+#
+# (*)	comparison is not completely fair, because C results are
+#	for vanilla "256B" implementation, while assembler results
+#	are for "528B";-)
+# (**)	it's mystery [to me] why Core2 result is not same as for
+#	Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <[email protected]> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# common register layout
+$nlo="%rax";
+$nhi="%rbx";
+$Zlo="%r8";
+$Zhi="%r9";
+$tmp="%r10";
+$rem_4bit = "%r11";
+
+$Xi="%rdi";
+$Htbl="%rsi";
+
+# per-function register layout
+$cnt="%rcx";
+$rem="%rdx";
+
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
+			$r =~ s/%[er]([sd]i)/%\1l/	or
+			$r =~ s/%[er](bp)/%\1l/		or
+			$r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+{ my $N;
+  sub loop() {
+  my $inp = shift;
+
+	$N++;
+$code.=<<___;
+	xor	$nlo,$nlo
+	xor	$nhi,$nhi
+	mov	`&LB("$Zlo")`,`&LB("$nlo")`
+	mov	`&LB("$Zlo")`,`&LB("$nhi")`
+	shl	\$4,`&LB("$nlo")`
+	mov	\$14,$cnt
+	mov	8($Htbl,$nlo),$Zlo
+	mov	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	mov	$Zlo,$rem
+	jmp	.Loop$N
+
+.align	16
+.Loop$N:
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	mov	($inp,$cnt),`&LB("$nlo")`
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nhi),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nhi),$Zhi
+	mov	`&LB("$nlo")`,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	shl	\$4,`&LB("$nlo")`
+	xor	$tmp,$Zlo
+	dec	$cnt
+	js	.Lbreak$N
+
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nlo),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	xor	$tmp,$Zlo
+	jmp	.Loop$N
+
+.align	16
+.Lbreak$N:
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nlo),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	xor	$tmp,$Zlo
+
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nhi),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nhi),$Zhi
+	xor	$tmp,$Zlo
+	xor	($rem_4bit,$rem,8),$Zhi
+
+	bswap	$Zlo
+	bswap	$Zhi
+___
+}}
+
+$code=<<___;
+.text
+
+.globl	gcm_gmult_4bit
+.type	gcm_gmult_4bit,\@function,2
+.align	16
+gcm_gmult_4bit:
+	push	%rbx
+	push	%rbp		# %rbp and %r12 are pushed exclusively in
+	push	%r12		# order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+	movzb	15($Xi),$Zlo
+	lea	.Lrem_4bit(%rip),$rem_4bit
+___
+	&loop	($Xi);
+$code.=<<___;
+	mov	$Zlo,8($Xi)
+	mov	$Zhi,($Xi)
+
+	mov	16(%rsp),%rbx
+	lea	24(%rsp),%rsp
+.Lgmult_epilogue:
+	ret
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+
+# per-function register layout
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;
+
+$code.=<<___;
+.globl	gcm_ghash_4bit
+.type	gcm_ghash_4bit,\@function,4
+.align	16
+gcm_ghash_4bit:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$280,%rsp
+.Lghash_prologue:
+	mov	$inp,%r14		# reassign couple of args
+	mov	$len,%r15
+___
+{ my $inp="%r14";
+  my $dat="%edx";
+  my $len="%r15";
+  my @nhi=("%ebx","%ecx");
+  my @rem=("%r12","%r13");
+  my $Hshr4="%rbp";
+
+	&sub	($Htbl,-128);		# size optimization
+	&lea	($Hshr4,"16+128(%rsp)");
+	{ my @lo =($nlo,$nhi);
+          my @hi =($Zlo,$Zhi);
+
+	  &xor	($dat,$dat);
+	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
+	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
+	    &or		($lo[0],$tmp)			if ($i>1);
+	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
+	    &shr	($lo[1],4)			if ($i>0 && $i<17);
+	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
+	    &shr	($hi[1],4)			if ($i>0 && $i<17);
+	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
+	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
+	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
+	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
+	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
+	    &shl	($tmp,60)			if ($i>0 && $i<17);
+
+	    push	(@lo,shift(@lo));
+	    push	(@hi,shift(@hi));
+	  }
+	}
+	&add	($Htbl,-128);
+	&mov	($Zlo,"8($Xi)");
+	&mov	($Zhi,"0($Xi)");
+	&add	($len,$inp);		# pointer to the end of data
+	&lea	($rem_8bit,".Lrem_8bit(%rip)");
+	&jmp	(".Louter_loop");
+
+$code.=".align	16\n.Louter_loop:\n";
+	&xor	($Zhi,"($inp)");
+	&mov	("%rdx","8($inp)");
+	&lea	($inp,"16($inp)");
+	&xor	("%rdx",$Zlo);
+	&mov	("($Xi)",$Zhi);
+	&mov	("8($Xi)","%rdx");
+	&shr	("%rdx",32);
+
+	&xor	($nlo,$nlo);
+	&rol	($dat,8);
+	&mov	(&LB($nlo),&LB($dat));
+	&movz	($nhi[0],&LB($dat));
+	&shl	(&LB($nlo),4);
+	&shr	($nhi[0],4);
+
+	for ($j=11,$i=0;$i<15;$i++) {
+	    &rol	($dat,8);
+	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
+	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
+	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
+	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);
+
+	    &mov	(&LB($nlo),&LB($dat));
+	    &xor	($Zlo,$tmp)				if ($i>0);
+	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);
+
+	    &movz	($nhi[1],&LB($dat));
+	    &shl	(&LB($nlo),4);
+	    &movzb	($rem[0],"(%rsp,$nhi[0])");
+
+	    &shr	($nhi[1],4)				if ($i<14);
+	    &and	($nhi[1],0xf0)				if ($i==14);
+	    &shl	($rem[1],48)				if ($i>0);
+	    &xor	($rem[0],$Zlo);
+
+	    &mov	($tmp,$Zhi);
+	    &xor	($Zhi,$rem[1])				if ($i>0);
+	    &shr	($Zlo,8);
+
+	    &movz	($rem[0],&LB($rem[0]));
+	    &mov	($dat,"$j($Xi)")			if (--$j%4==0);
+	    &shr	($Zhi,8);
+
+	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
+	    &shl	($tmp,56);
+	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");
+
+	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
+	    unshift	(@rem,pop(@rem));
+	}
+	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
+	&xor	($Zlo,"8($Htbl,$nlo)");
+	&xor	($Zhi,"($Htbl,$nlo)");
+
+	&shl	($rem[1],48);
+	&xor	($Zlo,$tmp);
+
+	&xor	($Zhi,$rem[1]);
+	&movz	($rem[0],&LB($Zlo));
+	&shr	($Zlo,4);
+
+	&mov	($tmp,$Zhi);
+	&shl	(&LB($rem[0]),4);
+	&shr	($Zhi,4);
+
+	&xor	($Zlo,"8($Htbl,$nhi[0])");
+	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
+	&shl	($tmp,60);
+
+	&xor	($Zhi,"($Htbl,$nhi[0])");
+	&xor	($Zlo,$tmp);
+	&shl	($rem[0],48);
+
+	&bswap	($Zlo);
+	&xor	($Zhi,$rem[0]);
+
+	&bswap	($Zhi);
+	&cmp	($inp,$len);
+	&jb	(".Louter_loop");
+}
+$code.=<<___;
+	mov	$Zlo,8($Xi)
+	mov	$Zhi,($Xi)
+
+	lea	280(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lghash_epilogue:
+	ret
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+
+######################################################################
+# PCLMULQDQ version.
+
+@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+		("%rdi","%rsi","%rdx","%rcx");	# Unix order
+
+($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+sub clmul64x64_T2 {	# minimal register pressure
+my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+
+$code.=<<___ if (!defined($modulo));
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pshufd		\$0b01001110,$Hkey,$T2
+	pxor		$Xi,$T1			#
+	pxor		$Hkey,$T2
+___
+$code.=<<___;
+	pclmulqdq	\$0x00,$Hkey,$Xi	#######
+	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
+	pclmulqdq	\$0x00,$T2,$T1		#######
+	pxor		$Xi,$T1			#
+	pxor		$Xhi,$T1		#
+
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
+___
+}
+
+sub reduction_alg9 {	# 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+	# 1st phase
+	movdqa		$Xi,$T1			#
+	psllq		\$1,$Xi
+	pxor		$T1,$Xi			#
+	psllq		\$5,$Xi			#
+	pxor		$T1,$Xi			#
+	psllq		\$57,$Xi		#
+	movdqa		$Xi,$T2			#
+	pslldq		\$8,$Xi
+	psrldq		\$8,$T2			#	
+	pxor		$T1,$Xi
+	pxor		$T2,$Xhi		#
+
+	# 2nd phase
+	movdqa		$Xi,$T2
+	psrlq		\$5,$Xi
+	pxor		$T2,$Xi			#
+	psrlq		\$1,$Xi			#
+	pxor		$T2,$Xi			#
+	pxor		$Xhi,$T2
+	psrlq		\$1,$Xi			#
+	pxor		$T2,$Xi			#
+___
+}
+
+{ my ($Htbl,$Xip)=@_4args;
+
+$code.=<<___;
+.globl	gcm_init_clmul
+.type	gcm_init_clmul,\@abi-omnipotent
+.align	16
+gcm_init_clmul:
+	movdqu		($Xip),$Hkey
+	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
+
+	# <<1 twist
+	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
+	movdqa		$Hkey,$T1
+	psllq		\$1,$Hkey
+	pxor		$T3,$T3			#
+	psrlq		\$63,$T1
+	pcmpgtd		$T2,$T3			# broadcast carry bit
+	pslldq		\$8,$T1
+	por		$T1,$Hkey		# H<<=1
+
+	# magic reduction
+	pand		.L0x1c2_polynomial(%rip),$T3
+	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial
+
+	# calculate H^2
+	movdqa		$Hkey,$Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	movdqu		$Hkey,($Htbl)		# save H
+	movdqu		$Xi,16($Htbl)		# save H^2
+	ret
+.size	gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl	gcm_gmult_clmul
+.type	gcm_gmult_clmul,\@abi-omnipotent
+.align	16
+gcm_gmult_clmul:
+	movdqu		($Xip),$Xi
+	movdqa		.Lbswap_mask(%rip),$T3
+	movdqu		($Htbl),$Hkey
+	pshufb		$T3,$Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	pshufb		$T3,$Xi
+	movdqu		$Xi,($Xip)
+	ret
+.size	gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+  my $Xn="%xmm6";
+  my $Xhn="%xmm7";
+  my $Hkey2="%xmm8";
+  my $T1n="%xmm9";
+  my $T2n="%xmm10";
+
+$code.=<<___;
+.globl	gcm_ghash_clmul
+.type	gcm_ghash_clmul,\@abi-omnipotent
+.align	16
+gcm_ghash_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_ghash_clmul:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
+	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp)
+	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp)
+	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp)
+___
+$code.=<<___;
+	movdqa		.Lbswap_mask(%rip),$T3
+
+	movdqu		($Xip),$Xi
+	movdqu		($Htbl),$Hkey
+	pshufb		$T3,$Xi
+
+	sub		\$0x10,$len
+	jz		.Lodd_tail
+
+	movdqu		16($Htbl),$Hkey2
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	movdqu		($inp),$T1		# Ii
+	movdqu		16($inp),$Xn		# Ii+1
+	pshufb		$T3,$T1
+	pshufb		$T3,$Xn
+	pxor		$T1,$Xi			# Ii+Xi
+___
+	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
+$code.=<<___;
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pshufd		\$0b01001110,$Hkey2,$T2
+	pxor		$Xi,$T1			#
+	pxor		$Hkey2,$T2
+
+	lea		32($inp),$inp		# i+=2
+	sub		\$0x20,$len
+	jbe		.Leven_tail
+
+.Lmod_loop:
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
+$code.=<<___;
+	movdqu		($inp),$T1		# Ii
+	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+
+	movdqu		16($inp),$Xn		# Ii+1
+	pshufb		$T3,$T1
+	pshufb		$T3,$Xn
+
+	movdqa		$Xn,$Xhn		#
+	pshufd		\$0b01001110,$Xn,$T1n
+	pshufd		\$0b01001110,$Hkey,$T2n
+	pxor		$Xn,$T1n		#
+	pxor		$Hkey,$T2n
+	 pxor		$T1,$Xhi		# "Ii+Xi", consume early
+
+	  movdqa	$Xi,$T1			# 1st phase
+	  psllq		\$1,$Xi
+	  pxor		$T1,$Xi			#
+	  psllq		\$5,$Xi			#
+	  pxor		$T1,$Xi			#
+	pclmulqdq	\$0x00,$Hkey,$Xn	#######
+	  psllq		\$57,$Xi		#
+	  movdqa	$Xi,$T2			#
+	  pslldq	\$8,$Xi
+	  psrldq	\$8,$T2			#	
+	  pxor		$T1,$Xi
+	  pxor		$T2,$Xhi		#
+
+	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
+	  movdqa	$Xi,$T2			# 2nd phase
+	  psrlq		\$5,$Xi
+	  pxor		$T2,$Xi			#
+	  psrlq		\$1,$Xi			#
+	  pxor		$T2,$Xi			#
+	  pxor		$Xhi,$T2
+	  psrlq		\$1,$Xi			#
+	  pxor		$T2,$Xi			#
+
+	pclmulqdq	\$0x00,$T2n,$T1n	#######
+	 movdqa		$Xi,$Xhi		#
+	 pshufd		\$0b01001110,$Xi,$T1
+	 pshufd		\$0b01001110,$Hkey2,$T2
+	 pxor		$Xi,$T1			#
+	 pxor		$Hkey2,$T2
+
+	pxor		$Xn,$T1n		#
+	pxor		$Xhn,$T1n		#
+	movdqa		$T1n,$T2n		#
+	psrldq		\$8,$T1n
+	pslldq		\$8,$T2n		#
+	pxor		$T1n,$Xhn
+	pxor		$T2n,$Xn		#
+
+	lea		32($inp),$inp
+	sub		\$0x20,$len
+	ja		.Lmod_loop
+
+.Leven_tail:
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
+$code.=<<___;
+	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+___
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	test		$len,$len
+	jnz		.Ldone
+
+.Lodd_tail:
+	movdqu		($inp),$T1		# Ii
+	pshufb		$T3,$T1
+	pxor		$T1,$Xi			# Ii+Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+	pshufb		$T3,$Xi
+	movdqu		$Xi,($Xip)
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	add	\$0x58,%rsp
+___
+$code.=<<___;
+	ret
+.LSEH_end_gcm_ghash_clmul:
+.size	gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+$code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align	64
+.type	.Lrem_4bit,\@object
+.Lrem_4bit:
+	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type	.Lrem_8bit,\@object
+.Lrem_8bit:
+	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	lea	24(%rax),%rax		# adjust "rsp"
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_gcm_gmult_4bit
+	.rva	.LSEH_end_gcm_gmult_4bit
+	.rva	.LSEH_info_gcm_gmult_4bit
+
+	.rva	.LSEH_begin_gcm_ghash_4bit
+	.rva	.LSEH_end_gcm_ghash_4bit
+	.rva	.LSEH_info_gcm_ghash_4bit
+
+	.rva	.LSEH_begin_gcm_ghash_clmul
+	.rva	.LSEH_end_gcm_ghash_clmul
+	.rva	.LSEH_info_gcm_ghash_clmul
+
+.section	.xdata
+.align	8
+.LSEH_info_gcm_gmult_4bit:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
+.LSEH_info_gcm_ghash_4bit:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
+.LSEH_info_gcm_ghash_clmul:
+	.byte	0x01,0x1f,0x0b,0x00
+	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
+	.byte	0x04,0xa2,0x00,0x00	#sub	rsp,0x58
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;

diff --git a/crypto/modes/cbc128.c b/crypto/modes/cbc128.c
index 8f8bd56..3d3782c 100644
--- a/crypto/modes/cbc128.c
+++ b/crypto/modes/cbc128.c

@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,12 +59,7 @@
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
+#ifndef STRICT_ALIGNMENT
 #  define STRICT_ALIGNMENT 0
 #endif
 

diff --git a/crypto/modes/ccm128.c b/crypto/modes/ccm128.c
new file mode 100644
index 0000000..c9b35e5
--- /dev/null
+++ b/crypto/modes/ccm128.c

@@ -0,0 +1,441 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+/* First you setup M and L parameters and pass the key schedule.
+ * This is called once per session setup... */
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+	unsigned int M,unsigned int L,void *key,block128_f block)
+{
+	memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
+	ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
+	ctx->blocks = 0;
+	ctx->block = block;
+	ctx->key = key;
+}
+
+/* !!! Following interfaces are to be called *once* per packet !!! */
+
+/* Then you setup per-message nonce and pass the length of the message */
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+	const unsigned char *nonce,size_t nlen,size_t mlen)
+{
+	unsigned int L = ctx->nonce.c[0]&7;	/* the L parameter */
+
+	if (nlen<(14-L)) return -1;		/* nonce is too short */
+
+	if (sizeof(mlen)==8 && L>=3) {
+		ctx->nonce.c[8]  = (u8)(mlen>>(56%(sizeof(mlen)*8)));
+		ctx->nonce.c[9]  = (u8)(mlen>>(48%(sizeof(mlen)*8)));
+		ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
+		ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
+	}
+	else
+		*(u32*)(&ctx->nonce.c[8]) = 0;
+
+	ctx->nonce.c[12] = (u8)(mlen>>24);
+	ctx->nonce.c[13] = (u8)(mlen>>16);
+	ctx->nonce.c[14] = (u8)(mlen>>8);
+	ctx->nonce.c[15] = (u8)mlen;
+
+	ctx->nonce.c[0] &= ~0x40;	/* clear Adata flag */
+	memcpy(&ctx->nonce.c[1],nonce,14-L);
+
+	return 0;
+}
+
+/* Then you pass additional authentication data, this is optional */
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+	const unsigned char *aad,size_t alen)
+{	unsigned int i;
+	block128_f block = ctx->block;
+
+	if (alen==0) return;
+
+	ctx->nonce.c[0] |= 0x40;	/* set Adata flag */
+	(*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
+	ctx->blocks++;
+
+	if (alen<(0x10000-0x100)) {
+		ctx->cmac.c[0] ^= (u8)(alen>>8);
+		ctx->cmac.c[1] ^= (u8)alen;
+		i=2;
+	}
+	else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
+		ctx->cmac.c[0] ^= 0xFF;
+		ctx->cmac.c[1] ^= 0xFF;
+		ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
+		ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
+		ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
+		ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
+		ctx->cmac.c[6] ^= (u8)(alen>>24);
+		ctx->cmac.c[7] ^= (u8)(alen>>16);
+		ctx->cmac.c[8] ^= (u8)(alen>>8);
+		ctx->cmac.c[9] ^= (u8)alen;
+		i=10;
+	}
+	else {
+		ctx->cmac.c[0] ^= 0xFF;
+		ctx->cmac.c[1] ^= 0xFE;
+		ctx->cmac.c[2] ^= (u8)(alen>>24);
+		ctx->cmac.c[3] ^= (u8)(alen>>16);
+		ctx->cmac.c[4] ^= (u8)(alen>>8);
+		ctx->cmac.c[5] ^= (u8)alen;
+		i=6;
+	}
+
+	do {
+		for(;i<16 && alen;++i,++aad,--alen)
+			ctx->cmac.c[i] ^= *aad;
+		(*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
+		ctx->blocks++;
+		i=0;
+	} while (alen);
+}
+
+/* Finally you encrypt or decrypt the message */
+
+/* counter part of nonce may not be larger than L*8 bits,
+ * L is not larger than 8, therefore 64-bit counter... */
+static void ctr64_inc(unsigned char *counter) {
+	unsigned int n=8;
+	u8  c;
+
+	counter += 8;
+	do {
+		--n;
+		c = counter[n];
+		++c;
+		counter[n] = c;
+		if (c) return;
+	} while (n);
+}
+
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key),
+		ctx->blocks++;
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;	/* length mismatch */
+
+	ctx->blocks += ((len+15)>>3)|1;
+	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */
+
+	while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+		union { u64 u[2]; u8 c[16]; } temp;
+
+		memcpy (temp.c,inp,16);
+		ctx->cmac.u[0] ^= temp.u[0];
+		ctx->cmac.u[1] ^= temp.u[1];
+#else
+		ctx->cmac.u[0] ^= ((u64*)inp)[0];
+		ctx->cmac.u[1] ^= ((u64*)inp)[1];
+#endif
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+		(*block)(ctx->nonce.c,scratch.c,key);
+		ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+		temp.u[0] ^= scratch.u[0];
+		temp.u[1] ^= scratch.u[1];
+		memcpy(out,temp.c,16);
+#else
+		((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
+		((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
+#endif
+		inp += 16;
+		out += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;
+
+	while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+		union { u64 u[2]; u8 c[16]; } temp;
+#endif
+		(*block)(ctx->nonce.c,scratch.c,key);
+		ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+		memcpy (temp.c,inp,16);
+		ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+		ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+		memcpy (out,scratch.c,16);
+#else
+		ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
+		ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
+#endif
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+
+		inp += 16;
+		out += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i)
+			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+static void ctr64_add (unsigned char *counter,size_t inc)
+{	size_t n=8, val=0;
+
+	counter += 8;
+	do {
+		--n;
+		val += counter[n] + (inc&0xff);
+		counter[n] = (unsigned char)val;
+		val >>= 8;	/* carry bit */
+		inc >>= 8;
+	} while(n && (inc || val));
+}
+
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len,ccm128_f stream)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key),
+		ctx->blocks++;
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;	/* length mismatch */
+
+	ctx->blocks += ((len+15)>>3)|1;
+	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */
+
+	if ((n=len/16)) {
+		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+		n   *= 16;
+		inp += n;
+		out += n;
+		len -= n;
+		if (len) ctr64_add(ctx->nonce.c,n/16);
+	}
+
+	if (len) {
+		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len,ccm128_f stream)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;
+
+	if ((n=len/16)) {
+		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+		n   *= 16;
+		inp += n;
+		out += n;
+		len -= n;
+		if (len) ctr64_add(ctx->nonce.c,n/16);
+	}
+
+	if (len) {
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i)
+			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
+{	unsigned int M = (ctx->nonce.c[0]>>3)&7;	/* the M parameter */
+
+	M *= 2; M += 2;
+	if (len<M)	return 0;
+	memcpy(tag,ctx->cmac.c,M);
+	return M;
+}

diff --git a/crypto/modes/cfb128.c b/crypto/modes/cfb128.c
index e5938c6..4e6f5d3 100644
--- a/crypto/modes/cfb128.c
+++ b/crypto/modes/cfb128.c

@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output encrypted as though 128bit cfb mode is being
  * used.  The extra state information to record how much of the
  * 128bit block we have used is contained in *num;

diff --git a/crypto/modes/ctr128.c b/crypto/modes/ctr128.c
index 932037f..ee642c5 100644
--- a/crypto/modes/ctr128.c
+++ b/crypto/modes/ctr128.c

@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,17 +59,6 @@
 #endif
 #include <assert.h>
 
-typedef unsigned int u32;
-typedef unsigned char u8;
-
-#define STRICT_ALIGNMENT
-#if defined(__i386)	|| defined(__i386__)	|| \
-    defined(__x86_64)	|| defined(__x86_64__)	|| \
-    defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \
-    defined(__s390__)	|| defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* NOTE: the IV/counter CTR mode is big-endian.  The code itself
  * is endian-neutral. */
 
@@ -182,3 +172,81 @@
 
 	*num=n;
 }
+
+/* increment upper 96 bits of 128-bit counter by 1 */
+static void ctr96_inc(unsigned char *counter) {
+	u32 n=12;
+	u8  c;
+
+	do {
+		--n;
+		c = counter[n];
+		++c;
+		counter[n] = c;
+		if (c) return;
+	} while (n);
+}
+
+void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], unsigned char ecount_buf[16],
+			unsigned int *num, ctr128_f func)
+{
+	unsigned int n,ctr32;
+
+	assert(in && out && key && ecount_buf && num);
+	assert(*num < 16);
+
+	n = *num;
+
+	while (n && len) {
+		*(out++) = *(in++) ^ ecount_buf[n];
+		--len;
+		n = (n+1) % 16;
+	}
+
+	ctr32 = GETU32(ivec+12);
+	while (len>=16) {
+		size_t blocks = len/16;
+		/*
+		 * 1<<28 is just a not-so-small yet not-so-large number...
+		 * Below condition is practically never met, but it has to
+		 * be checked for code correctness.
+		 */
+		if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
+			blocks = (1U<<28);
+		/*
+		 * As (*func) operates on 32-bit counter, caller
+		 * has to handle overflow. 'if' below detects the
+		 * overflow, which is then handled by limiting the
+		 * amount of blocks to the exact overflow point...
+		 */
+		ctr32 += (u32)blocks;
+		if (ctr32 < blocks) {
+			blocks -= ctr32;
+			ctr32   = 0;
+		}
+		(*func)(in,out,blocks,key,ivec);
+		/* (*ctr) does not update ivec, caller does: */
+		PUTU32(ivec+12,ctr32);
+		/* ... overflow was detected, propogate carry. */
+		if (ctr32 == 0)	ctr96_inc(ivec);
+		blocks *= 16;
+		len -= blocks;
+		out += blocks;
+		in  += blocks;
+	}
+	if (len) {
+		memset(ecount_buf,0,16);
+		(*func)(ecount_buf,ecount_buf,1,key,ivec);
+		++ctr32;
+		PUTU32(ivec+12,ctr32);
+		if (ctr32 == 0)	ctr96_inc(ivec);
+		while (len--) {
+			out[n] = in[n] ^ ecount_buf[n];
+			++n;
+		}
+	}
+
+	*num=n;
+}

diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
new file mode 100644
index 0000000..7d6d034
--- /dev/null
+++ b/crypto/modes/gcm128.c

@@ -0,0 +1,1757 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#define OPENSSL_FIPSAPI
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
+/* redefine, because alignment is ensured */
+#undef	GETU32
+#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
+#undef	PUTU32
+#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
+#endif
+
+#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
+#define REDUCE1BIT(V)	do { \
+	if (sizeof(size_t)==8) { \
+		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
+		V.lo  = (V.hi<<63)|(V.lo>>1); \
+		V.hi  = (V.hi>>1 )^T; \
+	} \
+	else { \
+		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
+		V.lo  = (V.hi<<63)|(V.lo>>1); \
+		V.hi  = (V.hi>>1 )^((u64)T<<32); \
+	} \
+} while(0)
+
+/*
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8. 8 is effectively reserved for testing purposes.
+ * TABLE_BITS>1 are lookup-table-driven implementations referred to as
+ * "Shoup's" in GCM specification. In other words OpenSSL does not cover
+ * whole spectrum of possible table driven implementations. Why? In
+ * non-"Shoup's" case memory access pattern is segmented in such manner,
+ * that it's trivial to see that cache timing information can reveal
+ * fair portion of intermediate hash value. Given that ciphertext is
+ * always available to attacker, it's possible for him to attempt to
+ * deduce secret parameter H and if successful, tamper with messages
+ * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
+ * not as trivial, but there is no reason to believe that it's resistant
+ * to cache-timing attack. And the thing about "8-bit" implementation is
+ * that it consumes 16 (sixteen) times more memory, 4KB per individual
+ * key + 1KB shared. Well, on pros side it should be twice as fast as
+ * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
+ * was observed to run ~75% faster, closer to 100% for commercial
+ * compilers... Yet "4-bit" procedure is preferred, because it's
+ * believed to provide better security-performance balance and adequate
+ * all-round performance. "All-round" refers to things like:
+ *
+ * - shorter setup time effectively improves overall timing for
+ *   handling short messages;
+ * - larger table allocation can become unbearable because of VM
+ *   subsystem penalties (for example on Windows large enough free
+ *   results in VM working set trimming, meaning that consequent
+ *   malloc would immediately incur working set expansion);
+ * - larger table has larger cache footprint, which can affect
+ *   performance of other code paths (not necessarily even from same
+ *   thread in Hyper-Threading world);
+ *
+ * Value of 1 is not appropriate for performance reasons.
+ */
+#if	TABLE_BITS==8
+
+static void gcm_init_8bit(u128 Htable[256], u64 H[2])
+{
+	int  i, j;
+	u128 V;
+
+	Htable[0].hi = 0;
+	Htable[0].lo = 0;
+	V.hi = H[0];
+	V.lo = H[1];
+
+	for (Htable[128]=V, i=64; i>0; i>>=1) {
+		REDUCE1BIT(V);
+		Htable[i] = V;
+	}
+
+	for (i=2; i<256; i<<=1) {
+		u128 *Hi = Htable+i, H0 = *Hi;
+		for (j=1; j<i; ++j) {
+			Hi[j].hi = H0.hi^Htable[j].hi;
+			Hi[j].lo = H0.lo^Htable[j].lo;
+		}
+	}
+}
+
+static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
+{
+	u128 Z = { 0, 0};
+	const u8 *xi = (const u8 *)Xi+15;
+	size_t rem, n = *xi;
+	const union { long one; char little; } is_endian = {1};
+	static const size_t rem_8bit[256] = {
+		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
+		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
+		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
+		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
+		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
+		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
+		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
+		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
+		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
+		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
+		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
+		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
+		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
+		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
+		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
+		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
+		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
+		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
+		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
+		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
+		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
+		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
+		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
+		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
+		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
+		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
+		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
+		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
+		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
+		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
+		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
+		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
+		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
+		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
+		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
+		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
+		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
+		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
+		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
+		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
+		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
+		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
+		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
+		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
+		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
+		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
+		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
+		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
+		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
+		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
+		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
+		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
+		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
+		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
+		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
+		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
+		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
+		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
+		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
+		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
+		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
+		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
+		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
+		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
+
+	while (1) {
+		Z.hi ^= Htable[n].hi;
+		Z.lo ^= Htable[n].lo;
+
+		if ((u8 *)Xi==xi)	break;
+
+		n = *(--xi);
+
+		rem  = (size_t)Z.lo&0xff;
+		Z.lo = (Z.hi<<56)|(Z.lo>>8);
+		Z.hi = (Z.hi>>8);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_8bit[rem];
+		else
+			Z.hi ^= (u64)rem_8bit[rem]<<32;
+	}
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+}
+#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
+
+#elif	TABLE_BITS==4
+
+static void gcm_init_4bit(u128 Htable[16], u64 H[2])
+{
+	u128 V;
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+	int  i;
+#endif
+
+	Htable[0].hi = 0;
+	Htable[0].lo = 0;
+	V.hi = H[0];
+	V.lo = H[1];
+
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+	for (Htable[8]=V, i=4; i>0; i>>=1) {
+		REDUCE1BIT(V);
+		Htable[i] = V;
+	}
+
+	for (i=2; i<16; i<<=1) {
+		u128 *Hi = Htable+i;
+		int   j;
+		for (V=*Hi, j=1; j<i; ++j) {
+			Hi[j].hi = V.hi^Htable[j].hi;
+			Hi[j].lo = V.lo^Htable[j].lo;
+		}
+	}
+#else
+	Htable[8] = V;
+	REDUCE1BIT(V);
+	Htable[4] = V;
+	REDUCE1BIT(V);
+	Htable[2] = V;
+	REDUCE1BIT(V);
+	Htable[1] = V;
+	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
+	V=Htable[4];
+	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
+	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
+	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
+	V=Htable[8];
+	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
+	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
+	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
+	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
+	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
+	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
+	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
+#endif
+#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
+	/*
+	 * ARM assembler expects specific dword order in Htable.
+	 */
+	{
+	int j;
+	const union { long one; char little; } is_endian = {1};
+
+	if (is_endian.little)
+		for (j=0;j<16;++j) {
+			V = Htable[j];
+			Htable[j].hi = V.lo;
+			Htable[j].lo = V.hi;
+		}
+	else
+		for (j=0;j<16;++j) {
+			V = Htable[j];
+			Htable[j].hi = V.lo<<32|V.lo>>32;
+			Htable[j].lo = V.hi<<32|V.hi>>32;
+		}
+	}
+#endif
+}
+
+#ifndef GHASH_ASM
+static const size_t rem_4bit[16] = {
+	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
+
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
+{
+	u128 Z;
+	int cnt = 15;
+	size_t rem, nlo, nhi;
+	const union { long one; char little; } is_endian = {1};
+
+	nlo  = ((const u8 *)Xi)[15];
+	nhi  = nlo>>4;
+	nlo &= 0xf;
+
+	Z.hi = Htable[nlo].hi;
+	Z.lo = Htable[nlo].lo;
+
+	while (1) {
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nhi].hi;
+		Z.lo ^= Htable[nhi].lo;
+
+		if (--cnt<0)		break;
+
+		nlo  = ((const u8 *)Xi)[cnt];
+		nhi  = nlo>>4;
+		nlo &= 0xf;
+
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nlo].hi;
+		Z.lo ^= Htable[nlo].lo;
+	}
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+}
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+/*
+ * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
+ * details... Compiler-generated code doesn't seem to give any
+ * performance improvement, at least not on x86[_64]. It's here
+ * mostly as reference and a placeholder for possible future
+ * non-trivial optimization[s]...
+ */
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)
+{
+    u128 Z;
+    int cnt;
+    size_t rem, nlo, nhi;
+    const union { long one; char little; } is_endian = {1};
+
+#if 1
+    do {
+	cnt  = 15;
+	nlo  = ((const u8 *)Xi)[15];
+	nlo ^= inp[15];
+	nhi  = nlo>>4;
+	nlo &= 0xf;
+
+	Z.hi = Htable[nlo].hi;
+	Z.lo = Htable[nlo].lo;
+
+	while (1) {
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nhi].hi;
+		Z.lo ^= Htable[nhi].lo;
+
+		if (--cnt<0)		break;
+
+		nlo  = ((const u8 *)Xi)[cnt];
+		nlo ^= inp[cnt];
+		nhi  = nlo>>4;
+		nlo &= 0xf;
+
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nlo].hi;
+		Z.lo ^= Htable[nlo].lo;
+	}
+#else
+    /*
+     * Extra 256+16 bytes per-key plus 512 bytes shared tables
+     * [should] give ~50% improvement... One could have PACK()-ed
+     * the rem_8bit even here, but the priority is to minimize
+     * cache footprint...
+     */ 
+    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
+    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
+    static const unsigned short rem_8bit[256] = {
+	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
+	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
+	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
+	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
+	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
+	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
+	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
+	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
+	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
+	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
+	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
+	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
+	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
+	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
+	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
+	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
+	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
+	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
+	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
+	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
+	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
+	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
+	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
+	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
+	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
+	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
+	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
+	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
+	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
+	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
+	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
+	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
+    /*
+     * This pre-processing phase slows down procedure by approximately
+     * same time as it makes each loop spin faster. In other words
+     * single block performance is approximately same as straightforward
+     * "4-bit" implementation, and then it goes only faster...
+     */
+    for (cnt=0; cnt<16; ++cnt) {
+	Z.hi = Htable[cnt].hi;
+	Z.lo = Htable[cnt].lo;
+	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
+	Hshr4[cnt].hi = (Z.hi>>4);
+	Hshl4[cnt]    = (u8)(Z.lo<<4);
+    }
+
+    do {
+	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
+		nlo  = ((const u8 *)Xi)[cnt];
+		nlo ^= inp[cnt];
+		nhi  = nlo>>4;
+		nlo &= 0xf;
+
+		Z.hi ^= Htable[nlo].hi;
+		Z.lo ^= Htable[nlo].lo;
+
+		rem = (size_t)Z.lo&0xff;
+
+		Z.lo = (Z.hi<<56)|(Z.lo>>8);
+		Z.hi = (Z.hi>>8);
+
+		Z.hi ^= Hshr4[nhi].hi;
+		Z.lo ^= Hshr4[nhi].lo;
+		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
+	}
+
+	nlo  = ((const u8 *)Xi)[0];
+	nlo ^= inp[0];
+	nhi  = nlo>>4;
+	nlo &= 0xf;
+
+	Z.hi ^= Htable[nlo].hi;
+	Z.lo ^= Htable[nlo].lo;
+
+	rem = (size_t)Z.lo&0xf;
+
+	Z.lo = (Z.hi<<60)|(Z.lo>>4);
+	Z.hi = (Z.hi>>4);
+
+	Z.hi ^= Htable[nhi].hi;
+	Z.lo ^= Htable[nhi].lo;
+	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
+#endif
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+    } while (inp+=16, len-=16);
+}
+#endif
+#else
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+
+#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
+#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
+#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
+ * trashing effect. In other words idea is to hash data while it's
+ * still in L1 cache after encryption pass... */
+#define GHASH_CHUNK       (3*1024)
+#endif
+
+#else	/* TABLE_BITS */
+
+static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
+{
+	u128 V,Z = { 0,0 };
+	long X;
+	int  i,j;
+	const long *xi = (const long *)Xi;
+	const union { long one; char little; } is_endian = {1};
+
+	V.hi = H[0];	/* H is in host byte order, no byte swapping */
+	V.lo = H[1];
+
+	for (j=0; j<16/sizeof(long); ++j) {
+		if (is_endian.little) {
+			if (sizeof(long)==8) {
+#ifdef BSWAP8
+				X = (long)(BSWAP8(xi[j]));
+#else
+				const u8 *p = (const u8 *)(xi+j);
+				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
+#endif
+			}
+			else {
+				const u8 *p = (const u8 *)(xi+j);
+				X = (long)GETU32(p);
+			}
+		}
+		else
+			X = xi[j];
+
+		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
+			u64 M = (u64)(X>>(8*sizeof(long)-1));
+			Z.hi ^= V.hi&M;
+			Z.lo ^= V.lo&M;
+
+			REDUCE1BIT(V);
+		}
+	}
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+}
+#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
+
+#endif
+
+#if	TABLE_BITS==4 && defined(GHASH_ASM)
+# if	!defined(I386_ONLY) && \
+	(defined(__i386)	|| defined(__i386__)	|| \
+	 defined(__x86_64)	|| defined(__x86_64__)	|| \
+	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
+#  define GHASH_ASM_X86_OR_64
+#  define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#   define GHASH_ASM_X86
+void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#  endif
+# elif defined(__arm__) || defined(__arm)
+#  include "arm_arch.h"
+#  if __ARM_ARCH__>=7
+#   define GHASH_ASM_ARM
+#   define GCM_FUNCREF_4BIT
+void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#  endif
+# endif
+#endif
+
+#ifdef GCM_FUNCREF_4BIT
+# undef  GCM_MUL
+# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# ifdef GHASH
+#  undef  GHASH
+#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+# endif
+#endif
+
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
+{
+	const union { long one; char little; } is_endian = {1};
+
+	memset(ctx,0,sizeof(*ctx));
+	ctx->block = block;
+	ctx->key   = key;
+
+	(*block)(ctx->H.c,ctx->H.c,key);
+
+	if (is_endian.little) {
+		/* H is stored in host byte order */
+#ifdef BSWAP8
+		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+#else
+		u8 *p = ctx->H.c;
+		u64 hi,lo;
+		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
+		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
+		ctx->H.u[0] = hi;
+		ctx->H.u[1] = lo;
+#endif
+	}
+
+#if	TABLE_BITS==8
+	gcm_init_8bit(ctx->Htable,ctx->H.u);
+#elif	TABLE_BITS==4
+# if	defined(GHASH_ASM_X86_OR_64)
+#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
+	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
+	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
+		gcm_init_clmul(ctx->Htable,ctx->H.u);
+		ctx->gmult = gcm_gmult_clmul;
+		ctx->ghash = gcm_ghash_clmul;
+		return;
+	}
+#  endif
+	gcm_init_4bit(ctx->Htable,ctx->H.u);
+#  if	defined(GHASH_ASM_X86)			/* x86 only */
+#   if defined(OPENSSL_IA32_SSE2)
+	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
+#   else
+	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
+#   endif
+		ctx->gmult = gcm_gmult_4bit_mmx;
+		ctx->ghash = gcm_ghash_4bit_mmx;
+	} else {
+		ctx->gmult = gcm_gmult_4bit_x86;
+		ctx->ghash = gcm_ghash_4bit_x86;
+	}
+#  else
+	ctx->gmult = gcm_gmult_4bit;
+	ctx->ghash = gcm_ghash_4bit;
+#  endif
+# elif	defined(GHASH_ASM_ARM)
+	if (OPENSSL_armcap_P & ARMV7_NEON) {
+		ctx->gmult = gcm_gmult_neon;
+		ctx->ghash = gcm_ghash_neon;
+	} else {
+		gcm_init_4bit(ctx->Htable,ctx->H.u);
+		ctx->gmult = gcm_gmult_4bit;
+		ctx->ghash = gcm_ghash_4bit;
+	}
+# else
+	gcm_init_4bit(ctx->Htable,ctx->H.u);
+# endif
+#endif
+}
+
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int ctr;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+#endif
+
+	ctx->Yi.u[0]  = 0;
+	ctx->Yi.u[1]  = 0;
+	ctx->Xi.u[0]  = 0;
+	ctx->Xi.u[1]  = 0;
+	ctx->len.u[0] = 0;	/* AAD length */
+	ctx->len.u[1] = 0;	/* message length */
+	ctx->ares = 0;
+	ctx->mres = 0;
+
+	if (len==12) {
+		memcpy(ctx->Yi.c,iv,12);
+		ctx->Yi.c[15]=1;
+		ctr=1;
+	}
+	else {
+		size_t i;
+		u64 len0 = len;
+
+		while (len>=16) {
+			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
+			GCM_MUL(ctx,Yi);
+			iv += 16;
+			len -= 16;
+		}
+		if (len) {
+			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
+			GCM_MUL(ctx,Yi);
+		}
+		len0 <<= 3;
+		if (is_endian.little) {
+#ifdef BSWAP8
+			ctx->Yi.u[1]  ^= BSWAP8(len0);
+#else
+			ctx->Yi.c[8]  ^= (u8)(len0>>56);
+			ctx->Yi.c[9]  ^= (u8)(len0>>48);
+			ctx->Yi.c[10] ^= (u8)(len0>>40);
+			ctx->Yi.c[11] ^= (u8)(len0>>32);
+			ctx->Yi.c[12] ^= (u8)(len0>>24);
+			ctx->Yi.c[13] ^= (u8)(len0>>16);
+			ctx->Yi.c[14] ^= (u8)(len0>>8);
+			ctx->Yi.c[15] ^= (u8)(len0);
+#endif
+		}
+		else
+			ctx->Yi.u[1]  ^= len0;
+
+		GCM_MUL(ctx,Yi);
+
+		if (is_endian.little)
+			ctr = GETU32(ctx->Yi.c+12);
+		else
+			ctr = ctx->Yi.d[3];
+	}
+
+	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
+	++ctr;
+	if (is_endian.little)
+		PUTU32(ctx->Yi.c+12,ctr);
+	else
+		ctx->Yi.d[3] = ctr;
+}
+
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
+{
+	size_t i;
+	unsigned int n;
+	u64 alen = ctx->len.u[0];
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	if (ctx->len.u[1]) return -2;
+
+	alen += len;
+	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
+		return -1;
+	ctx->len.u[0] = alen;
+
+	n = ctx->ares;
+	if (n) {
+		while (n && len) {
+			ctx->Xi.c[n] ^= *(aad++);
+			--len;
+			n = (n+1)%16;
+		}
+		if (n==0) GCM_MUL(ctx,Xi);
+		else {
+			ctx->ares = n;
+			return 0;
+		}
+	}
+
+#ifdef GHASH
+	if ((i = (len&(size_t)-16))) {
+		GHASH(ctx,aad,i);
+		aad += i;
+		len -= i;
+	}
+#else
+	while (len>=16) {
+		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
+		GCM_MUL(ctx,Xi);
+		aad += 16;
+		len -= 16;
+	}
+#endif
+	if (len) {
+		n = (unsigned int)len;
+		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
+	}
+
+	ctx->ares = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64        mlen  = ctx->len.u[1];
+	block128_f block = ctx->block;
+	void      *key   = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+#if 0
+	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
+#endif
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to encrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+	if (16%sizeof(size_t) == 0) do {	/* always true actually */
+		if (n) {
+			while (n && len) {
+				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+				--len;
+				n = (n+1)%16;
+			}
+			if (n==0) GCM_MUL(ctx,Xi);
+			else {
+				ctx->mres = n;
+				return 0;
+			}
+		}
+#if defined(STRICT_ALIGNMENT)
+		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+			break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+		while (len>=GHASH_CHUNK) {
+		    size_t j=GHASH_CHUNK;
+
+		    while (j) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			j   -= 16;
+		    }
+		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
+		    len -= GHASH_CHUNK;
+		}
+		if ((i = (len&(size_t)-16))) {
+		    size_t j=i;
+
+		    while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		    }
+		    GHASH(ctx,out-j,j);
+		}
+#else
+		while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(ctx->Xi.c+i) ^=
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			GCM_MUL(ctx,Xi);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		}
+#endif
+		if (len) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			while (len--) {
+				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+				++n;
+			}
+		}
+
+		ctx->mres = n;
+		return 0;
+	} while(0);
+#endif
+	for (i=0;i<len;++i) {
+		if (n==0) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+		}
+		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
+		n = (n+1)%16;
+		if (n==0)
+			GCM_MUL(ctx,Xi);
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64        mlen  = ctx->len.u[1];
+	block128_f block = ctx->block;
+	void      *key   = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to decrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+	if (16%sizeof(size_t) == 0) do {	/* always true actually */
+		if (n) {
+			while (n && len) {
+				u8 c = *(in++);
+				*(out++) = c^ctx->EKi.c[n];
+				ctx->Xi.c[n] ^= c;
+				--len;
+				n = (n+1)%16;
+			}
+			if (n==0) GCM_MUL (ctx,Xi);
+			else {
+				ctx->mres = n;
+				return 0;
+			}
+		}
+#if defined(STRICT_ALIGNMENT)
+		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+			break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+		while (len>=GHASH_CHUNK) {
+		    size_t j=GHASH_CHUNK;
+
+		    GHASH(ctx,in,GHASH_CHUNK);
+		    while (j) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			j   -= 16;
+		    }
+		    len -= GHASH_CHUNK;
+		}
+		if ((i = (len&(size_t)-16))) {
+		    GHASH(ctx,in,i);
+		    while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		    }
+		}
+#else
+		while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t)) {
+				size_t c = *(size_t *)(in+i);
+				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
+				*(size_t *)(ctx->Xi.c+i) ^= c;
+			}
+			GCM_MUL(ctx,Xi);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		}
+#endif
+		if (len) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			while (len--) {
+				u8 c = in[n];
+				ctx->Xi.c[n] ^= c;
+				out[n] = c^ctx->EKi.c[n];
+				++n;
+			}
+		}
+
+		ctx->mres = n;
+		return 0;
+	} while(0);
+#endif
+	for (i=0;i<len;++i) {
+		u8 c;
+		if (n==0) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+		}
+		c = in[i];
+		out[i] = c^ctx->EKi.c[n];
+		ctx->Xi.c[n] ^= c;
+		n = (n+1)%16;
+		if (n==0)
+			GCM_MUL(ctx,Xi);
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len, ctr128_f stream)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64   mlen = ctx->len.u[1];
+	void *key  = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to encrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+	if (n) {
+		while (n && len) {
+			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+			--len;
+			n = (n+1)%16;
+		}
+		if (n==0) GCM_MUL(ctx,Xi);
+		else {
+			ctx->mres = n;
+			return 0;
+		}
+	}
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+	while (len>=GHASH_CHUNK) {
+		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+		ctr += GHASH_CHUNK/16;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		GHASH(ctx,out,GHASH_CHUNK);
+		out += GHASH_CHUNK;
+		in  += GHASH_CHUNK;
+		len -= GHASH_CHUNK;
+	}
+#endif
+	if ((i = (len&(size_t)-16))) {
+		size_t j=i/16;
+
+		(*stream)(in,out,j,key,ctx->Yi.c);
+		ctr += (unsigned int)j;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		in  += i;
+		len -= i;
+#if defined(GHASH)
+		GHASH(ctx,out,i);
+		out += i;
+#else
+		while (j--) {
+			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
+			GCM_MUL(ctx,Xi);
+			out += 16;
+		}
+#endif
+	}
+	if (len) {
+		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+		++ctr;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		while (len--) {
+			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+			++n;
+		}
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len,ctr128_f stream)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64   mlen = ctx->len.u[1];
+	void *key  = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to decrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+	if (n) {
+		while (n && len) {
+			u8 c = *(in++);
+			*(out++) = c^ctx->EKi.c[n];
+			ctx->Xi.c[n] ^= c;
+			--len;
+			n = (n+1)%16;
+		}
+		if (n==0) GCM_MUL (ctx,Xi);
+		else {
+			ctx->mres = n;
+			return 0;
+		}
+	}
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+	while (len>=GHASH_CHUNK) {
+		GHASH(ctx,in,GHASH_CHUNK);
+		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+		ctr += GHASH_CHUNK/16;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		out += GHASH_CHUNK;
+		in  += GHASH_CHUNK;
+		len -= GHASH_CHUNK;
+	}
+#endif
+	if ((i = (len&(size_t)-16))) {
+		size_t j=i/16;
+
+#if defined(GHASH)
+		GHASH(ctx,in,i);
+#else
+		while (j--) {
+			size_t k;
+			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
+			GCM_MUL(ctx,Xi);
+			in += 16;
+		}
+		j   = i/16;
+		in -= i;
+#endif
+		(*stream)(in,out,j,key,ctx->Yi.c);
+		ctr += (unsigned int)j;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		out += i;
+		in  += i;
+		len -= i;
+	}
+	if (len) {
+		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+		++ctr;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		while (len--) {
+			u8 c = in[n];
+			ctx->Xi.c[n] ^= c;
+			out[n] = c^ctx->EKi.c[n];
+			++n;
+		}
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
+			size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	u64 alen = ctx->len.u[0]<<3;
+	u64 clen = ctx->len.u[1]<<3;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+#endif
+
+	if (ctx->mres)
+		GCM_MUL(ctx,Xi);
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		alen = BSWAP8(alen);
+		clen = BSWAP8(clen);
+#else
+		u8 *p = ctx->len.c;
+
+		ctx->len.u[0] = alen;
+		ctx->len.u[1] = clen;
+
+		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
+		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
+#endif
+	}
+
+	ctx->Xi.u[0] ^= alen;
+	ctx->Xi.u[1] ^= clen;
+	GCM_MUL(ctx,Xi);
+
+	ctx->Xi.u[0] ^= ctx->EK0.u[0];
+	ctx->Xi.u[1] ^= ctx->EK0.u[1];
+
+	if (tag && len<=sizeof(ctx->Xi))
+		return memcmp(ctx->Xi.c,tag,len);
+	else
+		return -1;
+}
+
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+	CRYPTO_gcm128_finish(ctx, NULL, 0);
+	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
+}
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
+{
+	GCM128_CONTEXT *ret;
+
+	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
+		CRYPTO_gcm128_init(ret,key,block);
+
+	return ret;
+}
+
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
+{
+	if (ctx) {
+		OPENSSL_cleanse(ctx,sizeof(*ctx));
+		OPENSSL_free(ctx);
+	}
+}
+
+#if defined(SELFTEST)
+#include <stdio.h>
+#include <openssl/aes.h>
+
+/* Test Case 1 */
+static const u8	K1[16],
+		*P1=NULL,
+		*A1=NULL,
+		IV1[12],
+		*C1=NULL,
+		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
+
+/* Test Case 2 */
+#define K2 K1
+#define A2 A1
+#define IV2 IV1
+static const u8	P2[16],
+		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
+		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
+
+/* Test Case 3 */
+#define A3 A2
+static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
+		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
+
+/* Test Case 4 */
+#define K4 K3
+#define IV4 IV3
+static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+			0xab,0xad,0xda,0xd2},
+		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
+		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
+
+/* Test Case 5 */
+#define K5 K4
+#define P5 P4
+#define A5 A4
+static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
+			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
+			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
+			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
+		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
+
+/* Test Case 6 */
+#define K6 K5
+#define P6 P5
+#define A6 A5
+static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
+			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
+			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
+			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
+		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
+
+/* Test Case 7 */
+static const u8 K7[24],
+		*P7=NULL,
+		*A7=NULL,
+		IV7[12],
+		*C7=NULL,
+		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
+
+/* Test Case 8 */
+#define K8 K7
+#define IV8 IV7
+#define A8 A7
+static const u8	P8[16],
+		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
+		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
+
+/* Test Case 9 */
+#define A9 A8
+static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
+		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
+		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
+
+/* Test Case 10 */
+#define K10 K9
+#define IV10 IV9
+static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+			0xab,0xad,0xda,0xd2},
+		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
+		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
+
+/* Test Case 11 */
+#define K11 K10
+#define P11 P10
+#define A11 A10
+static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
+			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
+			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
+			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
+		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
+
+/* Test Case 12 */
+#define K12 K11
+#define P12 P11
+#define A12 A11
+static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
+			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
+			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
+			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
+		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
+
+/* Test Case 13 */
+static const u8	K13[32],
+		*P13=NULL,
+		*A13=NULL,
+		IV13[12],
+		*C13=NULL,
+		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
+
+/* Test Case 14 */
+#define K14 K13
+#define A14 A13
+static const u8	P14[16],
+		IV14[12],
+		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
+		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
+
+/* Test Case 15 */
+#define A15 A14
+static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
+		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
+
+/* Test Case 16 */
+#define K16 K15
+#define IV16 IV15
+static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+			0xab,0xad,0xda,0xd2},
+		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
+		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
+
+/* Test Case 17 */
+#define K17 K16
+#define P17 P16
+#define A17 A16
+static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
+			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
+			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
+			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
+		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
+
+/* Test Case 18 */
+#define K18 K17
+#define P18 P17
+#define A18 A17
+static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
+			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
+			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
+			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
+		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
+
+#define TEST_CASE(n)	do {					\
+	u8 out[sizeof(P##n)];					\
+	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
+	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\
+	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
+	memset(out,0,sizeof(out));				\
+	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
+	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
+	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
+	    (C##n && memcmp(out,C##n,sizeof(out))))		\
+		ret++, printf ("encrypt test#%d failed.\n",n);	\
+	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
+	memset(out,0,sizeof(out));				\
+	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
+	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
+	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
+	    (P##n && memcmp(out,P##n,sizeof(out))))		\
+		ret++, printf ("decrypt test#%d failed.\n",n);	\
+	} while(0)
+
+int main()
+{
+	GCM128_CONTEXT ctx;
+	AES_KEY key;
+	int ret=0;
+
+	TEST_CASE(1);
+	TEST_CASE(2);
+	TEST_CASE(3);
+	TEST_CASE(4);
+	TEST_CASE(5);
+	TEST_CASE(6);
+	TEST_CASE(7);
+	TEST_CASE(8);
+	TEST_CASE(9);
+	TEST_CASE(10);
+	TEST_CASE(11);
+	TEST_CASE(12);
+	TEST_CASE(13);
+	TEST_CASE(14);
+	TEST_CASE(15);
+	TEST_CASE(16);
+	TEST_CASE(17);
+	TEST_CASE(18);
+
+#ifdef OPENSSL_CPUID_OBJ
+	{
+	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
+	union { u64 u; u8 c[1024]; } buf;
+	int i;
+
+	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
+	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
+	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
+
+	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+	start = OPENSSL_rdtsc();
+	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+	gcm_t = OPENSSL_rdtsc() - start;
+
+	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+			(block128_f)AES_encrypt);
+	start = OPENSSL_rdtsc();
+	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+			(block128_f)AES_encrypt);
+	ctr_t = OPENSSL_rdtsc() - start;
+
+	printf("%.2f-%.2f=%.2f\n",
+			gcm_t/(double)sizeof(buf),
+			ctr_t/(double)sizeof(buf),
+			(gcm_t-ctr_t)/(double)sizeof(buf));
+#ifdef GHASH
+	GHASH(&ctx,buf.c,sizeof(buf));
+	start = OPENSSL_rdtsc();
+	for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
+	gcm_t = OPENSSL_rdtsc() - start;
+	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
+#endif
+	}
+#endif
+
+	return ret;
+}
+#endif

diff --git a/crypto/modes/modes_lcl.h b/crypto/modes/modes_lcl.h
new file mode 100644
index 0000000..7a82a98
--- /dev/null
+++ b/crypto/modes/modes_lcl.h

@@ -0,0 +1,131 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use is governed by OpenSSL license.
+ * ====================================================================
+ */
+
+#include <openssl/modes.h>
+
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+typedef __int64 i64;
+typedef unsigned __int64 u64;
+#define U64(C) C##UI64
+#elif defined(__arch64__)
+typedef long i64;
+typedef unsigned long u64;
+#define U64(C) C##UL
+#else
+typedef long long i64;
+typedef unsigned long long u64;
+#define U64(C) C##ULL
+#endif
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#define STRICT_ALIGNMENT 1
+#if defined(__i386)	|| defined(__i386__)	|| \
+    defined(__x86_64)	|| defined(__x86_64__)	|| \
+    defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \
+    defined(__s390__)	|| defined(__s390x__)	|| \
+    ( (defined(__arm__)	|| defined(__arm)) && \
+      (defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__) || \
+       defined(__ARM_ARCH_7R__)	|| defined(__ARM_ARCH_7M__)) )
+# undef STRICT_ALIGNMENT
+#endif
+
+#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#if defined(__GNUC__) && __GNUC__>=2
+# if defined(__x86_64) || defined(__x86_64__)
+#  define BSWAP8(x) ({	u64 ret=(x);			\
+			asm ("bswapq %0"		\
+			: "+r"(ret));	ret;		})
+#  define BSWAP4(x) ({	u32 ret=(x);			\
+			asm ("bswapl %0"		\
+			: "+r"(ret));	ret;		})
+# elif (defined(__i386) || defined(__i386__))
+#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\
+			asm ("bswapl %0; bswapl %1"	\
+			: "+r"(hi),"+r"(lo));		\
+			(u64)hi<<32|lo;			})
+#  define BSWAP4(x) ({	u32 ret=(x);			\
+			asm ("bswapl %0"		\
+			: "+r"(ret));	ret;		})
+# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
+#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\
+			asm ("rev %0,%0; rev %1,%1"	\
+			: "+r"(hi),"+r"(lo));		\
+			(u64)hi<<32|lo;			})
+#  define BSWAP4(x) ({	u32 ret;			\
+			asm ("rev %0,%1"		\
+			: "=r"(ret) : "r"((u32)(x)));	\
+			ret;				})
+# endif
+#elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
+#  define BSWAP8(x)	_byteswap_uint64((u64)(x))
+#  define BSWAP4(x)	_byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+   __inline u32 _bswap4(u32 val) {
+	_asm mov eax,val
+	_asm bswap eax
+   }
+#  define BSWAP4(x)	_bswap4(x)
+# endif
+#endif
+#endif
+
+#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
+#define GETU32(p)	BSWAP4(*(const u32 *)(p))
+#define PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
+#else
+#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+#endif
+
+/* GCM definitions */
+
+typedef struct { u64 hi,lo; } u128;
+
+#ifdef	TABLE_BITS
+#undef	TABLE_BITS
+#endif
+/*
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8 [or 1]. For further information see gcm128.c.
+ */
+#define	TABLE_BITS 4
+
+struct gcm128_context {
+	/* Following 6 names follow names in GCM specification */
+	union { u64 u[2]; u32 d[4]; u8 c[16]; }	Yi,EKi,EK0,len,
+						Xi,H;
+	/* Relative position of Xi, H and pre-computed Htable is used
+	 * in some assembler modules, i.e. don't change the order! */
+#if TABLE_BITS==8
+	u128 Htable[256];
+#else
+	u128 Htable[16];
+	void (*gmult)(u64 Xi[2],const u128 Htable[16]);
+	void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+	unsigned int mres, ares;
+	block128_f block;
+	void *key;
+};
+
+struct xts128_context {
+	void      *key1, *key2;
+	block128_f block1,block2;
+};
+
+struct ccm128_context {
+	union { u64 u[2]; u8 c[16]; } nonce, cmac;
+	u64 blocks;
+	block128_f block;
+	void *key;
+};
+

diff --git a/crypto/modes/ofb128.c b/crypto/modes/ofb128.c
index c732e2e..01c0170 100644
--- a/crypto/modes/ofb128.c
+++ b/crypto/modes/ofb128.c

@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output encrypted as though 128bit ofb mode is being
  * used.  The extra state information to record how much of the
  * 128bit block we have used is contained in *num;

diff --git a/crypto/modes/xts128.c b/crypto/modes/xts128.c
new file mode 100644
index 0000000..9cf27a2
--- /dev/null
+++ b/crypto/modes/xts128.c

@@ -0,0 +1,187 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+	const unsigned char *inp, unsigned char *out,
+	size_t len, int enc)
+{
+	const union { long one; char little; } is_endian = {1};
+	union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
+	unsigned int i;
+
+	if (len<16) return -1;
+
+	memcpy(tweak.c, iv, 16);
+
+	(*ctx->block2)(tweak.c,tweak.c,ctx->key2);
+
+	if (!enc && (len%16)) len-=16;
+
+	while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+		memcpy(scratch.c,inp,16);
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+#else
+		scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
+		scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
+#endif
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		memcpy(out,scratch.c,16);
+#else
+		((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
+		((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
+#endif
+		inp += 16;
+		out += 16;
+		len -= 16;
+
+		if (len==0)	return 0;
+
+		if (is_endian.little) {
+			unsigned int carry,res;
+			
+			res = 0x87&(((int)tweak.d[3])>>31);
+			carry = (unsigned int)(tweak.u[0]>>63);
+			tweak.u[0] = (tweak.u[0]<<1)^res;
+			tweak.u[1] = (tweak.u[1]<<1)|carry;
+		}
+		else {
+			size_t c;
+
+			for (c=0,i=0;i<16;++i) {
+				/*+ substitutes for |, because c is 1 bit */ 
+				c += ((size_t)tweak.c[i])<<1;
+				tweak.c[i] = (u8)c;
+				c = c>>8;
+			}
+			tweak.c[0] ^= (u8)(0x87&(0-c));
+		}
+	}
+	if (enc) {
+		for (i=0;i<len;++i) {
+			u8 c = inp[i];
+			out[i] = scratch.c[i];
+			scratch.c[i] = c;
+		}
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		memcpy(out-16,scratch.c,16);
+	}
+	else {
+		union { u64 u[2]; u8 c[16]; } tweak1;
+
+		if (is_endian.little) {
+			unsigned int carry,res;
+
+			res = 0x87&(((int)tweak.d[3])>>31);
+			carry = (unsigned int)(tweak.u[0]>>63);
+			tweak1.u[0] = (tweak.u[0]<<1)^res;
+			tweak1.u[1] = (tweak.u[1]<<1)|carry;
+		}
+		else {
+			size_t c;
+
+			for (c=0,i=0;i<16;++i) {
+				/*+ substitutes for |, because c is 1 bit */ 
+				c += ((size_t)tweak.c[i])<<1;
+				tweak1.c[i] = (u8)c;
+				c = c>>8;
+			}
+			tweak1.c[0] ^= (u8)(0x87&(0-c));
+		}
+#if defined(STRICT_ALIGNMENT)
+		memcpy(scratch.c,inp,16);
+		scratch.u[0] ^= tweak1.u[0];
+		scratch.u[1] ^= tweak1.u[1];
+#else
+		scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
+		scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
+#endif
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+		scratch.u[0] ^= tweak1.u[0];
+		scratch.u[1] ^= tweak1.u[1];
+
+		for (i=0;i<len;++i) {
+			u8 c = inp[16+i];
+			out[16+i] = scratch.c[i];
+			scratch.c[i] = c;
+		}
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		memcpy (out,scratch.c,16);
+#else
+		((u64*)out)[0] = scratch.u[0]^tweak.u[0];
+		((u64*)out)[1] = scratch.u[1]^tweak.u[1];
+#endif
+	}
+
+	return 0;
+}

diff --git a/crypto/o_init.c b/crypto/o_init.c
new file mode 100644
index 0000000..db4cdc4
--- /dev/null
+++ b/crypto/o_init.c

@@ -0,0 +1,82 @@
+/* o_init.c */
+/* Written by Dr Stephen N Henson ([email protected]) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <e_os.h>
+#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/rand.h>
+#endif
+
+/* Perform any essential OpenSSL initialization operations.
+ * Currently only sets FIPS callbacks
+ */
+
+void OPENSSL_init(void)
+	{
+	static int done = 0;
+	if (done)
+		return;
+	done = 1;
+#ifdef OPENSSL_FIPS
+	FIPS_set_locking_callbacks(CRYPTO_lock, CRYPTO_add_lock);
+	FIPS_set_error_callbacks(ERR_put_error, ERR_add_error_vdata);
+	FIPS_set_malloc_callbacks(CRYPTO_malloc, CRYPTO_free);
+	RAND_init_fips();
+#endif
+#if 0
+	fprintf(stderr, "Called OPENSSL_init\n");
+#endif
+	}
+

diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h
index 6449be6..d404ad0 100644
--- a/crypto/objects/obj_dat.h
+++ b/crypto/objects/obj_dat.h

@@ -62,12 +62,12 @@
  * [including the GNU Public Licence.]
  */
 
-#define NUM_NID 893
-#define NUM_SN 886
-#define NUM_LN 886
-#define NUM_OBJ 840
+#define NUM_NID 920
+#define NUM_SN 913
+#define NUM_LN 913
+#define NUM_OBJ 857
 
-static const unsigned char lvalues[5824]={
+static const unsigned char lvalues[5980]={
 0x00,                                        /* [  0] OBJ_undef */
 0x2A,0x86,0x48,0x86,0xF7,0x0D,               /* [  1] OBJ_rsadsi */
 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,          /* [  7] OBJ_pkcs */
@@ -908,6 +908,23 @@
 0x55,0x04,0x34,                              /* [5814] OBJ_supportedAlgorithms */
 0x55,0x04,0x35,                              /* [5817] OBJ_deltaRevocationList */
 0x55,0x04,0x36,                              /* [5820] OBJ_dmdName */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x09,0x10,0x03,0x09,/* [5823] OBJ_id_alg_PWRI_KEK */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x06,/* [5834] OBJ_aes_128_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x07,/* [5843] OBJ_aes_128_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x08,/* [5852] OBJ_id_aes128_wrap_pad */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1A,/* [5861] OBJ_aes_192_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1B,/* [5870] OBJ_aes_192_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1C,/* [5879] OBJ_id_aes192_wrap_pad */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x2E,/* [5888] OBJ_aes_256_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x2F,/* [5897] OBJ_aes_256_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x30,/* [5906] OBJ_id_aes256_wrap_pad */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x02,/* [5915] OBJ_id_camellia128_wrap */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x03,/* [5926] OBJ_id_camellia192_wrap */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x04,/* [5937] OBJ_id_camellia256_wrap */
+0x55,0x1D,0x25,0x00,                         /* [5948] OBJ_anyExtendedKeyUsage */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x08,/* [5952] OBJ_mgf1 */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x0A,/* [5961] OBJ_rsassaPss */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x07,/* [5970] OBJ_rsaesOaep */
 };
 
 static const ASN1_OBJECT nid_objs[NUM_NID]={
@@ -2351,28 +2368,74 @@
 {"deltaRevocationList","deltaRevocationList",NID_deltaRevocationList,
 	3,&(lvalues[5817]),0},
 {"dmdName","dmdName",NID_dmdName,3,&(lvalues[5820]),0},
+{"id-alg-PWRI-KEK","id-alg-PWRI-KEK",NID_id_alg_PWRI_KEK,11,
+	&(lvalues[5823]),0},
+{"CMAC","cmac",NID_cmac,0,NULL,0},
+{"id-aes128-GCM","aes-128-gcm",NID_aes_128_gcm,9,&(lvalues[5834]),0},
+{"id-aes128-CCM","aes-128-ccm",NID_aes_128_ccm,9,&(lvalues[5843]),0},
+{"id-aes128-wrap-pad","id-aes128-wrap-pad",NID_id_aes128_wrap_pad,9,
+	&(lvalues[5852]),0},
+{"id-aes192-GCM","aes-192-gcm",NID_aes_192_gcm,9,&(lvalues[5861]),0},
+{"id-aes192-CCM","aes-192-ccm",NID_aes_192_ccm,9,&(lvalues[5870]),0},
+{"id-aes192-wrap-pad","id-aes192-wrap-pad",NID_id_aes192_wrap_pad,9,
+	&(lvalues[5879]),0},
+{"id-aes256-GCM","aes-256-gcm",NID_aes_256_gcm,9,&(lvalues[5888]),0},
+{"id-aes256-CCM","aes-256-ccm",NID_aes_256_ccm,9,&(lvalues[5897]),0},
+{"id-aes256-wrap-pad","id-aes256-wrap-pad",NID_id_aes256_wrap_pad,9,
+	&(lvalues[5906]),0},
+{"AES-128-CTR","aes-128-ctr",NID_aes_128_ctr,0,NULL,0},
+{"AES-192-CTR","aes-192-ctr",NID_aes_192_ctr,0,NULL,0},
+{"AES-256-CTR","aes-256-ctr",NID_aes_256_ctr,0,NULL,0},
+{"id-camellia128-wrap","id-camellia128-wrap",NID_id_camellia128_wrap,
+	11,&(lvalues[5915]),0},
+{"id-camellia192-wrap","id-camellia192-wrap",NID_id_camellia192_wrap,
+	11,&(lvalues[5926]),0},
+{"id-camellia256-wrap","id-camellia256-wrap",NID_id_camellia256_wrap,
+	11,&(lvalues[5937]),0},
+{"anyExtendedKeyUsage","Any Extended Key Usage",
+	NID_anyExtendedKeyUsage,4,&(lvalues[5948]),0},
+{"MGF1","mgf1",NID_mgf1,9,&(lvalues[5952]),0},
+{"RSASSA-PSS","rsassaPss",NID_rsassaPss,9,&(lvalues[5961]),0},
+{"AES-128-XTS","aes-128-xts",NID_aes_128_xts,0,NULL,0},
+{"AES-256-XTS","aes-256-xts",NID_aes_256_xts,0,NULL,0},
+{"RC4-HMAC-MD5","rc4-hmac-md5",NID_rc4_hmac_md5,0,NULL,0},
+{"AES-128-CBC-HMAC-SHA1","aes-128-cbc-hmac-sha1",
+	NID_aes_128_cbc_hmac_sha1,0,NULL,0},
+{"AES-192-CBC-HMAC-SHA1","aes-192-cbc-hmac-sha1",
+	NID_aes_192_cbc_hmac_sha1,0,NULL,0},
+{"AES-256-CBC-HMAC-SHA1","aes-256-cbc-hmac-sha1",
+	NID_aes_256_cbc_hmac_sha1,0,NULL,0},
+{"RSAES-OAEP","rsaesOaep",NID_rsaesOaep,9,&(lvalues[5970]),0},
 };
 
 static const unsigned int sn_objs[NUM_SN]={
 364,	/* "AD_DVCS" */
 419,	/* "AES-128-CBC" */
+916,	/* "AES-128-CBC-HMAC-SHA1" */
 421,	/* "AES-128-CFB" */
 650,	/* "AES-128-CFB1" */
 653,	/* "AES-128-CFB8" */
+904,	/* "AES-128-CTR" */
 418,	/* "AES-128-ECB" */
 420,	/* "AES-128-OFB" */
+913,	/* "AES-128-XTS" */
 423,	/* "AES-192-CBC" */
+917,	/* "AES-192-CBC-HMAC-SHA1" */
 425,	/* "AES-192-CFB" */
 651,	/* "AES-192-CFB1" */
 654,	/* "AES-192-CFB8" */
+905,	/* "AES-192-CTR" */
 422,	/* "AES-192-ECB" */
 424,	/* "AES-192-OFB" */
 427,	/* "AES-256-CBC" */
+918,	/* "AES-256-CBC-HMAC-SHA1" */
 429,	/* "AES-256-CFB" */
 652,	/* "AES-256-CFB1" */
 655,	/* "AES-256-CFB8" */
+906,	/* "AES-256-CTR" */
 426,	/* "AES-256-ECB" */
 428,	/* "AES-256-OFB" */
+914,	/* "AES-256-XTS" */
 91,	/* "BF-CBC" */
 93,	/* "BF-CFB" */
 92,	/* "BF-ECB" */
@@ -2400,6 +2463,7 @@
 110,	/* "CAST5-CFB" */
 109,	/* "CAST5-ECB" */
 111,	/* "CAST5-OFB" */
+894,	/* "CMAC" */
 13,	/* "CN" */
 141,	/* "CRLReason" */
 417,	/* "CSPName" */
@@ -2451,6 +2515,7 @@
  4,	/* "MD5" */
 114,	/* "MD5-SHA1" */
 95,	/* "MDC2" */
+911,	/* "MGF1" */
 388,	/* "Mail" */
 393,	/* "NULL" */
 404,	/* "NULL" */
@@ -2487,6 +2552,7 @@
 40,	/* "RC2-OFB" */
  5,	/* "RC4" */
 97,	/* "RC4-40" */
+915,	/* "RC4-HMAC-MD5" */
 120,	/* "RC5-CBC" */
 122,	/* "RC5-CFB" */
 121,	/* "RC5-ECB" */
@@ -2507,6 +2573,8 @@
 668,	/* "RSA-SHA256" */
 669,	/* "RSA-SHA384" */
 670,	/* "RSA-SHA512" */
+919,	/* "RSAES-OAEP" */
+912,	/* "RSASSA-PSS" */
 777,	/* "SEED-CBC" */
 779,	/* "SEED-CFB" */
 776,	/* "SEED-ECB" */
@@ -2540,6 +2608,7 @@
 363,	/* "ad_timestamping" */
 376,	/* "algorithm" */
 405,	/* "ansi-X9-62" */
+910,	/* "anyExtendedKeyUsage" */
 746,	/* "anyPolicy" */
 370,	/* "archiveCutoff" */
 484,	/* "associatedDomain" */
@@ -2716,14 +2785,27 @@
 357,	/* "id-aca-group" */
 358,	/* "id-aca-role" */
 176,	/* "id-ad" */
+896,	/* "id-aes128-CCM" */
+895,	/* "id-aes128-GCM" */
 788,	/* "id-aes128-wrap" */
+897,	/* "id-aes128-wrap-pad" */
+899,	/* "id-aes192-CCM" */
+898,	/* "id-aes192-GCM" */
 789,	/* "id-aes192-wrap" */
+900,	/* "id-aes192-wrap-pad" */
+902,	/* "id-aes256-CCM" */
+901,	/* "id-aes256-GCM" */
 790,	/* "id-aes256-wrap" */
+903,	/* "id-aes256-wrap-pad" */
 262,	/* "id-alg" */
+893,	/* "id-alg-PWRI-KEK" */
 323,	/* "id-alg-des40" */
 326,	/* "id-alg-dh-pop" */
 325,	/* "id-alg-dh-sig-hmac-sha1" */
 324,	/* "id-alg-noSignature" */
+907,	/* "id-camellia128-wrap" */
+908,	/* "id-camellia192-wrap" */
+909,	/* "id-camellia256-wrap" */
 268,	/* "id-cct" */
 361,	/* "id-cct-PKIData" */
 362,	/* "id-cct-PKIResponse" */
@@ -3246,6 +3328,7 @@
 363,	/* "AD Time Stamping" */
 405,	/* "ANSI X9.62" */
 368,	/* "Acceptable OCSP Responses" */
+910,	/* "Any Extended Key Usage" */
 664,	/* "Any language" */
 177,	/* "Authority Information Access" */
 365,	/* "Basic OCSP Response" */
@@ -3386,23 +3469,37 @@
 364,	/* "ad dvcs" */
 606,	/* "additional verification" */
 419,	/* "aes-128-cbc" */
+916,	/* "aes-128-cbc-hmac-sha1" */
+896,	/* "aes-128-ccm" */
 421,	/* "aes-128-cfb" */
 650,	/* "aes-128-cfb1" */
 653,	/* "aes-128-cfb8" */
+904,	/* "aes-128-ctr" */
 418,	/* "aes-128-ecb" */
+895,	/* "aes-128-gcm" */
 420,	/* "aes-128-ofb" */
+913,	/* "aes-128-xts" */
 423,	/* "aes-192-cbc" */
+917,	/* "aes-192-cbc-hmac-sha1" */
+899,	/* "aes-192-ccm" */
 425,	/* "aes-192-cfb" */
 651,	/* "aes-192-cfb1" */
 654,	/* "aes-192-cfb8" */
+905,	/* "aes-192-ctr" */
 422,	/* "aes-192-ecb" */
+898,	/* "aes-192-gcm" */
 424,	/* "aes-192-ofb" */
 427,	/* "aes-256-cbc" */
+918,	/* "aes-256-cbc-hmac-sha1" */
+902,	/* "aes-256-ccm" */
 429,	/* "aes-256-cfb" */
 652,	/* "aes-256-cfb1" */
 655,	/* "aes-256-cfb8" */
+906,	/* "aes-256-ctr" */
 426,	/* "aes-256-ecb" */
+901,	/* "aes-256-gcm" */
 428,	/* "aes-256-ofb" */
+914,	/* "aes-256-xts" */
 376,	/* "algorithm" */
 484,	/* "associatedDomain" */
 485,	/* "associatedName" */
@@ -3467,6 +3564,7 @@
 407,	/* "characteristic-two-field" */
 395,	/* "clearance" */
 633,	/* "cleartext track 2" */
+894,	/* "cmac" */
 13,	/* "commonName" */
 513,	/* "content types" */
 50,	/* "contentType" */
@@ -3602,13 +3700,20 @@
 358,	/* "id-aca-role" */
 176,	/* "id-ad" */
 788,	/* "id-aes128-wrap" */
+897,	/* "id-aes128-wrap-pad" */
 789,	/* "id-aes192-wrap" */
+900,	/* "id-aes192-wrap-pad" */
 790,	/* "id-aes256-wrap" */
+903,	/* "id-aes256-wrap-pad" */
 262,	/* "id-alg" */
+893,	/* "id-alg-PWRI-KEK" */
 323,	/* "id-alg-des40" */
 326,	/* "id-alg-dh-pop" */
 325,	/* "id-alg-dh-sig-hmac-sha1" */
 324,	/* "id-alg-noSignature" */
+907,	/* "id-camellia128-wrap" */
+908,	/* "id-camellia192-wrap" */
+909,	/* "id-camellia256-wrap" */
 268,	/* "id-cct" */
 361,	/* "id-cct-PKIData" */
 362,	/* "id-cct-PKIResponse" */
@@ -3806,6 +3911,7 @@
 602,	/* "merchant initiated auth" */
 514,	/* "message extensions" */
 51,	/* "messageDigest" */
+911,	/* "mgf1" */
 506,	/* "mime-mhs-bodies" */
 505,	/* "mime-mhs-headings" */
 488,	/* "mobileTelephoneNumber" */
@@ -3889,6 +3995,7 @@
 40,	/* "rc2-ofb" */
  5,	/* "rc4" */
 97,	/* "rc4-40" */
+915,	/* "rc4-hmac-md5" */
 120,	/* "rc5-cbc" */
 122,	/* "rc5-cfb" */
 121,	/* "rc5-ecb" */
@@ -3905,6 +4012,8 @@
  6,	/* "rsaEncryption" */
 644,	/* "rsaOAEPEncryptionSET" */
 377,	/* "rsaSignature" */
+919,	/* "rsaesOaep" */
+912,	/* "rsassaPss" */
 124,	/* "run length compression" */
 482,	/* "sOARecord" */
 155,	/* "safeContentsBag" */
@@ -4254,6 +4363,7 @@
 96,	/* OBJ_mdc2WithRSA                  2 5 8 3 100 */
 95,	/* OBJ_mdc2                         2 5 8 3 101 */
 746,	/* OBJ_any_policy                   2 5 29 32 0 */
+910,	/* OBJ_anyExtendedKeyUsage          2 5 29 37 0 */
 519,	/* OBJ_setct_PANData                2 23 42 0 0 */
 520,	/* OBJ_setct_PANToken               2 23 42 0 1 */
 521,	/* OBJ_setct_PANOnly                2 23 42 0 2 */
@@ -4720,6 +4830,9 @@
  8,	/* OBJ_md5WithRSAEncryption         1 2 840 113549 1 1 4 */
 65,	/* OBJ_sha1WithRSAEncryption        1 2 840 113549 1 1 5 */
 644,	/* OBJ_rsaOAEPEncryptionSET         1 2 840 113549 1 1 6 */
+919,	/* OBJ_rsaesOaep                    1 2 840 113549 1 1 7 */
+911,	/* OBJ_mgf1                         1 2 840 113549 1 1 8 */
+912,	/* OBJ_rsassaPss                    1 2 840 113549 1 1 10 */
 668,	/* OBJ_sha256WithRSAEncryption      1 2 840 113549 1 1 11 */
 669,	/* OBJ_sha384WithRSAEncryption      1 2 840 113549 1 1 12 */
 670,	/* OBJ_sha512WithRSAEncryption      1 2 840 113549 1 1 13 */
@@ -4785,16 +4898,25 @@
 420,	/* OBJ_aes_128_ofb128               2 16 840 1 101 3 4 1 3 */
 421,	/* OBJ_aes_128_cfb128               2 16 840 1 101 3 4 1 4 */
 788,	/* OBJ_id_aes128_wrap               2 16 840 1 101 3 4 1 5 */
+895,	/* OBJ_aes_128_gcm                  2 16 840 1 101 3 4 1 6 */
+896,	/* OBJ_aes_128_ccm                  2 16 840 1 101 3 4 1 7 */
+897,	/* OBJ_id_aes128_wrap_pad           2 16 840 1 101 3 4 1 8 */
 422,	/* OBJ_aes_192_ecb                  2 16 840 1 101 3 4 1 21 */
 423,	/* OBJ_aes_192_cbc                  2 16 840 1 101 3 4 1 22 */
 424,	/* OBJ_aes_192_ofb128               2 16 840 1 101 3 4 1 23 */
 425,	/* OBJ_aes_192_cfb128               2 16 840 1 101 3 4 1 24 */
 789,	/* OBJ_id_aes192_wrap               2 16 840 1 101 3 4 1 25 */
+898,	/* OBJ_aes_192_gcm                  2 16 840 1 101 3 4 1 26 */
+899,	/* OBJ_aes_192_ccm                  2 16 840 1 101 3 4 1 27 */
+900,	/* OBJ_id_aes192_wrap_pad           2 16 840 1 101 3 4 1 28 */
 426,	/* OBJ_aes_256_ecb                  2 16 840 1 101 3 4 1 41 */
 427,	/* OBJ_aes_256_cbc                  2 16 840 1 101 3 4 1 42 */
 428,	/* OBJ_aes_256_ofb128               2 16 840 1 101 3 4 1 43 */
 429,	/* OBJ_aes_256_cfb128               2 16 840 1 101 3 4 1 44 */
 790,	/* OBJ_id_aes256_wrap               2 16 840 1 101 3 4 1 45 */
+901,	/* OBJ_aes_256_gcm                  2 16 840 1 101 3 4 1 46 */
+902,	/* OBJ_aes_256_ccm                  2 16 840 1 101 3 4 1 47 */
+903,	/* OBJ_id_aes256_wrap_pad           2 16 840 1 101 3 4 1 48 */
 672,	/* OBJ_sha256                       2 16 840 1 101 3 4 2 1 */
 673,	/* OBJ_sha384                       2 16 840 1 101 3 4 2 2 */
 674,	/* OBJ_sha512                       2 16 840 1 101 3 4 2 3 */
@@ -4901,6 +5023,9 @@
 751,	/* OBJ_camellia_128_cbc             1 2 392 200011 61 1 1 1 2 */
 752,	/* OBJ_camellia_192_cbc             1 2 392 200011 61 1 1 1 3 */
 753,	/* OBJ_camellia_256_cbc             1 2 392 200011 61 1 1 1 4 */
+907,	/* OBJ_id_camellia128_wrap          1 2 392 200011 61 1 1 3 2 */
+908,	/* OBJ_id_camellia192_wrap          1 2 392 200011 61 1 1 3 3 */
+909,	/* OBJ_id_camellia256_wrap          1 2 392 200011 61 1 1 3 4 */
 196,	/* OBJ_id_smime_mod_cms             1 2 840 113549 1 9 16 0 1 */
 197,	/* OBJ_id_smime_mod_ess             1 2 840 113549 1 9 16 0 2 */
 198,	/* OBJ_id_smime_mod_oid             1 2 840 113549 1 9 16 0 3 */
@@ -4956,6 +5081,7 @@
 246,	/* OBJ_id_smime_alg_CMS3DESwrap     1 2 840 113549 1 9 16 3 6 */
 247,	/* OBJ_id_smime_alg_CMSRC2wrap      1 2 840 113549 1 9 16 3 7 */
 125,	/* OBJ_zlib_compression             1 2 840 113549 1 9 16 3 8 */
+893,	/* OBJ_id_alg_PWRI_KEK              1 2 840 113549 1 9 16 3 9 */
 248,	/* OBJ_id_smime_cd_ldap             1 2 840 113549 1 9 16 4 1 */
 249,	/* OBJ_id_smime_spq_ets_sqt_uri     1 2 840 113549 1 9 16 5 1 */
 250,	/* OBJ_id_smime_spq_ets_sqt_unotice 1 2 840 113549 1 9 16 5 2 */

diff --git a/crypto/objects/obj_mac.h b/crypto/objects/obj_mac.h
index 282f11a..b5ea7cd 100644
--- a/crypto/objects/obj_mac.h
+++ b/crypto/objects/obj_mac.h

@@ -580,6 +580,21 @@
 #define NID_sha1WithRSAEncryption		65
 #define OBJ_sha1WithRSAEncryption		OBJ_pkcs1,5L
 
+#define SN_rsaesOaep		"RSAES-OAEP"
+#define LN_rsaesOaep		"rsaesOaep"
+#define NID_rsaesOaep		919
+#define OBJ_rsaesOaep		OBJ_pkcs1,7L
+
+#define SN_mgf1		"MGF1"
+#define LN_mgf1		"mgf1"
+#define NID_mgf1		911
+#define OBJ_mgf1		OBJ_pkcs1,8L
+
+#define SN_rsassaPss		"RSASSA-PSS"
+#define LN_rsassaPss		"rsassaPss"
+#define NID_rsassaPss		912
+#define OBJ_rsassaPss		OBJ_pkcs1,10L
+
 #define SN_sha256WithRSAEncryption		"RSA-SHA256"
 #define LN_sha256WithRSAEncryption		"sha256WithRSAEncryption"
 #define NID_sha256WithRSAEncryption		668
@@ -981,6 +996,10 @@
 #define NID_id_smime_alg_CMSRC2wrap		247
 #define OBJ_id_smime_alg_CMSRC2wrap		OBJ_id_smime_alg,7L
 
+#define SN_id_alg_PWRI_KEK		"id-alg-PWRI-KEK"
+#define NID_id_alg_PWRI_KEK		893
+#define OBJ_id_alg_PWRI_KEK		OBJ_id_smime_alg,9L
+
 #define SN_id_smime_cd_ldap		"id-smime-cd-ldap"
 #define NID_id_smime_cd_ldap		248
 #define OBJ_id_smime_cd_ldap		OBJ_id_smime_cd,1L
@@ -2399,6 +2418,11 @@
 #define NID_no_rev_avail		403
 #define OBJ_no_rev_avail		OBJ_id_ce,56L
 
+#define SN_anyExtendedKeyUsage		"anyExtendedKeyUsage"
+#define LN_anyExtendedKeyUsage		"Any Extended Key Usage"
+#define NID_anyExtendedKeyUsage		910
+#define OBJ_anyExtendedKeyUsage		OBJ_ext_key_usage,0L
+
 #define SN_netscape		"Netscape"
 #define LN_netscape		"Netscape Communications Corp."
 #define NID_netscape		57
@@ -2586,6 +2610,24 @@
 #define NID_aes_128_cfb128		421
 #define OBJ_aes_128_cfb128		OBJ_aes,4L
 
+#define SN_id_aes128_wrap		"id-aes128-wrap"
+#define NID_id_aes128_wrap		788
+#define OBJ_id_aes128_wrap		OBJ_aes,5L
+
+#define SN_aes_128_gcm		"id-aes128-GCM"
+#define LN_aes_128_gcm		"aes-128-gcm"
+#define NID_aes_128_gcm		895
+#define OBJ_aes_128_gcm		OBJ_aes,6L
+
+#define SN_aes_128_ccm		"id-aes128-CCM"
+#define LN_aes_128_ccm		"aes-128-ccm"
+#define NID_aes_128_ccm		896
+#define OBJ_aes_128_ccm		OBJ_aes,7L
+
+#define SN_id_aes128_wrap_pad		"id-aes128-wrap-pad"
+#define NID_id_aes128_wrap_pad		897
+#define OBJ_id_aes128_wrap_pad		OBJ_aes,8L
+
 #define SN_aes_192_ecb		"AES-192-ECB"
 #define LN_aes_192_ecb		"aes-192-ecb"
 #define NID_aes_192_ecb		422
@@ -2606,6 +2648,24 @@
 #define NID_aes_192_cfb128		425
 #define OBJ_aes_192_cfb128		OBJ_aes,24L
 
+#define SN_id_aes192_wrap		"id-aes192-wrap"
+#define NID_id_aes192_wrap		789
+#define OBJ_id_aes192_wrap		OBJ_aes,25L
+
+#define SN_aes_192_gcm		"id-aes192-GCM"
+#define LN_aes_192_gcm		"aes-192-gcm"
+#define NID_aes_192_gcm		898
+#define OBJ_aes_192_gcm		OBJ_aes,26L
+
+#define SN_aes_192_ccm		"id-aes192-CCM"
+#define LN_aes_192_ccm		"aes-192-ccm"
+#define NID_aes_192_ccm		899
+#define OBJ_aes_192_ccm		OBJ_aes,27L
+
+#define SN_id_aes192_wrap_pad		"id-aes192-wrap-pad"
+#define NID_id_aes192_wrap_pad		900
+#define OBJ_id_aes192_wrap_pad		OBJ_aes,28L
+
 #define SN_aes_256_ecb		"AES-256-ECB"
 #define LN_aes_256_ecb		"aes-256-ecb"
 #define NID_aes_256_ecb		426
@@ -2626,6 +2686,24 @@
 #define NID_aes_256_cfb128		429
 #define OBJ_aes_256_cfb128		OBJ_aes,44L
 
+#define SN_id_aes256_wrap		"id-aes256-wrap"
+#define NID_id_aes256_wrap		790
+#define OBJ_id_aes256_wrap		OBJ_aes,45L
+
+#define SN_aes_256_gcm		"id-aes256-GCM"
+#define LN_aes_256_gcm		"aes-256-gcm"
+#define NID_aes_256_gcm		901
+#define OBJ_aes_256_gcm		OBJ_aes,46L
+
+#define SN_aes_256_ccm		"id-aes256-CCM"
+#define LN_aes_256_ccm		"aes-256-ccm"
+#define NID_aes_256_ccm		902
+#define OBJ_aes_256_ccm		OBJ_aes,47L
+
+#define SN_id_aes256_wrap_pad		"id-aes256-wrap-pad"
+#define NID_id_aes256_wrap_pad		903
+#define OBJ_id_aes256_wrap_pad		OBJ_aes,48L
+
 #define SN_aes_128_cfb1		"AES-128-CFB1"
 #define LN_aes_128_cfb1		"aes-128-cfb1"
 #define NID_aes_128_cfb1		650
@@ -2650,6 +2728,26 @@
 #define LN_aes_256_cfb8		"aes-256-cfb8"
 #define NID_aes_256_cfb8		655
 
+#define SN_aes_128_ctr		"AES-128-CTR"
+#define LN_aes_128_ctr		"aes-128-ctr"
+#define NID_aes_128_ctr		904
+
+#define SN_aes_192_ctr		"AES-192-CTR"
+#define LN_aes_192_ctr		"aes-192-ctr"
+#define NID_aes_192_ctr		905
+
+#define SN_aes_256_ctr		"AES-256-CTR"
+#define LN_aes_256_ctr		"aes-256-ctr"
+#define NID_aes_256_ctr		906
+
+#define SN_aes_128_xts		"AES-128-XTS"
+#define LN_aes_128_xts		"aes-128-xts"
+#define NID_aes_128_xts		913
+
+#define SN_aes_256_xts		"AES-256-XTS"
+#define LN_aes_256_xts		"aes-256-xts"
+#define NID_aes_256_xts		914
+
 #define SN_des_cfb1		"DES-CFB1"
 #define LN_des_cfb1		"des-cfb1"
 #define NID_des_cfb1		656
@@ -2666,18 +2764,6 @@
 #define LN_des_ede3_cfb8		"des-ede3-cfb8"
 #define NID_des_ede3_cfb8		659
 
-#define SN_id_aes128_wrap		"id-aes128-wrap"
-#define NID_id_aes128_wrap		788
-#define OBJ_id_aes128_wrap		OBJ_aes,5L
-
-#define SN_id_aes192_wrap		"id-aes192-wrap"
-#define NID_id_aes192_wrap		789
-#define OBJ_id_aes192_wrap		OBJ_aes,25L
-
-#define SN_id_aes256_wrap		"id-aes256-wrap"
-#define NID_id_aes256_wrap		790
-#define OBJ_id_aes256_wrap		OBJ_aes,45L
-
 #define OBJ_nist_hashalgs		OBJ_nistAlgorithms,2L
 
 #define SN_sha256		"SHA256"
@@ -3810,6 +3896,18 @@
 #define NID_camellia_256_cbc		753
 #define OBJ_camellia_256_cbc		1L,2L,392L,200011L,61L,1L,1L,1L,4L
 
+#define SN_id_camellia128_wrap		"id-camellia128-wrap"
+#define NID_id_camellia128_wrap		907
+#define OBJ_id_camellia128_wrap		1L,2L,392L,200011L,61L,1L,1L,3L,2L
+
+#define SN_id_camellia192_wrap		"id-camellia192-wrap"
+#define NID_id_camellia192_wrap		908
+#define OBJ_id_camellia192_wrap		1L,2L,392L,200011L,61L,1L,1L,3L,3L
+
+#define SN_id_camellia256_wrap		"id-camellia256-wrap"
+#define NID_id_camellia256_wrap		909
+#define OBJ_id_camellia256_wrap		1L,2L,392L,200011L,61L,1L,1L,3L,4L
+
 #define OBJ_ntt_ds		0L,3L,4401L,5L
 
 #define OBJ_camellia		OBJ_ntt_ds,3L,1L,9L
@@ -3912,3 +4010,23 @@
 #define LN_hmac		"hmac"
 #define NID_hmac		855
 
+#define SN_cmac		"CMAC"
+#define LN_cmac		"cmac"
+#define NID_cmac		894
+
+#define SN_rc4_hmac_md5		"RC4-HMAC-MD5"
+#define LN_rc4_hmac_md5		"rc4-hmac-md5"
+#define NID_rc4_hmac_md5		915
+
+#define SN_aes_128_cbc_hmac_sha1		"AES-128-CBC-HMAC-SHA1"
+#define LN_aes_128_cbc_hmac_sha1		"aes-128-cbc-hmac-sha1"
+#define NID_aes_128_cbc_hmac_sha1		916
+
+#define SN_aes_192_cbc_hmac_sha1		"AES-192-CBC-HMAC-SHA1"
+#define LN_aes_192_cbc_hmac_sha1		"aes-192-cbc-hmac-sha1"
+#define NID_aes_192_cbc_hmac_sha1		917
+
+#define SN_aes_256_cbc_hmac_sha1		"AES-256-CBC-HMAC-SHA1"
+#define LN_aes_256_cbc_hmac_sha1		"aes-256-cbc-hmac-sha1"
+#define NID_aes_256_cbc_hmac_sha1		918
+

diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num
index 8c50aac..1d0a7c8 100644
--- a/crypto/objects/obj_mac.num
+++ b/crypto/objects/obj_mac.num

@@ -890,3 +890,30 @@
 supportedAlgorithms		890
 deltaRevocationList		891
 dmdName		892
+id_alg_PWRI_KEK		893
+cmac		894
+aes_128_gcm		895
+aes_128_ccm		896
+id_aes128_wrap_pad		897
+aes_192_gcm		898
+aes_192_ccm		899
+id_aes192_wrap_pad		900
+aes_256_gcm		901
+aes_256_ccm		902
+id_aes256_wrap_pad		903
+aes_128_ctr		904
+aes_192_ctr		905
+aes_256_ctr		906
+id_camellia128_wrap		907
+id_camellia192_wrap		908
+id_camellia256_wrap		909
+anyExtendedKeyUsage		910
+mgf1		911
+rsassaPss		912
+aes_128_xts		913
+aes_256_xts		914
+rc4_hmac_md5		915
+aes_128_cbc_hmac_sha1		916
+aes_192_cbc_hmac_sha1		917
+aes_256_cbc_hmac_sha1		918
+rsaesOaep		919

diff --git a/crypto/objects/obj_xref.c b/crypto/objects/obj_xref.c
index 152eca5..9f744bc 100644
--- a/crypto/objects/obj_xref.c
+++ b/crypto/objects/obj_xref.c

@@ -110,8 +110,10 @@
 #endif
 	if (rv == NULL)
 		return 0;
-	*pdig_nid = rv->hash_id;
-	*ppkey_nid = rv->pkey_id;
+	if (pdig_nid)
+		*pdig_nid = rv->hash_id;
+	if (ppkey_nid)
+		*ppkey_nid = rv->pkey_id;
 	return 1;
 	}
 
@@ -144,7 +146,8 @@
 #endif
 	if (rv == NULL)
 		return 0;
-	*psignid = (*rv)->sign_id;
+	if (psignid)
+		*psignid = (*rv)->sign_id;
 	return 1;
 	}
 

diff --git a/crypto/objects/obj_xref.h b/crypto/objects/obj_xref.h
index d5b9b8e..e23938c 100644
--- a/crypto/objects/obj_xref.h
+++ b/crypto/objects/obj_xref.h

@@ -38,10 +38,12 @@
 	{NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94},
 	{NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc},
 	{NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc},
+	{NID_rsassaPss, NID_undef, NID_rsaEncryption},
 	};
 
 static const nid_triple * const sigoid_srt_xref[] =
 	{
+	&sigoid_srt[29],
 	&sigoid_srt[17],
 	&sigoid_srt[18],
 	&sigoid_srt[0],

diff --git a/crypto/objects/obj_xref.txt b/crypto/objects/obj_xref.txt
index e45b3d3..cb91718 100644
--- a/crypto/objects/obj_xref.txt
+++ b/crypto/objects/obj_xref.txt

@@ -13,6 +13,10 @@
 sha224WithRSAEncryption	sha224	rsaEncryption
 mdc2WithRSA		mdc2	rsaEncryption
 ripemd160WithRSA	ripemd160 rsaEncryption
+# For PSS the digest algorithm can vary and depends on the included
+# AlgorithmIdentifier. The digest "undef" indicates the public key
+# method should handle this explicitly.
+rsassaPss		undef	rsaEncryption
 
 # Alternative deprecated OIDs. By using the older "rsa" OID this
 # type will be recognized by not normally used.

diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt
index e61fe60..d3bfad7 100644
--- a/crypto/objects/objects.txt
+++ b/crypto/objects/objects.txt

@@ -166,6 +166,10 @@
 pkcs1 4			: RSA-MD5		: md5WithRSAEncryption
 pkcs1 5			: RSA-SHA1		: sha1WithRSAEncryption
 # According to PKCS #1 version 2.1
+pkcs1 7			: RSAES-OAEP		: rsaesOaep
+pkcs1 8			: MGF1			: mgf1
+pkcs1 10		: RSASSA-PSS		: rsassaPss
+
 pkcs1 11		: RSA-SHA256		: sha256WithRSAEncryption
 pkcs1 12		: RSA-SHA384		: sha384WithRSAEncryption
 pkcs1 13		: RSA-SHA512		: sha512WithRSAEncryption
@@ -299,6 +303,7 @@
 id-smime-alg 5		: id-smime-alg-ESDH
 id-smime-alg 6		: id-smime-alg-CMS3DESwrap
 id-smime-alg 7		: id-smime-alg-CMSRC2wrap
+id-smime-alg 9		: id-alg-PWRI-KEK
 
 # S/MIME Certificate Distribution
 id-smime-cd 1		: id-smime-cd-ldap
@@ -770,6 +775,10 @@
 !Cname no-rev-avail
 id-ce 56		: noRevAvail		: X509v3 No Revocation Available
 
+# From RFC5280
+ext-key-usage 0		: anyExtendedKeyUsage	: Any Extended Key Usage
+
+
 !Cname netscape
 2 16 840 1 113730	: Netscape		: Netscape Communications Corp.
 !Cname netscape-cert-extension
@@ -846,6 +855,10 @@
 aes 3			: AES-128-OFB		: aes-128-ofb
 !Cname aes-128-cfb128
 aes 4			: AES-128-CFB		: aes-128-cfb
+aes 5			: id-aes128-wrap
+aes 6			: id-aes128-GCM		: aes-128-gcm
+aes 7			: id-aes128-CCM		: aes-128-ccm
+aes 8			: id-aes128-wrap-pad
 
 aes 21			: AES-192-ECB		: aes-192-ecb
 aes 22			: AES-192-CBC		: aes-192-cbc
@@ -853,6 +866,10 @@
 aes 23			: AES-192-OFB		: aes-192-ofb
 !Cname aes-192-cfb128
 aes 24			: AES-192-CFB		: aes-192-cfb
+aes 25			: id-aes192-wrap
+aes 26			: id-aes192-GCM		: aes-192-gcm
+aes 27			: id-aes192-CCM		: aes-192-ccm
+aes 28			: id-aes192-wrap-pad
 
 aes 41			: AES-256-ECB		: aes-256-ecb
 aes 42			: AES-256-CBC		: aes-256-cbc
@@ -860,6 +877,10 @@
 aes 43			: AES-256-OFB		: aes-256-ofb
 !Cname aes-256-cfb128
 aes 44			: AES-256-CFB		: aes-256-cfb
+aes 45			: id-aes256-wrap
+aes 46			: id-aes256-GCM		: aes-256-gcm
+aes 47			: id-aes256-CCM		: aes-256-ccm
+aes 48			: id-aes256-wrap-pad
 
 # There are no OIDs for these modes...
 
@@ -869,15 +890,16 @@
 			: AES-128-CFB8		: aes-128-cfb8
 			: AES-192-CFB8		: aes-192-cfb8
 			: AES-256-CFB8		: aes-256-cfb8
+			: AES-128-CTR		: aes-128-ctr
+			: AES-192-CTR		: aes-192-ctr
+			: AES-256-CTR		: aes-256-ctr
+			: AES-128-XTS		: aes-128-xts
+			: AES-256-XTS		: aes-256-xts
 			: DES-CFB1		: des-cfb1
 			: DES-CFB8		: des-cfb8
 			: DES-EDE3-CFB1		: des-ede3-cfb1
 			: DES-EDE3-CFB8		: des-ede3-cfb8
 
-aes 5			: id-aes128-wrap 
-aes 25			: id-aes192-wrap 
-aes 45			: id-aes256-wrap 
-
 # OIDs for SHA224, SHA256, SHA385 and SHA512, according to x9.84.
 !Alias nist_hashalgs nistAlgorithms 2
 nist_hashalgs 1		: SHA256		: sha256
@@ -1211,6 +1233,9 @@
 1 2 392 200011 61 1 1 1 2 : CAMELLIA-128-CBC		: camellia-128-cbc
 1 2 392 200011 61 1 1 1 3 : CAMELLIA-192-CBC		: camellia-192-cbc
 1 2 392 200011 61 1 1 1 4 : CAMELLIA-256-CBC		: camellia-256-cbc
+1 2 392 200011 61 1 1 3 2 : id-camellia128-wrap
+1 2 392 200011 61 1 1 3 3 : id-camellia192-wrap
+1 2 392 200011 61 1 1 3 4 : id-camellia256-wrap
 
 # Definitions for Camellia cipher - ECB, CFB, OFB MODE
 
@@ -1257,3 +1282,11 @@
 # There is no OID that just denotes "HMAC" oddly enough...
 
 			: HMAC				: hmac
+# Nor CMAC either
+			: CMAC				: cmac
+
+# Synthetic composite ciphersuites
+			: RC4-HMAC-MD5			: rc4-hmac-md5
+			: AES-128-CBC-HMAC-SHA1		: aes-128-cbc-hmac-sha1
+			: AES-192-CBC-HMAC-SHA1		: aes-192-cbc-hmac-sha1
+			: AES-256-CBC-HMAC-SHA1		: aes-256-cbc-hmac-sha1

diff --git a/crypto/ocsp/ocsp_lib.c b/crypto/ocsp/ocsp_lib.c
index e92b86c..a94dc83 100644
--- a/crypto/ocsp/ocsp_lib.c
+++ b/crypto/ocsp/ocsp_lib.c

@@ -124,7 +124,8 @@
 	if (!(ASN1_OCTET_STRING_set(cid->issuerNameHash, md, i))) goto err;
 
 	/* Calculate the issuerKey hash, excluding tag and length */
-	EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL);
+	if (!EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL))
+		goto err;
 
 	if (!(ASN1_OCTET_STRING_set(cid->issuerKeyHash, md, i))) goto err;
 

diff --git a/crypto/opensslconf.h b/crypto/opensslconf.h
index 26ac6ba..f17eaa9 100644
--- a/crypto/opensslconf.h
+++ b/crypto/opensslconf.h

@@ -8,6 +8,9 @@
 #ifndef OPENSSL_NO_CAST
 # define OPENSSL_NO_CAST
 #endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
 #ifndef OPENSSL_NO_GMP
 # define OPENSSL_NO_GMP
 #endif
@@ -29,6 +32,9 @@
 #ifndef OPENSSL_NO_RFC3779
 # define OPENSSL_NO_RFC3779
 #endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
 #ifndef OPENSSL_NO_SEED
 # define OPENSSL_NO_SEED
 #endif
@@ -59,6 +65,9 @@
 # if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
 #  define NO_CAST
 # endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+#  define NO_EC_NISTP_64_GCC_128
+# endif
 # if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
 #  define NO_GMP
 # endif
@@ -80,6 +89,9 @@
 # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
 #  define NO_RFC3779
 # endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+#  define NO_SCTP
+# endif
 # if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
 #  define NO_SEED
 # endif

diff --git a/crypto/opensslv.h b/crypto/opensslv.h
index 66a6d0d..bf42556 100644
--- a/crypto/opensslv.h
+++ b/crypto/opensslv.h

@@ -25,11 +25,11 @@
  * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
  *  major minor fix final patch/beta)
  */
-#define OPENSSL_VERSION_NUMBER	0x1000008fL
+#define OPENSSL_VERSION_NUMBER	0x1000100fL
 #ifdef OPENSSL_FIPS
-#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.0h-fips 12 Mar 2012"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.1-fips 14 Mar 2012"
 #else
-#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.0h 12 Mar 2012"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.1 14 Mar 2012"
 #endif
 #define OPENSSL_VERSION_PTEXT	" part of " OPENSSL_VERSION_TEXT
 

diff --git a/crypto/ossl_typ.h b/crypto/ossl_typ.h
index 12bd701..ea9227f 100644
--- a/crypto/ossl_typ.h
+++ b/crypto/ossl_typ.h

@@ -91,10 +91,12 @@
 typedef struct asn1_string_st ASN1_GENERALIZEDTIME;
 typedef struct asn1_string_st ASN1_VISIBLESTRING;
 typedef struct asn1_string_st ASN1_UTF8STRING;
+typedef struct asn1_string_st ASN1_STRING;
 typedef int ASN1_BOOLEAN;
 typedef int ASN1_NULL;
 #endif
 
+typedef struct ASN1_ITEM_st ASN1_ITEM;
 typedef struct asn1_pctx_st ASN1_PCTX;
 
 #ifdef OPENSSL_SYS_WIN32

diff --git a/crypto/pariscid.pl b/crypto/pariscid.pl
new file mode 100644
index 0000000..477ec9b
--- /dev/null
+++ b/crypto/pariscid.pl

@@ -0,0 +1,224 @@
+#!/usr/bin/env perl
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$ST		="std";
+} else {
+	$LEVEL		="1.1";
+	$SIZE_T		=4;
+	$ST		="stw";
+}
+
+$rp="%r2";
+$sp="%r30";
+$rv="%r28";
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	OPENSSL_cpuid_setup,ENTRY
+	.ALIGN	8
+OPENSSL_cpuid_setup
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	bv	($rp)
+	.EXIT
+	nop
+	.PROCEND
+
+	.EXPORT	OPENSSL_rdtsc,ENTRY
+	.ALIGN	8
+OPENSSL_rdtsc
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	mfctl	%cr16,$rv
+	bv	($rp)
+	.EXIT
+	nop
+	.PROCEND
+
+	.EXPORT	OPENSSL_wipe_cpu,ENTRY
+	.ALIGN	8
+OPENSSL_wipe_cpu
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	xor		%r0,%r0,%r1
+	fcpy,dbl	%fr0,%fr4
+	xor		%r0,%r0,%r19
+	fcpy,dbl	%fr0,%fr5
+	xor		%r0,%r0,%r20
+	fcpy,dbl	%fr0,%fr6
+	xor		%r0,%r0,%r21
+	fcpy,dbl	%fr0,%fr7
+	xor		%r0,%r0,%r22
+	fcpy,dbl	%fr0,%fr8
+	xor		%r0,%r0,%r23
+	fcpy,dbl	%fr0,%fr9
+	xor		%r0,%r0,%r24
+	fcpy,dbl	%fr0,%fr10
+	xor		%r0,%r0,%r25
+	fcpy,dbl	%fr0,%fr11
+	xor		%r0,%r0,%r26
+	fcpy,dbl	%fr0,%fr22
+	xor		%r0,%r0,%r29
+	fcpy,dbl	%fr0,%fr23
+	xor		%r0,%r0,%r31
+	fcpy,dbl	%fr0,%fr24
+	fcpy,dbl	%fr0,%fr25
+	fcpy,dbl	%fr0,%fr26
+	fcpy,dbl	%fr0,%fr27
+	fcpy,dbl	%fr0,%fr28
+	fcpy,dbl	%fr0,%fr29
+	fcpy,dbl	%fr0,%fr30
+	fcpy,dbl	%fr0,%fr31
+	bv		($rp)
+	.EXIT
+	ldo		0($sp),$rv
+	.PROCEND
+___
+{
+my $inp="%r26";
+my $len="%r25";
+
+$code.=<<___;
+	.EXPORT	OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_cleanse
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	cmpib,*=	0,$len,Ldone
+	nop
+	cmpib,*>>=	15,$len,Little
+	ldi		$SIZE_T-1,%r1
+
+Lalign
+	and,*<>		$inp,%r1,%r28
+	b,n		Laligned
+	stb		%r0,0($inp)
+	ldo		-1($len),$len
+	b		Lalign
+	ldo		1($inp),$inp
+
+Laligned
+	andcm		$len,%r1,%r28
+Lot
+	$ST		%r0,0($inp)
+	addib,*<>	-$SIZE_T,%r28,Lot
+	ldo		$SIZE_T($inp),$inp
+
+	and,*<>		$len,%r1,$len
+	b,n		Ldone
+Little
+	stb		%r0,0($inp)
+	addib,*<>	-1,$len,Little
+	ldo		1($inp),$inp
+Ldone
+	bv		($rp)
+	.EXIT
+	nop
+	.PROCEND
+___
+}
+{
+my ($out,$cnt,$max)=("%r26","%r25","%r24");
+my ($tick,$lasttick)=("%r23","%r22");
+my ($diff,$lastdiff)=("%r21","%r20");
+
+$code.=<<___;
+	.EXPORT	OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_instrument_bus
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	copy		$cnt,$rv
+	mfctl		%cr16,$tick
+	copy		$tick,$lasttick
+	ldi		0,$diff
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+Loop
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	addib,<>	-1,$cnt,Loop
+	addi		4,$out,$out
+
+	bv		($rp)
+	.EXIT
+	sub		$rv,$cnt,$rv
+	.PROCEND
+
+	.EXPORT	OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_instrument_bus2
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	copy		$cnt,$rv
+	sub		%r0,$cnt,$cnt
+
+	mfctl		%cr16,$tick
+	copy		$tick,$lasttick
+	ldi		0,$diff
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+Loop2
+	copy		$diff,$lastdiff
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	addib,=		-1,$max,Ldone2
+	nop
+
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+	cmpclr,<>	$lastdiff,$diff,$tick
+	ldi		1,$tick
+
+	ldi		1,%r1
+	xor		%r1,$tick,$tick
+	addb,<>		$tick,$cnt,Loop2
+	shladd,l	$tick,2,$out,$out
+Ldone2
+	bv		($rp)
+	.EXIT
+	add		$rv,$cnt,$rv
+	.PROCEND
+___
+}
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+print $code;
+close STDOUT;
+

diff --git a/crypto/pem/pvkfmt.c b/crypto/pem/pvkfmt.c
index 5f130c4..b1bf71a 100644
--- a/crypto/pem/pvkfmt.c
+++ b/crypto/pem/pvkfmt.c

@@ -709,13 +709,16 @@
 			const unsigned char *pass, int passlen)
 	{
 	EVP_MD_CTX mctx;
+	int rv = 1;
 	EVP_MD_CTX_init(&mctx);
-	EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL);
-	EVP_DigestUpdate(&mctx, salt, saltlen);
-	EVP_DigestUpdate(&mctx, pass, passlen);
-	EVP_DigestFinal_ex(&mctx, key, NULL);
+	if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL)
+		|| !EVP_DigestUpdate(&mctx, salt, saltlen)
+		|| !EVP_DigestUpdate(&mctx, pass, passlen)
+		|| !EVP_DigestFinal_ex(&mctx, key, NULL))
+			rv = 0;
+
 	EVP_MD_CTX_cleanup(&mctx);
-	return 1;
+	return rv;
 	}
 	
 
@@ -727,11 +730,12 @@
 	const unsigned char *p = *in;
 	unsigned int magic;
 	unsigned char *enctmp = NULL, *q;
+	EVP_CIPHER_CTX cctx;
+	EVP_CIPHER_CTX_init(&cctx);
 	if (saltlen)
 		{
 		char psbuf[PEM_BUFSIZE];
 		unsigned char keybuf[20];
-		EVP_CIPHER_CTX cctx;
 		int enctmplen, inlen;
 		if (cb)
 			inlen=cb(psbuf,PEM_BUFSIZE,0,u);
@@ -757,37 +761,41 @@
 		p += 8;
 		inlen = keylen - 8;
 		q = enctmp + 8;
-		EVP_CIPHER_CTX_init(&cctx);
-		EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
-		EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
-		EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen);
+		if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+			goto err;
+		if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+			goto err;
+		if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen))
+			goto err;
 		magic = read_ledword((const unsigned char **)&q);
 		if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
 			{
 			q = enctmp + 8;
 			memset(keybuf + 5, 0, 11);
-			EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
-								NULL);
+			if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
+								NULL))
+				goto err;
 			OPENSSL_cleanse(keybuf, 20);
-			EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
-			EVP_DecryptFinal_ex(&cctx, q + enctmplen,
-								&enctmplen);
+			if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+				goto err;
+			if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen,
+								&enctmplen))
+				goto err;
 			magic = read_ledword((const unsigned char **)&q);
 			if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
 				{
-				EVP_CIPHER_CTX_cleanup(&cctx);
 				PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT);
 				goto err;
 				}
 			}
 		else
 			OPENSSL_cleanse(keybuf, 20);
-		EVP_CIPHER_CTX_cleanup(&cctx);
 		p = enctmp;
 		}
 
 	ret = b2i_PrivateKey(&p, keylen);
 	err:
+	EVP_CIPHER_CTX_cleanup(&cctx);
 	if (enctmp && saltlen)
 		OPENSSL_free(enctmp);
 	return ret;
@@ -841,6 +849,8 @@
 	{
 	int outlen = 24, pklen;
 	unsigned char *p, *salt = NULL;
+	EVP_CIPHER_CTX cctx;
+	EVP_CIPHER_CTX_init(&cctx);
 	if (enclevel)
 		outlen += PVK_SALTLEN;
 	pklen = do_i2b(NULL, pk, 0);
@@ -885,7 +895,6 @@
 		{
 		char psbuf[PEM_BUFSIZE];
 		unsigned char keybuf[20];
-		EVP_CIPHER_CTX cctx;
 		int enctmplen, inlen;
 		if (cb)
 			inlen=cb(psbuf,PEM_BUFSIZE,1,u);
@@ -902,16 +911,19 @@
 		if (enclevel == 1)
 			memset(keybuf + 5, 0, 11);
 		p = salt + PVK_SALTLEN + 8;
-		EVP_CIPHER_CTX_init(&cctx);
-		EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
+		if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+			goto error;
 		OPENSSL_cleanse(keybuf, 20);
-		EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8);
-		EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen);
-		EVP_CIPHER_CTX_cleanup(&cctx);
+		if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8))
+			goto error;
+		if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen))
+			goto error;
 		}
+	EVP_CIPHER_CTX_cleanup(&cctx);
 	return outlen;
 
 	error:
+	EVP_CIPHER_CTX_cleanup(&cctx);
 	return -1;
 	}
 

diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
index 4579671..a3edd98 100755
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl

@@ -31,10 +31,9 @@
 				$ret .= ".type	$name,\@function";
 				last;
 			      };
-	/linux.*64/	&& do {	$ret .= ".globl	.$name\n";
-				$ret .= ".type	.$name,\@function\n";
+	/linux.*64/	&& do {	$ret .= ".globl	$name\n";
+				$ret .= ".type	$name,\@function\n";
 				$ret .= ".section	\".opd\",\"aw\"\n";
-				$ret .= ".globl	$name\n";
 				$ret .= ".align	3\n";
 				$ret .= "$name:\n";
 				$ret .= ".quad	.$name,.TOC.\@tocbase,0\n";
@@ -62,6 +61,14 @@
     }
     ".machine	$arch";
 };
+my $size = sub {
+    if ($flavour =~ /linux.*32/)
+    {	shift;
+	".size	" . join(",",@_);
+    }
+    else
+    {	"";	}
+};
 my $asciz = sub {
     shift;
     my $line = join(",",@_);

diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 674da3b..56d9b64 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl

@@ -62,12 +62,8 @@
 my $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 
-{ my ($stddev,$stdino,@junk)=stat(STDOUT);
-  my ($outdev,$outino,@junk)=stat($output);
-
-    open STDOUT,">$output" || die "can't open $output: $!"
-	if ($stddev!=$outdev || $stdino!=$outino);
-}
+open STDOUT,">$output" || die "can't open $output: $!"
+	if (defined($output));
 
 my $gas=1;	$gas=0 if ($output =~ /\.asm$/);
 my $elf=1;	$elf=0 if (!$gas);
@@ -116,12 +112,16 @@
 	    $line = substr($line,@+[0]); $line =~ s/^\s+//;
 
 	    undef $self->{sz};
-	    if ($self->{op} =~ /^(movz)b.*/) {	# movz is pain...
+	    if ($self->{op} =~ /^(movz)x?([bw]).*/) {	# movz is pain...
 		$self->{op} = $1;
-		$self->{sz} = "b";
+		$self->{sz} = $2;
 	    } elsif ($self->{op} =~ /call|jmp/) {
 		$self->{sz} = "";
-	    } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op)/) { # SSEn
+	    } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /^v/) { # VEX
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) {
 		$self->{sz} = "";
 	    } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
 		$self->{op} = $1;
@@ -246,35 +246,39 @@
 	$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
 	$self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
 
+	# Solaris /usr/ccs/bin/as can't handle multiplications
+	# in $self->{label}, new gas requires sign extension...
+	use integer;
+	$self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+	$self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+	$self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+
 	if ($gas) {
-	    # Solaris /usr/ccs/bin/as can't handle multiplications
-	    # in $self->{label}, new gas requires sign extension...
-	    use integer;
-	    $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
-	    $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
-	    $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
 	    $self->{label} =~ s/^___imp_/__imp__/   if ($flavour eq "mingw64");
 
 	    if (defined($self->{index})) {
-		sprintf "%s%s(%%%s,%%%s,%d)",$self->{asterisk},
-					$self->{label},$self->{base},
+		sprintf "%s%s(%s,%%%s,%d)",$self->{asterisk},
+					$self->{label},
+					$self->{base}?"%$self->{base}":"",
 					$self->{index},$self->{scale};
 	    } else {
 		sprintf "%s%s(%%%s)",	$self->{asterisk},$self->{label},$self->{base};
 	    }
 	} else {
-	    %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", q=>"QWORD$PTR" );
+	    %szmap = (	b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
+	    		q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
 
 	    $self->{label} =~ s/\./\$/g;
 	    $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
 	    $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
-	    $sz="q" if ($self->{asterisk});
+	    $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
+	    $sz="l" if (opcode->mnemonic() eq "movd");
 
 	    if (defined($self->{index})) {
-		sprintf "%s[%s%s*%d+%s]",$szmap{$sz},
+		sprintf "%s[%s%s*%d%s]",$szmap{$sz},
 					$self->{label}?"$self->{label}+":"",
 					$self->{index},$self->{scale},
-					$self->{base};
+					$self->{base}?"+$self->{base}":"";
 	    } elsif ($self->{base} eq "rip") {
 		sprintf "%s[%s]",$szmap{$sz},$self->{label};
 	    } else {
@@ -506,6 +510,12 @@
 		    }
 		} elsif ($dir =~ /\.(text|data)/) {
 		    $current_segment=".$1";
+		} elsif ($dir =~ /\.hidden/) {
+		    if    ($flavour eq "macosx")  { $self->{value} = ".private_extern\t$prefix$line"; }
+		    elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+		} elsif ($dir =~ /\.comm/) {
+		    $self->{value} = "$dir\t$prefix$line";
+		    $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
 		}
 		$line = "";
 		return $self;
@@ -578,7 +588,7 @@
 					    $self->{value}="${decor}SEH_end_$current_function->{name}:";
 					    $self->{value}.=":\n" if($masm);
 					}
-					$self->{value}.="$current_function->{name}\tENDP" if($masm);
+					$self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name});
 					undef $current_function;
 				    }
 				    last;
@@ -614,6 +624,19 @@
 						.join(",",@str) if (@str);
 				    last;
 				  };
+		/\.comm/    && do { my @str=split(/,\s*/,$line);
+				    my $v=undef;
+				    if ($nasm) {
+					$v.="common	$prefix@str[0] @str[1]";
+				    } else {
+					$v="$current_segment\tENDS\n" if ($current_segment);
+					$current_segment = "_DATA";
+					$v.="$current_segment\tSEGMENT\n";
+					$v.="COMM	@str[0]:DWORD:".@str[1]/4;
+				    }
+				    $self->{value} = $v;
+				    last;
+				  };
 	    }
 	    $line = "";
 	}
@@ -626,9 +649,133 @@
     }
 }
 
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+   $rex|=0x04 if($dst>=8);
+   $rex|=0x01 if($src>=8);
+   push @opcode,($rex|0x40) if ($rex);
+}
+
+# older gas and ml64 don't handle SSE>2 instructions
+my %regrm = (	"%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+		"%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7	);
+
+my $movq = sub {	# elderly gas can't handle inter-register movq
+  my $arg = shift;
+  my @opcode=(0x66);
+    if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+	my ($src,$dst)=($1,$2);
+	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,$src,$dst,0x8);
+	push @opcode,0x0f,0x7e;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	@opcode;
+    } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+	my ($src,$dst)=($2,$1);
+	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,$src,$dst,0x8);
+	push @opcode,0x0f,0x6e;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pextrd = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+      my @opcode=(0x66);
+	$imm=$1;
+	$src=$2;
+	$dst=$3;
+	if ($dst =~ /%r([0-9]+)d/)	{ $dst = $1; }
+	elsif ($dst =~ /%e/)		{ $dst = $regrm{$dst}; }
+	rex(\@opcode,$src,$dst);
+	push @opcode,0x0f,0x3a,0x16;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	push @opcode,$imm;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pinsrd = sub {
+    if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	$imm=$1;
+	$src=$2;
+	$dst=$3;
+	if ($src =~ /%r([0-9]+)/)	{ $src = $1; }
+	elsif ($src =~ /%e/)		{ $src = $regrm{$src}; }
+	rex(\@opcode,$dst,$src);
+	push @opcode,0x0f,0x3a,0x22;
+	push @opcode,0xc0|(($dst&7)<<3)|($src&7);	# ModR/M
+	push @opcode,$imm;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pshufb = sub {
+    if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$2,$1);
+	push @opcode,0x0f,0x38,0x00;
+	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $palignr = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x3a,0x0f;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	push @opcode,$1;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pclmulqdq = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x3a,0x44;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	my $c=$1;
+	push @opcode,$c=~/^0/?oct($c):$c;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $rdrand = sub {
+    if (shift =~ /%[er](\w+)/) {
+      my @opcode=();
+      my $dst=$1;
+	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,0,$1,8);
+	push @opcode,0x0f,0xc7,0xf0|($dst&7);
+	@opcode;
+    } else {
+	();
+    }
+};
+
 if ($nasm) {
     print <<___;
 default	rel
+%define XMMWORD
 ___
 } elsif ($masm) {
     print <<___;
@@ -645,14 +792,22 @@
 
     undef $label;
     undef $opcode;
-    undef $sz;
     undef @args;
 
     if ($label=label->re(\$line))	{ print $label->out(); }
 
     if (directive->re(\$line)) {
 	printf "%s",directive->out();
-    } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
+    } elsif ($opcode=opcode->re(\$line)) {
+	my $asm = eval("\$".$opcode->mnemonic());
+	undef @bytes;
+	
+	if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
+	    print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+	    next;
+	}
+
+	ARGUMENT: while (1) {
 	my $arg;
 
 	if ($arg=register->re(\$line))	{ opcode->size($arg->size()); }
@@ -668,19 +823,26 @@
 	$line =~ s/^,\s*//;
 	} # ARGUMENT:
 
-	$sz=opcode->size();
-
 	if ($#args>=0) {
 	    my $insn;
+	    my $sz=opcode->size();
+
 	    if ($gas) {
 		$insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+		@args = map($_->out($sz),@args);
+		printf "\t%s\t%s",$insn,join(",",@args);
 	    } else {
 		$insn = $opcode->out();
-		$insn .= $sz if (map($_->out() =~ /x?mm/,@args));
+		foreach (@args) {
+		    my $arg = $_->out();
+		    # $insn.=$sz compensates for movq, pinsrw, ...
+		    if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+		    if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
+		}
 		@args = reverse(@args);
 		undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+		printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
 	    }
-	    printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
 	} else {
 	    printf "\t%s",$opcode->out();
 	}

diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index 28080ca..eb543db 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl

@@ -80,6 +80,57 @@
     {	&::generic("movq",@_);			}
 }
 
+# SSE>2 instructions
+my %regrm = (	"eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+		"esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7	);
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm);	}
+    else
+    {	&::generic("pextrd",@_);		}
+}
+
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm);	}
+    else
+    {	&::generic("pinsrd",@_);		}
+}
+
+sub ::pshufb
+{ my($dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);	}
+    else
+    {	&::generic("pshufb",@_);		}
+}
+
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm);	}
+    else
+    {	&::generic("palignr",@_);		}
+}
+
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm);	}
+    else
+    {	&::generic("pclmulqdq",@_);		}
+}
+
+sub ::rdrand
+{ my ($dst)=@_;
+    if ($dst =~ /(e[a-dsd][ixp])/)
+    {	&::data_byte(0x0f,0xc7,0xf0|$regrm{$dst});	}
+    else
+    {	&::generic("rdrand",@_);	}
+}
+
 # label management
 $lbdecor="L";		# local label decoration, set by package
 $label="000";
@@ -167,7 +218,7 @@
     $filename=$fn;
     $i386=$cpu;
 
-    $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=0;
+    $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0;
     if    (($type eq "elf"))
     {	$elf=1;			require "x86gas.pl";	}
     elsif (($type eq "a\.out"))
@@ -184,6 +235,8 @@
     {	$win32=1;		require "x86masm.pl";	}
     elsif (($type eq "macosx"))
     {	$aout=1; $macosx=1;	require "x86gas.pl";	}
+    elsif (($type eq "android"))
+    {	$elf=1; $android=1;	require "x86gas.pl";	}
     else
     {	print STDERR <<"EOF";
 Pick one target type from

diff --git a/crypto/perlasm/x86gas.pl b/crypto/perlasm/x86gas.pl
index 6eab727..682a3a3 100644
--- a/crypto/perlasm/x86gas.pl
+++ b/crypto/perlasm/x86gas.pl

@@ -45,9 +45,8 @@
     undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
 
     if ($#_==0)				{ &::emit($opcode);		}
-    elsif ($opcode =~ m/^j/o && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode eq "call" && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode =~ m/^set/&& $#_==1)	{ &::emit($opcode,@arg);	}
+    elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+					{ &::emit($opcode,@arg);	}
     else				{ &::emit($opcode.$suffix,@arg);}
 
   1;
@@ -91,6 +90,7 @@
 }
 sub ::QWP	{ &::DWP(@_);	}
 sub ::BP	{ &::DWP(@_);	}
+sub ::WP	{ &::DWP(@_);	}
 sub ::BC	{ @_;		}
 sub ::DWC	{ @_;		}
 
@@ -149,22 +149,24 @@
 {   push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n");   }
 
 sub ::file_end
-{   if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
-	my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,4";
-	if ($::elf)	{ push (@out,"$tmp,4\n"); }
-	else		{ push (@out,"$tmp\n"); }
-    }
-    if ($::macosx)
+{   if ($::macosx)
     {	if (%non_lazy_ptr)
     	{   push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n");
 	    foreach $i (keys %non_lazy_ptr)
 	    {	push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n");   }
 	}
     }
+    if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
+	my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+	if ($::macosx)	{ push (@out,"$tmp,2\n"); }
+	elsif ($::elf)	{ push (@out,"$tmp,4\n"); }
+	else		{ push (@out,"$tmp\n"); }
+    }
     push(@out,$initseg) if ($initseg);
 }
 
 sub ::data_byte	{   push(@out,".byte\t".join(',',@_)."\n");   }
+sub ::data_short{   push(@out,".value\t".join(',',@_)."\n");  }
 sub ::data_word {   push(@out,".long\t".join(',',@_)."\n");   }
 
 sub ::align
@@ -180,7 +182,7 @@
 sub ::picmeup
 { my($dst,$sym,$base,$reflabel)=@_;
 
-    if ($::pic && ($::elf || $::aout))
+    if (($::pic && ($::elf || $::aout)) || $::macosx)
     {	if (!defined($base))
 	{   &::call(&::label("PIC_me_up"));
 	    &::set_label("PIC_me_up");
@@ -206,13 +208,17 @@
 sub ::initseg
 { my $f=$nmdecor.shift;
 
-    if ($::elf)
+    if ($::android)
+    {	$initseg.=<<___;
+.section	.init_array
+.align	4
+.long	$f
+___
+    }
+    elsif ($::elf)
     {	$initseg.=<<___;
 .section	.init
 	call	$f
-	jmp	.Linitalign
-.align	$align
-.Linitalign:
 ___
     }
     elsif ($::coff)

diff --git a/crypto/perlasm/x86masm.pl b/crypto/perlasm/x86masm.pl
index 3d50e4a..3af0453 100644
--- a/crypto/perlasm/x86masm.pl
+++ b/crypto/perlasm/x86masm.pl

@@ -14,9 +14,11 @@
 { my ($opcode,@arg)=@_;
 
     # fix hexadecimal constants
-    for (@arg) { s/0x([0-9a-f]+)/0$1h/oi; }
+    for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; }
 
-    if ($opcode !~ /movq/)
+    if ($opcode =~ /lea/ && @arg[1] =~ s/.*PTR\s+([^\[]+)$/$1/)	# no []
+    {	$opcode="mov";	}
+    elsif ($opcode !~ /movq/)
     {	# fix xmm references
 	$arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i);
 	$arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i);
@@ -65,6 +67,7 @@
   $ret;
 }
 sub ::BP	{ &get_mem("BYTE",@_);  }
+sub ::WP	{ &get_mem("WORD",@_);	}
 sub ::DWP	{ &get_mem("DWORD",@_); }
 sub ::QWP	{ &get_mem("QWORD",@_); }
 sub ::BC	{ "@_";  }
@@ -129,7 +132,7 @@
     if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
     {	my $comm=<<___;
 .bss	SEGMENT 'BSS'
-COMM	${nmdecor}OPENSSL_ia32cap_P:DWORD
+COMM	${nmdecor}OPENSSL_ia32cap_P:QWORD
 .bss	ENDS
 ___
 	# comment out OPENSSL_ia32cap_P declarations
@@ -156,6 +159,9 @@
 sub ::data_byte
 {   push(@out,("DB\t").join(',',@_)."\n");	}
 
+sub ::data_short
+{   push(@out,("DW\t").join(',',@_)."\n");	}
+
 sub ::data_word
 {   push(@out,("DD\t").join(',',@_)."\n");	}
 
@@ -181,4 +187,11 @@
 sub ::dataseg
 {   push(@out,"$segment\tENDS\n_DATA\tSEGMENT\n"); $segment="_DATA";   }
 
+sub ::safeseh
+{ my $nm=shift;
+    push(@out,"IF \@Version GE 710\n");
+    push(@out,".SAFESEH	".&::LABEL($nm,$nmdecor.$nm)."\n");
+    push(@out,"ENDIF\n");
+}
+
 1;

diff --git a/crypto/perlasm/x86nasm.pl b/crypto/perlasm/x86nasm.pl
index ce2bed9..ca2511c 100644
--- a/crypto/perlasm/x86nasm.pl
+++ b/crypto/perlasm/x86nasm.pl

@@ -19,6 +19,8 @@
 	{   $_[0] = "NEAR $_[0]";   	}
 	elsif ($opcode eq "lea" && $#_==1)  # wipe storage qualifier from lea
 	{   $_[1] =~ s/^[^\[]*\[/\[/o;	}
+	elsif ($opcode eq "clflush" && $#_==0)
+	{   $_[0] =~ s/^[^\[]*\[/\[/o;	}
     }
     &::emit($opcode,@_);
   1;
@@ -67,6 +69,7 @@
 }
 sub ::BP	{ &get_mem("BYTE",@_);  }
 sub ::DWP	{ &get_mem("DWORD",@_); }
+sub ::WP	{ &get_mem("WORD",@_);	}
 sub ::QWP	{ &get_mem("",@_);      }
 sub ::BC	{ (($::mwerks)?"":"BYTE ")."@_";  }
 sub ::DWC	{ (($::mwerks)?"":"DWORD ")."@_"; }
@@ -114,7 +117,7 @@
 {   if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
     {	my $comm=<<___;
 ${drdecor}segment	.bss
-${drdecor}common	${nmdecor}OPENSSL_ia32cap_P 4
+${drdecor}common	${nmdecor}OPENSSL_ia32cap_P 8
 ___
 	# comment out OPENSSL_ia32cap_P declarations
 	grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
@@ -135,7 +138,8 @@
 
 sub ::data_byte
 {   push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n");	}
-
+sub ::data_short
+{   push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n");	}
 sub ::data_word
 {   push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n");	}
 
@@ -163,4 +167,11 @@
     else		{ push(@out,"section\t.data align=4\n"); }
 }
 
+sub ::safeseh
+{ my $nm=shift;
+    push(@out,"%if	__NASM_VERSION_ID__ >= 0x02030000\n");
+    push(@out,"safeseh	".&::LABEL($nm,$nmdecor.$nm)."\n");
+    push(@out,"%endif\n");
+}
+
 1;

diff --git a/crypto/pkcs12/p12_decr.c b/crypto/pkcs12/p12_decr.c
index ba77dbb..9d3557e 100644
--- a/crypto/pkcs12/p12_decr.c
+++ b/crypto/pkcs12/p12_decr.c

@@ -89,7 +89,14 @@
 		goto err;
 	}
 
-	EVP_CipherUpdate(&ctx, out, &i, in, inlen);
+	if (!EVP_CipherUpdate(&ctx, out, &i, in, inlen))
+		{
+		OPENSSL_free(out);
+		out = NULL;
+		PKCS12err(PKCS12_F_PKCS12_PBE_CRYPT,ERR_R_EVP_LIB);
+		goto err;
+		}
+
 	outlen = i;
 	if(!EVP_CipherFinal_ex(&ctx, out + i, &i)) {
 		OPENSSL_free(out);

diff --git a/crypto/pkcs12/p12_key.c b/crypto/pkcs12/p12_key.c
index 424203f..c55c7b6 100644
--- a/crypto/pkcs12/p12_key.c
+++ b/crypto/pkcs12/p12_key.c

@@ -152,14 +152,16 @@
 	for (i = 0; i < Slen; i++) *p++ = salt[i % saltlen];
 	for (i = 0; i < Plen; i++) *p++ = pass[i % passlen];
 	for (;;) {
-		EVP_DigestInit_ex(&ctx, md_type, NULL);
-		EVP_DigestUpdate(&ctx, D, v);
-		EVP_DigestUpdate(&ctx, I, Ilen);
-		EVP_DigestFinal_ex(&ctx, Ai, NULL);
+		if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+			|| !EVP_DigestUpdate(&ctx, D, v)
+			|| !EVP_DigestUpdate(&ctx, I, Ilen)
+			|| !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+			goto err;
 		for (j = 1; j < iter; j++) {
-			EVP_DigestInit_ex(&ctx, md_type, NULL);
-			EVP_DigestUpdate(&ctx, Ai, u);
-			EVP_DigestFinal_ex(&ctx, Ai, NULL);
+			if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+				|| !EVP_DigestUpdate(&ctx, Ai, u)
+				|| !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+			goto err;
 		}
 		memcpy (out, Ai, min (n, u));
 		if (u >= n) {

diff --git a/crypto/pkcs12/p12_mutl.c b/crypto/pkcs12/p12_mutl.c
index 9ab740d..96de1bd 100644
--- a/crypto/pkcs12/p12_mutl.c
+++ b/crypto/pkcs12/p12_mutl.c

@@ -97,10 +97,14 @@
 		return 0;
 	}
 	HMAC_CTX_init(&hmac);
-	HMAC_Init_ex(&hmac, key, md_size, md_type, NULL);
-    	HMAC_Update(&hmac, p12->authsafes->d.data->data,
-					 p12->authsafes->d.data->length);
-    	HMAC_Final(&hmac, mac, maclen);
+	if (!HMAC_Init_ex(&hmac, key, md_size, md_type, NULL)
+    		|| !HMAC_Update(&hmac, p12->authsafes->d.data->data,
+					 p12->authsafes->d.data->length)
+    		|| !HMAC_Final(&hmac, mac, maclen))
+		{
+    		HMAC_CTX_cleanup(&hmac);
+		return 0;
+		}
     	HMAC_CTX_cleanup(&hmac);
 	return 1;
 }

diff --git a/crypto/pkcs7/pk7_doit.c b/crypto/pkcs7/pk7_doit.c
index e4b095e..fae8eda 100644
--- a/crypto/pkcs7/pk7_doit.c
+++ b/crypto/pkcs7/pk7_doit.c

@@ -705,7 +705,11 @@
 		}
 
 	/* Add digest */
-	EVP_DigestFinal_ex(mctx, md_data,&md_len);
+	if (!EVP_DigestFinal_ex(mctx, md_data,&md_len))
+		{
+		PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_EVP_LIB);
+		return 0;
+		}
 	if (!PKCS7_add1_attrib_digest(si, md_data, md_len))
 		{
 		PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_MALLOC_FAILURE);
@@ -813,7 +817,8 @@
 
 			/* We now have the EVP_MD_CTX, lets do the
 			 * signing. */
-			EVP_MD_CTX_copy_ex(&ctx_tmp,mdc);
+			if (!EVP_MD_CTX_copy_ex(&ctx_tmp,mdc))
+				goto err;
 
 			sk=si->auth_attr;
 
@@ -851,7 +856,8 @@
 		if (!PKCS7_find_digest(&mdc, bio,
 				OBJ_obj2nid(p7->d.digest->md->algorithm)))
 			goto err;
-		EVP_DigestFinal_ex(mdc,md_data,&md_len);
+		if (!EVP_DigestFinal_ex(mdc,md_data,&md_len))
+			goto err;
 		M_ASN1_OCTET_STRING_set(p7->d.digest->digest, md_data, md_len);
 		}
 
@@ -1044,7 +1050,8 @@
 
 	/* mdc is the digest ctx that we want, unless there are attributes,
 	 * in which case the digest is the signed attributes */
-	EVP_MD_CTX_copy_ex(&mdc_tmp,mdc);
+	if (!EVP_MD_CTX_copy_ex(&mdc_tmp,mdc))
+		goto err;
 
 	sk=si->auth_attr;
 	if ((sk != NULL) && (sk_X509_ATTRIBUTE_num(sk) != 0))
@@ -1054,7 +1061,8 @@
 		int alen;
 		ASN1_OCTET_STRING *message_digest;
 
-		EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len);
+		if (!EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len))
+			goto err;
 		message_digest=PKCS7_digest_from_attributes(sk);
 		if (!message_digest)
 			{
@@ -1079,7 +1087,8 @@
 			goto err;
 			}
 
-		EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL);
+		if (!EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL))
+			goto err;
 
 		alen = ASN1_item_i2d((ASN1_VALUE *)sk, &abuf,
 						ASN1_ITEM_rptr(PKCS7_ATTR_VERIFY));
@@ -1089,7 +1098,8 @@
 			ret = -1;
 			goto err;
 			}
-		EVP_VerifyUpdate(&mdc_tmp, abuf, alen);
+		if (!EVP_VerifyUpdate(&mdc_tmp, abuf, alen))
+			goto err;
 
 		OPENSSL_free(abuf);
 		}

diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
index 369e1d0..0677469 100755
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl

@@ -23,36 +23,67 @@
 .machine	"any"
 .text
 
-.globl	.OPENSSL_cpuid_setup
+.globl	.OPENSSL_ppc64_probe
 .align	4
-.OPENSSL_cpuid_setup:
+.OPENSSL_ppc64_probe:
+	fcfid	f1,f1
+	extrdi	r0,r0,32,0
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+
+.globl	.OPENSSL_altivec_probe
+.align	4
+.OPENSSL_altivec_probe:
+	.long	0x10000484	# vor	v0,v0,v0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.OPENSSL_wipe_cpu
 .align	4
 .OPENSSL_wipe_cpu:
 	xor	r0,r0,r0
+	fmr	f0,f31
+	fmr	f1,f31
+	fmr	f2,f31
 	mr	r3,r1
+	fmr	f3,f31
 	xor	r4,r4,r4
+	fmr	f4,f31
 	xor	r5,r5,r5
+	fmr	f5,f31
 	xor	r6,r6,r6
+	fmr	f6,f31
 	xor	r7,r7,r7
+	fmr	f7,f31
 	xor	r8,r8,r8
+	fmr	f8,f31
 	xor	r9,r9,r9
+	fmr	f9,f31
 	xor	r10,r10,r10
+	fmr	f10,f31
 	xor	r11,r11,r11
+	fmr	f11,f31
 	xor	r12,r12,r12
+	fmr	f12,f31
+	fmr	f13,f31
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.OPENSSL_atomic_add
 .align	4
 .OPENSSL_atomic_add:
-Loop:	lwarx	r5,0,r3
+Ladd:	lwarx	r5,0,r3
 	add	r0,r4,r5
 	stwcx.	r0,0,r3
-	bne-	Loop
+	bne-	Ladd
 	$SIGNX	r3,r0
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 
 .globl	.OPENSSL_rdtsc
 .align	4
@@ -60,6 +91,8 @@
 	mftb	r3
 	mftbu	r4
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.OPENSSL_cleanse
 .align	4
@@ -89,6 +122,9 @@
 	andi.	r4,r4,3
 	bne	Little
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/crypto/rand/md_rand.c b/crypto/rand/md_rand.c
index b2f04ff..fcdd3f2 100644
--- a/crypto/rand/md_rand.c
+++ b/crypto/rand/md_rand.c

@@ -109,6 +109,8 @@
  *
  */
 
+#define OPENSSL_FIPSEVP
+
 #ifdef MD_RAND_DEBUG
 # ifndef NDEBUG
 #   define NDEBUG
@@ -157,13 +159,14 @@
 static void ssleay_rand_cleanup(void);
 static void ssleay_rand_seed(const void *buf, int num);
 static void ssleay_rand_add(const void *buf, int num, double add_entropy);
-static int ssleay_rand_bytes(unsigned char *buf, int num);
+static int ssleay_rand_bytes(unsigned char *buf, int num, int pseudo);
+static int ssleay_rand_nopseudo_bytes(unsigned char *buf, int num);
 static int ssleay_rand_pseudo_bytes(unsigned char *buf, int num);
 static int ssleay_rand_status(void);
 
 RAND_METHOD rand_ssleay_meth={
 	ssleay_rand_seed,
-	ssleay_rand_bytes,
+	ssleay_rand_nopseudo_bytes,
 	ssleay_rand_cleanup,
 	ssleay_rand_add,
 	ssleay_rand_pseudo_bytes,
@@ -328,7 +331,7 @@
 	ssleay_rand_add(buf, num, (double)num);
 	}
 
-static int ssleay_rand_bytes(unsigned char *buf, int num)
+static int ssleay_rand_bytes(unsigned char *buf, int num, int pseudo)
 	{
 	static volatile int stirred_pool = 0;
 	int i,j,k,st_num,st_idx;
@@ -517,7 +520,9 @@
 	EVP_MD_CTX_cleanup(&m);
 	if (ok)
 		return(1);
-	else
+	else if (pseudo)
+		return 0;
+	else 
 		{
 		RANDerr(RAND_F_SSLEAY_RAND_BYTES,RAND_R_PRNG_NOT_SEEDED);
 		ERR_add_error_data(1, "You need to read the OpenSSL FAQ, "
@@ -526,22 +531,16 @@
 		}
 	}
 
+static int ssleay_rand_nopseudo_bytes(unsigned char *buf, int num)
+	{
+	return ssleay_rand_bytes(buf, num, 0);
+	}
+
 /* pseudo-random bytes that are guaranteed to be unique but not
    unpredictable */
 static int ssleay_rand_pseudo_bytes(unsigned char *buf, int num) 
 	{
-	int ret;
-	unsigned long err;
-
-	ret = RAND_bytes(buf, num);
-	if (ret == 0)
-		{
-		err = ERR_peek_error();
-		if (ERR_GET_LIB(err) == ERR_LIB_RAND &&
-		    ERR_GET_REASON(err) == RAND_R_PRNG_NOT_SEEDED)
-			ERR_clear_error();
-		}
-	return (ret);
+	return ssleay_rand_bytes(buf, num, 1);
 	}
 
 static int ssleay_rand_status(void)

diff --git a/crypto/rand/rand.h b/crypto/rand/rand.h
index ac6c021..dc8fcf9 100644
--- a/crypto/rand/rand.h
+++ b/crypto/rand/rand.h

@@ -119,6 +119,11 @@
 
 #endif
 
+#ifdef OPENSSL_FIPS
+void RAND_set_fips_drbg_type(int type, int flags);
+int RAND_init_fips(void);
+#endif
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -129,9 +134,13 @@
 
 /* Function codes. */
 #define RAND_F_RAND_GET_RAND_METHOD			 101
+#define RAND_F_RAND_INIT_FIPS				 102
 #define RAND_F_SSLEAY_RAND_BYTES			 100
 
 /* Reason codes. */
+#define RAND_R_ERROR_INITIALISING_DRBG			 102
+#define RAND_R_ERROR_INSTANTIATING_DRBG			 103
+#define RAND_R_NO_FIPS_RANDOM_METHOD_SET		 101
 #define RAND_R_PRNG_NOT_SEEDED				 100
 
 #ifdef  __cplusplus

diff --git a/crypto/rand/rand_err.c b/crypto/rand/rand_err.c
index 03cda4d..b8586c8 100644
--- a/crypto/rand/rand_err.c
+++ b/crypto/rand/rand_err.c

@@ -1,6 +1,6 @@
 /* crypto/rand/rand_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -71,12 +71,16 @@
 static ERR_STRING_DATA RAND_str_functs[]=
 	{
 {ERR_FUNC(RAND_F_RAND_GET_RAND_METHOD),	"RAND_get_rand_method"},
+{ERR_FUNC(RAND_F_RAND_INIT_FIPS),	"RAND_init_fips"},
 {ERR_FUNC(RAND_F_SSLEAY_RAND_BYTES),	"SSLEAY_RAND_BYTES"},
 {0,NULL}
 	};
 
 static ERR_STRING_DATA RAND_str_reasons[]=
 	{
+{ERR_REASON(RAND_R_ERROR_INITIALISING_DRBG),"error initialising drbg"},
+{ERR_REASON(RAND_R_ERROR_INSTANTIATING_DRBG),"error instantiating drbg"},
+{ERR_REASON(RAND_R_NO_FIPS_RANDOM_METHOD_SET),"no fips random method set"},
 {ERR_REASON(RAND_R_PRNG_NOT_SEEDED)      ,"PRNG not seeded"},
 {0,NULL}
 	};

diff --git a/crypto/rand/rand_lib.c b/crypto/rand/rand_lib.c
index 513e338..daf1dab 100644
--- a/crypto/rand/rand_lib.c
+++ b/crypto/rand/rand_lib.c

@@ -60,10 +60,16 @@
 #include <time.h>
 #include "cryptlib.h"
 #include <openssl/rand.h>
+
 #ifndef OPENSSL_NO_ENGINE
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/fips_rand.h>
+#endif
+
 #ifndef OPENSSL_NO_ENGINE
 /* non-NULL if default_RAND_meth is ENGINE-provided */
 static ENGINE *funct_ref =NULL;
@@ -174,3 +180,116 @@
 		return meth->status();
 	return 0;
 	}
+
+#ifdef OPENSSL_FIPS
+
+/* FIPS DRBG initialisation code. This sets up the DRBG for use by the
+ * rest of OpenSSL. 
+ */
+
+/* Entropy gatherer: use standard OpenSSL PRNG to seed (this will gather
+ * entropy internally through RAND_poll().
+ */
+
+static size_t drbg_get_entropy(DRBG_CTX *ctx, unsigned char **pout,
+                                int entropy, size_t min_len, size_t max_len)
+        {
+	/* Round up request to multiple of block size */
+	min_len = ((min_len + 19) / 20) * 20;
+	*pout = OPENSSL_malloc(min_len);
+	if (!*pout)
+		return 0;
+	if (RAND_SSLeay()->bytes(*pout, min_len) <= 0)
+		{
+		OPENSSL_free(*pout);
+		*pout = NULL;
+		return 0;
+		}
+        return min_len;
+        }
+
+static void drbg_free_entropy(DRBG_CTX *ctx, unsigned char *out, size_t olen)
+	{
+	OPENSSL_cleanse(out, olen);
+	OPENSSL_free(out);
+	}
+
+/* Set "additional input" when generating random data. This uses the
+ * current PID, a time value and a counter.
+ */
+
+static size_t drbg_get_adin(DRBG_CTX *ctx, unsigned char **pout)
+    	{
+	/* Use of static variables is OK as this happens under a lock */
+	static unsigned char buf[16];
+	static unsigned long counter;
+	FIPS_get_timevec(buf, &counter);
+	*pout = buf;
+	return sizeof(buf);
+	}
+
+/* RAND_add() and RAND_seed() pass through to OpenSSL PRNG so it is 
+ * correctly seeded by RAND_poll().
+ */
+
+static int drbg_rand_add(DRBG_CTX *ctx, const void *in, int inlen,
+				double entropy)
+	{
+	RAND_SSLeay()->add(in, inlen, entropy);
+	return 1;
+	}
+
+static int drbg_rand_seed(DRBG_CTX *ctx, const void *in, int inlen)
+	{
+	RAND_SSLeay()->seed(in, inlen);
+	return 1;
+	}
+
+#ifndef OPENSSL_DRBG_DEFAULT_TYPE
+#define OPENSSL_DRBG_DEFAULT_TYPE	NID_aes_256_ctr
+#endif
+#ifndef OPENSSL_DRBG_DEFAULT_FLAGS
+#define OPENSSL_DRBG_DEFAULT_FLAGS	DRBG_FLAG_CTR_USE_DF
+#endif 
+
+static int fips_drbg_type = OPENSSL_DRBG_DEFAULT_TYPE;
+static int fips_drbg_flags = OPENSSL_DRBG_DEFAULT_FLAGS;
+
+void RAND_set_fips_drbg_type(int type, int flags)
+	{
+	fips_drbg_type = type;
+	fips_drbg_flags = flags;
+	}
+
+int RAND_init_fips(void)
+	{
+	DRBG_CTX *dctx;
+	size_t plen;
+	unsigned char pers[32], *p;
+	dctx = FIPS_get_default_drbg();
+        if (FIPS_drbg_init(dctx, fips_drbg_type, fips_drbg_flags) <= 0)
+		{
+		RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INITIALISING_DRBG);
+		return 0;
+		}
+		
+        FIPS_drbg_set_callbacks(dctx,
+				drbg_get_entropy, drbg_free_entropy, 20,
+				drbg_get_entropy, drbg_free_entropy);
+	FIPS_drbg_set_rand_callbacks(dctx, drbg_get_adin, 0,
+					drbg_rand_seed, drbg_rand_add);
+	/* Personalisation string: a string followed by date time vector */
+	strcpy((char *)pers, "OpenSSL DRBG2.0");
+	plen = drbg_get_adin(dctx, &p);
+	memcpy(pers + 16, p, plen);
+
+        if (FIPS_drbg_instantiate(dctx, pers, sizeof(pers)) <= 0)
+		{
+		RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INSTANTIATING_DRBG);
+		return 0;
+		}
+        FIPS_rand_set_method(FIPS_drbg_method());
+	return 1;
+	}
+
+#endif

diff --git a/crypto/rc2/rc2.h b/crypto/rc2/rc2.h
index 34c8362..e542ec9 100644
--- a/crypto/rc2/rc2.h
+++ b/crypto/rc2/rc2.h

@@ -79,7 +79,9 @@
 	RC2_INT data[64];
 	} RC2_KEY;
 
- 
+#ifdef OPENSSL_FIPS 
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
+#endif
 void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
 void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key,
 		     int enc);

diff --git a/crypto/rc2/rc2_skey.c b/crypto/rc2/rc2_skey.c
index 0150b0e..6668ac0 100644
--- a/crypto/rc2/rc2_skey.c
+++ b/crypto/rc2/rc2_skey.c

@@ -56,6 +56,7 @@
  * [including the GNU Public Licence.]
  */
 
+#include <openssl/crypto.h>
 #include <openssl/rc2.h>
 #include "rc2_locl.h"
 
@@ -95,6 +96,13 @@
  * the same as specifying 1024 for the 'bits' parameter.  Bsafe uses
  * a version where the bits parameter is the same as len*8 */
 void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(RC2);
+	private_RC2_set_key(key, len, data, bits);
+	}
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#endif
 	{
 	int i,j;
 	unsigned char *k;

diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl
index 38a44a7..5c9ac6a 100644
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl

@@ -28,6 +28,34 @@
 #
 #					<[email protected]>
 
+# May 2011
+#
+# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
+# performance in cycles per processed byte (less is better) and
+# improvement relative to previous version of this module is:
+#
+# Pentium	10.2			# original numbers
+# Pentium III	7.8(*)
+# Intel P4	7.5
+#
+# Opteron	6.1/+20%		# new MMX numbers
+# Core2		5.3/+67%(**)
+# Westmere	5.1/+94%(**)
+# Sandy Bridge	5.0/+8%
+# Atom		12.6/+6%
+#
+# (*)	PIII can actually deliver 6.6 cycles per byte with MMX code,
+#	but this specific code performs poorly on Core2. And vice
+#	versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
+#	poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
+#	[anymore], I chose to discard PIII-specific code path and opt
+#	for original IALU-only code, which is why MMX/SSE code path
+#	is guarded by SSE2 bit (see below), not MMX/SSE.
+# (**)	Performance vs. block size on Core2 and Westmere had a maximum
+#	at ... 64 bytes block size. And it was quite a maximum, 40-60%
+#	in comparison to largest 8KB block size. Above improvement
+#	coefficients are for the largest block size.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
@@ -62,6 +90,68 @@
 	&$func	($out,&DWP(0,$dat,$ty,4));
 }
 
+if ($alt=0) {
+  # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
+  # but ~40% slower on Core2 and Westmere... Attempt to add movz
+  # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
+  # on Core2 with movz it's almost 20% slower than below alternative
+  # code... Yes, it's a total mess...
+  my @XX=($xx,$out);
+  $RC4_loop_mmx = sub {		# SSE actually...
+    my $i=shift;
+    my $j=$i<=0?0:$i>>1;
+    my $mm=$i<=0?"mm0":"mm".($i&1);
+
+	&add	(&LB($yy),&LB($tx));
+	&lea	(@XX[1],&DWP(1,@XX[0]));
+	&pxor	("mm2","mm0")				if ($i==0);
+	&psllq	("mm1",8)				if ($i==0);
+	&and	(@XX[1],0xff);
+	&pxor	("mm0","mm0")				if ($i<=0);
+	&mov	($ty,&DWP(0,$dat,$yy,4));
+	&mov	(&DWP(0,$dat,$yy,4),$tx);
+	&pxor	("mm1","mm2")				if ($i==0);
+	&mov	(&DWP(0,$dat,$XX[0],4),$ty);
+	&add	(&LB($ty),&LB($tx));
+	&movd	(@XX[0],"mm7")				if ($i==0);
+	&mov	($tx,&DWP(0,$dat,@XX[1],4));
+	&pxor	("mm1","mm1")				if ($i==1);
+	&movq	("mm2",&QWP(0,$inp))			if ($i==1);
+	&movq	(&QWP(-8,(@XX[0],$inp)),"mm1")		if ($i==0);
+	&pinsrw	($mm,&DWP(0,$dat,$ty,4),$j);
+
+	push	(@XX,shift(@XX))			if ($i>=0);
+  }
+} else {
+  # Using pinsrw here improves performane on Intel CPUs by 2-3%, but
+  # brings down AMD by 7%...
+  $RC4_loop_mmx = sub {
+    my $i=shift;
+
+	&add	(&LB($yy),&LB($tx));
+	&psllq	("mm1",8*(($i-1)&7))			if (abs($i)!=1);
+	&mov	($ty,&DWP(0,$dat,$yy,4));
+	&mov	(&DWP(0,$dat,$yy,4),$tx);
+	&mov	(&DWP(0,$dat,$xx,4),$ty);
+	&inc	($xx);
+	&add	($ty,$tx);
+	&movz	($xx,&LB($xx));				# (*)
+	&movz	($ty,&LB($ty));				# (*)
+	&pxor	("mm2",$i==1?"mm0":"mm1")		if ($i>=0);
+	&movq	("mm0",&QWP(0,$inp))			if ($i<=0);
+	&movq	(&QWP(-8,($out,$inp)),"mm2")		if ($i==0);
+	&mov	($tx,&DWP(0,$dat,$xx,4));
+	&movd	($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
+
+	# (*)	This is the key to Core2 and Westmere performance.
+	#	Whithout movz out-of-order execution logic confuses
+	#	itself and fails to reorder loads and stores. Problem
+	#	appears to be fixed in Sandy Bridge...
+  }
+}
+
+&external_label("OPENSSL_ia32cap_P");
+
 # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
 &function_begin("RC4");
 	&mov	($dat,&wparam(0));	# load key schedule pointer
@@ -94,11 +184,56 @@
 	&and	($ty,-4);		# how many 4-byte chunks?
 	&jz	(&label("loop1"));
 
+	&test	($ty,-8);
+	&mov	(&wparam(3),$out);	# $out as accumulator in these loops
+	&jz	(&label("go4loop4"));
+
+	&picmeup($out,"OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,$out),26);	# check SSE2 bit [could have been MMX]
+	&jnc	(&label("go4loop4"));
+
+	&mov	($out,&wparam(3))	if (!$alt);
+	&movd	("mm7",&wparam(3))	if ($alt);
+	&and	($ty,-8);
+	&lea	($ty,&DWP(-8,$inp,$ty));
+	&mov	(&DWP(-4,$dat),$ty);	# save input+(len/8)*8-8
+
+	&$RC4_loop_mmx(-1);
+	&jmp(&label("loop_mmx_enter"));
+
+	&set_label("loop_mmx",16);
+		&$RC4_loop_mmx(0);
+	&set_label("loop_mmx_enter");
+		for 	($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
+		&mov	($ty,$yy);
+		&xor	($yy,$yy);		# this is second key to Core2
+		&mov	(&LB($yy),&LB($ty));	# and Westmere performance...
+		&cmp	($inp,&DWP(-4,$dat));
+		&lea	($inp,&DWP(8,$inp));
+	&jb	(&label("loop_mmx"));
+
+    if ($alt) {
+	&movd	($out,"mm7");
+	&pxor	("mm2","mm0");
+	&psllq	("mm1",8);
+	&pxor	("mm1","mm2");
+	&movq	(&QWP(-8,$out,$inp),"mm1");
+    } else {
+	&psllq	("mm1",56);
+	&pxor	("mm2","mm1");
+	&movq	(&QWP(-8,$out,$inp),"mm2");
+    }
+	&emms	();
+
+	&cmp	($inp,&wparam(1));	# compare to input+len
+	&je	(&label("done"));
+	&jmp	(&label("loop1"));
+
+&set_label("go4loop4",16);
 	&lea	($ty,&DWP(-4,$inp,$ty));
 	&mov	(&wparam(2),$ty);	# save input+(len/4)*4-4
-	&mov	(&wparam(3),$out);	# $out as accumulator in this loop
 
-	&set_label("loop4",16);
+	&set_label("loop4");
 		for ($i=0;$i<4;$i++) { RC4_loop($i); }
 		&ror	($out,8);
 		&xor	($out,&DWP(0,$inp));
@@ -151,7 +286,7 @@
 
 &set_label("done");
 	&dec	(&LB($xx));
-	&mov	(&BP(-4,$dat),&LB($yy));	# save key->y
+	&mov	(&DWP(-4,$dat),$yy);		# save key->y
 	&mov	(&BP(-8,$dat),&LB($xx));	# save key->x
 &set_label("abort");
 &function_end("RC4");
@@ -164,10 +299,8 @@
 $ido="ecx";
 $idx="edx";
 
-&external_label("OPENSSL_ia32cap_P");
-
 # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
-&function_begin("RC4_set_key");
+&function_begin("private_RC4_set_key");
 	&mov	($out,&wparam(0));		# load key
 	&mov	($idi,&wparam(1));		# load len
 	&mov	($inp,&wparam(2));		# load data
@@ -245,7 +378,7 @@
 	&xor	("eax","eax");
 	&mov	(&DWP(-8,$out),"eax");		# key->x=0;
 	&mov	(&DWP(-4,$out),"eax");		# key->y=0;
-&function_end("RC4_set_key");
+&function_end("private_RC4_set_key");
 
 # const char *RC4_options(void);
 &function_begin_B("RC4_options");
@@ -254,14 +387,21 @@
 	&blindpop("eax");
 	&lea	("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
 	&picmeup("edx","OPENSSL_ia32cap_P");
-	&bt	(&DWP(0,"edx"),20);
-	&jnc	(&label("skip"));
-	  &add	("eax",12);
-	&set_label("skip");
+	&mov	("edx",&DWP(0,"edx"));
+	&bt	("edx",20);
+	&jc	(&label("1xchar"));
+	&bt	("edx",26);
+	&jnc	(&label("ret"));
+	&add	("eax",25);
+	&ret	();
+&set_label("1xchar");
+	&add	("eax",12);
+&set_label("ret");
 	&ret	();
 &set_label("opts",64);
 &asciz	("rc4(4x,int)");
 &asciz	("rc4(1x,char)");
+&asciz	("rc4(8x,mmx)");
 &asciz	("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &align	(64);
 &function_end_B("RC4_options");

diff --git a/crypto/rc4/asm/rc4-md5-x86_64.pl b/crypto/rc4/asm/rc4-md5-x86_64.pl
new file mode 100644
index 0000000..7f68409
--- /dev/null
+++ b/crypto/rc4/asm/rc4-md5-x86_64.pl

@@ -0,0 +1,631 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# June 2011
+#
+# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
+# http://download.intel.com/design/intarch/papers/323686.pdf, is that
+# since both algorithms exhibit instruction-level parallelism, ILP,
+# below theoretical maximum, interleaving them would allow to utilize
+# processor resources better and achieve better performance. RC4
+# instruction sequence is virtually identical to rc4-x86_64.pl, which
+# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
+# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
+# minimize register usage, which was used as "main thread" with RC4
+# weaved into it, one RC4 round per one MD5 round. In addition to the
+# stiched subroutine the script can generate standalone replacement
+# md5_block_asm_data_order and RC4. Below are performance numbers in
+# cycles per processed byte, less is better, for these the standalone
+# subroutines, sum of them, and stitched one:
+#
+#		RC4	MD5	RC4+MD5	stitch	gain
+# Opteron	6.5(*)	5.4	11.9	7.0	+70%(*)
+# Core2		6.5	5.8	12.3	7.7	+60%
+# Westmere	4.3	5.2	9.5	7.0	+36%
+# Sandy Bridge	4.2	5.5	9.7	6.8	+43%
+# Atom		9.3	6.5	15.8	11.1	+42%
+#
+# (*)	rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
+#	is +53%...
+
+my ($rc4,$md5)=(1,1);	# what to generate?
+my $D="#" if (!$md5);	# if set to "#", MD5 is stitched into RC4(),
+			# but its result is discarded. Idea here is
+			# to be able to use 'openssl speed rc4' for
+			# benchmarking the stitched subroutine... 
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
+
+if ($rc4 && !$md5) {
+  ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
+  $func="RC4";				$nargs=4;
+} elsif ($md5 && !$rc4) {
+  ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
+  $func="md5_block_asm_data_order";	$nargs=3;
+} else {
+  ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+  $func="rc4_md5_enc";			$nargs=6;
+  # void rc4_md5_enc(
+  #		RC4_KEY *key,		#
+  #		const void *in0,	# RC4 input
+  #		void *out,		# RC4 output
+  #		MD5_CTX *ctx,		#
+  #		const void *inp,	# MD5 input
+  #		size_t len);		# number of 64-byte blocks
+}
+
+my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+	0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391	);
+
+my @V=("%r8d","%r9d","%r10d","%r11d");	# MD5 registers
+my $tmp="%r12d";
+
+my @XX=("%rbp","%rsi");			# RC4 registers
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+my $MOD=32;				# 16, 32 or 64
+
+$code.=<<___;
+.text
+.align 16
+
+.globl	$func
+.type	$func,\@function,$nargs
+$func:
+	cmp	\$0,$len
+	je	.Labort
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$40,%rsp
+.Lbody:
+___
+if ($rc4) {
+$code.=<<___;
+$D#md5#	mov	$ctx,%r11		# reassign arguments
+	mov	$len,%r12
+	mov	$in0,%r13
+	mov	$out,%r14
+$D#md5#	mov	$inp,%r15
+___
+    $ctx="%r11"	if ($md5);		# reassign arguments
+    $len="%r12";
+    $in0="%r13";
+    $out="%r14";
+    $inp="%r15"	if ($md5);
+    $inp=$in0	if (!$md5);
+$code.=<<___;
+	xor	$XX[0],$XX[0]
+	xor	$YY,$YY
+
+	lea	8($dat),$dat
+	mov	-8($dat),$XX[0]#b
+	mov	-4($dat),$YY#b
+
+	inc	$XX[0]#b
+	sub	$in0,$out
+	movl	($dat,$XX[0],4),$TX[0]#d
+___
+$code.=<<___ if (!$md5);
+	xor	$TX[1],$TX[1]
+	test	\$-128,$len
+	jz	.Loop1
+	sub	$XX[0],$TX[1]
+	and	\$`$MOD-1`,$TX[1]
+	jz	.Loop${MOD}_is_hot
+	sub	$TX[1],$len
+.Loop${MOD}_warmup:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($in0),$TY#b
+	movb	$TY#b,($out,$in0)
+	lea	1($in0),$in0
+	dec	$TX[1]
+	jnz	.Loop${MOD}_warmup
+
+	mov	$YY,$TX[1]
+	xor	$YY,$YY
+	mov	$TX[1]#b,$YY#b
+
+.Loop${MOD}_is_hot:
+	mov	$len,32(%rsp)		# save original $len
+	shr	\$6,$len		# number of 64-byte blocks
+___
+  if ($D && !$md5) {			# stitch in dummy MD5
+    $md5=1;
+    $ctx="%r11";
+    $inp="%r15";
+    $code.=<<___;
+	mov	%rsp,$ctx
+	mov	$in0,$inp
+___
+  }
+}
+$code.=<<___;
+#rc4#	add	$TX[0]#b,$YY#b
+#rc4#	lea	($dat,$XX[0],4),$XX[1]
+	shl	\$6,$len
+	add	$inp,$len		# pointer to the end of input
+	mov	$len,16(%rsp)
+
+#md5#	mov	$ctx,24(%rsp)		# save pointer to MD5_CTX
+#md5#	mov	0*4($ctx),$V[0]		# load current hash value from MD5_CTX
+#md5#	mov	1*4($ctx),$V[1]
+#md5#	mov	2*4($ctx),$V[2]
+#md5#	mov	3*4($ctx),$V[3]
+	jmp	.Loop
+
+.align	16
+.Loop:
+#md5#	mov	$V[0],0*4(%rsp)		# put aside current hash value
+#md5#	mov	$V[1],1*4(%rsp)
+#md5#	mov	$V[2],2*4(%rsp)
+#md5#	mov	$V[3],$tmp		# forward reference
+#md5#	mov	$V[3],3*4(%rsp)
+___
+
+sub R0 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot0=(7,12,17,22);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	($in0),%xmm2\n"		if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	and	$b,$tmp
+#md5#	add	4*`$j`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#md5#	xor	$d,$tmp
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot0[$j%4],$a
+#md5#	mov	`$j==15?"$b":"$c"`,$tmp		# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm1,%xmm2
+___
+}
+sub R1 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot1=(5,9,14,20);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	16($in0),%xmm3\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$b,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	and	$d,$tmp
+#md5#	add	4*`((1+5*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#md5#	xor	$c,$tmp
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot1[$j%4],$a
+#md5#	mov	`$j==15?"$c":"$b"`,$tmp		# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+___
+}
+sub R2 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot2=(4,11,16,23);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	32($in0),%xmm4\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	xor	$b,$tmp
+#md5#	add	4*`((5+3*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#md5#	add	$tmp,$a
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot2[$j%4],$a
+#md5#	mov	`$j==15?"\\\$-1":"$c"`,$tmp	# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm1,%xmm4
+___
+}
+sub R3 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot3=(6,10,15,21);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	48($in0),%xmm5\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$d,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	or	$b,$tmp
+#md5#	add	4*`((7*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot3[$j%4],$a
+#md5#	mov	\$-1,$tmp			# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	mov	$XX[0],$XX[1]
+	xor	$XX[0],$XX[0]			# keyword to partial register
+	mov	$XX[1]#b,$XX[0]#b
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm5
+	pxor	%xmm1,%xmm5
+___
+}
+
+my $i=0;
+for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+
+$code.=<<___;
+#md5#	add	0*4(%rsp),$V[0]		# accumulate hash value
+#md5#	add	1*4(%rsp),$V[1]
+#md5#	add	2*4(%rsp),$V[2]
+#md5#	add	3*4(%rsp),$V[3]
+
+#rc4#	movdqu	%xmm2,($out,$in0)	# write RC4 output
+#rc4#	movdqu	%xmm3,16($out,$in0)
+#rc4#	movdqu	%xmm4,32($out,$in0)
+#rc4#	movdqu	%xmm5,48($out,$in0)
+#md5#	lea	64($inp),$inp
+#rc4#	lea	64($in0),$in0
+	cmp	16(%rsp),$inp		# are we done?
+	jb	.Loop
+
+#md5#	mov	24(%rsp),$len		# restore pointer to MD5_CTX
+#rc4#	sub	$TX[0]#b,$YY#b		# correct $YY
+#md5#	mov	$V[0],0*4($len)		# write MD5_CTX
+#md5#	mov	$V[1],1*4($len)
+#md5#	mov	$V[2],2*4($len)
+#md5#	mov	$V[3],3*4($len)
+___
+$code.=<<___ if ($rc4 && (!$md5 || $D));
+	mov	32(%rsp),$len		# restore original $len
+	and	\$63,$len		# remaining bytes
+	jnz	.Loop1
+	jmp	.Ldone
+	
+.align	16
+.Loop1:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($in0),$TY#b
+	movb	$TY#b,($out,$in0)
+	lea	1($in0),$in0
+	dec	$len
+	jnz	.Loop1
+
+.Ldone:
+___
+$code.=<<___;
+#rc4#	sub	\$1,$XX[0]#b
+#rc4#	movl	$XX[0]#d,-8($dat)
+#rc4#	movl	$YY#d,-4($dat)
+
+	mov	40(%rsp),%r15
+	mov	48(%rsp),%r14
+	mov	56(%rsp),%r13
+	mov	64(%rsp),%r12
+	mov	72(%rsp),%rbp
+	mov	80(%rsp),%rbx
+	lea	88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+	ret
+.size $func,.-$func
+___
+
+if ($rc4 && $D) {	# sole purpose of this section is to provide
+			# option to use the generated module as drop-in
+			# replacement for rc4-x86_64.pl for debugging
+			# and testing purposes...
+my ($idx,$ido)=("%r8","%r9");
+my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+	jmp	.Lw1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@abi-omnipotent
+.align	16
+RC4_options:
+	lea	.Lopts(%rip),%rax
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(64x,int)"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lbody
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue
+
+	mov	40(%rax),%r15
+	mov	48(%rax),%r14
+	mov	56(%rax),%r13
+	mov	64(%rax),%r12
+	mov	72(%rax),%rbp
+	mov	80(%rax),%rbx
+	lea	88(%rax),%rax
+
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R12
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_$func
+	.rva	.LSEH_end_$func
+	.rva	.LSEH_info_$func
+
+.section	.xdata
+.align	8
+.LSEH_info_$func:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)     { $reg .= $conv; }
+    elsif ($conv eq "b")        { $reg =~ s/%[er]([^x]+)x?/%$1l/;       }
+    elsif ($conv eq "w")        { $reg =~ s/%[er](.+)/%$1/;             }
+    elsif ($conv eq "d")        { $reg =~ s/%[er](.+)/%e$1/;            }
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/pinsrw\s+\$0,/movd	/gm;
+
+$code =~ s/#md5#//gm	if ($md5);
+$code =~ s/#rc4#//gm	if ($rc4);
+
+print $code;
+
+close STDOUT;

diff --git a/crypto/rc4/asm/rc4-parisc.pl b/crypto/rc4/asm/rc4-parisc.pl
new file mode 100644
index 0000000..9165067
--- /dev/null
+++ b/crypto/rc4/asm/rc4-parisc.pl

@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# RC4 for PA-RISC.
+
+# June 2009.
+#
+# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
+# For reference, [4x] unrolled loop is >40% faster than folded one.
+# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
+# is believed to be not sufficient to justify the effort...
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
+				#                [+ argument transfer]
+$SZ=1;				# defaults to RC4_CHAR
+if (open CONF,"<${dir}../../opensslconf.h") {
+    while(<CONF>) {
+	if (m/#\s*define\s+RC4_INT\s+(.*)/) {
+	    $SZ = ($1=~/char$/) ? 1 : 4;
+	    last;
+	}
+    }
+    close CONF;
+}
+
+if ($SZ==1) {	# RC4_CHAR
+    $LD="ldb";
+    $LDX="ldbx";
+    $MKX="addl";
+    $ST="stb";
+} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
+    $LD="ldw";
+    $LDX="ldwx,s";
+    $MKX="sh2addl";
+    $ST="stw";
+}
+
+$key="%r26";
+$len="%r25";
+$inp="%r24";
+$out="%r23";
+
+@XX=("%r19","%r20");
+@TX=("%r21","%r22");
+$YY="%r28";
+$TY="%r29";
+
+$acc="%r1";
+$ix="%r2";
+$iy="%r3";
+$dat0="%r4";
+$dat1="%r5";
+$rem="%r6";
+$mask="%r31";
+
+sub unrolledloopbody {
+for ($i=0;$i<4;$i++) {
+$code.=<<___;
+	ldo	1($XX[0]),$XX[1]
+	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`	
+	and	$mask,$XX[1],$XX[1]
+	$LDX	$YY($key),$TY
+	$MKX	$YY,$key,$ix
+	$LDX	$XX[1]($key),$TX[1]
+	$MKX	$XX[0],$key,$iy
+	$ST	$TX[0],0($ix)
+	comclr,<> $XX[1],$YY,%r0	; conditional
+	copy	$TX[0],$TX[1]		; move
+	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
+	$ST	$TY,0($iy)
+	addl	$TX[0],$TY,$TY
+	addl	$TX[1],$YY,$YY
+	and	$mask,$TY,$TY
+	and	$mask,$YY,$YY
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+} }
+
+sub foldedloop {
+my ($label,$count)=@_;
+$code.=<<___;
+$label
+	$MKX	$YY,$key,$iy
+	$LDX	$YY($key),$TY
+	$MKX	$XX[0],$key,$ix
+	$ST	$TX[0],0($iy)
+	ldo	1($XX[0]),$XX[0]
+	$ST	$TY,0($ix)
+	addl	$TX[0],$TY,$TY
+	ldbx	$inp($out),$dat1
+	and	$mask,$TY,$TY
+	and	$mask,$XX[0],$XX[0]
+	$LDX	$TY($key),$acc
+	$LDX	$XX[0]($key),$TX[0]
+	ldo	1($out),$out
+	xor	$dat1,$acc,$acc
+	addl	$TX[0],$YY,$YY
+	stb	$acc,-1($out)
+	addib,<> -1,$count,$label	; $count is always small
+	and	$mask,$YY,$YY
+___
+}
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+RC4
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+
+	cmpib,*= 0,$len,L\$abort
+	sub	$inp,$out,$inp		; distance between $inp and $out
+
+	$LD	`0*$SZ`($key),$XX[0]
+	$LD	`1*$SZ`($key),$YY
+	ldo	`2*$SZ`($key),$key
+
+	ldi	0xff,$mask
+	ldi	3,$dat0		
+
+	ldo	1($XX[0]),$XX[0]	; warm up loop
+	and	$mask,$XX[0],$XX[0]
+	$LDX	$XX[0]($key),$TX[0]
+	addl	$TX[0],$YY,$YY
+	cmpib,*>>= 6,$len,L\$oop1	; is $len large enough to bother?
+	and	$mask,$YY,$YY
+
+	and,<>	$out,$dat0,$rem		; is $out aligned?
+	b	L\$alignedout
+	subi	4,$rem,$rem
+	sub	$len,$rem,$len
+___
+&foldedloop("L\$alignout",$rem);	# process till $out is aligned
+
+$code.=<<___;
+L\$alignedout				; $len is at least 4 here
+	and,<>	$inp,$dat0,$acc		; is $inp aligned?
+	b	L\$oop4
+	sub	$inp,$acc,$rem		; align $inp
+
+	sh3addl	$acc,%r0,$acc
+	subi	32,$acc,$acc
+	mtctl	$acc,%cr11		; load %sar with vshd align factor
+	ldwx	$rem($out),$dat0
+	ldo	4($rem),$rem
+L\$oop4misalignedinp
+___
+&unrolledloopbody();
+$code.=<<___;
+	$LDX	$TY($key),$ix
+	ldwx	$rem($out),$dat1
+	ldo	-4($len),$len
+	or	$ix,$acc,$acc		; last piece, no need to dep
+	vshd	$dat0,$dat1,$iy		; align data
+	copy	$dat1,$dat0
+	xor	$iy,$acc,$acc
+	stw	$acc,0($out)
+	cmpib,*<< 3,$len,L\$oop4misalignedinp
+	ldo	4($out),$out
+	cmpib,*= 0,$len,L\$done
+	nop
+	b	L\$oop1
+	nop
+
+	.ALIGN	8
+L\$oop4
+___
+&unrolledloopbody();
+$code.=<<___;
+	$LDX	$TY($key),$ix
+	ldwx	$inp($out),$dat0
+	ldo	-4($len),$len
+	or	$ix,$acc,$acc		; last piece, no need to dep
+	xor	$dat0,$acc,$acc
+	stw	$acc,0($out)
+	cmpib,*<< 3,$len,L\$oop4
+	ldo	4($out),$out
+	cmpib,*= 0,$len,L\$done
+	nop
+___
+&foldedloop("L\$oop1",$len);
+$code.=<<___;
+L\$done
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
+	ldo	-1($XX[0]),$XX[0]	; chill out loop
+	sub	$YY,$TX[0],$YY
+	and	$mask,$XX[0],$XX[0]
+	and	$mask,$YY,$YY
+	$ST	$XX[0],`-2*$SZ`($key)
+	$ST	$YY,`-1*$SZ`($key)
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+L\$abort
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+___
+
+$code.=<<___;
+
+	.EXPORT	private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	8
+private_RC4_set_key
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	$ST	%r0,`0*$SZ`($key)
+	$ST	%r0,`1*$SZ`($key)
+	ldo	`2*$SZ`($key),$key
+	copy	%r0,@XX[0]
+L\$1st
+	$ST	@XX[0],0($key)
+	ldo	1(@XX[0]),@XX[0]
+	bb,>=	@XX[0],`31-8`,L\$1st	; @XX[0]<256
+	ldo	$SZ($key),$key
+
+	ldo	`-256*$SZ`($key),$key	; rewind $key
+	addl	$len,$inp,$inp		; $inp to point at the end
+	sub	%r0,$len,%r23		; inverse index
+	copy	%r0,@XX[0]
+	copy	%r0,@XX[1]
+	ldi	0xff,$mask
+
+L\$2nd
+	$LDX	@XX[0]($key),@TX[0]
+	ldbx	%r23($inp),@TX[1]
+	addi,nuv 1,%r23,%r23		; increment and conditional
+	sub	%r0,$len,%r23		; inverse index
+	addl	@TX[0],@XX[1],@XX[1]
+	addl	@TX[1],@XX[1],@XX[1]
+	and	$mask,@XX[1],@XX[1]
+	$MKX	@XX[0],$key,$TY
+	$LDX	@XX[1]($key),@TX[1]
+	$MKX	@XX[1],$key,$YY
+	ldo	1(@XX[0]),@XX[0]
+	$ST	@TX[0],0($YY)
+	bb,>=	@XX[0],`31-8`,L\$2nd	; @XX[0]<256
+	$ST	@TX[1],0($TY)
+
+	bv,n	(%r2)
+	.EXIT
+	nop
+	.PROCEND
+
+	.EXPORT	RC4_options,ENTRY
+	.ALIGN	8
+RC4_options
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	blr	%r0,%r28
+	ldi	3,%r1
+L\$pic
+	andcm	%r28,%r1,%r28
+	bv	(%r2)
+	.EXIT
+	ldo	L\$opts-L\$pic(%r28),%r28
+	.PROCEND
+	.ALIGN	8
+L\$opts
+	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
+	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+
+print $code;
+close STDOUT;

diff --git a/crypto/rc4/asm/rc4-s390x.pl b/crypto/rc4/asm/rc4-s390x.pl
index 96681fa..1aa7548 100644
--- a/crypto/rc4/asm/rc4-s390x.pl
+++ b/crypto/rc4/asm/rc4-s390x.pl

@@ -13,6 +13,29 @@
 # "cluster" Address Generation Interlocks, so that one pipeline stall
 # resolves several dependencies.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 50% better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
 $rp="%r14";
 $sp="%r15";
 $code=<<___;
@@ -39,7 +62,12 @@
 .type	RC4,\@function
 .align	64
 RC4:
-	stmg	%r6,%r11,48($sp)
+	stm${g}	%r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+	llgfr	$len,$len
+___
+$code.=<<___;
 	llgc	$XX[0],0($key)
 	llgc	$YY,1($key)
 	la	$XX[0],1($XX[0])
@@ -90,7 +118,7 @@
 	xgr	$acc,$TX[1]
 	stg	$acc,0($out)
 	la	$out,8($out)
-	brct	$cnt,.Loop8
+	brctg	$cnt,.Loop8
 
 .Lshort:
 	lghi	$acc,7
@@ -122,7 +150,7 @@
 	ahi	$XX[0],-1
 	stc	$XX[0],0($key)
 	stc	$YY,1($key)
-	lmg	%r6,%r11,48($sp)
+	lm${g}	%r6,%r11,6*$SIZE_T($sp)
 	br	$rp
 .size	RC4,.-RC4
 .string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
@@ -147,7 +175,7 @@
 .type	RC4_set_key,\@function
 .align	64
 RC4_set_key:
-	stmg	%r6,%r8,48($sp)
+	stm${g}	%r6,%r8,6*$SIZE_T($sp)
 	lhi	$cnt,256
 	la	$idx,0(%r0)
 	sth	$idx,0($key)
@@ -180,7 +208,7 @@
 	la	$iinp,0(%r0)
 	j	.L2ndloop
 .Ldone:
-	lmg	%r6,%r8,48($sp)
+	lm${g}	%r6,%r8,6*$SIZE_T($sp)
 	br	$rp
 .size	RC4_set_key,.-RC4_set_key
 
@@ -203,3 +231,4 @@
 ___
 
 print $code;
+close STDOUT;	# force flush

diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 677be5f..d6eac20 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl

@@ -7,6 +7,8 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
+# July 2004
+#
 # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
 # "hand-coded assembler"] doesn't stand for the whole improvement
 # coefficient. It turned out that eliminating RC4_CHAR from config
@@ -19,6 +21,8 @@
 # to operate on partial registers, it turned out to be the best bet.
 # At least for AMD... How IA32E would perform remains to be seen...
 
+# November 2004
+#
 # As was shown by Marc Bevand reordering of couple of load operations
 # results in even higher performance gain of 3.3x:-) At least on
 # Opteron... For reference, 1x in this case is RC4_CHAR C-code
@@ -26,6 +30,8 @@
 # Latter means that if you want to *estimate* what to expect from
 # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
 
+# November 2004
+#
 # Intel P4 EM64T core was found to run the AMD64 code really slow...
 # The only way to achieve comparable performance on P4 was to keep
 # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
@@ -33,10 +39,14 @@
 # on either AMD and Intel platforms, I implement both cases. See
 # rc4_skey.c for further details...
 
+# April 2005
+#
 # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing 
 # those with add/sub results in 50% performance improvement of folded
 # loop...
 
+# May 2005
+#
 # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
 # performance by >30% [unlike P4 32-bit case that is]. But this is
 # provided that loads are reordered even more aggressively! Both code
@@ -50,6 +60,8 @@
 # is not implemented, then this final RC4_CHAR code-path should be
 # preferred, as it provides better *all-round* performance].
 
+# March 2007
+#
 # Intel Core2 was observed to perform poorly on both code paths:-( It
 # apparently suffers from some kind of partial register stall, which
 # occurs in 64-bit mode only [as virtually identical 32-bit loop was
@@ -58,6 +70,37 @@
 # fit for Core2 and therefore the code was modified to skip cloop8 on
 # this CPU.
 
+# May 2010
+#
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
+# May 2011
+#
+# The only code path that was not modified is P4-specific one. Non-P4
+# Intel code path optimization is heavily based on submission by Maxim
+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
+# some of the ideas even in attempt to optmize the original RC4_INT
+# code path... Current performance in cycles per processed byte (less
+# is better) and improvement coefficients relative to previous
+# version of this module are:
+#
+# Opteron	5.3/+0%(*)
+# P4		6.5
+# Core2		6.2/+15%(**)
+# Westmere	4.2/+60%
+# Sandy Bridge	4.2/+120%
+# Atom		9.3/+80%
+#
+# (*)	But corresponding loop has less instructions, which should have
+#	positive effect on upcoming Bulldozer, which has one less ALU.
+#	For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**)	Note that Core2 result is ~15% lower than corresponding result
+#	for 32-bit code, meaning that it's possible to improve it,
+#	but more than likely at the cost of the others (see rc4-586.pl
+#	to get the idea)...
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -76,13 +119,10 @@
 $inp="%rdx";	    # arg3
 $out="%rcx";	    # arg4
 
-@XX=("%r8","%r10");
-@TX=("%r9","%r11");
-$YY="%r12";
-$TY="%r13";
-
+{
 $code=<<___;
 .text
+.extern	OPENSSL_ia32cap_P
 
 .globl	RC4
 .type	RC4,\@function,4
@@ -95,48 +135,173 @@
 	push	%r12
 	push	%r13
 .Lprologue:
+	mov	$len,%r11
+	mov	$inp,%r12
+	mov	$out,%r13
+___
+my $len="%r11";		# reassign input arguments
+my $inp="%r12";
+my $out="%r13";
 
-	add	\$8,$dat
-	movl	-8($dat),$XX[0]#d
-	movl	-4($dat),$YY#d
+my @XX=("%r10","%rsi");
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+$code.=<<___;
+	xor	$XX[0],$XX[0]
+	xor	$YY,$YY
+
+	lea	8($dat),$dat
+	mov	-8($dat),$XX[0]#b
+	mov	-4($dat),$YY#b
 	cmpl	\$-1,256($dat)
 	je	.LRC4_CHAR
+	mov	OPENSSL_ia32cap_P(%rip),%r8d
+	xor	$TX[1],$TX[1]
 	inc	$XX[0]#b
+	sub	$XX[0],$TX[1]
+	sub	$inp,$out
 	movl	($dat,$XX[0],4),$TX[0]#d
-	test	\$-8,$len
+	test	\$-16,$len
 	jz	.Lloop1
-	jmp	.Lloop8
+	bt	\$30,%r8d	# Intel CPU?
+	jc	.Lintel
+	and	\$7,$TX[1]
+	lea	1($XX[0]),$XX[1]
+	jz	.Loop8
+	sub	$TX[1],$len
+.Loop8_warmup:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($inp),$TY#b
+	movb	$TY#b,($out,$inp)
+	lea	1($inp),$inp
+	dec	$TX[1]
+	jnz	.Loop8_warmup
+
+	lea	1($XX[0]),$XX[1]
+	jmp	.Loop8
 .align	16
-.Lloop8:
+.Loop8:
 ___
 for ($i=0;$i<8;$i++) {
+$code.=<<___ if ($i==7);
+	add	\$8,$XX[1]#b
+___
 $code.=<<___;
 	add	$TX[0]#b,$YY#b
-	mov	$XX[0],$XX[1]
 	movl	($dat,$YY,4),$TY#d
-	ror	\$8,%rax			# ror is redundant when $i=0
-	inc	$XX[1]#b
-	movl	($dat,$XX[1],4),$TX[1]#d
-	cmp	$XX[1],$YY
 	movl	$TX[0]#d,($dat,$YY,4)
-	cmove	$TX[0],$TX[1]
-	movl	$TY#d,($dat,$XX[0],4)
+	movl	`4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
+	ror	\$8,%r8				# ror is redundant when $i=0
+	movl	$TY#d,4*$i($dat,$XX[0],4)
 	add	$TX[0]#b,$TY#b
-	movb	($dat,$TY,4),%al
+	movb	($dat,$TY,4),%r8b
 ___
-push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+push(@TX,shift(@TX)); #push(@XX,shift(@XX));	# "rotate" registers
 }
 $code.=<<___;
-	ror	\$8,%rax
+	add	\$8,$XX[0]#b
+	ror	\$8,%r8
 	sub	\$8,$len
 
-	xor	($inp),%rax
-	add	\$8,$inp
-	mov	%rax,($out)
-	add	\$8,$out
+	xor	($inp),%r8
+	mov	%r8,($out,$inp)
+	lea	8($inp),$inp
 
 	test	\$-8,$len
-	jnz	.Lloop8
+	jnz	.Loop8
+	cmp	\$0,$len
+	jne	.Lloop1
+	jmp	.Lexit
+
+.align	16
+.Lintel:
+	test	\$-32,$len
+	jz	.Lloop1
+	and	\$15,$TX[1]
+	jz	.Loop16_is_hot
+	sub	$TX[1],$len
+.Loop16_warmup:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($inp),$TY#b
+	movb	$TY#b,($out,$inp)
+	lea	1($inp),$inp
+	dec	$TX[1]
+	jnz	.Loop16_warmup
+
+	mov	$YY,$TX[1]
+	xor	$YY,$YY
+	mov	$TX[1]#b,$YY#b
+
+.Loop16_is_hot:
+	lea	($dat,$XX[0],4),$XX[1]
+___
+sub RC4_loop {
+  my $i=shift;
+  my $j=$i<0?0:$i;
+  my $xmm="%xmm".($j&1);
+
+    $code.="	add	\$16,$XX[0]#b\n"		if ($i==15);
+    $code.="	movdqu	($inp),%xmm2\n"			if ($i==15);
+    $code.="	add	$TX[0]#b,$YY#b\n"		if ($i<=0);
+    $code.="	movl	($dat,$YY,4),$TY#d\n";
+    $code.="	pxor	%xmm0,%xmm2\n"			if ($i==0);
+    $code.="	psllq	\$8,%xmm1\n"			if ($i==0);
+    $code.="	pxor	$xmm,$xmm\n"			if ($i<=1);
+    $code.="	movl	$TX[0]#d,($dat,$YY,4)\n";
+    $code.="	add	$TY#b,$TX[0]#b\n";
+    $code.="	movl	`4*($j+1)`($XX[1]),$TX[1]#d\n"	if ($i<15);
+    $code.="	movz	$TX[0]#b,$TX[0]#d\n";
+    $code.="	movl	$TY#d,4*$j($XX[1])\n";
+    $code.="	pxor	%xmm1,%xmm2\n"			if ($i==0);
+    $code.="	lea	($dat,$XX[0],4),$XX[1]\n"	if ($i==15);
+    $code.="	add	$TX[1]#b,$YY#b\n"		if ($i<15);
+    $code.="	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
+    $code.="	movdqu	%xmm2,($out,$inp)\n"		if ($i==0);
+    $code.="	lea	16($inp),$inp\n"		if ($i==0);
+    $code.="	movl	($XX[1]),$TX[1]#d\n"		if ($i==15);
+}
+	RC4_loop(-1);
+$code.=<<___;
+	jmp	.Loop16_enter
+.align	16
+.Loop16:
+___
+
+for ($i=0;$i<16;$i++) {
+    $code.=".Loop16_enter:\n"		if ($i==1);
+	RC4_loop($i);
+	push(@TX,shift(@TX)); 		# "rotate" registers
+}
+$code.=<<___;
+	mov	$YY,$TX[1]
+	xor	$YY,$YY			# keyword to partial register
+	sub	\$16,$len
+	mov	$TX[1]#b,$YY#b
+	test	\$-16,$len
+	jnz	.Loop16
+
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,($out,$inp)
+	lea	16($inp),$inp
+
 	cmp	\$0,$len
 	jne	.Lloop1
 	jmp	.Lexit
@@ -152,9 +317,8 @@
 	movl	($dat,$TX[0],4),$TY#d
 	movl	($dat,$XX[0],4),$TX[0]#d
 	xorb	($inp),$TY#b
-	inc	$inp
-	movb	$TY#b,($out)
-	inc	$out
+	movb	$TY#b,($out,$inp)
+	lea	1($inp),$inp
 	dec	$len
 	jnz	.Lloop1
 	jmp	.Lexit
@@ -165,13 +329,11 @@
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
-	cmpl	\$0,260($dat)
-	jnz	.Lcloop1
 	jmp	.Lcloop8
 .align	16
 .Lcloop8:
-	mov	($inp),%eax
-	mov	4($inp),%ebx
+	mov	($inp),%r8d
+	mov	4($inp),%r9d
 ___
 # unroll 2x4-wise, because 64-bit rotates kill Intel P4...
 for ($i=0;$i<4;$i++) {
@@ -188,8 +350,8 @@
 	mov	$TX[0],$TX[1]
 .Lcmov$i:
 	add	$TX[0]#b,$TY#b
-	xor	($dat,$TY),%al
-	ror	\$8,%eax
+	xor	($dat,$TY),%r8b
+	ror	\$8,%r8d
 ___
 push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
 }
@@ -207,16 +369,16 @@
 	mov	$TX[0],$TX[1]
 .Lcmov$i:
 	add	$TX[0]#b,$TY#b
-	xor	($dat,$TY),%bl
-	ror	\$8,%ebx
+	xor	($dat,$TY),%r9b
+	ror	\$8,%r9d
 ___
 push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
 }
 $code.=<<___;
 	lea	-8($len),$len
-	mov	%eax,($out)
+	mov	%r8d,($out)
 	lea	8($inp),$inp
-	mov	%ebx,4($out)
+	mov	%r9d,4($out)
 	lea	8($out),$out
 
 	test	\$-8,$len
@@ -229,6 +391,7 @@
 .align	16
 .Lcloop1:
 	add	$TX[0]#b,$YY#b
+	movzb	$YY#b,$YY#d
 	movzb	($dat,$YY),$TY#d
 	movb	$TX[0]#b,($dat,$YY)
 	movb	$TY#b,($dat,$XX[0])
@@ -260,16 +423,16 @@
 	ret
 .size	RC4,.-RC4
 ___
+}
 
 $idx="%r8";
 $ido="%r9";
 
 $code.=<<___;
-.extern	OPENSSL_ia32cap_P
-.globl	RC4_set_key
-.type	RC4_set_key,\@function,3
+.globl	private_RC4_set_key
+.type	private_RC4_set_key,\@function,3
 .align	16
-RC4_set_key:
+private_RC4_set_key:
 	lea	8($dat),$dat
 	lea	($inp,$len),$inp
 	neg	$len
@@ -280,12 +443,9 @@
 	xor	%r11,%r11
 
 	mov	OPENSSL_ia32cap_P(%rip),$idx#d
-	bt	\$20,$idx#d
-	jnc	.Lw1stloop
-	bt	\$30,$idx#d
-	setc	$ido#b
-	mov	$ido#d,260($dat)
-	jmp	.Lc1stloop
+	bt	\$20,$idx#d	# RC4_CHAR?
+	jc	.Lc1stloop
+	jmp	.Lw1stloop
 
 .align	16
 .Lw1stloop:
@@ -339,7 +499,7 @@
 	mov	%eax,-8($dat)
 	mov	%eax,-4($dat)
 	ret
-.size	RC4_set_key,.-RC4_set_key
+.size	private_RC4_set_key,.-private_RC4_set_key
 
 .globl	RC4_options
 .type	RC4_options,\@abi-omnipotent
@@ -348,18 +508,20 @@
 	lea	.Lopts(%rip),%rax
 	mov	OPENSSL_ia32cap_P(%rip),%edx
 	bt	\$20,%edx
-	jnc	.Ldone
-	add	\$12,%rax
+	jc	.L8xchar
 	bt	\$30,%edx
 	jnc	.Ldone
-	add	\$13,%rax
+	add	\$25,%rax
+	ret
+.L8xchar:
+	add	\$12,%rax
 .Ldone:
 	ret
 .align	64
 .Lopts:
 .asciz	"rc4(8x,int)"
 .asciz	"rc4(8x,char)"
-.asciz	"rc4(1x,char)"
+.asciz	"rc4(16x,int)"
 .asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align	64
 .size	RC4_options,.-RC4_options
@@ -482,22 +644,32 @@
 	.rva	.LSEH_end_RC4
 	.rva	.LSEH_info_RC4
 
-	.rva	.LSEH_begin_RC4_set_key
-	.rva	.LSEH_end_RC4_set_key
-	.rva	.LSEH_info_RC4_set_key
+	.rva	.LSEH_begin_private_RC4_set_key
+	.rva	.LSEH_end_private_RC4_set_key
+	.rva	.LSEH_info_private_RC4_set_key
 
 .section	.xdata
 .align	8
 .LSEH_info_RC4:
 	.byte	9,0,0,0
 	.rva	stream_se_handler
-.LSEH_info_RC4_set_key:
+.LSEH_info_private_RC4_set_key:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 ___
 }
 
-$code =~ s/#([bwd])/$1/gm;
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)	{ $reg .= $conv; }
+    elsif ($conv eq "b")	{ $reg =~ s/%[er]([^x]+)x?/%$1l/;	}
+    elsif ($conv eq "w")	{ $reg =~ s/%[er](.+)/%$1/;		}
+    elsif ($conv eq "d")	{ $reg =~ s/%[er](.+)/%e$1/;		}
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
 
 print $code;
 

diff --git a/crypto/rc4/rc4.h b/crypto/rc4/rc4.h
index 29d1acc..88ceb46 100644
--- a/crypto/rc4/rc4.h
+++ b/crypto/rc4/rc4.h

@@ -79,6 +79,7 @@
  
 const char *RC4_options(void);
 void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
 void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
 		unsigned char *outdata);
 

diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c
index b22c40b..fda2763 100644
--- a/crypto/rc4/rc4_skey.c
+++ b/crypto/rc4/rc4_skey.c

@@ -85,7 +85,7 @@
  * Date: Wed, 14 Sep 1994 06:35:31 GMT
  */
 
-void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
 	{
         register RC4_INT tmp;
         register int id1,id2;
@@ -104,40 +104,6 @@
 		d[(n)]=d[id2]; \
 		d[id2]=tmp; }
 
-#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
-# if	defined(__i386)   || defined(__i386__)   || defined(_M_IX86) || \
-	defined(__INTEL__) || \
-	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
-	if (sizeof(RC4_INT) > 1) {
-		/*
-		 * Unlike all other x86 [and x86_64] implementations,
-		 * Intel P4 core [including EM64T] was found to perform
-		 * poorly with wider RC4_INT. Performance improvement
-		 * for IA-32 hand-coded assembler turned out to be 2.8x
-		 * if re-coded for RC4_CHAR! It's however inappropriate
-		 * to just switch to RC4_CHAR for x86[_64], as non-P4
-		 * implementations suffer from significant performance
-		 * losses then, e.g. PIII exhibits >2x deterioration,
-		 * and so does Opteron. In order to assure optimal
-		 * all-round performance, let us [try to] detect P4 at
-		 * run-time by checking upon HTT bit in CPU capability
-		 * vector and set up compressed key schedule, which is
-		 * recognized by correspondingly updated assembler
-		 * module...
-		 *				<[email protected]>
-		 */
-		if (OPENSSL_ia32cap_P & (1<<28)) {
-			unsigned char *cp=(unsigned char *)d;
-
-			for (i=0;i<256;i++) cp[i]=i;
-			for (i=0;i<256;i++) SK_LOOP(cp,i);
-			/* mark schedule as compressed! */
-			d[256/sizeof(RC4_INT)]=-1;
-			return;
-		}
-	}
-# endif
-#endif
 	for (i=0; i < 256; i++) d[i]=i;
 	for (i=0; i < 256; i+=4)
 		{

diff --git a/crypto/rc4/rc4_utl.c b/crypto/rc4/rc4_utl.c
new file mode 100644
index 0000000..ab3f02f
--- /dev/null
+++ b/crypto/rc4/rc4_utl.c

@@ -0,0 +1,62 @@
+/* crypto/rc4/rc4_utl.c -*- mode:C; c-file-style: "eay" -*- */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include <openssl/rc4.h>
+
+void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
+	{
+#ifdef OPENSSL_FIPS
+	fips_cipher_abort(RC4);
+#endif
+	private_RC4_set_key(key, len, data);
+	}

diff --git a/crypto/rc4/rc4test.c b/crypto/rc4/rc4test.c
index 633a79e..4312605 100644
--- a/crypto/rc4/rc4test.c
+++ b/crypto/rc4/rc4test.c

@@ -120,6 +120,12 @@
 	RC4_KEY key;
 	unsigned char obuf[512];
 
+#if !defined(OPENSSL_PIC)
+	void OPENSSL_cpuid_setup(void);
+
+	OPENSSL_cpuid_setup();
+#endif
+
 	for (i=0; i<6; i++)
 		{
 		RC4_set_key(&key,keys[i][0],&(keys[i][1]));

diff --git a/crypto/ripemd/ripemd.h b/crypto/ripemd/ripemd.h
index 5942eb6..189bd8c 100644
--- a/crypto/ripemd/ripemd.h
+++ b/crypto/ripemd/ripemd.h

@@ -91,6 +91,9 @@
 	unsigned int   num;
 	} RIPEMD160_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_RIPEMD160_Init(RIPEMD160_CTX *c);
+#endif
 int RIPEMD160_Init(RIPEMD160_CTX *c);
 int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len);
 int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c);

diff --git a/crypto/ripemd/rmd_dgst.c b/crypto/ripemd/rmd_dgst.c
index 59b017f..63f0d98 100644
--- a/crypto/ripemd/rmd_dgst.c
+++ b/crypto/ripemd/rmd_dgst.c

@@ -59,6 +59,7 @@
 #include <stdio.h>
 #include "rmd_locl.h"
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
 
 const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT;
 
@@ -69,7 +70,7 @@
      void ripemd160_block(RIPEMD160_CTX *c, unsigned long *p,size_t num);
 #  endif
 
-int RIPEMD160_Init(RIPEMD160_CTX *c)
+fips_md_init(RIPEMD160)
 	{
 	memset (c,0,sizeof(*c));
 	c->A=RIPEMD160_A;

diff --git a/crypto/rsa/rsa.h b/crypto/rsa/rsa.h
index cf74343..4814a2f 100644
--- a/crypto/rsa/rsa.h
+++ b/crypto/rsa/rsa.h

@@ -222,12 +222,22 @@
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_RSA_PADDING, \
 				pad, NULL)
 
+#define EVP_PKEY_CTX_get_rsa_padding(ctx, ppad) \
+	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, \
+				EVP_PKEY_CTRL_GET_RSA_PADDING, 0, ppad)
+
 #define EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, len) \
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
 				(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
 				EVP_PKEY_CTRL_RSA_PSS_SALTLEN, \
 				len, NULL)
 
+#define EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx, plen) \
+	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
+				(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
+				EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, \
+				0, plen)
+
 #define EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, bits) \
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
 				EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, NULL)
@@ -236,11 +246,24 @@
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
 				EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, pubexp)
 
+#define	 EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md)	\
+		EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG,  \
+				EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md)
+
+#define	 EVP_PKEY_CTX_get_rsa_mgf1_md(ctx, pmd)	\
+		EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG,  \
+				EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)pmd)
+
 #define EVP_PKEY_CTRL_RSA_PADDING	(EVP_PKEY_ALG_CTRL + 1)
 #define EVP_PKEY_CTRL_RSA_PSS_SALTLEN	(EVP_PKEY_ALG_CTRL + 2)
 
 #define EVP_PKEY_CTRL_RSA_KEYGEN_BITS	(EVP_PKEY_ALG_CTRL + 3)
 #define EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP	(EVP_PKEY_ALG_CTRL + 4)
+#define EVP_PKEY_CTRL_RSA_MGF1_MD	(EVP_PKEY_ALG_CTRL + 5)
+
+#define EVP_PKEY_CTRL_GET_RSA_PADDING		(EVP_PKEY_ALG_CTRL + 6)
+#define EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN	(EVP_PKEY_ALG_CTRL + 7)
+#define EVP_PKEY_CTRL_GET_RSA_MGF1_MD		(EVP_PKEY_ALG_CTRL + 8)
 
 #define RSA_PKCS1_PADDING	1
 #define RSA_SSLV23_PADDING	2
@@ -300,6 +323,16 @@
 DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPublicKey)
 DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPrivateKey)
 
+typedef struct rsa_pss_params_st
+	{
+	X509_ALGOR *hashAlgorithm;
+	X509_ALGOR *maskGenAlgorithm;
+	ASN1_INTEGER *saltLength;
+	ASN1_INTEGER *trailerField;
+	} RSA_PSS_PARAMS;
+
+DECLARE_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
 #ifndef OPENSSL_NO_FP_API
 int	RSA_print_fp(FILE *fp, const RSA *r,int offset);
 #endif
@@ -380,6 +413,14 @@
 			const unsigned char *mHash,
 			const EVP_MD *Hash, int sLen);
 
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, 
+			const unsigned char *EM, int sLen);
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+			const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen);
+
 int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new *new_func,
 	CRYPTO_EX_dup *dup_func, CRYPTO_EX_free *free_func);
 int RSA_set_ex_data(RSA *r,int idx,void *arg);
@@ -388,6 +429,25 @@
 RSA *RSAPublicKey_dup(RSA *rsa);
 RSA *RSAPrivateKey_dup(RSA *rsa);
 
+/* If this flag is set the RSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define RSA_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define RSA_FLAG_NON_FIPS_ALLOW			0x0400
+/* Application has decided PRNG is good enough to generate a key: don't
+ * check.
+ */
+#define RSA_FLAG_CHECKED			0x0800
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -405,6 +465,7 @@
 #define RSA_F_PKEY_RSA_CTRL				 143
 #define RSA_F_PKEY_RSA_CTRL_STR				 144
 #define RSA_F_PKEY_RSA_SIGN				 142
+#define RSA_F_PKEY_RSA_VERIFY				 154
 #define RSA_F_PKEY_RSA_VERIFYRECOVER			 141
 #define RSA_F_RSA_BUILTIN_KEYGEN			 129
 #define RSA_F_RSA_CHECK_KEY				 123
@@ -413,6 +474,8 @@
 #define RSA_F_RSA_EAY_PUBLIC_DECRYPT			 103
 #define RSA_F_RSA_EAY_PUBLIC_ENCRYPT			 104
 #define RSA_F_RSA_GENERATE_KEY				 105
+#define RSA_F_RSA_GENERATE_KEY_EX			 155
+#define RSA_F_RSA_ITEM_VERIFY				 156
 #define RSA_F_RSA_MEMORY_LOCK				 130
 #define RSA_F_RSA_NEW_METHOD				 106
 #define RSA_F_RSA_NULL					 124
@@ -424,6 +487,7 @@
 #define RSA_F_RSA_PADDING_ADD_NONE			 107
 #define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP		 121
 #define RSA_F_RSA_PADDING_ADD_PKCS1_PSS			 125
+#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1		 148
 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1		 108
 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2		 109
 #define RSA_F_RSA_PADDING_ADD_SSLV23			 110
@@ -436,8 +500,12 @@
 #define RSA_F_RSA_PADDING_CHECK_X931			 128
 #define RSA_F_RSA_PRINT					 115
 #define RSA_F_RSA_PRINT_FP				 116
+#define RSA_F_RSA_PRIVATE_DECRYPT			 150
+#define RSA_F_RSA_PRIVATE_ENCRYPT			 151
 #define RSA_F_RSA_PRIV_DECODE				 137
 #define RSA_F_RSA_PRIV_ENCODE				 138
+#define RSA_F_RSA_PUBLIC_DECRYPT			 152
+#define RSA_F_RSA_PUBLIC_ENCRYPT			 153
 #define RSA_F_RSA_PUB_DECODE				 139
 #define RSA_F_RSA_SETUP_BLINDING			 136
 #define RSA_F_RSA_SIGN					 117
@@ -445,6 +513,7 @@
 #define RSA_F_RSA_VERIFY				 119
 #define RSA_F_RSA_VERIFY_ASN1_OCTET_STRING		 120
 #define RSA_F_RSA_VERIFY_PKCS1_PSS			 126
+#define RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1			 149
 
 /* Reason codes. */
 #define RSA_R_ALGORITHM_MISMATCH			 100
@@ -470,19 +539,24 @@
 #define RSA_R_INVALID_HEADER				 137
 #define RSA_R_INVALID_KEYBITS				 145
 #define RSA_R_INVALID_MESSAGE_LENGTH			 131
+#define RSA_R_INVALID_MGF1_MD				 156
 #define RSA_R_INVALID_PADDING				 138
 #define RSA_R_INVALID_PADDING_MODE			 141
+#define RSA_R_INVALID_PSS_PARAMETERS			 149
 #define RSA_R_INVALID_PSS_SALTLEN			 146
+#define RSA_R_INVALID_SALT_LENGTH			 150
 #define RSA_R_INVALID_TRAILER				 139
 #define RSA_R_INVALID_X931_DIGEST			 142
 #define RSA_R_IQMP_NOT_INVERSE_OF_Q			 126
 #define RSA_R_KEY_SIZE_TOO_SMALL			 120
 #define RSA_R_LAST_OCTET_INVALID			 134
 #define RSA_R_MODULUS_TOO_LARGE				 105
+#define RSA_R_NON_FIPS_RSA_METHOD			 157
 #define RSA_R_NO_PUBLIC_EXPONENT			 140
 #define RSA_R_NULL_BEFORE_BLOCK_MISSING			 113
 #define RSA_R_N_DOES_NOT_EQUAL_P_Q			 127
 #define RSA_R_OAEP_DECODING_ERROR			 121
+#define RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE	 158
 #define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE	 148
 #define RSA_R_PADDING_CHECK_FAILED			 114
 #define RSA_R_P_NOT_PRIME				 128
@@ -493,7 +567,12 @@
 #define RSA_R_SSLV3_ROLLBACK_ATTACK			 115
 #define RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD 116
 #define RSA_R_UNKNOWN_ALGORITHM_TYPE			 117
+#define RSA_R_UNKNOWN_MASK_DIGEST			 151
 #define RSA_R_UNKNOWN_PADDING_TYPE			 118
+#define RSA_R_UNKNOWN_PSS_DIGEST			 152
+#define RSA_R_UNSUPPORTED_MASK_ALGORITHM		 153
+#define RSA_R_UNSUPPORTED_MASK_PARAMETER		 154
+#define RSA_R_UNSUPPORTED_SIGNATURE_TYPE		 155
 #define RSA_R_VALUE_MISSING				 147
 #define RSA_R_WRONG_SIGNATURE_LENGTH			 119
 

diff --git a/crypto/rsa/rsa_ameth.c b/crypto/rsa/rsa_ameth.c
index 8c32098..2460910 100644
--- a/crypto/rsa/rsa_ameth.c
+++ b/crypto/rsa/rsa_ameth.c

@@ -265,6 +265,147 @@
 	return do_rsa_print(bp, pkey->pkey.rsa, indent, 1);
 	}
 
+static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg,
+					X509_ALGOR **pmaskHash)
+	{
+	const unsigned char *p;
+	int plen;
+	RSA_PSS_PARAMS *pss;
+
+	*pmaskHash = NULL;
+
+	if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE)
+		return NULL;
+	p = alg->parameter->value.sequence->data;
+	plen = alg->parameter->value.sequence->length;
+	pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen);
+
+	if (!pss)
+		return NULL;
+	
+	if (pss->maskGenAlgorithm)
+		{
+		ASN1_TYPE *param = pss->maskGenAlgorithm->parameter;
+		if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1
+			&& param->type == V_ASN1_SEQUENCE)
+			{
+			p = param->value.sequence->data;
+			plen = param->value.sequence->length;
+			*pmaskHash = d2i_X509_ALGOR(NULL, &p, plen);
+			}
+		}
+
+	return pss;
+	}
+
+static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss, 
+				X509_ALGOR *maskHash, int indent)
+	{
+	int rv = 0;
+	if (!pss)
+		{
+		if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0)
+			return 0;
+		return 1;
+		}
+	if (BIO_puts(bp, "\n") <= 0)
+		goto err;
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+	if (BIO_puts(bp, "Hash Algorithm: ") <= 0)
+		goto err;
+
+	if (pss->hashAlgorithm)
+		{
+		if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "sha1 (default)") <= 0)
+		goto err;
+
+	if (BIO_puts(bp, "\n") <= 0)
+		goto err;
+
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+
+	if (BIO_puts(bp, "Mask Algorithm: ") <= 0)
+			goto err;
+	if (pss->maskGenAlgorithm)
+		{
+		if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0)
+			goto err;
+		if (BIO_puts(bp, " with ") <= 0)
+			goto err;
+		if (maskHash)
+			{
+			if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0)
+			goto err;
+			}
+		else if (BIO_puts(bp, "INVALID") <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0)
+		goto err;
+	BIO_puts(bp, "\n");
+
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+	if (BIO_puts(bp, "Salt Length: ") <= 0)
+			goto err;
+	if (pss->saltLength)
+		{
+		if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "20 (default)") <= 0)
+		goto err;
+	BIO_puts(bp, "\n");
+
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+	if (BIO_puts(bp, "Trailer Field: ") <= 0)
+			goto err;
+	if (pss->trailerField)
+		{
+		if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "0xbc (default)") <= 0)
+		goto err;
+	BIO_puts(bp, "\n");
+	
+	rv = 1;
+
+	err:
+	return rv;
+
+	}
+
+static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+					const ASN1_STRING *sig,
+					int indent, ASN1_PCTX *pctx)
+	{
+	if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss)
+		{
+		int rv;
+		RSA_PSS_PARAMS *pss;
+		X509_ALGOR *maskHash;
+		pss = rsa_pss_decode(sigalg, &maskHash);
+		rv = rsa_pss_param_print(bp, pss, maskHash, indent);
+		if (pss)
+			RSA_PSS_PARAMS_free(pss);
+		if (maskHash)
+			X509_ALGOR_free(maskHash);
+		if (!rv)
+			return 0;
+		}
+	else if (!sig && BIO_puts(bp, "\n") <= 0)
+		return 0;
+	if (sig)
+		return X509_signature_dump(bp, sig, indent);
+	return 1;
+	}
 
 static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
 	{
@@ -310,6 +451,211 @@
 
 	}
 
+/* Customised RSA item verification routine. This is called 
+ * when a signature is encountered requiring special handling. We 
+ * currently only handle PSS.
+ */
+
+
+static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+			X509_ALGOR *sigalg, ASN1_BIT_STRING *sig,
+			EVP_PKEY *pkey)
+	{
+	int rv = -1;
+	int saltlen;
+	const EVP_MD *mgf1md = NULL, *md = NULL;
+	RSA_PSS_PARAMS *pss;
+	X509_ALGOR *maskHash;
+	EVP_PKEY_CTX *pkctx;
+	/* Sanity check: make sure it is PSS */
+	if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss)
+		{
+		RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE);
+		return -1;
+		}
+	/* Decode PSS parameters */
+	pss = rsa_pss_decode(sigalg, &maskHash);
+
+	if (pss == NULL)
+		{
+		RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS);
+		goto err;
+		}
+	/* Check mask and lookup mask hash algorithm */
+	if (pss->maskGenAlgorithm)
+		{
+		if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM);
+			goto err;
+			}
+		if (!maskHash)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER);
+			goto err;
+			}
+		mgf1md = EVP_get_digestbyobj(maskHash->algorithm);
+		if (mgf1md == NULL)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST);
+			goto err;
+			}
+		}
+	else
+		mgf1md = EVP_sha1();
+
+	if (pss->hashAlgorithm)
+		{
+		md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm);
+		if (md == NULL)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST);
+			goto err;
+			}
+		}
+	else
+		md = EVP_sha1();
+
+	if (pss->saltLength)
+		{
+		saltlen = ASN1_INTEGER_get(pss->saltLength);
+
+		/* Could perform more salt length sanity checks but the main
+		 * RSA routines will trap other invalid values anyway.
+		 */
+		if (saltlen < 0)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH);
+			goto err;
+			}
+		}
+	else
+		saltlen = 20;
+
+	/* low-level routines support only trailer field 0xbc (value 1)
+	 * and PKCS#1 says we should reject any other value anyway.
+	 */
+	if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1)
+		{
+		RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER);
+		goto err;
+		}
+
+	/* We have all parameters now set up context */
+
+	if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey))
+		goto err;
+
+	if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0)
+		goto err;
+
+	if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0)
+		goto err;
+
+	if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0)
+		goto err;
+	/* Carry on */
+	rv = 2;
+
+	err:
+	RSA_PSS_PARAMS_free(pss);
+	if (maskHash)
+		X509_ALGOR_free(maskHash);
+	return rv;
+	}
+
+static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+				X509_ALGOR *alg1, X509_ALGOR *alg2, 
+				ASN1_BIT_STRING *sig)
+	{
+	int pad_mode;
+	EVP_PKEY_CTX *pkctx = ctx->pctx;
+	if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0)
+		return 0;
+	if (pad_mode == RSA_PKCS1_PADDING)
+		return 2;
+	if (pad_mode == RSA_PKCS1_PSS_PADDING)
+		{
+		const EVP_MD *sigmd, *mgf1md;
+		RSA_PSS_PARAMS *pss = NULL;
+		X509_ALGOR *mgf1alg = NULL;
+		ASN1_STRING *os1 = NULL, *os2 = NULL;
+		EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx);
+		int saltlen, rv = 0;
+		sigmd = EVP_MD_CTX_md(ctx);
+		if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0)
+			goto err;
+		if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen))
+			goto err;
+		if (saltlen == -1)
+			saltlen = EVP_MD_size(sigmd);
+		else if (saltlen == -2)
+			{
+			saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2;
+			if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0)
+				saltlen--;
+			}
+		pss = RSA_PSS_PARAMS_new();
+		if (!pss)
+			goto err;
+		if (saltlen != 20)
+			{
+			pss->saltLength = ASN1_INTEGER_new();
+			if (!pss->saltLength)
+				goto err;
+			if (!ASN1_INTEGER_set(pss->saltLength, saltlen))
+				goto err;
+			}
+		if (EVP_MD_type(sigmd) != NID_sha1)
+			{
+			pss->hashAlgorithm = X509_ALGOR_new();
+			if (!pss->hashAlgorithm)
+				goto err;
+			X509_ALGOR_set_md(pss->hashAlgorithm, sigmd);
+			}
+		if (EVP_MD_type(mgf1md) != NID_sha1)
+			{
+			ASN1_STRING *stmp = NULL;
+			/* need to embed algorithm ID inside another */
+			mgf1alg = X509_ALGOR_new();
+			X509_ALGOR_set_md(mgf1alg, mgf1md);
+			if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR),
+									&stmp))
+					goto err;
+			pss->maskGenAlgorithm = X509_ALGOR_new();
+			if (!pss->maskGenAlgorithm)
+				goto err;
+			X509_ALGOR_set0(pss->maskGenAlgorithm,
+					OBJ_nid2obj(NID_mgf1),
+					V_ASN1_SEQUENCE, stmp);
+			}
+		/* Finally create string with pss parameter encoding. */
+		if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1))
+			goto err;
+		if (alg2)
+			{
+			os2 = ASN1_STRING_dup(os1);
+			if (!os2)
+				goto err;
+			X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss),
+						V_ASN1_SEQUENCE, os2);
+			}
+		X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss),
+					V_ASN1_SEQUENCE, os1);
+		os1 = os2 = NULL;
+		rv = 3;
+		err:
+		if (mgf1alg)
+			X509_ALGOR_free(mgf1alg);
+		if (pss)
+			RSA_PSS_PARAMS_free(pss);
+		if (os1)
+			ASN1_STRING_free(os1);
+		return rv;
+		
+		}
+	return 2;
+	}
 
 const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = 
 	{
@@ -335,10 +681,13 @@
 
 		0,0,0,0,0,0,
 
+		rsa_sig_print,
 		int_rsa_free,
 		rsa_pkey_ctrl,
 		old_rsa_priv_decode,
-		old_rsa_priv_encode
+		old_rsa_priv_encode,
+		rsa_item_verify,
+		rsa_item_sign
 		},
 
 		{

diff --git a/crypto/rsa/rsa_asn1.c b/crypto/rsa/rsa_asn1.c
index 4efca8c..6ed5de3 100644
--- a/crypto/rsa/rsa_asn1.c
+++ b/crypto/rsa/rsa_asn1.c

@@ -60,6 +60,7 @@
 #include "cryptlib.h"
 #include <openssl/bn.h>
 #include <openssl/rsa.h>
+#include <openssl/x509.h>
 #include <openssl/asn1t.h>
 
 /* Override the default free and new methods */
@@ -96,6 +97,15 @@
 	ASN1_SIMPLE(RSA, e, BIGNUM),
 } ASN1_SEQUENCE_END_cb(RSA, RSAPublicKey)
 
+ASN1_SEQUENCE(RSA_PSS_PARAMS) = {
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, hashAlgorithm, X509_ALGOR,0),
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, maskGenAlgorithm, X509_ALGOR,1),
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, saltLength, ASN1_INTEGER,2),
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, trailerField, ASN1_INTEGER,3)
+} ASN1_SEQUENCE_END(RSA_PSS_PARAMS)
+
+IMPLEMENT_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
 IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPrivateKey, RSAPrivateKey)
 
 IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPublicKey, RSAPublicKey)

diff --git a/crypto/rsa/rsa_crpt.c b/crypto/rsa/rsa_crpt.c
new file mode 100644
index 0000000..d3e4478
--- /dev/null
+++ b/crypto/rsa/rsa_crpt.c

@@ -0,0 +1,257 @@
+/* crypto/rsa/rsa_lib.c */
+/* Copyright (C) 1995-1998 Eric Young ([email protected])
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young ([email protected]).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson ([email protected]).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young ([email protected])"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson ([email protected])"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include <openssl/crypto.h>
+#include "cryptlib.h"
+#include <openssl/lhash.h>
+#include <openssl/bn.h>
+#include <openssl/rsa.h>
+#include <openssl/rand.h>
+#ifndef OPENSSL_NO_ENGINE
+#include <openssl/engine.h>
+#endif
+
+int RSA_size(const RSA *r)
+	{
+	return(BN_num_bytes(r->n));
+	}
+
+int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
+	}
+
+int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
+	}
+
+int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
+	}
+
+int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
+	}
+
+int RSA_flags(const RSA *r)
+	{
+	return((r == NULL)?0:r->meth->flags);
+	}
+
+void RSA_blinding_off(RSA *rsa)
+	{
+	if (rsa->blinding != NULL)
+		{
+		BN_BLINDING_free(rsa->blinding);
+		rsa->blinding=NULL;
+		}
+	rsa->flags &= ~RSA_FLAG_BLINDING;
+	rsa->flags |= RSA_FLAG_NO_BLINDING;
+	}
+
+int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
+	{
+	int ret=0;
+
+	if (rsa->blinding != NULL)
+		RSA_blinding_off(rsa);
+
+	rsa->blinding = RSA_setup_blinding(rsa, ctx);
+	if (rsa->blinding == NULL)
+		goto err;
+
+	rsa->flags |= RSA_FLAG_BLINDING;
+	rsa->flags &= ~RSA_FLAG_NO_BLINDING;
+	ret=1;
+err:
+	return(ret);
+	}
+
+static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
+	const BIGNUM *q, BN_CTX *ctx)
+{
+	BIGNUM *ret = NULL, *r0, *r1, *r2;
+
+	if (d == NULL || p == NULL || q == NULL)
+		return NULL;
+
+	BN_CTX_start(ctx);
+	r0 = BN_CTX_get(ctx);
+	r1 = BN_CTX_get(ctx);
+	r2 = BN_CTX_get(ctx);
+	if (r2 == NULL)
+		goto err;
+
+	if (!BN_sub(r1, p, BN_value_one())) goto err;
+	if (!BN_sub(r2, q, BN_value_one())) goto err;
+	if (!BN_mul(r0, r1, r2, ctx)) goto err;
+
+	ret = BN_mod_inverse(NULL, d, r0, ctx);
+err:
+	BN_CTX_end(ctx);
+	return ret;
+}
+
+BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
+{
+	BIGNUM local_n;
+	BIGNUM *e,*n;
+	BN_CTX *ctx;
+	BN_BLINDING *ret = NULL;
+
+	if (in_ctx == NULL)
+		{
+		if ((ctx = BN_CTX_new()) == NULL) return 0;
+		}
+	else
+		ctx = in_ctx;
+
+	BN_CTX_start(ctx);
+	e  = BN_CTX_get(ctx);
+	if (e == NULL)
+		{
+		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
+		goto err;
+		}
+
+	if (rsa->e == NULL)
+		{
+		e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
+		if (e == NULL)
+			{
+			RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
+			goto err;
+			}
+		}
+	else
+		e = rsa->e;
+
+	
+	if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
+		{
+		/* if PRNG is not properly seeded, resort to secret
+		 * exponent as unpredictable seed */
+		RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
+		}
+
+	if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
+		{
+		/* Set BN_FLG_CONSTTIME flag */
+		n = &local_n;
+		BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
+		}
+	else
+		n = rsa->n;
+
+	ret = BN_BLINDING_create_param(NULL, e, n, ctx,
+			rsa->meth->bn_mod_exp, rsa->_method_mod_n);
+	if (ret == NULL)
+		{
+		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
+		goto err;
+		}
+	CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
+err:
+	BN_CTX_end(ctx);
+	if (in_ctx == NULL)
+		BN_CTX_free(ctx);
+	if(rsa->e == NULL)
+		BN_free(e);
+
+	return ret;
+}

diff --git a/crypto/rsa/rsa_err.c b/crypto/rsa/rsa_err.c
index cf9f110..46e0bf9 100644
--- a/crypto/rsa/rsa_err.c
+++ b/crypto/rsa/rsa_err.c

@@ -1,6 +1,6 @@
 /* crypto/rsa/rsa_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -78,6 +78,7 @@
 {ERR_FUNC(RSA_F_PKEY_RSA_CTRL),	"PKEY_RSA_CTRL"},
 {ERR_FUNC(RSA_F_PKEY_RSA_CTRL_STR),	"PKEY_RSA_CTRL_STR"},
 {ERR_FUNC(RSA_F_PKEY_RSA_SIGN),	"PKEY_RSA_SIGN"},
+{ERR_FUNC(RSA_F_PKEY_RSA_VERIFY),	"PKEY_RSA_VERIFY"},
 {ERR_FUNC(RSA_F_PKEY_RSA_VERIFYRECOVER),	"PKEY_RSA_VERIFYRECOVER"},
 {ERR_FUNC(RSA_F_RSA_BUILTIN_KEYGEN),	"RSA_BUILTIN_KEYGEN"},
 {ERR_FUNC(RSA_F_RSA_CHECK_KEY),	"RSA_check_key"},
@@ -86,6 +87,8 @@
 {ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_DECRYPT),	"RSA_EAY_PUBLIC_DECRYPT"},
 {ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_ENCRYPT),	"RSA_EAY_PUBLIC_ENCRYPT"},
 {ERR_FUNC(RSA_F_RSA_GENERATE_KEY),	"RSA_generate_key"},
+{ERR_FUNC(RSA_F_RSA_GENERATE_KEY_EX),	"RSA_generate_key_ex"},
+{ERR_FUNC(RSA_F_RSA_ITEM_VERIFY),	"RSA_ITEM_VERIFY"},
 {ERR_FUNC(RSA_F_RSA_MEMORY_LOCK),	"RSA_memory_lock"},
 {ERR_FUNC(RSA_F_RSA_NEW_METHOD),	"RSA_new_method"},
 {ERR_FUNC(RSA_F_RSA_NULL),	"RSA_NULL"},
@@ -97,6 +100,7 @@
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_NONE),	"RSA_padding_add_none"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP),	"RSA_padding_add_PKCS1_OAEP"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS),	"RSA_padding_add_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1),	"RSA_padding_add_PKCS1_PSS_mgf1"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1),	"RSA_padding_add_PKCS1_type_1"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2),	"RSA_padding_add_PKCS1_type_2"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_SSLV23),	"RSA_padding_add_SSLv23"},
@@ -109,8 +113,12 @@
 {ERR_FUNC(RSA_F_RSA_PADDING_CHECK_X931),	"RSA_padding_check_X931"},
 {ERR_FUNC(RSA_F_RSA_PRINT),	"RSA_print"},
 {ERR_FUNC(RSA_F_RSA_PRINT_FP),	"RSA_print_fp"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_DECRYPT),	"RSA_private_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_ENCRYPT),	"RSA_private_encrypt"},
 {ERR_FUNC(RSA_F_RSA_PRIV_DECODE),	"RSA_PRIV_DECODE"},
 {ERR_FUNC(RSA_F_RSA_PRIV_ENCODE),	"RSA_PRIV_ENCODE"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_DECRYPT),	"RSA_public_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_ENCRYPT),	"RSA_public_encrypt"},
 {ERR_FUNC(RSA_F_RSA_PUB_DECODE),	"RSA_PUB_DECODE"},
 {ERR_FUNC(RSA_F_RSA_SETUP_BLINDING),	"RSA_setup_blinding"},
 {ERR_FUNC(RSA_F_RSA_SIGN),	"RSA_sign"},
@@ -118,6 +126,7 @@
 {ERR_FUNC(RSA_F_RSA_VERIFY),	"RSA_verify"},
 {ERR_FUNC(RSA_F_RSA_VERIFY_ASN1_OCTET_STRING),	"RSA_verify_ASN1_OCTET_STRING"},
 {ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS),	"RSA_verify_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1),	"RSA_verify_PKCS1_PSS_mgf1"},
 {0,NULL}
 	};
 
@@ -146,19 +155,24 @@
 {ERR_REASON(RSA_R_INVALID_HEADER)        ,"invalid header"},
 {ERR_REASON(RSA_R_INVALID_KEYBITS)       ,"invalid keybits"},
 {ERR_REASON(RSA_R_INVALID_MESSAGE_LENGTH),"invalid message length"},
+{ERR_REASON(RSA_R_INVALID_MGF1_MD)       ,"invalid mgf1 md"},
 {ERR_REASON(RSA_R_INVALID_PADDING)       ,"invalid padding"},
 {ERR_REASON(RSA_R_INVALID_PADDING_MODE)  ,"invalid padding mode"},
+{ERR_REASON(RSA_R_INVALID_PSS_PARAMETERS),"invalid pss parameters"},
 {ERR_REASON(RSA_R_INVALID_PSS_SALTLEN)   ,"invalid pss saltlen"},
+{ERR_REASON(RSA_R_INVALID_SALT_LENGTH)   ,"invalid salt length"},
 {ERR_REASON(RSA_R_INVALID_TRAILER)       ,"invalid trailer"},
 {ERR_REASON(RSA_R_INVALID_X931_DIGEST)   ,"invalid x931 digest"},
 {ERR_REASON(RSA_R_IQMP_NOT_INVERSE_OF_Q) ,"iqmp not inverse of q"},
 {ERR_REASON(RSA_R_KEY_SIZE_TOO_SMALL)    ,"key size too small"},
 {ERR_REASON(RSA_R_LAST_OCTET_INVALID)    ,"last octet invalid"},
 {ERR_REASON(RSA_R_MODULUS_TOO_LARGE)     ,"modulus too large"},
+{ERR_REASON(RSA_R_NON_FIPS_RSA_METHOD)   ,"non fips rsa method"},
 {ERR_REASON(RSA_R_NO_PUBLIC_EXPONENT)    ,"no public exponent"},
 {ERR_REASON(RSA_R_NULL_BEFORE_BLOCK_MISSING),"null before block missing"},
 {ERR_REASON(RSA_R_N_DOES_NOT_EQUAL_P_Q)  ,"n does not equal p q"},
 {ERR_REASON(RSA_R_OAEP_DECODING_ERROR)   ,"oaep decoding error"},
+{ERR_REASON(RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE),"operation not allowed in fips mode"},
 {ERR_REASON(RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE),"operation not supported for this keytype"},
 {ERR_REASON(RSA_R_PADDING_CHECK_FAILED)  ,"padding check failed"},
 {ERR_REASON(RSA_R_P_NOT_PRIME)           ,"p not prime"},
@@ -169,7 +183,12 @@
 {ERR_REASON(RSA_R_SSLV3_ROLLBACK_ATTACK) ,"sslv3 rollback attack"},
 {ERR_REASON(RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD),"the asn1 object identifier is not known for this md"},
 {ERR_REASON(RSA_R_UNKNOWN_ALGORITHM_TYPE),"unknown algorithm type"},
+{ERR_REASON(RSA_R_UNKNOWN_MASK_DIGEST)   ,"unknown mask digest"},
 {ERR_REASON(RSA_R_UNKNOWN_PADDING_TYPE)  ,"unknown padding type"},
+{ERR_REASON(RSA_R_UNKNOWN_PSS_DIGEST)    ,"unknown pss digest"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_ALGORITHM),"unsupported mask algorithm"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_PARAMETER),"unsupported mask parameter"},
+{ERR_REASON(RSA_R_UNSUPPORTED_SIGNATURE_TYPE),"unsupported signature type"},
 {ERR_REASON(RSA_R_VALUE_MISSING)         ,"value missing"},
 {ERR_REASON(RSA_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"},
 {0,NULL}

diff --git a/crypto/rsa/rsa_gen.c b/crypto/rsa/rsa_gen.c
index 767f7ab..42290cc 100644
--- a/crypto/rsa/rsa_gen.c
+++ b/crypto/rsa/rsa_gen.c

@@ -67,6 +67,9 @@
 #include "cryptlib.h"
 #include <openssl/bn.h>
 #include <openssl/rsa.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb);
 
@@ -77,8 +80,20 @@
  * now just because key-generation is part of RSA_METHOD. */
 int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_GENERATE_KEY_EX, RSA_R_NON_FIPS_RSA_METHOD);
+		return 0;
+		}
+#endif
 	if(rsa->meth->rsa_keygen)
 		return rsa->meth->rsa_keygen(rsa, bits, e_value, cb);
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_rsa_generate_key_ex(rsa, bits, e_value, cb);
+#endif
 	return rsa_builtin_keygen(rsa, bits, e_value, cb);
 	}
 

diff --git a/crypto/rsa/rsa_lib.c b/crypto/rsa/rsa_lib.c
index de45088..c95ceaf 100644
--- a/crypto/rsa/rsa_lib.c
+++ b/crypto/rsa/rsa_lib.c

@@ -67,6 +67,10 @@
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const char RSA_version[]="RSA" OPENSSL_VERSION_PTEXT;
 
 static const RSA_METHOD *default_RSA_meth=NULL;
@@ -87,12 +91,15 @@
 	{
 	if (default_RSA_meth == NULL)
 		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_rsa_pkcs1_ssleay();
+		else
+			return RSA_PKCS1_SSLeay();
+#else
 #ifdef RSA_NULL
 		default_RSA_meth=RSA_null_method();
 #else
-#if 0 /* was: #ifdef RSAref */
-		default_RSA_meth=RSA_PKCS1_RSAref();
-#else
 		default_RSA_meth=RSA_PKCS1_SSLeay();
 #endif
 #endif
@@ -181,7 +188,7 @@
 	ret->blinding=NULL;
 	ret->mt_blinding=NULL;
 	ret->bignum_data=NULL;
-	ret->flags=ret->meth->flags;
+	ret->flags=ret->meth->flags & ~RSA_FLAG_NON_FIPS_ALLOW;
 	if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_RSA, ret, &ret->ex_data))
 		{
 #ifndef OPENSSL_NO_ENGINE
@@ -280,163 +287,6 @@
 	return(CRYPTO_get_ex_data(&r->ex_data,idx));
 	}
 
-int RSA_size(const RSA *r)
-	{
-	return(BN_num_bytes(r->n));
-	}
-
-int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
-	}
-
-int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
-	}
-
-int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
-	}
-
-int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
-	}
-
-int RSA_flags(const RSA *r)
-	{
-	return((r == NULL)?0:r->meth->flags);
-	}
-
-void RSA_blinding_off(RSA *rsa)
-	{
-	if (rsa->blinding != NULL)
-		{
-		BN_BLINDING_free(rsa->blinding);
-		rsa->blinding=NULL;
-		}
-	rsa->flags &= ~RSA_FLAG_BLINDING;
-	rsa->flags |= RSA_FLAG_NO_BLINDING;
-	}
-
-int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
-	{
-	int ret=0;
-
-	if (rsa->blinding != NULL)
-		RSA_blinding_off(rsa);
-
-	rsa->blinding = RSA_setup_blinding(rsa, ctx);
-	if (rsa->blinding == NULL)
-		goto err;
-
-	rsa->flags |= RSA_FLAG_BLINDING;
-	rsa->flags &= ~RSA_FLAG_NO_BLINDING;
-	ret=1;
-err:
-	return(ret);
-	}
-
-static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
-	const BIGNUM *q, BN_CTX *ctx)
-{
-	BIGNUM *ret = NULL, *r0, *r1, *r2;
-
-	if (d == NULL || p == NULL || q == NULL)
-		return NULL;
-
-	BN_CTX_start(ctx);
-	r0 = BN_CTX_get(ctx);
-	r1 = BN_CTX_get(ctx);
-	r2 = BN_CTX_get(ctx);
-	if (r2 == NULL)
-		goto err;
-
-	if (!BN_sub(r1, p, BN_value_one())) goto err;
-	if (!BN_sub(r2, q, BN_value_one())) goto err;
-	if (!BN_mul(r0, r1, r2, ctx)) goto err;
-
-	ret = BN_mod_inverse(NULL, d, r0, ctx);
-err:
-	BN_CTX_end(ctx);
-	return ret;
-}
-
-BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
-{
-	BIGNUM local_n;
-	BIGNUM *e,*n;
-	BN_CTX *ctx;
-	BN_BLINDING *ret = NULL;
-
-	if (in_ctx == NULL)
-		{
-		if ((ctx = BN_CTX_new()) == NULL) return 0;
-		}
-	else
-		ctx = in_ctx;
-
-	BN_CTX_start(ctx);
-	e  = BN_CTX_get(ctx);
-	if (e == NULL)
-		{
-		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
-		goto err;
-		}
-
-	if (rsa->e == NULL)
-		{
-		e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
-		if (e == NULL)
-			{
-			RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
-			goto err;
-			}
-		}
-	else
-		e = rsa->e;
-
-	
-	if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
-		{
-		/* if PRNG is not properly seeded, resort to secret
-		 * exponent as unpredictable seed */
-		RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
-		}
-
-	if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
-		{
-		/* Set BN_FLG_CONSTTIME flag */
-		n = &local_n;
-		BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
-		}
-	else
-		n = rsa->n;
-
-	ret = BN_BLINDING_create_param(NULL, e, n, ctx,
-			rsa->meth->bn_mod_exp, rsa->_method_mod_n);
-	if (ret == NULL)
-		{
-		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
-		goto err;
-		}
-	CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
-err:
-	BN_CTX_end(ctx);
-	if (in_ctx == NULL)
-		BN_CTX_free(ctx);
-	if(rsa->e == NULL)
-		BN_free(e);
-
-	return ret;
-}
-
 int RSA_memory_lock(RSA *r)
 	{
 	int i,j,k,off;

diff --git a/crypto/rsa/rsa_oaep.c b/crypto/rsa/rsa_oaep.c
index 18d307e..553d212 100644
--- a/crypto/rsa/rsa_oaep.c
+++ b/crypto/rsa/rsa_oaep.c

@@ -56,7 +56,8 @@
 	seed = to + 1;
 	db = to + SHA_DIGEST_LENGTH + 1;
 
-	EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL))
+		return 0;
 	memset(db + SHA_DIGEST_LENGTH, 0,
 		emlen - flen - 2 * SHA_DIGEST_LENGTH - 1);
 	db[emlen - flen - SHA_DIGEST_LENGTH - 1] = 0x01;
@@ -145,7 +146,8 @@
 	for (i = 0; i < dblen; i++)
 		db[i] ^= maskeddb[i];
 
-	EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL))
+		return -1;
 
 	if (memcmp(db, phash, SHA_DIGEST_LENGTH) != 0 || bad)
 		goto decoding_err;

diff --git a/crypto/rsa/rsa_pmeth.c b/crypto/rsa/rsa_pmeth.c
index c6892ec..5b2ecf5 100644
--- a/crypto/rsa/rsa_pmeth.c
+++ b/crypto/rsa/rsa_pmeth.c

@@ -63,6 +63,12 @@
 #include <openssl/rsa.h>
 #include <openssl/bn.h>
 #include <openssl/evp.h>
+#ifndef OPENSSL_NO_CMS
+#include <openssl/cms.h>
+#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 #include "evp_locl.h"
 #include "rsa_locl.h"
 
@@ -79,6 +85,8 @@
 	int pad_mode;
 	/* message digest */
 	const EVP_MD *md;
+	/* message digest for MGF1 */
+	const EVP_MD *mgf1md;
 	/* PSS/OAEP salt length */
 	int saltlen;
 	/* Temp buffer */
@@ -95,6 +103,7 @@
 	rctx->pub_exp = NULL;
 	rctx->pad_mode = RSA_PKCS1_PADDING;
 	rctx->md = NULL;
+	rctx->mgf1md = NULL;
 	rctx->tbuf = NULL;
 
 	rctx->saltlen = -2;
@@ -147,6 +156,31 @@
 		OPENSSL_free(rctx);
 		}
 	}
+#ifdef OPENSSL_FIPS
+/* FIP checker. Return value indicates status of context parameters:
+ * 1  : redirect to FIPS.
+ * 0  : don't redirect to FIPS.
+ * -1 : illegal operation in FIPS mode.
+ */
+
+static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx)
+	{
+	RSA_PKEY_CTX *rctx = ctx->data;
+	RSA *rsa = ctx->pkey->pkey.rsa;
+	int rv = -1;
+	if (!FIPS_mode())
+		return 0;
+	if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)
+		rv = 0;
+	if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv)
+		return -1;
+	if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS))
+		return rv;
+	if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS))
+		return rv;
+	return 1;
+	}
+#endif
 
 static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
 					const unsigned char *tbs, size_t tbslen)
@@ -155,6 +189,15 @@
 	RSA_PKEY_CTX *rctx = ctx->data;
 	RSA *rsa = ctx->pkey->pkey.rsa;
 
+#ifdef OPENSSL_FIPS
+	ret = pkey_fips_check_ctx(ctx);
+	if (ret < 0)
+		{
+		RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+		return -1;
+		}
+#endif
+
 	if (rctx->md)
 		{
 		if (tbslen != (size_t)EVP_MD_size(rctx->md))
@@ -163,7 +206,36 @@
 					RSA_R_INVALID_DIGEST_LENGTH);
 			return -1;
 			}
-		if (rctx->pad_mode == RSA_X931_PADDING)
+#ifdef OPENSSL_FIPS
+		if (ret > 0)
+			{
+			unsigned int slen;
+			ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md,
+							rctx->pad_mode,
+							rctx->saltlen,
+							rctx->mgf1md,
+							sig, &slen);
+			if (ret > 0)
+				*siglen = slen;
+			else
+				*siglen = 0;
+			return ret;
+			}
+#endif
+
+		if (EVP_MD_type(rctx->md) == NID_mdc2)
+			{
+			unsigned int sltmp;
+			if (rctx->pad_mode != RSA_PKCS1_PADDING)
+				return -1;
+			ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2,
+						tbs, tbslen, sig, &sltmp, rsa);
+
+			if (ret <= 0)
+				return ret;
+			ret = sltmp;
+			}
+		else if (rctx->pad_mode == RSA_X931_PADDING)
 			{
 			if (!setup_tbuf(rctx, ctx))
 				return -1;
@@ -186,8 +258,10 @@
 			{
 			if (!setup_tbuf(rctx, ctx))
 				return -1;
-			if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs,
-						rctx->md, rctx->saltlen))
+			if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa,
+						rctx->tbuf, tbs,
+						rctx->md, rctx->mgf1md,
+						rctx->saltlen))
 				return -1;
 			ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf,
 						sig, rsa, RSA_NO_PADDING);
@@ -269,8 +343,30 @@
 	RSA_PKEY_CTX *rctx = ctx->data;
 	RSA *rsa = ctx->pkey->pkey.rsa;
 	size_t rslen;
+#ifdef OPENSSL_FIPS
+	int rv;
+	rv = pkey_fips_check_ctx(ctx);
+	if (rv < 0)
+		{
+		RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+		return -1;
+		}
+#endif
 	if (rctx->md)
 		{
+#ifdef OPENSSL_FIPS
+		if (rv > 0)
+			{
+			return FIPS_rsa_verify_digest(rsa,
+							tbs, tbslen,
+							rctx->md,
+							rctx->pad_mode,
+							rctx->saltlen,
+							rctx->mgf1md,
+							sig, siglen);
+							
+			}
+#endif
 		if (rctx->pad_mode == RSA_PKCS1_PADDING)
 			return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen,
 					sig, siglen, rsa);
@@ -289,7 +385,8 @@
 							rsa, RSA_NO_PADDING);
 			if (ret <= 0)
 				return 0;
-			ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md,
+			ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs,
+						rctx->md, rctx->mgf1md,
 						rctx->tbuf, rctx->saltlen);
 			if (ret <= 0)
 				return 0;
@@ -403,15 +500,25 @@
 				RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE);
 		return -2;
 
+		case EVP_PKEY_CTRL_GET_RSA_PADDING:
+		*(int *)p2 = rctx->pad_mode;
+		return 1;
+
 		case EVP_PKEY_CTRL_RSA_PSS_SALTLEN:
-		if (p1 < -2)
-			return -2;
+		case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN:
 		if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
 			{
 			RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN);
 			return -2;
 			}
-		rctx->saltlen = p1;
+		if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN)
+			*(int *)p2 = rctx->saltlen;
+		else
+			{
+			if (p1 < -2)
+				return -2;
+			rctx->saltlen = p1;
+			}
 		return 1;
 
 		case EVP_PKEY_CTRL_RSA_KEYGEN_BITS:
@@ -435,16 +542,45 @@
 		rctx->md = p2;
 		return 1;
 
+		case EVP_PKEY_CTRL_RSA_MGF1_MD:
+		case EVP_PKEY_CTRL_GET_RSA_MGF1_MD:
+		if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
+			{
+			RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD);
+			return -2;
+			}
+		if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD)
+			{
+			if (rctx->mgf1md)
+				*(const EVP_MD **)p2 = rctx->mgf1md;
+			else
+				*(const EVP_MD **)p2 = rctx->md;
+			}
+		else
+			rctx->mgf1md = p2;
+		return 1;
+
 		case EVP_PKEY_CTRL_DIGESTINIT:
 		case EVP_PKEY_CTRL_PKCS7_ENCRYPT:
 		case EVP_PKEY_CTRL_PKCS7_DECRYPT:
 		case EVP_PKEY_CTRL_PKCS7_SIGN:
-#ifndef OPENSSL_NO_CMS
-		case EVP_PKEY_CTRL_CMS_ENCRYPT:
-		case EVP_PKEY_CTRL_CMS_DECRYPT:
-		case EVP_PKEY_CTRL_CMS_SIGN:
-#endif
 		return 1;
+#ifndef OPENSSL_NO_CMS
+		case EVP_PKEY_CTRL_CMS_DECRYPT:
+		{
+		X509_ALGOR *alg = NULL;
+		ASN1_OBJECT *encalg = NULL;
+		if (p2)
+			CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg);
+		if (alg)
+			X509_ALGOR_get0(&encalg, NULL, NULL, alg);
+		if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep)
+			rctx->pad_mode = RSA_PKCS1_OAEP_PADDING;
+		}
+		case EVP_PKEY_CTRL_CMS_ENCRYPT:
+		case EVP_PKEY_CTRL_CMS_SIGN:
+		return 1;
+#endif
 		case EVP_PKEY_CTRL_PEER_KEY:
 			RSAerr(RSA_F_PKEY_RSA_CTRL,
 			RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE);

diff --git a/crypto/rsa/rsa_pss.c b/crypto/rsa/rsa_pss.c
index ac211e2..5f9f533 100644
--- a/crypto/rsa/rsa_pss.c
+++ b/crypto/rsa/rsa_pss.c

@@ -73,6 +73,13 @@
 int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 			const EVP_MD *Hash, const unsigned char *EM, int sLen)
 	{
+	return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen);
+	}
+
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash,
+			const unsigned char *EM, int sLen)
+	{
 	int i;
 	int ret = 0;
 	int hLen, maskedDBLen, MSBits, emLen;
@@ -80,6 +87,10 @@
 	unsigned char *DB = NULL;
 	EVP_MD_CTX ctx;
 	unsigned char H_[EVP_MAX_MD_SIZE];
+	EVP_MD_CTX_init(&ctx);
+
+	if (mgf1Hash == NULL)
+		mgf1Hash = Hash;
 
 	hLen = EVP_MD_size(Hash);
 	if (hLen < 0)
@@ -94,7 +105,7 @@
 	else if (sLen == -2)	sLen = -2;
 	else if (sLen < -2)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
 		goto err;
 		}
 
@@ -102,7 +113,7 @@
 	emLen = RSA_size(rsa);
 	if (EM[0] & (0xFF << MSBits))
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID);
 		goto err;
 		}
 	if (MSBits == 0)
@@ -112,12 +123,12 @@
 		}
 	if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE);
 		goto err;
 		}
 	if (EM[emLen - 1] != 0xbc)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID);
 		goto err;
 		}
 	maskedDBLen = emLen - hLen - 1;
@@ -125,10 +136,10 @@
 	DB = OPENSSL_malloc(maskedDBLen);
 	if (!DB)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE);
 		goto err;
 		}
-	if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0)
+	if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0)
 		goto err;
 	for (i = 0; i < maskedDBLen; i++)
 		DB[i] ^= EM[i];
@@ -137,25 +148,28 @@
 	for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ;
 	if (DB[i++] != 0x1)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED);
 		goto err;
 		}
 	if (sLen >= 0 && (maskedDBLen - i) != sLen)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
 		goto err;
 		}
-	EVP_MD_CTX_init(&ctx);
-	EVP_DigestInit_ex(&ctx, Hash, NULL);
-	EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
-	EVP_DigestUpdate(&ctx, mHash, hLen);
+	if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+		|| !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+		|| !EVP_DigestUpdate(&ctx, mHash, hLen))
+		goto err;
 	if (maskedDBLen - i)
-		EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i);
-	EVP_DigestFinal(&ctx, H_, NULL);
-	EVP_MD_CTX_cleanup(&ctx);
+		{
+		if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i))
+			goto err;
+		}
+	if (!EVP_DigestFinal_ex(&ctx, H_, NULL))
+		goto err;
 	if (memcmp(H_, H, hLen))
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE);
 		ret = 0;
 		}
 	else 
@@ -164,6 +178,7 @@
 	err:
 	if (DB)
 		OPENSSL_free(DB);
+	EVP_MD_CTX_cleanup(&ctx);
 
 	return ret;
 
@@ -173,12 +188,22 @@
 			const unsigned char *mHash,
 			const EVP_MD *Hash, int sLen)
 	{
+	return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen);
+	}
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+			const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen)
+	{
 	int i;
 	int ret = 0;
 	int hLen, maskedDBLen, MSBits, emLen;
 	unsigned char *H, *salt = NULL, *p;
 	EVP_MD_CTX ctx;
 
+	if (mgf1Hash == NULL)
+		mgf1Hash = Hash;
+
 	hLen = EVP_MD_size(Hash);
 	if (hLen < 0)
 		goto err;
@@ -192,7 +217,7 @@
 	else if (sLen == -2)	sLen = -2;
 	else if (sLen < -2)
 		{
-		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
 		goto err;
 		}
 
@@ -209,8 +234,7 @@
 		}
 	else if (emLen < (hLen + sLen + 2))
 		{
-		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
-		   RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
+		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
 		goto err;
 		}
 	if (sLen > 0)
@@ -218,8 +242,7 @@
 		salt = OPENSSL_malloc(sLen);
 		if (!salt)
 			{
-			RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
-		   		ERR_R_MALLOC_FAILURE);
+			RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE);
 			goto err;
 			}
 		if (RAND_bytes(salt, sLen) <= 0)
@@ -228,16 +251,18 @@
 	maskedDBLen = emLen - hLen - 1;
 	H = EM + maskedDBLen;
 	EVP_MD_CTX_init(&ctx);
-	EVP_DigestInit_ex(&ctx, Hash, NULL);
-	EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
-	EVP_DigestUpdate(&ctx, mHash, hLen);
-	if (sLen)
-		EVP_DigestUpdate(&ctx, salt, sLen);
-	EVP_DigestFinal(&ctx, H, NULL);
+	if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+		|| !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+		|| !EVP_DigestUpdate(&ctx, mHash, hLen))
+		goto err;
+	if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen))
+		goto err;
+	if (!EVP_DigestFinal_ex(&ctx, H, NULL))
+		goto err;
 	EVP_MD_CTX_cleanup(&ctx);
 
 	/* Generate dbMask in place then perform XOR on it */
-	if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash))
+	if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash))
 		goto err;
 
 	p = EM;

diff --git a/crypto/rsa/rsa_sign.c b/crypto/rsa/rsa_sign.c
index 0be4ec7..b6f6037 100644
--- a/crypto/rsa/rsa_sign.c
+++ b/crypto/rsa/rsa_sign.c

@@ -77,6 +77,14 @@
 	const unsigned char *s = NULL;
 	X509_ALGOR algor;
 	ASN1_OCTET_STRING digest;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_SIGN, RSA_R_NON_FIPS_RSA_METHOD);
+		return 0;
+		}
+#endif
 	if((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign)
 		{
 		return rsa->meth->rsa_sign(type, m, m_len,
@@ -153,6 +161,15 @@
 	unsigned char *s;
 	X509_SIG *sig=NULL;
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_INT_RSA_VERIFY, RSA_R_NON_FIPS_RSA_METHOD);
+		return 0;
+		}
+#endif
+
 	if (siglen != (unsigned int)RSA_size(rsa))
 		{
 		RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_WRONG_SIGNATURE_LENGTH);
@@ -182,6 +199,22 @@
 	i=RSA_public_decrypt((int)siglen,sigbuf,s,rsa,RSA_PKCS1_PADDING);
 
 	if (i <= 0) goto err;
+	/* Oddball MDC2 case: signature can be OCTET STRING.
+	 * check for correct tag and length octets.
+	 */
+	if (dtype == NID_mdc2 && i == 18 && s[0] == 0x04 && s[1] == 0x10)
+		{
+		if (rm)
+			{
+			memcpy(rm, s + 2, 16);
+			*prm_len = 16;
+			ret = 1;
+			}
+		else if(memcmp(m, s + 2, 16))
+			RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_BAD_SIGNATURE);
+		else
+			ret = 1;
+		}
 
 	/* Special case: SSL signature */
 	if(dtype == NID_md5_sha1) {

diff --git a/crypto/s390xcap.c b/crypto/s390xcap.c
index ffbe023..f2e94ef 100644
--- a/crypto/s390xcap.c
+++ b/crypto/s390xcap.c

@@ -4,7 +4,7 @@
 #include <setjmp.h>
 #include <signal.h>
 
-extern unsigned long OPENSSL_s390xcap_P;
+extern unsigned long OPENSSL_s390xcap_P[];
 
 static sigjmp_buf ill_jmp;
 static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
@@ -16,7 +16,9 @@
 	sigset_t oset;
 	struct sigaction ill_act,oact;
 
-	if (OPENSSL_s390xcap_P) return;
+	if (OPENSSL_s390xcap_P[0]) return;
+
+	OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1);
 
 	memset(&ill_act,0,sizeof(ill_act));
 	ill_act.sa_handler = ill_handler;
@@ -27,10 +29,8 @@
 	sigaction (SIGILL,&ill_act,&oact);
 
 	/* protection against missing store-facility-list-extended */
-	if (sigsetjmp(ill_jmp,0) == 0)
-		OPENSSL_s390xcap_P = OPENSSL_s390x_facilities();
-	else
-		OPENSSL_s390xcap_P = 1UL<<63;
+	if (sigsetjmp(ill_jmp,1) == 0)
+		OPENSSL_s390x_facilities();
 
 	sigaction (SIGILL,&oact,NULL);
 	sigprocmask(SIG_SETMASK,&oset,NULL);

diff --git a/crypto/s390xcpuid.S b/crypto/s390xcpuid.S
index b053c6a..0681534 100644
--- a/crypto/s390xcpuid.S
+++ b/crypto/s390xcpuid.S

@@ -5,10 +5,14 @@
 .align	16
 OPENSSL_s390x_facilities:
 	lghi	%r0,0
-	.long	0xb2b0f010	# stfle	16(%r15)
-	lg	%r2,16(%r15)
-	larl	%r1,OPENSSL_s390xcap_P
-	stg	%r2,0(%r1)
+	larl	%r2,OPENSSL_s390xcap_P
+	stg	%r0,8(%r2)
+	.long	0xb2b02000	# stfle	0(%r2)
+	brc	8,.Ldone
+	lghi	%r0,1
+	.long	0xb2b02000	# stfle 0(%r2)
+.Ldone:
+	lg	%r2,0(%r2)
 	br	%r14
 .size	OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
 
@@ -58,6 +62,9 @@
 .type	OPENSSL_cleanse,@function
 .align	16
 OPENSSL_cleanse:
+#if !defined(__s390x__) && !defined(__s390x)
+	llgfr	%r3,%r3
+#endif
 	lghi	%r4,15
 	lghi	%r0,0
 	clgr	%r3,%r4
@@ -89,4 +96,4 @@
 .section	.init
 	brasl	%r14,OPENSSL_cpuid_setup
 
-.comm	OPENSSL_s390xcap_P,8,8
+.comm	OPENSSL_s390xcap_P,16,8

diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index a1f8762..1084d22 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl

@@ -12,6 +12,8 @@
 # commentary below], and in 2006 the rest was rewritten in order to
 # gain freedom to liberate licensing terms.
 
+# January, September 2004.
+#
 # It was noted that Intel IA-32 C compiler generates code which
 # performs ~30% *faster* on P4 CPU than original *hand-coded*
 # SHA1 assembler implementation. To address this problem (and
@@ -31,12 +33,92 @@
 # ----------------------------------------------------------------
 #					<[email protected]>
 
+# August 2009.
+#
+# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
+# '(c&d) + (b&(c^d))', which allows to accumulate partial results
+# and lighten "pressure" on scratch registers. This resulted in
+# >12% performance improvement on contemporary AMD cores (with no
+# degradation on other CPUs:-). Also, the code was revised to maximize
+# "distance" between instructions producing input to 'lea' instruction
+# and the 'lea' instruction itself, which is essential for Intel Atom
+# core and resulted in ~15% improvement.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
+# and in SSE2 context was first explored by Dean Gaudet in 2004, see
+# http://arctic.org/~dean/crypto/sha1.html. Since then several things
+# have changed that made it interesting again:
+#
+# a) XMM units became faster and wider;
+# b) instruction set became more versatile;
+# c) an important observation was made by Max Locktykhin, which made
+#    it possible to reduce amount of instructions required to perform
+#    the operation in question, for further details see
+#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
+
+# April 2011.
+#
+# Add AVX code path, probably most controversial... The thing is that
+# switch to AVX alone improves performance by as little as 4% in
+# comparison to SSSE3 code path. But below result doesn't look like
+# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
+# pair of µ-ops, and it's the additional µ-ops, two per round, that
+# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
+# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
+# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
+# cycles per processed byte. But 'sh[rl]d' is not something that used
+# to be fast, nor does it appear to be fast in upcoming Bulldozer
+# [according to its optimization manual]. Which is why AVX code path
+# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
+# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
+# makes no sense to keep the AVX code path. If somebody feels that
+# strongly, it's probably more appropriate to discuss possibility of
+# using vector rotate XOP on AMD...
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+#		x86		SSSE3		AVX
+# Pentium	15.7		-
+# PIII		11.5		-
+# P4		10.6		-
+# AMD K8	7.1		-
+# Core2		7.3		6.1/+20%	-
+# Atom		12.5		9.5(*)/+32%	-
+# Westmere	7.3		5.6/+30%	-
+# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+70%
+#
+# (*)	Loop is 1056 instructions long and expected result is ~8.25.
+#	It remains mystery [to me] why ILP is limited to 1.7.
+#
+# (**)	As per above comment, the result is for AVX *plus* sh[rl]d.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
 
+$xmm=$ymm=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+$ymm=1 if ($xmm &&
+		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+		$1>=2.19);	# first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && 
+		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+		$1>=2.03);	# first version supporting AVX
+
+&external_label("OPENSSL_ia32cap_P") if ($xmm);
+
+
 $A="eax";
 $B="ebx";
 $C="ecx";
@@ -47,6 +129,10 @@
 
 @V=($A,$B,$C,$D,$E,$T);
 
+$alt=0;	# 1 denotes alternative IALU implementation, which performs
+	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
+	# Sandy Bridge...
+
 sub BODY_00_15
 	{
 	local($n,$a,$b,$c,$d,$e,$f)=@_;
@@ -59,16 +145,18 @@
 	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
 	 &xor($f,$d);
 	&add($tmp1,$e);			# tmp1+=e;
-	 &and($f,$b);
-	&mov($e,&swtmp($n%16));		# e becomes volatile and is loaded
+	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
 	 				# with xi, also note that e becomes
 					# f in next round...
-	 &xor($f,$d);			# f holds F_00_19(b,c,d)
+	&and($f,$b);
 	&rotr($b,2);			# b=ROTATE(b,30)
-	 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
+	 &xor($f,$d);			# f holds F_00_19(b,c,d)
+	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
 
-	if ($n==15) { &add($f,$tmp1); }	# f+=tmp1
+	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
+		      &add($f,$tmp1); }	# f+=tmp1
 	else        { &add($tmp1,$f); }	# f becomes a in next round
+	&mov($tmp1,$a)			if ($alt && $n==15);
 	}
 
 sub BODY_16_19
@@ -77,22 +165,41 @@
 
 	&comment("16_19 $n");
 
-	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
-	 &mov($tmp1,$c);		# tmp1 to hold F_00_19(b,c,d)
-	&xor($f,&swtmp(($n+2)%16));
-	 &xor($tmp1,$d);
-	&xor($f,&swtmp(($n+8)%16));
-	 &and($tmp1,$b);		# tmp1 holds F_00_19(b,c,d)
-	&rotr($b,2);			# b=ROTATE(b,30)
+if ($alt) {
+	&xor($c,$d);
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
+	 &xor($f,&swtmp(($n+8)%16));
+	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &add($e,$tmp1);		# e+=F_00_19(b,c,d)
+	&xor($c,$d);			# restore $c
+	 &mov($tmp1,$a);		# b in next round
+	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+} else {
+	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&and($tmp1,$b);
 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
 	&rotl($f,1);			# f=ROTATE(f,1)
 	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
-	&mov(&swtmp($n%16),$f);		# xi=f
-	&lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
-	 &mov($e,$a);			# e becomes volatile
-	&rotl($e,5);			# e=ROTATE(a,5)
-	 &add($f,$tmp1);		# f+=F_00_19(b,c,d)
-	&add($f,$e);			# f+=ROTATE(a,5)
+	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
+	 &mov($tmp1,$a);
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($tmp1,5);			# ROTATE(a,5)
+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$tmp1);		# f+=ROTATE(a,5)
+}
 	}
 
 sub BODY_20_39
@@ -102,21 +209,41 @@
 
 	&comment("20_39 $n");
 
+if ($alt) {
+	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+8)%16));
+	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &mov($tmp1,$a);		# b in next round
+	&rotr($b,7);			# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &xor($b,$c)			if($n==39);# warm up for BODY_40_59
+	&and($tmp1,$b)			if($n==39);
+	 &lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
+	&mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+	&rotr($a,5)			if ($n==79);
+} else {
 	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
-	 &mov($f,&swtmp($n%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
-	&rotr($b,2);			# b=ROTATE(b,30)
-	 &xor($f,&swtmp(($n+2)%16));
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
 	&xor($tmp1,$c);
 	 &xor($f,&swtmp(($n+8)%16));
 	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
 	&rotl($f,1);			# f=ROTATE(f,1)
-	 &add($tmp1,$e);
-	&mov(&swtmp($n%16),$f);		# xi=f
-	 &mov($e,$a);			# e becomes volatile
-	&rotl($e,5);			# e=ROTATE(a,5)
-	 &lea($f,&DWP($K,$f,$tmp1));	# f+=K_20_39+e
-	&add($f,$e);			# f+=ROTATE(a,5)
+	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &mov($tmp1,$a);
+	&rotl($tmp1,5);			# ROTATE(a,5)
+	 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
+	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
+	 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
+	&add($f,$tmp1);			# f+=ROTATE(a,5)
+}
 	}
 
 sub BODY_40_59
@@ -125,41 +252,86 @@
 
 	&comment("40_59 $n");
 
-	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
-	 &mov($tmp1,&swtmp(($n+2)%16));
-	&xor($f,$tmp1);
-	 &mov($tmp1,&swtmp(($n+8)%16));
-	&xor($f,$tmp1);
-	 &mov($tmp1,&swtmp(($n+13)%16));
-	&xor($f,$tmp1);			# f holds xa^xb^xc^xd
-	 &mov($tmp1,$b);		# tmp1 to hold F_40_59(b,c,d)
+if ($alt) {
+	&add($e,$tmp1);			# e+=b&(c^d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&mov($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&xor($c,$d);			# restore $c
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
 	&rotl($f,1);			# f=ROTATE(f,1)
-	 &or($tmp1,$c);
-	&mov(&swtmp($n%16),$f);		# xi=f
-	 &and($tmp1,$d);
-	&lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
-	 &mov($e,$b);			# e becomes volatile and is used
-					# to calculate F_40_59(b,c,d)
+	 &and($tmp1,$c);
+	&rotr($b,7);			# b=ROTATE(b,30)
+	 &add($e,$tmp1);		# e+=c&d
+	&mov($tmp1,$a);			# b in next round
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &xor($b,$c)			if ($n<59);
+	&and($tmp1,$b)			if ($n<59);# tmp1 to hold F_40_59(b,c,d)
+	 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+} else {
+	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&and($tmp1,$b);
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &add($tmp1,$e);		# b&(c^d)+=e
 	&rotr($b,2);			# b=ROTATE(b,30)
-	 &and($e,$c);
-	&or($tmp1,$e);			# tmp1 holds F_40_59(b,c,d)		
-	 &mov($e,$a);
-	&rotl($e,5);			# e=ROTATE(a,5)
-	 &add($f,$tmp1);		# f+=tmp1;
+	 &mov($e,$a);			# e becomes volatile
+	&rotl($e,5);			# ROTATE(a,5)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
+	 &mov($tmp1,$c);
 	&add($f,$e);			# f+=ROTATE(a,5)
+	 &and($tmp1,$d);
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$tmp1);		# f+=c&d
+}
 	}
 
 &function_begin("sha1_block_data_order");
+if ($xmm) {
+  &static_label("ssse3_shortcut");
+  &static_label("avx_shortcut")		if ($ymm);
+  &static_label("K_XX_XX");
+
+	&call	(&label("pic_point"));	# make it PIC!
+  &set_label("pic_point");
+	&blindpop($tmp1);
+	&picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+
+	&mov	($A,&DWP(0,$T));
+	&mov	($D,&DWP(4,$T));
+	&test	($D,1<<9);		# check SSSE3 bit
+	&jz	(&label("x86"));
+	&test	($A,1<<24);		# check FXSR bit
+	&jz	(&label("x86"));
+	if ($ymm) {
+		&and	($D,1<<28);		# mask AVX bit
+		&and	($A,1<<30);		# mask "Intel CPU" bit
+		&or	($A,$D);
+		&cmp	($A,1<<28|1<<30);
+		&je	(&label("avx_shortcut"));
+	}
+	&jmp	(&label("ssse3_shortcut"));
+  &set_label("x86",16);
+}
 	&mov($tmp1,&wparam(0));	# SHA_CTX *c
 	&mov($T,&wparam(1));	# const void *input
 	&mov($A,&wparam(2));	# size_t num
-	&stack_push(16);	# allocate X[16]
+	&stack_push(16+3);	# allocate X[16]
 	&shl($A,6);
 	&add($A,$T);
 	&mov(&wparam(2),$A);	# pointer beyond the end of input
 	&mov($E,&DWP(16,$tmp1));# pre-load E
+	&jmp(&label("loop"));
 
-	&set_label("loop",16);
+&set_label("loop",16);
 
 	# copy input chunk to X, but reversing byte order!
 	for ($i=0; $i<16; $i+=4)
@@ -213,8 +385,845 @@
 	&mov(&DWP(16,$tmp1),$C);
 	&jb(&label("loop"));
 
-	&stack_pop(16);
+	&stack_pop(16+3);
 &function_end("sha1_block_data_order");
+
+if ($xmm) {
+######################################################################
+# The SSSE3 implementation.
+#
+# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
+# 32 elements of the message schedule or Xupdate outputs. First 4
+# quadruples are simply byte-swapped input, next 4 are calculated
+# according to method originally suggested by Dean Gaudet (modulo
+# being implemented in SSSE3). Once 8 quadruples or 32 elements are
+# collected, it switches to routine proposed by Max Locktyukhin.
+#
+# Calculations inevitably require temporary reqisters, and there are
+# no %xmm registers left to spare. For this reason part of the ring
+# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring
+# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
+# X[-5], and X[4] - X[-4]...
+#
+# Another notable optimization is aggressive stack frame compression
+# aiming to minimize amount of 9-byte instructions...
+#
+# Yet another notable optimization is "jumping" $B variable. It means
+# that there is no register permanently allocated for $B value. This
+# allowed to eliminate one instruction from body_20_39...
+#
+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0;			# hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+&function_begin("_sha1_block_data_order_ssse3");
+	&call	(&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($tmp1);
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("ssse3_shortcut");
+
+	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
+	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
+	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
+	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
+	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask
+
+	&mov	($E,&wparam(0));		# load argument block
+	&mov	($inp=@T[1],&wparam(1));
+	&mov	($D,&wparam(2));
+	&mov	(@T[0],"esp");
+
+	# stack frame layout
+	#
+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
+	#
+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
+	#	X[4]	X[5]	X[6]	X[7]
+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
+	#
+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
+	#	K_40_59	K_40_59	K_40_59	K_40_59
+	#	K_60_79	K_60_79	K_60_79	K_60_79
+	#	K_00_19	K_00_19	K_00_19	K_00_19
+	#	pbswap mask
+	#
+	# +192	ctx				# argument block
+	# +196	inp
+	# +200	end
+	# +204	esp
+	&sub	("esp",208);
+	&and	("esp",-64);
+
+	&movdqa	(&QWP(112+0,"esp"),@X[4]);	# copy constants
+	&movdqa	(&QWP(112+16,"esp"),@X[5]);
+	&movdqa	(&QWP(112+32,"esp"),@X[6]);
+	&shl	($D,6);				# len*64
+	&movdqa	(&QWP(112+48,"esp"),@X[3]);
+	&add	($D,$inp);			# end of input
+	&movdqa	(&QWP(112+64,"esp"),@X[2]);
+	&add	($inp,64);
+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&mov	(&DWP(192+8,"esp"),$D);
+	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp
+
+	&mov	($A,&DWP(0,$E));		# load context
+	&mov	($B,&DWP(4,$E));
+	&mov	($C,&DWP(8,$E));
+	&mov	($D,&DWP(12,$E));
+	&mov	($E,&DWP(16,$E));
+	&mov	(@T[0],$B);			# magic seed
+
+	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
+	&movdqu	(@X[-3&7],&QWP(-48,$inp));
+	&movdqu	(@X[-2&7],&QWP(-32,$inp));
+	&movdqu	(@X[-1&7],&QWP(-16,$inp));
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&pshufb	(@X[-3&7],@X[2]);
+	&pshufb	(@X[-2&7],@X[2]);
+	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+	&pshufb	(@X[-1&7],@X[2]);
+	&paddd	(@X[-4&7],@X[3]);		# add K_00_19
+	&paddd	(@X[-3&7],@X[3]);
+	&paddd	(@X[-2&7],@X[3]);
+	&movdqa	(&QWP(0,"esp"),@X[-4&7]);	# X[]+K xfer to IALU
+	&psubd	(@X[-4&7],@X[3]);		# restore X[]
+	&movdqa	(&QWP(0+16,"esp"),@X[-3&7]);
+	&psubd	(@X[-3&7],@X[3]);
+	&movdqa	(&QWP(0+32,"esp"),@X[-2&7]);
+	&psubd	(@X[-2&7],@X[3]);
+	&movdqa	(@X[0],@X[-3&7]);
+	&jmp	(&label("loop"));
+
+######################################################################
+# SSE instruction sequence is first broken to groups of indepentent
+# instructions, independent in respect to their inputs and shifter
+# (not all architectures have more than one). Then IALU instructions
+# are "knitted in" between the SSE groups. Distance is maintained for
+# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer
+# [which allegedly also implements SSSE3]...
+#
+# Temporary registers usage. X[2] is volatile at the entry and at the
+# end is restored from backtrace ring buffer. X[3] is expected to
+# contain current K_XX_XX constant and is used to caclulate X[-1]+K
+# from previous round, it becomes volatile the moment the value is
+# saved to stack for transfer to IALU. X[4] becomes volatile whenever
+# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
+# end it is loaded with next K_XX_XX [which becomes X[3] in next
+# round]...
+#
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	&movdqa	(@X[2],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@X[3],@X[-1&7]);
+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@X[4],@X[0]);
+	&movdqa	(@X[2],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@X[2],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@X[3],@X[4]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@X[4],30);
+	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@X[3],2);
+	&pxor	(@X[0],@X[4]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
+	  &movdqa	(@X[1],@X[-2&7])	if ($Xi<7);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@X[2],@X[-1&7])	if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&palignr(@X[2],@X[-2&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 if ($Xi%5) {
+	  &movdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
+	 } else {			# ... or load next one
+	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+	 }
+	  &paddd	(@X[3],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@X[2],@X[0]);
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@X[2],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	  &movdqa	(@X[3],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &paddd	(@X[3],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
+	&cmp	($inp,&DWP(192+8,"esp"));
+	&je	(&label("done"));
+
+	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
+	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
+	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
+	&movdqu	(@X[-3&7],&QWP(16,$inp));
+	&movdqu	(@X[-2&7],&QWP(32,$inp));
+	&movdqu	(@X[-1&7],&QWP(48,$inp));
+	&add	($inp,64);
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@X[3]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
+	'&xor	($c,$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&xor	($c,$d);',	# restore $c
+	'&xor	(@T[0],$d);',
+	'&add	($e,$a);',
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_20_39 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
+	'&xor	(@T[0],$d);',	# ($b^$d)
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
+	'&add	($e,$a);',
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_40_59 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&mov	(@T[1],$c);',
+	'&xor	($c,$d);',
+	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
+	'&and	(@T[1],$d);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[1]);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	($c,$d);',	# restore $c
+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+&set_label("loop",16);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	($B,@T[0]);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+	&movdqa	(@X[0],@X[-3&7]);
+
+	&jmp	(&label("loop"));
+
+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+
+&function_end("_sha1_block_data_order_ssse3");
+
+if ($ymm) {
+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0;			# hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+&function_begin("_sha1_block_data_order_avx");
+	&call	(&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($tmp1);
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("avx_shortcut");
+	&vzeroall();
+
+	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
+	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
+	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
+	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
+	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask
+
+	&mov	($E,&wparam(0));		# load argument block
+	&mov	($inp=@T[1],&wparam(1));
+	&mov	($D,&wparam(2));
+	&mov	(@T[0],"esp");
+
+	# stack frame layout
+	#
+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
+	#
+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
+	#	X[4]	X[5]	X[6]	X[7]
+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
+	#
+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
+	#	K_40_59	K_40_59	K_40_59	K_40_59
+	#	K_60_79	K_60_79	K_60_79	K_60_79
+	#	K_00_19	K_00_19	K_00_19	K_00_19
+	#	pbswap mask
+	#
+	# +192	ctx				# argument block
+	# +196	inp
+	# +200	end
+	# +204	esp
+	&sub	("esp",208);
+	&and	("esp",-64);
+
+	&vmovdqa(&QWP(112+0,"esp"),@X[4]);	# copy constants
+	&vmovdqa(&QWP(112+16,"esp"),@X[5]);
+	&vmovdqa(&QWP(112+32,"esp"),@X[6]);
+	&shl	($D,6);				# len*64
+	&vmovdqa(&QWP(112+48,"esp"),@X[3]);
+	&add	($D,$inp);			# end of input
+	&vmovdqa(&QWP(112+64,"esp"),@X[2]);
+	&add	($inp,64);
+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&mov	(&DWP(192+8,"esp"),$D);
+	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp
+
+	&mov	($A,&DWP(0,$E));		# load context
+	&mov	($B,&DWP(4,$E));
+	&mov	($C,&DWP(8,$E));
+	&mov	($D,&DWP(12,$E));
+	&mov	($E,&DWP(16,$E));
+	&mov	(@T[0],$B);			# magic seed
+
+	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
+	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
+	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
+	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
+	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
+	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
+	&vpaddd	(@X[0],@X[-4&7],@X[3]);		# add K_00_19
+	&vpaddd	(@X[1],@X[-3&7],@X[3]);
+	&vpaddd	(@X[2],@X[-2&7],@X[3]);
+	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
+	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
+	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
+	&jmp	(&label("loop"));
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@X[2],@X[-1&7],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[2],@X[2],@X[-2&7]);		# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@X[2],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@X[4],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@X[3],@X[4],30);
+	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@X[4],@X[4],2);
+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 if ($Xi%5) {
+	  &vmovdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
+	 } else {			# ... or load next one
+	  &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+	 }
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@X[2],@X[0],30);
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@X[2]);	# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
+	&cmp	($inp,&DWP(192+8,"esp"));
+	&je	(&label("done"));
+
+	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
+	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
+	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
+	&vmovdqu(@X[-3&7],&QWP(16,$inp));
+	&vmovdqu(@X[-2&7],&QWP(32,$inp));
+	&vmovdqu(@X[-1&7],&QWP(48,$inp));
+	&add	($inp,64);
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);		# byte swap
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb	(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa	(&QWP(0+16*$Xi,"esp"),@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+&set_label("loop",16);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	($B,@T[0]);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+
+	&jmp	(&label("loop"));
+
+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+	&vzeroall();
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+&function_end("_sha1_block_data_order_avx");
+}
+&set_label("K_XX_XX",64);
+&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
+&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
+&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
+&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
+}
 &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();

diff --git a/crypto/sha/asm/sha1-alpha.pl b/crypto/sha/asm/sha1-alpha.pl
new file mode 100644
index 0000000..6c4b925
--- /dev/null
+++ b/crypto/sha/asm/sha1-alpha.pl

@@ -0,0 +1,322 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for Alpha.
+
+# On 21264 performance is 33% better than code generated by vendor
+# compiler, and 75% better than GCC [3.4], and in absolute terms is
+# 8.7 cycles per processed byte. Implementation features vectorized
+# byte swap, but not Xupdate.
+
+@X=(	"\$0",	"\$1",	"\$2",	"\$3",	"\$4",	"\$5",	"\$6",	"\$7",
+	"\$8",	"\$9",	"\$10",	"\$11",	"\$12",	"\$13",	"\$14",	"\$15");
+$ctx="a0";	# $16
+$inp="a1";
+$num="a2";
+$A="a3";
+$B="a4";	# 20
+$C="a5";
+$D="t8";
+$E="t9";	@V=($A,$B,$C,$D,$E);
+$t0="t10";	# 24
+$t1="t11";
+$t2="ra";
+$t3="t12";
+$K="AT";	# 28
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+	ldq_u	@X[0],0+0($inp)
+	ldq_u	@X[1],0+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<14);
+	ldq_u	@X[$i+2],($i+2)*4+0($inp)
+	ldq_u	@X[$i+3],($i+2)*4+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<15);
+	extql	@X[$i],$inp,@X[$i]
+	extqh	@X[$i+1],$inp,@X[$i+1]
+
+	or	@X[$i+1],@X[$i],@X[$i]	# pair of 32-bit values are fetched
+
+	srl	@X[$i],24,$t0		# vectorized byte swap
+	srl	@X[$i],8,$t2
+
+	sll	@X[$i],8,$t3
+	sll	@X[$i],24,@X[$i]
+	zapnot	$t0,0x11,$t0
+	zapnot	$t2,0x22,$t2
+
+	zapnot	@X[$i],0x88,@X[$i]
+	or	$t0,$t2,$t0
+	zapnot	$t3,0x44,$t3
+	sll	$a,5,$t1
+
+	or	@X[$i],$t0,@X[$i]
+	addl	$K,$e,$e
+	and	$b,$c,$t2
+	zapnot	$a,0xf,$a
+
+	or	@X[$i],$t3,@X[$i]
+	srl	$a,27,$t0
+	bic	$d,$b,$t3
+	sll	$b,30,$b
+
+	extll	@X[$i],4,@X[$i+1]	# extract upper half
+	or	$t2,$t3,$t2
+	addl	@X[$i],$e,$e
+
+	addl	$t1,$e,$e
+	srl	$b,32,$t3
+	zapnot	@X[$i],0xf,@X[$i]
+
+	addl	$t0,$e,$e
+	addl	$t2,$e,$e
+	or	$t3,$b,$b
+___
+$code.=<<___ if (($i&1) && $i<15);
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	and	$b,$c,$t2
+	zapnot	$a,0xf,$a
+
+	srl	$a,27,$t0
+	addl	@X[$i%16],$e,$e
+	bic	$d,$b,$t3
+	sll	$b,30,$b
+
+	or	$t2,$t3,$t2
+	addl	$t1,$e,$e
+	srl	$b,32,$t3
+	zapnot	@X[$i],0xf,@X[$i]
+
+	addl	$t0,$e,$e
+	addl	$t2,$e,$e
+	or	$t3,$b,$b
+___
+$code.=<<___ if ($i>=15);	# with forward Xupdate
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	and	$b,$c,$t2
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+	zapnot	$a,0xf,$a
+	addl	@X[$i%16],$e,$e
+	bic	$d,$b,$t3
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+	srl	$a,27,$t0
+	addl	$t1,$e,$e
+	or	$t2,$t3,$t2
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+	sll	$b,30,$b
+	addl	$t0,$e,$e
+	srl	@X[$j%16],31,$t1
+
+	addl	$t2,$e,$e
+	srl	$b,32,$t3
+	addl	@X[$j%16],@X[$j%16],@X[$j%16]
+
+	or	$t3,$b,$b
+	zapnot	@X[$i%16],0xf,@X[$i%16]
+	or	$t1,@X[$j%16],@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);	# with forward Xupdate
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	zapnot	$a,0xf,$a
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+	sll	$b,30,$t3
+	addl	$t1,$e,$e
+	xor	$b,$c,$t2
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+	srl	$b,2,$b
+	addl	@X[$i%16],$e,$e
+	xor	$d,$t2,$t2
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+	srl	@X[$j%16],31,$t1
+	addl	$t2,$e,$e
+	srl	$a,27,$t0
+	addl	@X[$j%16],@X[$j%16],@X[$j%16]
+
+	or	$t3,$b,$b
+	addl	$t0,$e,$e
+	or	$t1,@X[$j%16],@X[$j%16]
+___
+$code.=<<___ if ($i<77);
+	zapnot	@X[$i%16],0xf,@X[$i%16]
+___
+$code.=<<___ if ($i==79);	# with context fetch
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	zapnot	$a,0xf,$a
+	ldl	@X[0],0($ctx)
+
+	sll	$b,30,$t3
+	addl	$t1,$e,$e
+	xor	$b,$c,$t2
+	ldl	@X[1],4($ctx)
+
+	srl	$b,2,$b
+	addl	@X[$i%16],$e,$e
+	xor	$d,$t2,$t2
+	ldl	@X[2],8($ctx)
+
+	srl	$a,27,$t0
+	addl	$t2,$e,$e
+	ldl	@X[3],12($ctx)
+
+	or	$t3,$b,$b
+	addl	$t0,$e,$e
+	ldl	@X[4],16($ctx)
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;	# with forward Xupdate
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	zapnot	$a,0xf,$a
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+	srl	$a,27,$t0
+	and	$b,$c,$t2
+	and	$b,$d,$t3
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+	sll	$b,30,$b
+	addl	$t1,$e,$e
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+	srl	@X[$j%16],31,$t1
+	addl	$t0,$e,$e
+	or	$t2,$t3,$t2
+	and	$c,$d,$t3
+
+	or	$t2,$t3,$t2
+	srl	$b,32,$t3
+	addl	@X[$i%16],$e,$e
+	addl	@X[$j%16],@X[$j%16],@X[$j%16]
+
+	or	$t3,$b,$b
+	addl	$t2,$e,$e
+	or	$t1,@X[$j%16],@X[$j%16]
+	zapnot	@X[$i%16],0xf,@X[$i%16]
+___
+}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set	noat
+.set	noreorder
+.globl	sha1_block_data_order
+.align	5
+.ent	sha1_block_data_order
+sha1_block_data_order:
+	lda	sp,-64(sp)
+	stq	ra,0(sp)
+	stq	s0,8(sp)
+	stq	s1,16(sp)
+	stq	s2,24(sp)
+	stq	s3,32(sp)
+	stq	s4,40(sp)
+	stq	s5,48(sp)
+	stq	fp,56(sp)
+	.mask	0x0400fe00,-64
+	.frame	sp,64,ra
+	.prologue 0
+
+	ldl	$A,0($ctx)
+	ldl	$B,4($ctx)
+	sll	$num,6,$num
+	ldl	$C,8($ctx)
+	ldl	$D,12($ctx)
+	ldl	$E,16($ctx)
+	addq	$inp,$num,$num
+
+.Lloop:
+	.set	noreorder
+	ldah	$K,23170(zero)
+	zapnot	$B,0xf,$B
+	lda	$K,31129($K)	# K_00_19
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	ldah	$K,28378(zero)
+	lda	$K,-5215($K)	# K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	ldah	$K,-28900(zero)
+	lda	$K,-17188($K)	# K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	ldah	$K,-13725(zero)
+	lda	$K,-15914($K)	# K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	addl	@X[0],$A,$A
+	addl	@X[1],$B,$B
+	addl	@X[2],$C,$C
+	addl	@X[3],$D,$D
+	addl	@X[4],$E,$E
+	stl	$A,0($ctx)
+	stl	$B,4($ctx)
+	addq	$inp,64,$inp
+	stl	$C,8($ctx)
+	stl	$D,12($ctx)
+	stl	$E,16($ctx)
+	cmpult	$inp,$num,$t1
+	bne	$t1,.Lloop
+
+	.set	noreorder
+	ldq	ra,0(sp)
+	ldq	s0,8(sp)
+	ldq	s1,16(sp)
+	ldq	s2,24(sp)
+	ldq	s3,32(sp)
+	ldq	s4,40(sp)
+	ldq	s5,48(sp)
+	ldq	fp,56(sp)
+	lda	sp,64(sp)
+	ret	(ra)
+.end	sha1_block_data_order
+.ascii	"SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;

diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index 79e3f61..db83c51 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl

@@ -47,6 +47,10 @@
 # Cortex A8 core and in absolute terms ~870 cycles per input block
 # [or 13.6 cycles per byte].
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 10%
+# improvement on Cortex A8 core and 12.2 cycles per byte.
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -76,31 +80,41 @@
 	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
 	ldr	$t3,[$Xi,#2*4]
 	eor	$t0,$t0,$t1
-	eor	$t2,$t2,$t3
+	eor	$t2,$t2,$t3			@ 1 cycle stall
 	eor	$t1,$c,$d			@ F_xx_xx
 	mov	$t0,$t0,ror#31
 	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
 	eor	$t0,$t0,$t2,ror#31
+	str	$t0,[$Xi,#-4]!
 	$opt1					@ F_xx_xx
 	$opt2					@ F_xx_xx
 	add	$e,$e,$t0			@ E+=X[i]
-	str	$t0,[$Xi,#-4]!
 ___
 }
 
 sub BODY_00_15 {
 my ($a,$b,$c,$d,$e)=@_;
 $code.=<<___;
-	ldrb	$t0,[$inp],#4
-	ldrb	$t1,[$inp,#-1]
-	ldrb	$t2,[$inp,#-2]
+#if __ARM_ARCH__<7
+	ldrb	$t1,[$inp,#2]
+	ldrb	$t0,[$inp,#3]
+	ldrb	$t2,[$inp,#1]
 	add	$e,$K,$e,ror#2			@ E+=K_00_19
-	ldrb	$t3,[$inp,#-3]
-	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
-	orr	$t0,$t1,$t0,lsl#24
+	ldrb	$t3,[$inp],#4
+	orr	$t0,$t0,$t1,lsl#8
 	eor	$t1,$c,$d			@ F_xx_xx
-	orr	$t0,$t0,$t2,lsl#8
-	orr	$t0,$t0,$t3,lsl#16
+	orr	$t0,$t0,$t2,lsl#16
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+	orr	$t0,$t0,$t3,lsl#24
+#else
+	ldr	$t0,[$inp],#4			@ handles unaligned
+	add	$e,$K,$e,ror#2			@ E+=K_00_19
+	eor	$t1,$c,$d			@ F_xx_xx
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	$t0,$t0				@ byte swap
+#endif
+#endif
 	and	$t1,$b,$t1,ror#2
 	add	$e,$e,$t0			@ E+=X[i]
 	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
@@ -136,6 +150,8 @@
 }
 
 $code=<<___;
+#include "arm_arch.h"
+
 .text
 
 .global	sha1_block_data_order
@@ -210,10 +226,14 @@
 	teq	$inp,$len
 	bne	.Lloop			@ [+18], total 1307
 
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .align	2
 .LK_00_19:	.word	0x5a827999
 .LK_20_39:	.word	0x6ed9eba1

diff --git a/crypto/sha/asm/sha1-armv4-large.s b/crypto/sha/asm/sha1-armv4-large.s
index 7f687d9..97ad8be 100644
--- a/crypto/sha/asm/sha1-armv4-large.s
+++ b/crypto/sha/asm/sha1-armv4-large.s

@@ -1,3 +1,5 @@
+#include "arm_arch.h"
+
 .text
 
 .global	sha1_block_data_order
@@ -16,76 +18,126 @@
 	mov	r6,r6,ror#30
 	mov	r7,r7,ror#30		@ [6]
 .L_00_15:
-	ldrb	r9,[r1],#4
-	ldrb	r10,[r1,#-1]
-	ldrb	r11,[r1,#-2]
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
 	add	r7,r8,r7,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1,#-3]
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-	orr	r9,r10,r9,lsl#24
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
 	eor	r10,r5,r6			@ F_xx_xx
-	orr	r9,r9,r11,lsl#8
-	orr	r9,r9,r12,lsl#16
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
 	and	r10,r4,r10,ror#2
 	add	r7,r7,r9			@ E+=X[i]
 	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
 	str	r9,[r14,#-4]!
 	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
-	ldrb	r9,[r1],#4
-	ldrb	r10,[r1,#-1]
-	ldrb	r11,[r1,#-2]
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
 	add	r6,r8,r6,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1,#-3]
-	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
-	orr	r9,r10,r9,lsl#24
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
 	eor	r10,r4,r5			@ F_xx_xx
-	orr	r9,r9,r11,lsl#8
-	orr	r9,r9,r12,lsl#16
+	orr	r9,r9,r11,lsl#16
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	eor	r10,r4,r5			@ F_xx_xx
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
 	and	r10,r3,r10,ror#2
 	add	r6,r6,r9			@ E+=X[i]
 	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
 	str	r9,[r14,#-4]!
 	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
-	ldrb	r9,[r1],#4
-	ldrb	r10,[r1,#-1]
-	ldrb	r11,[r1,#-2]
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
 	add	r5,r8,r5,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1,#-3]
-	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
-	orr	r9,r10,r9,lsl#24
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
 	eor	r10,r3,r4			@ F_xx_xx
-	orr	r9,r9,r11,lsl#8
-	orr	r9,r9,r12,lsl#16
+	orr	r9,r9,r11,lsl#16
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	eor	r10,r3,r4			@ F_xx_xx
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
 	and	r10,r7,r10,ror#2
 	add	r5,r5,r9			@ E+=X[i]
 	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
 	str	r9,[r14,#-4]!
 	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
-	ldrb	r9,[r1],#4
-	ldrb	r10,[r1,#-1]
-	ldrb	r11,[r1,#-2]
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
 	add	r4,r8,r4,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1,#-3]
-	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
-	orr	r9,r10,r9,lsl#24
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
 	eor	r10,r7,r3			@ F_xx_xx
-	orr	r9,r9,r11,lsl#8
-	orr	r9,r9,r12,lsl#16
+	orr	r9,r9,r11,lsl#16
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	eor	r10,r7,r3			@ F_xx_xx
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
 	and	r10,r6,r10,ror#2
 	add	r4,r4,r9			@ E+=X[i]
 	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
 	str	r9,[r14,#-4]!
 	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
-	ldrb	r9,[r1],#4
-	ldrb	r10,[r1,#-1]
-	ldrb	r11,[r1,#-2]
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
 	add	r3,r8,r3,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1,#-3]
-	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
-	orr	r9,r10,r9,lsl#24
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
 	eor	r10,r6,r7			@ F_xx_xx
-	orr	r9,r9,r11,lsl#8
-	orr	r9,r9,r12,lsl#16
+	orr	r9,r9,r11,lsl#16
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	eor	r10,r6,r7			@ F_xx_xx
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
 	and	r10,r5,r10,ror#2
 	add	r3,r3,r9			@ E+=X[i]
 	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
@@ -94,16 +146,26 @@
 	teq	r14,sp
 	bne	.L_00_15		@ [((11+4)*5+2)*3]
 	sub	sp,sp,#5*4
-	ldrb	r9,[r1],#4
-	ldrb	r10,[r1,#-1]
-	ldrb	r11,[r1,#-2]
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
 	add	r7,r8,r7,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1,#-3]
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-	orr	r9,r10,r9,lsl#24
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
 	eor	r10,r5,r6			@ F_xx_xx
-	orr	r9,r9,r11,lsl#8
-	orr	r9,r9,r12,lsl#16
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
 	and	r10,r4,r10,ror#2
 	add	r7,r7,r9			@ E+=X[i]
 	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
@@ -115,15 +177,15 @@
 	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r4,r5			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r3,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r6,r6,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
 	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
 	ldr	r9,[r14,#15*4]
@@ -132,15 +194,15 @@
 	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r3,r4			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r7,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r5,r5,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
 	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
 	ldr	r9,[r14,#15*4]
@@ -149,15 +211,15 @@
 	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r7,r3			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r6,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r4,r4,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
 	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
 	ldr	r9,[r14,#15*4]
@@ -166,15 +228,15 @@
 	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r6,r7			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r5,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r3,r3,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
 	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
 
@@ -188,15 +250,15 @@
 	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r5,r6			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	eor r10,r4,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r7,r7,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
 	ldr	r9,[r14,#15*4]
 	ldr	r10,[r14,#13*4]
@@ -204,15 +266,15 @@
 	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r4,r5			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	eor r10,r3,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r6,r6,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
 	ldr	r9,[r14,#15*4]
 	ldr	r10,[r14,#13*4]
@@ -220,15 +282,15 @@
 	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r3,r4			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	eor r10,r7,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r5,r5,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
 	ldr	r9,[r14,#15*4]
 	ldr	r10,[r14,#13*4]
@@ -236,15 +298,15 @@
 	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r7,r3			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	eor r10,r6,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r4,r4,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
 	ldr	r9,[r14,#15*4]
 	ldr	r10,[r14,#13*4]
@@ -252,15 +314,15 @@
 	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r6,r7			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	eor r10,r5,r10,ror#2					@ F_xx_xx
 						@ F_xx_xx
 	add	r3,r3,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
 	teq	r14,sp			@ preserve carry
 	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
@@ -275,15 +337,15 @@
 	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r5,r6			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r4,r10,ror#2					@ F_xx_xx
 	and r11,r5,r6					@ F_xx_xx
 	add	r7,r7,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
 	add	r7,r7,r11,ror#2
 	ldr	r9,[r14,#15*4]
@@ -292,15 +354,15 @@
 	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r4,r5			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r3,r10,ror#2					@ F_xx_xx
 	and r11,r4,r5					@ F_xx_xx
 	add	r6,r6,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
 	add	r6,r6,r11,ror#2
 	ldr	r9,[r14,#15*4]
@@ -309,15 +371,15 @@
 	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r3,r4			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r7,r10,ror#2					@ F_xx_xx
 	and r11,r3,r4					@ F_xx_xx
 	add	r5,r5,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
 	add	r5,r5,r11,ror#2
 	ldr	r9,[r14,#15*4]
@@ -326,15 +388,15 @@
 	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r7,r3			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r6,r10,ror#2					@ F_xx_xx
 	and r11,r7,r3					@ F_xx_xx
 	add	r4,r4,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
 	add	r4,r4,r11,ror#2
 	ldr	r9,[r14,#15*4]
@@ -343,15 +405,15 @@
 	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
 	ldr	r12,[r14,#2*4]
 	eor	r9,r9,r10
-	eor	r11,r11,r12
+	eor	r11,r11,r12			@ 1 cycle stall
 	eor	r10,r6,r7			@ F_xx_xx
 	mov	r9,r9,ror#31
 	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
 	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
 	and r10,r5,r10,ror#2					@ F_xx_xx
 	and r11,r6,r7					@ F_xx_xx
 	add	r3,r3,r9			@ E+=X[i]
-	str	r9,[r14,#-4]!
 	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
 	add	r3,r3,r11,ror#2
 	teq	r14,sp
@@ -373,10 +435,14 @@
 	teq	r1,r2
 	bne	.Lloop			@ [+18], total 1307
 
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
 .align	2
 .LK_00_19:	.word	0x5a827999
 .LK_20_39:	.word	0x6ed9eba1

diff --git a/crypto/sha/asm/sha1-ia64.pl b/crypto/sha/asm/sha1-ia64.pl
index 51c4f47..db28f08 100644
--- a/crypto/sha/asm/sha1-ia64.pl
+++ b/crypto/sha/asm/sha1-ia64.pl

@@ -15,7 +15,7 @@
 # is >50% better than HP C and >2x better than gcc.
 
 $code=<<___;
-.ident  \"sha1-ia64.s, version 1.2\"
+.ident  \"sha1-ia64.s, version 1.3\"
 .ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
 .explicit
 
@@ -26,14 +26,10 @@
     $ADDP="addp4";
     for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
 } else { $ADDP="add"; }
-for (@ARGV) {	$big_endian=1 if (/\-DB_ENDIAN/);
-		$big_endian=0 if (/\-DL_ENDIAN/);   }
-if (!defined($big_endian))
-	    {	$big_endian=(unpack('L',pack('N',1))==1);   }
 
 #$human=1;
 if ($human) {	# useful for visual code auditing...
-	($A,$B,$C,$D,$E,$T)   = ("A","B","C","D","E","T");
+	($A,$B,$C,$D,$E)   = ("A","B","C","D","E");
 	($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
 	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
 	    (	"K_00_19","K_20_39","K_40_59","K_60_79"	);
@@ -41,47 +37,50 @@
 		"X8", "X9","X10","X11","X12","X13","X14","X15"	);
 }
 else {
-	($A,$B,$C,$D,$E,$T)   = ("loc0","loc1","loc2","loc3","loc4","loc5");
-	($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
+	($A,$B,$C,$D,$E)   =    ("loc0","loc1","loc2","loc3","loc4");
+	($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
 	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
-	    (	"r14", "r15", "loc11", "loc12"	);
+	    (	"r14", "r15", "loc10", "loc11"	);
 	@X= (	"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
 		"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"	);
 }
 
 sub BODY_00_15 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f)=@_;
+my	($i,$a,$b,$c,$d,$e)=@_;
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 $code.=<<___ if ($i==0);
-{ .mmi;	ld1	$X[$i&0xf]=[inp],2	    // MSB
+{ .mmi;	ld1	$X[$i]=[inp],2		    // MSB
 	ld1	tmp2=[tmp3],2		};;
 { .mmi;	ld1	tmp0=[inp],2
 	ld1	tmp4=[tmp3],2		    // LSB
-	dep	$X[$i&0xf]=$X[$i&0xf],tmp2,8,8	};;
+	dep	$X[$i]=$X[$i],tmp2,8,8	};;
 ___
 if ($i<15) {
 	$code.=<<___;
-{ .mmi;	ld1	$X[($i+1)&0xf]=[inp],2	    // +1
+{ .mmi;	ld1	$Xn=[inp],2		    // forward Xload
+	nop.m	0x0
 	dep	tmp1=tmp0,tmp4,8,8	};;
-{ .mmi;	ld1	tmp2=[tmp3],2		    // +1
+{ .mmi;	ld1	tmp2=[tmp3],2		    // forward Xload
 	and	tmp4=$c,$b
-	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
-{ .mmi;	andcm	tmp1=$d,$b
-	add	tmp0=$e,$K_00_19
+	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
+	andcm	tmp1=$d,$b
 	dep.z	tmp5=$a,5,27		};; // a<<5
-{ .mmi;	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
-	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
+{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xload
+	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
 	extr.u	tmp1=$a,27,5		};; // a>>27
-{ .mmi;	ld1	tmp0=[inp],2		    // +1
-	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
+{ .mmi;	ld1	tmp0=[inp],2		    // forward Xload
+	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
-{ .mmi;	ld1	tmp4=[tmp3],2		    // +1
+{ .mmi;	ld1	tmp4=[tmp3],2		    // forward Xload
 	or	tmp5=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp5		    // f+=ROTATE(a,5)
-	dep	$X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8	// +1
-	mux2	$X[$i&0xf]=$X[$i&0xf],0x44	} //;;
+{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)
+	dep	$Xn=$Xn,tmp2,8,8	    // forward Xload
+	mux2	$X[$i]=$X[$i],0x44	} //;;
 
 ___
 	}
@@ -89,24 +88,24 @@
 	$code.=<<___;
 { .mii;	and	tmp3=$c,$b
 	dep	tmp1=tmp0,tmp4,8,8;;
-	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
-{ .mmi;	andcm	tmp1=$d,$b
-	add	tmp0=$e,$K_00_19
+	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
+	andcm	tmp1=$d,$b
 	dep.z	tmp5=$a,5,27		};; // a<<5
-{ .mmi;	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
-	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
+{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xupdate
+	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
+	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
 	nop.i	0			};;
-{ .mmi;	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mmi;	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
+	xor	$Xn=$Xn,tmp3		    // forward Xupdate
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
 { .mmi; or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
-	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
-	mux2	$X[$i&0xf]=$X[$i&0xf],0x44  };;
+{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+	mux2	$X[$i]=$X[$i],0x44	};;
 
 ___
 	}
@@ -114,27 +113,28 @@
 
 sub BODY_16_19 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f)=@_;
+my	($i,$a,$b,$c,$d,$e)=@_;
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 $code.=<<___;
-{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
-	and	tmp0=$c,$b
+{ .mib;	add	$e=$e,$K_00_19		    // e+=K_00_19
 	dep.z	tmp5=$a,5,27		}   // a<<5
-{ .mmi;	andcm	tmp1=$d,$b
-	add	tmp4=$e,$K_00_19	};;
-{ .mmi;	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
-	add	$f=$f,tmp4		    // f+=e+K_00_19
+{ .mib;	andcm	tmp1=$d,$b
+	and	tmp0=$c,$b		};;
+{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
+	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
+{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
+	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16]	// forward Xupdate
 	nop.i	0			};;
-{ .mmi;	add	$f=$f,tmp0		    // f+=F_00_19(b,c,d)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mmi;	add	$e=$e,tmp0		    // f+=F_00_19(b,c,d)
+	xor	$Xn=$Xn,tmp3		    // forward Xupdate
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
 { .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
-	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
 	nop.i	0			};;
 
 ___
@@ -142,49 +142,47 @@
 
 sub BODY_20_39 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
+my	($i,$a,$b,$c,$d,$e,$Konst)=@_;
 	$Konst = $K_20_39 if (!defined($Konst));
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 if ($i<79) {
 $code.=<<___;
-{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
+{ .mib;	add	$e=$e,$Konst		    // e+=K_XX_XX
 	dep.z	tmp5=$a,5,27		}   // a<<5
 { .mib;	xor	tmp0=$c,$b
-	add	tmp4=$e,$Konst		};;
-{ .mmi;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
-	add	$f=$f,tmp4		    // f+=e+K_20_39
+	xor	$Xn=$Xn,$X[($j+2)%16]	};; // forward Xupdate
+{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
-	nop.i	0			};;
-{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
+	xor	$Xn=$Xn,$X[($j+8)%16]	};; // forward Xupdate
+{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
+	xor	$Xn=$Xn,$X[($j+13)%16]	    // forward Xupdate
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
 { .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
-	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
 	nop.i	0			};;
 
 ___
 }
 else {
 $code.=<<___;
-{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
+{ .mib;	add	$e=$e,$Konst		    // e+=K_60_79
 	dep.z	tmp5=$a,5,27		}   // a<<5
 { .mib;	xor	tmp0=$c,$b
-	add	tmp4=$e,$Konst		};;
-{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
-	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mib;	add	$f=$f,tmp4		    // f+=e+K_20_39
 	add	$h1=$h1,$a		};; // wrap up
-{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
-	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30) ;;?
-{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
+{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
+	extr.u	tmp1=$a,27,5		}   // a>>27
+{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
 	add	$h3=$h3,$c		};; // wrap up
-{ .mib;	add	tmp3=1,inp		    // used in unaligned codepath
-	add	$f=$f,tmp1		}   // f+=ROTATE(a,5)
-{ .mib;	add	$h2=$h2,$b		    // wrap up
+{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
+	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
+	shrp	$b=tmp6,tmp6,2		};; // b=ROTATE(b,30) ;;?
+{ .mmi;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	add	tmp3=1,inp		    // used in unaligned codepath
 	add	$h4=$h4,$d		};; // wrap up
 
 ___
@@ -193,29 +191,29 @@
 
 sub BODY_40_59 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f)=@_;
+my	($i,$a,$b,$c,$d,$e)=@_;
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 $code.=<<___;
-{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
-	and	tmp0=$c,$b
+{ .mib;	add	$e=$e,$K_40_59		    // e+=K_40_59
 	dep.z	tmp5=$a,5,27		}   // a<<5
-{ .mmi;	and	tmp1=$d,$b
-	add	tmp4=$e,$K_40_59	};;
-{ .mmi;	or	tmp0=tmp0,tmp1		    // (b&c)|(b&d)
-	add	$f=$f,tmp4		    // f+=e+K_40_59
+{ .mib;	and	tmp1=$c,$d
+	xor	tmp0=$c,$d		};;
+{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
+	add	tmp5=tmp5,tmp1		    // a<<5+(c&d)
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	and	tmp4=$c,$d
-	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
-	};;
-{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mmi;	and	tmp0=tmp0,$b
+	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
+	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] };;	// forward Xupdate
+{ .mmi;	add	$e=$e,tmp0		    // e+=b&(c^d)
+	add	tmp5=tmp5,tmp1		    // ROTATE(a,5)+(c&d)
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
-{ .mmi;	or	tmp0=tmp0,tmp4		    // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
+{ .mmi;	xor	$Xn=$Xn,tmp3
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp0		    // f+=F_40_59(b,c,d)
-	shrp	$e=tmp2,tmp2,31;;	    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
-	add	$f=$f,tmp1		};; // f+=ROTATE(a,5)
+{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)+(c&d)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+	nop.i	0x0			};;
 
 ___
 }
@@ -237,7 +235,7 @@
 .align	32
 sha1_block_data_order:
 	.prologue
-{ .mmi;	alloc	tmp1=ar.pfs,3,15,0,0
+{ .mmi;	alloc	tmp1=ar.pfs,3,14,0,0
 	$ADDP	tmp0=4,ctx
 	.save	ar.lc,r3
 	mov	r3=ar.lc		}
@@ -245,8 +243,8 @@
 	$ADDP	inp=0,inp
 	mov	r2=pr			};;
 tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
+tmp5=loc12;
+tmp6=loc13;
 	.body
 { .mlx;	ld4	$h0=[ctx],8
 	movl	$K_00_19=0x5a827999	}
@@ -273,7 +271,7 @@
 
 ___
 
-{ my $i,@V=($A,$B,$C,$D,$E,$T);
+{ my $i,@V=($A,$B,$C,$D,$E);
 
 	for($i=0;$i<16;$i++)	{ &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
 	for(;$i<20;$i++)	{ &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
@@ -281,12 +279,12 @@
 	for(;$i<60;$i++)	{ &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
 	for(;$i<80;$i++)	{ &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
 
-	(($V[5] eq $D) and ($V[0] eq $E)) or die;	# double-check
+	(($V[0] eq $A) and ($V[4] eq $E)) or die;	# double-check
 }
 
 $code.=<<___;
-{ .mmb;	add	$h0=$h0,$E
-	nop.m	0
+{ .mmb;	add	$h0=$h0,$A
+	add	$h2=$h2,$C
 	br.ctop.dptk.many	.Ldtop	};;
 .Ldend:
 { .mmi;	add	tmp0=4,ctx

diff --git a/crypto/sha/asm/sha1-parisc.pl b/crypto/sha/asm/sha1-parisc.pl
new file mode 100644
index 0000000..6d7bf49
--- /dev/null
+++ b/crypto/sha/asm/sha1-parisc.pl

@@ -0,0 +1,259 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for PA-RISC.
+
+# June 2009.
+#
+# On PA-7100LC performance is >30% better than gcc 3.2 generated code
+# for aligned input and >50% better for unaligned. Compared to vendor
+# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
+# few percent faster in 32-bit one (this for aligned input, data for
+# unaligned input is not available).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
+				#                 [+ argument transfer]
+$ctx="%r26";		# arg0
+$inp="%r25";		# arg1
+$num="%r24";		# arg2
+
+$t0="%r28";
+$t1="%r29";
+$K="%r31";
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
+
+@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<15);
+	addl	$K,$e,$e	; $i
+	shd	$a,$a,27,$t1
+	addl	@X[$i],$e,$e
+	and	$c,$b,$t0
+	addl	$t1,$e,$e
+	andcm	$d,$b,$t1
+	shd	$b,$b,2,$b
+	or	$t1,$t0,$t0
+	addl	$t0,$e,$e
+___
+$code.=<<___ if ($i>=15);	# with forward Xupdate
+	addl	$K,$e,$e	; $i
+	shd	$a,$a,27,$t1
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+	addl	@X[$i%16],$e,$e
+	and	$c,$b,$t0
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+	addl	$t1,$e,$e
+	andcm	$d,$b,$t1
+	shd	$b,$b,2,$b
+	or	$t1,$t0,$t0
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+	add	$t0,$e,$e
+	shd	@X[$j%16],@X[$j%16],31,@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]	; $i
+	addl	$K,$e,$e
+	shd	$a,$a,27,$t1
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+	addl	@X[$i%16],$e,$e
+	xor	$b,$c,$t0
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+	addl	$t1,$e,$e
+	shd	$b,$b,2,$b
+	xor	$d,$t0,$t0
+	shd	@X[$j%16],@X[$j%16],31,@X[$j%16]
+	addl	$t0,$e,$e
+___
+$code.=<<___ if ($i==79);	# with context load
+	ldw	0($ctx),@X[0]	; $i
+	addl	$K,$e,$e
+	shd	$a,$a,27,$t1
+	ldw	4($ctx),@X[1]
+	addl	@X[$i%16],$e,$e
+	xor	$b,$c,$t0
+	ldw	8($ctx),@X[2]
+	addl	$t1,$e,$e
+	shd	$b,$b,2,$b
+	xor	$d,$t0,$t0
+	ldw	12($ctx),@X[3]
+	addl	$t0,$e,$e
+	ldw	16($ctx),@X[4]
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+	shd	$a,$a,27,$t1	; $i
+	addl	$K,$e,$e
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+	xor	$d,$c,$t0
+	addl	@X[$i%16],$e,$e
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+	and	$b,$t0,$t0
+	addl	$t1,$e,$e
+	shd	$b,$b,2,$b
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+	addl	$t0,$e,$e
+	and	$d,$c,$t1
+	shd	@X[$j%16],@X[$j%16],31,@X[$j%16]
+	addl	$t1,$e,$e
+___
+}
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+sha1_block_data_order
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+
+	ldw	0($ctx),$A
+	ldw	4($ctx),$B
+	ldw	8($ctx),$C
+	ldw	12($ctx),$D
+	ldw	16($ctx),$E
+
+	extru	$inp,31,2,$t0		; t0=inp&3;
+	sh3addl	$t0,%r0,$t0		; t0*=8;
+	subi	32,$t0,$t0		; t0=32-t0;
+	mtctl	$t0,%cr11		; %sar=t0;
+
+L\$oop
+	ldi	3,$t0
+	andcm	$inp,$t0,$t0		; 64-bit neutral
+___
+	for ($i=0;$i<15;$i++) {		# load input block
+	$code.="\tldw	`4*$i`($t0),@X[$i]\n";		}
+$code.=<<___;
+	cmpb,*=	$inp,$t0,L\$aligned
+	ldw	60($t0),@X[15]
+	ldw	64($t0),@X[16]
+___
+	for ($i=0;$i<16;$i++) {		# align input
+	$code.="\tvshd	@X[$i],@X[$i+1],@X[$i]\n";	}
+$code.=<<___;
+L\$aligned
+	ldil	L'0x5a827000,$K		; K_00_19
+	ldo	0x999($K),$K
+___
+for ($i=0;$i<20;$i++)   { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	ldil	L'0x6ed9e000,$K		; K_20_39
+	ldo	0xba1($K),$K
+___
+
+for (;$i<40;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	ldil	L'0x8f1bb000,$K		; K_40_59
+	ldo	0xcdc($K),$K
+___
+
+for (;$i<60;$i++)       { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	ldil	L'0xca62c000,$K		; K_60_79
+	ldo	0x1d6($K),$K
+___
+for (;$i<80;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	addl	@X[0],$A,$A
+	addl	@X[1],$B,$B
+	addl	@X[2],$C,$C
+	addl	@X[3],$D,$D
+	addl	@X[4],$E,$E
+	stw	$A,0($ctx)
+	stw	$B,4($ctx)
+	stw	$C,8($ctx)
+	stw	$D,12($ctx)
+	stw	$E,16($ctx)
+	addib,*<> -1,$num,L\$oop
+	ldo	64($inp),$inp
+
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+print $code;
+close STDOUT;

diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl
index dcd0fcd..2140dd2 100755
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl

@@ -24,12 +24,14 @@
 
 if ($flavour =~ /64/) {
 	$SIZE_T	=8;
+	$LRSAVE	=2*$SIZE_T;
 	$UCMP	="cmpld";
 	$STU	="stdu";
 	$POP	="ld";
 	$PUSH	="std";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T	=4;
+	$LRSAVE	=$SIZE_T;
 	$UCMP	="cmplw";
 	$STU	="stwu";
 	$POP	="lwz";
@@ -43,7 +45,8 @@
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
 
 $K  ="r0";
 $sp ="r1";
@@ -162,9 +165,8 @@
 .globl	.sha1_block_data_order
 .align	4
 .sha1_block_data_order:
+	$STU	$sp,-$FRAME($sp)
 	mflr	r0
-	$STU	$sp,`-($FRAME+64)`($sp)
-	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
 	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
 	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
 	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
 	lwz	$A,0($ctx)
 	lwz	$B,4($ctx)
 	lwz	$C,8($ctx)
@@ -192,8 +195,53 @@
 Laligned:
 	mtctr	$num
 	bl	Lsha1_block_private
+	b	Ldone
+
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for 64-byte input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
+.align	4
+Lunaligned:
+	subfic	$t1,$inp,4096
+	andi.	$t1,$t1,4095	; distance to closest page boundary
+	srwi.	$t1,$t1,6	; t1/=64
+	beq	Lcross_page
+	$UCMP	$num,$t1
+	ble-	Laligned	; didn't cross the page boundary
+	mtctr	$t1
+	subfc	$num,$t1,$num
+	bl	Lsha1_block_private
+Lcross_page:
+	li	$t1,16
+	mtctr	$t1
+	addi	r20,$sp,$LOCALS	; spot within the frame
+Lmemcpy:
+	lbz	r16,0($inp)
+	lbz	r17,1($inp)
+	lbz	r18,2($inp)
+	lbz	r19,3($inp)
+	addi	$inp,$inp,4
+	stb	r16,0(r20)
+	stb	r17,1(r20)
+	stb	r18,2(r20)
+	stb	r19,3(r20)
+	addi	r20,r20,4
+	bdnz	Lmemcpy
+
+	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
+	li	$t1,1
+	addi	$inp,$sp,$LOCALS
+	mtctr	$t1
+	bl	Lsha1_block_private
+	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
+	addic.	$num,$num,-1
+	bne-	Lunaligned
+
 Ldone:
-	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
 	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
 	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
 	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
@@ -212,54 +260,11 @@
 	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
 	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
 	mtlr	r0
-	addi	$sp,$sp,`$FRAME+64`
+	addi	$sp,$sp,$FRAME
 	blr
-___
-
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
-.align	4
-Lunaligned:
-	subfic	$t1,$inp,4096
-	andi.	$t1,$t1,4095	; distance to closest page boundary
-	srwi.	$t1,$t1,6	; t1/=64
-	beq	Lcross_page
-	$UCMP	$num,$t1
-	ble-	Laligned	; didn't cross the page boundary
-	mtctr	$t1
-	subfc	$num,$t1,$num
-	bl	Lsha1_block_private
-Lcross_page:
-	li	$t1,16
-	mtctr	$t1
-	addi	r20,$sp,$FRAME	; spot below the frame
-Lmemcpy:
-	lbz	r16,0($inp)
-	lbz	r17,1($inp)
-	lbz	r18,2($inp)
-	lbz	r19,3($inp)
-	addi	$inp,$inp,4
-	stb	r16,0(r20)
-	stb	r17,1(r20)
-	stb	r18,2(r20)
-	stb	r19,3(r20)
-	addi	r20,r20,4
-	bdnz	Lmemcpy
-
-	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
-	li	$t1,1
-	addi	$inp,$sp,$FRAME
-	mtctr	$t1
-	bl	Lsha1_block_private
-	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
-	addic.	$num,$num,-1
-	bne-	Lunaligned
-	b	Ldone
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
 ___
 
 # This is private block function, which uses tailored calling
@@ -309,6 +314,8 @@
 	addi	$inp,$inp,`16*4`
 	bdnz-	Lsha1_block_private
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 ___
 $code.=<<___;
 .asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"

diff --git a/crypto/sha/asm/sha1-s390x.pl b/crypto/sha/asm/sha1-s390x.pl
index 4b17848..9193dda 100644
--- a/crypto/sha/asm/sha1-s390x.pl
+++ b/crypto/sha/asm/sha1-s390x.pl

@@ -21,9 +21,28 @@
 # instructions to favour dual-issue z10 pipeline. On z10 hardware is
 # "only" ~2.3x faster than software.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific.
+
 $kimdfunc=1;	# magic function code for kimd instruction
 
-$output=shift;
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $K_00_39="%r0"; $K=$K_00_39;
@@ -42,13 +61,14 @@
 @X=("%r12","%r13","%r14");
 $sp="%r15";
 
-$frame=160+16*4;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*4;
 
 sub Xupdate {
 my $i=shift;
 
 $code.=<<___ if ($i==15);
-	lg	$prefetch,160($sp)	### Xupdate(16) warm-up
+	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
 	lr	$X[0],$X[2]
 ___
 return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
@@ -58,8 +78,8 @@
 ___
 $code.=<<___ if ($i>=16);
 	xgr	$X[0],$prefetch		### Xupdate($i)
-	lg	$prefetch,`160+4*(($i+2)%16)`($sp)
-	xg	$X[0],`160+4*(($i+8)%16)`($sp)
+	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
+	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
 	xgr	$X[0],$prefetch
 	rll	$X[0],$X[0],1
 	rllg	$X[1],$X[0],32
@@ -68,7 +88,7 @@
 	lr	$X[2],$X[1]		# feedback
 ___
 $code.=<<___ if ($i<=70);
-	stg	$X[0],`160+4*($i%16)`($sp)
+	stg	$X[0],`$stdframe+4*($i%16)`($sp)
 ___
 unshift(@X,pop(@X));
 }
@@ -148,9 +168,9 @@
 	tmhl	%r0,0x4000	# check for message-security assist
 	jz	.Lsoftware
 	lghi	%r0,0
-	la	%r1,16($sp)
+	la	%r1,`2*$SIZE_T`($sp)
 	.long	0xb93e0002	# kimd %r0,%r2
-	lg	%r0,16($sp)
+	lg	%r0,`2*$SIZE_T`($sp)
 	tmhh	%r0,`0x8000>>$kimdfunc`
 	jz	.Lsoftware
 	lghi	%r0,$kimdfunc
@@ -165,11 +185,11 @@
 ___
 $code.=<<___;
 	lghi	%r1,-$frame
-	stg	$ctx,16($sp)
-	stmg	%r6,%r15,48($sp)
+	st${g}	$ctx,`2*$SIZE_T`($sp)
+	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
 	lgr	%r0,$sp
 	la	$sp,0(%r1,$sp)
-	stg	%r0,0($sp)
+	st${g}	%r0,0($sp)
 
 	larl	$t0,Ktable
 	llgf	$A,0($ctx)
@@ -199,7 +219,7 @@
 for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
 
-	lg	$ctx,`$frame+16`($sp)
+	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
 	la	$inp,64($inp)
 	al	$A,0($ctx)
 	al	$B,4($ctx)
@@ -211,13 +231,13 @@
 	st	$C,8($ctx)
 	st	$D,12($ctx)
 	st	$E,16($ctx)
-	brct	$len,.Lloop
+	brct${g} $len,.Lloop
 
-	lmg	%r6,%r15,`$frame+48`($sp)
+	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
 	br	%r14
 .size	sha1_block_data_order,.-sha1_block_data_order
 .string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm	OPENSSL_s390xcap_P,8,8
+.comm	OPENSSL_s390xcap_P,16,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index 4edc5ea..f27c1e3 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl

@@ -16,7 +16,7 @@
 # There was suggestion to mechanically translate 32-bit code, but I
 # dismissed it, reasoning that x86_64 offers enough register bank
 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
-# implementation:-) However! While 64-bit code does performs better
+# implementation:-) However! While 64-bit code does perform better
 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
 # x86_64 does offer larger *addressable* bank, but out-of-order core
 # reaches for even more registers through dynamic aliasing, and EM64T
@@ -29,6 +29,38 @@
 # Xeon P4	+65%		+0%		9.9
 # Core2		+60%		+10%		7.0
 
+# August 2009.
+#
+# The code was revised to minimize code size and to maximize
+# "distance" between instructions producing input to 'lea'
+# instruction and the 'lea' instruction itself, which is essential
+# for Intel Atom core.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
+# for background and implementation details. The only difference from
+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
+# to free temporary registers.
+
+# April 2011.
+#
+# Add AVX code path. See sha1-586.pl for further information.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+#		x86_64		SSSE3		AVX
+# P4		9.8		-
+# Opteron	6.6		-
+# Core2		6.7		6.1/+10%	-
+# Atom		11.0		9.7/+13%	-
+# Westmere	7.1		5.6/+27%	-
+# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -40,6 +72,16 @@
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+	   $1>=10);
+
 open STDOUT,"| $^X $xlate $flavour $output";
 
 $ctx="%rdi";	# 1st arg
@@ -51,28 +93,154 @@
 $inp="%r9";
 $num="%r10";
 
-$xi="%eax";
-$t0="%ebx";
-$t1="%ecx";
-$A="%edx";
-$B="%esi";
-$C="%edi";
-$D="%ebp";
-$E="%r11d";
-$T="%r12d";
+$t0="%eax";
+$t1="%ebx";
+$t2="%ecx";
+@xi=("%edx","%ebp");
+$A="%esi";
+$B="%edi";
+$C="%r11d";
+$D="%r12d";
+$E="%r13d";
 
-@V=($A,$B,$C,$D,$E,$T);
+@V=($A,$B,$C,$D,$E);
 
-sub PROLOGUE {
-my $func=shift;
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+	mov	`4*$i`($inp),$xi[0]
+	bswap	$xi[0]
+	mov	$xi[0],`4*$i`(%rsp)
+___
+$code.=<<___ if ($i<15);
+	mov	$c,$t0
+	mov	`4*$j`($inp),$xi[1]
+	mov	$a,$t2
+	xor	$d,$t0
+	bswap	$xi[1]
+	rol	\$5,$t2
+	lea	0x5a827999($xi[0],$e),$e
+	and	$b,$t0
+	mov	$xi[1],`4*$j`(%rsp)
+	add	$t2,$e
+	xor	$d,$t0
+	rol	\$30,$b
+	add	$t0,$e
+___
+$code.=<<___ if ($i>=15);
+	mov	`4*($j%16)`(%rsp),$xi[1]
+	mov	$c,$t0
+	mov	$a,$t2
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+	xor	$d,$t0
+	rol	\$5,$t2
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+	and	$b,$t0
+	lea	0x5a827999($xi[0],$e),$e
+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
+	xor	$d,$t0
+	rol	\$1,$xi[1]
+	add	$t2,$e
+	rol	\$30,$b
+	mov	$xi[1],`4*($j%16)`(%rsp)
+	add	$t0,$e
+___
+unshift(@xi,pop(@xi));
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
+$code.=<<___ if ($i<79);
+	mov	`4*($j%16)`(%rsp),$xi[1]
+	mov	$c,$t0
+	mov	$a,$t2
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+	xor	$b,$t0
+	rol	\$5,$t2
+	lea	$K($xi[0],$e),$e
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+	xor	$d,$t0
+	add	$t2,$e
+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
+	rol	\$30,$b
+	add	$t0,$e
+	rol	\$1,$xi[1]
+___
+$code.=<<___ if ($i<76);
+	mov	$xi[1],`4*($j%16)`(%rsp)
+___
+$code.=<<___ if ($i==79);
+	mov	$c,$t0
+	mov	$a,$t2
+	xor	$b,$t0
+	lea	$K($xi[0],$e),$e
+	rol	\$5,$t2
+	xor	$d,$t0
+	add	$t2,$e
+	rol	\$30,$b
+	add	$t0,$e
+___
+unshift(@xi,pop(@xi));
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
 $code.=<<___;
-.globl	$func
-.type	$func,\@function,3
+	mov	`4*($j%16)`(%rsp),$xi[1]
+	mov	$c,$t0
+	mov	$c,$t1
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+	and	$d,$t0
+	mov	$a,$t2
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+	xor	$d,$t1
+	lea	0x8f1bbcdc($xi[0],$e),$e
+	rol	\$5,$t2
+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
+	add	$t0,$e
+	and	$b,$t1
+	rol	\$1,$xi[1]
+	add	$t1,$e
+	rol	\$30,$b
+	mov	$xi[1],`4*($j%16)`(%rsp)
+	add	$t2,$e
+___
+unshift(@xi,pop(@xi));
+}
+
+$code.=<<___;
+.text
+.extern	OPENSSL_ia32cap_P
+
+.globl	sha1_block_data_order
+.type	sha1_block_data_order,\@function,3
 .align	16
-$func:
+sha1_block_data_order:
+	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
+	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
+	test	\$`1<<9`,%r8d		# check SSSE3 bit
+	jz	.Lialu
+___
+$code.=<<___ if ($avx);
+	and	\$`1<<28`,%r8d		# mask AVX bit
+	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
+	or	%r9d,%r8d
+	cmp	\$`1<<28|1<<30`,%r8d
+	je	_avx_shortcut
+___
+$code.=<<___;
+	jmp	_ssse3_shortcut
+
+.align	16
+.Lialu:
 	push	%rbx
 	push	%rbp
 	push	%r12
+	push	%r13
 	mov	%rsp,%r11
 	mov	%rdi,$ctx	# reassigned argument
 	sub	\$`8+16*4`,%rsp
@@ -87,160 +255,832 @@
 	mov	8($ctx),$C
 	mov	12($ctx),$D
 	mov	16($ctx),$E
-___
-}
+	jmp	.Lloop
 
-sub EPILOGUE {
-my $func=shift;
-$code.=<<___;
-	mov	`16*4`(%rsp),%rsi
-	mov	(%rsi),%r12
-	mov	8(%rsi),%rbp
-	mov	16(%rsi),%rbx
-	lea	24(%rsi),%rsp
-.Lepilogue:
-	ret
-.size	$func,.-$func
+.align	16
+.Lloop:
 ___
-}
-
-sub BODY_00_19 {
-my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
-my $j=$i+1;
-$code.=<<___ if ($i==0);
-	mov	`4*$i`($inp),$xi	
-	`"bswap	$xi"	if(!defined($host))`
-	mov	$xi,`4*$i`(%rsp)
-___
-$code.=<<___ if ($i<15);
-	lea	0x5a827999($xi,$e),$f
-	mov	$c,$t0
-	mov	`4*$j`($inp),$xi
-	mov	$a,$e
-	xor	$d,$t0
-	`"bswap	$xi"	if(!defined($host))`	
-	rol	\$5,$e
-	and	$b,$t0
-	mov	$xi,`4*$j`(%rsp)
-	add	$e,$f
-	xor	$d,$t0
-	rol	\$30,$b
-	add	$t0,$f
-___
-$code.=<<___ if ($i>=15);
-	lea	0x5a827999($xi,$e),$f
-	mov	`4*($j%16)`(%rsp),$xi
-	mov	$c,$t0
-	mov	$a,$e
-	xor	`4*(($j+2)%16)`(%rsp),$xi
-	xor	$d,$t0
-	rol	\$5,$e
-	xor	`4*(($j+8)%16)`(%rsp),$xi
-	and	$b,$t0
-	add	$e,$f
-	xor	`4*(($j+13)%16)`(%rsp),$xi
-	xor	$d,$t0
-	rol	\$30,$b
-	add	$t0,$f
-	rol	\$1,$xi
-	mov	$xi,`4*($j%16)`(%rsp)
-___
-}
-
-sub BODY_20_39 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
-my $j=$i+1;
-my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
-$code.=<<___ if ($i<79);
-	lea	$K($xi,$e),$f
-	mov	`4*($j%16)`(%rsp),$xi
-	mov	$c,$t0
-	mov	$a,$e
-	xor	`4*(($j+2)%16)`(%rsp),$xi
-	xor	$b,$t0
-	rol	\$5,$e
-	xor	`4*(($j+8)%16)`(%rsp),$xi
-	xor	$d,$t0
-	add	$e,$f
-	xor	`4*(($j+13)%16)`(%rsp),$xi
-	rol	\$30,$b
-	add	$t0,$f
-	rol	\$1,$xi
-___
-$code.=<<___ if ($i<76);
-	mov	$xi,`4*($j%16)`(%rsp)
-___
-$code.=<<___ if ($i==79);
-	lea	$K($xi,$e),$f
-	mov	$c,$t0
-	mov	$a,$e
-	xor	$b,$t0
-	rol	\$5,$e
-	xor	$d,$t0
-	add	$e,$f
-	rol	\$30,$b
-	add	$t0,$f
-___
-}
-
-sub BODY_40_59 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
-my $j=$i+1;
-$code.=<<___;
-	lea	0x8f1bbcdc($xi,$e),$f
-	mov	`4*($j%16)`(%rsp),$xi
-	mov	$b,$t0
-	mov	$b,$t1
-	xor	`4*(($j+2)%16)`(%rsp),$xi
-	mov	$a,$e
-	and	$c,$t0
-	xor	`4*(($j+8)%16)`(%rsp),$xi
-	or	$c,$t1
-	rol	\$5,$e
-	xor	`4*(($j+13)%16)`(%rsp),$xi
-	and	$d,$t1
-	add	$e,$f
-	rol	\$1,$xi
-	or	$t1,$t0
-	rol	\$30,$b
-	mov	$xi,`4*($j%16)`(%rsp)
-	add	$t0,$f
-___
-}
-
-$code=".text\n";
-
-&PROLOGUE("sha1_block_data_order");
-$code.=".align	4\n.Lloop:\n";
 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
-	add	0($ctx),$E
-	add	4($ctx),$T
-	add	8($ctx),$A
-	add	12($ctx),$B
-	add	16($ctx),$C
-	mov	$E,0($ctx)
-	mov	$T,4($ctx)
-	mov	$A,8($ctx)
-	mov	$B,12($ctx)
-	mov	$C,16($ctx)
+	add	0($ctx),$A
+	add	4($ctx),$B
+	add	8($ctx),$C
+	add	12($ctx),$D
+	add	16($ctx),$E
+	mov	$A,0($ctx)
+	mov	$B,4($ctx)
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
 
-	xchg	$E,$A	# mov	$E,$A
-	xchg	$T,$B	# mov	$T,$B
-	xchg	$E,$C	# mov	$A,$C
-	xchg	$T,$D	# mov	$B,$D
-			# mov	$C,$E
-	lea	`16*4`($inp),$inp
 	sub	\$1,$num
+	lea	`16*4`($inp),$inp
 	jnz	.Lloop
+
+	mov	`16*4`(%rsp),%rsi
+	mov	(%rsi),%r13
+	mov	8(%rsi),%r12
+	mov	16(%rsi),%rbp
+	mov	24(%rsi),%rbx
+	lea	32(%rsi),%rsp
+.Lepilogue:
+	ret
+.size	sha1_block_data_order,.-sha1_block_data_order
 ___
-&EPILOGUE("sha1_block_data_order");
+{{{
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type	sha1_block_data_order_ssse3,\@function,3
+.align	16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,64+0(%rsp)
+	movaps	%xmm7,64+16(%rsp)
+	movaps	%xmm8,64+32(%rsp)
+	movaps	%xmm9,64+48(%rsp)
+	movaps	%xmm10,64+64(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+	mov	%rdi,$ctx	# reassigned argument
+	mov	%rsi,$inp	# reassigned argument
+	mov	%rdx,$num	# reassigned argument
+
+	shl	\$6,$num
+	add	$inp,$num
+	lea	K_XX_XX(%rip),$K_XX_XX
+
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	movdqu	16($inp),@X[-3&7]
+	movdqu	32($inp),@X[-2&7]
+	movdqu	48($inp),@X[-1&7]
+	pshufb	@X[2],@X[-4&7]		# byte swap
+	add	\$64,$inp
+	pshufb	@X[2],@X[-3&7]
+	pshufb	@X[2],@X[-2&7]
+	pshufb	@X[2],@X[-1&7]
+	paddd	@Tx[1],@X[-4&7]		# add K_00_19
+	paddd	@Tx[1],@X[-3&7]
+	paddd	@Tx[1],@X[-2&7]
+	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
+	psubd	@Tx[1],@X[-4&7]		# restore X[]
+	movdqa	@X[-3&7],16(%rsp)
+	psubd	@Tx[1],@X[-3&7]
+	movdqa	@X[-2&7],32(%rsp)
+	psubd	@Tx[1],@X[-2&7]
+	jmp	.Loop_ssse3
+___
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@X[0],@X[-3&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[0],@X[-1&7]);
+	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@Tx[2],@X[0]);
+	&movdqa	(@Tx[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[1],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[2],30);
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@Tx[1],2);
+	&pxor	(@X[0],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@Tx[0],@X[0]);
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@Tx[0],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$num);
+	&je	(".Ldone_ssse3");
+
+	unshift(@Tx,pop(@Tx));
+
+	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&movdqu	(@X[-4&7],"0($inp)");		# load input
+	&movdqu	(@X[-3&7],"16($inp)");
+	&movdqu	(@X[-2&7],"32($inp)");
+	&movdqu	(@X[-1&7],"48($inp)");
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@Tx[1]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	($c,$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&xor	($c,$d);',	# restore $c
+	'&xor	(@T[0],$d);',
+	'&add	($e,$a);',
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_20_39 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	(@T[0],$d);',	# ($b^$d)
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
+	'&add	($e,$a);',
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_40_59 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&mov	(@T[1],$c);',
+	'&xor	($c,$d);',
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&and	(@T[1],$d);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[1]);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	($c,$d);',	# restore $c
+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+$code.=<<___;
+.align	16
+.Loop_ssse3:
+___
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+___
+				$j=$saved_j; @V=@saved_V;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	64+0(%rsp),%xmm6
+	movaps	64+16(%rsp),%xmm7
+	movaps	64+32(%rsp),%xmm8
+	movaps	64+48(%rsp),%xmm9
+	movaps	64+64(%rsp),%xmm10
+___
+$code.=<<___;
+	lea	`64+($win64?5*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r12
+	mov	8(%rsi),%rbp
+	mov	16(%rsi),%rbx
+	lea	24(%rsi),%rsp
+.Lepilogue_ssse3:
+	ret
+.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+___
+
+if ($avx) {
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type	sha1_block_data_order_avx,\@function,3
+.align	16
+sha1_block_data_order_avx:
+_avx_shortcut:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,64+0(%rsp)
+	movaps	%xmm7,64+16(%rsp)
+	movaps	%xmm8,64+32(%rsp)
+	movaps	%xmm9,64+48(%rsp)
+	movaps	%xmm10,64+64(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+	mov	%rdi,$ctx	# reassigned argument
+	mov	%rsi,$inp	# reassigned argument
+	mov	%rdx,$num	# reassigned argument
+	vzeroall
+
+	shl	\$6,$num
+	add	$inp,$num
+	lea	K_XX_XX(%rip),$K_XX_XX
+
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	vmovdqu	16($inp),@X[-3&7]
+	vmovdqu	32($inp),@X[-2&7]
+	vmovdqu	48($inp),@X[-1&7]
+	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
+	add	\$64,$inp
+	vpshufb	@X[2],@X[-3&7],@X[-3&7]
+	vpshufb	@X[2],@X[-2&7],@X[-2&7]
+	vpshufb	@X[2],@X[-1&7],@X[-1&7]
+	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
+	vpaddd	@Tx[1],@X[-3&7],@X[1]
+	vpaddd	@Tx[1],@X[-2&7],@X[2]
+	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
+	vmovdqa	@X[1],16(%rsp)
+	vmovdqa	@X[2],32(%rsp)
+	jmp	.Loop_avx
+___
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@Tx[0],@X[0],30);
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$num);
+	&je	(".Ldone_avx");
+
+	unshift(@Tx,pop(@Tx));
+
+	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&vmovdqu(@X[-4&7],"0($inp)");		# load input
+	&vmovdqu(@X[-3&7],"16($inp)");
+	&vmovdqu(@X[-2&7],"32($inp)");
+	&vmovdqu(@X[-1&7],"48($inp)");
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align	16
+.Loop_avx:
+___
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+___
+				$j=$saved_j; @V=@saved_V;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+	vzeroall
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	64+0(%rsp),%xmm6
+	movaps	64+16(%rsp),%xmm7
+	movaps	64+32(%rsp),%xmm8
+	movaps	64+48(%rsp),%xmm9
+	movaps	64+64(%rsp),%xmm10
+___
+$code.=<<___;
+	lea	`64+($win64?5*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r12
+	mov	8(%rsi),%rbp
+	mov	16(%rsi),%rbx
+	lea	24(%rsi),%rsp
+.Lepilogue_avx:
+	ret
+.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
+___
+}
+$code.=<<___;
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
+___
+}}}
 $code.=<<___;
 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align	16
+.align	64
 ___
 
 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -272,25 +1112,75 @@
 
 	lea	.Lprologue(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<.Lprologue
-	jb	.Lin_prologue
+	jb	.Lcommon_seh_tail
 
 	mov	152($context),%rax	# pull context->Rsp
 
 	lea	.Lepilogue(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
-	jae	.Lin_prologue
+	jae	.Lcommon_seh_tail
 
 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
-	lea	24(%rax),%rax
+	lea	32(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+
+	jmp	.Lcommon_seh_tail
+.size	se_handler,.-se_handler
+
+.type	ssse3_handler,\@abi-omnipotent
+.align	16
+ssse3_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	64(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$10,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer
 
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
 	mov	%rbx,144($context)	# restore context->Rbx
 	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
+	mov	%r12,216($context)	# restore cotnext->R12
 
-.Lin_prologue:
+.Lcommon_seh_tail:
 	mov	8(%rax),%rdi
 	mov	16(%rax),%rsi
 	mov	%rax,152($context)	# restore context->Rsp
@@ -328,19 +1218,38 @@
 	pop	%rdi
 	pop	%rsi
 	ret
-.size	se_handler,.-se_handler
+.size	ssse3_handler,.-ssse3_handler
 
 .section	.pdata
 .align	4
 	.rva	.LSEH_begin_sha1_block_data_order
 	.rva	.LSEH_end_sha1_block_data_order
 	.rva	.LSEH_info_sha1_block_data_order
-
+	.rva	.LSEH_begin_sha1_block_data_order_ssse3
+	.rva	.LSEH_end_sha1_block_data_order_ssse3
+	.rva	.LSEH_info_sha1_block_data_order_ssse3
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_sha1_block_data_order_avx
+	.rva	.LSEH_end_sha1_block_data_order_avx
+	.rva	.LSEH_info_sha1_block_data_order_avx
+___
+$code.=<<___;
 .section	.xdata
 .align	8
 .LSEH_info_sha1_block_data_order:
 	.byte	9,0,0,0
 	.rva	se_handler
+.LSEH_info_sha1_block_data_order_ssse3:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_block_data_order_avx:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
 ___
 }
 

diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl
index ecc8b69..928ec53 100644
--- a/crypto/sha/asm/sha256-586.pl
+++ b/crypto/sha/asm/sha256-586.pl

@@ -14,8 +14,8 @@
 #		Pentium	PIII	P4	AMD K8	Core2
 # gcc		46	36	41	27	26
 # icc		57	33	38	25	23	
-# x86 asm	40	30	35	20	20
-# x86_64 asm(*)	-	-	21	15.8	16.5
+# x86 asm	40	30	33	20	18
+# x86_64 asm(*)	-	-	21	16	16
 #
 # (*) x86_64 assembler performance is presented for reference
 #     purposes.
@@ -48,20 +48,19 @@
     my $in_16_63=shift;
 
 	&mov	("ecx",$E);
-	 &add	($T,&DWP(4*(8+15+16-9),"esp"))	if ($in_16_63);	# T += X[-7]
-	&ror	("ecx",6);
-	&mov	("edi",$E);
-	&ror	("edi",11);
+	 &add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
+	&ror	("ecx",25-11);
 	 &mov	("esi",$Foff);
-	&xor	("ecx","edi");
-	&ror	("edi",25-11);
+	&xor	("ecx",$E);
+	&ror	("ecx",11-6);
 	 &mov	(&DWP(4*(8+15),"esp"),$T)	if ($in_16_63);	# save X[0]
-	&xor	("ecx","edi");	# Sigma1(e)
+	&xor	("ecx",$E);
+	&ror	("ecx",6);	# Sigma1(e)
 	 &mov	("edi",$Goff);
 	&add	($T,"ecx");	# T += Sigma1(e)
-	 &mov	($Eoff,$E);	# modulo-scheduled
 
 	&xor	("esi","edi");
+	 &mov	($Eoff,$E);	# modulo-scheduled
 	 &mov	("ecx",$A);
 	&and	("esi",$E);
 	 &mov	($E,$Doff);	# e becomes d, which is e in next iteration
@@ -69,14 +68,14 @@
 	 &mov	("edi",$A);
 	&add	($T,"esi");	# T += Ch(e,f,g)
 
-	&ror	("ecx",2);
+	&ror	("ecx",22-13);
 	 &add	($T,$Hoff);	# T += h
-	&ror	("edi",13);
+	&xor	("ecx",$A);
+	&ror	("ecx",13-2);
 	 &mov	("esi",$Boff);
-	&xor	("ecx","edi");
-	&ror	("edi",22-13);
+	&xor	("ecx",$A);
+	&ror	("ecx",2);	# Sigma0(a)
 	 &add	($E,$T);	# d += T
-	&xor	("ecx","edi");	# Sigma0(a)
 	 &mov	("edi",$Coff);
 
 	&add	($T,"ecx");	# T += Sigma0(a)
@@ -168,23 +167,22 @@
 &set_label("16_63",16);
 	&mov	("esi",$T);
 	 &mov	("ecx",&DWP(4*(8+15+16-14),"esp"));
-	&shr	($T,3);
-	&ror	("esi",7);
-	&xor	($T,"esi");
 	&ror	("esi",18-7);
 	 &mov	("edi","ecx");
-	&xor	($T,"esi");			# T = sigma0(X[-15])
+	&xor	("esi",$T);
+	&ror	("esi",7);
+	&shr	($T,3);
 
-	&shr	("ecx",10);
-	 &mov	("esi",&DWP(4*(8+15+16),"esp"));
-	&ror	("edi",17);
-	&xor	("ecx","edi");
 	&ror	("edi",19-17);
-	 &add	($T,"esi");			# T += X[-16]
-	&xor	("edi","ecx")			# sigma1(X[-2])
+	 &xor	($T,"esi");			# T = sigma0(X[-15])
+	&xor	("edi","ecx");
+	&ror	("edi",17);
+	&shr	("ecx",10);
+	 &add	($T,&DWP(4*(8+15+16),"esp"));	# T += X[-16]
+	&xor	("edi","ecx");			# sigma1(X[-2])
 
-	&add	($T,"edi");			# T += sigma1(X[-2])
-	# &add	($T,&DWP(4*(8+15+16-9),"esp"));	# T += X[-7], moved to BODY_00_15(1)
+	 &add	($T,&DWP(4*(8+15+16-9),"esp"));	# T += X[-7]
+	# &add	($T,"edi");			# T += sigma1(X[-2])
 	# &mov	(&DWP(4*(8+15),"esp"),$T);	# save X[0]
 
 	&BODY_00_15(1);

diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 492cb62..9c84e8d 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl

@@ -18,11 +18,16 @@
 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
 # Cortex A8 core and ~20 cycles per processed byte.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~17 cycles per processed byte.
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";	$t0="r0";
-$inp="r1";
+$inp="r1";	$t3="r1";
 $len="r2";	$t1="r2";
 $T1="r3";
 $A="r4";
@@ -46,6 +51,9 @@
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	ldr	$T1,[$inp],#4
+#else
 	ldrb	$T1,[$inp,#3]			@ $i
 	ldrb	$t2,[$inp,#2]
 	ldrb	$t1,[$inp,#1]
@@ -53,16 +61,24 @@
 	orr	$T1,$T1,$t2,lsl#8
 	orr	$T1,$T1,$t1,lsl#16
 	orr	$T1,$T1,$t0,lsl#24
-	`"str	$inp,[sp,#17*4]"	if ($i==15)`
+#endif
 ___
 $code.=<<___;
-	ldr	$t2,[$Ktbl],#4			@ *K256++
 	mov	$t0,$e,ror#$Sigma1[0]
-	str	$T1,[sp,#`$i%16`*4]
+	ldr	$t2,[$Ktbl],#4			@ *K256++
 	eor	$t0,$t0,$e,ror#$Sigma1[1]
 	eor	$t1,$f,$g
+#if $i>=16
+	add	$T1,$T1,$t3			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	$T1,$T1
+#endif
+#if $i==15
+	str	$inp,[sp,#17*4]			@ leave room for $t3
+#endif
 	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 	and	$t1,$t1,$e
+	str	$T1,[sp,#`$i%16`*4]
 	add	$T1,$T1,$t0
 	eor	$t1,$t1,$g			@ Ch(e,f,g)
 	add	$T1,$T1,$h
@@ -71,6 +87,9 @@
 	eor	$h,$h,$a,ror#$Sigma0[1]
 	add	$T1,$T1,$t2
 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
+#if $i>=15
+	ldr	$t3,[sp,#`($i+2)%16`*4]		@ from BODY_16_xx
+#endif
 	orr	$t0,$a,$b
 	and	$t1,$a,$b
 	and	$t0,$t0,$c
@@ -85,24 +104,26 @@
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i
 	ldr	$t2,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t3,ror#$sigma0[0]
 	ldr	$T1,[sp,#`($i+0)%16`*4]
-	mov	$t0,$t1,ror#$sigma0[0]
-	ldr	$inp,[sp,#`($i+9)%16`*4]
-	eor	$t0,$t0,$t1,ror#$sigma0[1]
-	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
-	mov	$t1,$t2,ror#$sigma1[0]
+	eor	$t0,$t0,$t3,ror#$sigma0[1]
+	ldr	$t1,[sp,#`($i+9)%16`*4]
+	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	mov	$t3,$t2,ror#$sigma1[0]
 	add	$T1,$T1,$t0
-	eor	$t1,$t1,$t2,ror#$sigma1[1]
-	add	$T1,$T1,$inp
-	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	eor	$t3,$t3,$t2,ror#$sigma1[1]
 	add	$T1,$T1,$t1
+	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	@ add	$T1,$T1,$t3
 ___
 	&BODY_00_15(@_);
 }
 
 $code=<<___;
+#include "arm_arch.h"
+
 .text
 .code	32
 
@@ -132,7 +153,7 @@
 sha256_block_data_order:
 	sub	r3,pc,#8		@ sha256_block_data_order
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
-	stmdb	sp!,{$ctx,$inp,$len,r4-r12,lr}
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
 	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
 	sub	$Ktbl,r3,#256		@ K256
 	sub	sp,sp,#16*4		@ alloca(X[16])
@@ -171,10 +192,14 @@
 	bne	.Loop
 
 	add	sp,sp,#`16+3`*4	@ destroy frame
-	ldmia	sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size   sha256_block_data_order,.-sha256_block_data_order
 .asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2

diff --git a/crypto/sha/asm/sha256-armv4.s b/crypto/sha/asm/sha256-armv4.s
index ee903dc..9c20a63 100644
--- a/crypto/sha/asm/sha256-armv4.s
+++ b/crypto/sha/asm/sha256-armv4.s

@@ -1,3 +1,5 @@
+#include "arm_arch.h"
+
 .text
 .code	32
 
@@ -27,11 +29,14 @@
 sha256_block_data_order:
 	sub	r3,pc,#8		@ sha256_block_data_order
 	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
-	stmdb	sp!,{r0,r1,r2,r4-r12,lr}
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
 	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
 	sub	r14,r3,#256		@ K256
 	sub	sp,sp,#16*4		@ alloca(X[16])
 .Loop:
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 0
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -39,14 +44,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r8,ror#6
-	str	r3,[sp,#0*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r8,ror#11
 	eor	r2,r9,r10
+#if 0>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 0==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r8,ror#25	@ Sigma1(e)
 	and	r2,r2,r8
+	str	r3,[sp,#0*4]
 	add	r3,r3,r0
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	add	r3,r3,r11
@@ -55,6 +68,9 @@
 	eor	r11,r11,r4,ror#13
 	add	r3,r3,r12
 	eor	r11,r11,r4,ror#22		@ Sigma0(a)
+#if 0>=15
+	ldr	r1,[sp,#2*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r4,r5
 	and	r2,r4,r5
 	and	r0,r0,r6
@@ -62,6 +78,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r7,r7,r3
 	add	r11,r11,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 1
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -69,14 +88,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r7,ror#6
-	str	r3,[sp,#1*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r7,ror#11
 	eor	r2,r8,r9
+#if 1>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 1==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r7,ror#25	@ Sigma1(e)
 	and	r2,r2,r7
+	str	r3,[sp,#1*4]
 	add	r3,r3,r0
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	add	r3,r3,r10
@@ -85,6 +112,9 @@
 	eor	r10,r10,r11,ror#13
 	add	r3,r3,r12
 	eor	r10,r10,r11,ror#22		@ Sigma0(a)
+#if 1>=15
+	ldr	r1,[sp,#3*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r11,r4
 	and	r2,r11,r4
 	and	r0,r0,r5
@@ -92,6 +122,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r6,r6,r3
 	add	r10,r10,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 2
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -99,14 +132,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r6,ror#6
-	str	r3,[sp,#2*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r6,ror#11
 	eor	r2,r7,r8
+#if 2>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 2==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r6,ror#25	@ Sigma1(e)
 	and	r2,r2,r6
+	str	r3,[sp,#2*4]
 	add	r3,r3,r0
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	add	r3,r3,r9
@@ -115,6 +156,9 @@
 	eor	r9,r9,r10,ror#13
 	add	r3,r3,r12
 	eor	r9,r9,r10,ror#22		@ Sigma0(a)
+#if 2>=15
+	ldr	r1,[sp,#4*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r10,r11
 	and	r2,r10,r11
 	and	r0,r0,r4
@@ -122,6 +166,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r5,r5,r3
 	add	r9,r9,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 3
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -129,14 +176,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r5,ror#6
-	str	r3,[sp,#3*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r5,ror#11
 	eor	r2,r6,r7
+#if 3>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 3==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r5,ror#25	@ Sigma1(e)
 	and	r2,r2,r5
+	str	r3,[sp,#3*4]
 	add	r3,r3,r0
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	add	r3,r3,r8
@@ -145,6 +200,9 @@
 	eor	r8,r8,r9,ror#13
 	add	r3,r3,r12
 	eor	r8,r8,r9,ror#22		@ Sigma0(a)
+#if 3>=15
+	ldr	r1,[sp,#5*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r9,r10
 	and	r2,r9,r10
 	and	r0,r0,r11
@@ -152,6 +210,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r4,r4,r3
 	add	r8,r8,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 4
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -159,14 +220,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r4,ror#6
-	str	r3,[sp,#4*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r4,ror#11
 	eor	r2,r5,r6
+#if 4>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 4==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r4,ror#25	@ Sigma1(e)
 	and	r2,r2,r4
+	str	r3,[sp,#4*4]
 	add	r3,r3,r0
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	add	r3,r3,r7
@@ -175,6 +244,9 @@
 	eor	r7,r7,r8,ror#13
 	add	r3,r3,r12
 	eor	r7,r7,r8,ror#22		@ Sigma0(a)
+#if 4>=15
+	ldr	r1,[sp,#6*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r8,r9
 	and	r2,r8,r9
 	and	r0,r0,r10
@@ -182,6 +254,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r11,r11,r3
 	add	r7,r7,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 5
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -189,14 +264,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r11,ror#6
-	str	r3,[sp,#5*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r11,ror#11
 	eor	r2,r4,r5
+#if 5>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 5==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r11,ror#25	@ Sigma1(e)
 	and	r2,r2,r11
+	str	r3,[sp,#5*4]
 	add	r3,r3,r0
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	add	r3,r3,r6
@@ -205,6 +288,9 @@
 	eor	r6,r6,r7,ror#13
 	add	r3,r3,r12
 	eor	r6,r6,r7,ror#22		@ Sigma0(a)
+#if 5>=15
+	ldr	r1,[sp,#7*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r7,r8
 	and	r2,r7,r8
 	and	r0,r0,r9
@@ -212,6 +298,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r10,r10,r3
 	add	r6,r6,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 6
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -219,14 +308,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r10,ror#6
-	str	r3,[sp,#6*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r10,ror#11
 	eor	r2,r11,r4
+#if 6>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 6==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r10,ror#25	@ Sigma1(e)
 	and	r2,r2,r10
+	str	r3,[sp,#6*4]
 	add	r3,r3,r0
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	add	r3,r3,r5
@@ -235,6 +332,9 @@
 	eor	r5,r5,r6,ror#13
 	add	r3,r3,r12
 	eor	r5,r5,r6,ror#22		@ Sigma0(a)
+#if 6>=15
+	ldr	r1,[sp,#8*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r6,r7
 	and	r2,r6,r7
 	and	r0,r0,r8
@@ -242,6 +342,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r9,r9,r3
 	add	r5,r5,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 7
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -249,14 +352,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r9,ror#6
-	str	r3,[sp,#7*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r9,ror#11
 	eor	r2,r10,r11
+#if 7>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 7==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r9,ror#25	@ Sigma1(e)
 	and	r2,r2,r9
+	str	r3,[sp,#7*4]
 	add	r3,r3,r0
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	add	r3,r3,r4
@@ -265,6 +376,9 @@
 	eor	r4,r4,r5,ror#13
 	add	r3,r3,r12
 	eor	r4,r4,r5,ror#22		@ Sigma0(a)
+#if 7>=15
+	ldr	r1,[sp,#9*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r5,r6
 	and	r2,r5,r6
 	and	r0,r0,r7
@@ -272,6 +386,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r8,r8,r3
 	add	r4,r4,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 8
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -279,14 +396,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r8,ror#6
-	str	r3,[sp,#8*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r8,ror#11
 	eor	r2,r9,r10
+#if 8>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 8==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r8,ror#25	@ Sigma1(e)
 	and	r2,r2,r8
+	str	r3,[sp,#8*4]
 	add	r3,r3,r0
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	add	r3,r3,r11
@@ -295,6 +420,9 @@
 	eor	r11,r11,r4,ror#13
 	add	r3,r3,r12
 	eor	r11,r11,r4,ror#22		@ Sigma0(a)
+#if 8>=15
+	ldr	r1,[sp,#10*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r4,r5
 	and	r2,r4,r5
 	and	r0,r0,r6
@@ -302,6 +430,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r7,r7,r3
 	add	r11,r11,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 9
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -309,14 +440,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r7,ror#6
-	str	r3,[sp,#9*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r7,ror#11
 	eor	r2,r8,r9
+#if 9>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 9==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r7,ror#25	@ Sigma1(e)
 	and	r2,r2,r7
+	str	r3,[sp,#9*4]
 	add	r3,r3,r0
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	add	r3,r3,r10
@@ -325,6 +464,9 @@
 	eor	r10,r10,r11,ror#13
 	add	r3,r3,r12
 	eor	r10,r10,r11,ror#22		@ Sigma0(a)
+#if 9>=15
+	ldr	r1,[sp,#11*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r11,r4
 	and	r2,r11,r4
 	and	r0,r0,r5
@@ -332,6 +474,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r6,r6,r3
 	add	r10,r10,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 10
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -339,14 +484,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r6,ror#6
-	str	r3,[sp,#10*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r6,ror#11
 	eor	r2,r7,r8
+#if 10>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 10==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r6,ror#25	@ Sigma1(e)
 	and	r2,r2,r6
+	str	r3,[sp,#10*4]
 	add	r3,r3,r0
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	add	r3,r3,r9
@@ -355,6 +508,9 @@
 	eor	r9,r9,r10,ror#13
 	add	r3,r3,r12
 	eor	r9,r9,r10,ror#22		@ Sigma0(a)
+#if 10>=15
+	ldr	r1,[sp,#12*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r10,r11
 	and	r2,r10,r11
 	and	r0,r0,r4
@@ -362,6 +518,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r5,r5,r3
 	add	r9,r9,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 11
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -369,14 +528,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r5,ror#6
-	str	r3,[sp,#11*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r5,ror#11
 	eor	r2,r6,r7
+#if 11>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 11==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r5,ror#25	@ Sigma1(e)
 	and	r2,r2,r5
+	str	r3,[sp,#11*4]
 	add	r3,r3,r0
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	add	r3,r3,r8
@@ -385,6 +552,9 @@
 	eor	r8,r8,r9,ror#13
 	add	r3,r3,r12
 	eor	r8,r8,r9,ror#22		@ Sigma0(a)
+#if 11>=15
+	ldr	r1,[sp,#13*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r9,r10
 	and	r2,r9,r10
 	and	r0,r0,r11
@@ -392,6 +562,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r4,r4,r3
 	add	r8,r8,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 12
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -399,14 +572,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r4,ror#6
-	str	r3,[sp,#12*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r4,ror#11
 	eor	r2,r5,r6
+#if 12>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 12==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r4,ror#25	@ Sigma1(e)
 	and	r2,r2,r4
+	str	r3,[sp,#12*4]
 	add	r3,r3,r0
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	add	r3,r3,r7
@@ -415,6 +596,9 @@
 	eor	r7,r7,r8,ror#13
 	add	r3,r3,r12
 	eor	r7,r7,r8,ror#22		@ Sigma0(a)
+#if 12>=15
+	ldr	r1,[sp,#14*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r8,r9
 	and	r2,r8,r9
 	and	r0,r0,r10
@@ -422,6 +606,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r11,r11,r3
 	add	r7,r7,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 13
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -429,14 +616,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r11,ror#6
-	str	r3,[sp,#13*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r11,ror#11
 	eor	r2,r4,r5
+#if 13>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 13==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r11,ror#25	@ Sigma1(e)
 	and	r2,r2,r11
+	str	r3,[sp,#13*4]
 	add	r3,r3,r0
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	add	r3,r3,r6
@@ -445,6 +640,9 @@
 	eor	r6,r6,r7,ror#13
 	add	r3,r3,r12
 	eor	r6,r6,r7,ror#22		@ Sigma0(a)
+#if 13>=15
+	ldr	r1,[sp,#15*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r7,r8
 	and	r2,r7,r8
 	and	r0,r0,r9
@@ -452,6 +650,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r10,r10,r3
 	add	r6,r6,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 14
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -459,14 +660,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r10,ror#6
-	str	r3,[sp,#14*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r10,ror#11
 	eor	r2,r11,r4
+#if 14>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 14==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r10,ror#25	@ Sigma1(e)
 	and	r2,r2,r10
+	str	r3,[sp,#14*4]
 	add	r3,r3,r0
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	add	r3,r3,r5
@@ -475,6 +684,9 @@
 	eor	r5,r5,r6,ror#13
 	add	r3,r3,r12
 	eor	r5,r5,r6,ror#22		@ Sigma0(a)
+#if 14>=15
+	ldr	r1,[sp,#0*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r6,r7
 	and	r2,r6,r7
 	and	r0,r0,r8
@@ -482,6 +694,9 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r9,r9,r3
 	add	r5,r5,r0
+#if __ARM_ARCH__>=7
+	ldr	r3,[r1],#4
+#else
 	ldrb	r3,[r1,#3]			@ 15
 	ldrb	r12,[r1,#2]
 	ldrb	r2,[r1,#1]
@@ -489,14 +704,22 @@
 	orr	r3,r3,r12,lsl#8
 	orr	r3,r3,r2,lsl#16
 	orr	r3,r3,r0,lsl#24
-	str	r1,[sp,#17*4]
-	ldr	r12,[r14],#4			@ *K256++
+#endif
 	mov	r0,r9,ror#6
-	str	r3,[sp,#15*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r9,ror#11
 	eor	r2,r10,r11
+#if 15>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 15==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r9,ror#25	@ Sigma1(e)
 	and	r2,r2,r9
+	str	r3,[sp,#15*4]
 	add	r3,r3,r0
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	add	r3,r3,r4
@@ -505,6 +728,9 @@
 	eor	r4,r4,r5,ror#13
 	add	r3,r3,r12
 	eor	r4,r4,r5,ror#22		@ Sigma0(a)
+#if 15>=15
+	ldr	r1,[sp,#1*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r5,r6
 	and	r2,r5,r6
 	and	r0,r0,r7
@@ -513,26 +739,34 @@
 	add	r8,r8,r3
 	add	r4,r4,r0
 .Lrounds_16_xx:
-	ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#1*4]		@ 16
 	ldr	r12,[sp,#14*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#0*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#9*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#9*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r8,ror#6
-	str	r3,[sp,#0*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r8,ror#11
 	eor	r2,r9,r10
+#if 16>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 16==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r8,ror#25	@ Sigma1(e)
 	and	r2,r2,r8
+	str	r3,[sp,#0*4]
 	add	r3,r3,r0
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	add	r3,r3,r11
@@ -541,6 +775,9 @@
 	eor	r11,r11,r4,ror#13
 	add	r3,r3,r12
 	eor	r11,r11,r4,ror#22		@ Sigma0(a)
+#if 16>=15
+	ldr	r1,[sp,#2*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r4,r5
 	and	r2,r4,r5
 	and	r0,r0,r6
@@ -548,26 +785,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r7,r7,r3
 	add	r11,r11,r0
-	ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#2*4]		@ 17
 	ldr	r12,[sp,#15*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#1*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#10*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#10*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r7,ror#6
-	str	r3,[sp,#1*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r7,ror#11
 	eor	r2,r8,r9
+#if 17>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 17==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r7,ror#25	@ Sigma1(e)
 	and	r2,r2,r7
+	str	r3,[sp,#1*4]
 	add	r3,r3,r0
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	add	r3,r3,r10
@@ -576,6 +821,9 @@
 	eor	r10,r10,r11,ror#13
 	add	r3,r3,r12
 	eor	r10,r10,r11,ror#22		@ Sigma0(a)
+#if 17>=15
+	ldr	r1,[sp,#3*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r11,r4
 	and	r2,r11,r4
 	and	r0,r0,r5
@@ -583,26 +831,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r6,r6,r3
 	add	r10,r10,r0
-	ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#3*4]		@ 18
 	ldr	r12,[sp,#0*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#2*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#11*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#11*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r6,ror#6
-	str	r3,[sp,#2*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r6,ror#11
 	eor	r2,r7,r8
+#if 18>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 18==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r6,ror#25	@ Sigma1(e)
 	and	r2,r2,r6
+	str	r3,[sp,#2*4]
 	add	r3,r3,r0
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	add	r3,r3,r9
@@ -611,6 +867,9 @@
 	eor	r9,r9,r10,ror#13
 	add	r3,r3,r12
 	eor	r9,r9,r10,ror#22		@ Sigma0(a)
+#if 18>=15
+	ldr	r1,[sp,#4*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r10,r11
 	and	r2,r10,r11
 	and	r0,r0,r4
@@ -618,26 +877,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r5,r5,r3
 	add	r9,r9,r0
-	ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#4*4]		@ 19
 	ldr	r12,[sp,#1*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#3*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#12*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#12*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r5,ror#6
-	str	r3,[sp,#3*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r5,ror#11
 	eor	r2,r6,r7
+#if 19>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 19==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r5,ror#25	@ Sigma1(e)
 	and	r2,r2,r5
+	str	r3,[sp,#3*4]
 	add	r3,r3,r0
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	add	r3,r3,r8
@@ -646,6 +913,9 @@
 	eor	r8,r8,r9,ror#13
 	add	r3,r3,r12
 	eor	r8,r8,r9,ror#22		@ Sigma0(a)
+#if 19>=15
+	ldr	r1,[sp,#5*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r9,r10
 	and	r2,r9,r10
 	and	r0,r0,r11
@@ -653,26 +923,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r4,r4,r3
 	add	r8,r8,r0
-	ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#5*4]		@ 20
 	ldr	r12,[sp,#2*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#4*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#13*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#13*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r4,ror#6
-	str	r3,[sp,#4*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r4,ror#11
 	eor	r2,r5,r6
+#if 20>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 20==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r4,ror#25	@ Sigma1(e)
 	and	r2,r2,r4
+	str	r3,[sp,#4*4]
 	add	r3,r3,r0
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	add	r3,r3,r7
@@ -681,6 +959,9 @@
 	eor	r7,r7,r8,ror#13
 	add	r3,r3,r12
 	eor	r7,r7,r8,ror#22		@ Sigma0(a)
+#if 20>=15
+	ldr	r1,[sp,#6*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r8,r9
 	and	r2,r8,r9
 	and	r0,r0,r10
@@ -688,26 +969,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r11,r11,r3
 	add	r7,r7,r0
-	ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#6*4]		@ 21
 	ldr	r12,[sp,#3*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#5*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#14*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#14*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r11,ror#6
-	str	r3,[sp,#5*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r11,ror#11
 	eor	r2,r4,r5
+#if 21>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 21==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r11,ror#25	@ Sigma1(e)
 	and	r2,r2,r11
+	str	r3,[sp,#5*4]
 	add	r3,r3,r0
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	add	r3,r3,r6
@@ -716,6 +1005,9 @@
 	eor	r6,r6,r7,ror#13
 	add	r3,r3,r12
 	eor	r6,r6,r7,ror#22		@ Sigma0(a)
+#if 21>=15
+	ldr	r1,[sp,#7*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r7,r8
 	and	r2,r7,r8
 	and	r0,r0,r9
@@ -723,26 +1015,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r10,r10,r3
 	add	r6,r6,r0
-	ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#7*4]		@ 22
 	ldr	r12,[sp,#4*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#6*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#15*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#15*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r10,ror#6
-	str	r3,[sp,#6*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r10,ror#11
 	eor	r2,r11,r4
+#if 22>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 22==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r10,ror#25	@ Sigma1(e)
 	and	r2,r2,r10
+	str	r3,[sp,#6*4]
 	add	r3,r3,r0
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	add	r3,r3,r5
@@ -751,6 +1051,9 @@
 	eor	r5,r5,r6,ror#13
 	add	r3,r3,r12
 	eor	r5,r5,r6,ror#22		@ Sigma0(a)
+#if 22>=15
+	ldr	r1,[sp,#8*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r6,r7
 	and	r2,r6,r7
 	and	r0,r0,r8
@@ -758,26 +1061,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r9,r9,r3
 	add	r5,r5,r0
-	ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#8*4]		@ 23
 	ldr	r12,[sp,#5*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#7*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#0*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#0*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r9,ror#6
-	str	r3,[sp,#7*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r9,ror#11
 	eor	r2,r10,r11
+#if 23>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 23==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r9,ror#25	@ Sigma1(e)
 	and	r2,r2,r9
+	str	r3,[sp,#7*4]
 	add	r3,r3,r0
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	add	r3,r3,r4
@@ -786,6 +1097,9 @@
 	eor	r4,r4,r5,ror#13
 	add	r3,r3,r12
 	eor	r4,r4,r5,ror#22		@ Sigma0(a)
+#if 23>=15
+	ldr	r1,[sp,#9*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r5,r6
 	and	r2,r5,r6
 	and	r0,r0,r7
@@ -793,26 +1107,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r8,r8,r3
 	add	r4,r4,r0
-	ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#9*4]		@ 24
 	ldr	r12,[sp,#6*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#8*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#1*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#1*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r8,ror#6
-	str	r3,[sp,#8*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r8,ror#11
 	eor	r2,r9,r10
+#if 24>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 24==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r8,ror#25	@ Sigma1(e)
 	and	r2,r2,r8
+	str	r3,[sp,#8*4]
 	add	r3,r3,r0
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	add	r3,r3,r11
@@ -821,6 +1143,9 @@
 	eor	r11,r11,r4,ror#13
 	add	r3,r3,r12
 	eor	r11,r11,r4,ror#22		@ Sigma0(a)
+#if 24>=15
+	ldr	r1,[sp,#10*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r4,r5
 	and	r2,r4,r5
 	and	r0,r0,r6
@@ -828,26 +1153,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r7,r7,r3
 	add	r11,r11,r0
-	ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#10*4]		@ 25
 	ldr	r12,[sp,#7*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#9*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#2*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#2*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r7,ror#6
-	str	r3,[sp,#9*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r7,ror#11
 	eor	r2,r8,r9
+#if 25>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 25==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r7,ror#25	@ Sigma1(e)
 	and	r2,r2,r7
+	str	r3,[sp,#9*4]
 	add	r3,r3,r0
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	add	r3,r3,r10
@@ -856,6 +1189,9 @@
 	eor	r10,r10,r11,ror#13
 	add	r3,r3,r12
 	eor	r10,r10,r11,ror#22		@ Sigma0(a)
+#if 25>=15
+	ldr	r1,[sp,#11*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r11,r4
 	and	r2,r11,r4
 	and	r0,r0,r5
@@ -863,26 +1199,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r6,r6,r3
 	add	r10,r10,r0
-	ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#11*4]		@ 26
 	ldr	r12,[sp,#8*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#10*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#3*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#3*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r6,ror#6
-	str	r3,[sp,#10*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r6,ror#11
 	eor	r2,r7,r8
+#if 26>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 26==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r6,ror#25	@ Sigma1(e)
 	and	r2,r2,r6
+	str	r3,[sp,#10*4]
 	add	r3,r3,r0
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	add	r3,r3,r9
@@ -891,6 +1235,9 @@
 	eor	r9,r9,r10,ror#13
 	add	r3,r3,r12
 	eor	r9,r9,r10,ror#22		@ Sigma0(a)
+#if 26>=15
+	ldr	r1,[sp,#12*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r10,r11
 	and	r2,r10,r11
 	and	r0,r0,r4
@@ -898,26 +1245,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r5,r5,r3
 	add	r9,r9,r0
-	ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#12*4]		@ 27
 	ldr	r12,[sp,#9*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#11*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#4*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#4*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r5,ror#6
-	str	r3,[sp,#11*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r5,ror#11
 	eor	r2,r6,r7
+#if 27>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 27==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r5,ror#25	@ Sigma1(e)
 	and	r2,r2,r5
+	str	r3,[sp,#11*4]
 	add	r3,r3,r0
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	add	r3,r3,r8
@@ -926,6 +1281,9 @@
 	eor	r8,r8,r9,ror#13
 	add	r3,r3,r12
 	eor	r8,r8,r9,ror#22		@ Sigma0(a)
+#if 27>=15
+	ldr	r1,[sp,#13*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r9,r10
 	and	r2,r9,r10
 	and	r0,r0,r11
@@ -933,26 +1291,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r4,r4,r3
 	add	r8,r8,r0
-	ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#13*4]		@ 28
 	ldr	r12,[sp,#10*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#12*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#5*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#5*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r4,ror#6
-	str	r3,[sp,#12*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r4,ror#11
 	eor	r2,r5,r6
+#if 28>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 28==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r4,ror#25	@ Sigma1(e)
 	and	r2,r2,r4
+	str	r3,[sp,#12*4]
 	add	r3,r3,r0
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	add	r3,r3,r7
@@ -961,6 +1327,9 @@
 	eor	r7,r7,r8,ror#13
 	add	r3,r3,r12
 	eor	r7,r7,r8,ror#22		@ Sigma0(a)
+#if 28>=15
+	ldr	r1,[sp,#14*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r8,r9
 	and	r2,r8,r9
 	and	r0,r0,r10
@@ -968,26 +1337,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r11,r11,r3
 	add	r7,r7,r0
-	ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#14*4]		@ 29
 	ldr	r12,[sp,#11*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#13*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#6*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#6*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r11,ror#6
-	str	r3,[sp,#13*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r11,ror#11
 	eor	r2,r4,r5
+#if 29>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 29==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r11,ror#25	@ Sigma1(e)
 	and	r2,r2,r11
+	str	r3,[sp,#13*4]
 	add	r3,r3,r0
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	add	r3,r3,r6
@@ -996,6 +1373,9 @@
 	eor	r6,r6,r7,ror#13
 	add	r3,r3,r12
 	eor	r6,r6,r7,ror#22		@ Sigma0(a)
+#if 29>=15
+	ldr	r1,[sp,#15*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r7,r8
 	and	r2,r7,r8
 	and	r0,r0,r9
@@ -1003,26 +1383,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r10,r10,r3
 	add	r6,r6,r0
-	ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#15*4]		@ 30
 	ldr	r12,[sp,#12*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#14*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#7*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#7*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r10,ror#6
-	str	r3,[sp,#14*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r10,ror#11
 	eor	r2,r11,r4
+#if 30>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 30==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r10,ror#25	@ Sigma1(e)
 	and	r2,r2,r10
+	str	r3,[sp,#14*4]
 	add	r3,r3,r0
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	add	r3,r3,r5
@@ -1031,6 +1419,9 @@
 	eor	r5,r5,r6,ror#13
 	add	r3,r3,r12
 	eor	r5,r5,r6,ror#22		@ Sigma0(a)
+#if 30>=15
+	ldr	r1,[sp,#0*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r6,r7
 	and	r2,r6,r7
 	and	r0,r0,r8
@@ -1038,26 +1429,34 @@
 	orr	r0,r0,r2			@ Maj(a,b,c)
 	add	r9,r9,r3
 	add	r5,r5,r0
-	ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#0*4]		@ 31
 	ldr	r12,[sp,#13*4]
+	mov	r0,r1,ror#7
 	ldr	r3,[sp,#15*4]
-	mov	r0,r2,ror#7
-	ldr	r1,[sp,#8*4]
-	eor	r0,r0,r2,ror#18
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	mov	r2,r12,ror#17
+	eor	r0,r0,r1,ror#18
+	ldr	r2,[sp,#8*4]
+	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1])
+	mov	r1,r12,ror#17
 	add	r3,r3,r0
-	eor	r2,r2,r12,ror#19
-	add	r3,r3,r1
-	eor	r2,r2,r12,lsr#10	@ sigma1(X[i+14])
+	eor	r1,r1,r12,ror#19
 	add	r3,r3,r2
-	ldr	r12,[r14],#4			@ *K256++
+	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14])
+	@ add	r3,r3,r1
 	mov	r0,r9,ror#6
-	str	r3,[sp,#15*4]
+	ldr	r12,[r14],#4			@ *K256++
 	eor	r0,r0,r9,ror#11
 	eor	r2,r10,r11
+#if 31>=16
+	add	r3,r3,r1			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r3,r3
+#endif
+#if 31==15
+	str	r1,[sp,#17*4]			@ leave room for r1
+#endif
 	eor	r0,r0,r9,ror#25	@ Sigma1(e)
 	and	r2,r2,r9
+	str	r3,[sp,#15*4]
 	add	r3,r3,r0
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	add	r3,r3,r4
@@ -1066,6 +1465,9 @@
 	eor	r4,r4,r5,ror#13
 	add	r3,r3,r12
 	eor	r4,r4,r5,ror#22		@ Sigma0(a)
+#if 31>=15
+	ldr	r1,[sp,#1*4]		@ from BODY_16_xx
+#endif
 	orr	r0,r5,r6
 	and	r2,r5,r6
 	and	r0,r0,r7
@@ -1102,10 +1504,14 @@
 	bne	.Loop
 
 	add	sp,sp,#19*4	@ destroy frame
-	ldmia	sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
 .size   sha256_block_data_order,.-sha256_block_data_order
 .asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <[email protected]>"
 .align	2

diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 3a35861..7faf37b 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl

@@ -18,22 +18,33 @@
 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
 # Cortex A8 core and ~40 cycles per processed byte.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Coxtex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 25.5 cycles or 47% faster than integer-only code.
+
 # Byte order [in]dependence. =========================================
 #
-# Caller is expected to maintain specific *dword* order in h[0-7],
-# namely with most significant dword at *lower* address, which is
-# reflected in below two parameters. *Byte* order within these dwords
-# in turn is whatever *native* byte order on current platform.
-$hi=0;
-$lo=4;
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
 # ====================================================================
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
-$ctx="r0";
+$ctx="r0";	# parameter block
 $inp="r1";
 $len="r2";
+
 $Tlo="r3";
 $Thi="r4";
 $Alo="r5";
@@ -61,15 +72,17 @@
 sub BODY_00_15() {
 my $magic = shift;
 $code.=<<___;
-	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
-	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
 	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
 	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 	mov	$t0,$Elo,lsr#14
+	str	$Tlo,[sp,#$Xoff+0]
 	mov	$t1,$Ehi,lsr#14
+	str	$Thi,[sp,#$Xoff+4]
 	eor	$t0,$t0,$Ehi,lsl#18
+	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
 	eor	$t1,$t1,$Elo,lsl#18
+	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
 	eor	$t0,$t0,$Elo,lsr#18
 	eor	$t1,$t1,$Ehi,lsr#18
 	eor	$t0,$t0,$Ehi,lsl#14
@@ -96,25 +109,24 @@
 	and	$t1,$t1,$Ehi
 	str	$Ahi,[sp,#$Aoff+4]
 	eor	$t0,$t0,$t2
-	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
+	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
 	eor	$t1,$t1,$t3		@ Ch(e,f,g)
-	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
+	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
 
 	adds	$Tlo,$Tlo,$t0
 	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
 	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
 	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 	adds	$Tlo,$Tlo,$t2
+	and	$t0,$t2,#0xff
 	adc	$Thi,$Thi,$t3		@ T += K[i]
 	adds	$Elo,$Elo,$Tlo
-	adc	$Ehi,$Ehi,$Thi		@ d += T
-
-	and	$t0,$t2,#0xff
-	teq	$t0,#$magic
-	orreq	$Ktbl,$Ktbl,#1
-
 	ldr	$t2,[sp,#$Boff+0]	@ b.lo
+	adc	$Ehi,$Ehi,$Thi		@ d += T
+	teq	$t0,#$magic
+
 	ldr	$t3,[sp,#$Coff+0]	@ c.lo
+	orreq	$Ktbl,$Ktbl,#1
 	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -131,80 +143,100 @@
 	eor	$t0,$t0,$Alo,lsl#25
 	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
 	adds	$Tlo,$Tlo,$t0
+	and	$t0,$Alo,$t2
 	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
 
-	and	$t0,$Alo,$t2
-	orr	$Alo,$Alo,$t2
 	ldr	$t1,[sp,#$Boff+4]	@ b.hi
+	orr	$Alo,$Alo,$t2
 	ldr	$t2,[sp,#$Coff+4]	@ c.hi
 	and	$Alo,$Alo,$t3
-	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
 	and	$t3,$Ahi,$t1
 	orr	$Ahi,$Ahi,$t1
+	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
 	and	$Ahi,$Ahi,$t2
-	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
 	adds	$Alo,$Alo,$Tlo
-	adc	$Ahi,$Ahi,$Thi		@ h += T
-
+	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
 	sub	sp,sp,#8
+	adc	$Ahi,$Ahi,$Thi		@ h += T
+	tst	$Ktbl,#1
 	add	$Ktbl,$Ktbl,#8
 ___
 }
 $code=<<___;
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
 .text
 .code	32
 .type	K512,%object
 .align	5
 K512:
-.word	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
 
 .global	sha512_block_data_order
 .type	sha512_block_data_order,%function
 sha512_block_data_order:
 	sub	r3,pc,#8		@ sha512_block_data_order
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+#if __ARM_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
 	stmdb	sp!,{r4-r12,lr}
-	sub	$Ktbl,r3,#640		@ K512
+	sub	$Ktbl,r3,#672		@ K512
 	sub	sp,sp,#9*8
 
 	ldr	$Elo,[$ctx,#$Eoff+$lo]
@@ -238,6 +270,7 @@
 	str	$Thi,[sp,#$Foff+4]
 
 .L00_15:
+#if __ARM_ARCH__<7
 	ldrb	$Tlo,[$inp,#7]
 	ldrb	$t0, [$inp,#6]
 	ldrb	$t1, [$inp,#5]
@@ -252,26 +285,30 @@
 	orr	$Thi,$Thi,$t3,lsl#8
 	orr	$Thi,$Thi,$t0,lsl#16
 	orr	$Thi,$Thi,$t1,lsl#24
-	str	$Tlo,[sp,#$Xoff+0]
-	str	$Thi,[sp,#$Xoff+4]
+#else
+	ldr	$Tlo,[$inp,#4]
+	ldr	$Thi,[$inp],#8
+#ifdef __ARMEL__
+	rev	$Tlo,$Tlo
+	rev	$Thi,$Thi
+#endif
+#endif
 ___
 	&BODY_00_15(0x94);
 $code.=<<___;
 	tst	$Ktbl,#1
 	beq	.L00_15
-	bic	$Ktbl,$Ktbl,#1
-
-.L16_79:
 	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
 	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
-	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
-	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
-
+	bic	$Ktbl,$Ktbl,#1
+.L16_79:
 	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
 	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
 	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
 	mov	$Tlo,$t0,lsr#1
+	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
 	mov	$Thi,$t1,lsr#1
+	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
 	eor	$Tlo,$Tlo,$t1,lsl#31
 	eor	$Thi,$Thi,$t0,lsl#31
 	eor	$Tlo,$Tlo,$t0,lsr#8
@@ -295,25 +332,24 @@
 	eor	$t1,$t1,$t3,lsl#3
 	eor	$t0,$t0,$t2,lsr#6
 	eor	$t1,$t1,$t3,lsr#6
+	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
 	eor	$t0,$t0,$t3,lsl#26
 
-	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
 	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
 	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#`$Xoff+8*16`+0]
 	adc	$Thi,$Thi,$t1
 
-	ldr	$t0,[sp,#`$Xoff+8*16`+0]
 	ldr	$t1,[sp,#`$Xoff+8*16`+4]
 	adds	$Tlo,$Tlo,$t2
 	adc	$Thi,$Thi,$t3
 	adds	$Tlo,$Tlo,$t0
 	adc	$Thi,$Thi,$t1
-	str	$Tlo,[sp,#$Xoff+0]
-	str	$Thi,[sp,#$Xoff+4]
 ___
 	&BODY_00_15(0x17);
 $code.=<<___;
-	tst	$Ktbl,#1
+	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
 	beq	.L16_79
 	bic	$Ktbl,$Ktbl,#1
 
@@ -324,12 +360,12 @@
 	ldr	$t2, [$ctx,#$Boff+$lo]
 	ldr	$t3, [$ctx,#$Boff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Aoff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Aoff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Boff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Boff+$hi]
 
 	ldr	$Alo,[sp,#$Coff+0]
@@ -341,12 +377,12 @@
 	ldr	$t2, [$ctx,#$Doff+$lo]
 	ldr	$t3, [$ctx,#$Doff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Coff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Coff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Doff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Doff+$hi]
 
 	ldr	$Tlo,[sp,#$Foff+0]
@@ -356,12 +392,12 @@
 	ldr	$t2, [$ctx,#$Foff+$lo]
 	ldr	$t3, [$ctx,#$Foff+$hi]
 	adds	$Elo,$Elo,$t0
-	adc	$Ehi,$Ehi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$Elo,[$ctx,#$Eoff+$lo]
+	adc	$Ehi,$Ehi,$t1
 	str	$Ehi,[$ctx,#$Eoff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Foff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Foff+$hi]
 
 	ldr	$Alo,[sp,#$Goff+0]
@@ -373,12 +409,12 @@
 	ldr	$t2, [$ctx,#$Hoff+$lo]
 	ldr	$t3, [$ctx,#$Hoff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Goff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Goff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Hoff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Hoff+$hi]
 
 	add	sp,sp,#640
@@ -388,13 +424,156 @@
 	bne	.Loop
 
 	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
-.size   sha512_block_data_order,.-sha512_block_data_order
-.asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+#endif
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
+
+$code.=<<___ if ($i<16 || $i&1);
+	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
+#if $i<16
+	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
+#endif
+	vshr.u64	$t1,$e,#@Sigma1[1]
+	vshr.u64	$t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
+	vsli.64		$t0,$e,#`64-@Sigma1[0]`
+	vsli.64		$t1,$e,#`64-@Sigma1[1]`
+	vsli.64		$t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+	vrev64.8	@X[$i],@X[$i]
+#endif
+	vadd.i64	$T1,$K,$h
+	veor		$Ch,$f,$g
+	veor		$t0,$t1
+	vand		$Ch,$e
+	veor		$t0,$t2			@ Sigma1(e)
+	veor		$Ch,$g			@ Ch(e,f,g)
+	vadd.i64	$T1,$t0
+	vshr.u64	$t0,$a,#@Sigma0[0]
+	vadd.i64	$T1,$Ch
+	vshr.u64	$t1,$a,#@Sigma0[1]
+	vshr.u64	$t2,$a,#@Sigma0[2]
+	vsli.64		$t0,$a,#`64-@Sigma0[0]`
+	vsli.64		$t1,$a,#`64-@Sigma0[1]`
+	vsli.64		$t2,$a,#`64-@Sigma0[2]`
+	vadd.i64	$T1,@X[$i%16]
+	vorr		$Maj,$a,$c
+	vand		$Ch,$a,$c
+	veor		$h,$t0,$t1
+	vand		$Maj,$b
+	veor		$h,$t2			@ Sigma0(a)
+	vorr		$Maj,$Ch		@ Maj(a,b,c)
+	vadd.i64	$h,$T1
+	vadd.i64	$d,$T1
+	vadd.i64	$h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)	{ &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));			# view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
+my $e=@_[4];					# $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
+	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
+	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
+	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
+	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
+	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
+	veor		$s1,$t0
+	vshr.u64	$t0,$s0,#@sigma0[0]
+	veor		$s1,$t1				@ sigma1(X[i+14])
+	vshr.u64	$t1,$s0,#@sigma0[1]
+	vadd.i64	@X[$i%8],$s1
+	vshr.u64	$s1,$s0,#@sigma0[2]
+	vsli.64		$t0,$s0,#`64-@sigma0[0]`
+	vsli.64		$t1,$s0,#`64-@sigma0[1]`
+	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
+	veor		$s1,$t0
+	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s0
+	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
+	veor		$s1,$t1				@ sigma0(X[i+1])
+	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s1
+___
+	&NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.align	4
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+	sub	$Ktbl,r3,#672		@ K512
+	vldmia	$ctx,{$A-$H}		@ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mov		$cnt,#4
+.L16_79_neon:
+	subs		$cnt,#1
+___
+for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bne		.L16_79_neon
+
+	vldmia		$ctx,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		$ctx,{$A-$H}	@ save context
+	teq		$inp,$len
+	sub		$Ktbl,#640	@ rewind K512
+	bne		.Loop_neon
+
+	vldmia	sp!,{d8-d15}		@ epilogue
+	bx	lr
+#endif
+___
+}
+$code.=<<___;
+.size	sha512_block_data_order,.-sha512_block_data_order
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
+.comm	OPENSSL_armcap_P,4,4
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/crypto/sha/asm/sha512-armv4.s b/crypto/sha/asm/sha512-armv4.s
index b030c16..5730192 100644
--- a/crypto/sha/asm/sha512-armv4.s
+++ b/crypto/sha/asm/sha512-armv4.s

@@ -1,90 +1,111 @@
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
 .text
 .code	32
 .type	K512,%object
 .align	5
 K512:
-.word	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
 
 .global	sha512_block_data_order
 .type	sha512_block_data_order,%function
 sha512_block_data_order:
 	sub	r3,pc,#8		@ sha512_block_data_order
 	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+#if __ARM_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
 	stmdb	sp!,{r4-r12,lr}
-	sub	r14,r3,#640		@ K512
+	sub	r14,r3,#672		@ K512
 	sub	sp,sp,#9*8
 
-	ldr	r7,[r0,#32+4]
-	ldr	r8,[r0,#32+0]
-	ldr	r9, [r0,#48+4]
-	ldr	r10, [r0,#48+0]
-	ldr	r11, [r0,#56+4]
-	ldr	r12, [r0,#56+0]
+	ldr	r7,[r0,#32+LO]
+	ldr	r8,[r0,#32+HI]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
 .Loop:
 	str	r9, [sp,#48+0]
 	str	r10, [sp,#48+4]
 	str	r11, [sp,#56+0]
 	str	r12, [sp,#56+4]
-	ldr	r5,[r0,#0+4]
-	ldr	r6,[r0,#0+0]
-	ldr	r3,[r0,#8+4]
-	ldr	r4,[r0,#8+0]
-	ldr	r9, [r0,#16+4]
-	ldr	r10, [r0,#16+0]
-	ldr	r11, [r0,#24+4]
-	ldr	r12, [r0,#24+0]
+	ldr	r5,[r0,#0+LO]
+	ldr	r6,[r0,#0+HI]
+	ldr	r3,[r0,#8+LO]
+	ldr	r4,[r0,#8+HI]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
 	str	r3,[sp,#8+0]
 	str	r4,[sp,#8+4]
 	str	r9, [sp,#16+0]
 	str	r10, [sp,#16+4]
 	str	r11, [sp,#24+0]
 	str	r12, [sp,#24+4]
-	ldr	r3,[r0,#40+4]
-	ldr	r4,[r0,#40+0]
+	ldr	r3,[r0,#40+LO]
+	ldr	r4,[r0,#40+HI]
 	str	r3,[sp,#40+0]
 	str	r4,[sp,#40+4]
 
 .L00_15:
+#if __ARM_ARCH__<7
 	ldrb	r3,[r1,#7]
 	ldrb	r9, [r1,#6]
 	ldrb	r10, [r1,#5]
@@ -99,17 +120,25 @@
 	orr	r4,r4,r12,lsl#8
 	orr	r4,r4,r9,lsl#16
 	orr	r4,r4,r10,lsl#24
-	str	r3,[sp,#64+0]
-	str	r4,[sp,#64+4]
-	ldr	r11,[sp,#56+0]	@ h.lo
-	ldr	r12,[sp,#56+4]	@ h.hi
+#else
+	ldr	r3,[r1,#4]
+	ldr	r4,[r1],#8
+#ifdef __ARMEL__
+	rev	r3,r3
+	rev	r4,r4
+#endif
+#endif
 	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
 	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
 	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
 	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
 	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
 	eor	r9,r9,r7,lsr#18
 	eor	r10,r10,r8,lsr#18
 	eor	r9,r9,r8,lsl#14
@@ -136,25 +165,24 @@
 	and	r10,r10,r8
 	str	r6,[sp,#0+4]
 	eor	r9,r9,r11
-	ldr	r11,[r14,#4]		@ K[i].lo
+	ldr	r11,[r14,#LO]	@ K[i].lo
 	eor	r10,r10,r12		@ Ch(e,f,g)
-	ldr	r12,[r14,#0]		@ K[i].hi
+	ldr	r12,[r14,#HI]	@ K[i].hi
 
 	adds	r3,r3,r9
 	ldr	r7,[sp,#24+0]	@ d.lo
 	adc	r4,r4,r10		@ T += Ch(e,f,g)
 	ldr	r8,[sp,#24+4]	@ d.hi
 	adds	r3,r3,r11
+	and	r9,r11,#0xff
 	adc	r4,r4,r12		@ T += K[i]
 	adds	r7,r7,r3
-	adc	r8,r8,r4		@ d += T
-
-	and	r9,r11,#0xff
-	teq	r9,#148
-	orreq	r14,r14,#1
-
 	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#148
+
 	ldr	r12,[sp,#16+0]	@ c.lo
+	orreq	r14,r14,#1
 	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -171,38 +199,36 @@
 	eor	r9,r9,r5,lsl#25
 	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
 	adds	r3,r3,r9
+	and	r9,r5,r11
 	adc	r4,r4,r10		@ T += Sigma0(a)
 
-	and	r9,r5,r11
-	orr	r5,r5,r11
 	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
 	ldr	r11,[sp,#16+4]	@ c.hi
 	and	r5,r5,r12
-	orr	r5,r5,r9		@ Maj(a,b,c).lo
 	and	r12,r6,r10
 	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
 	and	r6,r6,r11
-	orr	r6,r6,r12		@ Maj(a,b,c).hi
 	adds	r5,r5,r3
-	adc	r6,r6,r4		@ h += T
-
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
 	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
 	add	r14,r14,#8
 	tst	r14,#1
 	beq	.L00_15
-	bic	r14,r14,#1
-
-.L16_79:
 	ldr	r9,[sp,#184+0]
 	ldr	r10,[sp,#184+4]
-	ldr	r11,[sp,#80+0]
-	ldr	r12,[sp,#80+4]
-
+	bic	r14,r14,#1
+.L16_79:
 	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
 	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
 	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
 	mov	r3,r9,lsr#1
+	ldr	r11,[sp,#80+0]
 	mov	r4,r10,lsr#1
+	ldr	r12,[sp,#80+4]
 	eor	r3,r3,r10,lsl#31
 	eor	r4,r4,r9,lsl#31
 	eor	r3,r3,r9,lsr#8
@@ -226,30 +252,30 @@
 	eor	r10,r10,r12,lsl#3
 	eor	r9,r9,r11,lsr#6
 	eor	r10,r10,r12,lsr#6
+	ldr	r11,[sp,#120+0]
 	eor	r9,r9,r12,lsl#26
 
-	ldr	r11,[sp,#120+0]
 	ldr	r12,[sp,#120+4]
 	adds	r3,r3,r9
+	ldr	r9,[sp,#192+0]
 	adc	r4,r4,r10
 
-	ldr	r9,[sp,#192+0]
 	ldr	r10,[sp,#192+4]
 	adds	r3,r3,r11
 	adc	r4,r4,r12
 	adds	r3,r3,r9
 	adc	r4,r4,r10
-	str	r3,[sp,#64+0]
-	str	r4,[sp,#64+4]
-	ldr	r11,[sp,#56+0]	@ h.lo
-	ldr	r12,[sp,#56+4]	@ h.hi
 	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
 	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
 	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
 	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
 	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
 	eor	r9,r9,r7,lsr#18
 	eor	r10,r10,r8,lsr#18
 	eor	r9,r9,r8,lsl#14
@@ -276,25 +302,24 @@
 	and	r10,r10,r8
 	str	r6,[sp,#0+4]
 	eor	r9,r9,r11
-	ldr	r11,[r14,#4]		@ K[i].lo
+	ldr	r11,[r14,#LO]	@ K[i].lo
 	eor	r10,r10,r12		@ Ch(e,f,g)
-	ldr	r12,[r14,#0]		@ K[i].hi
+	ldr	r12,[r14,#HI]	@ K[i].hi
 
 	adds	r3,r3,r9
 	ldr	r7,[sp,#24+0]	@ d.lo
 	adc	r4,r4,r10		@ T += Ch(e,f,g)
 	ldr	r8,[sp,#24+4]	@ d.hi
 	adds	r3,r3,r11
+	and	r9,r11,#0xff
 	adc	r4,r4,r12		@ T += K[i]
 	adds	r7,r7,r3
-	adc	r8,r8,r4		@ d += T
-
-	and	r9,r11,#0xff
-	teq	r9,#23
-	orreq	r14,r14,#1
-
 	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#23
+
 	ldr	r12,[sp,#16+0]	@ c.lo
+	orreq	r14,r14,#1
 	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -311,90 +336,91 @@
 	eor	r9,r9,r5,lsl#25
 	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
 	adds	r3,r3,r9
+	and	r9,r5,r11
 	adc	r4,r4,r10		@ T += Sigma0(a)
 
-	and	r9,r5,r11
-	orr	r5,r5,r11
 	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
 	ldr	r11,[sp,#16+4]	@ c.hi
 	and	r5,r5,r12
-	orr	r5,r5,r9		@ Maj(a,b,c).lo
 	and	r12,r6,r10
 	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
 	and	r6,r6,r11
-	orr	r6,r6,r12		@ Maj(a,b,c).hi
 	adds	r5,r5,r3
-	adc	r6,r6,r4		@ h += T
-
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
 	sub	sp,sp,#8
-	add	r14,r14,#8
+	adc	r6,r6,r4		@ h += T
 	tst	r14,#1
+	add	r14,r14,#8
+	ldreq	r9,[sp,#184+0]
+	ldreq	r10,[sp,#184+4]
 	beq	.L16_79
 	bic	r14,r14,#1
 
 	ldr	r3,[sp,#8+0]
 	ldr	r4,[sp,#8+4]
-	ldr	r9, [r0,#0+4]
-	ldr	r10, [r0,#0+0]
-	ldr	r11, [r0,#8+4]
-	ldr	r12, [r0,#8+0]
+	ldr	r9, [r0,#0+LO]
+	ldr	r10, [r0,#0+HI]
+	ldr	r11, [r0,#8+LO]
+	ldr	r12, [r0,#8+HI]
 	adds	r9,r5,r9
+	str	r9, [r0,#0+LO]
 	adc	r10,r6,r10
+	str	r10, [r0,#0+HI]
 	adds	r11,r3,r11
+	str	r11, [r0,#8+LO]
 	adc	r12,r4,r12
-	str	r9, [r0,#0+4]
-	str	r10, [r0,#0+0]
-	str	r11, [r0,#8+4]
-	str	r12, [r0,#8+0]
+	str	r12, [r0,#8+HI]
 
 	ldr	r5,[sp,#16+0]
 	ldr	r6,[sp,#16+4]
 	ldr	r3,[sp,#24+0]
 	ldr	r4,[sp,#24+4]
-	ldr	r9, [r0,#16+4]
-	ldr	r10, [r0,#16+0]
-	ldr	r11, [r0,#24+4]
-	ldr	r12, [r0,#24+0]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
 	adds	r9,r5,r9
+	str	r9, [r0,#16+LO]
 	adc	r10,r6,r10
+	str	r10, [r0,#16+HI]
 	adds	r11,r3,r11
+	str	r11, [r0,#24+LO]
 	adc	r12,r4,r12
-	str	r9, [r0,#16+4]
-	str	r10, [r0,#16+0]
-	str	r11, [r0,#24+4]
-	str	r12, [r0,#24+0]
+	str	r12, [r0,#24+HI]
 
 	ldr	r3,[sp,#40+0]
 	ldr	r4,[sp,#40+4]
-	ldr	r9, [r0,#32+4]
-	ldr	r10, [r0,#32+0]
-	ldr	r11, [r0,#40+4]
-	ldr	r12, [r0,#40+0]
+	ldr	r9, [r0,#32+LO]
+	ldr	r10, [r0,#32+HI]
+	ldr	r11, [r0,#40+LO]
+	ldr	r12, [r0,#40+HI]
 	adds	r7,r7,r9
+	str	r7,[r0,#32+LO]
 	adc	r8,r8,r10
+	str	r8,[r0,#32+HI]
 	adds	r11,r3,r11
+	str	r11, [r0,#40+LO]
 	adc	r12,r4,r12
-	str	r7,[r0,#32+4]
-	str	r8,[r0,#32+0]
-	str	r11, [r0,#40+4]
-	str	r12, [r0,#40+0]
+	str	r12, [r0,#40+HI]
 
 	ldr	r5,[sp,#48+0]
 	ldr	r6,[sp,#48+4]
 	ldr	r3,[sp,#56+0]
 	ldr	r4,[sp,#56+4]
-	ldr	r9, [r0,#48+4]
-	ldr	r10, [r0,#48+0]
-	ldr	r11, [r0,#56+4]
-	ldr	r12, [r0,#56+0]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
 	adds	r9,r5,r9
+	str	r9, [r0,#48+LO]
 	adc	r10,r6,r10
+	str	r10, [r0,#48+HI]
 	adds	r11,r3,r11
+	str	r11, [r0,#56+LO]
 	adc	r12,r4,r12
-	str	r9, [r0,#48+4]
-	str	r10, [r0,#48+0]
-	str	r11, [r0,#56+4]
-	str	r12, [r0,#56+0]
+	str	r12, [r0,#56+HI]
 
 	add	sp,sp,#640
 	sub	r14,r14,#640
@@ -403,10 +429,1355 @@
 	bne	.Loop
 
 	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-.size   sha512_block_data_order,.-sha512_block_data_order
-.asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <[email protected]>"
+#endif
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.align	4
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+	sub	r3,r3,#672		@ K512
+	vldmia	r0,{d16-d23}		@ load context
+.Loop_neon:
+	vshr.u64	d24,d20,#14	@ 0
+#if 0<16
+	vld1.64		{d0},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+	vshr.u64	d26,d20,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vsli.64		d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+	vrev64.8	d0,d0
+#endif
+	vadd.i64	d27,d28,d23
+	veor		d29,d21,d22
+	veor		d24,d25
+	vand		d29,d20
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d22			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d16,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d16,#34
+	vshr.u64	d26,d16,#39
+	vsli.64		d24,d16,#36
+	vsli.64		d25,d16,#30
+	vsli.64		d26,d16,#25
+	vadd.i64	d27,d0
+	vorr		d30,d16,d18
+	vand		d29,d16,d18
+	veor		d23,d24,d25
+	vand		d30,d17
+	veor		d23,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d23,d27
+	vadd.i64	d19,d27
+	vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 1
+#if 1<16
+	vld1.64		{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vsli.64		d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+	vrev64.8	d1,d1
+#endif
+	vadd.i64	d27,d28,d22
+	veor		d29,d20,d21
+	veor		d24,d25
+	vand		d29,d19
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d21			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d23,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d23,#34
+	vshr.u64	d26,d23,#39
+	vsli.64		d24,d23,#36
+	vsli.64		d25,d23,#30
+	vsli.64		d26,d23,#25
+	vadd.i64	d27,d1
+	vorr		d30,d23,d17
+	vand		d29,d23,d17
+	veor		d22,d24,d25
+	vand		d30,d16
+	veor		d22,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d22,d27
+	vadd.i64	d18,d27
+	vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 2
+#if 2<16
+	vld1.64		{d2},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+	vshr.u64	d26,d18,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vsli.64		d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+	vrev64.8	d2,d2
+#endif
+	vadd.i64	d27,d28,d21
+	veor		d29,d19,d20
+	veor		d24,d25
+	vand		d29,d18
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d20			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d22,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d22,#34
+	vshr.u64	d26,d22,#39
+	vsli.64		d24,d22,#36
+	vsli.64		d25,d22,#30
+	vsli.64		d26,d22,#25
+	vadd.i64	d27,d2
+	vorr		d30,d22,d16
+	vand		d29,d22,d16
+	veor		d21,d24,d25
+	vand		d30,d23
+	veor		d21,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d21,d27
+	vadd.i64	d17,d27
+	vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 3
+#if 3<16
+	vld1.64		{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vsli.64		d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+	vrev64.8	d3,d3
+#endif
+	vadd.i64	d27,d28,d20
+	veor		d29,d18,d19
+	veor		d24,d25
+	vand		d29,d17
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d19			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d21,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d21,#34
+	vshr.u64	d26,d21,#39
+	vsli.64		d24,d21,#36
+	vsli.64		d25,d21,#30
+	vsli.64		d26,d21,#25
+	vadd.i64	d27,d3
+	vorr		d30,d21,d23
+	vand		d29,d21,d23
+	veor		d20,d24,d25
+	vand		d30,d22
+	veor		d20,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d20,d27
+	vadd.i64	d16,d27
+	vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 4
+#if 4<16
+	vld1.64		{d4},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+	vshr.u64	d26,d16,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vsli.64		d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+	vrev64.8	d4,d4
+#endif
+	vadd.i64	d27,d28,d19
+	veor		d29,d17,d18
+	veor		d24,d25
+	vand		d29,d16
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d18			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d20,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d20,#34
+	vshr.u64	d26,d20,#39
+	vsli.64		d24,d20,#36
+	vsli.64		d25,d20,#30
+	vsli.64		d26,d20,#25
+	vadd.i64	d27,d4
+	vorr		d30,d20,d22
+	vand		d29,d20,d22
+	veor		d19,d24,d25
+	vand		d30,d21
+	veor		d19,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d19,d27
+	vadd.i64	d23,d27
+	vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 5
+#if 5<16
+	vld1.64		{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vsli.64		d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+	vrev64.8	d5,d5
+#endif
+	vadd.i64	d27,d28,d18
+	veor		d29,d16,d17
+	veor		d24,d25
+	vand		d29,d23
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d17			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d19,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d19,#34
+	vshr.u64	d26,d19,#39
+	vsli.64		d24,d19,#36
+	vsli.64		d25,d19,#30
+	vsli.64		d26,d19,#25
+	vadd.i64	d27,d5
+	vorr		d30,d19,d21
+	vand		d29,d19,d21
+	veor		d18,d24,d25
+	vand		d30,d20
+	veor		d18,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d18,d27
+	vadd.i64	d22,d27
+	vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 6
+#if 6<16
+	vld1.64		{d6},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+	vshr.u64	d26,d22,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vsli.64		d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+	vrev64.8	d6,d6
+#endif
+	vadd.i64	d27,d28,d17
+	veor		d29,d23,d16
+	veor		d24,d25
+	vand		d29,d22
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d16			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d18,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d18,#34
+	vshr.u64	d26,d18,#39
+	vsli.64		d24,d18,#36
+	vsli.64		d25,d18,#30
+	vsli.64		d26,d18,#25
+	vadd.i64	d27,d6
+	vorr		d30,d18,d20
+	vand		d29,d18,d20
+	veor		d17,d24,d25
+	vand		d30,d19
+	veor		d17,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d17,d27
+	vadd.i64	d21,d27
+	vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 7
+#if 7<16
+	vld1.64		{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vsli.64		d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+	vrev64.8	d7,d7
+#endif
+	vadd.i64	d27,d28,d16
+	veor		d29,d22,d23
+	veor		d24,d25
+	vand		d29,d21
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d23			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d17,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d17,#34
+	vshr.u64	d26,d17,#39
+	vsli.64		d24,d17,#36
+	vsli.64		d25,d17,#30
+	vsli.64		d26,d17,#25
+	vadd.i64	d27,d7
+	vorr		d30,d17,d19
+	vand		d29,d17,d19
+	veor		d16,d24,d25
+	vand		d30,d18
+	veor		d16,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d16,d27
+	vadd.i64	d20,d27
+	vadd.i64	d16,d30
+	vshr.u64	d24,d20,#14	@ 8
+#if 8<16
+	vld1.64		{d8},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+	vshr.u64	d26,d20,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vsli.64		d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+	vrev64.8	d8,d8
+#endif
+	vadd.i64	d27,d28,d23
+	veor		d29,d21,d22
+	veor		d24,d25
+	vand		d29,d20
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d22			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d16,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d16,#34
+	vshr.u64	d26,d16,#39
+	vsli.64		d24,d16,#36
+	vsli.64		d25,d16,#30
+	vsli.64		d26,d16,#25
+	vadd.i64	d27,d8
+	vorr		d30,d16,d18
+	vand		d29,d16,d18
+	veor		d23,d24,d25
+	vand		d30,d17
+	veor		d23,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d23,d27
+	vadd.i64	d19,d27
+	vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 9
+#if 9<16
+	vld1.64		{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vsli.64		d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+	vrev64.8	d9,d9
+#endif
+	vadd.i64	d27,d28,d22
+	veor		d29,d20,d21
+	veor		d24,d25
+	vand		d29,d19
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d21			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d23,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d23,#34
+	vshr.u64	d26,d23,#39
+	vsli.64		d24,d23,#36
+	vsli.64		d25,d23,#30
+	vsli.64		d26,d23,#25
+	vadd.i64	d27,d9
+	vorr		d30,d23,d17
+	vand		d29,d23,d17
+	veor		d22,d24,d25
+	vand		d30,d16
+	veor		d22,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d22,d27
+	vadd.i64	d18,d27
+	vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 10
+#if 10<16
+	vld1.64		{d10},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+	vshr.u64	d26,d18,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vsli.64		d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+	vrev64.8	d10,d10
+#endif
+	vadd.i64	d27,d28,d21
+	veor		d29,d19,d20
+	veor		d24,d25
+	vand		d29,d18
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d20			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d22,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d22,#34
+	vshr.u64	d26,d22,#39
+	vsli.64		d24,d22,#36
+	vsli.64		d25,d22,#30
+	vsli.64		d26,d22,#25
+	vadd.i64	d27,d10
+	vorr		d30,d22,d16
+	vand		d29,d22,d16
+	veor		d21,d24,d25
+	vand		d30,d23
+	veor		d21,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d21,d27
+	vadd.i64	d17,d27
+	vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 11
+#if 11<16
+	vld1.64		{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vsli.64		d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+	vrev64.8	d11,d11
+#endif
+	vadd.i64	d27,d28,d20
+	veor		d29,d18,d19
+	veor		d24,d25
+	vand		d29,d17
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d19			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d21,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d21,#34
+	vshr.u64	d26,d21,#39
+	vsli.64		d24,d21,#36
+	vsli.64		d25,d21,#30
+	vsli.64		d26,d21,#25
+	vadd.i64	d27,d11
+	vorr		d30,d21,d23
+	vand		d29,d21,d23
+	veor		d20,d24,d25
+	vand		d30,d22
+	veor		d20,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d20,d27
+	vadd.i64	d16,d27
+	vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 12
+#if 12<16
+	vld1.64		{d12},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+	vshr.u64	d26,d16,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vsli.64		d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+	vrev64.8	d12,d12
+#endif
+	vadd.i64	d27,d28,d19
+	veor		d29,d17,d18
+	veor		d24,d25
+	vand		d29,d16
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d18			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d20,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d20,#34
+	vshr.u64	d26,d20,#39
+	vsli.64		d24,d20,#36
+	vsli.64		d25,d20,#30
+	vsli.64		d26,d20,#25
+	vadd.i64	d27,d12
+	vorr		d30,d20,d22
+	vand		d29,d20,d22
+	veor		d19,d24,d25
+	vand		d30,d21
+	veor		d19,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d19,d27
+	vadd.i64	d23,d27
+	vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 13
+#if 13<16
+	vld1.64		{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vsli.64		d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+	vrev64.8	d13,d13
+#endif
+	vadd.i64	d27,d28,d18
+	veor		d29,d16,d17
+	veor		d24,d25
+	vand		d29,d23
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d17			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d19,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d19,#34
+	vshr.u64	d26,d19,#39
+	vsli.64		d24,d19,#36
+	vsli.64		d25,d19,#30
+	vsli.64		d26,d19,#25
+	vadd.i64	d27,d13
+	vorr		d30,d19,d21
+	vand		d29,d19,d21
+	veor		d18,d24,d25
+	vand		d30,d20
+	veor		d18,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d18,d27
+	vadd.i64	d22,d27
+	vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 14
+#if 14<16
+	vld1.64		{d14},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+	vshr.u64	d26,d22,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vsli.64		d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+	vrev64.8	d14,d14
+#endif
+	vadd.i64	d27,d28,d17
+	veor		d29,d23,d16
+	veor		d24,d25
+	vand		d29,d22
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d16			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d18,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d18,#34
+	vshr.u64	d26,d18,#39
+	vsli.64		d24,d18,#36
+	vsli.64		d25,d18,#30
+	vsli.64		d26,d18,#25
+	vadd.i64	d27,d14
+	vorr		d30,d18,d20
+	vand		d29,d18,d20
+	veor		d17,d24,d25
+	vand		d30,d19
+	veor		d17,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d17,d27
+	vadd.i64	d21,d27
+	vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 15
+#if 15<16
+	vld1.64		{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vsli.64		d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+	vrev64.8	d15,d15
+#endif
+	vadd.i64	d27,d28,d16
+	veor		d29,d22,d23
+	veor		d24,d25
+	vand		d29,d21
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d23			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d17,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d17,#34
+	vshr.u64	d26,d17,#39
+	vsli.64		d24,d17,#36
+	vsli.64		d25,d17,#30
+	vsli.64		d26,d17,#25
+	vadd.i64	d27,d15
+	vorr		d30,d17,d19
+	vand		d29,d17,d19
+	veor		d16,d24,d25
+	vand		d30,d18
+	veor		d16,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d16,d27
+	vadd.i64	d20,d27
+	vadd.i64	d16,d30
+	mov		r12,#4
+.L16_79_neon:
+	subs		r12,#1
+	vshr.u64	q12,q7,#19
+	vshr.u64	q13,q7,#61
+	vshr.u64	q15,q7,#6
+	vsli.64		q12,q7,#45
+	vext.8		q14,q0,q1,#8	@ X[i+1]
+	vsli.64		q13,q7,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q0,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q4,q5,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q0,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q0,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vsli.64		d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d23
+	veor		d29,d21,d22
+	veor		d24,d25
+	vand		d29,d20
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d22			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d16,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d16,#34
+	vshr.u64	d26,d16,#39
+	vsli.64		d24,d16,#36
+	vsli.64		d25,d16,#30
+	vsli.64		d26,d16,#25
+	vadd.i64	d27,d0
+	vorr		d30,d16,d18
+	vand		d29,d16,d18
+	veor		d23,d24,d25
+	vand		d30,d17
+	veor		d23,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d23,d27
+	vadd.i64	d19,d27
+	vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 17
+#if 17<16
+	vld1.64		{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vsli.64		d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d22
+	veor		d29,d20,d21
+	veor		d24,d25
+	vand		d29,d19
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d21			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d23,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d23,#34
+	vshr.u64	d26,d23,#39
+	vsli.64		d24,d23,#36
+	vsli.64		d25,d23,#30
+	vsli.64		d26,d23,#25
+	vadd.i64	d27,d1
+	vorr		d30,d23,d17
+	vand		d29,d23,d17
+	veor		d22,d24,d25
+	vand		d30,d16
+	veor		d22,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d22,d27
+	vadd.i64	d18,d27
+	vadd.i64	d22,d30
+	vshr.u64	q12,q0,#19
+	vshr.u64	q13,q0,#61
+	vshr.u64	q15,q0,#6
+	vsli.64		q12,q0,#45
+	vext.8		q14,q1,q2,#8	@ X[i+1]
+	vsli.64		q13,q0,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q1,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q5,q6,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q1,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q1,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vsli.64		d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d21
+	veor		d29,d19,d20
+	veor		d24,d25
+	vand		d29,d18
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d20			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d22,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d22,#34
+	vshr.u64	d26,d22,#39
+	vsli.64		d24,d22,#36
+	vsli.64		d25,d22,#30
+	vsli.64		d26,d22,#25
+	vadd.i64	d27,d2
+	vorr		d30,d22,d16
+	vand		d29,d22,d16
+	veor		d21,d24,d25
+	vand		d30,d23
+	veor		d21,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d21,d27
+	vadd.i64	d17,d27
+	vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 19
+#if 19<16
+	vld1.64		{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vsli.64		d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d20
+	veor		d29,d18,d19
+	veor		d24,d25
+	vand		d29,d17
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d19			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d21,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d21,#34
+	vshr.u64	d26,d21,#39
+	vsli.64		d24,d21,#36
+	vsli.64		d25,d21,#30
+	vsli.64		d26,d21,#25
+	vadd.i64	d27,d3
+	vorr		d30,d21,d23
+	vand		d29,d21,d23
+	veor		d20,d24,d25
+	vand		d30,d22
+	veor		d20,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d20,d27
+	vadd.i64	d16,d27
+	vadd.i64	d20,d30
+	vshr.u64	q12,q1,#19
+	vshr.u64	q13,q1,#61
+	vshr.u64	q15,q1,#6
+	vsli.64		q12,q1,#45
+	vext.8		q14,q2,q3,#8	@ X[i+1]
+	vsli.64		q13,q1,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q2,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q6,q7,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q2,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q2,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vsli.64		d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d19
+	veor		d29,d17,d18
+	veor		d24,d25
+	vand		d29,d16
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d18			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d20,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d20,#34
+	vshr.u64	d26,d20,#39
+	vsli.64		d24,d20,#36
+	vsli.64		d25,d20,#30
+	vsli.64		d26,d20,#25
+	vadd.i64	d27,d4
+	vorr		d30,d20,d22
+	vand		d29,d20,d22
+	veor		d19,d24,d25
+	vand		d30,d21
+	veor		d19,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d19,d27
+	vadd.i64	d23,d27
+	vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 21
+#if 21<16
+	vld1.64		{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vsli.64		d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d18
+	veor		d29,d16,d17
+	veor		d24,d25
+	vand		d29,d23
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d17			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d19,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d19,#34
+	vshr.u64	d26,d19,#39
+	vsli.64		d24,d19,#36
+	vsli.64		d25,d19,#30
+	vsli.64		d26,d19,#25
+	vadd.i64	d27,d5
+	vorr		d30,d19,d21
+	vand		d29,d19,d21
+	veor		d18,d24,d25
+	vand		d30,d20
+	veor		d18,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d18,d27
+	vadd.i64	d22,d27
+	vadd.i64	d18,d30
+	vshr.u64	q12,q2,#19
+	vshr.u64	q13,q2,#61
+	vshr.u64	q15,q2,#6
+	vsli.64		q12,q2,#45
+	vext.8		q14,q3,q4,#8	@ X[i+1]
+	vsli.64		q13,q2,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q3,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q7,q0,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q3,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q3,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vsli.64		d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d17
+	veor		d29,d23,d16
+	veor		d24,d25
+	vand		d29,d22
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d16			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d18,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d18,#34
+	vshr.u64	d26,d18,#39
+	vsli.64		d24,d18,#36
+	vsli.64		d25,d18,#30
+	vsli.64		d26,d18,#25
+	vadd.i64	d27,d6
+	vorr		d30,d18,d20
+	vand		d29,d18,d20
+	veor		d17,d24,d25
+	vand		d30,d19
+	veor		d17,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d17,d27
+	vadd.i64	d21,d27
+	vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 23
+#if 23<16
+	vld1.64		{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vsli.64		d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d16
+	veor		d29,d22,d23
+	veor		d24,d25
+	vand		d29,d21
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d23			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d17,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d17,#34
+	vshr.u64	d26,d17,#39
+	vsli.64		d24,d17,#36
+	vsli.64		d25,d17,#30
+	vsli.64		d26,d17,#25
+	vadd.i64	d27,d7
+	vorr		d30,d17,d19
+	vand		d29,d17,d19
+	veor		d16,d24,d25
+	vand		d30,d18
+	veor		d16,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d16,d27
+	vadd.i64	d20,d27
+	vadd.i64	d16,d30
+	vshr.u64	q12,q3,#19
+	vshr.u64	q13,q3,#61
+	vshr.u64	q15,q3,#6
+	vsli.64		q12,q3,#45
+	vext.8		q14,q4,q5,#8	@ X[i+1]
+	vsli.64		q13,q3,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q4,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q0,q1,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q4,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q4,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vsli.64		d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d23
+	veor		d29,d21,d22
+	veor		d24,d25
+	vand		d29,d20
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d22			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d16,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d16,#34
+	vshr.u64	d26,d16,#39
+	vsli.64		d24,d16,#36
+	vsli.64		d25,d16,#30
+	vsli.64		d26,d16,#25
+	vadd.i64	d27,d8
+	vorr		d30,d16,d18
+	vand		d29,d16,d18
+	veor		d23,d24,d25
+	vand		d30,d17
+	veor		d23,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d23,d27
+	vadd.i64	d19,d27
+	vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 25
+#if 25<16
+	vld1.64		{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vsli.64		d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d22
+	veor		d29,d20,d21
+	veor		d24,d25
+	vand		d29,d19
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d21			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d23,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d23,#34
+	vshr.u64	d26,d23,#39
+	vsli.64		d24,d23,#36
+	vsli.64		d25,d23,#30
+	vsli.64		d26,d23,#25
+	vadd.i64	d27,d9
+	vorr		d30,d23,d17
+	vand		d29,d23,d17
+	veor		d22,d24,d25
+	vand		d30,d16
+	veor		d22,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d22,d27
+	vadd.i64	d18,d27
+	vadd.i64	d22,d30
+	vshr.u64	q12,q4,#19
+	vshr.u64	q13,q4,#61
+	vshr.u64	q15,q4,#6
+	vsli.64		q12,q4,#45
+	vext.8		q14,q5,q6,#8	@ X[i+1]
+	vsli.64		q13,q4,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q5,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q1,q2,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q5,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q5,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vsli.64		d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d21
+	veor		d29,d19,d20
+	veor		d24,d25
+	vand		d29,d18
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d20			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d22,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d22,#34
+	vshr.u64	d26,d22,#39
+	vsli.64		d24,d22,#36
+	vsli.64		d25,d22,#30
+	vsli.64		d26,d22,#25
+	vadd.i64	d27,d10
+	vorr		d30,d22,d16
+	vand		d29,d22,d16
+	veor		d21,d24,d25
+	vand		d30,d23
+	veor		d21,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d21,d27
+	vadd.i64	d17,d27
+	vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 27
+#if 27<16
+	vld1.64		{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vsli.64		d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d20
+	veor		d29,d18,d19
+	veor		d24,d25
+	vand		d29,d17
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d19			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d21,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d21,#34
+	vshr.u64	d26,d21,#39
+	vsli.64		d24,d21,#36
+	vsli.64		d25,d21,#30
+	vsli.64		d26,d21,#25
+	vadd.i64	d27,d11
+	vorr		d30,d21,d23
+	vand		d29,d21,d23
+	veor		d20,d24,d25
+	vand		d30,d22
+	veor		d20,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d20,d27
+	vadd.i64	d16,d27
+	vadd.i64	d20,d30
+	vshr.u64	q12,q5,#19
+	vshr.u64	q13,q5,#61
+	vshr.u64	q15,q5,#6
+	vsli.64		q12,q5,#45
+	vext.8		q14,q6,q7,#8	@ X[i+1]
+	vsli.64		q13,q5,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q6,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q2,q3,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q6,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q6,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vsli.64		d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d19
+	veor		d29,d17,d18
+	veor		d24,d25
+	vand		d29,d16
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d18			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d20,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d20,#34
+	vshr.u64	d26,d20,#39
+	vsli.64		d24,d20,#36
+	vsli.64		d25,d20,#30
+	vsli.64		d26,d20,#25
+	vadd.i64	d27,d12
+	vorr		d30,d20,d22
+	vand		d29,d20,d22
+	veor		d19,d24,d25
+	vand		d30,d21
+	veor		d19,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d19,d27
+	vadd.i64	d23,d27
+	vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 29
+#if 29<16
+	vld1.64		{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vsli.64		d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d18
+	veor		d29,d16,d17
+	veor		d24,d25
+	vand		d29,d23
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d17			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d19,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d19,#34
+	vshr.u64	d26,d19,#39
+	vsli.64		d24,d19,#36
+	vsli.64		d25,d19,#30
+	vsli.64		d26,d19,#25
+	vadd.i64	d27,d13
+	vorr		d30,d19,d21
+	vand		d29,d19,d21
+	veor		d18,d24,d25
+	vand		d30,d20
+	veor		d18,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d18,d27
+	vadd.i64	d22,d27
+	vadd.i64	d18,d30
+	vshr.u64	q12,q6,#19
+	vshr.u64	q13,q6,#61
+	vshr.u64	q15,q6,#6
+	vsli.64		q12,q6,#45
+	vext.8		q14,q7,q0,#8	@ X[i+1]
+	vsli.64		q13,q6,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q7,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q3,q4,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q7,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q7,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vsli.64		d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d17
+	veor		d29,d23,d16
+	veor		d24,d25
+	vand		d29,d22
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d16			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d18,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d18,#34
+	vshr.u64	d26,d18,#39
+	vsli.64		d24,d18,#36
+	vsli.64		d25,d18,#30
+	vsli.64		d26,d18,#25
+	vadd.i64	d27,d14
+	vorr		d30,d18,d20
+	vand		d29,d18,d20
+	veor		d17,d24,d25
+	vand		d30,d19
+	veor		d17,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d17,d27
+	vadd.i64	d21,d27
+	vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 31
+#if 31<16
+	vld1.64		{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vsli.64		d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	vadd.i64	d27,d28,d16
+	veor		d29,d22,d23
+	veor		d24,d25
+	vand		d29,d21
+	veor		d24,d26			@ Sigma1(e)
+	veor		d29,d23			@ Ch(e,f,g)
+	vadd.i64	d27,d24
+	vshr.u64	d24,d17,#28
+	vadd.i64	d27,d29
+	vshr.u64	d25,d17,#34
+	vshr.u64	d26,d17,#39
+	vsli.64		d24,d17,#36
+	vsli.64		d25,d17,#30
+	vsli.64		d26,d17,#25
+	vadd.i64	d27,d15
+	vorr		d30,d17,d19
+	vand		d29,d17,d19
+	veor		d16,d24,d25
+	vand		d30,d18
+	veor		d16,d26			@ Sigma0(a)
+	vorr		d30,d29		@ Maj(a,b,c)
+	vadd.i64	d16,d27
+	vadd.i64	d20,d27
+	vadd.i64	d16,d30
+	bne		.L16_79_neon
+
+	vldmia		r0,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		r0,{d16-d23}	@ save context
+	teq		r1,r2
+	sub		r3,#640	@ rewind K512
+	bne		.Loop_neon
+
+	vldmia	sp!,{d8-d15}		@ epilogue
+	.word	0xe12fff1e
+#endif
+.size	sha512_block_data_order,.-sha512_block_data_order
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <[email protected]>"
 .align	2
+.comm	OPENSSL_armcap_P,4,4

diff --git a/crypto/sha/asm/sha512-parisc.pl b/crypto/sha/asm/sha512-parisc.pl
new file mode 100755
index 0000000..e24ee58
--- /dev/null
+++ b/crypto/sha/asm/sha512-parisc.pl

@@ -0,0 +1,791 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256/512 block procedure for PA-RISC.
+
+# June 2009.
+#
+# SHA256 performance is >75% better than gcc 3.2 generated code on
+# PA-7100LC. Compared to code generated by vendor compiler this
+# implementation is almost 70% faster in 64-bit build, but delivers
+# virtually same performance in 32-bit build on PA-8600.
+#
+# SHA512 performance is >2.9x better than gcc 3.2 generated code on
+# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the
+# code is executed on PA-RISC 2.0 processor and switches to 64-bit
+# code path delivering adequate peformance even in "blended" 32-bit
+# build. Though 64-bit code is not any faster than code generated by
+# vendor compiler on PA-8600...
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+if ($output =~ /512/) {
+	$func="sha512_block_data_order";
+	$SZ=8;
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=(1,  8, 7);
+	@sigma1=(19,61, 6);
+	$rounds=80;
+	$LAST10BITS=0x017;
+	$LD="ldd";
+	$LDM="ldd,ma";
+	$ST="std";
+} else {
+	$func="sha256_block_data_order";
+	$SZ=4;
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 7,18, 3);
+	@sigma1=(17,19,10);
+	$rounds=64;
+	$LAST10BITS=0x0f2;
+	$LD="ldw";
+	$LDM="ldwm";
+	$ST="stw";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+				#                 [+ argument transfer]
+$XOFF=16*$SZ+32;		# local variables
+$FRAME+=$XOFF;
+$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables
+
+$ctx="%r26";	# zapped by $a0
+$inp="%r25";	# zapped by $a1
+$num="%r24";	# zapped by $t0
+
+$a0 ="%r26";
+$a1 ="%r25";
+$t0 ="%r24";
+$t1 ="%r29";
+$Tbl="%r31";
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
+
+sub ROUND_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$code.=<<___;
+	_ror	$e,$Sigma1[0],$a0
+	and	$f,$e,$t0
+	_ror	$e,$Sigma1[1],$a1
+	addl	$t1,$h,$h
+	andcm	$g,$e,$t1
+	xor	$a1,$a0,$a0
+	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
+	or	$t0,$t1,$t1		; Ch(e,f,g)
+	addl	@X[$i%16],$h,$h
+	xor	$a0,$a1,$a1		; Sigma1(e)
+	addl	$t1,$h,$h
+	_ror	$a,$Sigma0[0],$a0
+	addl	$a1,$h,$h
+
+	_ror	$a,$Sigma0[1],$a1
+	and	$a,$b,$t0
+	and	$a,$c,$t1
+	xor	$a1,$a0,$a0
+	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
+	xor	$t1,$t0,$t0
+	and	$b,$c,$t1
+	xor	$a0,$a1,$a1		; Sigma0(a)
+	addl	$h,$d,$d
+	xor	$t1,$t0,$t0		; Maj(a,b,c)
+	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
+	addl	$a1,$h,$h
+	addl	$t0,$h,$h
+
+___
+}
+
+sub ROUND_16_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$i-=16;
+$code.=<<___;
+	_ror	@X[($i+1)%16],$sigma0[0],$a0
+	_ror	@X[($i+1)%16],$sigma0[1],$a1
+	addl	@X[($i+9)%16],@X[$i],@X[$i]
+	_ror	@X[($i+14)%16],$sigma1[0],$t0
+	_ror	@X[($i+14)%16],$sigma1[1],$t1
+	xor	$a1,$a0,$a0
+	_shr	@X[($i+1)%16],$sigma0[2],$a1
+	xor	$t1,$t0,$t0
+	_shr	@X[($i+14)%16],$sigma1[2],$t1
+	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
+	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
+	$LDM	$SZ($Tbl),$t1
+	addl	$a0,@X[$i],@X[$i]
+	addl	$t0,@X[$i],@X[$i]
+___
+$code.=<<___ if ($i==15);
+	extru	$t1,31,10,$a1
+	comiclr,<> $LAST10BITS,$a1,%r0
+	ldo	1($Tbl),$Tbl		; signal end of $Tbl
+___
+&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
+}
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.ALIGN	64
+L\$table
+___
+$code.=<<___ if ($SZ==8);
+	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
+	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
+	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
+	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
+	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
+	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
+	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
+	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
+	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
+	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
+	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
+	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
+	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
+	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
+	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
+	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
+	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
+	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
+	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
+	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
+	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
+	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
+	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
+	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
+	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
+	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
+	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
+	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
+	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
+	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
+	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
+	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
+	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
+	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
+	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
+	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
+	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
+	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
+	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
+	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
+___
+$code.=<<___ if ($SZ==4);
+	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+___
+$code.=<<___;
+
+	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	64
+$func
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
+	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+	_shl	$num,`log(16*$SZ)/log(2)`,$num
+	addl	$inp,$num,$num		; $num to point at the end of $inp
+
+	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
+	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
+	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
+
+	blr	%r0,$Tbl
+	ldi	3,$t1
+L\$pic
+	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
+	ldo	L\$table-L\$pic($Tbl),$Tbl
+___
+$code.=<<___ if ($SZ==8 && $SIZE_T==4);
+	ldi	31,$t1
+	mtctl	$t1,%cr11
+	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
+	b	L\$parisc1
+	nop
+___
+$code.=<<___;
+	$LD	`0*$SZ`($ctx),$A	; load context
+	$LD	`1*$SZ`($ctx),$B
+	$LD	`2*$SZ`($ctx),$C
+	$LD	`3*$SZ`($ctx),$D
+	$LD	`4*$SZ`($ctx),$E
+	$LD	`5*$SZ`($ctx),$F
+	$LD	`6*$SZ`($ctx),$G
+	$LD	`7*$SZ`($ctx),$H
+
+	extru	$inp,31,`log($SZ)/log(2)`,$t0
+	sh3addl	$t0,%r0,$t0
+	subi	`8*$SZ`,$t0,$t0
+	mtctl	$t0,%cr11		; load %sar with align factor
+
+L\$oop
+	ldi	`$SZ-1`,$t0
+	$LDM	$SZ($Tbl),$t1
+	andcm	$inp,$t0,$t0		; align $inp
+___
+	for ($i=0;$i<15;$i++) {		# load input block
+	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
+$code.=<<___;
+	cmpb,*=	$inp,$t0,L\$aligned
+	$LD	`$SZ*15`($t0),@X[15]
+	$LD	`$SZ*16`($t0),@X[16]
+___
+	for ($i=0;$i<16;$i++) {		# align data
+	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
+$code.=<<___;
+L\$aligned
+	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
+___
+
+for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+L\$rounds
+	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
+___
+for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
+	nop
+
+	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
+	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl
+
+	$LD	`0*$SZ`($ctx),@X[0]	; load context
+	$LD	`1*$SZ`($ctx),@X[1]
+	$LD	`2*$SZ`($ctx),@X[2]
+	$LD	`3*$SZ`($ctx),@X[3]
+	$LD	`4*$SZ`($ctx),@X[4]
+	$LD	`5*$SZ`($ctx),@X[5]
+	addl	@X[0],$A,$A
+	$LD	`6*$SZ`($ctx),@X[6]
+	addl	@X[1],$B,$B
+	$LD	`7*$SZ`($ctx),@X[7]
+	ldo	`16*$SZ`($inp),$inp	; advance $inp
+
+	$ST	$A,`0*$SZ`($ctx)	; save context
+	addl	@X[2],$C,$C
+	$ST	$B,`1*$SZ`($ctx)
+	addl	@X[3],$D,$D
+	$ST	$C,`2*$SZ`($ctx)
+	addl	@X[4],$E,$E
+	$ST	$D,`3*$SZ`($ctx)
+	addl	@X[5],$F,$F
+	$ST	$E,`4*$SZ`($ctx)
+	addl	@X[6],$G,$G
+	$ST	$F,`5*$SZ`($ctx)
+	addl	@X[7],$H,$H
+	$ST	$G,`6*$SZ`($ctx)
+	$ST	$H,`7*$SZ`($ctx)
+
+	cmpb,*<>,n $inp,$num,L\$oop
+	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
+___
+if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
+{{
+$code.=<<___;
+	b	L\$done
+	nop
+
+	.ALIGN	64
+L\$parisc1
+___
+
+@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
+      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) = 
+   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
+$a0 ="%r17";
+$a1 ="%r18";
+$a2 ="%r19";
+$a3 ="%r20";
+$t0 ="%r21";
+$t1 ="%r22";
+$t2 ="%r28";
+$t3 ="%r29";
+$Tbl="%r31";
+
+@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx
+
+sub ROUND_00_15_pa1 {
+my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+
+$code.=<<___ if (!$flag);
+	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
+___
+$code.=<<___;
+	shd	$ehi,$elo,$Sigma1[0],$t0
+	 add	$Xlo,$hlo,$hlo
+	shd	$elo,$ehi,$Sigma1[0],$t1
+	 addc	$Xhi,$hhi,$hhi		; h += X[i]
+	shd	$ehi,$elo,$Sigma1[1],$t2
+	 ldwm	8($Tbl),$Xhi
+	shd	$elo,$ehi,$Sigma1[1],$t3
+	 ldw	-4($Tbl),$Xlo		; load K[i]
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1
+	 and	$flo,$elo,$a0
+	 and	$fhi,$ehi,$a1
+	shd	$ehi,$elo,$Sigma1[2],$t2
+	 andcm	$glo,$elo,$a2
+	shd	$elo,$ehi,$Sigma1[2],$t3
+	 andcm	$ghi,$ehi,$a3
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1		; Sigma1(e)
+	add	$Xlo,$hlo,$hlo
+	 xor	$a2,$a0,$a0
+	addc	$Xhi,$hhi,$hhi		; h += K[i]
+	 xor	$a3,$a1,$a1		; Ch(e,f,g)
+
+	 add	$t0,$hlo,$hlo
+	shd	$ahi,$alo,$Sigma0[0],$t0
+	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
+	shd	$alo,$ahi,$Sigma0[0],$t1	
+	 add	$a0,$hlo,$hlo
+	shd	$ahi,$alo,$Sigma0[1],$t2
+	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
+	shd	$alo,$ahi,$Sigma0[1],$t3
+
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1
+	shd	$ahi,$alo,$Sigma0[2],$t2
+	and	$alo,$blo,$a0
+	shd	$alo,$ahi,$Sigma0[2],$t3
+	and	$ahi,$bhi,$a1
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1		; Sigma0(a)
+
+	and	$alo,$clo,$a2
+	and	$ahi,$chi,$a3
+	xor	$a2,$a0,$a0
+	 add	$hlo,$dlo,$dlo
+	xor	$a3,$a1,$a1
+	 addc	$hhi,$dhi,$dhi		; d += h
+	and	$blo,$clo,$a2
+	 add	$t0,$hlo,$hlo
+	and	$bhi,$chi,$a3
+	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
+	xor	$a2,$a0,$a0
+	 add	$a0,$hlo,$hlo
+	xor	$a3,$a1,$a1		; Maj(a,b,c)
+	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)
+
+___
+$code.=<<___ if ($i==15 && $flag);
+	extru	$Xlo,31,10,$Xlo
+	comiclr,= $LAST10BITS,$Xlo,%r0
+	b	L\$rounds_pa1
+	nop
+___
+push(@X,shift(@X)); push(@X,shift(@X));
+}
+
+sub ROUND_16_xx_pa1 {
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+my ($i)=shift;
+$i-=16;
+$code.=<<___;
+	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
+	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
+	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
+	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
+	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
+	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
+	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
+	 add	$a0,$Xlo,$Xlo
+	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
+	 addc	$a1,$Xhi,$Xhi
+	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
+	xor	$t2,$t0,$t0
+	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
+	xor	$t3,$t1,$t1
+	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
+	xor	$t2,$t0,$t0
+	 shd	$a3,$a2,$sigma1[0],$a0
+	xor	$t3,$t1,$t1		; sigma0(X[i+1)&0x0f])
+	 shd	$a2,$a3,$sigma1[0],$a1
+	add	$t0,$Xlo,$Xlo
+	 shd	$a3,$a2,$sigma1[1],$t2
+	addc	$t1,$Xhi,$Xhi
+	 shd	$a2,$a3,$sigma1[1],$t3
+	xor	$t2,$a0,$a0
+	shd	$a3,$a2,$sigma1[2],$t2
+	xor	$t3,$a1,$a1
+	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
+	xor	$t2,$a0,$a0
+	xor	$t3,$a1,$a1		; sigma0(X[i+14)&0x0f])
+	add	$a0,$Xlo,$Xlo
+	addc	$a1,$Xhi,$Xhi
+
+	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
+	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
+___
+&ROUND_00_15_pa1($i,@_,1);
+}
+$code.=<<___;
+	ldw	`0*4`($ctx),$Ahi		; load context
+	ldw	`1*4`($ctx),$Alo
+	ldw	`2*4`($ctx),$Bhi
+	ldw	`3*4`($ctx),$Blo
+	ldw	`4*4`($ctx),$Chi
+	ldw	`5*4`($ctx),$Clo
+	ldw	`6*4`($ctx),$Dhi
+	ldw	`7*4`($ctx),$Dlo
+	ldw	`8*4`($ctx),$Ehi
+	ldw	`9*4`($ctx),$Elo
+	ldw	`10*4`($ctx),$Fhi
+	ldw	`11*4`($ctx),$Flo
+	ldw	`12*4`($ctx),$Ghi
+	ldw	`13*4`($ctx),$Glo
+	ldw	`14*4`($ctx),$Hhi
+	ldw	`15*4`($ctx),$Hlo
+
+	extru	$inp,31,2,$t0
+	sh3addl	$t0,%r0,$t0
+	subi	32,$t0,$t0
+	mtctl	$t0,%cr11		; load %sar with align factor
+
+L\$oop_pa1
+	extru	$inp,31,2,$a3
+	comib,=	0,$a3,L\$aligned_pa1
+	sub	$inp,$a3,$inp
+
+	ldw	`0*4`($inp),$X[0]
+	ldw	`1*4`($inp),$X[1]
+	ldw	`2*4`($inp),$t2
+	ldw	`3*4`($inp),$t3
+	ldw	`4*4`($inp),$a0
+	ldw	`5*4`($inp),$a1
+	ldw	`6*4`($inp),$a2
+	ldw	`7*4`($inp),$a3
+	vshd	$X[0],$X[1],$X[0]
+	vshd	$X[1],$t2,$X[1]
+	stw	$X[0],`-$XOFF+0*4`(%sp)
+	ldw	`8*4`($inp),$t0
+	vshd	$t2,$t3,$t2
+	stw	$X[1],`-$XOFF+1*4`(%sp)
+	ldw	`9*4`($inp),$t1
+	vshd	$t3,$a0,$t3
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<=(128/4-8);$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+	ldw	`(8+$i)*4`($inp),$t[0]
+	vshd	$t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+for (;$i<(128/4-1);$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+	vshd	$t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+$code.=<<___;
+	b	L\$collected_pa1
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+
+___
+}
+$code.=<<___;
+L\$aligned_pa1
+	ldw	`0*4`($inp),$X[0]
+	ldw	`1*4`($inp),$X[1]
+	ldw	`2*4`($inp),$t2
+	ldw	`3*4`($inp),$t3
+	ldw	`4*4`($inp),$a0
+	ldw	`5*4`($inp),$a1
+	ldw	`6*4`($inp),$a2
+	ldw	`7*4`($inp),$a3
+	stw	$X[0],`-$XOFF+0*4`(%sp)
+	ldw	`8*4`($inp),$t0
+	stw	$X[1],`-$XOFF+1*4`(%sp)
+	ldw	`9*4`($inp),$t1
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<(128/4-8);$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+	ldw	`(8+$i)*4`($inp),$t[0]
+___
+push(@t,shift(@t));
+}
+for (;$i<128/4;$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+___
+push(@t,shift(@t));
+}
+$code.="L\$collected_pa1\n";
+}
+
+for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+$code.="L\$rounds_pa1\n";
+for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
+	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl
+
+	ldw	`0*4`($ctx),$t1		; update context
+	ldw	`1*4`($ctx),$t0
+	ldw	`2*4`($ctx),$t3
+	ldw	`3*4`($ctx),$t2
+	ldw	`4*4`($ctx),$a1
+	ldw	`5*4`($ctx),$a0
+	ldw	`6*4`($ctx),$a3
+	add	$t0,$Alo,$Alo
+	ldw	`7*4`($ctx),$a2
+	addc	$t1,$Ahi,$Ahi
+	ldw	`8*4`($ctx),$t1
+	add	$t2,$Blo,$Blo
+	ldw	`9*4`($ctx),$t0
+	addc	$t3,$Bhi,$Bhi
+	ldw	`10*4`($ctx),$t3
+	add	$a0,$Clo,$Clo
+	ldw	`11*4`($ctx),$t2
+	addc	$a1,$Chi,$Chi
+	ldw	`12*4`($ctx),$a1
+	add	$a2,$Dlo,$Dlo
+	ldw	`13*4`($ctx),$a0
+	addc	$a3,$Dhi,$Dhi
+	ldw	`14*4`($ctx),$a3
+	add	$t0,$Elo,$Elo
+	ldw	`15*4`($ctx),$a2
+	addc	$t1,$Ehi,$Ehi
+	stw	$Ahi,`0*4`($ctx)
+	add	$t2,$Flo,$Flo
+	stw	$Alo,`1*4`($ctx)
+	addc	$t3,$Fhi,$Fhi
+	stw	$Bhi,`2*4`($ctx)
+	add	$a0,$Glo,$Glo
+	stw	$Blo,`3*4`($ctx)
+	addc	$a1,$Ghi,$Ghi
+	stw	$Chi,`4*4`($ctx)
+	add	$a2,$Hlo,$Hlo
+	stw	$Clo,`5*4`($ctx)
+	addc	$a3,$Hhi,$Hhi
+	stw	$Dhi,`6*4`($ctx)
+	ldo	`16*$SZ`($inp),$inp	; advance $inp
+	stw	$Dlo,`7*4`($ctx)
+	stw	$Ehi,`8*4`($ctx)
+	stw	$Elo,`9*4`($ctx)
+	stw	$Fhi,`10*4`($ctx)
+	stw	$Flo,`11*4`($ctx)
+	stw	$Ghi,`12*4`($ctx)
+	stw	$Glo,`13*4`($ctx)
+	stw	$Hhi,`14*4`($ctx)
+	comb,=	$inp,$num,L\$done
+	stw	$Hlo,`15*4`($ctx)
+	b	L\$oop_pa1
+	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
+L\$done
+___
+}}
+$code.=<<___;
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
+	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
+    {	my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
+	$opcode|=(1<<3) if ($mod =~ /^,m/);
+	$opcode|=(1<<2) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
+    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+	my $len=32-$3;
+	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
+    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+	my $len=32-$2;
+	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
+	$opcode |= (1<<13) if ($mod =~ /,\**=/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
+    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+	my $cpos=63-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
+    {	sprintf "\t.WORD\t0x%08x\t; %s",
+		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $opcode = eval("\$$mnemonic");
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
+		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
+		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
+	# translate made up instructons: _ror, _shr, _align, _shl
+	s/_ror(\s+)(%r[0-9]+),/
+		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or
+
+	s/_shr(\s+%r[0-9]+),([0-9]+),/
+		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
+		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or
+
+	s/_align(\s+%r[0-9]+,%r[0-9]+),/
+		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or
+
+	s/_shl(\s+%r[0-9]+),([0-9]+),/
+		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
+		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
+
+	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
+
+	s/cmpb,\*/comb,/ if ($SIZE_T==4);
+
+	print $_,"\n";
+}
+
+close STDOUT;

diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl
index 768a6a6..6b44a68 100755
--- a/crypto/sha/asm/sha512-ppc.pl
+++ b/crypto/sha/asm/sha512-ppc.pl

@@ -40,6 +40,7 @@
 
 if ($flavour =~ /64/) {
 	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;
 	$STU="stdu";
 	$UCMP="cmpld";
 	$SHL="sldi";
@@ -47,6 +48,7 @@
 	$PUSH="std";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
 	$STU="stwu";
 	$UCMP="cmplw";
 	$SHL="slwi";
@@ -87,7 +89,8 @@
 	$SHR="srwi";
 }
 
-$FRAME=32*$SIZE_T;
+$FRAME=32*$SIZE_T+16*$SZ;
+$LOCALS=6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -179,13 +182,12 @@
 .globl	$func
 .align	6
 $func:
+	$STU	$sp,-$FRAME($sp)
 	mflr	r0
-	$STU	$sp,`-($FRAME+16*$SZ)`($sp)
 	$SHL	$num,$num,`log(16*$SZ)/log(2)`
 
 	$PUSH	$ctx,`$FRAME-$SIZE_T*22`($sp)
 
-	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
 	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -206,6 +208,7 @@
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
 
 	$LD	$A,`0*$SZ`($ctx)
 	mr	$inp,r4				; incarnate $inp
@@ -217,7 +220,7 @@
 	$LD	$G,`6*$SZ`($ctx)
 	$LD	$H,`7*$SZ`($ctx)
 
-	b	LPICmeup
+	bl	LPICmeup
 LPICedup:
 	andi.	r0,$inp,3
 	bne	Lunaligned
@@ -226,8 +229,60 @@
 	$PUSH	$num,`$FRAME-$SIZE_T*24`($sp)	; end pointer
 	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
 	bl	Lsha2_block_private
+	b	Ldone
+
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for the input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
+.align	4
+Lunaligned:
+	subfic	$t1,$inp,4096
+	andi.	$t1,$t1,`4096-16*$SZ`	; distance to closest page boundary
+	beq	Lcross_page
+	$UCMP	$num,$t1
+	ble-	Laligned		; didn't cross the page boundary
+	subfc	$num,$t1,$num
+	add	$t1,$inp,$t1
+	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real remaining num
+	$PUSH	$t1,`$FRAME-$SIZE_T*24`($sp)	; intermediate end pointer
+	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
+	bl	Lsha2_block_private
+	; $inp equals to the intermediate end pointer here
+	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real remaining num
+Lcross_page:
+	li	$t1,`16*$SZ/4`
+	mtctr	$t1
+	addi	r20,$sp,$LOCALS			; aligned spot below the frame
+Lmemcpy:
+	lbz	r16,0($inp)
+	lbz	r17,1($inp)
+	lbz	r18,2($inp)
+	lbz	r19,3($inp)
+	addi	$inp,$inp,4
+	stb	r16,0(r20)
+	stb	r17,1(r20)
+	stb	r18,2(r20)
+	stb	r19,3(r20)
+	addi	r20,r20,4
+	bdnz	Lmemcpy
+
+	$PUSH	$inp,`$FRAME-$SIZE_T*26`($sp)	; save real inp
+	addi	$t1,$sp,`$LOCALS+16*$SZ`	; fictitious end pointer
+	addi	$inp,$sp,$LOCALS		; fictitious inp pointer
+	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real num
+	$PUSH	$t1,`$FRAME-$SIZE_T*24`($sp)	; end pointer
+	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
+	bl	Lsha2_block_private
+	$POP	$inp,`$FRAME-$SIZE_T*26`($sp)	; restore real inp
+	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real num
+	addic.	$num,$num,`-16*$SZ`		; num--
+	bne-	Lunaligned
+
 Ldone:
-	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
 	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -249,64 +304,12 @@
 	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
 	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
 	mtlr	r0
-	addi	$sp,$sp,`$FRAME+16*$SZ`
+	addi	$sp,$sp,$FRAME
 	blr
-___
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for the input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
-.align	4
-Lunaligned:
-	subfic	$t1,$inp,4096
-	andi.	$t1,$t1,`4096-16*$SZ`	; distance to closest page boundary
-	beq	Lcross_page
-	$UCMP	$num,$t1
-	ble-	Laligned		; didn't cross the page boundary
-	subfc	$num,$t1,$num
-	add	$t1,$inp,$t1
-	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real remaining num
-	$PUSH	$t1,`$FRAME-$SIZE_T*24`($sp)	; intermediate end pointer
-	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
-	bl	Lsha2_block_private
-	; $inp equals to the intermediate end pointer here
-	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real remaining num
-Lcross_page:
-	li	$t1,`16*$SZ/4`
-	mtctr	$t1
-	addi	r20,$sp,$FRAME			; aligned spot below the frame
-Lmemcpy:
-	lbz	r16,0($inp)
-	lbz	r17,1($inp)
-	lbz	r18,2($inp)
-	lbz	r19,3($inp)
-	addi	$inp,$inp,4
-	stb	r16,0(r20)
-	stb	r17,1(r20)
-	stb	r18,2(r20)
-	stb	r19,3(r20)
-	addi	r20,r20,4
-	bdnz	Lmemcpy
-
-	$PUSH	$inp,`$FRAME-$SIZE_T*26`($sp)	; save real inp
-	addi	$t1,$sp,`$FRAME+16*$SZ`		; fictitious end pointer
-	addi	$inp,$sp,$FRAME			; fictitious inp pointer
-	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real num
-	$PUSH	$t1,`$FRAME-$SIZE_T*24`($sp)	; end pointer
-	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
-	bl	Lsha2_block_private
-	$POP	$inp,`$FRAME-$SIZE_T*26`($sp)	; restore real inp
-	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real num
-	addic.	$num,$num,`-16*$SZ`		; num--
-	bne-	Lunaligned
-	b	Ldone
-___
-
-$code.=<<___;
 .align	4
 Lsha2_block_private:
 ___
@@ -372,6 +375,8 @@
 	$ST	$H,`7*$SZ`($ctx)
 	bne	Lsha2_block_private
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 ___
 
 # Ugly hack here, because PPC assembler syntax seem to vary too
@@ -379,22 +384,15 @@
 $code.=<<___;
 .align	6
 LPICmeup:
-	bl	LPIC
-	addi	$Tbl,$Tbl,`64-4`	; "distance" between . and last nop
-	b	LPICedup
-	nop
-	nop
-	nop
-	nop
-	nop
-LPIC:	mflr	$Tbl
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
+	addi	$Tbl,$Tbl,`64-8`
+	mtlr	r0
 	blr
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`64-9*4`
 ___
 $code.=<<___ if ($SZ==8);
 	.long	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd

diff --git a/crypto/sha/asm/sha512-s390x.pl b/crypto/sha/asm/sha512-s390x.pl
index e7ef2d5..079a3fc 100644
--- a/crypto/sha/asm/sha512-s390x.pl
+++ b/crypto/sha/asm/sha512-s390x.pl

@@ -26,6 +26,26 @@
 # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
 # than software.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z900 SHA256 was measured to
+# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 $t0="%r0";
 $t1="%r1";
 $ctx="%r2";	$t2="%r2";
@@ -44,7 +64,7 @@
 $T1="%r14";
 $sp="%r15";
 
-$output=shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 if ($output =~ /512/) {
@@ -78,7 +98,8 @@
 }
 $Func="sha${label}_block_data_order";
 $Table="K${label}";
-$frame=160+16*$SZ;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*$SZ;
 
 sub BODY_00_15 {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@@ -93,9 +114,9 @@
 	xgr	$t0,$t1
 	$ROT	$t1,$t1,`$Sigma1[2]-$Sigma1[1]`
 	 xgr	$t2,$g
-	$ST	$T1,`160+$SZ*($i%16)`($sp)
+	$ST	$T1,`$stdframe+$SZ*($i%16)`($sp)
 	xgr	$t0,$t1			# Sigma1(e)
-	la	$T1,0($T1,$h)		# T1+=h
+	algr	$T1,$h			# T1+=h
 	 ngr	$t2,$e
 	 lgr	$t1,$a
 	algr	$T1,$t0			# T1+=Sigma1(e)
@@ -113,7 +134,7 @@
 	 ngr	$t2,$b
 	algr	$h,$T1			# h+=T1
 	 ogr	$t2,$t1			# Maj(a,b,c)
-	la	$d,0($d,$T1)		# d+=T1
+	algr	$d,$T1			# d+=T1
 	algr	$h,$t2			# h+=Maj(a,b,c)
 ___
 }
@@ -122,19 +143,19 @@
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	$LD	$T1,`160+$SZ*(($i+1)%16)`($sp)	### $i
-	$LD	$t1,`160+$SZ*(($i+14)%16)`($sp)
+	$LD	$T1,`$stdframe+$SZ*(($i+1)%16)`($sp)	### $i
+	$LD	$t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
 	$ROT	$t0,$T1,$sigma0[0]
 	$SHR	$T1,$sigma0[2]
 	$ROT	$t2,$t0,`$sigma0[1]-$sigma0[0]`
 	xgr	$T1,$t0
 	$ROT	$t0,$t1,$sigma1[0]
-	xgr	$T1,$t2				# sigma0(X[i+1])
+	xgr	$T1,$t2					# sigma0(X[i+1])
 	$SHR	$t1,$sigma1[2]
-	$ADD	$T1,`160+$SZ*($i%16)`($sp)	# +=X[i]
+	$ADD	$T1,`$stdframe+$SZ*($i%16)`($sp)	# +=X[i]
 	xgr	$t1,$t0
 	$ROT	$t0,$t0,`$sigma1[1]-$sigma1[0]`
-	$ADD	$T1,`160+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
+	$ADD	$T1,`$stdframe+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
 	xgr	$t1,$t0				# sigma1(X[i+14])
 	algr	$T1,$t1				# +=sigma1(X[i+14])
 ___
@@ -212,6 +233,7 @@
 .globl	$Func
 .type	$Func,\@function
 $Func:
+	sllg	$len,$len,`log(16*$SZ)/log(2)`
 ___
 $code.=<<___ if ($kimdfunc);
 	larl	%r1,OPENSSL_s390xcap_P
@@ -219,15 +241,15 @@
 	tmhl	%r0,0x4000	# check for message-security assist
 	jz	.Lsoftware
 	lghi	%r0,0
-	la	%r1,16($sp)
+	la	%r1,`2*$SIZE_T`($sp)
 	.long	0xb93e0002	# kimd %r0,%r2
-	lg	%r0,16($sp)
+	lg	%r0,`2*$SIZE_T`($sp)
 	tmhh	%r0,`0x8000>>$kimdfunc`
 	jz	.Lsoftware
 	lghi	%r0,$kimdfunc
 	lgr	%r1,$ctx
 	lgr	%r2,$inp
-	sllg	%r3,$len,`log(16*$SZ)/log(2)`
+	lgr	%r3,$len
 	.long	0xb93e0002	# kimd %r0,%r2
 	brc	1,.-4		# pay attention to "partial completion"
 	br	%r14
@@ -235,13 +257,12 @@
 .Lsoftware:
 ___
 $code.=<<___;
-	sllg	$len,$len,`log(16*$SZ)/log(2)`
 	lghi	%r1,-$frame
-	agr	$len,$inp
-	stmg	$ctx,%r15,16($sp)
+	la	$len,0($len,$inp)
+	stm${g}	$ctx,%r15,`2*$SIZE_T`($sp)
 	lgr	%r0,$sp
 	la	$sp,0(%r1,$sp)
-	stg	%r0,0($sp)
+	st${g}	%r0,0($sp)
 
 	larl	$tbl,$Table
 	$LD	$A,`0*$SZ`($ctx)
@@ -265,7 +286,7 @@
 	clgr	$len,$t0
 	jne	.Lrounds_16_xx
 
-	lg	$ctx,`$frame+16`($sp)
+	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
 	la	$inp,`16*$SZ`($inp)
 	$ADD	$A,`0*$SZ`($ctx)
 	$ADD	$B,`1*$SZ`($ctx)
@@ -283,14 +304,14 @@
 	$ST	$F,`5*$SZ`($ctx)
 	$ST	$G,`6*$SZ`($ctx)
 	$ST	$H,`7*$SZ`($ctx)
-	clg	$inp,`$frame+32`($sp)
+	cl${g}	$inp,`$frame+4*$SIZE_T`($sp)
 	jne	.Lloop
 
-	lmg	%r6,%r15,`$frame+48`($sp)	
+	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)	
 	br	%r14
 .size	$Func,.-$Func
 .string	"SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm	OPENSSL_s390xcap_P,8,8
+.comm	OPENSSL_s390xcap_P,16,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl
index ec5d781..5857407 100644
--- a/crypto/sha/asm/sha512-sparcv9.pl
+++ b/crypto/sha/asm/sha512-sparcv9.pl

@@ -305,9 +305,9 @@
 	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
 	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
 	srl	@X[($i/2)%8],0,$tmp0
+	add	$tmp2,$tmp1,$tmp1
 	add	$xi,$T1,$T1			! +=X[i]
 	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
-	add	$tmp2,$T1,$T1
 	add	$tmp1,$T1,$T1
 
 	srl	$T1,0,$T1
@@ -318,9 +318,9 @@
 $code.=<<___;
 	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
 	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
-	srl	@X[($i/2)%8],0,@X[($i/2)%8]
 	add	$xi,$T1,$T1			! +=X[i+9]
-	add	$tmp2,$T1,$T1
+	add	$tmp2,$tmp1,$tmp1
+	srl	@X[($i/2)%8],0,@X[($i/2)%8]
 	add	$tmp1,$T1,$T1
 
 	sllx	$T1,32,$tmp0

diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index e6643f8..f611a2d 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl

@@ -95,50 +95,44 @@
 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	mov	$e,$a0
-	mov	$e,$a1
+	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
 	mov	$f,$a2
-
-	ror	\$$Sigma1[0],$a0
-	ror	\$$Sigma1[1],$a1
-	xor	$g,$a2			# f^g
-
-	xor	$a1,$a0
-	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a1
-	and	$e,$a2			# (f^g)&e
 	mov	$T1,`$SZ*($i&0xf)`(%rsp)
 
-	xor	$a1,$a0			# Sigma1(e)
-	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
+	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
+	xor	$e,$a0
+	xor	$g,$a2			# f^g
+
+	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
 	add	$h,$T1			# T1+=h
+	xor	$a,$a1
 
-	mov	$a,$h
-	add	$a0,$T1			# T1+=Sigma1(e)
-
-	add	$a2,$T1			# T1+=Ch(e,f,g)
-	mov	$a,$a0
-	mov	$a,$a1
-
-	ror	\$$Sigma0[0],$h
-	ror	\$$Sigma0[1],$a0
-	mov	$a,$a2
 	add	($Tbl,$round,$SZ),$T1	# T1+=K[round]
+	and	$e,$a2			# (f^g)&e
+	mov	$b,$h
 
-	xor	$a0,$h
-	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a0
-	or	$c,$a1			# a|c
+	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
+	xor	$e,$a0
+	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
 
-	xor	$a0,$h			# h=Sigma0(a)
-	and	$c,$a2			# a&c
+	xor	$c,$h			# b^c
+	xor	$a,$a1
+	add	$a2,$T1			# T1+=Ch(e,f,g)
+	mov	$b,$a2
+
+	ror	\$$Sigma1[0],$a0	# Sigma1(e)
+	and	$a,$h			# h=(b^c)&a
+	and	$c,$a2			# b&c
+
+	ror	\$$Sigma0[0],$a1	# Sigma0(a)
+	add	$a0,$T1			# T1+=Sigma1(e)
+	add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c)
+
 	add	$T1,$d			# d+=T1
-
-	and	$b,$a1			# (a|c)&b
 	add	$T1,$h			# h+=T1
-
-	or	$a2,$a1			# Maj(a,b,c)=((a|c)&b)|(a&c)
 	lea	1($round),$round	# round++
+	add	$a1,$h			# h+=Sigma0(a)
 
-	add	$a1,$h			# h+=Maj(a,b,c)
 ___
 }
 
@@ -147,32 +141,30 @@
 
 $code.=<<___;
 	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
-	mov	`$SZ*(($i+14)&0xf)`(%rsp),$T1
+	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a1
+	mov	$a0,$T1
+	mov	$a1,$a2
 
-	mov	$a0,$a2
-
+	ror	\$`$sigma0[1]-$sigma0[0]`,$T1
+	xor	$a0,$T1
 	shr	\$$sigma0[2],$a0
-	ror	\$$sigma0[0],$a2
 
-	xor	$a2,$a0
-	ror	\$`$sigma0[1]-$sigma0[0]`,$a2
+	ror	\$$sigma0[0],$T1
+	xor	$T1,$a0			# sigma0(X[(i+1)&0xf])
+	mov	`$SZ*(($i+9)&0xf)`(%rsp),$T1
 
-	xor	$a2,$a0			# sigma0(X[(i+1)&0xf])
-	mov	$T1,$a1
+	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
+	xor	$a1,$a2
+	shr	\$$sigma1[2],$a1
 
-	shr	\$$sigma1[2],$T1
-	ror	\$$sigma1[0],$a1
-
-	xor	$a1,$T1
-	ror	\$`$sigma1[1]-$sigma1[0]`,$a1
-
-	xor	$a1,$T1			# sigma1(X[(i+14)&0xf])
-
+	ror	\$$sigma1[0],$a2
 	add	$a0,$T1
-
-	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
+	xor	$a2,$a1			# sigma1(X[(i+14)&0xf])
 
 	add	`$SZ*($i&0xf)`(%rsp),$T1
+	mov	$e,$a0
+	add	$a1,$T1
+	mov	$a,$a1
 ___
 	&ROUND_00_15(@_);
 }
@@ -219,6 +211,8 @@
 ___
 	for($i=0;$i<16;$i++) {
 		$code.="	mov	$SZ*$i($inp),$T1\n";
+		$code.="	mov	@ROT[4],$a0\n";
+		$code.="	mov	@ROT[0],$a1\n";
 		$code.="	bswap	$T1\n";
 		&ROUND_00_15($i,@ROT);
 		unshift(@ROT,pop(@ROT));

diff --git a/crypto/sha/sha.h b/crypto/sha/sha.h
index 16cacf9..8a6bf4b 100644
--- a/crypto/sha/sha.h
+++ b/crypto/sha/sha.h

@@ -106,6 +106,9 @@
 	} SHA_CTX;
 
 #ifndef OPENSSL_NO_SHA0
+#ifdef OPENSSL_FIPS
+int private_SHA_Init(SHA_CTX *c);
+#endif
 int SHA_Init(SHA_CTX *c);
 int SHA_Update(SHA_CTX *c, const void *data, size_t len);
 int SHA_Final(unsigned char *md, SHA_CTX *c);
@@ -113,6 +116,9 @@
 void SHA_Transform(SHA_CTX *c, const unsigned char *data);
 #endif
 #ifndef OPENSSL_NO_SHA1
+#ifdef OPENSSL_FIPS
+int private_SHA1_Init(SHA_CTX *c);
+#endif
 int SHA1_Init(SHA_CTX *c);
 int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
 int SHA1_Final(unsigned char *md, SHA_CTX *c);
@@ -135,6 +141,10 @@
 	} SHA256_CTX;
 
 #ifndef OPENSSL_NO_SHA256
+#ifdef OPENSSL_FIPS
+int private_SHA224_Init(SHA256_CTX *c);
+int private_SHA256_Init(SHA256_CTX *c);
+#endif
 int SHA224_Init(SHA256_CTX *c);
 int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
 int SHA224_Final(unsigned char *md, SHA256_CTX *c);
@@ -182,6 +192,10 @@
 #endif
 
 #ifndef OPENSSL_NO_SHA512
+#ifdef OPENSSL_FIPS
+int private_SHA384_Init(SHA512_CTX *c);
+int private_SHA512_Init(SHA512_CTX *c);
+#endif
 int SHA384_Init(SHA512_CTX *c);
 int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
 int SHA384_Final(unsigned char *md, SHA512_CTX *c);

diff --git a/crypto/sha/sha1dgst.c b/crypto/sha/sha1dgst.c
index 50d1925..81219af 100644
--- a/crypto/sha/sha1dgst.c
+++ b/crypto/sha/sha1dgst.c

@@ -57,6 +57,7 @@
  */
 
 #include <openssl/opensslconf.h>
+#include <openssl/crypto.h>
 #if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
 
 #undef  SHA_0

diff --git a/crypto/sha/sha256.c b/crypto/sha/sha256.c
index 8952d87..f88d3d6 100644
--- a/crypto/sha/sha256.c
+++ b/crypto/sha/sha256.c

@@ -16,7 +16,7 @@
 
 const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
 
-int SHA224_Init (SHA256_CTX *c)
+fips_md_init_ctx(SHA224, SHA256)
 	{
 	memset (c,0,sizeof(*c));
 	c->h[0]=0xc1059ed8UL;	c->h[1]=0x367cd507UL;
@@ -27,7 +27,7 @@
 	return 1;
 	}
 
-int SHA256_Init (SHA256_CTX *c)
+fips_md_init(SHA256)
 	{
 	memset (c,0,sizeof(*c));
 	c->h[0]=0x6a09e667UL;	c->h[1]=0xbb67ae85UL;

diff --git a/crypto/sha/sha512.c b/crypto/sha/sha512.c
index cbc0e58..50dd7dc 100644
--- a/crypto/sha/sha512.c
+++ b/crypto/sha/sha512.c

@@ -59,21 +59,8 @@
 #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
 #endif
 
-int SHA384_Init (SHA512_CTX *c)
+fips_md_init_ctx(SHA384, SHA512)
 	{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
-	/* maintain dword order required by assembler module */
-	unsigned int *h = (unsigned int *)c->h;
-
-	h[0]  = 0xcbbb9d5d; h[1]  = 0xc1059ed8;
-	h[2]  = 0x629a292a; h[3]  = 0x367cd507;
-	h[4]  = 0x9159015a; h[5]  = 0x3070dd17;
-	h[6]  = 0x152fecd8; h[7]  = 0xf70e5939;
-	h[8]  = 0x67332667; h[9]  = 0xffc00b31;
-	h[10] = 0x8eb44a87; h[11] = 0x68581511;
-	h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
-	h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
-#else
 	c->h[0]=U64(0xcbbb9d5dc1059ed8);
 	c->h[1]=U64(0x629a292a367cd507);
 	c->h[2]=U64(0x9159015a3070dd17);
@@ -82,27 +69,14 @@
 	c->h[5]=U64(0x8eb44a8768581511);
 	c->h[6]=U64(0xdb0c2e0d64f98fa7);
 	c->h[7]=U64(0x47b5481dbefa4fa4);
-#endif
+
         c->Nl=0;        c->Nh=0;
         c->num=0;       c->md_len=SHA384_DIGEST_LENGTH;
         return 1;
 	}
 
-int SHA512_Init (SHA512_CTX *c)
+fips_md_init(SHA512)
 	{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
-	/* maintain dword order required by assembler module */
-	unsigned int *h = (unsigned int *)c->h;
-
-	h[0]  = 0x6a09e667; h[1]  = 0xf3bcc908;
-	h[2]  = 0xbb67ae85; h[3]  = 0x84caa73b;
-	h[4]  = 0x3c6ef372; h[5]  = 0xfe94f82b;
-	h[6]  = 0xa54ff53a; h[7]  = 0x5f1d36f1;
-	h[8]  = 0x510e527f; h[9]  = 0xade682d1;
-	h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
-	h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
-	h[14] = 0x5be0cd19; h[15] = 0x137e2179;
-#else
 	c->h[0]=U64(0x6a09e667f3bcc908);
 	c->h[1]=U64(0xbb67ae8584caa73b);
 	c->h[2]=U64(0x3c6ef372fe94f82b);
@@ -111,7 +85,7 @@
 	c->h[5]=U64(0x9b05688c2b3e6c1f);
 	c->h[6]=U64(0x1f83d9abfb41bd6b);
 	c->h[7]=U64(0x5be0cd19137e2179);
-#endif
+
         c->Nl=0;        c->Nh=0;
         c->num=0;       c->md_len=SHA512_DIGEST_LENGTH;
         return 1;
@@ -160,24 +134,6 @@
 
 	if (md==0) return 0;
 
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
-	/* recall assembler dword order... */
-	n = c->md_len;
-	if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
-		{
-		unsigned int *h = (unsigned int *)c->h, t;
-
-		for (n/=4;n;n--)
-			{
-			t = *(h++);
-			*(md++) = (unsigned char)(t>>24);
-			*(md++) = (unsigned char)(t>>16);
-			*(md++) = (unsigned char)(t>>8);
-			*(md++) = (unsigned char)(t);
-			}
-		}
-	else	return 0;
-#else
 	switch (c->md_len)
 		{
 		/* Let compiler decide if it's appropriate to unroll... */
@@ -214,7 +170,7 @@
 		/* ... as well as make sure md_len is not abused. */
 		default:	return 0;
 		}
-#endif
+
 	return 1;
 	}
 

diff --git a/crypto/sha/sha_dgst.c b/crypto/sha/sha_dgst.c
index 70eb560..c946ad8 100644
--- a/crypto/sha/sha_dgst.c
+++ b/crypto/sha/sha_dgst.c

@@ -57,6 +57,7 @@
  */
 
 #include <openssl/opensslconf.h>
+#include <openssl/crypto.h>
 #if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
 
 #undef  SHA_1

diff --git a/crypto/sha/sha_locl.h b/crypto/sha/sha_locl.h
index 672c26e..7a0c3ca 100644
--- a/crypto/sha/sha_locl.h
+++ b/crypto/sha/sha_locl.h

@@ -122,7 +122,11 @@
 #define INIT_DATA_h3 0x10325476UL
 #define INIT_DATA_h4 0xc3d2e1f0UL
 
-int HASH_INIT (SHA_CTX *c)
+#ifdef SHA_0
+fips_md_init(SHA)
+#else
+fips_md_init_ctx(SHA1, SHA)
+#endif
 	{
 	memset (c,0,sizeof(*c));
 	c->h0=INIT_DATA_h0;

diff --git a/crypto/sparcv9cap.c b/crypto/sparcv9cap.c
index ed195ab..43b3ac6 100644
--- a/crypto/sparcv9cap.c
+++ b/crypto/sparcv9cap.c

@@ -19,7 +19,8 @@
 	int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
 	int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
 
-	if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+	if (num>=8 && !(num&1) &&
+	    (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
 		(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
 		return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
 	else
@@ -169,7 +170,6 @@
 	char *e;
 	struct sigaction	common_act,ill_oact,bus_oact;
 	sigset_t		all_masked,oset;
-	int			sig;
 	static int trigger=0;
 
 	if (trigger) return;

diff --git a/crypto/srp/srp.h b/crypto/srp/srp.h
new file mode 100644
index 0000000..7ec7825
--- /dev/null
+++ b/crypto/srp/srp.h

@@ -0,0 +1,172 @@
+/* crypto/srp/srp.h */
+/* Written by Christophe Renou ([email protected]) with 
+ * the precious help of Peter Sylvester ([email protected]) 
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+#ifndef __SRP_H__
+#define __SRP_H__
+
+#ifndef OPENSSL_NO_SRP
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/safestack.h>
+#include <openssl/bn.h>
+#include <openssl/crypto.h>
+
+typedef struct SRP_gN_cache_st
+	{
+	char *b64_bn;
+	BIGNUM *bn;
+	} SRP_gN_cache;
+
+
+DECLARE_STACK_OF(SRP_gN_cache)
+
+typedef struct SRP_user_pwd_st
+	{
+	char *id;
+	BIGNUM *s;
+	BIGNUM *v;
+	const BIGNUM *g;
+	const BIGNUM *N;
+	char *info;
+	} SRP_user_pwd;
+
+DECLARE_STACK_OF(SRP_user_pwd)
+
+typedef struct SRP_VBASE_st
+	{
+	STACK_OF(SRP_user_pwd) *users_pwd;
+	STACK_OF(SRP_gN_cache) *gN_cache;
+/* to simulate a user */
+	char *seed_key;
+	BIGNUM *default_g;
+	BIGNUM *default_N;
+	} SRP_VBASE;
+
+
+/*Structure interne pour retenir les couples N et g*/
+typedef struct SRP_gN_st
+	{
+	char *id;
+	BIGNUM *g;
+	BIGNUM *N;
+	} SRP_gN;
+
+DECLARE_STACK_OF(SRP_gN)
+
+SRP_VBASE *SRP_VBASE_new(char *seed_key);
+int SRP_VBASE_free(SRP_VBASE *vb);
+int SRP_VBASE_init(SRP_VBASE *vb, char * verifier_file);
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+char *SRP_create_verifier(const char *user, const char *pass, char **salt,
+			  char **verifier, const char *N, const char *g);
+int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g);
+
+
+#define SRP_NO_ERROR 0
+#define SRP_ERR_VBASE_INCOMPLETE_FILE 1
+#define SRP_ERR_VBASE_BN_LIB 2
+#define SRP_ERR_OPEN_FILE 3
+#define SRP_ERR_MEMORY 4
+
+#define DB_srptype	0
+#define DB_srpverifier	1
+#define DB_srpsalt 	2
+#define DB_srpid	3              
+#define DB_srpgN	4       
+#define DB_srpinfo	5 
+#undef  DB_NUMBER      
+#define DB_NUMBER       6
+
+#define DB_SRP_INDEX	'I'
+#define DB_SRP_VALID	'V'
+#define DB_SRP_REVOKED	'R'
+#define DB_SRP_MODIF	'v'
+
+
+/* see srp.c */
+char * SRP_check_known_gN_param(BIGNUM* g, BIGNUM* N); 
+SRP_gN *SRP_get_default_gN(const char * id) ;
+
+/* server side .... */
+BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N);
+BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v);
+int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N);
+BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N) ;
+
+
+
+/* client side .... */
+BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass);
+BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g);
+BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u);
+int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N);
+
+#define SRP_MINIMAL_N 1024
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
+#endif

diff --git a/crypto/srp/srp_grps.h b/crypto/srp/srp_grps.h
new file mode 100644
index 0000000..d77c9ff
--- /dev/null
+++ b/crypto/srp/srp_grps.h

@@ -0,0 +1,517 @@
+/* start of generated data */
+
+static BN_ULONG bn_group_1024_value[] = {
+	bn_pack4(9FC6,1D2F,C0EB,06E3),
+	bn_pack4(FD51,38FE,8376,435B),
+	bn_pack4(2FD4,CBF4,976E,AA9A),
+	bn_pack4(68ED,BC3C,0572,6CC0),
+	bn_pack4(C529,F566,660E,57EC),
+	bn_pack4(8255,9B29,7BCF,1885),
+	bn_pack4(CE8E,F4AD,69B1,5D49),
+	bn_pack4(5DC7,D7B4,6154,D6B6),
+	bn_pack4(8E49,5C1D,6089,DAD1),
+	bn_pack4(E0D5,D8E2,50B9,8BE4),
+	bn_pack4(383B,4813,D692,C6E0),
+	bn_pack4(D674,DF74,96EA,81D3),
+	bn_pack4(9EA2,314C,9C25,6576),
+	bn_pack4(6072,6187,75FF,3C0B),
+	bn_pack4(9C33,F80A,FA8F,C5E8),
+	bn_pack4(EEAF,0AB9,ADB3,8DD6)
+};
+static BIGNUM bn_group_1024 = {
+	bn_group_1024_value,
+	(sizeof bn_group_1024_value)/sizeof(BN_ULONG),
+	(sizeof bn_group_1024_value)/sizeof(BN_ULONG),
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_1536_value[] = {
+	bn_pack4(CF76,E3FE,D135,F9BB),
+	bn_pack4(1518,0F93,499A,234D),
+	bn_pack4(8CE7,A28C,2442,C6F3),
+	bn_pack4(5A02,1FFF,5E91,479E),
+	bn_pack4(7F8A,2FE9,B8B5,292E),
+	bn_pack4(837C,264A,E3A9,BEB8),
+	bn_pack4(E442,734A,F7CC,B7AE),
+	bn_pack4(6577,2E43,7D6C,7F8C),
+	bn_pack4(DB2F,D53D,24B7,C486),
+	bn_pack4(6EDF,0195,3934,9627),
+	bn_pack4(158B,FD3E,2B9C,8CF5),
+	bn_pack4(764E,3F4B,53DD,9DA1),
+	bn_pack4(4754,8381,DBC5,B1FC),
+	bn_pack4(9B60,9E0B,E3BA,B63D),
+	bn_pack4(8134,B1C8,B979,8914),
+	bn_pack4(DF02,8A7C,EC67,F0D0),
+	bn_pack4(80B6,55BB,9A22,E8DC),
+	bn_pack4(1558,903B,A0D0,F843),
+	bn_pack4(51C6,A94B,E460,7A29),
+	bn_pack4(5F4F,5F55,6E27,CBDE),
+	bn_pack4(BEEE,A961,4B19,CC4D),
+	bn_pack4(DBA5,1DF4,99AC,4C80),
+	bn_pack4(B1F1,2A86,17A4,7BBB),
+	bn_pack4(9DEF,3CAF,B939,277A)
+};
+static BIGNUM bn_group_1536 = {
+	bn_group_1536_value,
+	(sizeof bn_group_1536_value)/sizeof(BN_ULONG),
+	(sizeof bn_group_1536_value)/sizeof(BN_ULONG),
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_2048_value[] = {
+	bn_pack4(0FA7,111F,9E4A,FF73),
+	bn_pack4(9B65,E372,FCD6,8EF2),
+	bn_pack4(35DE,236D,525F,5475),
+	bn_pack4(94B5,C803,D89F,7AE4),
+	bn_pack4(71AE,35F8,E9DB,FBB6),
+	bn_pack4(2A56,98F3,A8D0,C382),
+	bn_pack4(9CCC,041C,7BC3,08D8),
+	bn_pack4(AF87,4E73,03CE,5329),
+	bn_pack4(6160,2790,04E5,7AE6),
+	bn_pack4(032C,FBDB,F52F,B378),
+	bn_pack4(5EA7,7A27,75D2,ECFA),
+	bn_pack4(5445,23B5,24B0,D57D),
+	bn_pack4(5B9D,32E6,88F8,7748),
+	bn_pack4(F1D2,B907,8717,461A),
+	bn_pack4(76BD,207A,436C,6481),
+	bn_pack4(CA97,B43A,23FB,8016),
+	bn_pack4(1D28,1E44,6B14,773B),
+	bn_pack4(7359,D041,D5C3,3EA7),
+	bn_pack4(A80D,740A,DBF4,FF74),
+	bn_pack4(55F9,7993,EC97,5EEA),
+	bn_pack4(2918,A996,2F0B,93B8),
+	bn_pack4(661A,05FB,D5FA,AAE8),
+	bn_pack4(CF60,9517,9A16,3AB3),
+	bn_pack4(E808,3969,EDB7,67B0),
+	bn_pack4(CD7F,48A9,DA04,FD50),
+	bn_pack4(D523,12AB,4B03,310D),
+	bn_pack4(8193,E075,7767,A13D),
+	bn_pack4(A373,29CB,B4A0,99ED),
+	bn_pack4(FC31,9294,3DB5,6050),
+	bn_pack4(AF72,B665,1987,EE07),
+	bn_pack4(F166,DE5E,1389,582F),
+	bn_pack4(AC6B,DB41,324A,9A9B)
+};
+static BIGNUM bn_group_2048 = {
+	bn_group_2048_value,
+	(sizeof bn_group_2048_value)/sizeof(BN_ULONG),
+	(sizeof bn_group_2048_value)/sizeof(BN_ULONG),
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_3072_value[] = {
+	bn_pack4(FFFF,FFFF,FFFF,FFFF),
+	bn_pack4(4B82,D120,A93A,D2CA),
+	bn_pack4(43DB,5BFC,E0FD,108E),
+	bn_pack4(08E2,4FA0,74E5,AB31),
+	bn_pack4(7709,88C0,BAD9,46E2),
+	bn_pack4(BBE1,1757,7A61,5D6C),
+	bn_pack4(521F,2B18,177B,200C),
+	bn_pack4(D876,0273,3EC8,6A64),
+	bn_pack4(F12F,FA06,D98A,0864),
+	bn_pack4(CEE3,D226,1AD2,EE6B),
+	bn_pack4(1E8C,94E0,4A25,619D),
+	bn_pack4(ABF5,AE8C,DB09,33D7),
+	bn_pack4(B397,0F85,A6E1,E4C7),
+	bn_pack4(8AEA,7157,5D06,0C7D),
+	bn_pack4(ECFB,8504,58DB,EF0A),
+	bn_pack4(A855,21AB,DF1C,BA64),
+	bn_pack4(AD33,170D,0450,7A33),
+	bn_pack4(1572,8E5A,8AAA,C42D),
+	bn_pack4(15D2,2618,98FA,0510),
+	bn_pack4(3995,497C,EA95,6AE5),
+	bn_pack4(DE2B,CBF6,9558,1718),
+	bn_pack4(B5C5,5DF0,6F4C,52C9),
+	bn_pack4(9B27,83A2,EC07,A28F),
+	bn_pack4(E39E,772C,180E,8603),
+	bn_pack4(3290,5E46,2E36,CE3B),
+	bn_pack4(F174,6C08,CA18,217C),
+	bn_pack4(670C,354E,4ABC,9804),
+	bn_pack4(9ED5,2907,7096,966D),
+	bn_pack4(1C62,F356,2085,52BB),
+	bn_pack4(8365,5D23,DCA3,AD96),
+	bn_pack4(6916,3FA8,FD24,CF5F),
+	bn_pack4(98DA,4836,1C55,D39A),
+	bn_pack4(C200,7CB8,A163,BF05),
+	bn_pack4(4928,6651,ECE4,5B3D),
+	bn_pack4(AE9F,2411,7C4B,1FE6),
+	bn_pack4(EE38,6BFB,5A89,9FA5),
+	bn_pack4(0BFF,5CB6,F406,B7ED),
+	bn_pack4(F44C,42E9,A637,ED6B),
+	bn_pack4(E485,B576,625E,7EC6),
+	bn_pack4(4FE1,356D,6D51,C245),
+	bn_pack4(302B,0A6D,F25F,1437),
+	bn_pack4(EF95,19B3,CD3A,431B),
+	bn_pack4(514A,0879,8E34,04DD),
+	bn_pack4(020B,BEA6,3B13,9B22),
+	bn_pack4(2902,4E08,8A67,CC74),
+	bn_pack4(C4C6,628B,80DC,1CD1),
+	bn_pack4(C90F,DAA2,2168,C234),
+	bn_pack4(FFFF,FFFF,FFFF,FFFF)
+};
+static BIGNUM bn_group_3072 = {
+	bn_group_3072_value,
+	(sizeof bn_group_3072_value)/sizeof(BN_ULONG),
+	(sizeof bn_group_3072_value)/sizeof(BN_ULONG),
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_4096_value[] = {
+	bn_pack4(FFFF,FFFF,FFFF,FFFF),
+	bn_pack4(4DF4,35C9,3406,3199),
+	bn_pack4(86FF,B7DC,90A6,C08F),
+	bn_pack4(93B4,EA98,8D8F,DDC1),
+	bn_pack4(D006,9127,D5B0,5AA9),
+	bn_pack4(B81B,DD76,2170,481C),
+	bn_pack4(1F61,2970,CEE2,D7AF),
+	bn_pack4(233B,A186,515B,E7ED),
+	bn_pack4(99B2,964F,A090,C3A2),
+	bn_pack4(287C,5947,4E6B,C05D),
+	bn_pack4(2E8E,FC14,1FBE,CAA6),
+	bn_pack4(DBBB,C2DB,04DE,8EF9),
+	bn_pack4(2583,E9CA,2AD4,4CE8),
+	bn_pack4(1A94,6834,B615,0BDA),
+	bn_pack4(99C3,2718,6AF4,E23C),
+	bn_pack4(8871,9A10,BDBA,5B26),
+	bn_pack4(1A72,3C12,A787,E6D7),
+	bn_pack4(4B82,D120,A921,0801),
+	bn_pack4(43DB,5BFC,E0FD,108E),
+	bn_pack4(08E2,4FA0,74E5,AB31),
+	bn_pack4(7709,88C0,BAD9,46E2),
+	bn_pack4(BBE1,1757,7A61,5D6C),
+	bn_pack4(521F,2B18,177B,200C),
+	bn_pack4(D876,0273,3EC8,6A64),
+	bn_pack4(F12F,FA06,D98A,0864),
+	bn_pack4(CEE3,D226,1AD2,EE6B),
+	bn_pack4(1E8C,94E0,4A25,619D),
+	bn_pack4(ABF5,AE8C,DB09,33D7),
+	bn_pack4(B397,0F85,A6E1,E4C7),
+	bn_pack4(8AEA,7157,5D06,0C7D),
+	bn_pack4(ECFB,8504,58DB,EF0A),
+	bn_pack4(A855,21AB,DF1C,BA64),
+	bn_pack4(AD33,170D,0450,7A33),
+	bn_pack4(1572,8E5A,8AAA,C42D),
+	bn_pack4(15D2,2618,98FA,0510),
+	bn_pack4(3995,497C,EA95,6AE5),
+	bn_pack4(DE2B,CBF6,9558,1718),
+	bn_pack4(B5C5,5DF0,6F4C,52C9),
+	bn_pack4(9B27,83A2,EC07,A28F),
+	bn_pack4(E39E,772C,180E,8603),
+	bn_pack4(3290,5E46,2E36,CE3B),
+	bn_pack4(F174,6C08,CA18,217C),
+	bn_pack4(670C,354E,4ABC,9804),
+	bn_pack4(9ED5,2907,7096,966D),
+	bn_pack4(1C62,F356,2085,52BB),
+	bn_pack4(8365,5D23,DCA3,AD96),
+	bn_pack4(6916,3FA8,FD24,CF5F),
+	bn_pack4(98DA,4836,1C55,D39A),
+	bn_pack4(C200,7CB8,A163,BF05),
+	bn_pack4(4928,6651,ECE4,5B3D),
+	bn_pack4(AE9F,2411,7C4B,1FE6),
+	bn_pack4(EE38,6BFB,5A89,9FA5),
+	bn_pack4(0BFF,5CB6,F406,B7ED),
+	bn_pack4(F44C,42E9,A637,ED6B),
+	bn_pack4(E485,B576,625E,7EC6),
+	bn_pack4(4FE1,356D,6D51,C245),
+	bn_pack4(302B,0A6D,F25F,1437),
+	bn_pack4(EF95,19B3,CD3A,431B),
+	bn_pack4(514A,0879,8E34,04DD),
+	bn_pack4(020B,BEA6,3B13,9B22),
+	bn_pack4(2902,4E08,8A67,CC74),
+	bn_pack4(C4C6,628B,80DC,1CD1),
+	bn_pack4(C90F,DAA2,2168,C234),
+	bn_pack4(FFFF,FFFF,FFFF,FFFF)
+};
+static BIGNUM bn_group_4096 = {
+	bn_group_4096_value,
+	(sizeof bn_group_4096_value)/sizeof(BN_ULONG),
+	(sizeof bn_group_4096_value)/sizeof(BN_ULONG),
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_6144_value[] = {
+	bn_pack4(FFFF,FFFF,FFFF,FFFF),
+	bn_pack4(E694,F91E,6DCC,4024),
+	bn_pack4(12BF,2D5B,0B74,74D6),
+	bn_pack4(043E,8F66,3F48,60EE),
+	bn_pack4(387F,E8D7,6E3C,0468),
+	bn_pack4(DA56,C9EC,2EF2,9632),
+	bn_pack4(EB19,CCB1,A313,D55C),
+	bn_pack4(F550,AA3D,8A1F,BFF0),
+	bn_pack4(06A1,D58B,B7C5,DA76),
+	bn_pack4(A797,15EE,F29B,E328),
+	bn_pack4(14CC,5ED2,0F80,37E0),
+	bn_pack4(CC8F,6D7E,BF48,E1D8),
+	bn_pack4(4BD4,07B2,2B41,54AA),
+	bn_pack4(0F1D,45B7,FF58,5AC5),
+	bn_pack4(23A9,7A7E,36CC,88BE),
+	bn_pack4(59E7,C97F,BEC7,E8F3),
+	bn_pack4(B5A8,4031,900B,1C9E),
+	bn_pack4(D55E,702F,4698,0C82),
+	bn_pack4(F482,D7CE,6E74,FEF6),
+	bn_pack4(F032,EA15,D172,1D03),
+	bn_pack4(5983,CA01,C64B,92EC),
+	bn_pack4(6FB8,F401,378C,D2BF),
+	bn_pack4(3320,5151,2BD7,AF42),
+	bn_pack4(DB7F,1447,E6CC,254B),
+	bn_pack4(44CE,6CBA,CED4,BB1B),
+	bn_pack4(DA3E,DBEB,CF9B,14ED),
+	bn_pack4(1797,27B0,865A,8918),
+	bn_pack4(B06A,53ED,9027,D831),
+	bn_pack4(E5DB,382F,4130,01AE),
+	bn_pack4(F8FF,9406,AD9E,530E),
+	bn_pack4(C975,1E76,3DBA,37BD),
+	bn_pack4(C1D4,DCB2,6026,46DE),
+	bn_pack4(36C3,FAB4,D27C,7026),
+	bn_pack4(4DF4,35C9,3402,8492),
+	bn_pack4(86FF,B7DC,90A6,C08F),
+	bn_pack4(93B4,EA98,8D8F,DDC1),
+	bn_pack4(D006,9127,D5B0,5AA9),
+	bn_pack4(B81B,DD76,2170,481C),
+	bn_pack4(1F61,2970,CEE2,D7AF),
+	bn_pack4(233B,A186,515B,E7ED),
+	bn_pack4(99B2,964F,A090,C3A2),
+	bn_pack4(287C,5947,4E6B,C05D),
+	bn_pack4(2E8E,FC14,1FBE,CAA6),
+	bn_pack4(DBBB,C2DB,04DE,8EF9),
+	bn_pack4(2583,E9CA,2AD4,4CE8),
+	bn_pack4(1A94,6834,B615,0BDA),
+	bn_pack4(99C3,2718,6AF4,E23C),
+	bn_pack4(8871,9A10,BDBA,5B26),
+	bn_pack4(1A72,3C12,A787,E6D7),
+	bn_pack4(4B82,D120,A921,0801),
+	bn_pack4(43DB,5BFC,E0FD,108E),
+	bn_pack4(08E2,4FA0,74E5,AB31),
+	bn_pack4(7709,88C0,BAD9,46E2),
+	bn_pack4(BBE1,1757,7A61,5D6C),
+	bn_pack4(521F,2B18,177B,200C),
+	bn_pack4(D876,0273,3EC8,6A64),
+	bn_pack4(F12F,FA06,D98A,0864),
+	bn_pack4(CEE3,D226,1AD2,EE6B),
+	bn_pack4(1E8C,94E0,4A25,619D),
+	bn_pack4(ABF5,AE8C,DB09,33D7),
+	bn_pack4(B397,0F85,A6E1,E4C7),
+	bn_pack4(8AEA,7157,5D06,0C7D),
+	bn_pack4(ECFB,8504,58DB,EF0A),
+	bn_pack4(A855,21AB,DF1C,BA64),
+	bn_pack4(AD33,170D,0450,7A33),
+	bn_pack4(1572,8E5A,8AAA,C42D),
+	bn_pack4(15D2,2618,98FA,0510),
+	bn_pack4(3995,497C,EA95,6AE5),
+	bn_pack4(DE2B,CBF6,9558,1718),
+	bn_pack4(B5C5,5DF0,6F4C,52C9),
+	bn_pack4(9B27,83A2,EC07,A28F),
+	bn_pack4(E39E,772C,180E,8603),
+	bn_pack4(3290,5E46,2E36,CE3B),
+	bn_pack4(F174,6C08,CA18,217C),
+	bn_pack4(670C,354E,4ABC,9804),
+	bn_pack4(9ED5,2907,7096,966D),
+	bn_pack4(1C62,F356,2085,52BB),
+	bn_pack4(8365,5D23,DCA3,AD96),
+	bn_pack4(6916,3FA8,FD24,CF5F),
+	bn_pack4(98DA,4836,1C55,D39A),
+	bn_pack4(C200,7CB8,A163,BF05),
+	bn_pack4(4928,6651,ECE4,5B3D),
+	bn_pack4(AE9F,2411,7C4B,1FE6),
+	bn_pack4(EE38,6BFB,5A89,9FA5),
+	bn_pack4(0BFF,5CB6,F406,B7ED),
+	bn_pack4(F44C,42E9,A637,ED6B),
+	bn_pack4(E485,B576,625E,7EC6),
+	bn_pack4(4FE1,356D,6D51,C245),
+	bn_pack4(302B,0A6D,F25F,1437),
+	bn_pack4(EF95,19B3,CD3A,431B),
+	bn_pack4(514A,0879,8E34,04DD),
+	bn_pack4(020B,BEA6,3B13,9B22),
+	bn_pack4(2902,4E08,8A67,CC74),
+	bn_pack4(C4C6,628B,80DC,1CD1),
+	bn_pack4(C90F,DAA2,2168,C234),
+	bn_pack4(FFFF,FFFF,FFFF,FFFF)
+};
+static BIGNUM bn_group_6144 = {
+	bn_group_6144_value,
+	(sizeof bn_group_6144_value)/sizeof(BN_ULONG),
+	(sizeof bn_group_6144_value)/sizeof(BN_ULONG),
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_8192_value[] = {
+	bn_pack4(FFFF,FFFF,FFFF,FFFF),
+	bn_pack4(60C9,80DD,98ED,D3DF),
+	bn_pack4(C81F,56E8,80B9,6E71),
+	bn_pack4(9E30,50E2,7656,94DF),
+	bn_pack4(9558,E447,5677,E9AA),
+	bn_pack4(C919,0DA6,FC02,6E47),
+	bn_pack4(889A,002E,D5EE,382B),
+	bn_pack4(4009,438B,481C,6CD7),
+	bn_pack4(3590,46F4,EB87,9F92),
+	bn_pack4(FAF3,6BC3,1ECF,A268),
+	bn_pack4(B1D5,10BD,7EE7,4D73),
+	bn_pack4(F9AB,4819,5DED,7EA1),
+	bn_pack4(64F3,1CC5,0846,851D),
+	bn_pack4(4597,E899,A025,5DC1),
+	bn_pack4(DF31,0EE0,74AB,6A36),
+	bn_pack4(6D2A,13F8,3F44,F82D),
+	bn_pack4(062B,3CF5,B3A2,78A6),
+	bn_pack4(7968,3303,ED5B,DD3A),
+	bn_pack4(FA9D,4B7F,A2C0,87E8),
+	bn_pack4(4BCB,C886,2F83,85DD),
+	bn_pack4(3473,FC64,6CEA,306B),
+	bn_pack4(13EB,57A8,1A23,F0C7),
+	bn_pack4(2222,2E04,A403,7C07),
+	bn_pack4(E3FD,B8BE,FC84,8AD9),
+	bn_pack4(238F,16CB,E39D,652D),
+	bn_pack4(3423,B474,2BF1,C978),
+	bn_pack4(3AAB,639C,5AE4,F568),
+	bn_pack4(2576,F693,6BA4,2466),
+	bn_pack4(741F,A7BF,8AFC,47ED),
+	bn_pack4(3BC8,32B6,8D9D,D300),
+	bn_pack4(D8BE,C4D0,73B9,31BA),
+	bn_pack4(3877,7CB6,A932,DF8C),
+	bn_pack4(74A3,926F,12FE,E5E4),
+	bn_pack4(E694,F91E,6DBE,1159),
+	bn_pack4(12BF,2D5B,0B74,74D6),
+	bn_pack4(043E,8F66,3F48,60EE),
+	bn_pack4(387F,E8D7,6E3C,0468),
+	bn_pack4(DA56,C9EC,2EF2,9632),
+	bn_pack4(EB19,CCB1,A313,D55C),
+	bn_pack4(F550,AA3D,8A1F,BFF0),
+	bn_pack4(06A1,D58B,B7C5,DA76),
+	bn_pack4(A797,15EE,F29B,E328),
+	bn_pack4(14CC,5ED2,0F80,37E0),
+	bn_pack4(CC8F,6D7E,BF48,E1D8),
+	bn_pack4(4BD4,07B2,2B41,54AA),
+	bn_pack4(0F1D,45B7,FF58,5AC5),
+	bn_pack4(23A9,7A7E,36CC,88BE),
+	bn_pack4(59E7,C97F,BEC7,E8F3),
+	bn_pack4(B5A8,4031,900B,1C9E),
+	bn_pack4(D55E,702F,4698,0C82),
+	bn_pack4(F482,D7CE,6E74,FEF6),
+	bn_pack4(F032,EA15,D172,1D03),
+	bn_pack4(5983,CA01,C64B,92EC),
+	bn_pack4(6FB8,F401,378C,D2BF),
+	bn_pack4(3320,5151,2BD7,AF42),
+	bn_pack4(DB7F,1447,E6CC,254B),
+	bn_pack4(44CE,6CBA,CED4,BB1B),
+	bn_pack4(DA3E,DBEB,CF9B,14ED),
+	bn_pack4(1797,27B0,865A,8918),
+	bn_pack4(B06A,53ED,9027,D831),
+	bn_pack4(E5DB,382F,4130,01AE),
+	bn_pack4(F8FF,9406,AD9E,530E),
+	bn_pack4(C975,1E76,3DBA,37BD),
+	bn_pack4(C1D4,DCB2,6026,46DE),
+	bn_pack4(36C3,FAB4,D27C,7026),
+	bn_pack4(4DF4,35C9,3402,8492),
+	bn_pack4(86FF,B7DC,90A6,C08F),
+	bn_pack4(93B4,EA98,8D8F,DDC1),
+	bn_pack4(D006,9127,D5B0,5AA9),
+	bn_pack4(B81B,DD76,2170,481C),
+	bn_pack4(1F61,2970,CEE2,D7AF),
+	bn_pack4(233B,A186,515B,E7ED),
+	bn_pack4(99B2,964F,A090,C3A2),
+	bn_pack4(287C,5947,4E6B,C05D),
+	bn_pack4(2E8E,FC14,1FBE,CAA6),
+	bn_pack4(DBBB,C2DB,04DE,8EF9),
+	bn_pack4(2583,E9CA,2AD4,4CE8),
+	bn_pack4(1A94,6834,B615,0BDA),
+	bn_pack4(99C3,2718,6AF4,E23C),
+	bn_pack4(8871,9A10,BDBA,5B26),
+	bn_pack4(1A72,3C12,A787,E6D7),
+	bn_pack4(4B82,D120,A921,0801),
+	bn_pack4(43DB,5BFC,E0FD,108E),
+	bn_pack4(08E2,4FA0,74E5,AB31),
+	bn_pack4(7709,88C0,BAD9,46E2),
+	bn_pack4(BBE1,1757,7A61,5D6C),
+	bn_pack4(521F,2B18,177B,200C),
+	bn_pack4(D876,0273,3EC8,6A64),
+	bn_pack4(F12F,FA06,D98A,0864),
+	bn_pack4(CEE3,D226,1AD2,EE6B),
+	bn_pack4(1E8C,94E0,4A25,619D),
+	bn_pack4(ABF5,AE8C,DB09,33D7),
+	bn_pack4(B397,0F85,A6E1,E4C7),
+	bn_pack4(8AEA,7157,5D06,0C7D),
+	bn_pack4(ECFB,8504,58DB,EF0A),
+	bn_pack4(A855,21AB,DF1C,BA64),
+	bn_pack4(AD33,170D,0450,7A33),
+	bn_pack4(1572,8E5A,8AAA,C42D),
+	bn_pack4(15D2,2618,98FA,0510),
+	bn_pack4(3995,497C,EA95,6AE5),
+	bn_pack4(DE2B,CBF6,9558,1718),
+	bn_pack4(B5C5,5DF0,6F4C,52C9),
+	bn_pack4(9B27,83A2,EC07,A28F),
+	bn_pack4(E39E,772C,180E,8603),
+	bn_pack4(3290,5E46,2E36,CE3B),
+	bn_pack4(F174,6C08,CA18,217C),
+	bn_pack4(670C,354E,4ABC,9804),
+	bn_pack4(9ED5,2907,7096,966D),
+	bn_pack4(1C62,F356,2085,52BB),
+	bn_pack4(8365,5D23,DCA3,AD96),
+	bn_pack4(6916,3FA8,FD24,CF5F),
+	bn_pack4(98DA,4836,1C55,D39A),
+	bn_pack4(C200,7CB8,A163,BF05),
+	bn_pack4(4928,6651,ECE4,5B3D),
+	bn_pack4(AE9F,2411,7C4B,1FE6),
+	bn_pack4(EE38,6BFB,5A89,9FA5),
+	bn_pack4(0BFF,5CB6,F406,B7ED),
+	bn_pack4(F44C,42E9,A637,ED6B),
+	bn_pack4(E485,B576,625E,7EC6),
+	bn_pack4(4FE1,356D,6D51,C245),
+	bn_pack4(302B,0A6D,F25F,1437),
+	bn_pack4(EF95,19B3,CD3A,431B),
+	bn_pack4(514A,0879,8E34,04DD),
+	bn_pack4(020B,BEA6,3B13,9B22),
+	bn_pack4(2902,4E08,8A67,CC74),
+	bn_pack4(C4C6,628B,80DC,1CD1),
+	bn_pack4(C90F,DAA2,2168,C234),
+	bn_pack4(FFFF,FFFF,FFFF,FFFF)
+};
+static BIGNUM bn_group_8192 = {
+	bn_group_8192_value,
+	(sizeof bn_group_8192_value)/sizeof(BN_ULONG),
+	(sizeof bn_group_8192_value)/sizeof(BN_ULONG),
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_generator_19_value[] = {19} ;
+static BIGNUM bn_generator_19 = {
+	bn_generator_19_value,
+	1,
+	1,
+	0,
+	BN_FLG_STATIC_DATA
+};
+static BN_ULONG bn_generator_5_value[] = {5} ;
+static BIGNUM bn_generator_5 = {
+	bn_generator_5_value,
+	1,
+	1,
+	0,
+	BN_FLG_STATIC_DATA
+};
+static BN_ULONG bn_generator_2_value[] = {2} ;
+static BIGNUM bn_generator_2 = {
+	bn_generator_2_value,
+	1,
+	1,
+	0,
+	BN_FLG_STATIC_DATA
+};
+
+static SRP_gN knowngN[] = {
+	{"8192",&bn_generator_19 , &bn_group_8192},
+	{"6144",&bn_generator_5 , &bn_group_6144},
+	{"4096",&bn_generator_5 , &bn_group_4096},
+	{"3072",&bn_generator_5 , &bn_group_3072},
+	{"2048",&bn_generator_2 , &bn_group_2048},
+	{"1536",&bn_generator_2 , &bn_group_1536},
+	{"1024",&bn_generator_2 , &bn_group_1024},
+};
+#define KNOWN_GN_NUMBER sizeof(knowngN) / sizeof(SRP_gN)
+
+/* end of generated data */

diff --git a/crypto/srp/srp_lcl.h b/crypto/srp/srp_lcl.h
new file mode 100644
index 0000000..42bda3f
--- /dev/null
+++ b/crypto/srp/srp_lcl.h

@@ -0,0 +1,83 @@
+/* crypto/srp/srp_lcl.h */
+/* Written by Peter Sylvester ([email protected])  
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+#ifndef HEADER_SRP_LCL_H
+#define HEADER_SRP_LCL_H
+
+#include <openssl/srp.h>
+#include <openssl/sha.h>
+
+#if 0
+#define srp_bn_print(a) {fprintf(stderr, #a "="); BN_print_fp(stderr,a); \
+   fprintf(stderr,"\n");}
+#else
+#define   srp_bn_print(a)
+#endif
+
+
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif

diff --git a/crypto/srp/srp_lib.c b/crypto/srp/srp_lib.c
new file mode 100644
index 0000000..92cea98
--- /dev/null
+++ b/crypto/srp/srp_lib.c

@@ -0,0 +1,357 @@
+/* crypto/srp/srp_lib.c */
+/* Written by Christophe Renou ([email protected]) with 
+ * the precious help of Peter Sylvester ([email protected]) 
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+#ifndef OPENSSL_NO_SRP
+#include "cryptlib.h"
+#include "srp_lcl.h"
+#include <openssl/srp.h>
+#include <openssl/evp.h>
+
+#if (BN_BYTES == 8)
+#define bn_pack4(a1,a2,a3,a4) 0x##a1##a2##a3##a4##ul
+#endif
+#if (BN_BYTES == 4)
+#define bn_pack4(a1,a2,a3,a4)  0x##a3##a4##ul, 0x##a1##a2##ul
+#endif
+#if (BN_BYTES == 2)
+#define bn_pack4(a1,a2,a3,a4) 0x##a4##u,0x##a3##u,0x##a2##u,0x##a1##u
+#endif
+
+
+#include "srp_grps.h"
+
+static BIGNUM *srp_Calc_k(BIGNUM *N, BIGNUM *g)
+	{
+	/* k = SHA1(N | PAD(g)) -- tls-srp draft 8 */
+
+	unsigned char digest[SHA_DIGEST_LENGTH];
+	unsigned char *tmp;
+	EVP_MD_CTX ctxt;
+	int longg ;
+	int longN = BN_num_bytes(N);
+
+	if ((tmp = OPENSSL_malloc(longN)) == NULL)
+		return NULL;
+	BN_bn2bin(N,tmp) ;
+
+	EVP_MD_CTX_init(&ctxt);
+	EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+	EVP_DigestUpdate(&ctxt, tmp, longN);
+
+	memset(tmp, 0, longN);
+	longg = BN_bn2bin(g,tmp) ;
+        /* use the zeros behind to pad on left */
+	EVP_DigestUpdate(&ctxt, tmp + longg, longN-longg);
+	EVP_DigestUpdate(&ctxt, tmp, longg);
+	OPENSSL_free(tmp);
+
+	EVP_DigestFinal_ex(&ctxt, digest, NULL);
+	EVP_MD_CTX_cleanup(&ctxt);
+	return BN_bin2bn(digest, sizeof(digest), NULL);	
+	}
+
+BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N)
+	{
+	/* k = SHA1(PAD(A) || PAD(B) ) -- tls-srp draft 8 */
+
+	BIGNUM *u;	
+	unsigned char cu[SHA_DIGEST_LENGTH];
+	unsigned char *cAB;
+	EVP_MD_CTX ctxt;
+	int longN;  
+	if ((A == NULL) ||(B == NULL) || (N == NULL))
+		return NULL;
+
+	longN= BN_num_bytes(N);
+
+	if ((cAB = OPENSSL_malloc(2*longN)) == NULL) 
+		return NULL;
+
+	memset(cAB, 0, longN);
+
+	EVP_MD_CTX_init(&ctxt);
+	EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+	EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(A,cAB+longN), longN);
+	EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(B,cAB+longN), longN);
+	OPENSSL_free(cAB);
+	EVP_DigestFinal_ex(&ctxt, cu, NULL);
+	EVP_MD_CTX_cleanup(&ctxt);
+
+	if (!(u = BN_bin2bn(cu, sizeof(cu), NULL)))
+		return NULL;
+	if (!BN_is_zero(u))
+		return u;
+	BN_free(u);
+	return NULL;
+}
+
+BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N)
+	{
+	BIGNUM *tmp = NULL, *S = NULL;
+	BN_CTX *bn_ctx; 
+	
+	if (u == NULL || A == NULL || v == NULL || b == NULL || N == NULL)
+		return NULL; 
+
+	if ((bn_ctx = BN_CTX_new()) == NULL ||
+		(tmp = BN_new()) == NULL ||
+		(S = BN_new()) == NULL )
+		goto err;
+
+	/* S = (A*v**u) ** b */ 
+
+	if (!BN_mod_exp(tmp,v,u,N,bn_ctx))
+		goto err;
+	if (!BN_mod_mul(tmp,A,tmp,N,bn_ctx))
+		goto err;
+	if (!BN_mod_exp(S,tmp,b,N,bn_ctx))
+		goto err;
+err:
+	BN_CTX_free(bn_ctx);
+	BN_clear_free(tmp);
+	return S;
+	}
+
+BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v)
+	{
+	BIGNUM  *kv = NULL, *gb = NULL;
+	BIGNUM *B = NULL, *k = NULL;
+	BN_CTX *bn_ctx;
+
+	if (b == NULL || N == NULL || g == NULL || v == NULL ||
+		(bn_ctx = BN_CTX_new()) == NULL)
+		return NULL; 
+
+	if ( (kv = BN_new()) == NULL ||
+		(gb = BN_new()) == NULL ||
+		(B = BN_new())== NULL)
+		goto err;
+
+	/* B = g**b + k*v */
+
+	if (!BN_mod_exp(gb,g,b,N,bn_ctx) ||
+	   !(k = srp_Calc_k(N,g)) ||
+	   !BN_mod_mul(kv,v,k,N,bn_ctx) || 
+	   !BN_mod_add(B,gb,kv,N,bn_ctx))
+		{
+		BN_free(B);
+		B = NULL;
+		}
+err:
+	BN_CTX_free(bn_ctx);
+	BN_clear_free(kv);
+	BN_clear_free(gb);
+	BN_free(k); 
+	return B;
+	}
+
+BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass)
+	{
+	unsigned char dig[SHA_DIGEST_LENGTH];
+	EVP_MD_CTX ctxt;
+	unsigned char *cs;
+
+	if ((s == NULL) ||
+		(user == NULL) ||
+		(pass == NULL))
+		return NULL;
+
+	if ((cs = OPENSSL_malloc(BN_num_bytes(s))) == NULL)
+		return NULL;
+
+	EVP_MD_CTX_init(&ctxt);
+	EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+	EVP_DigestUpdate(&ctxt, user, strlen(user));
+	EVP_DigestUpdate(&ctxt, ":", 1);
+	EVP_DigestUpdate(&ctxt, pass, strlen(pass));
+	EVP_DigestFinal_ex(&ctxt, dig, NULL);
+
+	EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+	BN_bn2bin(s,cs);
+	EVP_DigestUpdate(&ctxt, cs, BN_num_bytes(s));
+	OPENSSL_free(cs);
+	EVP_DigestUpdate(&ctxt, dig, sizeof(dig));
+	EVP_DigestFinal_ex(&ctxt, dig, NULL);
+	EVP_MD_CTX_cleanup(&ctxt);
+
+	return BN_bin2bn(dig, sizeof(dig), NULL);
+	}
+
+BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g)
+	{
+	BN_CTX *bn_ctx; 
+	BIGNUM * A = NULL;
+
+	if (a == NULL || N == NULL || g == NULL ||
+		(bn_ctx = BN_CTX_new()) == NULL) 
+		return NULL;
+
+	if ((A = BN_new()) != NULL &&
+	   !BN_mod_exp(A,g,a,N,bn_ctx))
+		{
+		BN_free(A);
+		A = NULL;
+		}
+	BN_CTX_free(bn_ctx);
+	return A;
+	}
+
+
+BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u)
+	{
+	BIGNUM *tmp = NULL, *tmp2 = NULL, *tmp3 = NULL , *k = NULL, *K = NULL;
+	BN_CTX *bn_ctx;
+
+	if (u == NULL || B == NULL || N == NULL || g == NULL || x == NULL || a == NULL ||
+		(bn_ctx = BN_CTX_new()) == NULL)
+		return NULL; 
+
+	if ((tmp = BN_new()) == NULL ||
+		(tmp2 = BN_new())== NULL ||
+		(tmp3 = BN_new())== NULL ||
+		(K = BN_new()) == NULL)
+		goto err;
+	
+	if (!BN_mod_exp(tmp,g,x,N,bn_ctx))
+		goto err;
+	if (!(k = srp_Calc_k(N,g)))
+		goto err;
+	if (!BN_mod_mul(tmp2,tmp,k,N,bn_ctx))
+		goto err;
+	if (!BN_mod_sub(tmp,B,tmp2,N,bn_ctx))
+		goto err;
+
+	if (!BN_mod_mul(tmp3,u,x,N,bn_ctx))
+		goto err;
+	if (!BN_mod_add(tmp2,a,tmp3,N,bn_ctx))
+		goto err;
+	if (!BN_mod_exp(K,tmp,tmp2,N,bn_ctx))
+		goto err;
+
+err :
+	BN_CTX_free(bn_ctx);
+	BN_clear_free(tmp);
+	BN_clear_free(tmp2);
+	BN_clear_free(tmp3);
+	BN_free(k);
+	return K;	
+	}
+
+int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N)
+	{
+	BIGNUM *r;
+	BN_CTX *bn_ctx; 
+	int ret = 0;
+
+	if (B == NULL || N == NULL ||
+		(bn_ctx = BN_CTX_new()) == NULL)
+		return 0;
+
+	if ((r = BN_new()) == NULL)
+		goto err;
+	/* Checks if B % N == 0 */
+	if (!BN_nnmod(r,B,N,bn_ctx))
+		goto err;
+	ret = !BN_is_zero(r);
+err:
+	BN_CTX_free(bn_ctx);
+	BN_free(r);
+	return ret;
+	}
+
+int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N)
+	{
+	/* Checks if A % N == 0 */
+	return SRP_Verify_B_mod_N(A,N) ;
+	}
+
+
+/* Check if G and N are kwown parameters. 
+   The values have been generated from the ietf-tls-srp draft version 8
+*/
+char *SRP_check_known_gN_param(BIGNUM *g, BIGNUM *N)
+	{
+	size_t i;
+	if ((g == NULL) || (N == NULL))
+		return 0;
+
+	srp_bn_print(g);
+	srp_bn_print(N);
+
+	for(i = 0; i < KNOWN_GN_NUMBER; i++)
+		{
+		if (BN_cmp(knowngN[i].g, g) == 0 && BN_cmp(knowngN[i].N, N) == 0) 
+			return knowngN[i].id;
+		}
+	return NULL;
+	}
+
+SRP_gN *SRP_get_default_gN(const char *id)
+	{
+	size_t i;
+
+	if (id == NULL) 
+		return knowngN;
+	for(i = 0; i < KNOWN_GN_NUMBER; i++)
+		{
+		if (strcmp(knowngN[i].id, id)==0)
+			return knowngN + i;
+		}
+	return NULL;
+	}
+#endif

diff --git a/crypto/srp/srp_vfy.c b/crypto/srp/srp_vfy.c
new file mode 100644
index 0000000..c8be907
--- /dev/null
+++ b/crypto/srp/srp_vfy.c

@@ -0,0 +1,657 @@
+/* crypto/srp/srp_vfy.c */
+/* Written by Christophe Renou ([email protected]) with 
+ * the precious help of Peter Sylvester ([email protected]) 
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+#ifndef OPENSSL_NO_SRP
+#include "cryptlib.h"
+#include "srp_lcl.h"
+#include <openssl/srp.h>
+#include <openssl/evp.h>
+#include <openssl/buffer.h>
+#include <openssl/rand.h>
+#include <openssl/txt_db.h>
+
+#define SRP_RANDOM_SALT_LEN 20
+#define MAX_LEN 2500
+
+static char b64table[] =
+  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./";
+
+/* the following two conversion routines have been inspired by code from Stanford */ 
+
+/*
+ * Convert a base64 string into raw byte array representation.
+ */
+static int t_fromb64(unsigned char *a, const char *src)
+	{
+	char *loc;
+	int i, j;
+	int size;
+
+	while(*src && (*src == ' ' || *src == '\t' || *src == '\n'))
+		++src;
+	size = strlen(src);
+	i = 0;
+	while(i < size)
+		{
+		loc = strchr(b64table, src[i]);
+		if(loc == (char *) 0) break;
+		else a[i] = loc - b64table;
+		++i;
+		}
+	size = i;
+	i = size - 1;
+	j = size;
+	while(1)
+		{
+		a[j] = a[i];
+		if(--i < 0) break;
+		a[j] |= (a[i] & 3) << 6;
+		--j;
+		a[j] = (unsigned char) ((a[i] & 0x3c) >> 2);
+		if(--i < 0) break;
+		a[j] |= (a[i] & 0xf) << 4;
+		--j;
+		a[j] = (unsigned char) ((a[i] & 0x30) >> 4);
+		if(--i < 0) break;
+		a[j] |= (a[i] << 2);
+
+		a[--j] = 0;
+		if(--i < 0) break;
+		}
+	while(a[j] == 0 && j <= size) ++j;
+	i = 0;
+	while (j <= size) a[i++] = a[j++];
+	return i;
+	}
+
+
+/*
+ * Convert a raw byte string into a null-terminated base64 ASCII string.
+ */
+static char *t_tob64(char *dst, const unsigned char *src, int size)
+	{
+	int c, pos = size % 3;
+	unsigned char b0 = 0, b1 = 0, b2 = 0, notleading = 0;
+	char *olddst = dst;
+
+	switch(pos)
+		{
+	case 1:
+		b2 = src[0];
+		break;
+	case 2:
+		b1 = src[0];
+		b2 = src[1];
+		break;
+		}
+
+	while(1)
+		{
+		c = (b0 & 0xfc) >> 2;
+		if(notleading || c != 0)
+			{
+			*dst++ = b64table[c];
+			notleading = 1;
+			}
+		c = ((b0 & 3) << 4) | ((b1 & 0xf0) >> 4);
+		if(notleading || c != 0)
+			{
+			*dst++ = b64table[c];
+			notleading = 1;
+			}
+		c = ((b1 & 0xf) << 2) | ((b2 & 0xc0) >> 6);
+		if(notleading || c != 0)
+			{
+			*dst++ = b64table[c];
+			notleading = 1;
+			}
+		c = b2 & 0x3f;
+		if(notleading || c != 0)
+			{
+			*dst++ = b64table[c];
+			notleading = 1;
+			}
+		if(pos >= size) break;
+		else
+			{
+			b0 = src[pos++];
+			b1 = src[pos++];
+			b2 = src[pos++];
+			}
+		}
+
+	*dst++ = '\0';
+	return olddst;
+	}
+
+static void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
+	{
+	if (user_pwd == NULL) 
+		return;
+	BN_free(user_pwd->s);
+	BN_clear_free(user_pwd->v);
+	OPENSSL_free(user_pwd->id);
+	OPENSSL_free(user_pwd->info);
+	OPENSSL_free(user_pwd);
+	}
+
+static SRP_user_pwd *SRP_user_pwd_new()
+	{
+	SRP_user_pwd *ret = OPENSSL_malloc(sizeof(SRP_user_pwd));
+	if (ret == NULL)
+		return NULL;								
+	ret->N = NULL;
+	ret->g = NULL;	
+	ret->s = NULL;
+	ret->v = NULL;
+	ret->id = NULL ;
+	ret->info = NULL;
+	return ret;
+	}
+
+static void SRP_user_pwd_set_gN(SRP_user_pwd *vinfo, const BIGNUM *g,
+				const BIGNUM *N)
+	{
+	vinfo->N = N;
+	vinfo->g = g;	
+	}
+
+static int SRP_user_pwd_set_ids(SRP_user_pwd *vinfo, const char *id,
+				const char *info)
+	{
+	if (id != NULL && NULL == (vinfo->id = BUF_strdup(id)))
+		return 0;
+	return (info == NULL || NULL != (vinfo->info = BUF_strdup(info))) ;
+	}
+
+static int SRP_user_pwd_set_sv(SRP_user_pwd *vinfo, const char *s,
+			       const char *v)
+	{
+	unsigned char tmp[MAX_LEN];
+	int len;
+
+	if (strlen(s) > MAX_LEN || strlen(v) > MAX_LEN) 
+		return 0; 
+	len = t_fromb64(tmp, v);
+	if (NULL == (vinfo->v = BN_bin2bn(tmp, len, NULL)) )
+		return 0;
+	len = t_fromb64(tmp, s);
+	return ((vinfo->s = BN_bin2bn(tmp, len, NULL)) != NULL) ;
+	}
+
+static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v)
+	{
+	vinfo->v = v;
+	vinfo->s = s;
+	return (vinfo->s != NULL && vinfo->v != NULL) ;
+	}
+
+SRP_VBASE *SRP_VBASE_new(char *seed_key)
+	{
+	SRP_VBASE *vb = (SRP_VBASE *) OPENSSL_malloc(sizeof(SRP_VBASE));
+
+	if (vb == NULL)
+		return NULL;
+	if (!(vb->users_pwd = sk_SRP_user_pwd_new_null()) ||
+		!(vb->gN_cache = sk_SRP_gN_cache_new_null()))
+		{
+		OPENSSL_free(vb);
+		return NULL;
+		}
+	vb->default_g = NULL;
+	vb->default_N = NULL;
+	vb->seed_key = NULL;
+	if ((seed_key != NULL) && 
+		(vb->seed_key = BUF_strdup(seed_key)) == NULL)
+		{
+		sk_SRP_user_pwd_free(vb->users_pwd);
+		sk_SRP_gN_cache_free(vb->gN_cache);
+		OPENSSL_free(vb);
+		return NULL;
+		}
+	return vb;
+	}
+
+
+int SRP_VBASE_free(SRP_VBASE *vb)
+	{
+	sk_SRP_user_pwd_pop_free(vb->users_pwd,SRP_user_pwd_free);
+	sk_SRP_gN_cache_free(vb->gN_cache);
+	OPENSSL_free(vb->seed_key);
+	OPENSSL_free(vb);
+	return 0;
+	}
+
+
+static SRP_gN_cache *SRP_gN_new_init(const char *ch)
+	{
+	unsigned char tmp[MAX_LEN];
+	int len;
+
+	SRP_gN_cache *newgN = (SRP_gN_cache *)OPENSSL_malloc(sizeof(SRP_gN_cache));
+	if (newgN == NULL)
+		return NULL;
+
+	if ((newgN->b64_bn = BUF_strdup(ch)) == NULL)
+		goto err;
+
+	len = t_fromb64(tmp, ch);
+	if ((newgN->bn = BN_bin2bn(tmp, len, NULL)))
+		return newgN;
+
+	OPENSSL_free(newgN->b64_bn);
+err:
+	OPENSSL_free(newgN);
+	return NULL;
+	}
+
+
+static void SRP_gN_free(SRP_gN_cache *gN_cache)
+	{
+	if (gN_cache == NULL)
+		return;
+	OPENSSL_free(gN_cache->b64_bn);
+	BN_free(gN_cache->bn);
+	OPENSSL_free(gN_cache);
+	}
+
+static SRP_gN *SRP_get_gN_by_id(const char *id, STACK_OF(SRP_gN) *gN_tab)
+	{
+	int i;
+
+	SRP_gN *gN;
+	if (gN_tab != NULL) 
+	for(i = 0; i < sk_SRP_gN_num(gN_tab); i++)
+		{
+		gN = sk_SRP_gN_value(gN_tab, i);
+		if (gN && (id == NULL || strcmp(gN->id,id)==0))
+			return gN;
+		}
+	
+	return SRP_get_default_gN(id);
+	}
+
+static BIGNUM *SRP_gN_place_bn(STACK_OF(SRP_gN_cache) *gN_cache, char *ch)
+	{
+	int i;
+	if (gN_cache == NULL)
+		return NULL;
+
+	/* search if we have already one... */
+	for(i = 0; i < sk_SRP_gN_cache_num(gN_cache); i++)
+		{
+		SRP_gN_cache *cache = sk_SRP_gN_cache_value(gN_cache, i);
+		if (strcmp(cache->b64_bn,ch)==0)
+			return cache->bn;
+		}
+		{		/* it is the first time that we find it */
+		SRP_gN_cache *newgN = SRP_gN_new_init(ch);
+		if (newgN)
+			{
+			if (sk_SRP_gN_cache_insert(gN_cache,newgN,0)>0)
+				return newgN->bn;
+			SRP_gN_free(newgN);
+			}
+		}
+	return NULL;
+	}
+
+/* this function parses verifier file. Format is:
+ * string(index):base64(N):base64(g):0
+ * string(username):base64(v):base64(salt):int(index)
+ */
+
+
+int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file)
+	{
+	int error_code ;
+	STACK_OF(SRP_gN) *SRP_gN_tab = sk_SRP_gN_new_null();
+	char *last_index = NULL;
+	int i;
+	char **pp;
+
+	SRP_gN *gN = NULL;
+	SRP_user_pwd *user_pwd = NULL ;
+
+	TXT_DB *tmpdb = NULL;
+	BIO *in = BIO_new(BIO_s_file());
+
+	error_code = SRP_ERR_OPEN_FILE;
+
+	if (in == NULL || BIO_read_filename(in,verifier_file) <= 0)
+		goto err;
+
+	error_code = SRP_ERR_VBASE_INCOMPLETE_FILE;
+
+	if ((tmpdb =TXT_DB_read(in,DB_NUMBER)) == NULL)
+		goto err;
+
+	error_code = SRP_ERR_MEMORY;
+
+
+	if (vb->seed_key)
+		{
+		last_index = SRP_get_default_gN(NULL)->id;
+		}
+	for (i = 0; i < sk_OPENSSL_PSTRING_num(tmpdb->data); i++)
+		{
+		pp = (char **)sk_OPENSSL_PSTRING_value(tmpdb->data,i);
+		if (pp[DB_srptype][0] == DB_SRP_INDEX)
+			{
+			/*we add this couple in the internal Stack */
+
+			if ((gN = (SRP_gN *)OPENSSL_malloc(sizeof(SRP_gN))) == NULL) 
+ 				goto err;
+
+			if  (!(gN->id = BUF_strdup(pp[DB_srpid]))
+	                ||  !(gN->N = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpverifier]))
+			||  !(gN->g = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpsalt]))
+			||  sk_SRP_gN_insert(SRP_gN_tab,gN,0) == 0)
+				goto err;
+
+			gN = NULL;
+
+			if (vb->seed_key != NULL)
+				{
+				last_index = pp[DB_srpid];
+				}
+			}
+		else if (pp[DB_srptype][0] == DB_SRP_VALID)
+			{
+			/* it is a user .... */
+			SRP_gN *lgN;
+			if ((lgN = SRP_get_gN_by_id(pp[DB_srpgN],SRP_gN_tab))!=NULL)
+				{
+				error_code = SRP_ERR_MEMORY;
+				if ((user_pwd = SRP_user_pwd_new()) == NULL) 
+					goto err;
+				
+				SRP_user_pwd_set_gN(user_pwd,lgN->g,lgN->N);
+				if (!SRP_user_pwd_set_ids(user_pwd, pp[DB_srpid],pp[DB_srpinfo]))
+					goto err;
+				
+				error_code = SRP_ERR_VBASE_BN_LIB;
+				if (!SRP_user_pwd_set_sv(user_pwd, pp[DB_srpsalt],pp[DB_srpverifier]))
+					goto err;
+
+				if (sk_SRP_user_pwd_insert(vb->users_pwd, user_pwd, 0) == 0)
+					goto err;
+				user_pwd = NULL; /* abandon responsability */
+				}
+			}
+		}
+	
+	if (last_index != NULL)
+		{
+		/* this means that we want to simulate a default user */
+
+		if (((gN = SRP_get_gN_by_id(last_index,SRP_gN_tab))==NULL))
+			{
+			error_code = SRP_ERR_VBASE_BN_LIB;
+			goto err;
+			}
+		vb->default_g = gN->g ;
+		vb->default_N = gN->N ;
+		gN = NULL ;
+		}
+	error_code = SRP_NO_ERROR;
+
+ err:
+	/* there may be still some leaks to fix, if this fails, the application terminates most likely */
+
+	if (gN != NULL)
+		{
+		OPENSSL_free(gN->id);
+		OPENSSL_free(gN);
+		}
+
+	SRP_user_pwd_free(user_pwd);
+
+	if (tmpdb) TXT_DB_free(tmpdb);
+	if (in) BIO_free_all(in);
+
+	sk_SRP_gN_free(SRP_gN_tab);
+
+	return error_code;
+
+	}
+
+
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+	{
+	int i;
+	SRP_user_pwd *user;
+	unsigned char digv[SHA_DIGEST_LENGTH];
+	unsigned char digs[SHA_DIGEST_LENGTH];
+	EVP_MD_CTX ctxt;
+
+	if (vb == NULL)
+		return NULL;
+	for(i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++)
+		{
+		user = sk_SRP_user_pwd_value(vb->users_pwd, i);
+		if (strcmp(user->id,username)==0)
+			return user;
+		}
+	if ((vb->seed_key == NULL) ||
+		(vb->default_g == NULL) ||
+		(vb->default_N == NULL))
+		return NULL;
+
+/* if the user is unknown we set parameters as well if we have a seed_key */
+
+	if ((user = SRP_user_pwd_new()) == NULL) 
+		return NULL;
+
+	SRP_user_pwd_set_gN(user,vb->default_g,vb->default_N);
+				
+	if (!SRP_user_pwd_set_ids(user,username,NULL))
+		goto err;
+		
+	RAND_pseudo_bytes(digv, SHA_DIGEST_LENGTH);
+	EVP_MD_CTX_init(&ctxt);
+	EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+	EVP_DigestUpdate(&ctxt, vb->seed_key, strlen(vb->seed_key));
+	EVP_DigestUpdate(&ctxt, username, strlen(username));
+	EVP_DigestFinal_ex(&ctxt, digs, NULL);
+	EVP_MD_CTX_cleanup(&ctxt);
+	if (SRP_user_pwd_set_sv_BN(user, BN_bin2bn(digs,SHA_DIGEST_LENGTH,NULL), BN_bin2bn(digv,SHA_DIGEST_LENGTH, NULL))) 
+		return user;
+
+err:    SRP_user_pwd_free(user);
+	return NULL;
+	}
+
+
+/*
+   create a verifier (*salt,*verifier,g and N are in base64)
+*/
+char *SRP_create_verifier(const char *user, const char *pass, char **salt,
+			  char **verifier, const char *N, const char *g)
+	{
+	int len;
+	char * result=NULL;
+	char *vf;
+	BIGNUM *N_bn = NULL, *g_bn = NULL, *s = NULL, *v = NULL;
+	unsigned char tmp[MAX_LEN];
+	unsigned char tmp2[MAX_LEN];
+	char * defgNid = NULL;
+
+	if ((user == NULL)||
+		(pass == NULL)||
+		(salt == NULL)||
+		(verifier == NULL))
+		goto err;
+
+	if (N)
+		{
+		if (!(len = t_fromb64(tmp, N))) goto err;
+		N_bn = BN_bin2bn(tmp, len, NULL);
+		if (!(len = t_fromb64(tmp, g))) goto err;
+		g_bn = BN_bin2bn(tmp, len, NULL);
+		defgNid = "*";
+		}
+	else
+		{ 
+		SRP_gN * gN = SRP_get_gN_by_id(g, NULL) ;
+		if (gN == NULL)
+			goto err;
+		N_bn = gN->N;
+		g_bn = gN->g;
+		defgNid = gN->id;
+		}
+
+	if (*salt == NULL)
+		{
+		RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN);
+
+		s = BN_bin2bn(tmp2, SRP_RANDOM_SALT_LEN, NULL);
+		}
+	else
+		{
+		if (!(len = t_fromb64(tmp2, *salt)))
+			goto err;
+		s = BN_bin2bn(tmp2, len, NULL);
+		}
+
+
+	if(!SRP_create_verifier_BN(user, pass, &s, &v, N_bn, g_bn)) goto err;
+
+	BN_bn2bin(v,tmp);
+	if (((vf = OPENSSL_malloc(BN_num_bytes(v)*2)) == NULL))
+		goto err;
+	t_tob64(vf, tmp, BN_num_bytes(v));
+
+	*verifier = vf;
+	if (*salt == NULL)
+		{
+		char *tmp_salt;
+		if ((tmp_salt = (char *)OPENSSL_malloc(SRP_RANDOM_SALT_LEN * 2)) == NULL)
+			{
+			OPENSSL_free(vf);
+			goto err;
+			}
+		t_tob64(tmp_salt, tmp2, SRP_RANDOM_SALT_LEN);
+		*salt = tmp_salt;
+		}
+
+	result=defgNid;
+
+err:
+	if(N)
+		{
+		BN_free(N_bn);
+		BN_free(g_bn);
+		}
+	return result;
+	}
+
+/*
+   create a verifier (*salt,*verifier,g and N are BIGNUMs)
+*/
+int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g)
+	{
+	int result=0;
+	BIGNUM *x = NULL;
+	BN_CTX *bn_ctx = BN_CTX_new();
+	unsigned char tmp2[MAX_LEN];
+
+	if ((user == NULL)||
+		(pass == NULL)||
+		(salt == NULL)||
+		(verifier == NULL)||
+		(N == NULL)||
+		(g == NULL)||
+		(bn_ctx == NULL))
+		goto err;
+
+	srp_bn_print(N);
+	srp_bn_print(g);
+
+	if (*salt == NULL)
+		{
+		RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN);
+
+		*salt = BN_bin2bn(tmp2,SRP_RANDOM_SALT_LEN,NULL);
+		}
+
+	x = SRP_Calc_x(*salt,user,pass);
+
+	*verifier = BN_new();
+	if(*verifier == NULL) goto err;
+
+	if (!BN_mod_exp(*verifier,g,x,N,bn_ctx))
+		{
+		BN_clear_free(*verifier);
+		goto err;
+		}
+
+	srp_bn_print(*verifier);
+
+	result=1;
+
+err:
+
+	BN_clear_free(x);
+	BN_CTX_free(bn_ctx);
+	return result;
+	}
+
+
+
+#endif

diff --git a/crypto/stack/safestack.h b/crypto/stack/safestack.h
index 3e76aa5..ea3aa0d 100644
--- a/crypto/stack/safestack.h
+++ b/crypto/stack/safestack.h

@@ -1459,6 +1459,94 @@
 #define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st))
 #define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st))
 
+#define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp))
+#define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN)
+#define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st))
+#define sk_SRP_gN_num(st) SKM_sk_num(SRP_gN, (st))
+#define sk_SRP_gN_value(st, i) SKM_sk_value(SRP_gN, (st), (i))
+#define sk_SRP_gN_set(st, i, val) SKM_sk_set(SRP_gN, (st), (i), (val))
+#define sk_SRP_gN_zero(st) SKM_sk_zero(SRP_gN, (st))
+#define sk_SRP_gN_push(st, val) SKM_sk_push(SRP_gN, (st), (val))
+#define sk_SRP_gN_unshift(st, val) SKM_sk_unshift(SRP_gN, (st), (val))
+#define sk_SRP_gN_find(st, val) SKM_sk_find(SRP_gN, (st), (val))
+#define sk_SRP_gN_find_ex(st, val) SKM_sk_find_ex(SRP_gN, (st), (val))
+#define sk_SRP_gN_delete(st, i) SKM_sk_delete(SRP_gN, (st), (i))
+#define sk_SRP_gN_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN, (st), (ptr))
+#define sk_SRP_gN_insert(st, val, i) SKM_sk_insert(SRP_gN, (st), (val), (i))
+#define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp))
+#define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st)
+#define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func))
+#define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st))
+#define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st))
+#define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st))
+#define sk_SRP_gN_is_sorted(st) SKM_sk_is_sorted(SRP_gN, (st))
+
+#define sk_SRP_gN_cache_new(cmp) SKM_sk_new(SRP_gN_cache, (cmp))
+#define sk_SRP_gN_cache_new_null() SKM_sk_new_null(SRP_gN_cache)
+#define sk_SRP_gN_cache_free(st) SKM_sk_free(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_num(st) SKM_sk_num(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_value(st, i) SKM_sk_value(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_set(st, i, val) SKM_sk_set(SRP_gN_cache, (st), (i), (val))
+#define sk_SRP_gN_cache_zero(st) SKM_sk_zero(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_push(st, val) SKM_sk_push(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_unshift(st, val) SKM_sk_unshift(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find(st, val) SKM_sk_find(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find_ex(st, val) SKM_sk_find_ex(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_delete(st, i) SKM_sk_delete(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN_cache, (st), (ptr))
+#define sk_SRP_gN_cache_insert(st, val, i) SKM_sk_insert(SRP_gN_cache, (st), (val), (i))
+#define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp))
+#define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st)
+#define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func))
+#define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_is_sorted(st) SKM_sk_is_sorted(SRP_gN_cache, (st))
+
+#define sk_SRP_user_pwd_new(cmp) SKM_sk_new(SRP_user_pwd, (cmp))
+#define sk_SRP_user_pwd_new_null() SKM_sk_new_null(SRP_user_pwd)
+#define sk_SRP_user_pwd_free(st) SKM_sk_free(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_num(st) SKM_sk_num(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_value(st, i) SKM_sk_value(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_set(st, i, val) SKM_sk_set(SRP_user_pwd, (st), (i), (val))
+#define sk_SRP_user_pwd_zero(st) SKM_sk_zero(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_push(st, val) SKM_sk_push(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_unshift(st, val) SKM_sk_unshift(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find(st, val) SKM_sk_find(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find_ex(st, val) SKM_sk_find_ex(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_delete(st, i) SKM_sk_delete(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_user_pwd, (st), (ptr))
+#define sk_SRP_user_pwd_insert(st, val, i) SKM_sk_insert(SRP_user_pwd, (st), (val), (i))
+#define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp))
+#define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st)
+#define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func))
+#define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_is_sorted(st) SKM_sk_is_sorted(SRP_user_pwd, (st))
+
+#define sk_SRTP_PROTECTION_PROFILE_new(cmp) SKM_sk_new(SRTP_PROTECTION_PROFILE, (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_new_null() SKM_sk_new_null(SRTP_PROTECTION_PROFILE)
+#define sk_SRTP_PROTECTION_PROFILE_free(st) SKM_sk_free(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_num(st) SKM_sk_num(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_value(st, i) SKM_sk_value(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set(st, i, val) SKM_sk_set(SRTP_PROTECTION_PROFILE, (st), (i), (val))
+#define sk_SRTP_PROTECTION_PROFILE_zero(st) SKM_sk_zero(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_push(st, val) SKM_sk_push(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_unshift(st, val) SKM_sk_unshift(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find(st, val) SKM_sk_find(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find_ex(st, val) SKM_sk_find_ex(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_delete(st, i) SKM_sk_delete(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRTP_PROTECTION_PROFILE, (st), (ptr))
+#define sk_SRTP_PROTECTION_PROFILE_insert(st, val, i) SKM_sk_insert(SRTP_PROTECTION_PROFILE, (st), (val), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st)
+#define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func))
+#define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_is_sorted(st) SKM_sk_is_sorted(SRTP_PROTECTION_PROFILE, (st))
+
 #define sk_SSL_CIPHER_new(cmp) SKM_sk_new(SSL_CIPHER, (cmp))
 #define sk_SSL_CIPHER_new_null() SKM_sk_new_null(SSL_CIPHER)
 #define sk_SSL_CIPHER_free(st) SKM_sk_free(SSL_CIPHER, (st))
@@ -2056,31 +2144,6 @@
 #define sk_OPENSSL_STRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_STRING, (st))
 
 
-#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
-#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
-#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
-#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
-#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
-#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
-#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp)  \
-	((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
-	sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
-#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
-
-
 #define sk_OPENSSL_BLOCK_new(cmp) ((STACK_OF(OPENSSL_BLOCK) *)sk_new(CHECKED_SK_CMP_FUNC(void, cmp)))
 #define sk_OPENSSL_BLOCK_new_null() ((STACK_OF(OPENSSL_BLOCK) *)sk_new_null())
 #define sk_OPENSSL_BLOCK_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val))
@@ -2106,6 +2169,31 @@
 #define sk_OPENSSL_BLOCK_is_sorted(st) SKM_sk_is_sorted(OPENSSL_BLOCK, (st))
 
 
+#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
+#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
+#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
+#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
+#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
+#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
+#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp)  \
+	((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
+	sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
+#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
+
+
 #define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \
 	SKM_ASN1_SET_OF_d2i(ACCESS_DESCRIPTION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) 
 #define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, i2d_func, ex_tag, ex_class, is_set) \

diff --git a/crypto/symhacks.h b/crypto/symhacks.h
index 3fd4a81..403f592 100644
--- a/crypto/symhacks.h
+++ b/crypto/symhacks.h

@@ -176,7 +176,6 @@
 #define SSL_CTX_set_default_passwd_cb_userdata  SSL_CTX_set_def_passwd_cb_ud
 #undef SSL_COMP_get_compression_methods
 #define SSL_COMP_get_compression_methods	SSL_COMP_get_compress_methods
-
 #undef ssl_add_clienthello_renegotiate_ext
 #define ssl_add_clienthello_renegotiate_ext	ssl_add_clienthello_reneg_ext
 #undef ssl_add_serverhello_renegotiate_ext
@@ -185,6 +184,26 @@
 #define ssl_parse_clienthello_renegotiate_ext	ssl_parse_clienthello_reneg_ext
 #undef ssl_parse_serverhello_renegotiate_ext
 #define ssl_parse_serverhello_renegotiate_ext	ssl_parse_serverhello_reneg_ext
+#undef SSL_srp_server_param_with_username
+#define SSL_srp_server_param_with_username	SSL_srp_server_param_with_un
+#undef SSL_CTX_set_srp_client_pwd_callback
+#define SSL_CTX_set_srp_client_pwd_callback	SSL_CTX_set_srp_client_pwd_cb
+#undef SSL_CTX_set_srp_verify_param_callback
+#define SSL_CTX_set_srp_verify_param_callback	SSL_CTX_set_srp_vfy_param_cb
+#undef SSL_CTX_set_srp_username_callback
+#define SSL_CTX_set_srp_username_callback	SSL_CTX_set_srp_un_cb
+#undef ssl_add_clienthello_use_srtp_ext
+#define ssl_add_clienthello_use_srtp_ext ssl_add_clihello_use_srtp_ext
+#undef ssl_add_serverhello_use_srtp_ext
+#define ssl_add_serverhello_use_srtp_ext ssl_add_serhello_use_srtp_ext
+#undef ssl_parse_clienthello_use_srtp_ext
+#define ssl_parse_clienthello_use_srtp_ext ssl_parse_clihello_use_srtp_ext
+#undef ssl_parse_serverhello_use_srtp_ext
+#define ssl_parse_serverhello_use_srtp_ext ssl_parse_serhello_use_srtp_ext
+#undef SSL_CTX_set_next_protos_advertised_cb
+#define SSL_CTX_set_next_protos_advertised_cb SSL_CTX_set_next_protos_adv_cb
+#undef SSL_CTX_set_next_proto_select_cb
+#define SSL_CTX_set_next_proto_select_cb SSL_CTX_set_next_proto_sel_cb
 
 /* Hack some long ENGINE names */
 #undef ENGINE_get_default_BN_mod_exp_crt
@@ -238,6 +257,9 @@
 #define EC_GROUP_get_point_conversion_form	EC_GROUP_get_point_conv_form
 #undef EC_GROUP_clear_free_all_extra_data
 #define EC_GROUP_clear_free_all_extra_data	EC_GROUP_clr_free_all_xtra_data
+#undef EC_KEY_set_public_key_affine_coordinates
+#define EC_KEY_set_public_key_affine_coordinates \
+						EC_KEY_set_pub_key_aff_coords
 #undef EC_POINT_set_Jprojective_coordinates_GFp
 #define EC_POINT_set_Jprojective_coordinates_GFp \
                                                 EC_POINT_set_Jproj_coords_GFp
@@ -399,6 +421,12 @@
 #undef dtls1_retransmit_buffered_messages
 #define dtls1_retransmit_buffered_messages	dtls1_retransmit_buffered_msgs
 
+/* Hack some long SRP names */
+#undef SRP_generate_server_master_secret
+#define SRP_generate_server_master_secret	SRP_gen_server_master_secret
+#undef SRP_generate_client_master_secret
+#define SRP_generate_client_master_secret	SRP_gen_client_master_secret
+
 /* Hack some long UI names */
 #undef UI_method_get_prompt_constructor
 #define UI_method_get_prompt_constructor	UI_method_get_prompt_constructr

diff --git a/crypto/ui/ui.h b/crypto/ui/ui.h
index 2b1cfa2..bd78aa4 100644
--- a/crypto/ui/ui.h
+++ b/crypto/ui/ui.h

@@ -316,7 +316,7 @@
 int (*UI_method_get_flusher(UI_METHOD *method))(UI*);
 int (*UI_method_get_reader(UI_METHOD *method))(UI*,UI_STRING*);
 int (*UI_method_get_closer(UI_METHOD *method))(UI*);
-char* (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
+char * (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
 
 /* The following functions are helpers for method writers to access relevant
    data from a UI_STRING. */

diff --git a/crypto/x509/x509.h b/crypto/x509/x509.h
index e6f8a40..092dd74 100644
--- a/crypto/x509/x509.h
+++ b/crypto/x509/x509.h

@@ -657,11 +657,15 @@
 
 int NETSCAPE_SPKI_print(BIO *out, NETSCAPE_SPKI *spki);
 
+int X509_signature_dump(BIO *bp,const ASN1_STRING *sig, int indent);
 int X509_signature_print(BIO *bp,X509_ALGOR *alg, ASN1_STRING *sig);
 
 int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx);
 int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx);
 int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx);
 int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md);
 
 int X509_pubkey_digest(const X509 *data,const EVP_MD *type,
@@ -763,6 +767,7 @@
 int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype, void *pval);
 void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
 						X509_ALGOR *algor);
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md);
 
 X509_NAME *X509_NAME_dup(X509_NAME *xn);
 X509_NAME_ENTRY *X509_NAME_ENTRY_dup(X509_NAME_ENTRY *ne);
@@ -896,6 +901,9 @@
 int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
 	ASN1_BIT_STRING *signature,
 	void *data, EVP_PKEY *pkey, const EVP_MD *type);
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+		X509_ALGOR *algor1, X509_ALGOR *algor2,
+	     	ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx);
 #endif
 
 int 		X509_set_version(X509 *x,long version);
@@ -1161,6 +1169,9 @@
 				 unsigned char *salt, int saltlen,
 				 unsigned char *aiv, int prf_nid);
 
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+				int prf_nid, int keylen);
+
 /* PKCS#8 utilities */
 
 DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO)

diff --git a/crypto/x509/x509_cmp.c b/crypto/x509/x509_cmp.c
index 4bc9da0..7c2aaee 100644
--- a/crypto/x509/x509_cmp.c
+++ b/crypto/x509/x509_cmp.c

@@ -87,15 +87,20 @@
 	EVP_MD_CTX_init(&ctx);
 	f=X509_NAME_oneline(a->cert_info->issuer,NULL,0);
 	ret=strlen(f);
-	EVP_DigestInit_ex(&ctx, EVP_md5(), NULL);
-	EVP_DigestUpdate(&ctx,(unsigned char *)f,ret);
+	if (!EVP_DigestInit_ex(&ctx, EVP_md5(), NULL))
+		goto err;
+	if (!EVP_DigestUpdate(&ctx,(unsigned char *)f,ret))
+		goto err;
 	OPENSSL_free(f);
-	EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
-		(unsigned long)a->cert_info->serialNumber->length);
-	EVP_DigestFinal_ex(&ctx,&(md[0]),NULL);
+	if(!EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
+		(unsigned long)a->cert_info->serialNumber->length))
+		goto err;
+	if (!EVP_DigestFinal_ex(&ctx,&(md[0]),NULL))
+		goto err;
 	ret=(	((unsigned long)md[0]     )|((unsigned long)md[1]<<8L)|
 		((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
 		)&0xffffffffL;
+	err:
 	EVP_MD_CTX_cleanup(&ctx);
 	return(ret);
 	}
@@ -219,7 +224,9 @@
 
 	/* Make sure X509_NAME structure contains valid cached encoding */
 	i2d_X509_NAME(x,NULL);
-	EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(),
+		NULL))
+		return 0;
 
 	ret=(	((unsigned long)md[0]     )|((unsigned long)md[1]<<8L)|
 		((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
@@ -234,12 +241,18 @@
 
 unsigned long X509_NAME_hash_old(X509_NAME *x)
 	{
+	EVP_MD_CTX md_ctx;
 	unsigned long ret=0;
 	unsigned char md[16];
 
 	/* Make sure X509_NAME structure contains valid cached encoding */
 	i2d_X509_NAME(x,NULL);
-	EVP_Digest(x->bytes->data, x->bytes->length, md, NULL, EVP_md5(), NULL);
+	EVP_MD_CTX_init(&md_ctx);
+	EVP_MD_CTX_set_flags(&md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+	EVP_DigestInit_ex(&md_ctx, EVP_md5(), NULL);
+	EVP_DigestUpdate(&md_ctx, x->bytes->data, x->bytes->length);
+	EVP_DigestFinal_ex(&md_ctx,md,NULL);
+	EVP_MD_CTX_cleanup(&md_ctx);
 
 	ret=(	((unsigned long)md[0]     )|((unsigned long)md[1]<<8L)|
 		((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)

diff --git a/crypto/x509/x509type.c b/crypto/x509/x509type.c
index 3385ad3..9702ec5 100644
--- a/crypto/x509/x509type.c
+++ b/crypto/x509/x509type.c

@@ -100,20 +100,26 @@
 		break;
 		}
 
-	i=X509_get_signature_type(x);
-	switch (i)
+	i=OBJ_obj2nid(x->sig_alg->algorithm);
+	if (i && OBJ_find_sigid_algs(i, NULL, &i))
 		{
-	case EVP_PKEY_RSA:
-		ret|=EVP_PKS_RSA;
-		break;
-	case EVP_PKEY_DSA:
-		ret|=EVP_PKS_DSA;
-		break;
-	case EVP_PKEY_EC:
-		ret|=EVP_PKS_EC;
-		break;
-	default:
-		break;
+
+		switch (i)
+			{
+		case NID_rsaEncryption:
+		case NID_rsa:
+			ret|=EVP_PKS_RSA;
+			break;
+		case NID_dsa:
+		case NID_dsa_2:
+			ret|=EVP_PKS_DSA;
+			break;
+		case NID_X9_62_id_ecPublicKey:
+			ret|=EVP_PKS_EC;
+			break;
+		default:
+			break;
+			}
 		}
 
 	if (EVP_PKEY_size(pk) <= 1024/8)/* /8 because it's 1024 bits we look

diff --git a/crypto/x509/x_all.c b/crypto/x509/x_all.c
index 8ec88c2..b94aeeb 100644
--- a/crypto/x509/x_all.c
+++ b/crypto/x509/x_all.c

@@ -95,12 +95,25 @@
 		x->sig_alg, x->signature, x->cert_info,pkey,md));
 	}
 
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx)
+	{
+	return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CINF),
+		x->cert_info->signature,
+		x->sig_alg, x->signature, x->cert_info, ctx);
+	}
+
 int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md)
 	{
 	return(ASN1_item_sign(ASN1_ITEM_rptr(X509_REQ_INFO),x->sig_alg, NULL,
 		x->signature, x->req_info,pkey,md));
 	}
 
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx)
+	{
+	return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_REQ_INFO),
+		x->sig_alg, NULL, x->signature, x->req_info, ctx);
+	}
+
 int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md)
 	{
 	x->crl->enc.modified = 1;
@@ -108,6 +121,12 @@
 		x->sig_alg, x->signature, x->crl,pkey,md));
 	}
 
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx)
+	{
+	return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CRL_INFO),
+		x->crl->sig_alg, x->sig_alg, x->signature, x->crl, ctx);
+	}
+
 int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md)
 	{
 	return(ASN1_item_sign(ASN1_ITEM_rptr(NETSCAPE_SPKAC), x->sig_algor,NULL,

diff --git a/crypto/x509v3/v3_skey.c b/crypto/x509v3/v3_skey.c
index 202c9e4..0a984fb 100644
--- a/crypto/x509v3/v3_skey.c
+++ b/crypto/x509v3/v3_skey.c

@@ -129,7 +129,8 @@
 		goto err;
 	}
 
-	EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL);
+	if (!EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL))
+		goto err;
 
 	if(!M_ASN1_OCTET_STRING_set(oct, pkey_dig, diglen)) {
 		X509V3err(X509V3_F_S2I_SKEY_ID,ERR_R_MALLOC_FAILURE);

diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index c96821a..7b7b93b 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl

@@ -7,15 +7,24 @@
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
 
-if ($win64)	{ $arg1="%rcx"; $arg2="%rdx"; }
-else		{ $arg1="%rdi"; $arg2="%rsi"; }
+open STDOUT,"| $^X $xlate $flavour $output";
+
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+				 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
+
 print<<___;
 .extern		OPENSSL_cpuid_setup
+.hidden		OPENSSL_cpuid_setup
 .section	.init
 	call	OPENSSL_cpuid_setup
 
+.hidden	OPENSSL_ia32cap_P
+.comm	OPENSSL_ia32cap_P,8,4
+
 .text
 
 .globl	OPENSSL_atomic_add
@@ -46,7 +55,7 @@
 .type	OPENSSL_ia32_cpuid,\@abi-omnipotent
 .align	16
 OPENSSL_ia32_cpuid:
-	mov	%rbx,%r8
+	mov	%rbx,%r8		# save %rbx
 
 	xor	%eax,%eax
 	cpuid
@@ -78,7 +87,15 @@
 	# AMD specific
 	mov	\$0x80000000,%eax
 	cpuid
-	cmp	\$0x80000008,%eax
+	cmp	\$0x80000001,%eax
+	jb	.Lintel
+	mov	%eax,%r10d
+	mov	\$0x80000001,%eax
+	cpuid
+	or	%ecx,%r9d
+	and	\$0x00000801,%r9d	# isolate AMD XOP bit, 1<<11
+
+	cmp	\$0x80000008,%r10d
 	jb	.Lintel
 
 	mov	\$0x80000008,%eax
@@ -89,12 +106,12 @@
 	mov	\$1,%eax
 	cpuid
 	bt	\$28,%edx		# test hyper-threading bit
-	jnc	.Ldone
+	jnc	.Lgeneric
 	shr	\$16,%ebx		# number of logical processors
 	cmp	%r10b,%bl
-	ja	.Ldone
+	ja	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
-	jmp	.Ldone
+	jmp	.Lgeneric
 
 .Lintel:
 	cmp	\$4,%r11d
@@ -111,30 +128,47 @@
 .Lnocacheinfo:
 	mov	\$1,%eax
 	cpuid
+	and	\$0xbfefffff,%edx	# force reserved bits to 0
 	cmp	\$0,%r9d
 	jne	.Lnotintel
-	or	\$0x00100000,%edx	# use reserved 20th bit to engage RC4_CHAR
+	or	\$0x40000000,%edx	# set reserved bit#30 on Intel CPUs
 	and	\$15,%ah
 	cmp	\$15,%ah		# examine Family ID
-	je	.Lnotintel
-	or	\$0x40000000,%edx	# use reserved bit to skip unrolled loop
+	jne	.Lnotintel
+	or	\$0x00100000,%edx	# set reserved bit#20 to engage RC4_CHAR
 .Lnotintel:
 	bt	\$28,%edx		# test hyper-threading bit
-	jnc	.Ldone
+	jnc	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
 	cmp	\$0,%r10d
-	je	.Ldone
+	je	.Lgeneric
 
 	or	\$0x10000000,%edx	# 1<<28
 	shr	\$16,%ebx
 	cmp	\$1,%bl			# see if cache is shared
-	ja	.Ldone
+	ja	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
+.Lgeneric:
+	and	\$0x00000800,%r9d	# isolate AMD XOP flag
+	and	\$0xfffff7ff,%ecx
+	or	%ecx,%r9d		# merge AMD XOP flag
+
+	mov	%edx,%r10d		# %r9d:%r10d is copy of %ecx:%edx
+	bt	\$27,%r9d		# check OSXSAVE bit
+	jnc	.Lclear_avx
+	xor	%ecx,%ecx		# XCR0
+	.byte	0x0f,0x01,0xd0		# xgetbv
+	and	\$6,%eax		# isolate XMM and YMM state support
+	cmp	\$6,%eax
+	je	.Ldone
+.Lclear_avx:
+	mov	\$0xefffe7ff,%eax	# ~(1<<28|1<<12|1<<11)
+	and	%eax,%r9d		# clear AVX, FMA and AMD XOP bits
 .Ldone:
-	shl	\$32,%rcx
-	mov	%edx,%eax
-	mov	%r8,%rbx
-	or	%rcx,%rax
+	shl	\$32,%r9
+	mov	%r10d,%eax
+	mov	%r8,%rbx		# restore %rbx
+	or	%r9,%rax
 	ret
 .size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
 
@@ -229,4 +263,21 @@
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 ___
 
+print<<___;
+.globl	OPENSSL_ia32_rdrand
+.type	OPENSSL_ia32_rdrand,\@abi-omnipotent
+.align	16
+OPENSSL_ia32_rdrand:
+	mov	\$8,%ecx
+.Loop_rdrand:
+	rdrand	%rax
+	jc	.Lbreak_rdrand
+	loop	.Loop_rdrand
+.Lbreak_rdrand:
+	cmp	\$0,%rax
+	cmove	%rcx,%rax
+	ret
+.size	OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
+___
+
 close STDOUT;	# flush

diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index a7464af..39fd8f2 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl

@@ -19,9 +19,9 @@
 	&pushf	();
 	&pop	("eax");
 	&xor	("ecx","eax");
-	&bt	("ecx",21);
-	&jnc	(&label("done"));
 	&xor	("eax","eax");
+	&bt	("ecx",21);
+	&jnc	(&label("nocpuid"));
 	&cpuid	();
 	&mov	("edi","eax");		# max value for standard query level
 
@@ -51,7 +51,14 @@
 	# AMD specific
 	&mov	("eax",0x80000000);
 	&cpuid	();
-	&cmp	("eax",0x80000008);
+	&cmp	("eax",0x80000001);
+	&jb	(&label("intel"));
+	&mov	("esi","eax");
+	&mov	("eax",0x80000001);
+	&cpuid	();
+	&or	("ebp","ecx");
+	&and	("ebp",1<<11|1);	# isolate XOP bit
+	&cmp	("esi",0x80000008);
 	&jb	(&label("intel"));
 
 	&mov	("eax",0x80000008);
@@ -62,13 +69,13 @@
 	&mov	("eax",1);
 	&cpuid	();
 	&bt	("edx",28);
-	&jnc	(&label("done"));
+	&jnc	(&label("generic"));
 	&shr	("ebx",16);
 	&and	("ebx",0xff);
 	&cmp	("ebx","esi");
-	&ja	(&label("done"));
+	&ja	(&label("generic"));
 	&and	("edx",0xefffffff);	# clear hyper-threading bit
-	&jmp	(&label("done"));
+	&jmp	(&label("generic"));
 	
 &set_label("intel");
 	&cmp	("edi",4);
@@ -85,27 +92,51 @@
 &set_label("nocacheinfo");
 	&mov	("eax",1);
 	&cpuid	();
+	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
 	&cmp	("ebp",0);
-	&jne	(&label("notP4"));
+	&jne	(&label("notintel"));
+	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
 	&and	(&HB("eax"),15);	# familiy ID
 	&cmp	(&HB("eax"),15);	# P4?
-	&jne	(&label("notP4"));
-	&or	("edx",1<<20);		# use reserved bit to engage RC4_CHAR
-&set_label("notP4");
+	&jne	(&label("notintel"));
+	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
+&set_label("notintel");
 	&bt	("edx",28);		# test hyper-threading bit
-	&jnc	(&label("done"));
+	&jnc	(&label("generic"));
 	&and	("edx",0xefffffff);
 	&cmp	("edi",0);
-	&je	(&label("done"));
+	&je	(&label("generic"));
 
 	&or	("edx",0x10000000);
 	&shr	("ebx",16);
 	&cmp	(&LB("ebx"),1);
-	&ja	(&label("done"));
+	&ja	(&label("generic"));
 	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
+
+&set_label("generic");
+	&and	("ebp",1<<11);		# isolate AMD XOP flag
+	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
+	&mov	("esi","edx");
+	&or	("ebp","ecx");		# merge AMD XOP flag
+
+	&bt	("ecx",27);		# check OSXSAVE bit
+	&jnc	(&label("clear_avx"));
+	&xor	("ecx","ecx");
+	&data_byte(0x0f,0x01,0xd0);	# xgetbv
+	&and	("eax",6);
+	&cmp	("eax",6);
+	&je	(&label("done"));
+	&cmp	("eax",2);
+	&je	(&label("clear_avx"));
+&set_label("clear_xmm");
+	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
+	&and	("esi",0xfeffffff);	# clear FXSR
+&set_label("clear_avx");
+	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
 &set_label("done");
-	&mov	("eax","edx");
-	&mov	("edx","ecx");
+	&mov	("eax","esi");
+	&mov	("edx","ebp");
+&set_label("nocpuid");
 &function_end("OPENSSL_ia32_cpuid");
 
 &external_label("OPENSSL_ia32cap_P");
@@ -199,8 +230,9 @@
 	&bt	(&DWP(0,"ecx"),1);
 	&jnc	(&label("no_x87"));
 	if ($sse2) {
-		&bt	(&DWP(0,"ecx"),26);
-		&jnc	(&label("no_sse2"));
+		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
+		&cmp	("ecx",1<<26|1<<24);
+		&jne	(&label("no_sse2"));
 		&pxor	("xmm0","xmm0");
 		&pxor	("xmm1","xmm1");
 		&pxor	("xmm2","xmm2");
@@ -307,6 +339,18 @@
 	&ret	();
 &function_end_B("OPENSSL_cleanse");
 
+&function_begin_B("OPENSSL_ia32_rdrand");
+	&mov	("ecx",8);
+&set_label("loop");
+	&rdrand	("eax");
+	&jc	(&label("break"));
+	&loop	(&label("loop"));
+&set_label("break");
+	&cmp	("eax",0);
+	&cmove	("eax","ecx");
+	&ret	();
+&function_end_B("OPENSSL_ia32_rdrand");
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();

diff --git a/e_os.h b/e_os.h
index 5ceeeeb..79c1392 100644
--- a/e_os.h
+++ b/e_os.h

@@ -99,7 +99,6 @@
 #  ifndef MAC_OS_GUSI_SOURCE
 #    define MAC_OS_pre_X
 #    define NO_SYS_TYPES_H
-     typedef long ssize_t;
 #  endif
 #  define NO_SYS_PARAM_H
 #  define NO_CHMOD
@@ -340,8 +339,6 @@
 #    define OPENSSL_NO_POSIX_IO
 #  endif
 
-#  define ssize_t long
-
 #  if defined (__BORLANDC__)
 #    define _setmode setmode
 #    define _O_TEXT O_TEXT
@@ -456,9 +453,6 @@
                          * (unless when compiling with -D_POSIX_SOURCE,
                          * which doesn't work for us) */
 #    endif
-#    if defined(NeXT) || defined(OPENSSL_SYS_NEWS4) || defined(OPENSSL_SYS_SUNOS)
-#      define ssize_t int /* ditto */
-#    endif
 #    ifdef OPENSSL_SYS_NEWS4 /* setvbuf is missing on mips-sony-bsd */
 #      define setvbuf(a, b, c, d) setbuffer((a), (b), (d))
        typedef unsigned long clock_t;
@@ -637,12 +631,6 @@
 
 #endif
 
-#if defined(__ultrix)
-#  ifndef ssize_t
-#    define ssize_t int 
-#  endif
-#endif
-
 #if defined(sun) && !defined(__svr4__) && !defined(__SVR4)
   /* include headers first, so our defines don't break it */
 #include <stdlib.h>

diff --git a/e_os2.h b/e_os2.h
index d30724d..d22c036 100644
--- a/e_os2.h
+++ b/e_os2.h

@@ -289,6 +289,26 @@
 # define OPENSSL_GLOBAL_REF(name) _shadow_##name
 #endif
 
+#if defined(OPENSSL_SYS_MACINTOSH_CLASSIC) && macintosh==1 && !defined(MAC_OS_GUSI_SOURCE)
+#  define ossl_ssize_t long
+#endif
+
+#ifdef OPENSSL_SYS_MSDOS
+#  define ossl_ssize_t long
+#endif
+
+#if defined(NeXT) || defined(OPENSSL_SYS_NEWS4) || defined(OPENSSL_SYS_SUNOS)
+#  define ssize_t int
+#endif
+
+#if defined(__ultrix) && !defined(ssize_t)
+#  define ossl_ssize_t int 
+#endif
+
+#ifndef ossl_ssize_t
+#  define ossl_ssize_t ssize_t
+#endif
+
 #ifdef  __cplusplus
 }
 #endif

diff --git a/include/openssl/aes.h b/include/openssl/aes.h
index d2c9973..031abf0 100644
--- a/include/openssl/aes.h
+++ b/include/openssl/aes.h

@@ -90,6 +90,11 @@
 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 	AES_KEY *key);
 
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+
 void AES_encrypt(const unsigned char *in, unsigned char *out,
 	const AES_KEY *key);
 void AES_decrypt(const unsigned char *in, unsigned char *out,

diff --git a/include/openssl/asn1.h b/include/openssl/asn1.h
index 59540e4..220a0c8 100644
--- a/include/openssl/asn1.h
+++ b/include/openssl/asn1.h

@@ -235,7 +235,7 @@
  */
 #define ASN1_STRING_FLAG_MSTRING 0x040 
 /* This is the base type that holds just about everything :-) */
-typedef struct asn1_string_st
+struct asn1_string_st
 	{
 	int length;
 	int type;
@@ -245,7 +245,7 @@
 	 * input data has a non-zero 'unused bits' value, it will be
 	 * handled correctly */
 	long flags;
-	} ASN1_STRING;
+	};
 
 /* ASN1_ENCODING structure: this is used to save the received
  * encoding of an ASN1 type. This is useful to get round
@@ -293,7 +293,6 @@
  * see asn1t.h
  */
 typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE;
-typedef struct ASN1_ITEM_st ASN1_ITEM;
 typedef struct ASN1_TLC_st ASN1_TLC;
 /* This is just an opaque pointer */
 typedef struct ASN1_VALUE_st ASN1_VALUE;
@@ -1194,6 +1193,7 @@
 #define ASN1_F_ASN1_ITEM_I2D_FP				 193
 #define ASN1_F_ASN1_ITEM_PACK				 198
 #define ASN1_F_ASN1_ITEM_SIGN				 195
+#define ASN1_F_ASN1_ITEM_SIGN_CTX			 220
 #define ASN1_F_ASN1_ITEM_UNPACK				 199
 #define ASN1_F_ASN1_ITEM_VERIFY				 197
 #define ASN1_F_ASN1_MBSTRING_NCOPY			 122
@@ -1266,6 +1266,7 @@
 #define ASN1_F_PKCS5_PBE2_SET_IV			 167
 #define ASN1_F_PKCS5_PBE_SET				 202
 #define ASN1_F_PKCS5_PBE_SET0_ALGOR			 215
+#define ASN1_F_PKCS5_PBKDF2_SET				 219
 #define ASN1_F_SMIME_READ_ASN1				 212
 #define ASN1_F_SMIME_TEXT				 213
 #define ASN1_F_X509_CINF_NEW				 168
@@ -1291,6 +1292,7 @@
 #define ASN1_R_BOOLEAN_IS_WRONG_LENGTH			 106
 #define ASN1_R_BUFFER_TOO_SMALL				 107
 #define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER		 108
+#define ASN1_R_CONTEXT_NOT_INITIALISED			 217
 #define ASN1_R_DATA_IS_WRONG				 109
 #define ASN1_R_DECODE_ERROR				 110
 #define ASN1_R_DECODING_ERROR				 111

diff --git a/include/openssl/bio.h b/include/openssl/bio.h
index 3c39d18..05699ab 100644
--- a/include/openssl/bio.h
+++ b/include/openssl/bio.h

@@ -68,6 +68,14 @@
 
 #include <openssl/crypto.h>
 
+#ifndef OPENSSL_NO_SCTP
+# ifndef OPENSSL_SYS_VMS
+# include <stdint.h>
+# else
+# include <inttypes.h>
+# endif
+#endif
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -95,6 +103,9 @@
 #define BIO_TYPE_BIO		(19|0x0400)		/* (half a) BIO pair */
 #define BIO_TYPE_LINEBUFFER	(20|0x0200)		/* filter */
 #define BIO_TYPE_DGRAM		(21|0x0400|0x0100)
+#ifndef OPENSSL_NO_SCTP
+#define BIO_TYPE_DGRAM_SCTP	(24|0x0400|0x0100)
+#endif
 #define BIO_TYPE_ASN1 		(22|0x0200)		/* filter */
 #define BIO_TYPE_COMP 		(23|0x0200)		/* filter */
 
@@ -162,7 +173,22 @@
 #define BIO_CTRL_DGRAM_SET_PEER           44 /* Destination for the data */
 
 #define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT   45 /* Next DTLS handshake timeout to
-											  * adjust socket timeouts */
+                                              * adjust socket timeouts */
+
+#ifndef OPENSSL_NO_SCTP
+/* SCTP stuff */
+#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE	50
+#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY		51
+#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY		52
+#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD		53
+#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO		60
+#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO		61
+#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO		62
+#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO		63
+#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO			64
+#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO			65
+#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN		70
+#endif
 
 /* modifiers */
 #define BIO_FP_READ		0x02
@@ -332,6 +358,34 @@
 /* Prefix and suffix callback in ASN1 BIO */
 typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg);
 
+#ifndef OPENSSL_NO_SCTP
+/* SCTP parameter structs */
+struct bio_dgram_sctp_sndinfo
+	{
+	uint16_t snd_sid;
+	uint16_t snd_flags;
+	uint32_t snd_ppid;
+	uint32_t snd_context;
+	};
+
+struct bio_dgram_sctp_rcvinfo
+	{
+	uint16_t rcv_sid;
+	uint16_t rcv_ssn;
+	uint16_t rcv_flags;
+	uint32_t rcv_ppid;
+	uint32_t rcv_tsn;
+	uint32_t rcv_cumtsn;
+	uint32_t rcv_context;
+	};
+
+struct bio_dgram_sctp_prinfo
+	{
+	uint16_t pr_policy;
+	uint32_t pr_value;
+	};
+#endif
+
 /* connect BIO stuff */
 #define BIO_CONN_S_BEFORE		1
 #define BIO_CONN_S_GET_IP		2
@@ -629,6 +683,9 @@
 BIO_METHOD *BIO_f_nbio_test(void);
 #ifndef OPENSSL_NO_DGRAM
 BIO_METHOD *BIO_s_datagram(void);
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void);
+#endif
 #endif
 
 /* BIO_METHOD *BIO_f_ber(void); */
@@ -671,6 +728,15 @@
 
 BIO *BIO_new_socket(int sock, int close_flag);
 BIO *BIO_new_dgram(int fd, int close_flag);
+#ifndef OPENSSL_NO_SCTP
+BIO *BIO_new_dgram_sctp(int fd, int close_flag);
+int BIO_dgram_is_sctp(BIO *bio);
+int BIO_dgram_sctp_notification_cb(BIO *b,
+                                   void (*handle_notifications)(BIO *bio, void *context, void *buf),
+                                   void *context);
+int BIO_dgram_sctp_wait_for_dry(BIO *b);
+int BIO_dgram_sctp_msg_waiting(BIO *b);
+#endif
 BIO *BIO_new_fd(int fd, int close_flag);
 BIO *BIO_new_connect(char *host_port);
 BIO *BIO_new_accept(char *host_port);
@@ -735,6 +801,7 @@
 #define BIO_F_BUFFER_CTRL				 114
 #define BIO_F_CONN_CTRL					 127
 #define BIO_F_CONN_STATE				 115
+#define BIO_F_DGRAM_SCTP_READ				 132
 #define BIO_F_FILE_CTRL					 116
 #define BIO_F_FILE_READ					 130
 #define BIO_F_LINEBUFFER_CTRL				 129

diff --git a/include/openssl/blowfish.h b/include/openssl/blowfish.h
index b97e76f..4b6c892 100644
--- a/include/openssl/blowfish.h
+++ b/include/openssl/blowfish.h

@@ -104,7 +104,9 @@
 	BF_LONG S[4*256];
 	} BF_KEY;
 
- 
+#ifdef OPENSSL_FIPS 
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data);
+#endif
 void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
 
 void BF_encrypt(BF_LONG *data,const BF_KEY *key);

diff --git a/include/openssl/bn.h b/include/openssl/bn.h
index a0bc478..f34248e 100644
--- a/include/openssl/bn.h
+++ b/include/openssl/bn.h

@@ -558,6 +558,17 @@
 int	BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
 		int do_trial_division, BN_GENCB *cb);
 
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+			const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
+			const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+			BIGNUM *Xp1, BIGNUM *Xp2,
+			const BIGNUM *Xp,
+			const BIGNUM *e, BN_CTX *ctx,
+			BN_GENCB *cb);
+
 BN_MONT_CTX *BN_MONT_CTX_new(void );
 void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
 int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
@@ -612,6 +623,8 @@
 int	BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
 	BN_RECP_CTX *recp, BN_CTX *ctx);
 
+#ifndef OPENSSL_NO_EC2M
+
 /* Functions for arithmetic over binary polynomials represented by BIGNUMs. 
  *
  * The BIGNUM::neg property of BIGNUMs representing binary polynomials is
@@ -663,6 +676,8 @@
 int	BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
 int	BN_GF2m_arr2poly(const int p[], BIGNUM *a);
 
+#endif
+
 /* faster mod functions for the 'NIST primes' 
  * 0 <= a < p^2 */
 int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);

diff --git a/include/openssl/cmac.h b/include/openssl/cmac.h
new file mode 100644
index 0000000..712e92d
--- /dev/null
+++ b/include/openssl/cmac.h

@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson ([email protected]) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMAC_H
+#define HEADER_CMAC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/evp.h>
+
+/* Opaque */
+typedef struct CMAC_CTX_st CMAC_CTX;
+
+CMAC_CTX *CMAC_CTX_new(void);
+void CMAC_CTX_cleanup(CMAC_CTX *ctx);
+void CMAC_CTX_free(CMAC_CTX *ctx);
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, 
+			const EVP_CIPHER *cipher, ENGINE *impl);
+int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
+int CMAC_resume(CMAC_CTX *ctx);
+
+#ifdef  __cplusplus
+}
+#endif
+#endif

diff --git a/include/openssl/crypto.h b/include/openssl/crypto.h
index b0360ce..6aeda0a 100644
--- a/include/openssl/crypto.h
+++ b/include/openssl/crypto.h

@@ -547,6 +547,33 @@
 #define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
 int OPENSSL_isservice(void);
 
+int FIPS_mode(void);
+int FIPS_mode_set(int r);
+
+void OPENSSL_init(void);
+
+#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
+
+#ifdef OPENSSL_FIPS
+#define fips_md_init_ctx(alg, cx) \
+	int alg##_Init(cx##_CTX *c) \
+	{ \
+	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+		"Low level API call to digest " #alg " forbidden in FIPS mode!"); \
+	return private_##alg##_Init(c); \
+	} \
+	int private_##alg##_Init(cx##_CTX *c)
+
+#define fips_cipher_abort(alg) \
+	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+		"Low level API call to cipher " #alg " forbidden in FIPS mode!")
+
+#else
+#define fips_md_init_ctx(alg, cx) \
+	int alg##_Init(cx##_CTX *c)
+#define fips_cipher_abort(alg) while(0)
+#endif
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -562,11 +589,13 @@
 #define CRYPTO_F_CRYPTO_SET_EX_DATA			 102
 #define CRYPTO_F_DEF_ADD_INDEX				 104
 #define CRYPTO_F_DEF_GET_CLASS				 105
+#define CRYPTO_F_FIPS_MODE_SET				 109
 #define CRYPTO_F_INT_DUP_EX_DATA			 106
 #define CRYPTO_F_INT_FREE_EX_DATA			 107
 #define CRYPTO_F_INT_NEW_EX_DATA			 108
 
 /* Reason codes. */
+#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED		 101
 #define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK		 100
 
 #ifdef  __cplusplus

diff --git a/include/openssl/des.h b/include/openssl/des.h
index 92b6663..1eaedcb 100644
--- a/include/openssl/des.h
+++ b/include/openssl/des.h

@@ -224,6 +224,9 @@
 int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule);
 int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule);
 void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#ifdef OPENSSL_FIPS
+void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#endif
 void DES_string_to_key(const char *str,DES_cblock *key);
 void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2);
 void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length,

diff --git a/include/openssl/dh.h b/include/openssl/dh.h
index 849309a..ea59e61 100644
--- a/include/openssl/dh.h
+++ b/include/openssl/dh.h

@@ -86,6 +86,21 @@
                                        * be used for all exponents.
                                        */
 
+/* If this flag is set the DH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its reposibility
+ * to ensure the result is compliant.
+ */
+
+#define DH_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DH_FLAG_NON_FIPS_ALLOW			0x0400
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -230,6 +245,9 @@
 #define DH_F_COMPUTE_KEY				 102
 #define DH_F_DHPARAMS_PRINT_FP				 101
 #define DH_F_DH_BUILTIN_GENPARAMS			 106
+#define DH_F_DH_COMPUTE_KEY				 114
+#define DH_F_DH_GENERATE_KEY				 115
+#define DH_F_DH_GENERATE_PARAMETERS_EX			 116
 #define DH_F_DH_NEW_METHOD				 105
 #define DH_F_DH_PARAM_DECODE				 107
 #define DH_F_DH_PRIV_DECODE				 110
@@ -249,7 +267,9 @@
 #define DH_R_DECODE_ERROR				 104
 #define DH_R_INVALID_PUBKEY				 102
 #define DH_R_KEYS_NOT_SET				 108
+#define DH_R_KEY_SIZE_TOO_SMALL				 110
 #define DH_R_MODULUS_TOO_LARGE				 103
+#define DH_R_NON_FIPS_METHOD				 111
 #define DH_R_NO_PARAMETERS_SET				 107
 #define DH_R_NO_PRIVATE_VALUE				 100
 #define DH_R_PARAMETER_ENCODING_ERROR			 105

diff --git a/include/openssl/dsa.h b/include/openssl/dsa.h
index ac50a5c..a6f6d0b 100644
--- a/include/openssl/dsa.h
+++ b/include/openssl/dsa.h

@@ -97,6 +97,21 @@
                                               * be used for all exponents.
                                               */
 
+/* If this flag is set the DSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its reposibility
+ * to ensure the result is compliant.
+ */
+
+#define DSA_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DSA_FLAG_NON_FIPS_ALLOW			0x0400
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -272,6 +287,8 @@
 #define DSA_F_DSAPARAMS_PRINT_FP			 101
 #define DSA_F_DSA_DO_SIGN				 112
 #define DSA_F_DSA_DO_VERIFY				 113
+#define DSA_F_DSA_GENERATE_KEY				 124
+#define DSA_F_DSA_GENERATE_PARAMETERS_EX		 123
 #define DSA_F_DSA_NEW_METHOD				 103
 #define DSA_F_DSA_PARAM_DECODE				 119
 #define DSA_F_DSA_PRINT_FP				 105
@@ -282,6 +299,7 @@
 #define DSA_F_DSA_SIGN					 106
 #define DSA_F_DSA_SIGN_SETUP				 107
 #define DSA_F_DSA_SIG_NEW				 109
+#define DSA_F_DSA_SIG_PRINT				 125
 #define DSA_F_DSA_VERIFY				 108
 #define DSA_F_I2D_DSA_SIG				 111
 #define DSA_F_OLD_DSA_PRIV_DECODE			 122
@@ -298,6 +316,8 @@
 #define DSA_R_INVALID_DIGEST_TYPE			 106
 #define DSA_R_MISSING_PARAMETERS			 101
 #define DSA_R_MODULUS_TOO_LARGE				 103
+#define DSA_R_NEED_NEW_SETUP_VALUES			 110
+#define DSA_R_NON_FIPS_DSA_METHOD			 111
 #define DSA_R_NO_PARAMETERS_SET				 107
 #define DSA_R_PARAMETER_ENCODING_ERROR			 105
 

diff --git a/include/openssl/dtls1.h b/include/openssl/dtls1.h
index 2900d1d..5008bf6 100644
--- a/include/openssl/dtls1.h
+++ b/include/openssl/dtls1.h

@@ -105,6 +105,11 @@
 #define DTLS1_AL_HEADER_LENGTH                   2
 #endif
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_AUTH_LABEL	"EXPORTER_DTLS_OVER_SCTP"
+#endif
 
 typedef struct dtls1_bitmap_st
 	{
@@ -227,7 +232,7 @@
 
 	struct dtls1_timeout_st timeout;
 
-	/* Indicates when the last handshake msg sent will timeout */
+	/* Indicates when the last handshake msg or heartbeat sent will timeout */
 	struct timeval next_timeout;
 
 	/* Timeout duration */
@@ -243,6 +248,13 @@
 	unsigned int retransmitting;
 	unsigned int change_cipher_spec_ok;
 
+#ifndef OPENSSL_NO_SCTP
+	/* used when SSL_ST_XX_FLUSH is entered */
+	int next_state;
+
+	int shutdown_received;
+#endif
+
 	} DTLS1_STATE;
 
 typedef struct dtls1_record_data_st
@@ -251,8 +263,12 @@
 	unsigned int   packet_length;
 	SSL3_BUFFER    rbuf;
 	SSL3_RECORD    rrec;
+#ifndef OPENSSL_NO_SCTP
+	struct bio_dgram_sctp_rcvinfo recordinfo;
+#endif
 	} DTLS1_RECORD_DATA;
 
+#endif
 
 /* Timeout multipliers (timeout slice is defined in apps/timeouts.h */
 #define DTLS1_TMO_READ_COUNT                      2

diff --git a/include/openssl/e_os2.h b/include/openssl/e_os2.h
index d30724d..d22c036 100644
--- a/include/openssl/e_os2.h
+++ b/include/openssl/e_os2.h

@@ -289,6 +289,26 @@
 # define OPENSSL_GLOBAL_REF(name) _shadow_##name
 #endif
 
+#if defined(OPENSSL_SYS_MACINTOSH_CLASSIC) && macintosh==1 && !defined(MAC_OS_GUSI_SOURCE)
+#  define ossl_ssize_t long
+#endif
+
+#ifdef OPENSSL_SYS_MSDOS
+#  define ossl_ssize_t long
+#endif
+
+#if defined(NeXT) || defined(OPENSSL_SYS_NEWS4) || defined(OPENSSL_SYS_SUNOS)
+#  define ssize_t int
+#endif
+
+#if defined(__ultrix) && !defined(ssize_t)
+#  define ossl_ssize_t int 
+#endif
+
+#ifndef ossl_ssize_t
+#  define ossl_ssize_t ssize_t
+#endif
+
 #ifdef  __cplusplus
 }
 #endif

diff --git a/include/openssl/ec.h b/include/openssl/ec.h
index ee70781..9d01325 100644
--- a/include/openssl/ec.h
+++ b/include/openssl/ec.h

@@ -151,7 +151,24 @@
  */
 const EC_METHOD *EC_GFp_nist_method(void);
 
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/** Returns 64-bit optimized methods for nistp224
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp224_method(void);
 
+/** Returns 64-bit optimized methods for nistp256
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp256_method(void);
+
+/** Returns 64-bit optimized methods for nistp521
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp521_method(void);
+#endif
+
+#ifndef OPENSSL_NO_EC2M
 /********************************************************************/ 
 /*           EC_METHOD for curves over GF(2^m)                      */
 /********************************************************************/
@@ -161,6 +178,8 @@
  */
 const EC_METHOD *EC_GF2m_simple_method(void);
 
+#endif
+
 
 /********************************************************************/
 /*                   EC_GROUP functions                             */
@@ -282,6 +301,7 @@
  */
 int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
 
+#ifndef OPENSSL_NO_EC2M
 /** Sets the parameter of a ec over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b
  *  \param  group  EC_GROUP object
  *  \param  p      BIGNUM with the polynomial defining the underlying field
@@ -301,7 +321,7 @@
  *  \return 1 on success and 0 if an error occured
  */
 int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
-
+#endif
 /** Returns the number of bits needed to represent a field element 
  *  \param  group  EC_GROUP object
  *  \return number of bits needed to represent a field element
@@ -342,7 +362,7 @@
  *  \return newly created EC_GROUP object with the specified parameters
  */
 EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
 /** Creates a new EC_GROUP object with the specified parameters defined
  *  over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b)
  *  \param  p    BIGNUM with the polynomial defining the underlying field
@@ -352,7 +372,7 @@
  *  \return newly created EC_GROUP object with the specified parameters
  */
 EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#endif
 /** Creates a EC_GROUP object with a curve specified by a NID
  *  \param  nid  NID of the OID of the curve name
  *  \return newly created EC_GROUP object with specified curve or NULL
@@ -481,7 +501,7 @@
  */
 int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
 	const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
 /** Sets the affine coordinates of a EC_POINT over GF2m
  *  \param  group  underlying EC_GROUP object
  *  \param  p      EC_POINT object
@@ -514,7 +534,7 @@
  */
 int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
 	const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#endif
 /** Encodes a EC_POINT object to a octet string
  *  \param  group  underlying EC_GROUP object
  *  \param  p      EC_POINT object
@@ -653,9 +673,11 @@
 /* EC_GROUP_get_basis_type() returns the NID of the basis type
  * used to represent the field elements */
 int EC_GROUP_get_basis_type(const EC_GROUP *);
+#ifndef OPENSSL_NO_EC2M
 int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
 int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1, 
 	unsigned int *k2, unsigned int *k3);
+#endif
 
 #define OPENSSL_EC_NAMED_CURVE	0x001
 
@@ -689,11 +711,21 @@
 #define EC_PKEY_NO_PARAMETERS	0x001
 #define EC_PKEY_NO_PUBKEY	0x002
 
+/* some values for the flags field */
+#define EC_FLAG_NON_FIPS_ALLOW	0x1
+#define EC_FLAG_FIPS_CHECKED	0x2
+
 /** Creates a new EC_KEY object.
  *  \return EC_KEY object or NULL if an error occurred.
  */
 EC_KEY *EC_KEY_new(void);
 
+int EC_KEY_get_flags(const EC_KEY *key);
+
+void EC_KEY_set_flags(EC_KEY *key, int flags);
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags);
+
 /** Creates a new EC_KEY object using a named curve as underlying
  *  EC_GROUP object.
  *  \param  nid  NID of the named curve.
@@ -799,6 +831,15 @@
  */
 int EC_KEY_check_key(const EC_KEY *key);
 
+/** Sets a public key from affine coordindates performing
+ *  neccessary NIST PKV tests.
+ *  \param  key  the EC_KEY object
+ *  \param  x    public key x coordinate
+ *  \param  y    public key y coordinate
+ *  \return 1 on success and 0 otherwise.
+ */
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+
 
 /********************************************************************/
 /*        de- and encoding functions for SEC1 ECPrivateKey          */
@@ -926,6 +967,7 @@
 /* Error codes for the EC functions. */
 
 /* Function codes. */
+#define EC_F_BN_TO_FELEM				 224
 #define EC_F_COMPUTE_WNAF				 143
 #define EC_F_D2I_ECPARAMETERS				 144
 #define EC_F_D2I_ECPKPARAMETERS				 145
@@ -968,6 +1010,15 @@
 #define EC_F_EC_GFP_MONT_FIELD_SQR			 132
 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE		 189
 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP		 135
+#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE		 225
+#define EC_F_EC_GFP_NISTP224_POINTS_MUL			 228
+#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226
+#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE		 230
+#define EC_F_EC_GFP_NISTP256_POINTS_MUL			 231
+#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232
+#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE		 233
+#define EC_F_EC_GFP_NISTP521_POINTS_MUL			 234
+#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235
 #define EC_F_EC_GFP_NIST_FIELD_MUL			 200
 #define EC_F_EC_GFP_NIST_FIELD_SQR			 201
 #define EC_F_EC_GFP_NIST_GROUP_SET_CURVE		 202
@@ -1010,6 +1061,7 @@
 #define EC_F_EC_KEY_NEW					 182
 #define EC_F_EC_KEY_PRINT				 180
 #define EC_F_EC_KEY_PRINT_FP				 181
+#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES	 229
 #define EC_F_EC_POINTS_MAKE_AFFINE			 136
 #define EC_F_EC_POINT_ADD				 112
 #define EC_F_EC_POINT_CMP				 113
@@ -1040,6 +1092,9 @@
 #define EC_F_I2D_ECPKPARAMETERS				 191
 #define EC_F_I2D_ECPRIVATEKEY				 192
 #define EC_F_I2O_ECPUBLICKEY				 151
+#define EC_F_NISTP224_PRE_COMP_NEW			 227
+#define EC_F_NISTP256_PRE_COMP_NEW			 236
+#define EC_F_NISTP521_PRE_COMP_NEW			 237
 #define EC_F_O2I_ECPUBLICKEY				 152
 #define EC_F_OLD_EC_PRIV_DECODE				 222
 #define EC_F_PKEY_EC_CTRL				 197
@@ -1052,12 +1107,15 @@
 /* Reason codes. */
 #define EC_R_ASN1_ERROR					 115
 #define EC_R_ASN1_UNKNOWN_FIELD				 116
+#define EC_R_BIGNUM_OUT_OF_RANGE			 144
 #define EC_R_BUFFER_TOO_SMALL				 100
+#define EC_R_COORDINATES_OUT_OF_RANGE			 146
 #define EC_R_D2I_ECPKPARAMETERS_FAILURE			 117
 #define EC_R_DECODE_ERROR				 142
 #define EC_R_DISCRIMINANT_IS_ZERO			 118
 #define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE		 119
 #define EC_R_FIELD_TOO_LARGE				 143
+#define EC_R_GF2M_NOT_SUPPORTED				 147
 #define EC_R_GROUP2PKPARAMETERS_FAILURE			 120
 #define EC_R_I2D_ECPKPARAMETERS_FAILURE			 121
 #define EC_R_INCOMPATIBLE_OBJECTS			 101
@@ -1092,6 +1150,7 @@
 #define EC_R_UNKNOWN_GROUP				 129
 #define EC_R_UNKNOWN_ORDER				 114
 #define EC_R_UNSUPPORTED_FIELD				 131
+#define EC_R_WRONG_CURVE_PARAMETERS			 145
 #define EC_R_WRONG_ORDER				 130
 
 #ifdef  __cplusplus

diff --git a/include/openssl/ecdh.h b/include/openssl/ecdh.h
index b4b58ee..8887102 100644
--- a/include/openssl/ecdh.h
+++ b/include/openssl/ecdh.h

@@ -109,11 +109,13 @@
 /* Error codes for the ECDH functions. */
 
 /* Function codes. */
+#define ECDH_F_ECDH_CHECK				 102
 #define ECDH_F_ECDH_COMPUTE_KEY				 100
 #define ECDH_F_ECDH_DATA_NEW_METHOD			 101
 
 /* Reason codes. */
 #define ECDH_R_KDF_FAILED				 102
+#define ECDH_R_NON_FIPS_METHOD				 103
 #define ECDH_R_NO_PRIVATE_VALUE				 100
 #define ECDH_R_POINT_ARITHMETIC_FAILURE			 101
 

diff --git a/include/openssl/ecdsa.h b/include/openssl/ecdsa.h
index e61c539..7fb5254 100644
--- a/include/openssl/ecdsa.h
+++ b/include/openssl/ecdsa.h

@@ -238,6 +238,7 @@
 /* Error codes for the ECDSA functions. */
 
 /* Function codes. */
+#define ECDSA_F_ECDSA_CHECK				 104
 #define ECDSA_F_ECDSA_DATA_NEW_METHOD			 100
 #define ECDSA_F_ECDSA_DO_SIGN				 101
 #define ECDSA_F_ECDSA_DO_VERIFY				 102
@@ -249,6 +250,7 @@
 #define ECDSA_R_ERR_EC_LIB				 102
 #define ECDSA_R_MISSING_PARAMETERS			 103
 #define ECDSA_R_NEED_NEW_SETUP_VALUES			 106
+#define ECDSA_R_NON_FIPS_METHOD				 107
 #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED		 104
 #define ECDSA_R_SIGNATURE_MALLOC_FAILED			 105
 

diff --git a/include/openssl/engine.h b/include/openssl/engine.h
index 943aeae..f8be497 100644
--- a/include/openssl/engine.h
+++ b/include/openssl/engine.h

@@ -141,6 +141,13 @@
  * the existing ENGINE's structural reference count. */
 #define ENGINE_FLAGS_BY_ID_COPY		(int)0x0004
 
+/* This flag if for an ENGINE that does not want its methods registered as 
+ * part of ENGINE_register_all_complete() for example if the methods are
+ * not usable as default methods.
+ */
+
+#define ENGINE_FLAGS_NO_REGISTER_ALL	(int)0x0008
+
 /* ENGINEs can support their own command types, and these flags are used in
  * ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each
  * command expects. Currently only numeric and string input is supported. If a
@@ -344,6 +351,8 @@
 #endif
 #endif
 void ENGINE_load_cryptodev(void);
+void ENGINE_load_rsax(void);
+void ENGINE_load_rdrand(void);
 void ENGINE_load_builtin_engines(void);
 
 /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation

diff --git a/include/openssl/err.h b/include/openssl/err.h
index b9f8c16..974cc9c 100644
--- a/include/openssl/err.h
+++ b/include/openssl/err.h

@@ -344,8 +344,9 @@
 #endif
 #ifndef OPENSSL_NO_BIO
 void ERR_print_errors(BIO *bp);
-void ERR_add_error_data(int num, ...);
 #endif
+void ERR_add_error_data(int num, ...);
+void ERR_add_error_vdata(int num, va_list args);
 void ERR_load_strings(int lib,ERR_STRING_DATA str[]);
 void ERR_unload_strings(int lib,ERR_STRING_DATA str[]);
 void ERR_load_ERR_strings(void);

diff --git a/include/openssl/evp.h b/include/openssl/evp.h
index 9f9795e..0d1b20a 100644
--- a/include/openssl/evp.h
+++ b/include/openssl/evp.h

@@ -83,7 +83,7 @@
 #define EVP_RC5_32_12_16_KEY_SIZE	16
 */
 #define EVP_MAX_MD_SIZE			64	/* longest known is SHA512 */
-#define EVP_MAX_KEY_LENGTH		32
+#define EVP_MAX_KEY_LENGTH		64
 #define EVP_MAX_IV_LENGTH		16
 #define EVP_MAX_BLOCK_LENGTH		32
 
@@ -116,6 +116,7 @@
 #define EVP_PKEY_DH	NID_dhKeyAgreement
 #define EVP_PKEY_EC	NID_X9_62_id_ecPublicKey
 #define EVP_PKEY_HMAC	NID_hmac
+#define EVP_PKEY_CMAC	NID_cmac
 
 #ifdef	__cplusplus
 extern "C" {
@@ -216,6 +217,8 @@
 
 #define EVP_MD_FLAG_DIGALGID_CUSTOM		0x0018
 
+#define EVP_MD_FLAG_FIPS	0x0400 /* Note if suitable for use in FIPS mode */
+
 /* Digest ctrls */
 
 #define	EVP_MD_CTRL_DIGALGID			0x1
@@ -325,6 +328,10 @@
 #define		EVP_CIPH_CBC_MODE		0x2
 #define		EVP_CIPH_CFB_MODE		0x3
 #define		EVP_CIPH_OFB_MODE		0x4
+#define		EVP_CIPH_CTR_MODE		0x5
+#define		EVP_CIPH_GCM_MODE		0x6
+#define		EVP_CIPH_CCM_MODE		0x7
+#define		EVP_CIPH_XTS_MODE		0x10001
 #define 	EVP_CIPH_MODE			0xF0007
 /* Set if variable length cipher */
 #define 	EVP_CIPH_VARIABLE_LENGTH	0x8
@@ -346,6 +353,15 @@
 #define		EVP_CIPH_FLAG_DEFAULT_ASN1	0x1000
 /* Buffer length in bits not bytes: CFB1 mode only */
 #define		EVP_CIPH_FLAG_LENGTH_BITS	0x2000
+/* Note if suitable for use in FIPS mode */
+#define		EVP_CIPH_FLAG_FIPS		0x4000
+/* Allow non FIPS cipher in FIPS mode */
+#define		EVP_CIPH_FLAG_NON_FIPS_ALLOW	0x8000
+/* Cipher handles any and all padding logic as well
+ * as finalisation.
+ */
+#define 	EVP_CIPH_FLAG_CUSTOM_CIPHER	0x100000
+#define		EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
 
 /* ctrl() values */
 
@@ -358,6 +374,34 @@
 #define 	EVP_CTRL_RAND_KEY		0x6
 #define 	EVP_CTRL_PBE_PRF_NID		0x7
 #define 	EVP_CTRL_COPY			0x8
+#define 	EVP_CTRL_GCM_SET_IVLEN		0x9
+#define 	EVP_CTRL_GCM_GET_TAG		0x10
+#define 	EVP_CTRL_GCM_SET_TAG		0x11
+#define		EVP_CTRL_GCM_SET_IV_FIXED	0x12
+#define		EVP_CTRL_GCM_IV_GEN		0x13
+#define		EVP_CTRL_CCM_SET_IVLEN		EVP_CTRL_GCM_SET_IVLEN
+#define		EVP_CTRL_CCM_GET_TAG		EVP_CTRL_GCM_GET_TAG
+#define		EVP_CTRL_CCM_SET_TAG		EVP_CTRL_GCM_SET_TAG
+#define		EVP_CTRL_CCM_SET_L		0x14
+#define		EVP_CTRL_CCM_SET_MSGLEN		0x15
+/* AEAD cipher deduces payload length and returns number of bytes
+ * required to store MAC and eventual padding. Subsequent call to
+ * EVP_Cipher even appends/verifies MAC.
+ */
+#define		EVP_CTRL_AEAD_TLS1_AAD		0x16
+/* Used by composite AEAD ciphers, no-op in GCM, CCM... */
+#define		EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+/* Set the GCM invocation field, decrypt only */
+#define		EVP_CTRL_GCM_SET_IV_INV		0x18
+
+/* GCM TLS constants */
+/* Length of fixed part of IV derived from PRF */
+#define EVP_GCM_TLS_FIXED_IV_LEN			4
+/* Length of explicit part of IV part of TLS records */
+#define EVP_GCM_TLS_EXPLICIT_IV_LEN			8
+/* Length of tag for TLS */
+#define EVP_GCM_TLS_TAG_LEN				16
+
 
 typedef struct evp_cipher_info_st
 	{
@@ -375,7 +419,7 @@
 	unsigned char  oiv[EVP_MAX_IV_LENGTH];	/* original iv */
 	unsigned char  iv[EVP_MAX_IV_LENGTH];	/* working iv */
 	unsigned char buf[EVP_MAX_BLOCK_LENGTH];/* saved partial block */
-	int num;				/* used by cfb/ofb mode */
+	int num;				/* used by cfb/ofb/ctr mode */
 
 	void *app_data;		/* application stuff */
 	int key_len;		/* May change for variable length cipher */
@@ -695,6 +739,9 @@
 #ifndef OPENSSL_NO_RC4
 const EVP_CIPHER *EVP_rc4(void);
 const EVP_CIPHER *EVP_rc4_40(void);
+#ifndef OPENSSL_NO_MD5
+const EVP_CIPHER *EVP_rc4_hmac_md5(void);
+#endif
 #endif
 #ifndef OPENSSL_NO_IDEA
 const EVP_CIPHER *EVP_idea_ecb(void);
@@ -741,9 +788,10 @@
 const EVP_CIPHER *EVP_aes_128_cfb128(void);
 # define EVP_aes_128_cfb EVP_aes_128_cfb128
 const EVP_CIPHER *EVP_aes_128_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_128_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_128_gcm(void);
+const EVP_CIPHER *EVP_aes_128_ccm(void);
+const EVP_CIPHER *EVP_aes_128_xts(void);
 const EVP_CIPHER *EVP_aes_192_ecb(void);
 const EVP_CIPHER *EVP_aes_192_cbc(void);
 const EVP_CIPHER *EVP_aes_192_cfb1(void);
@@ -751,9 +799,9 @@
 const EVP_CIPHER *EVP_aes_192_cfb128(void);
 # define EVP_aes_192_cfb EVP_aes_192_cfb128
 const EVP_CIPHER *EVP_aes_192_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_192_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_192_gcm(void);
+const EVP_CIPHER *EVP_aes_192_ccm(void);
 const EVP_CIPHER *EVP_aes_256_ecb(void);
 const EVP_CIPHER *EVP_aes_256_cbc(void);
 const EVP_CIPHER *EVP_aes_256_cfb1(void);
@@ -761,8 +809,13 @@
 const EVP_CIPHER *EVP_aes_256_cfb128(void);
 # define EVP_aes_256_cfb EVP_aes_256_cfb128
 const EVP_CIPHER *EVP_aes_256_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_256_ctr(void);
+const EVP_CIPHER *EVP_aes_256_gcm(void);
+const EVP_CIPHER *EVP_aes_256_ccm(void);
+const EVP_CIPHER *EVP_aes_256_xts(void);
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
 #endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
@@ -1047,13 +1100,22 @@
 #define EVP_PKEY_CTRL_CMS_DECRYPT	10
 #define EVP_PKEY_CTRL_CMS_SIGN		11
 
+#define EVP_PKEY_CTRL_CIPHER		12
+
 #define EVP_PKEY_ALG_CTRL		0x1000
 
 
 #define EVP_PKEY_FLAG_AUTOARGLEN	2
+/* Method handles all operations: don't assume any digest related
+ * defaults.
+ */
+#define EVP_PKEY_FLAG_SIGCTX_CUSTOM	4
 
 const EVP_PKEY_METHOD *EVP_PKEY_meth_find(int type);
 EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags);
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+				const EVP_PKEY_METHOD *meth);
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src);
 void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth);
 int EVP_PKEY_meth_add0(const EVP_PKEY_METHOD *pmeth);
 
@@ -1071,7 +1133,7 @@
 void EVP_PKEY_CTX_set0_keygen_info(EVP_PKEY_CTX *ctx, int *dat, int datlen);
 
 EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
-				unsigned char *key, int keylen);
+				const unsigned char *key, int keylen);
 
 void EVP_PKEY_CTX_set_data(EVP_PKEY_CTX *ctx, void *data);
 void *EVP_PKEY_CTX_get_data(EVP_PKEY_CTX *ctx);
@@ -1190,8 +1252,13 @@
 /* Error codes for the EVP functions. */
 
 /* Function codes. */
+#define EVP_F_AESNI_INIT_KEY				 165
+#define EVP_F_AESNI_XTS_CIPHER				 176
 #define EVP_F_AES_INIT_KEY				 133
+#define EVP_F_AES_XTS					 172
+#define EVP_F_AES_XTS_CIPHER				 175
 #define EVP_F_CAMELLIA_INIT_KEY				 159
+#define EVP_F_CMAC_INIT					 173
 #define EVP_F_D2I_PKEY					 100
 #define EVP_F_DO_SIGVER_INIT				 161
 #define EVP_F_DSAPKEY2PKCS8				 134
@@ -1246,15 +1313,24 @@
 #define EVP_F_EVP_RIJNDAEL				 126
 #define EVP_F_EVP_SIGNFINAL				 107
 #define EVP_F_EVP_VERIFYFINAL				 108
+#define EVP_F_FIPS_CIPHERINIT				 166
+#define EVP_F_FIPS_CIPHER_CTX_COPY			 170
+#define EVP_F_FIPS_CIPHER_CTX_CTRL			 167
+#define EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH		 171
+#define EVP_F_FIPS_DIGESTINIT				 168
+#define EVP_F_FIPS_MD_CTX_COPY				 169
+#define EVP_F_HMAC_INIT_EX				 174
 #define EVP_F_INT_CTX_NEW				 157
 #define EVP_F_PKCS5_PBE_KEYIVGEN			 117
 #define EVP_F_PKCS5_V2_PBE_KEYIVGEN			 118
+#define EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN			 164
 #define EVP_F_PKCS8_SET_BROKEN				 112
 #define EVP_F_PKEY_SET_TYPE				 158
 #define EVP_F_RC2_MAGIC_TO_METH				 109
 #define EVP_F_RC5_CTRL					 125
 
 /* Reason codes. */
+#define EVP_R_AES_IV_SETUP_FAILED			 162
 #define EVP_R_AES_KEY_SETUP_FAILED			 143
 #define EVP_R_ASN1_LIB					 140
 #define EVP_R_BAD_BLOCK_LENGTH				 136
@@ -1272,6 +1348,7 @@
 #define EVP_R_DECODE_ERROR				 114
 #define EVP_R_DIFFERENT_KEY_TYPES			 101
 #define EVP_R_DIFFERENT_PARAMETERS			 153
+#define EVP_R_DISABLED_FOR_FIPS				 163
 #define EVP_R_ENCODE_ERROR				 115
 #define EVP_R_EVP_PBE_CIPHERINIT_ERROR			 119
 #define EVP_R_EXPECTING_AN_RSA_KEY			 127
@@ -1303,6 +1380,7 @@
 #define EVP_R_PRIVATE_KEY_DECODE_ERROR			 145
 #define EVP_R_PRIVATE_KEY_ENCODE_ERROR			 146
 #define EVP_R_PUBLIC_KEY_NOT_RSA			 106
+#define EVP_R_TOO_LARGE					 164
 #define EVP_R_UNKNOWN_CIPHER				 160
 #define EVP_R_UNKNOWN_DIGEST				 161
 #define EVP_R_UNKNOWN_PBE_ALGORITHM			 121

diff --git a/include/openssl/kssl.h b/include/openssl/kssl.h
index a3d20e1..8242fd5 100644
--- a/include/openssl/kssl.h
+++ b/include/openssl/kssl.h

@@ -172,6 +172,10 @@
 			            krb5_timestamp *atimep, KSSL_ERR *kssl_err);
 unsigned char	*kssl_skip_confound(krb5_enctype enctype, unsigned char *authn);
 
+void SSL_set0_kssl_ctx(SSL *s, KSSL_CTX *kctx);
+KSSL_CTX * SSL_get0_kssl_ctx(SSL *s);
+char *kssl_ctx_get0_client_princ(KSSL_CTX *kctx);
+
 #ifdef  __cplusplus
 }
 #endif

diff --git a/include/openssl/md4.h b/include/openssl/md4.h
index c3ed9b3..a55368a 100644
--- a/include/openssl/md4.h
+++ b/include/openssl/md4.h

@@ -105,6 +105,9 @@
 	unsigned int num;
 	} MD4_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_MD4_Init(MD4_CTX *c);
+#endif
 int MD4_Init(MD4_CTX *c);
 int MD4_Update(MD4_CTX *c, const void *data, size_t len);
 int MD4_Final(unsigned char *md, MD4_CTX *c);

diff --git a/include/openssl/md5.h b/include/openssl/md5.h
index 4cbf843..541cc92 100644
--- a/include/openssl/md5.h
+++ b/include/openssl/md5.h

@@ -105,6 +105,9 @@
 	unsigned int num;
 	} MD5_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_MD5_Init(MD5_CTX *c);
+#endif
 int MD5_Init(MD5_CTX *c);
 int MD5_Update(MD5_CTX *c, const void *data, size_t len);
 int MD5_Final(unsigned char *md, MD5_CTX *c);

diff --git a/include/openssl/modes.h b/include/openssl/modes.h
index af8d97d..f18215b 100644
--- a/include/openssl/modes.h
+++ b/include/openssl/modes.h

@@ -15,6 +15,14 @@
 			size_t len, const void *key,
 			unsigned char ivec[16], int enc);
 
+typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
+			size_t blocks, const void *key,
+			const unsigned char ivec[16]);
+
+typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
+			size_t blocks, const void *key,
+			const unsigned char ivec[16],unsigned char cmac[16]);
+
 void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], block128_f block);
@@ -27,6 +35,11 @@
 			unsigned char ivec[16], unsigned char ecount_buf[16],
 			unsigned int *num, block128_f block);
 
+void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], unsigned char ecount_buf[16],
+			unsigned int *num, ctr128_f ctr);
+
 void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], int *num,
@@ -57,3 +70,66 @@
 size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], cbc128_f cbc);
+
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], block128_f block);
+size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], cbc128_f cbc);
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], block128_f block);
+size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], cbc128_f cbc);
+
+typedef struct gcm128_context GCM128_CONTEXT;
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
+			size_t len);
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
+			size_t len);
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len);
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len);
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len, ctr128_f stream);
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len, ctr128_f stream);
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
+			size_t len);
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
+
+typedef struct ccm128_context CCM128_CONTEXT;
+
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+	unsigned int M, unsigned int L, void *key,block128_f block);
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+	const unsigned char *nonce, size_t nlen, size_t mlen);
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+	const unsigned char *aad, size_t alen);
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len);
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len);
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len,
+	ccm128_f stream);
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len,
+	ccm128_f stream);
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
+
+typedef struct xts128_context XTS128_CONTEXT;
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+	const unsigned char *inp, unsigned char *out, size_t len, int enc);

diff --git a/include/openssl/obj_mac.h b/include/openssl/obj_mac.h
index 282f11a..b5ea7cd 100644
--- a/include/openssl/obj_mac.h
+++ b/include/openssl/obj_mac.h

@@ -580,6 +580,21 @@
 #define NID_sha1WithRSAEncryption		65
 #define OBJ_sha1WithRSAEncryption		OBJ_pkcs1,5L
 
+#define SN_rsaesOaep		"RSAES-OAEP"
+#define LN_rsaesOaep		"rsaesOaep"
+#define NID_rsaesOaep		919
+#define OBJ_rsaesOaep		OBJ_pkcs1,7L
+
+#define SN_mgf1		"MGF1"
+#define LN_mgf1		"mgf1"
+#define NID_mgf1		911
+#define OBJ_mgf1		OBJ_pkcs1,8L
+
+#define SN_rsassaPss		"RSASSA-PSS"
+#define LN_rsassaPss		"rsassaPss"
+#define NID_rsassaPss		912
+#define OBJ_rsassaPss		OBJ_pkcs1,10L
+
 #define SN_sha256WithRSAEncryption		"RSA-SHA256"
 #define LN_sha256WithRSAEncryption		"sha256WithRSAEncryption"
 #define NID_sha256WithRSAEncryption		668
@@ -981,6 +996,10 @@
 #define NID_id_smime_alg_CMSRC2wrap		247
 #define OBJ_id_smime_alg_CMSRC2wrap		OBJ_id_smime_alg,7L
 
+#define SN_id_alg_PWRI_KEK		"id-alg-PWRI-KEK"
+#define NID_id_alg_PWRI_KEK		893
+#define OBJ_id_alg_PWRI_KEK		OBJ_id_smime_alg,9L
+
 #define SN_id_smime_cd_ldap		"id-smime-cd-ldap"
 #define NID_id_smime_cd_ldap		248
 #define OBJ_id_smime_cd_ldap		OBJ_id_smime_cd,1L
@@ -2399,6 +2418,11 @@
 #define NID_no_rev_avail		403
 #define OBJ_no_rev_avail		OBJ_id_ce,56L
 
+#define SN_anyExtendedKeyUsage		"anyExtendedKeyUsage"
+#define LN_anyExtendedKeyUsage		"Any Extended Key Usage"
+#define NID_anyExtendedKeyUsage		910
+#define OBJ_anyExtendedKeyUsage		OBJ_ext_key_usage,0L
+
 #define SN_netscape		"Netscape"
 #define LN_netscape		"Netscape Communications Corp."
 #define NID_netscape		57
@@ -2586,6 +2610,24 @@
 #define NID_aes_128_cfb128		421
 #define OBJ_aes_128_cfb128		OBJ_aes,4L
 
+#define SN_id_aes128_wrap		"id-aes128-wrap"
+#define NID_id_aes128_wrap		788
+#define OBJ_id_aes128_wrap		OBJ_aes,5L
+
+#define SN_aes_128_gcm		"id-aes128-GCM"
+#define LN_aes_128_gcm		"aes-128-gcm"
+#define NID_aes_128_gcm		895
+#define OBJ_aes_128_gcm		OBJ_aes,6L
+
+#define SN_aes_128_ccm		"id-aes128-CCM"
+#define LN_aes_128_ccm		"aes-128-ccm"
+#define NID_aes_128_ccm		896
+#define OBJ_aes_128_ccm		OBJ_aes,7L
+
+#define SN_id_aes128_wrap_pad		"id-aes128-wrap-pad"
+#define NID_id_aes128_wrap_pad		897
+#define OBJ_id_aes128_wrap_pad		OBJ_aes,8L
+
 #define SN_aes_192_ecb		"AES-192-ECB"
 #define LN_aes_192_ecb		"aes-192-ecb"
 #define NID_aes_192_ecb		422
@@ -2606,6 +2648,24 @@
 #define NID_aes_192_cfb128		425
 #define OBJ_aes_192_cfb128		OBJ_aes,24L
 
+#define SN_id_aes192_wrap		"id-aes192-wrap"
+#define NID_id_aes192_wrap		789
+#define OBJ_id_aes192_wrap		OBJ_aes,25L
+
+#define SN_aes_192_gcm		"id-aes192-GCM"
+#define LN_aes_192_gcm		"aes-192-gcm"
+#define NID_aes_192_gcm		898
+#define OBJ_aes_192_gcm		OBJ_aes,26L
+
+#define SN_aes_192_ccm		"id-aes192-CCM"
+#define LN_aes_192_ccm		"aes-192-ccm"
+#define NID_aes_192_ccm		899
+#define OBJ_aes_192_ccm		OBJ_aes,27L
+
+#define SN_id_aes192_wrap_pad		"id-aes192-wrap-pad"
+#define NID_id_aes192_wrap_pad		900
+#define OBJ_id_aes192_wrap_pad		OBJ_aes,28L
+
 #define SN_aes_256_ecb		"AES-256-ECB"
 #define LN_aes_256_ecb		"aes-256-ecb"
 #define NID_aes_256_ecb		426
@@ -2626,6 +2686,24 @@
 #define NID_aes_256_cfb128		429
 #define OBJ_aes_256_cfb128		OBJ_aes,44L
 
+#define SN_id_aes256_wrap		"id-aes256-wrap"
+#define NID_id_aes256_wrap		790
+#define OBJ_id_aes256_wrap		OBJ_aes,45L
+
+#define SN_aes_256_gcm		"id-aes256-GCM"
+#define LN_aes_256_gcm		"aes-256-gcm"
+#define NID_aes_256_gcm		901
+#define OBJ_aes_256_gcm		OBJ_aes,46L
+
+#define SN_aes_256_ccm		"id-aes256-CCM"
+#define LN_aes_256_ccm		"aes-256-ccm"
+#define NID_aes_256_ccm		902
+#define OBJ_aes_256_ccm		OBJ_aes,47L
+
+#define SN_id_aes256_wrap_pad		"id-aes256-wrap-pad"
+#define NID_id_aes256_wrap_pad		903
+#define OBJ_id_aes256_wrap_pad		OBJ_aes,48L
+
 #define SN_aes_128_cfb1		"AES-128-CFB1"
 #define LN_aes_128_cfb1		"aes-128-cfb1"
 #define NID_aes_128_cfb1		650
@@ -2650,6 +2728,26 @@
 #define LN_aes_256_cfb8		"aes-256-cfb8"
 #define NID_aes_256_cfb8		655
 
+#define SN_aes_128_ctr		"AES-128-CTR"
+#define LN_aes_128_ctr		"aes-128-ctr"
+#define NID_aes_128_ctr		904
+
+#define SN_aes_192_ctr		"AES-192-CTR"
+#define LN_aes_192_ctr		"aes-192-ctr"
+#define NID_aes_192_ctr		905
+
+#define SN_aes_256_ctr		"AES-256-CTR"
+#define LN_aes_256_ctr		"aes-256-ctr"
+#define NID_aes_256_ctr		906
+
+#define SN_aes_128_xts		"AES-128-XTS"
+#define LN_aes_128_xts		"aes-128-xts"
+#define NID_aes_128_xts		913
+
+#define SN_aes_256_xts		"AES-256-XTS"
+#define LN_aes_256_xts		"aes-256-xts"
+#define NID_aes_256_xts		914
+
 #define SN_des_cfb1		"DES-CFB1"
 #define LN_des_cfb1		"des-cfb1"
 #define NID_des_cfb1		656
@@ -2666,18 +2764,6 @@
 #define LN_des_ede3_cfb8		"des-ede3-cfb8"
 #define NID_des_ede3_cfb8		659
 
-#define SN_id_aes128_wrap		"id-aes128-wrap"
-#define NID_id_aes128_wrap		788
-#define OBJ_id_aes128_wrap		OBJ_aes,5L
-
-#define SN_id_aes192_wrap		"id-aes192-wrap"
-#define NID_id_aes192_wrap		789
-#define OBJ_id_aes192_wrap		OBJ_aes,25L
-
-#define SN_id_aes256_wrap		"id-aes256-wrap"
-#define NID_id_aes256_wrap		790
-#define OBJ_id_aes256_wrap		OBJ_aes,45L
-
 #define OBJ_nist_hashalgs		OBJ_nistAlgorithms,2L
 
 #define SN_sha256		"SHA256"
@@ -3810,6 +3896,18 @@
 #define NID_camellia_256_cbc		753
 #define OBJ_camellia_256_cbc		1L,2L,392L,200011L,61L,1L,1L,1L,4L
 
+#define SN_id_camellia128_wrap		"id-camellia128-wrap"
+#define NID_id_camellia128_wrap		907
+#define OBJ_id_camellia128_wrap		1L,2L,392L,200011L,61L,1L,1L,3L,2L
+
+#define SN_id_camellia192_wrap		"id-camellia192-wrap"
+#define NID_id_camellia192_wrap		908
+#define OBJ_id_camellia192_wrap		1L,2L,392L,200011L,61L,1L,1L,3L,3L
+
+#define SN_id_camellia256_wrap		"id-camellia256-wrap"
+#define NID_id_camellia256_wrap		909
+#define OBJ_id_camellia256_wrap		1L,2L,392L,200011L,61L,1L,1L,3L,4L
+
 #define OBJ_ntt_ds		0L,3L,4401L,5L
 
 #define OBJ_camellia		OBJ_ntt_ds,3L,1L,9L
@@ -3912,3 +4010,23 @@
 #define LN_hmac		"hmac"
 #define NID_hmac		855
 
+#define SN_cmac		"CMAC"
+#define LN_cmac		"cmac"
+#define NID_cmac		894
+
+#define SN_rc4_hmac_md5		"RC4-HMAC-MD5"
+#define LN_rc4_hmac_md5		"rc4-hmac-md5"
+#define NID_rc4_hmac_md5		915
+
+#define SN_aes_128_cbc_hmac_sha1		"AES-128-CBC-HMAC-SHA1"
+#define LN_aes_128_cbc_hmac_sha1		"aes-128-cbc-hmac-sha1"
+#define NID_aes_128_cbc_hmac_sha1		916
+
+#define SN_aes_192_cbc_hmac_sha1		"AES-192-CBC-HMAC-SHA1"
+#define LN_aes_192_cbc_hmac_sha1		"aes-192-cbc-hmac-sha1"
+#define NID_aes_192_cbc_hmac_sha1		917
+
+#define SN_aes_256_cbc_hmac_sha1		"AES-256-CBC-HMAC-SHA1"
+#define LN_aes_256_cbc_hmac_sha1		"aes-256-cbc-hmac-sha1"
+#define NID_aes_256_cbc_hmac_sha1		918
+

diff --git a/include/openssl/opensslconf.h b/include/openssl/opensslconf.h
index 26ac6ba..f17eaa9 100644
--- a/include/openssl/opensslconf.h
+++ b/include/openssl/opensslconf.h

@@ -8,6 +8,9 @@
 #ifndef OPENSSL_NO_CAST
 # define OPENSSL_NO_CAST
 #endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
 #ifndef OPENSSL_NO_GMP
 # define OPENSSL_NO_GMP
 #endif
@@ -29,6 +32,9 @@
 #ifndef OPENSSL_NO_RFC3779
 # define OPENSSL_NO_RFC3779
 #endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
 #ifndef OPENSSL_NO_SEED
 # define OPENSSL_NO_SEED
 #endif
@@ -59,6 +65,9 @@
 # if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
 #  define NO_CAST
 # endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+#  define NO_EC_NISTP_64_GCC_128
+# endif
 # if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
 #  define NO_GMP
 # endif
@@ -80,6 +89,9 @@
 # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
 #  define NO_RFC3779
 # endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+#  define NO_SCTP
+# endif
 # if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
 #  define NO_SEED
 # endif

diff --git a/include/openssl/opensslv.h b/include/openssl/opensslv.h
index 66a6d0d..bf42556 100644
--- a/include/openssl/opensslv.h
+++ b/include/openssl/opensslv.h

@@ -25,11 +25,11 @@
  * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
  *  major minor fix final patch/beta)
  */
-#define OPENSSL_VERSION_NUMBER	0x1000008fL
+#define OPENSSL_VERSION_NUMBER	0x1000100fL
 #ifdef OPENSSL_FIPS
-#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.0h-fips 12 Mar 2012"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.1-fips 14 Mar 2012"
 #else
-#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.0h 12 Mar 2012"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.1 14 Mar 2012"
 #endif
 #define OPENSSL_VERSION_PTEXT	" part of " OPENSSL_VERSION_TEXT
 

diff --git a/include/openssl/ossl_typ.h b/include/openssl/ossl_typ.h
index 12bd701..ea9227f 100644
--- a/include/openssl/ossl_typ.h
+++ b/include/openssl/ossl_typ.h

@@ -91,10 +91,12 @@
 typedef struct asn1_string_st ASN1_GENERALIZEDTIME;
 typedef struct asn1_string_st ASN1_VISIBLESTRING;
 typedef struct asn1_string_st ASN1_UTF8STRING;
+typedef struct asn1_string_st ASN1_STRING;
 typedef int ASN1_BOOLEAN;
 typedef int ASN1_NULL;
 #endif
 
+typedef struct ASN1_ITEM_st ASN1_ITEM;
 typedef struct asn1_pctx_st ASN1_PCTX;
 
 #ifdef OPENSSL_SYS_WIN32

diff --git a/include/openssl/rand.h b/include/openssl/rand.h
index ac6c021..dc8fcf9 100644
--- a/include/openssl/rand.h
+++ b/include/openssl/rand.h

@@ -119,6 +119,11 @@
 
 #endif
 
+#ifdef OPENSSL_FIPS
+void RAND_set_fips_drbg_type(int type, int flags);
+int RAND_init_fips(void);
+#endif
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -129,9 +134,13 @@
 
 /* Function codes. */
 #define RAND_F_RAND_GET_RAND_METHOD			 101
+#define RAND_F_RAND_INIT_FIPS				 102
 #define RAND_F_SSLEAY_RAND_BYTES			 100
 
 /* Reason codes. */
+#define RAND_R_ERROR_INITIALISING_DRBG			 102
+#define RAND_R_ERROR_INSTANTIATING_DRBG			 103
+#define RAND_R_NO_FIPS_RANDOM_METHOD_SET		 101
 #define RAND_R_PRNG_NOT_SEEDED				 100
 
 #ifdef  __cplusplus

diff --git a/include/openssl/rc2.h b/include/openssl/rc2.h
index 34c8362..e542ec9 100644
--- a/include/openssl/rc2.h
+++ b/include/openssl/rc2.h

@@ -79,7 +79,9 @@
 	RC2_INT data[64];
 	} RC2_KEY;
 
- 
+#ifdef OPENSSL_FIPS 
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
+#endif
 void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
 void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key,
 		     int enc);

diff --git a/include/openssl/rc4.h b/include/openssl/rc4.h
index 29d1acc..88ceb46 100644
--- a/include/openssl/rc4.h
+++ b/include/openssl/rc4.h

@@ -79,6 +79,7 @@
  
 const char *RC4_options(void);
 void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
 void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
 		unsigned char *outdata);
 

diff --git a/include/openssl/ripemd.h b/include/openssl/ripemd.h
index 5942eb6..189bd8c 100644
--- a/include/openssl/ripemd.h
+++ b/include/openssl/ripemd.h

@@ -91,6 +91,9 @@
 	unsigned int   num;
 	} RIPEMD160_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_RIPEMD160_Init(RIPEMD160_CTX *c);
+#endif
 int RIPEMD160_Init(RIPEMD160_CTX *c);
 int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len);
 int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c);

diff --git a/include/openssl/rsa.h b/include/openssl/rsa.h
index cf74343..4814a2f 100644
--- a/include/openssl/rsa.h
+++ b/include/openssl/rsa.h

@@ -222,12 +222,22 @@
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_RSA_PADDING, \
 				pad, NULL)
 
+#define EVP_PKEY_CTX_get_rsa_padding(ctx, ppad) \
+	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, \
+				EVP_PKEY_CTRL_GET_RSA_PADDING, 0, ppad)
+
 #define EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, len) \
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
 				(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
 				EVP_PKEY_CTRL_RSA_PSS_SALTLEN, \
 				len, NULL)
 
+#define EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx, plen) \
+	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
+				(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
+				EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, \
+				0, plen)
+
 #define EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, bits) \
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
 				EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, NULL)
@@ -236,11 +246,24 @@
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
 				EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, pubexp)
 
+#define	 EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md)	\
+		EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG,  \
+				EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md)
+
+#define	 EVP_PKEY_CTX_get_rsa_mgf1_md(ctx, pmd)	\
+		EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG,  \
+				EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)pmd)
+
 #define EVP_PKEY_CTRL_RSA_PADDING	(EVP_PKEY_ALG_CTRL + 1)
 #define EVP_PKEY_CTRL_RSA_PSS_SALTLEN	(EVP_PKEY_ALG_CTRL + 2)
 
 #define EVP_PKEY_CTRL_RSA_KEYGEN_BITS	(EVP_PKEY_ALG_CTRL + 3)
 #define EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP	(EVP_PKEY_ALG_CTRL + 4)
+#define EVP_PKEY_CTRL_RSA_MGF1_MD	(EVP_PKEY_ALG_CTRL + 5)
+
+#define EVP_PKEY_CTRL_GET_RSA_PADDING		(EVP_PKEY_ALG_CTRL + 6)
+#define EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN	(EVP_PKEY_ALG_CTRL + 7)
+#define EVP_PKEY_CTRL_GET_RSA_MGF1_MD		(EVP_PKEY_ALG_CTRL + 8)
 
 #define RSA_PKCS1_PADDING	1
 #define RSA_SSLV23_PADDING	2
@@ -300,6 +323,16 @@
 DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPublicKey)
 DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPrivateKey)
 
+typedef struct rsa_pss_params_st
+	{
+	X509_ALGOR *hashAlgorithm;
+	X509_ALGOR *maskGenAlgorithm;
+	ASN1_INTEGER *saltLength;
+	ASN1_INTEGER *trailerField;
+	} RSA_PSS_PARAMS;
+
+DECLARE_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
 #ifndef OPENSSL_NO_FP_API
 int	RSA_print_fp(FILE *fp, const RSA *r,int offset);
 #endif
@@ -380,6 +413,14 @@
 			const unsigned char *mHash,
 			const EVP_MD *Hash, int sLen);
 
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, 
+			const unsigned char *EM, int sLen);
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+			const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen);
+
 int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new *new_func,
 	CRYPTO_EX_dup *dup_func, CRYPTO_EX_free *free_func);
 int RSA_set_ex_data(RSA *r,int idx,void *arg);
@@ -388,6 +429,25 @@
 RSA *RSAPublicKey_dup(RSA *rsa);
 RSA *RSAPrivateKey_dup(RSA *rsa);
 
+/* If this flag is set the RSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define RSA_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define RSA_FLAG_NON_FIPS_ALLOW			0x0400
+/* Application has decided PRNG is good enough to generate a key: don't
+ * check.
+ */
+#define RSA_FLAG_CHECKED			0x0800
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -405,6 +465,7 @@
 #define RSA_F_PKEY_RSA_CTRL				 143
 #define RSA_F_PKEY_RSA_CTRL_STR				 144
 #define RSA_F_PKEY_RSA_SIGN				 142
+#define RSA_F_PKEY_RSA_VERIFY				 154
 #define RSA_F_PKEY_RSA_VERIFYRECOVER			 141
 #define RSA_F_RSA_BUILTIN_KEYGEN			 129
 #define RSA_F_RSA_CHECK_KEY				 123
@@ -413,6 +474,8 @@
 #define RSA_F_RSA_EAY_PUBLIC_DECRYPT			 103
 #define RSA_F_RSA_EAY_PUBLIC_ENCRYPT			 104
 #define RSA_F_RSA_GENERATE_KEY				 105
+#define RSA_F_RSA_GENERATE_KEY_EX			 155
+#define RSA_F_RSA_ITEM_VERIFY				 156
 #define RSA_F_RSA_MEMORY_LOCK				 130
 #define RSA_F_RSA_NEW_METHOD				 106
 #define RSA_F_RSA_NULL					 124
@@ -424,6 +487,7 @@
 #define RSA_F_RSA_PADDING_ADD_NONE			 107
 #define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP		 121
 #define RSA_F_RSA_PADDING_ADD_PKCS1_PSS			 125
+#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1		 148
 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1		 108
 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2		 109
 #define RSA_F_RSA_PADDING_ADD_SSLV23			 110
@@ -436,8 +500,12 @@
 #define RSA_F_RSA_PADDING_CHECK_X931			 128
 #define RSA_F_RSA_PRINT					 115
 #define RSA_F_RSA_PRINT_FP				 116
+#define RSA_F_RSA_PRIVATE_DECRYPT			 150
+#define RSA_F_RSA_PRIVATE_ENCRYPT			 151
 #define RSA_F_RSA_PRIV_DECODE				 137
 #define RSA_F_RSA_PRIV_ENCODE				 138
+#define RSA_F_RSA_PUBLIC_DECRYPT			 152
+#define RSA_F_RSA_PUBLIC_ENCRYPT			 153
 #define RSA_F_RSA_PUB_DECODE				 139
 #define RSA_F_RSA_SETUP_BLINDING			 136
 #define RSA_F_RSA_SIGN					 117
@@ -445,6 +513,7 @@
 #define RSA_F_RSA_VERIFY				 119
 #define RSA_F_RSA_VERIFY_ASN1_OCTET_STRING		 120
 #define RSA_F_RSA_VERIFY_PKCS1_PSS			 126
+#define RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1			 149
 
 /* Reason codes. */
 #define RSA_R_ALGORITHM_MISMATCH			 100
@@ -470,19 +539,24 @@
 #define RSA_R_INVALID_HEADER				 137
 #define RSA_R_INVALID_KEYBITS				 145
 #define RSA_R_INVALID_MESSAGE_LENGTH			 131
+#define RSA_R_INVALID_MGF1_MD				 156
 #define RSA_R_INVALID_PADDING				 138
 #define RSA_R_INVALID_PADDING_MODE			 141
+#define RSA_R_INVALID_PSS_PARAMETERS			 149
 #define RSA_R_INVALID_PSS_SALTLEN			 146
+#define RSA_R_INVALID_SALT_LENGTH			 150
 #define RSA_R_INVALID_TRAILER				 139
 #define RSA_R_INVALID_X931_DIGEST			 142
 #define RSA_R_IQMP_NOT_INVERSE_OF_Q			 126
 #define RSA_R_KEY_SIZE_TOO_SMALL			 120
 #define RSA_R_LAST_OCTET_INVALID			 134
 #define RSA_R_MODULUS_TOO_LARGE				 105
+#define RSA_R_NON_FIPS_RSA_METHOD			 157
 #define RSA_R_NO_PUBLIC_EXPONENT			 140
 #define RSA_R_NULL_BEFORE_BLOCK_MISSING			 113
 #define RSA_R_N_DOES_NOT_EQUAL_P_Q			 127
 #define RSA_R_OAEP_DECODING_ERROR			 121
+#define RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE	 158
 #define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE	 148
 #define RSA_R_PADDING_CHECK_FAILED			 114
 #define RSA_R_P_NOT_PRIME				 128
@@ -493,7 +567,12 @@
 #define RSA_R_SSLV3_ROLLBACK_ATTACK			 115
 #define RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD 116
 #define RSA_R_UNKNOWN_ALGORITHM_TYPE			 117
+#define RSA_R_UNKNOWN_MASK_DIGEST			 151
 #define RSA_R_UNKNOWN_PADDING_TYPE			 118
+#define RSA_R_UNKNOWN_PSS_DIGEST			 152
+#define RSA_R_UNSUPPORTED_MASK_ALGORITHM		 153
+#define RSA_R_UNSUPPORTED_MASK_PARAMETER		 154
+#define RSA_R_UNSUPPORTED_SIGNATURE_TYPE		 155
 #define RSA_R_VALUE_MISSING				 147
 #define RSA_R_WRONG_SIGNATURE_LENGTH			 119
 

diff --git a/include/openssl/safestack.h b/include/openssl/safestack.h
index 3e76aa5..ea3aa0d 100644
--- a/include/openssl/safestack.h
+++ b/include/openssl/safestack.h

@@ -1459,6 +1459,94 @@
 #define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st))
 #define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st))
 
+#define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp))
+#define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN)
+#define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st))
+#define sk_SRP_gN_num(st) SKM_sk_num(SRP_gN, (st))
+#define sk_SRP_gN_value(st, i) SKM_sk_value(SRP_gN, (st), (i))
+#define sk_SRP_gN_set(st, i, val) SKM_sk_set(SRP_gN, (st), (i), (val))
+#define sk_SRP_gN_zero(st) SKM_sk_zero(SRP_gN, (st))
+#define sk_SRP_gN_push(st, val) SKM_sk_push(SRP_gN, (st), (val))
+#define sk_SRP_gN_unshift(st, val) SKM_sk_unshift(SRP_gN, (st), (val))
+#define sk_SRP_gN_find(st, val) SKM_sk_find(SRP_gN, (st), (val))
+#define sk_SRP_gN_find_ex(st, val) SKM_sk_find_ex(SRP_gN, (st), (val))
+#define sk_SRP_gN_delete(st, i) SKM_sk_delete(SRP_gN, (st), (i))
+#define sk_SRP_gN_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN, (st), (ptr))
+#define sk_SRP_gN_insert(st, val, i) SKM_sk_insert(SRP_gN, (st), (val), (i))
+#define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp))
+#define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st)
+#define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func))
+#define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st))
+#define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st))
+#define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st))
+#define sk_SRP_gN_is_sorted(st) SKM_sk_is_sorted(SRP_gN, (st))
+
+#define sk_SRP_gN_cache_new(cmp) SKM_sk_new(SRP_gN_cache, (cmp))
+#define sk_SRP_gN_cache_new_null() SKM_sk_new_null(SRP_gN_cache)
+#define sk_SRP_gN_cache_free(st) SKM_sk_free(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_num(st) SKM_sk_num(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_value(st, i) SKM_sk_value(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_set(st, i, val) SKM_sk_set(SRP_gN_cache, (st), (i), (val))
+#define sk_SRP_gN_cache_zero(st) SKM_sk_zero(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_push(st, val) SKM_sk_push(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_unshift(st, val) SKM_sk_unshift(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find(st, val) SKM_sk_find(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find_ex(st, val) SKM_sk_find_ex(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_delete(st, i) SKM_sk_delete(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN_cache, (st), (ptr))
+#define sk_SRP_gN_cache_insert(st, val, i) SKM_sk_insert(SRP_gN_cache, (st), (val), (i))
+#define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp))
+#define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st)
+#define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func))
+#define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_is_sorted(st) SKM_sk_is_sorted(SRP_gN_cache, (st))
+
+#define sk_SRP_user_pwd_new(cmp) SKM_sk_new(SRP_user_pwd, (cmp))
+#define sk_SRP_user_pwd_new_null() SKM_sk_new_null(SRP_user_pwd)
+#define sk_SRP_user_pwd_free(st) SKM_sk_free(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_num(st) SKM_sk_num(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_value(st, i) SKM_sk_value(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_set(st, i, val) SKM_sk_set(SRP_user_pwd, (st), (i), (val))
+#define sk_SRP_user_pwd_zero(st) SKM_sk_zero(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_push(st, val) SKM_sk_push(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_unshift(st, val) SKM_sk_unshift(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find(st, val) SKM_sk_find(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find_ex(st, val) SKM_sk_find_ex(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_delete(st, i) SKM_sk_delete(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_user_pwd, (st), (ptr))
+#define sk_SRP_user_pwd_insert(st, val, i) SKM_sk_insert(SRP_user_pwd, (st), (val), (i))
+#define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp))
+#define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st)
+#define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func))
+#define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_is_sorted(st) SKM_sk_is_sorted(SRP_user_pwd, (st))
+
+#define sk_SRTP_PROTECTION_PROFILE_new(cmp) SKM_sk_new(SRTP_PROTECTION_PROFILE, (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_new_null() SKM_sk_new_null(SRTP_PROTECTION_PROFILE)
+#define sk_SRTP_PROTECTION_PROFILE_free(st) SKM_sk_free(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_num(st) SKM_sk_num(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_value(st, i) SKM_sk_value(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set(st, i, val) SKM_sk_set(SRTP_PROTECTION_PROFILE, (st), (i), (val))
+#define sk_SRTP_PROTECTION_PROFILE_zero(st) SKM_sk_zero(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_push(st, val) SKM_sk_push(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_unshift(st, val) SKM_sk_unshift(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find(st, val) SKM_sk_find(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find_ex(st, val) SKM_sk_find_ex(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_delete(st, i) SKM_sk_delete(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRTP_PROTECTION_PROFILE, (st), (ptr))
+#define sk_SRTP_PROTECTION_PROFILE_insert(st, val, i) SKM_sk_insert(SRTP_PROTECTION_PROFILE, (st), (val), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st)
+#define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func))
+#define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_is_sorted(st) SKM_sk_is_sorted(SRTP_PROTECTION_PROFILE, (st))
+
 #define sk_SSL_CIPHER_new(cmp) SKM_sk_new(SSL_CIPHER, (cmp))
 #define sk_SSL_CIPHER_new_null() SKM_sk_new_null(SSL_CIPHER)
 #define sk_SSL_CIPHER_free(st) SKM_sk_free(SSL_CIPHER, (st))
@@ -2056,31 +2144,6 @@
 #define sk_OPENSSL_STRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_STRING, (st))
 
 
-#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
-#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
-#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
-#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
-#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
-#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
-#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp)  \
-	((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
-	sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
-#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
-
-
 #define sk_OPENSSL_BLOCK_new(cmp) ((STACK_OF(OPENSSL_BLOCK) *)sk_new(CHECKED_SK_CMP_FUNC(void, cmp)))
 #define sk_OPENSSL_BLOCK_new_null() ((STACK_OF(OPENSSL_BLOCK) *)sk_new_null())
 #define sk_OPENSSL_BLOCK_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val))
@@ -2106,6 +2169,31 @@
 #define sk_OPENSSL_BLOCK_is_sorted(st) SKM_sk_is_sorted(OPENSSL_BLOCK, (st))
 
 
+#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
+#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
+#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
+#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
+#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
+#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
+#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp)  \
+	((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
+	sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
+#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
+
+
 #define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \
 	SKM_ASN1_SET_OF_d2i(ACCESS_DESCRIPTION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) 
 #define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, i2d_func, ex_tag, ex_class, is_set) \

diff --git a/include/openssl/sha.h b/include/openssl/sha.h
index 16cacf9..8a6bf4b 100644
--- a/include/openssl/sha.h
+++ b/include/openssl/sha.h

@@ -106,6 +106,9 @@
 	} SHA_CTX;
 
 #ifndef OPENSSL_NO_SHA0
+#ifdef OPENSSL_FIPS
+int private_SHA_Init(SHA_CTX *c);
+#endif
 int SHA_Init(SHA_CTX *c);
 int SHA_Update(SHA_CTX *c, const void *data, size_t len);
 int SHA_Final(unsigned char *md, SHA_CTX *c);
@@ -113,6 +116,9 @@
 void SHA_Transform(SHA_CTX *c, const unsigned char *data);
 #endif
 #ifndef OPENSSL_NO_SHA1
+#ifdef OPENSSL_FIPS
+int private_SHA1_Init(SHA_CTX *c);
+#endif
 int SHA1_Init(SHA_CTX *c);
 int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
 int SHA1_Final(unsigned char *md, SHA_CTX *c);
@@ -135,6 +141,10 @@
 	} SHA256_CTX;
 
 #ifndef OPENSSL_NO_SHA256
+#ifdef OPENSSL_FIPS
+int private_SHA224_Init(SHA256_CTX *c);
+int private_SHA256_Init(SHA256_CTX *c);
+#endif
 int SHA224_Init(SHA256_CTX *c);
 int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
 int SHA224_Final(unsigned char *md, SHA256_CTX *c);
@@ -182,6 +192,10 @@
 #endif
 
 #ifndef OPENSSL_NO_SHA512
+#ifdef OPENSSL_FIPS
+int private_SHA384_Init(SHA512_CTX *c);
+int private_SHA512_Init(SHA512_CTX *c);
+#endif
 int SHA384_Init(SHA512_CTX *c);
 int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
 int SHA384_Final(unsigned char *md, SHA512_CTX *c);

diff --git a/include/openssl/srp.h b/include/openssl/srp.h
new file mode 100644
index 0000000..7ec7825
--- /dev/null
+++ b/include/openssl/srp.h

@@ -0,0 +1,172 @@
+/* crypto/srp/srp.h */
+/* Written by Christophe Renou ([email protected]) with 
+ * the precious help of Peter Sylvester ([email protected]) 
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+#ifndef __SRP_H__
+#define __SRP_H__
+
+#ifndef OPENSSL_NO_SRP
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/safestack.h>
+#include <openssl/bn.h>
+#include <openssl/crypto.h>
+
+typedef struct SRP_gN_cache_st
+	{
+	char *b64_bn;
+	BIGNUM *bn;
+	} SRP_gN_cache;
+
+
+DECLARE_STACK_OF(SRP_gN_cache)
+
+typedef struct SRP_user_pwd_st
+	{
+	char *id;
+	BIGNUM *s;
+	BIGNUM *v;
+	const BIGNUM *g;
+	const BIGNUM *N;
+	char *info;
+	} SRP_user_pwd;
+
+DECLARE_STACK_OF(SRP_user_pwd)
+
+typedef struct SRP_VBASE_st
+	{
+	STACK_OF(SRP_user_pwd) *users_pwd;
+	STACK_OF(SRP_gN_cache) *gN_cache;
+/* to simulate a user */
+	char *seed_key;
+	BIGNUM *default_g;
+	BIGNUM *default_N;
+	} SRP_VBASE;
+
+
+/*Structure interne pour retenir les couples N et g*/
+typedef struct SRP_gN_st
+	{
+	char *id;
+	BIGNUM *g;
+	BIGNUM *N;
+	} SRP_gN;
+
+DECLARE_STACK_OF(SRP_gN)
+
+SRP_VBASE *SRP_VBASE_new(char *seed_key);
+int SRP_VBASE_free(SRP_VBASE *vb);
+int SRP_VBASE_init(SRP_VBASE *vb, char * verifier_file);
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+char *SRP_create_verifier(const char *user, const char *pass, char **salt,
+			  char **verifier, const char *N, const char *g);
+int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g);
+
+
+#define SRP_NO_ERROR 0
+#define SRP_ERR_VBASE_INCOMPLETE_FILE 1
+#define SRP_ERR_VBASE_BN_LIB 2
+#define SRP_ERR_OPEN_FILE 3
+#define SRP_ERR_MEMORY 4
+
+#define DB_srptype	0
+#define DB_srpverifier	1
+#define DB_srpsalt 	2
+#define DB_srpid	3              
+#define DB_srpgN	4       
+#define DB_srpinfo	5 
+#undef  DB_NUMBER      
+#define DB_NUMBER       6
+
+#define DB_SRP_INDEX	'I'
+#define DB_SRP_VALID	'V'
+#define DB_SRP_REVOKED	'R'
+#define DB_SRP_MODIF	'v'
+
+
+/* see srp.c */
+char * SRP_check_known_gN_param(BIGNUM* g, BIGNUM* N); 
+SRP_gN *SRP_get_default_gN(const char * id) ;
+
+/* server side .... */
+BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N);
+BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v);
+int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N);
+BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N) ;
+
+
+
+/* client side .... */
+BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass);
+BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g);
+BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u);
+int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N);
+
+#define SRP_MINIMAL_N 1024
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
+#endif

diff --git a/include/openssl/srtp.h b/include/openssl/srtp.h
new file mode 100644
index 0000000..c0cf33e
--- /dev/null
+++ b/include/openssl/srtp.h

@@ -0,0 +1,145 @@
+/* ssl/tls1.h */
+/* Copyright (C) 1995-1998 Eric Young ([email protected])
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young ([email protected]).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson ([email protected]).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young ([email protected])"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson ([email protected])"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+/*
+  DTLS code by Eric Rescorla <[email protected]>
+
+  Copyright (C) 2006, Network Resonance, Inc.
+  Copyright (C) 2011, RTFM, Inc.
+*/
+
+#ifndef HEADER_D1_SRTP_H
+#define HEADER_D1_SRTP_H
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+     
+#define SRTP_AES128_CM_SHA1_80 0x0001
+#define SRTP_AES128_CM_SHA1_32 0x0002
+#define SRTP_AES128_F8_SHA1_80 0x0003
+#define SRTP_AES128_F8_SHA1_32 0x0004
+#define SRTP_NULL_SHA1_80      0x0005
+#define SRTP_NULL_SHA1_32      0x0006
+
+int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles);
+int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles);
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s);
+
+STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl);
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
+

diff --git a/include/openssl/ssl.h b/include/openssl/ssl.h
index d88d22d..90d5537 100644
--- a/include/openssl/ssl.h
+++ b/include/openssl/ssl.h

@@ -252,6 +252,7 @@
 #define SSL_TXT_kEECDH		"kEECDH"
 #define SSL_TXT_kPSK            "kPSK"
 #define SSL_TXT_kGOST		"kGOST"
+#define SSL_TXT_kSRP		"kSRP"
 
 #define	SSL_TXT_aRSA		"aRSA"
 #define	SSL_TXT_aDSS		"aDSS"
@@ -275,6 +276,7 @@
 #define SSL_TXT_ECDSA		"ECDSA"
 #define SSL_TXT_KRB5      	"KRB5"
 #define SSL_TXT_PSK             "PSK"
+#define SSL_TXT_SRP		"SRP"
 
 #define SSL_TXT_DES		"DES"
 #define SSL_TXT_3DES		"3DES"
@@ -285,6 +287,7 @@
 #define SSL_TXT_AES128		"AES128"
 #define SSL_TXT_AES256		"AES256"
 #define SSL_TXT_AES		"AES"
+#define SSL_TXT_AES_GCM		"AESGCM"
 #define SSL_TXT_CAMELLIA128	"CAMELLIA128"
 #define SSL_TXT_CAMELLIA256	"CAMELLIA256"
 #define SSL_TXT_CAMELLIA	"CAMELLIA"
@@ -294,10 +297,14 @@
 #define SSL_TXT_SHA		"SHA" /* same as "SHA1" */
 #define SSL_TXT_GOST94		"GOST94" 
 #define SSL_TXT_GOST89MAC		"GOST89MAC" 
+#define SSL_TXT_SHA256		"SHA256"
+#define SSL_TXT_SHA384		"SHA384"
 
 #define SSL_TXT_SSLV2		"SSLv2"
 #define SSL_TXT_SSLV3		"SSLv3"
 #define SSL_TXT_TLSV1		"TLSv1"
+#define SSL_TXT_TLSV1_1		"TLSv1.1"
+#define SSL_TXT_TLSV1_2		"TLSv1.2"
 
 #define SSL_TXT_EXP		"EXP"
 #define SSL_TXT_EXPORT		"EXPORT"
@@ -356,9 +363,29 @@
  * in SSL_CTX. */
 typedef struct ssl_st *ssl_crock_st;
 typedef struct tls_session_ticket_ext_st TLS_SESSION_TICKET_EXT;
+typedef struct ssl_method_st SSL_METHOD;
+typedef struct ssl_cipher_st SSL_CIPHER;
+typedef struct ssl_session_st SSL_SESSION;
+
+DECLARE_STACK_OF(SSL_CIPHER)
+
+/* SRTP protection profiles for use with the use_srtp extension (RFC 5764)*/
+typedef struct srtp_protection_profile_st
+       {
+       const char *name;
+       unsigned long id;
+       } SRTP_PROTECTION_PROFILE;
+
+DECLARE_STACK_OF(SRTP_PROTECTION_PROFILE)
+
+typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg);
+typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg);
+
+
+#ifndef OPENSSL_NO_SSL_INTERN
 
 /* used to hold info on the particular ciphers used */
-typedef struct ssl_cipher_st
+struct ssl_cipher_st
 	{
 	int valid;
 	const char *name;		/* text name */
@@ -375,15 +402,11 @@
 	unsigned long algorithm2;	/* Extra flags */
 	int strength_bits;		/* Number of bits really used */
 	int alg_bits;			/* Number of bits for algorithm */
-	} SSL_CIPHER;
+	};
 
-DECLARE_STACK_OF(SSL_CIPHER)
-
-typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg);
-typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg);
 
 /* Used to hold functions for SSLv2 or SSLv3/TLSv1 functions */
-typedef struct ssl_method_st
+struct ssl_method_st
 	{
 	int version;
 	int (*ssl_new)(SSL *s);
@@ -416,7 +439,7 @@
 	int (*ssl_version)(void);
 	long (*ssl_callback_ctrl)(SSL *s, int cb_id, void (*fp)(void));
 	long (*ssl_ctx_callback_ctrl)(SSL_CTX *s, int cb_id, void (*fp)(void));
-	} SSL_METHOD;
+	};
 
 /* Lets make this into an ASN.1 type structure as follows
  * SSL_SESSION_ID ::= SEQUENCE {
@@ -433,14 +456,17 @@
  *	Session_ID_context [ 4 ] EXPLICIT OCTET STRING,   -- the Session ID context
  *	Verify_result [ 5 ] EXPLICIT INTEGER,   -- X509_V_... code for `Peer'
  *	HostName [ 6 ] EXPLICIT OCTET STRING,   -- optional HostName from servername TLS extension 
- *	ECPointFormatList [ 7 ] OCTET STRING,     -- optional EC point format list from TLS extension
- *	PSK_identity_hint [ 8 ] EXPLICIT OCTET STRING, -- optional PSK identity hint
- *	PSK_identity [ 9 ] EXPLICIT OCTET STRING -- optional PSK identity
+ *	PSK_identity_hint [ 7 ] EXPLICIT OCTET STRING, -- optional PSK identity hint
+ *	PSK_identity [ 8 ] EXPLICIT OCTET STRING,  -- optional PSK identity
+ *	Ticket_lifetime_hint [9] EXPLICIT INTEGER, -- server's lifetime hint for session ticket
+ *	Ticket [10]             EXPLICIT OCTET STRING, -- session ticket (clients only)
+ *	Compression_meth [11]   EXPLICIT OCTET STRING, -- optional compression method
+ *	SRP_username [ 12 ] EXPLICIT OCTET STRING -- optional SRP username
  *	}
  * Look in ssl/ssl_asn1.c for more details
  * I'm using EXPLICIT tags so I can read the damn things using asn1parse :-).
  */
-typedef struct ssl_session_st
+struct ssl_session_st
 	{
 	int ssl_version;	/* what ssl version session info is
 				 * being kept in here? */
@@ -512,8 +538,12 @@
 	size_t	tlsext_ticklen;		/* Session ticket length */	
 	long tlsext_tick_lifetime_hint;	/* Session lifetime hint in seconds */
 #endif
-	} SSL_SESSION;
+#ifndef OPENSSL_NO_SRP
+	char *srp_username;
+#endif
+	};
 
+#endif
 
 #define SSL_OP_MICROSOFT_SESS_ID_BUG			0x00000001L
 #define SSL_OP_NETSCAPE_CHALLENGE_BUG			0x00000002L
@@ -526,6 +556,7 @@
 #define SSL_OP_SSLEAY_080_CLIENT_DH_BUG			0x00000080L
 #define SSL_OP_TLS_D5_BUG				0x00000100L
 #define SSL_OP_TLS_BLOCK_PADDING_BUG			0x00000200L
+#define SSL_OP_NO_TLSv1_1				0x00000400L
 
 /* Disable SSL 3.0/TLS 1.0 CBC vulnerability workaround that was added
  * in OpenSSL 0.9.6d.  Usually (depending on the application protocol)
@@ -536,7 +567,7 @@
 
 /* SSL_OP_ALL: various bug workarounds that should be rather harmless.
  *             This used to be 0x000FFFFFL before 0.9.7. */
-#define SSL_OP_ALL					0x80000FFFL
+#define SSL_OP_ALL					0x80000BFFL
 
 /* DTLS options */
 #define SSL_OP_NO_QUERY_MTU                 0x00001000L
@@ -572,11 +603,16 @@
 #define SSL_OP_NO_SSLv2					0x01000000L
 #define SSL_OP_NO_SSLv3					0x02000000L
 #define SSL_OP_NO_TLSv1					0x04000000L
+#define SSL_OP_NO_TLSv1_2				0x08000000L
 
+/* These next two were never actually used for anything since SSLeay
+ * zap so we have some more flags.
+ */
 /* The next flag deliberately changes the ciphertest, this is a check
  * for the PKCS#1 attack */
-#define SSL_OP_PKCS1_CHECK_1				0x08000000L
-#define SSL_OP_PKCS1_CHECK_2				0x10000000L
+#define SSL_OP_PKCS1_CHECK_1				0x0
+#define SSL_OP_PKCS1_CHECK_2				0x0
+
 #define SSL_OP_NETSCAPE_CA_DN_BUG			0x20000000L
 #define SSL_OP_NETSCAPE_DEMO_CIPHER_CHANGE_BUG		0x40000000L
 /* Make server add server-hello extension from early version of
@@ -644,12 +680,53 @@
 #define SSL_get_secure_renegotiation_support(ssl) \
 	SSL_ctrl((ssl), SSL_CTRL_GET_RI_SUPPORT, 0, NULL)
 
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_heartbeat(ssl) \
+        SSL_ctrl((ssl),SSL_CTRL_TLS_EXT_SEND_HEARTBEAT,0,NULL)
+#endif
+
 void SSL_CTX_set_msg_callback(SSL_CTX *ctx, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg));
 void SSL_set_msg_callback(SSL *ssl, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg));
 #define SSL_CTX_set_msg_callback_arg(ctx, arg) SSL_CTX_ctrl((ctx), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg))
 #define SSL_set_msg_callback_arg(ssl, arg) SSL_ctrl((ssl), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg))
 
+#ifndef OPENSSL_NO_SRP
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
+typedef struct srp_ctx_st
+	{
+	/* param for all the callbacks */
+	void *SRP_cb_arg;
+	/* set client Hello login callback */
+	int (*TLS_ext_srp_username_callback)(SSL *, int *, void *);
+	/* set SRP N/g param callback for verification */
+	int (*SRP_verify_param_callback)(SSL *, void *);
+	/* set SRP client passwd callback */
+	char *(*SRP_give_srp_client_pwd_callback)(SSL *, void *);
+
+	char *login;
+	BIGNUM *N,*g,*s,*B,*A;
+	BIGNUM *a,*b,*v;
+	char *info;
+	int strength;
+
+	unsigned long srp_Mask;
+	} SRP_CTX;
+
+#endif
+
+/* see tls_srp.c */
+int SSL_SRP_CTX_init(SSL *s);
+int SSL_CTX_SRP_CTX_init(SSL_CTX *ctx);
+int SSL_SRP_CTX_free(SSL *ctx);
+int SSL_CTX_SRP_CTX_free(SSL_CTX *ctx);
+int SSL_srp_server_param_with_username(SSL *s, int *ad);
+int SRP_generate_server_master_secret(SSL *s,unsigned char *master_key);
+int SRP_Calc_A_param(SSL *s);
+int SRP_generate_client_master_secret(SSL *s,unsigned char *master_key);
+
+#endif
 
 #if defined(OPENSSL_SYS_MSDOS) && !defined(OPENSSL_SYS_WIN32)
 #define SSL_MAX_CERT_LIST_DEFAULT 1024*30 /* 30k max cert list :-) */
@@ -675,7 +752,11 @@
 typedef int (*GEN_SESSION_CB)(const SSL *ssl, unsigned char *id,
 				unsigned int *id_len);
 
-typedef struct ssl_comp_st
+typedef struct ssl_comp_st SSL_COMP;
+
+#ifndef OPENSSL_NO_SSL_INTERN
+
+struct ssl_comp_st
 	{
 	int id;
 	const char *name;
@@ -684,7 +765,7 @@
 #else
 	char *method;
 #endif
-	} SSL_COMP;
+	};
 
 DECLARE_STACK_OF(SSL_COMP)
 DECLARE_LHASH_OF(SSL_SESSION);
@@ -853,11 +934,31 @@
 	/* Callback for status request */
 	int (*tlsext_status_cb)(SSL *ssl, void *arg);
 	void *tlsext_status_arg;
-
 	/* draft-rescorla-tls-opaque-prf-input-00.txt information */
 	int (*tlsext_opaque_prf_input_callback)(SSL *, void *peerinput, size_t len, void *arg);
 	void *tlsext_opaque_prf_input_callback_arg;
+#endif
 
+#ifndef OPENSSL_NO_PSK
+	char *psk_identity_hint;
+	unsigned int (*psk_client_callback)(SSL *ssl, const char *hint, char *identity,
+		unsigned int max_identity_len, unsigned char *psk,
+		unsigned int max_psk_len);
+	unsigned int (*psk_server_callback)(SSL *ssl, const char *identity,
+		unsigned char *psk, unsigned int max_psk_len);
+#endif
+
+#ifndef OPENSSL_NO_BUF_FREELISTS
+#define SSL_MAX_BUF_FREELIST_LEN_DEFAULT 32
+	unsigned int freelist_max_len;
+	struct ssl3_buf_freelist_st *wbuf_freelist;
+	struct ssl3_buf_freelist_st *rbuf_freelist;
+#endif
+#ifndef OPENSSL_NO_SRP
+	SRP_CTX srp_ctx; /* ctx for SRP authentication */
+#endif
+
+#ifndef OPENSSL_NO_TLSEXT
 # ifndef OPENSSL_NO_NEXTPROTONEG
 	/* Next protocol negotiation information */
 	/* (for experimental NPN extension). */
@@ -876,25 +977,13 @@
 				    void *arg);
 	void *next_proto_select_cb_arg;
 # endif
-#endif
-
-#ifndef OPENSSL_NO_PSK
-	char *psk_identity_hint;
-	unsigned int (*psk_client_callback)(SSL *ssl, const char *hint, char *identity,
-		unsigned int max_identity_len, unsigned char *psk,
-		unsigned int max_psk_len);
-	unsigned int (*psk_server_callback)(SSL *ssl, const char *identity,
-		unsigned char *psk, unsigned int max_psk_len);
-#endif
-
-#ifndef OPENSSL_NO_BUF_FREELISTS
-#define SSL_MAX_BUF_FREELIST_LEN_DEFAULT 32
-	unsigned int freelist_max_len;
-	struct ssl3_buf_freelist_st *wbuf_freelist;
-	struct ssl3_buf_freelist_st *rbuf_freelist;
+        /* SRTP profiles we are willing to do from RFC 5764 */
+        STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles;  
 #endif
 	};
 
+#endif
+
 #define SSL_SESS_CACHE_OFF			0x0000
 #define SSL_SESS_CACHE_CLIENT			0x0001
 #define SSL_SESS_CACHE_SERVER			0x0002
@@ -952,24 +1041,26 @@
 					   int (*cb) (SSL *ssl,
 						      const unsigned char **out,
 						      unsigned int *outlen,
-						      void *arg), void *arg);
+						      void *arg),
+					   void *arg);
 void SSL_CTX_set_next_proto_select_cb(SSL_CTX *s,
-				      int (*cb) (SSL *ssl, unsigned char **out,
+				      int (*cb) (SSL *ssl,
+						 unsigned char **out,
 						 unsigned char *outlen,
 						 const unsigned char *in,
-						 unsigned int inlen, void *arg),
+						 unsigned int inlen,
+						 void *arg),
 				      void *arg);
 
 int SSL_select_next_proto(unsigned char **out, unsigned char *outlen,
 			  const unsigned char *in, unsigned int inlen,
 			  const unsigned char *client, unsigned int client_len);
-void SSL_get0_next_proto_negotiated(const SSL *s, const unsigned char **data,
-				    unsigned *len);
+void SSL_get0_next_proto_negotiated(const SSL *s,
+				    const unsigned char **data, unsigned *len);
 
 #define OPENSSL_NPN_UNSUPPORTED	0
 #define OPENSSL_NPN_NEGOTIATED	1
 #define OPENSSL_NPN_NO_OVERLAP	2
-
 #endif
 
 #ifndef OPENSSL_NO_PSK
@@ -1011,6 +1102,8 @@
 #define SSL_MAC_FLAG_READ_MAC_STREAM 1
 #define SSL_MAC_FLAG_WRITE_MAC_STREAM 2
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 struct ssl_st
 	{
 	/* protocol version
@@ -1055,9 +1148,7 @@
 
 	int server;	/* are we the server side? - mostly used by SSL_clear*/
 
-	int new_session;/* 1 if we are to use a new session.
-	                 * 2 if we are a server and are inside a handshake
-	                 *   (i.e. not just sending a HelloRequest)
+	int new_session;/* Generate a new session or reuse an old one.
 	                 * NB: For servers, the 'new' session may actually be a previously
 	                 * cached session or even the previous session unless
 	                 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */
@@ -1244,11 +1335,32 @@
 #endif
 
 #define session_ctx initial_ctx
+
+	STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles;  /* What we'll do */
+	SRTP_PROTECTION_PROFILE *srtp_profile;            /* What's been chosen */
+
+	unsigned int tlsext_heartbeat;  /* Is use of the Heartbeat extension negotiated?
+	                                   0: disabled
+	                                   1: enabled
+	                                   2: enabled, but not allowed to send Requests
+	                                 */
+	unsigned int tlsext_hb_pending; /* Indicates if a HeartbeatRequest is in flight */
+	unsigned int tlsext_hb_seq;     /* HeartbeatRequest sequence number */
 #else
 #define session_ctx ctx
 #endif /* OPENSSL_NO_TLSEXT */
+
+	int renegotiate;/* 1 if we are renegotiating.
+	                 * 2 if we are a server and are inside a handshake
+	                 * (i.e. not just sending a HelloRequest) */
+
+#ifndef OPENSSL_NO_SRP
+	SRP_CTX srp_ctx; /* ctx for SRP authentication */
+#endif
 	};
 
+#endif
+
 #ifdef __cplusplus
 }
 #endif
@@ -1258,6 +1370,7 @@
 #include <openssl/tls1.h> /* This is mostly sslv3 with a few tweaks */
 #include <openssl/dtls1.h> /* Datagram TLS */
 #include <openssl/ssl23.h>
+#include <openssl/srtp.h>  /* Support for the use_srtp extension */
 
 #ifdef  __cplusplus
 extern "C" {
@@ -1476,6 +1589,20 @@
 #define SSL_CTRL_SET_TLSEXT_STATUS_REQ_OCSP_RESP	71
 
 #define SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB	72
+
+#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB	75
+#define SSL_CTRL_SET_SRP_VERIFY_PARAM_CB		76
+#define SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB		77
+
+#define SSL_CTRL_SET_SRP_ARG		78
+#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME		79
+#define SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH		80
+#define SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD		81
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_CTRL_TLS_EXT_SEND_HEARTBEAT				85
+#define SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING		86
+#define SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS	87
+#endif
 #endif
 
 #define DTLS_CTRL_GET_TIMEOUT		73
@@ -1486,6 +1613,9 @@
 #define SSL_CTRL_CLEAR_OPTIONS			77
 #define SSL_CTRL_CLEAR_MODE			78
 
+#define SSL_CTRL_GET_EXTRA_CHAIN_CERTS		82
+#define SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS	83
+
 #define DTLSv1_get_timeout(ssl, arg) \
 	SSL_ctrl(ssl,DTLS_CTRL_GET_TIMEOUT,0, (void *)arg)
 #define DTLSv1_handle_timeout(ssl) \
@@ -1522,6 +1652,10 @@
 
 #define SSL_CTX_add_extra_chain_cert(ctx,x509) \
 	SSL_CTX_ctrl(ctx,SSL_CTRL_EXTRA_CHAIN_CERT,0,(char *)x509)
+#define SSL_CTX_get_extra_chain_certs(ctx,px509) \
+	SSL_CTX_ctrl(ctx,SSL_CTRL_GET_EXTRA_CHAIN_CERTS,0,px509)
+#define SSL_CTX_clear_extra_chain_certs(ctx) \
+	SSL_CTX_ctrl(ctx,SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS,0,NULL)
 
 #ifndef OPENSSL_NO_BIO
 BIO_METHOD *BIO_f_ssl(void);
@@ -1549,7 +1683,7 @@
 int	SSL_CIPHER_get_bits(const SSL_CIPHER *c,int *alg_bits);
 char *	SSL_CIPHER_get_version(const SSL_CIPHER *c);
 const char *	SSL_CIPHER_get_name(const SSL_CIPHER *c);
-const char *	SSL_CIPHER_authentication_method(const SSL_CIPHER *c);
+unsigned long 	SSL_CIPHER_get_id(const SSL_CIPHER *c);
 
 int	SSL_get_fd(const SSL *s);
 int	SSL_get_rfd(const SSL *s);
@@ -1619,11 +1753,15 @@
 long	SSL_SESSION_get_timeout(const SSL_SESSION *s);
 long	SSL_SESSION_set_timeout(SSL_SESSION *s, long t);
 void	SSL_copy_session_id(SSL *to,const SSL *from);
+X509 *SSL_SESSION_get0_peer(SSL_SESSION *s);
+int SSL_SESSION_set1_id_context(SSL_SESSION *s,const unsigned char *sid_ctx,
+			       unsigned int sid_ctx_len);
 
 SSL_SESSION *SSL_SESSION_new(void);
 const unsigned char *SSL_SESSION_get_id(const SSL_SESSION *s,
 					unsigned int *len);
 const char *	SSL_SESSION_get_version(const SSL_SESSION *s);
+unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s);
 #ifndef OPENSSL_NO_FP_API
 int	SSL_SESSION_print_fp(FILE *fp,const SSL_SESSION *ses);
 #endif
@@ -1687,6 +1825,30 @@
 int SSL_CTX_set1_param(SSL_CTX *ctx, X509_VERIFY_PARAM *vpm);
 int SSL_set1_param(SSL *ssl, X509_VERIFY_PARAM *vpm);
 
+#ifndef OPENSSL_NO_SRP
+int SSL_CTX_set_srp_username(SSL_CTX *ctx,char *name);
+int SSL_CTX_set_srp_password(SSL_CTX *ctx,char *password);
+int SSL_CTX_set_srp_strength(SSL_CTX *ctx, int strength);
+int SSL_CTX_set_srp_client_pwd_callback(SSL_CTX *ctx,
+					char *(*cb)(SSL *,void *));
+int SSL_CTX_set_srp_verify_param_callback(SSL_CTX *ctx,
+					  int (*cb)(SSL *,void *));
+int SSL_CTX_set_srp_username_callback(SSL_CTX *ctx,
+				      int (*cb)(SSL *,int *,void *));
+int SSL_CTX_set_srp_cb_arg(SSL_CTX *ctx, void *arg);
+
+int SSL_set_srp_server_param(SSL *s, const BIGNUM *N, const BIGNUM *g,
+			     BIGNUM *sa, BIGNUM *v, char *info);
+int SSL_set_srp_server_param_pw(SSL *s, const char *user, const char *pass,
+				const char *grp);
+
+BIGNUM *SSL_get_srp_g(SSL *s);
+BIGNUM *SSL_get_srp_N(SSL *s);
+
+char *SSL_get_srp_username(SSL *s);
+char *SSL_get_srp_userinfo(SSL *s);
+#endif
+
 void	SSL_free(SSL *ssl);
 int 	SSL_accept(SSL *ssl);
 int 	SSL_connect(SSL *ssl);
@@ -1722,6 +1884,15 @@
 const SSL_METHOD *TLSv1_server_method(void);	/* TLSv1.0 */
 const SSL_METHOD *TLSv1_client_method(void);	/* TLSv1.0 */
 
+const SSL_METHOD *TLSv1_1_method(void);		/* TLSv1.1 */
+const SSL_METHOD *TLSv1_1_server_method(void);	/* TLSv1.1 */
+const SSL_METHOD *TLSv1_1_client_method(void);	/* TLSv1.1 */
+
+const SSL_METHOD *TLSv1_2_method(void);		/* TLSv1.2 */
+const SSL_METHOD *TLSv1_2_server_method(void);	/* TLSv1.2 */
+const SSL_METHOD *TLSv1_2_client_method(void);	/* TLSv1.2 */
+
+
 const SSL_METHOD *DTLSv1_method(void);		/* DTLSv1.0 */
 const SSL_METHOD *DTLSv1_server_method(void);	/* DTLSv1.0 */
 const SSL_METHOD *DTLSv1_client_method(void);	/* DTLSv1.0 */
@@ -1730,6 +1901,7 @@
 
 int SSL_do_handshake(SSL *s);
 int SSL_renegotiate(SSL *s);
+int SSL_renegotiate_abbreviated(SSL *s);
 int SSL_renegotiate_pending(SSL *s);
 int SSL_shutdown(SSL *s);
 
@@ -1781,6 +1953,7 @@
 			   void (*cb)(const SSL *ssl,int type,int val));
 void (*SSL_get_info_callback(const SSL *ssl))(const SSL *ssl,int type,int val);
 int SSL_state(const SSL *ssl);
+void SSL_set_state(SSL *ssl, int state);
 
 void SSL_set_verify_result(SSL *ssl,long v);
 long SSL_get_verify_result(const SSL *ssl);
@@ -1881,6 +2054,9 @@
 /* Pre-shared secret session resumption functions */
 int SSL_set_session_secret_cb(SSL *s, tls_session_secret_cb_fn tls_session_secret_cb, void *arg);
 
+void SSL_set_debug(SSL *s, int debug);
+int SSL_cache_hit(SSL *s);
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -1900,7 +2076,6 @@
 #define SSL_F_DTLS1_ACCEPT				 246
 #define SSL_F_DTLS1_ADD_CERT_TO_BUF			 295
 #define SSL_F_DTLS1_BUFFER_RECORD			 247
-#define SSL_F_DTLS1_CHECK_TIMEOUT_NUM			 305
 #define SSL_F_DTLS1_CLIENT_HELLO			 248
 #define SSL_F_DTLS1_CONNECT				 249
 #define SSL_F_DTLS1_ENC					 250
@@ -1909,6 +2084,7 @@
 #define SSL_F_DTLS1_GET_MESSAGE_FRAGMENT		 253
 #define SSL_F_DTLS1_GET_RECORD				 254
 #define SSL_F_DTLS1_HANDLE_TIMEOUT			 297
+#define SSL_F_DTLS1_HEARTBEAT				 305
 #define SSL_F_DTLS1_OUTPUT_CERT_CHAIN			 255
 #define SSL_F_DTLS1_PREPROCESS_FRAGMENT			 288
 #define SSL_F_DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE		 256
@@ -1977,7 +2153,7 @@
 #define SSL_F_SSL3_GET_KEY_EXCHANGE			 141
 #define SSL_F_SSL3_GET_MESSAGE				 142
 #define SSL_F_SSL3_GET_NEW_SESSION_TICKET		 283
-#define SSL_F_SSL3_GET_NEXT_PROTO			 304
+#define SSL_F_SSL3_GET_NEXT_PROTO			 306
 #define SSL_F_SSL3_GET_RECORD				 143
 #define SSL_F_SSL3_GET_SERVER_CERTIFICATE		 144
 #define SSL_F_SSL3_GET_SERVER_DONE			 145
@@ -2002,10 +2178,12 @@
 #define SSL_F_SSL3_WRITE_PENDING			 159
 #define SSL_F_SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT	 298
 #define SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT		 277
+#define SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT		 307
 #define SSL_F_SSL_ADD_DIR_CERT_SUBJECTS_TO_STACK	 215
 #define SSL_F_SSL_ADD_FILE_CERT_SUBJECTS_TO_STACK	 216
 #define SSL_F_SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT	 299
 #define SSL_F_SSL_ADD_SERVERHELLO_TLSEXT		 278
+#define SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT		 308
 #define SSL_F_SSL_BAD_METHOD				 160
 #define SSL_F_SSL_BYTES_TO_CIPHER_LIST			 161
 #define SSL_F_SSL_CERT_DUP				 221
@@ -2022,6 +2200,7 @@
 #define SSL_F_SSL_CREATE_CIPHER_LIST			 166
 #define SSL_F_SSL_CTRL					 232
 #define SSL_F_SSL_CTX_CHECK_PRIVATE_KEY			 168
+#define SSL_F_SSL_CTX_MAKE_PROFILES			 309
 #define SSL_F_SSL_CTX_NEW				 169
 #define SSL_F_SSL_CTX_SET_CIPHER_LIST			 269
 #define SSL_F_SSL_CTX_SET_CLIENT_CERT_ENGINE		 290
@@ -2050,8 +2229,10 @@
 #define SSL_F_SSL_NEW					 186
 #define SSL_F_SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT	 300
 #define SSL_F_SSL_PARSE_CLIENTHELLO_TLSEXT		 302
+#define SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT	 310
 #define SSL_F_SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT	 301
 #define SSL_F_SSL_PARSE_SERVERHELLO_TLSEXT		 303
+#define SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT	 311
 #define SSL_F_SSL_PEEK					 270
 #define SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT		 281
 #define SSL_F_SSL_PREPARE_SERVERHELLO_TLSEXT		 282
@@ -2060,6 +2241,7 @@
 #define SSL_F_SSL_RSA_PUBLIC_ENCRYPT			 188
 #define SSL_F_SSL_SESSION_NEW				 189
 #define SSL_F_SSL_SESSION_PRINT_FP			 190
+#define SSL_F_SSL_SESSION_SET1_ID_CONTEXT		 312
 #define SSL_F_SSL_SESS_CERT_NEW				 225
 #define SSL_F_SSL_SET_CERT				 191
 #define SSL_F_SSL_SET_CIPHER_LIST			 271
@@ -2073,6 +2255,7 @@
 #define SSL_F_SSL_SET_TRUST				 228
 #define SSL_F_SSL_SET_WFD				 196
 #define SSL_F_SSL_SHUTDOWN				 224
+#define SSL_F_SSL_SRP_CTX_INIT				 313
 #define SSL_F_SSL_UNDEFINED_CONST_FUNCTION		 243
 #define SSL_F_SSL_UNDEFINED_FUNCTION			 197
 #define SSL_F_SSL_UNDEFINED_VOID_FUNCTION		 244
@@ -2093,6 +2276,8 @@
 #define SSL_F_TLS1_CHANGE_CIPHER_STATE			 209
 #define SSL_F_TLS1_CHECK_SERVERHELLO_TLSEXT		 274
 #define SSL_F_TLS1_ENC					 210
+#define SSL_F_TLS1_EXPORT_KEYING_MATERIAL		 314
+#define SSL_F_TLS1_HEARTBEAT				 315
 #define SSL_F_TLS1_PREPARE_CLIENTHELLO_TLSEXT		 275
 #define SSL_F_TLS1_PREPARE_SERVERHELLO_TLSEXT		 276
 #define SSL_F_TLS1_PRF					 284
@@ -2132,6 +2317,13 @@
 #define SSL_R_BAD_RSA_MODULUS_LENGTH			 121
 #define SSL_R_BAD_RSA_SIGNATURE				 122
 #define SSL_R_BAD_SIGNATURE				 123
+#define SSL_R_BAD_SRP_A_LENGTH				 347
+#define SSL_R_BAD_SRP_B_LENGTH				 348
+#define SSL_R_BAD_SRP_G_LENGTH				 349
+#define SSL_R_BAD_SRP_N_LENGTH				 350
+#define SSL_R_BAD_SRP_S_LENGTH				 351
+#define SSL_R_BAD_SRTP_MKI_VALUE			 352
+#define SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST		 353
 #define SSL_R_BAD_SSL_FILETYPE				 124
 #define SSL_R_BAD_SSL_SESSION_ID_LENGTH			 125
 #define SSL_R_BAD_STATE					 126
@@ -2170,14 +2362,15 @@
 #define SSL_R_ECC_CERT_SHOULD_HAVE_RSA_SIGNATURE	 322
 #define SSL_R_ECC_CERT_SHOULD_HAVE_SHA1_SIGNATURE	 323
 #define SSL_R_ECGROUP_TOO_LARGE_FOR_CIPHER		 310
+#define SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST	 354
 #define SSL_R_ENCRYPTED_LENGTH_TOO_LONG			 150
 #define SSL_R_ERROR_GENERATING_TMP_RSA_KEY		 282
 #define SSL_R_ERROR_IN_RECEIVED_CIPHER_LIST		 151
 #define SSL_R_EXCESSIVE_MESSAGE_SIZE			 152
 #define SSL_R_EXTRA_DATA_IN_MESSAGE			 153
 #define SSL_R_GOT_A_FIN_BEFORE_A_CCS			 154
-#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS		 346
-#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION		 347
+#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS		 355
+#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION		 356
 #define SSL_R_HTTPS_PROXY_REQUEST			 155
 #define SSL_R_HTTP_REQUEST				 156
 #define SSL_R_ILLEGAL_PADDING				 283
@@ -2186,6 +2379,7 @@
 #define SSL_R_INVALID_COMMAND				 280
 #define SSL_R_INVALID_COMPRESSION_ALGORITHM		 341
 #define SSL_R_INVALID_PURPOSE				 278
+#define SSL_R_INVALID_SRP_USERNAME			 357
 #define SSL_R_INVALID_STATUS_RESPONSE			 328
 #define SSL_R_INVALID_TICKET_KEYS_LENGTH		 325
 #define SSL_R_INVALID_TRUST				 279
@@ -2215,6 +2409,7 @@
 #define SSL_R_MISSING_RSA_CERTIFICATE			 168
 #define SSL_R_MISSING_RSA_ENCRYPTING_CERT		 169
 #define SSL_R_MISSING_RSA_SIGNING_CERT			 170
+#define SSL_R_MISSING_SRP_PARAM				 358
 #define SSL_R_MISSING_TMP_DH_KEY			 171
 #define SSL_R_MISSING_TMP_ECDH_KEY			 311
 #define SSL_R_MISSING_TMP_RSA_KEY			 172
@@ -2244,6 +2439,7 @@
 #define SSL_R_NO_RENEGOTIATION				 339
 #define SSL_R_NO_REQUIRED_DIGEST			 324
 #define SSL_R_NO_SHARED_CIPHER				 193
+#define SSL_R_NO_SRTP_PROFILES				 359
 #define SSL_R_NO_VERIFY_CALLBACK			 194
 #define SSL_R_NULL_SSL_CTX				 195
 #define SSL_R_NULL_SSL_METHOD_PASSED			 196
@@ -2288,7 +2484,12 @@
 #define SSL_R_SESSION_ID_CONTEXT_UNINITIALIZED		 277
 #define SSL_R_SESSION_MAY_NOT_BE_CREATED		 2000
 #define SSL_R_SHORT_READ				 219
+#define SSL_R_SIGNATURE_ALGORITHMS_ERROR		 360
 #define SSL_R_SIGNATURE_FOR_NON_SIGNING_CERTIFICATE	 220
+#define SSL_R_SRP_A_CALC				 361
+#define SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES		 362
+#define SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG	 363
+#define SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE		 364
 #define SSL_R_SSL23_DOING_SESSION_ID_REUSE		 221
 #define SSL_R_SSL2_CONNECTION_ID_TOO_LONG		 299
 #define SSL_R_SSL3_EXT_INVALID_ECPOINTFORMAT		 321
@@ -2333,6 +2534,9 @@
 #define SSL_R_TLSV1_UNRECOGNIZED_NAME			 1112
 #define SSL_R_TLSV1_UNSUPPORTED_EXTENSION		 1110
 #define SSL_R_TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER	 232
+#define SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT		 365
+#define SSL_R_TLS_HEARTBEAT_PENDING			 366
+#define SSL_R_TLS_ILLEGAL_EXPORTER_LABEL		 367
 #define SSL_R_TLS_INVALID_ECPOINTFORMAT_LIST		 157
 #define SSL_R_TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST 233
 #define SSL_R_TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG	 234
@@ -2354,6 +2558,7 @@
 #define SSL_R_UNKNOWN_CERTIFICATE_TYPE			 247
 #define SSL_R_UNKNOWN_CIPHER_RETURNED			 248
 #define SSL_R_UNKNOWN_CIPHER_TYPE			 249
+#define SSL_R_UNKNOWN_DIGEST				 368
 #define SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE			 250
 #define SSL_R_UNKNOWN_PKEY_TYPE				 251
 #define SSL_R_UNKNOWN_PROTOCOL				 252
@@ -2368,12 +2573,14 @@
 #define SSL_R_UNSUPPORTED_PROTOCOL			 258
 #define SSL_R_UNSUPPORTED_SSL_VERSION			 259
 #define SSL_R_UNSUPPORTED_STATUS_TYPE			 329
+#define SSL_R_USE_SRTP_NOT_NEGOTIATED			 369
 #define SSL_R_WRITE_BIO_NOT_SET				 260
 #define SSL_R_WRONG_CIPHER_RETURNED			 261
 #define SSL_R_WRONG_MESSAGE_TYPE			 262
 #define SSL_R_WRONG_NUMBER_OF_KEY_BITS			 263
 #define SSL_R_WRONG_SIGNATURE_LENGTH			 264
 #define SSL_R_WRONG_SIGNATURE_SIZE			 265
+#define SSL_R_WRONG_SIGNATURE_TYPE			 370
 #define SSL_R_WRONG_SSL_VERSION				 266
 #define SSL_R_WRONG_VERSION_NUMBER			 267
 #define SSL_R_X509_LIB					 268

diff --git a/include/openssl/ssl2.h b/include/openssl/ssl2.h
index 99a52ea..eb25dcb 100644
--- a/include/openssl/ssl2.h
+++ b/include/openssl/ssl2.h

@@ -155,6 +155,8 @@
 #define  CERT		char
 #endif
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl2_state_st
 	{
 	int three_byte_header;
@@ -219,6 +221,8 @@
 		} tmp;
 	} SSL2_STATE;
 
+#endif
+
 /* SSLv2 */
 /* client */
 #define SSL2_ST_SEND_CLIENT_HELLO_A		(0x10|SSL_ST_CONNECT)

diff --git a/include/openssl/ssl3.h b/include/openssl/ssl3.h
index d6425e5..fb08e72 100644
--- a/include/openssl/ssl3.h
+++ b/include/openssl/ssl3.h

@@ -332,6 +332,7 @@
 #define SSL3_RT_ALERT			21
 #define SSL3_RT_HANDSHAKE		22
 #define SSL3_RT_APPLICATION_DATA	23
+#define TLS1_RT_HEARTBEAT		24
 
 #define SSL3_AL_WARNING			1
 #define SSL3_AL_FATAL			2
@@ -349,6 +350,11 @@
 #define SSL3_AD_CERTIFICATE_UNKNOWN	46
 #define SSL3_AD_ILLEGAL_PARAMETER	47	/* fatal */
 
+#define TLS1_HB_REQUEST		1
+#define TLS1_HB_RESPONSE	2
+	
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl3_record_st
 	{
 /*r */	int type;               /* type of record */
@@ -370,6 +376,8 @@
 	int left;               /* how many bytes left */
 	} SSL3_BUFFER;
 
+#endif
+
 #define SSL3_CT_RSA_SIGN			1
 #define SSL3_CT_DSS_SIGN			2
 #define SSL3_CT_RSA_FIXED_DH			3
@@ -389,6 +397,7 @@
 #define SSL3_FLAGS_POP_BUFFER			0x0004
 #define TLS1_FLAGS_TLS_PADDING_BUG		0x0008
 #define TLS1_FLAGS_SKIP_CERT_VERIFY		0x0010
+#define TLS1_FLAGS_KEEP_HANDSHAKE		0x0020
  
 /* SSL3_FLAGS_SGC_RESTART_DONE is set when we
  * restart a handshake because of MS SGC and so prevents us
@@ -401,6 +410,8 @@
  */
 #define SSL3_FLAGS_SGC_RESTART_DONE		0x0040
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl3_state_st
 	{
 	long flags;
@@ -476,12 +487,6 @@
 	void *server_opaque_prf_input;
 	size_t server_opaque_prf_input_len;
 
-#ifndef OPENSSL_NO_NEXTPROTONEG
-	/* Set if we saw the Next Protocol Negotiation extension from
-	   our peer. */
-	int next_proto_neg_seen;
-#endif
-
 	struct	{
 		/* actually only needs to be 16+20 */
 		unsigned char cert_verify_md[EVP_MAX_MD_SIZE*2];
@@ -491,7 +496,7 @@
 		int finish_md_len;
 		unsigned char peer_finish_md[EVP_MAX_MD_SIZE*2];
 		int peer_finish_md_len;
-		
+
 		unsigned long message_size;
 		int message_type;
 
@@ -539,14 +544,24 @@
         unsigned char previous_server_finished[EVP_MAX_MD_SIZE];
         unsigned char previous_server_finished_len;
         int send_connection_binding; /* TODOEKR */
+
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	/* Set if we saw the Next Protocol Negotiation extension from our peer. */
+	int next_proto_neg_seen;
+#endif
 	} SSL3_STATE;
 
+#endif
 
 /* SSLv3 */
 /*client */
 /* extra state */
 #define SSL3_ST_CW_FLUSH		(0x100|SSL_ST_CONNECT)
 #define SSL3_ST_CUTTHROUGH_COMPLETE	(0x101|SSL_ST_CONNECT)
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_ST_CW_WRITE_SOCK			(0x310|SSL_ST_CONNECT)
+#define DTLS1_SCTP_ST_CR_READ_SOCK			(0x320|SSL_ST_CONNECT)
+#endif	
 /* write to server */
 #define SSL3_ST_CW_CLNT_HELLO_A		(0x110|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CLNT_HELLO_B		(0x111|SSL_ST_CONNECT)
@@ -574,10 +589,8 @@
 #define SSL3_ST_CW_CERT_VRFY_B		(0x191|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CHANGE_A		(0x1A0|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CHANGE_B		(0x1A1|SSL_ST_CONNECT)
-#ifndef OPENSSL_NO_NEXTPROTONEG
 #define SSL3_ST_CW_NEXT_PROTO_A		(0x200|SSL_ST_CONNECT)
 #define SSL3_ST_CW_NEXT_PROTO_B		(0x201|SSL_ST_CONNECT)
-#endif
 #define SSL3_ST_CW_FINISHED_A		(0x1B0|SSL_ST_CONNECT)
 #define SSL3_ST_CW_FINISHED_B		(0x1B1|SSL_ST_CONNECT)
 /* read from server */
@@ -593,6 +606,10 @@
 /* server */
 /* extra state */
 #define SSL3_ST_SW_FLUSH		(0x100|SSL_ST_ACCEPT)
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_ST_SW_WRITE_SOCK			(0x310|SSL_ST_ACCEPT)
+#define DTLS1_SCTP_ST_SR_READ_SOCK			(0x320|SSL_ST_ACCEPT)
+#endif	
 /* read from client */
 /* Do not change the number values, they do matter */
 #define SSL3_ST_SR_CLNT_HELLO_A		(0x110|SSL_ST_ACCEPT)
@@ -623,10 +640,8 @@
 #define SSL3_ST_SR_CERT_VRFY_B		(0x1A1|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_CHANGE_A		(0x1B0|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_CHANGE_B		(0x1B1|SSL_ST_ACCEPT)
-#ifndef OPENSSL_NO_NEXTPROTONEG
 #define SSL3_ST_SR_NEXT_PROTO_A		(0x210|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_NEXT_PROTO_B		(0x211|SSL_ST_ACCEPT)
-#endif
 #define SSL3_ST_SR_FINISHED_A		(0x1C0|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_FINISHED_B		(0x1C1|SSL_ST_ACCEPT)
 /* write to client */
@@ -651,9 +666,7 @@
 #define SSL3_MT_CLIENT_KEY_EXCHANGE		16
 #define SSL3_MT_FINISHED			20
 #define SSL3_MT_CERTIFICATE_STATUS		22
-#ifndef OPENSSL_NO_NEXTPROTONEG
 #define SSL3_MT_NEXT_PROTO			67
-#endif
 #define DTLS1_MT_HELLO_VERIFY_REQUEST    3
 
 

diff --git a/include/openssl/symhacks.h b/include/openssl/symhacks.h
index 3fd4a81..403f592 100644
--- a/include/openssl/symhacks.h
+++ b/include/openssl/symhacks.h

@@ -176,7 +176,6 @@
 #define SSL_CTX_set_default_passwd_cb_userdata  SSL_CTX_set_def_passwd_cb_ud
 #undef SSL_COMP_get_compression_methods
 #define SSL_COMP_get_compression_methods	SSL_COMP_get_compress_methods
-
 #undef ssl_add_clienthello_renegotiate_ext
 #define ssl_add_clienthello_renegotiate_ext	ssl_add_clienthello_reneg_ext
 #undef ssl_add_serverhello_renegotiate_ext
@@ -185,6 +184,26 @@
 #define ssl_parse_clienthello_renegotiate_ext	ssl_parse_clienthello_reneg_ext
 #undef ssl_parse_serverhello_renegotiate_ext
 #define ssl_parse_serverhello_renegotiate_ext	ssl_parse_serverhello_reneg_ext
+#undef SSL_srp_server_param_with_username
+#define SSL_srp_server_param_with_username	SSL_srp_server_param_with_un
+#undef SSL_CTX_set_srp_client_pwd_callback
+#define SSL_CTX_set_srp_client_pwd_callback	SSL_CTX_set_srp_client_pwd_cb
+#undef SSL_CTX_set_srp_verify_param_callback
+#define SSL_CTX_set_srp_verify_param_callback	SSL_CTX_set_srp_vfy_param_cb
+#undef SSL_CTX_set_srp_username_callback
+#define SSL_CTX_set_srp_username_callback	SSL_CTX_set_srp_un_cb
+#undef ssl_add_clienthello_use_srtp_ext
+#define ssl_add_clienthello_use_srtp_ext ssl_add_clihello_use_srtp_ext
+#undef ssl_add_serverhello_use_srtp_ext
+#define ssl_add_serverhello_use_srtp_ext ssl_add_serhello_use_srtp_ext
+#undef ssl_parse_clienthello_use_srtp_ext
+#define ssl_parse_clienthello_use_srtp_ext ssl_parse_clihello_use_srtp_ext
+#undef ssl_parse_serverhello_use_srtp_ext
+#define ssl_parse_serverhello_use_srtp_ext ssl_parse_serhello_use_srtp_ext
+#undef SSL_CTX_set_next_protos_advertised_cb
+#define SSL_CTX_set_next_protos_advertised_cb SSL_CTX_set_next_protos_adv_cb
+#undef SSL_CTX_set_next_proto_select_cb
+#define SSL_CTX_set_next_proto_select_cb SSL_CTX_set_next_proto_sel_cb
 
 /* Hack some long ENGINE names */
 #undef ENGINE_get_default_BN_mod_exp_crt
@@ -238,6 +257,9 @@
 #define EC_GROUP_get_point_conversion_form	EC_GROUP_get_point_conv_form
 #undef EC_GROUP_clear_free_all_extra_data
 #define EC_GROUP_clear_free_all_extra_data	EC_GROUP_clr_free_all_xtra_data
+#undef EC_KEY_set_public_key_affine_coordinates
+#define EC_KEY_set_public_key_affine_coordinates \
+						EC_KEY_set_pub_key_aff_coords
 #undef EC_POINT_set_Jprojective_coordinates_GFp
 #define EC_POINT_set_Jprojective_coordinates_GFp \
                                                 EC_POINT_set_Jproj_coords_GFp
@@ -399,6 +421,12 @@
 #undef dtls1_retransmit_buffered_messages
 #define dtls1_retransmit_buffered_messages	dtls1_retransmit_buffered_msgs
 
+/* Hack some long SRP names */
+#undef SRP_generate_server_master_secret
+#define SRP_generate_server_master_secret	SRP_gen_server_master_secret
+#undef SRP_generate_client_master_secret
+#define SRP_generate_client_master_secret	SRP_gen_client_master_secret
+
 /* Hack some long UI names */
 #undef UI_method_get_prompt_constructor
 #define UI_method_get_prompt_constructor	UI_method_get_prompt_constructr

diff --git a/include/openssl/tls1.h b/include/openssl/tls1.h
index 76f368a..c39c267 100644
--- a/include/openssl/tls1.h
+++ b/include/openssl/tls1.h

@@ -159,10 +159,24 @@
 
 #define TLS1_ALLOW_EXPERIMENTAL_CIPHERSUITES	0
 
+#define TLS1_2_VERSION			0x0303
+#define TLS1_2_VERSION_MAJOR		0x03
+#define TLS1_2_VERSION_MINOR		0x03
+
+#define TLS1_1_VERSION			0x0302
+#define TLS1_1_VERSION_MAJOR		0x03
+#define TLS1_1_VERSION_MINOR		0x02
+
 #define TLS1_VERSION			0x0301
 #define TLS1_VERSION_MAJOR		0x03
 #define TLS1_VERSION_MINOR		0x01
 
+#define TLS1_get_version(s) \
+		((s->version >> 8) == TLS1_VERSION_MAJOR ? s->version : 0)
+
+#define TLS1_get_client_version(s) \
+		((s->client_version >> 8) == TLS1_VERSION_MAJOR ? s->client_version : 0)
+
 #define TLS1_AD_DECRYPTION_FAILED	21
 #define TLS1_AD_RECORD_OVERFLOW		22
 #define TLS1_AD_UNKNOWN_CA		48	/* fatal */
@@ -183,17 +197,42 @@
 #define TLS1_AD_BAD_CERTIFICATE_HASH_VALUE 114
 #define TLS1_AD_UNKNOWN_PSK_IDENTITY	115	/* fatal */
 
-/* ExtensionType values from RFC3546 / RFC4366 */
+/* ExtensionType values from RFC3546 / RFC4366 / RFC6066 */
 #define TLSEXT_TYPE_server_name			0
 #define TLSEXT_TYPE_max_fragment_length		1
 #define TLSEXT_TYPE_client_certificate_url	2
 #define TLSEXT_TYPE_trusted_ca_keys		3
 #define TLSEXT_TYPE_truncated_hmac		4
 #define TLSEXT_TYPE_status_request		5
+/* ExtensionType values from RFC4681 */
+#define TLSEXT_TYPE_user_mapping		6
+
+/* ExtensionType values from RFC5878 */
+#define TLSEXT_TYPE_client_authz		7
+#define TLSEXT_TYPE_server_authz		8
+
+/* ExtensionType values from RFC6091 */
+#define TLSEXT_TYPE_cert_type		9
+
 /* ExtensionType values from RFC4492 */
 #define TLSEXT_TYPE_elliptic_curves		10
 #define TLSEXT_TYPE_ec_point_formats		11
+
+/* ExtensionType value from RFC5054 */
+#define TLSEXT_TYPE_srp				12
+
+/* ExtensionType values from RFC5246 */
+#define TLSEXT_TYPE_signature_algorithms	13
+
+/* ExtensionType value from RFC5764 */
+#define TLSEXT_TYPE_use_srtp	14
+
+/* ExtensionType value from RFC5620 */
+#define TLSEXT_TYPE_heartbeat	15
+
+/* ExtensionType value from RFC4507 */
 #define TLSEXT_TYPE_session_ticket		35
+
 /* ExtensionType value from draft-rescorla-tls-opaque-prf-input-00.txt */
 #if 0 /* will have to be provided externally for now ,
        * i.e. build with -DTLSEXT_TYPE_opaque_prf_input=38183
@@ -221,12 +260,37 @@
 #define TLSEXT_ECPOINTFORMAT_ansiX962_compressed_char2	2
 #define TLSEXT_ECPOINTFORMAT_last			2
 
+/* Signature and hash algorithms from RFC 5246 */
+
+#define TLSEXT_signature_anonymous			0
+#define TLSEXT_signature_rsa				1
+#define TLSEXT_signature_dsa				2
+#define TLSEXT_signature_ecdsa				3
+
+#define TLSEXT_hash_none				0
+#define TLSEXT_hash_md5					1
+#define TLSEXT_hash_sha1				2
+#define TLSEXT_hash_sha224				3
+#define TLSEXT_hash_sha256				4
+#define TLSEXT_hash_sha384				5
+#define TLSEXT_hash_sha512				6
+
 #ifndef OPENSSL_NO_TLSEXT
 
 #define TLSEXT_MAXLEN_host_name 255
 
-const char *SSL_get_servername(const SSL *s, const int type) ;
-int SSL_get_servername_type(const SSL *s) ;
+const char *SSL_get_servername(const SSL *s, const int type);
+int SSL_get_servername_type(const SSL *s);
+/* SSL_export_keying_material exports a value derived from the master secret,
+ * as specified in RFC 5705. It writes |olen| bytes to |out| given a label and
+ * optional context. (Since a zero length context is allowed, the |use_context|
+ * flag controls whether a context is included.)
+ *
+ * It returns 1 on success and zero otherwise.
+ */
+int SSL_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	const char *label, size_t llen, const unsigned char *p, size_t plen,
+	int use_context);
 
 #define SSL_set_tlsext_host_name(s,name) \
 SSL_ctrl(s,SSL_CTRL_SET_TLSEXT_HOSTNAME,TLSEXT_NAMETYPE_host_name,(char *)name)
@@ -290,6 +354,16 @@
 #define SSL_CTX_set_tlsext_ticket_key_cb(ssl, cb) \
 SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_TLSEXT_HB_ENABLED				0x01
+#define SSL_TLSEXT_HB_DONT_SEND_REQUESTS	0x02
+#define SSL_TLSEXT_HB_DONT_RECV_REQUESTS	0x04
+
+#define SSL_get_tlsext_heartbeat_pending(ssl) \
+        SSL_ctrl((ssl),SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING,0,NULL)
+#define SSL_set_tlsext_heartbeat_no_requests(ssl, arg) \
+        SSL_ctrl((ssl),SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS,arg,NULL)
+#endif
 #endif
 
 /* PSK ciphersuites from 4279 */
@@ -327,6 +401,14 @@
 #define TLS1_CK_DHE_RSA_WITH_AES_256_SHA		0x03000039
 #define TLS1_CK_ADH_WITH_AES_256_SHA			0x0300003A
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_CK_RSA_WITH_NULL_SHA256			0x0300003B
+#define TLS1_CK_RSA_WITH_AES_128_SHA256			0x0300003C
+#define TLS1_CK_RSA_WITH_AES_256_SHA256			0x0300003D
+#define TLS1_CK_DH_DSS_WITH_AES_128_SHA256		0x0300003E
+#define TLS1_CK_DH_RSA_WITH_AES_128_SHA256		0x0300003F
+#define TLS1_CK_DHE_DSS_WITH_AES_128_SHA256		0x03000040
+
 /* Camellia ciphersuites from RFC4132 */
 #define TLS1_CK_RSA_WITH_CAMELLIA_128_CBC_SHA		0x03000041
 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_128_CBC_SHA	0x03000042
@@ -335,6 +417,16 @@
 #define TLS1_CK_DHE_RSA_WITH_CAMELLIA_128_CBC_SHA	0x03000045
 #define TLS1_CK_ADH_WITH_CAMELLIA_128_CBC_SHA		0x03000046
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_CK_DHE_RSA_WITH_AES_128_SHA256		0x03000067
+#define TLS1_CK_DH_DSS_WITH_AES_256_SHA256		0x03000068
+#define TLS1_CK_DH_RSA_WITH_AES_256_SHA256		0x03000069
+#define TLS1_CK_DHE_DSS_WITH_AES_256_SHA256		0x0300006A
+#define TLS1_CK_DHE_RSA_WITH_AES_256_SHA256		0x0300006B
+#define TLS1_CK_ADH_WITH_AES_128_SHA256			0x0300006C
+#define TLS1_CK_ADH_WITH_AES_256_SHA256			0x0300006D
+
+/* Camellia ciphersuites from RFC4132 */
 #define TLS1_CK_RSA_WITH_CAMELLIA_256_CBC_SHA		0x03000084
 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_256_CBC_SHA	0x03000085
 #define TLS1_CK_DH_RSA_WITH_CAMELLIA_256_CBC_SHA	0x03000086
@@ -350,6 +442,20 @@
 #define TLS1_CK_DHE_RSA_WITH_SEED_SHA                   0x0300009A
 #define TLS1_CK_ADH_WITH_SEED_SHA                	0x0300009B
 
+/* TLS v1.2 GCM ciphersuites from RFC5288 */
+#define TLS1_CK_RSA_WITH_AES_128_GCM_SHA256		0x0300009C
+#define TLS1_CK_RSA_WITH_AES_256_GCM_SHA384		0x0300009D
+#define TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256		0x0300009E
+#define TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384		0x0300009F
+#define TLS1_CK_DH_RSA_WITH_AES_128_GCM_SHA256		0x030000A0
+#define TLS1_CK_DH_RSA_WITH_AES_256_GCM_SHA384		0x030000A1
+#define TLS1_CK_DHE_DSS_WITH_AES_128_GCM_SHA256		0x030000A2
+#define TLS1_CK_DHE_DSS_WITH_AES_256_GCM_SHA384		0x030000A3
+#define TLS1_CK_DH_DSS_WITH_AES_128_GCM_SHA256		0x030000A4
+#define TLS1_CK_DH_DSS_WITH_AES_256_GCM_SHA384		0x030000A5
+#define TLS1_CK_ADH_WITH_AES_128_GCM_SHA256		0x030000A6
+#define TLS1_CK_ADH_WITH_AES_256_GCM_SHA384		0x030000A7
+
 /* ECC ciphersuites from draft-ietf-tls-ecc-12.txt with changes soon to be in draft 13 */
 #define TLS1_CK_ECDH_ECDSA_WITH_NULL_SHA                0x0300C001
 #define TLS1_CK_ECDH_ECDSA_WITH_RC4_128_SHA             0x0300C002
@@ -381,6 +487,38 @@
 #define TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA          0x0300C018
 #define TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA          0x0300C019
 
+/* SRP ciphersuites from RFC 5054 */
+#define TLS1_CK_SRP_SHA_WITH_3DES_EDE_CBC_SHA		0x0300C01A
+#define TLS1_CK_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA	0x0300C01B
+#define TLS1_CK_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA	0x0300C01C
+#define TLS1_CK_SRP_SHA_WITH_AES_128_CBC_SHA		0x0300C01D
+#define TLS1_CK_SRP_SHA_RSA_WITH_AES_128_CBC_SHA	0x0300C01E
+#define TLS1_CK_SRP_SHA_DSS_WITH_AES_128_CBC_SHA	0x0300C01F
+#define TLS1_CK_SRP_SHA_WITH_AES_256_CBC_SHA		0x0300C020
+#define TLS1_CK_SRP_SHA_RSA_WITH_AES_256_CBC_SHA	0x0300C021
+#define TLS1_CK_SRP_SHA_DSS_WITH_AES_256_CBC_SHA	0x0300C022
+
+/* ECDH HMAC based ciphersuites from RFC5289 */
+
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_SHA256         0x0300C023
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_SHA384         0x0300C024
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_SHA256          0x0300C025
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_SHA384          0x0300C026
+#define TLS1_CK_ECDHE_RSA_WITH_AES_128_SHA256           0x0300C027
+#define TLS1_CK_ECDHE_RSA_WITH_AES_256_SHA384           0x0300C028
+#define TLS1_CK_ECDH_RSA_WITH_AES_128_SHA256            0x0300C029
+#define TLS1_CK_ECDH_RSA_WITH_AES_256_SHA384            0x0300C02A
+
+/* ECDH GCM based ciphersuites from RFC5289 */
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256	0x0300C02B
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384	0x0300C02C
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_GCM_SHA256      0x0300C02D
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_GCM_SHA384      0x0300C02E
+#define TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256       0x0300C02F
+#define TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384       0x0300C030
+#define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256        0x0300C031
+#define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384        0x0300C032
+
 /* XXX
  * Inconsistency alert:
  * The OpenSSL names of ciphers with ephemeral DH here include the string
@@ -448,6 +586,17 @@
 #define TLS1_TXT_PSK_WITH_AES_128_CBC_SHA		"PSK-AES128-CBC-SHA"
 #define TLS1_TXT_PSK_WITH_AES_256_CBC_SHA		"PSK-AES256-CBC-SHA"
 
+/* SRP ciphersuite from RFC 5054 */
+#define TLS1_TXT_SRP_SHA_WITH_3DES_EDE_CBC_SHA		"SRP-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA	"SRP-RSA-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA	"SRP-DSS-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_WITH_AES_128_CBC_SHA		"SRP-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_128_CBC_SHA	"SRP-RSA-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_128_CBC_SHA	"SRP-DSS-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_WITH_AES_256_CBC_SHA		"SRP-AES-256-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_256_CBC_SHA	"SRP-RSA-AES-256-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_256_CBC_SHA	"SRP-DSS-AES-256-CBC-SHA"
+
 /* Camellia ciphersuites from RFC4132 */
 #define TLS1_TXT_RSA_WITH_CAMELLIA_128_CBC_SHA		"CAMELLIA128-SHA"
 #define TLS1_TXT_DH_DSS_WITH_CAMELLIA_128_CBC_SHA	"DH-DSS-CAMELLIA128-SHA"
@@ -471,6 +620,55 @@
 #define TLS1_TXT_DHE_RSA_WITH_SEED_SHA                  "DHE-RSA-SEED-SHA"
 #define TLS1_TXT_ADH_WITH_SEED_SHA                      "ADH-SEED-SHA"
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_TXT_RSA_WITH_NULL_SHA256			"NULL-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_128_SHA256		"AES128-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_256_SHA256		"AES256-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_128_SHA256		"DH-DSS-AES128-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_128_SHA256		"DH-RSA-AES128-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_128_SHA256		"DHE-DSS-AES128-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_128_SHA256		"DHE-RSA-AES128-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_256_SHA256		"DH-DSS-AES256-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_256_SHA256		"DH-RSA-AES256-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_256_SHA256		"DHE-DSS-AES256-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_256_SHA256		"DHE-RSA-AES256-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_128_SHA256		"ADH-AES128-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_256_SHA256		"ADH-AES256-SHA256"
+
+/* TLS v1.2 GCM ciphersuites from RFC5288 */
+#define TLS1_TXT_RSA_WITH_AES_128_GCM_SHA256		"AES128-GCM-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_256_GCM_SHA384		"AES256-GCM-SHA384"
+#define TLS1_TXT_DHE_RSA_WITH_AES_128_GCM_SHA256	"DHE-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_256_GCM_SHA384	"DHE-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_DH_RSA_WITH_AES_128_GCM_SHA256		"DH-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_256_GCM_SHA384		"DH-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_DHE_DSS_WITH_AES_128_GCM_SHA256	"DHE-DSS-AES128-GCM-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_256_GCM_SHA384	"DHE-DSS-AES256-GCM-SHA384"
+#define TLS1_TXT_DH_DSS_WITH_AES_128_GCM_SHA256		"DH-DSS-AES128-GCM-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_256_GCM_SHA384		"DH-DSS-AES256-GCM-SHA384"
+#define TLS1_TXT_ADH_WITH_AES_128_GCM_SHA256		"ADH-AES128-GCM-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_256_GCM_SHA384		"ADH-AES256-GCM-SHA384"
+
+/* ECDH HMAC based ciphersuites from RFC5289 */
+
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_SHA256    "ECDHE-ECDSA-AES128-SHA256"
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_SHA384    "ECDHE-ECDSA-AES256-SHA384"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_SHA256     "ECDH-ECDSA-AES128-SHA256"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_SHA384     "ECDH-ECDSA-AES256-SHA384"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_SHA256      "ECDHE-RSA-AES128-SHA256"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_SHA384      "ECDHE-RSA-AES256-SHA384"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_128_SHA256       "ECDH-RSA-AES128-SHA256"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_256_SHA384       "ECDH-RSA-AES256-SHA384"
+
+/* ECDH GCM based ciphersuites from RFC5289 */
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256    "ECDHE-ECDSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384    "ECDHE-ECDSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_GCM_SHA256     "ECDH-ECDSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_GCM_SHA384     "ECDH-ECDSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_GCM_SHA256      "ECDHE-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_GCM_SHA384      "ECDHE-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256       "ECDH-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384       "ECDH-RSA-AES256-GCM-SHA384"
 
 #define TLS_CT_RSA_SIGN			1
 #define TLS_CT_DSS_SIGN			2

diff --git a/include/openssl/ts.h b/include/openssl/ts.h
index 190e8a1..c2448e3 100644
--- a/include/openssl/ts.h
+++ b/include/openssl/ts.h

@@ -86,9 +86,6 @@
 #include <openssl/dh.h>
 #endif
 
-#include <openssl/evp.h>
-
-
 #ifdef  __cplusplus
 extern "C" {
 #endif

diff --git a/include/openssl/ui.h b/include/openssl/ui.h
index 2b1cfa2..bd78aa4 100644
--- a/include/openssl/ui.h
+++ b/include/openssl/ui.h

@@ -316,7 +316,7 @@
 int (*UI_method_get_flusher(UI_METHOD *method))(UI*);
 int (*UI_method_get_reader(UI_METHOD *method))(UI*,UI_STRING*);
 int (*UI_method_get_closer(UI_METHOD *method))(UI*);
-char* (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
+char * (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
 
 /* The following functions are helpers for method writers to access relevant
    data from a UI_STRING. */

diff --git a/include/openssl/x509.h b/include/openssl/x509.h
index e6f8a40..092dd74 100644
--- a/include/openssl/x509.h
+++ b/include/openssl/x509.h

@@ -657,11 +657,15 @@
 
 int NETSCAPE_SPKI_print(BIO *out, NETSCAPE_SPKI *spki);
 
+int X509_signature_dump(BIO *bp,const ASN1_STRING *sig, int indent);
 int X509_signature_print(BIO *bp,X509_ALGOR *alg, ASN1_STRING *sig);
 
 int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx);
 int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx);
 int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx);
 int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md);
 
 int X509_pubkey_digest(const X509 *data,const EVP_MD *type,
@@ -763,6 +767,7 @@
 int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype, void *pval);
 void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
 						X509_ALGOR *algor);
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md);
 
 X509_NAME *X509_NAME_dup(X509_NAME *xn);
 X509_NAME_ENTRY *X509_NAME_ENTRY_dup(X509_NAME_ENTRY *ne);
@@ -896,6 +901,9 @@
 int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
 	ASN1_BIT_STRING *signature,
 	void *data, EVP_PKEY *pkey, const EVP_MD *type);
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+		X509_ALGOR *algor1, X509_ALGOR *algor2,
+	     	ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx);
 #endif
 
 int 		X509_set_version(X509 *x,long version);
@@ -1161,6 +1169,9 @@
 				 unsigned char *salt, int saltlen,
 				 unsigned char *aiv, int prf_nid);
 
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+				int prf_nid, int keylen);
+
 /* PKCS#8 utilities */
 
 DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO)

diff --git a/openssl.config b/openssl.config
index 9b533ca..7109f03 100644
--- a/openssl.config
+++ b/openssl.config

@@ -10,6 +10,7 @@
 Netware \
 VMS \
 apps/demoCA \
+apps/demoSRP \
 apps/set \
 bugs \
 certs \
@@ -71,6 +72,7 @@
 crypto/LPdir_win32.c \
 crypto/Makefile \
 crypto/aes/Makefile \
+crypto/armcap.c \
 crypto/asn1/Makefile \
 crypto/bf/INSTALL \
 crypto/bf/Makefile \
@@ -86,8 +88,10 @@
 crypto/bio/bss_rtcp.c \
 crypto/bn/Makefile \
 crypto/bn/asm/vms.mar \
+crypto/bn/bn_x931p.c \
 crypto/bn/vms-helper.c \
 crypto/buffer/Makefile \
+crypto/cmac/Makefile \
 crypto/comp/Makefile \
 crypto/conf/Makefile \
 crypto/crypto-lib.com \
@@ -101,13 +105,22 @@
 crypto/dso/dso_vms.c \
 crypto/dso/dso_win32.c \
 crypto/ec/Makefile \
+crypto/ec/ecp_nistp224.c \
+crypto/ec/ecp_nistp256.c \
+crypto/ec/ecp_nistp521.c \
+crypto/ec/ecp_nistputil.c \
 crypto/ecdh/Makefile \
 crypto/ecdsa/Makefile \
 crypto/engine/Makefile \
+crypto/engine/eng_rdrand.c \
+crypto/engine/eng_rsax.c \
 crypto/err/Makefile \
 crypto/evp/Makefile \
+crypto/evp/evp_fips.c \
 crypto/evp/m_md2.c \
 crypto/evp/m_sha.c \
+crypto/fips_err.h \
+crypto/fips_ers.c \
 crypto/hmac/Makefile \
 crypto/install-crypto.com \
 crypto/jpake/Makefile \
@@ -119,12 +132,14 @@
 crypto/modes/Makefile \
 crypto/modes/cts128.c \
 crypto/modes/modes.h \
+crypto/o_fips.c \
 crypto/objects/Makefile \
 crypto/ocsp/Makefile \
 crypto/pem/Makefile \
 crypto/pkcs12/Makefile \
 crypto/pkcs7/Makefile \
 crypto/pkcs7/bio_pk7.c \
+crypto/ppccap.c \
 crypto/pqueue/Makefile \
 crypto/rand/Makefile \
 crypto/rand/rand_vms.c \
@@ -134,6 +149,8 @@
 crypto/rsa/Makefile \
 crypto/sha/Makefile \
 crypto/sha/sha_one.c \
+crypto/srp/Makefile \
+crypto/srp/srptest.c \
 crypto/stack/Makefile \
 crypto/store/Makefile \
 crypto/threads/pthreads-vms.com \
@@ -185,9 +202,7 @@
 small_records.patch \
 handshake_cutthrough.patch \
 jsse.patch \
-npn.patch \
 sha1_armv4_large.patch \
-mips_asm.patch \
 "
 
 OPENSSL_PATCHES_progs_SOURCES="\
@@ -233,25 +248,3 @@
 ssl/ssl_rsa.c \
 ssl/ssl_sess.c \
 "
-
-OPENSSL_PATCHES_npn_SOURCES="\
-apps/apps.c \
-apps/apps.h \
-apps/s_client.c \
-apps/s_server.c \
-include/openssl/ssl.h \
-include/openssl/ssl3.h \
-include/openssl/tls1.h \
-ssl/s3_both.c \
-ssl/s3_clnt.c \
-ssl/s3_lib.c \
-ssl/s3_pkt.c \
-ssl/s3_srvr.c \
-ssl/ssl.h \
-ssl/ssl3.h \
-ssl/ssl_err.c \
-ssl/ssl_lib.c \
-ssl/ssl_locl.h \
-ssl/t1_lib.c \
-ssl/tls1.h \
-"

diff --git a/openssl.version b/openssl.version
index a535da9..c7b738e 100644
--- a/openssl.version
+++ b/openssl.version

@@ -1,2 +1,2 @@
 # also update ThirdPartyProject.prop
-OPENSSL_VERSION=1.0.0h
+OPENSSL_VERSION=1.0.1

diff --git a/patches/README b/patches/README
index f70135c..f7d886d 100644
--- a/patches/README
+++ b/patches/README

@@ -33,7 +33,3 @@
 sha1_armv4_large.patch
 
 This patch eliminates memory stores to addresses below SP.
-
-mips_asm.patch
-
-MIPS assembly routines (AES, BN, SHA1, SHA256)

diff --git a/patches/apps_Android.mk b/patches/apps_Android.mk
index c2dc2d7..9110490 100644
--- a/patches/apps_Android.mk
+++ b/patches/apps_Android.mk

@@ -48,6 +48,7 @@
 	smime.c \
 	speed.c \
 	spkac.c \
+	srp.c \
 	verify.c \
 	version.c \
 	x509.c

diff --git a/patches/crypto_Android.mk b/patches/crypto_Android.mk
index 8090c12..fb599ce 100644
--- a/patches/crypto_Android.mk
+++ b/patches/crypto_Android.mk

@@ -169,7 +169,11 @@
 	bn/bn_sqrt.c \
 	bn/bn_word.c \
 	buffer/buf_err.c \
+	buffer/buf_str.c \
 	buffer/buffer.c \
+	cmac/cm_ameth.c \
+	cmac/cm_pmeth.c \
+	cmac/cmac.c \
 	comp/c_rle.c \
 	comp/c_zlib.c \
 	comp/comp_err.c \
@@ -235,6 +239,7 @@
 	dso/dso_null.c \
 	dso/dso_openssl.c \
 	ec/ec2_mult.c \
+	ec/ec2_oct.c \
 	ec/ec2_smpl.c \
 	ec/ec_ameth.c \
 	ec/ec_asn1.c \
@@ -245,11 +250,13 @@
 	ec/ec_key.c \
 	ec/ec_lib.c \
 	ec/ec_mult.c \
+	ec/ec_oct.c \
 	ec/ec_pmeth.c \
 	ec/ec_print.c \
 	ec/eck_prn.c \
 	ec/ecp_mont.c \
 	ec/ecp_nist.c \
+	ec/ecp_oct.c \
 	ec/ecp_smpl.c \
 	ecdh/ech_err.c \
 	ecdh/ech_key.c \
@@ -295,6 +302,7 @@
 	evp/c_alld.c \
 	evp/digest.c \
 	evp/e_aes.c \
+	evp/e_aes_cbc_hmac_sha1.c \
 	evp/e_bf.c \
 	evp/e_des.c \
 	evp/e_des3.c \
@@ -302,6 +310,7 @@
 	evp/e_old.c \
 	evp/e_rc2.c \
 	evp/e_rc4.c \
+	evp/e_rc4_hmac_md5.c \
 	evp/e_rc5.c \
 	evp/e_xcbc_d.c \
 	evp/encode.c \
@@ -347,9 +356,13 @@
 	md5/md5_dgst.c \
 	md5/md5_one.c \
 	modes/cbc128.c \
+	modes/ccm128.c \
 	modes/cfb128.c \
 	modes/ctr128.c \
+	modes/gcm128.c \
 	modes/ofb128.c \
+	modes/xts128.c \
+	o_init.c \
 	objects/o_names.c \
 	objects/obj_dat.c \
 	objects/obj_err.c \
@@ -398,6 +411,7 @@
 	pkcs7/pk7_mime.c \
 	pkcs7/pk7_smime.c \
 	pkcs7/pkcs7err.c \
+	pqueue/pqueue.c \
 	rand/md_rand.c \
 	rand/rand_egd.c \
 	rand/rand_err.c \
@@ -411,11 +425,13 @@
 	rc2/rc2ofb64.c \
 	rc4/rc4_enc.c \
 	rc4/rc4_skey.c \
+	rc4/rc4_utl.c \
 	ripemd/rmd_dgst.c \
 	ripemd/rmd_one.c \
 	rsa/rsa_ameth.c \
 	rsa/rsa_asn1.c \
 	rsa/rsa_chk.c \
+	rsa/rsa_crpt.c \
 	rsa/rsa_eay.c \
 	rsa/rsa_err.c \
 	rsa/rsa_gen.c \
@@ -436,6 +452,8 @@
 	sha/sha256.c \
 	sha/sha512.c \
 	sha/sha_dgst.c \
+	srp/srp_lib.c \
+	srp/srp_vfy.c \
 	stack/stack.c \
 	ts/ts_err.c \
 	txt_db/txt_db.c \
@@ -507,12 +525,15 @@
 	external/openssl \
 	external/openssl/crypto/asn1 \
 	external/openssl/crypto/evp \
+	external/openssl/crypto/modes \
 	external/openssl/include \
 	external/openssl/include/openssl \
 	external/zlib
 
 local_c_flags := -DNO_WINDOWS_BRAINDEATH
 
+local_as_flags := -x assembler-with-cpp
+
 #######################################
 # target static library
 include $(CLEAR_VARS)
@@ -525,6 +546,7 @@
 
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags)
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 ifeq ($(TARGET_ARCH),arm)
 	LOCAL_SRC_FILES += $(arm_src_files)
@@ -561,6 +583,7 @@
 
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags)
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 ifeq ($(TARGET_ARCH),arm)
 	LOCAL_SRC_FILES += $(arm_src_files)
@@ -587,6 +610,7 @@
 include $(LOCAL_PATH)/../android-config.mk
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 LOCAL_SRC_FILES += $(other_arch_src_files)
 LOCAL_STATIC_LIBRARIES += libz
@@ -602,6 +626,7 @@
 include $(LOCAL_PATH)/../android-config.mk
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
+LOCAL_ASFLAGS += $(local_as_flags)
 LOCAL_C_INCLUDES += $(local_c_includes)
 LOCAL_SRC_FILES += $(other_arch_src_files)
 LOCAL_STATIC_LIBRARIES += libz

diff --git a/patches/handshake_cutthrough.patch b/patches/handshake_cutthrough.patch
index 4f29839..57c4c78 100644
--- a/patches/handshake_cutthrough.patch
+++ b/patches/handshake_cutthrough.patch

@@ -6,9 +6,9 @@
  	BIO_printf(bio_err," -status           - request certificate status from server\n");
  	BIO_printf(bio_err," -no_ticket        - disable use of RFC4507bis session tickets\n");
 +	BIO_printf(bio_err," -cutthrough       - enable 1-RTT full-handshake for strong ciphers\n");
- #endif
- 	}
- 
+ # if !defined(OPENSSL_NO_NEXTPROTONEG)
+ 	BIO_printf(bio_err," -nextprotoneg arg - enable NPN extension, considering named protocols supported (comma-separated list)\n");
+ # endif
 @@ -304,6 +305,7 @@ int MAIN(int argc, char **argv)
  	EVP_PKEY *key = NULL;
  	char *CApath=NULL,*CAfile=NULL,*cipher=NULL;
@@ -191,9 +191,9 @@
  /* extra state */
  #define SSL3_ST_CW_FLUSH		(0x100|SSL_ST_CONNECT)
 +#define SSL3_ST_CUTTHROUGH_COMPLETE	(0x101|SSL_ST_CONNECT)
- /* write to server */
- #define SSL3_ST_CW_CLNT_HELLO_A		(0x110|SSL_ST_CONNECT)
- #define SSL3_ST_CW_CLNT_HELLO_B		(0x111|SSL_ST_CONNECT)
+ #ifndef OPENSSL_NO_SCTP
+ #define DTLS1_SCTP_ST_CW_WRITE_SOCK			(0x310|SSL_ST_CONNECT)
+ #define DTLS1_SCTP_ST_CR_READ_SOCK			(0x320|SSL_ST_CONNECT)
 diff -uarp openssl-1.0.0.orig/ssl/ssl_lib.c openssl-1.0.0/ssl/ssl_lib.c
 --- openssl-1.0.0.orig/ssl/ssl_lib.c	2010-02-17 14:43:46.000000000 -0500
 +++ openssl-1.0.0/ssl/ssl_lib.c	2010-04-21 17:02:45.000000000 -0400

diff --git a/patches/jsse.patch b/patches/jsse.patch
index 249fb5b..80e5357 100644
--- a/patches/jsse.patch
+++ b/patches/jsse.patch

@@ -10,14 +10,6 @@
  	/* Default generate session ID callback. */
  	GEN_SESSION_CB generate_session_id;
  
-@@ -1546,6 +1549,7 @@ const SSL_CIPHER *SSL_get_current_cipher
- int	SSL_CIPHER_get_bits(const SSL_CIPHER *c,int *alg_bits);
- char *	SSL_CIPHER_get_version(const SSL_CIPHER *c);
- const char *	SSL_CIPHER_get_name(const SSL_CIPHER *c);
-+const char *	SSL_CIPHER_authentication_method(const SSL_CIPHER *c);
- 
- int	SSL_get_fd(const SSL *s);
- int	SSL_get_rfd(const SSL *s);
 @@ -1554,6 +1558,7 @@ const char  * SSL_get_cipher_list(const 
  char *	SSL_get_shared_ciphers(const SSL *s, char *buf, int len);
  int	SSL_get_read_ahead(const SSL * s);
@@ -48,9 +40,9 @@
  const unsigned char *SSL_SESSION_get_id(const SSL_SESSION *s,
  					unsigned int *len);
 +const char *	SSL_SESSION_get_version(const SSL_SESSION *s);
+ unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s);
  #ifndef OPENSSL_NO_FP_API
  int	SSL_SESSION_print_fp(FILE *fp,const SSL_SESSION *ses);
- #endif
 @@ -1624,6 +1633,7 @@ int	SSL_SESSION_print(BIO *fp,const SSL_
  void	SSL_SESSION_free(SSL_SESSION *ses);
  int	i2d_SSL_SESSION(SSL_SESSION *in,unsigned char **pp);
@@ -296,13 +288,19 @@
  /* works well for SSLv2, not so good for SSLv3 */
  char *SSL_get_shared_ciphers(const SSL *s,char *buf,int len)
  	{
-@@ -2551,18 +2578,45 @@ SSL_METHOD *ssl_bad_method(int ver)
+@@ -2551,22 +2578,45 @@ SSL_METHOD *ssl_bad_method(int ver)
  	return(NULL);
  	}
  
 -const char *SSL_get_version(const SSL *s)
 +static const char *ssl_get_version(int version)
  	{
+-	if (s->version == TLS1_2_VERSION)
++	if (version == TLS1_2_VERSION)
+ 		return("TLSv1.2");
+-	else if (s->version == TLS1_1_VERSION)
++	else if (version == TLS1_1_VERSION)
+ 		return("TLSv1.1");
 -	if (s->version == TLS1_VERSION)
 +	if (version == TLS1_VERSION)
  		return("TLSv1");
@@ -334,12 +332,8 @@
 +		{
 +	case SSL2_VERSION:
 +		return SSL_TXT_RSA;
-+	case SSL3_VERSION:
-+	case TLS1_VERSION:
-+	case DTLS1_VERSION:
-+		return SSL_CIPHER_authentication_method(ssl->s3->tmp.new_cipher);
 +	default:
-+		return "UNKNOWN";
++		return SSL_CIPHER_authentication_method(ssl->s3->tmp.new_cipher);
 +		}
 +	}
 +

diff --git a/patches/mips_asm.patch b/patches/mips_asm.patch
deleted file mode 100644
index 68a80f1..0000000
--- a/patches/mips_asm.patch
+++ /dev/null

@@ -1,5461 +0,0 @@
-diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl
-new file mode 100644
-index 0000000..2ce6def
---- /dev/null
-+++ b/crypto/aes/asm/aes-mips.pl
-@@ -0,0 +1,1611 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <[email protected]> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+# ====================================================================
-+
-+# AES for MIPS
-+
-+# October 2010
-+#
-+# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
-+# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
-+# faster than gcc-generated code, which is not very impressive. But
-+# recall that compressed S-box requires extra processing, namely
-+# additional rotations. Rotations are implemented with lwl/lwr pairs,
-+# which is normally used for loading unaligned data. Another cool
-+# thing about this module is its endian neutrality, which means that
-+# it processes data without ever changing byte order...
-+
-+######################################################################
-+# There is a number of MIPS ABI in use, O32 and N32/64 are most
-+# widely used. Then there is a new contender: NUBI. It appears that if
-+# one picks the latter, it's possible to arrange code in ABI neutral
-+# manner. Therefore let's stick to NUBI register layout:
-+#
-+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
-+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
-+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
-+#
-+# The return value is placed in $a0. Following coding rules facilitate
-+# interoperability:
-+#
-+# - never ever touch $tp, "thread pointer", former $gp;
-+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
-+#   old code];
-+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
-+#
-+# For reference here is register layout for N32/64 MIPS ABIs:
-+#
-+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-+#
-+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
-+
-+if ($flavour =~ /64|n32/i) {
-+	$PTR_ADD="dadd";	# incidentally works even on n32
-+	$PTR_SUB="dsub";	# incidentally works even on n32
-+	$REG_S="sd";
-+	$REG_L="ld";
-+	$PTR_SLL="dsll";	# incidentally works even on n32
-+	$SZREG=8;
-+} else {
-+	$PTR_ADD="add";
-+	$PTR_SUB="sub";
-+	$REG_S="sw";
-+	$REG_L="lw";
-+	$PTR_SLL="sll";
-+	$SZREG=4;
-+}
-+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
-+#
-+# <[email protected]>
-+#
-+######################################################################
-+
-+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
-+
-+for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
-+open STDOUT,">$output";
-+
-+if (!defined($big_endian))
-+{    $big_endian=(unpack('L',pack('N',1))==1);   }
-+
-+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-+open STDOUT,">$output";
-+
-+my ($MSB,$LSB)=(0,3);	# automatically converted to little-endian
-+
-+$code.=<<___;
-+.text
-+#ifdef OPENSSL_FIPSCANISTER
-+# include <openssl/fipssyms.h>
-+#endif
-+
-+#if !defined(__vxworks) || defined(__pic__)
-+.option	pic2
-+#endif
-+.set	noat
-+___
-+
-+{{{
-+my $FRAMESIZE=16*$SZREG;
-+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
-+
-+my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
-+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
-+my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
-+my ($key0,$cnt)=($gp,$fp);
-+
-+# instuction ordering is "stolen" from output from MIPSpro assembler
-+# invoked with -mips3 -O3 arguments...
-+$code.=<<___;
-+.align	5
-+.ent	_mips_AES_encrypt
-+_mips_AES_encrypt:
-+	.frame	$sp,0,$ra
-+	.set	reorder
-+	lw	$t0,0($key)
-+	lw	$t1,4($key)
-+	lw	$t2,8($key)
-+	lw	$t3,12($key)
-+	lw	$cnt,240($key)
-+	$PTR_ADD $key0,$key,16
-+
-+	xor	$s0,$t0
-+	xor	$s1,$t1
-+	xor	$s2,$t2
-+	xor	$s3,$t3
-+
-+	sub	$cnt,1
-+	_xtr	$i0,$s1,16-2
-+.Loop_enc:
-+	_xtr	$i1,$s2,16-2
-+	_xtr	$i2,$s3,16-2
-+	_xtr	$i3,$s0,16-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lwl	$t0,3($i0)		# Te1[s1>>16]
-+	lwl	$t1,3($i1)		# Te1[s2>>16]
-+	lwl	$t2,3($i2)		# Te1[s3>>16]
-+	lwl	$t3,3($i3)		# Te1[s0>>16]
-+	lwr	$t0,2($i0)		# Te1[s1>>16]
-+	lwr	$t1,2($i1)		# Te1[s2>>16]
-+	lwr	$t2,2($i2)		# Te1[s3>>16]
-+	lwr	$t3,2($i3)		# Te1[s0>>16]
-+
-+	_xtr	$i0,$s2,8-2
-+	_xtr	$i1,$s3,8-2
-+	_xtr	$i2,$s0,8-2
-+	_xtr	$i3,$s1,8-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lwl	$t4,2($i0)		# Te2[s2>>8]
-+	lwl	$t5,2($i1)		# Te2[s3>>8]
-+	lwl	$t6,2($i2)		# Te2[s0>>8]
-+	lwl	$t7,2($i3)		# Te2[s1>>8]
-+	lwr	$t4,1($i0)		# Te2[s2>>8]
-+	lwr	$t5,1($i1)		# Te2[s3>>8]
-+	lwr	$t6,1($i2)		# Te2[s0>>8]
-+	lwr	$t7,1($i3)		# Te2[s1>>8]
-+
-+	_xtr	$i0,$s3,0-2
-+	_xtr	$i1,$s0,0-2
-+	_xtr	$i2,$s1,0-2
-+	_xtr	$i3,$s2,0-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lwl	$t8,1($i0)		# Te3[s3]
-+	lwl	$t9,1($i1)		# Te3[s0]
-+	lwl	$t10,1($i2)		# Te3[s1]
-+	lwl	$t11,1($i3)		# Te3[s2]
-+	lwr	$t8,0($i0)		# Te3[s3]
-+	lwr	$t9,0($i1)		# Te3[s0]
-+	lwr	$t10,0($i2)		# Te3[s1]
-+	lwr	$t11,0($i3)		# Te3[s2]
-+
-+	_xtr	$i0,$s0,24-2
-+	_xtr	$i1,$s1,24-2
-+	_xtr	$i2,$s2,24-2
-+	_xtr	$i3,$s3,24-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+	lw	$t4,0($i0)		# Te0[s0>>24]
-+	lw	$t5,0($i1)		# Te0[s1>>24]
-+	lw	$t6,0($i2)		# Te0[s2>>24]
-+	lw	$t7,0($i3)		# Te0[s3>>24]
-+
-+	lw	$s0,0($key0)
-+	lw	$s1,4($key0)
-+	lw	$s2,8($key0)
-+	lw	$s3,12($key0)
-+
-+	xor	$t0,$t8
-+	xor	$t1,$t9
-+	xor	$t2,$t10
-+	xor	$t3,$t11
-+
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+
-+	sub	$cnt,1
-+	$PTR_ADD $key0,16
-+	xor	$s0,$t0
-+	xor	$s1,$t1
-+	xor	$s2,$t2
-+	xor	$s3,$t3
-+	.set	noreorder
-+	bnez	$cnt,.Loop_enc
-+	_xtr	$i0,$s1,16-2
-+
-+	.set	reorder
-+	_xtr	$i1,$s2,16-2
-+	_xtr	$i2,$s3,16-2
-+	_xtr	$i3,$s0,16-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t0,2($i0)		# Te4[s1>>16]
-+	lbu	$t1,2($i1)		# Te4[s2>>16]
-+	lbu	$t2,2($i2)		# Te4[s3>>16]
-+	lbu	$t3,2($i3)		# Te4[s0>>16]
-+
-+	_xtr	$i0,$s2,8-2
-+	_xtr	$i1,$s3,8-2
-+	_xtr	$i2,$s0,8-2
-+	_xtr	$i3,$s1,8-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t4,2($i0)		# Te4[s2>>8]
-+	lbu	$t5,2($i1)		# Te4[s3>>8]
-+	lbu	$t6,2($i2)		# Te4[s0>>8]
-+	lbu	$t7,2($i3)		# Te4[s1>>8]
-+
-+	_xtr	$i0,$s0,24-2
-+	_xtr	$i1,$s1,24-2
-+	_xtr	$i2,$s2,24-2
-+	_xtr	$i3,$s3,24-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t8,2($i0)		# Te4[s0>>24]
-+	lbu	$t9,2($i1)		# Te4[s1>>24]
-+	lbu	$t10,2($i2)		# Te4[s2>>24]
-+	lbu	$t11,2($i3)		# Te4[s3>>24]
-+
-+	_xtr	$i0,$s3,0-2
-+	_xtr	$i1,$s0,0-2
-+	_xtr	$i2,$s1,0-2
-+	_xtr	$i3,$s2,0-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+
-+	_ins	$t0,16
-+	_ins	$t1,16
-+	_ins	$t2,16
-+	_ins	$t3,16
-+
-+	_ins	$t4,8
-+	_ins	$t5,8
-+	_ins	$t6,8
-+	_ins	$t7,8
-+
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t4,2($i0)		# Te4[s3]
-+	lbu	$t5,2($i1)		# Te4[s0]
-+	lbu	$t6,2($i2)		# Te4[s1]
-+	lbu	$t7,2($i3)		# Te4[s2]
-+
-+	_ins	$t8,24
-+	_ins	$t9,24
-+	_ins	$t10,24
-+	_ins	$t11,24
-+
-+	lw	$s0,0($key0)
-+	lw	$s1,4($key0)
-+	lw	$s2,8($key0)
-+	lw	$s3,12($key0)
-+
-+	xor	$t0,$t8
-+	xor	$t1,$t9
-+	xor	$t2,$t10
-+	xor	$t3,$t11
-+
-+	_ins	$t4,0
-+	_ins	$t5,0
-+	_ins	$t6,0
-+	_ins	$t7,0
-+
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+
-+	xor	$s0,$t0
-+	xor	$s1,$t1
-+	xor	$s2,$t2
-+	xor	$s3,$t3
-+
-+	jr	$ra
-+.end	_mips_AES_encrypt
-+
-+.align	5
-+.globl	AES_encrypt
-+.ent	AES_encrypt
-+AES_encrypt:
-+	.frame	$sp,$FRAMESIZE,$ra
-+	.mask	$SAVED_REGS_MASK,-$SZREG
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
-+	.cpload	$pf
-+___
-+$code.=<<___;
-+	$PTR_SUB $sp,$FRAMESIZE
-+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
-+	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
-+	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
-+	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
-+	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
-+	$REG_S	\$15,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_S	\$14,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_S	\$13,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_S	\$12,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
-+	.cplocal	$Tbl
-+	.cpsetup	$pf,$zero,AES_encrypt
-+___
-+$code.=<<___;
-+	.set	reorder
-+	la	$Tbl,AES_Te		# PIC-ified 'load address'
-+
-+	lwl	$s0,0+$MSB($inp)
-+	lwl	$s1,4+$MSB($inp)
-+	lwl	$s2,8+$MSB($inp)
-+	lwl	$s3,12+$MSB($inp)
-+	lwr	$s0,0+$LSB($inp)
-+	lwr	$s1,4+$LSB($inp)
-+	lwr	$s2,8+$LSB($inp)
-+	lwr	$s3,12+$LSB($inp)
-+
-+	bal	_mips_AES_encrypt
-+
-+	swr	$s0,0+$LSB($out)
-+	swr	$s1,4+$LSB($out)
-+	swr	$s2,8+$LSB($out)
-+	swr	$s3,12+$LSB($out)
-+	swl	$s0,0+$MSB($out)
-+	swl	$s1,4+$MSB($out)
-+	swl	$s2,8+$MSB($out)
-+	swl	$s3,12+$MSB($out)
-+
-+	.set	noreorder
-+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
-+	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
-+	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
-+	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
-+	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	\$15,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_L	\$14,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_L	\$13,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_L	\$12,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___;
-+	jr	$ra
-+	$PTR_ADD $sp,$FRAMESIZE
-+.end	AES_encrypt
-+___
-+
-+$code.=<<___;
-+.align	5
-+.ent	_mips_AES_decrypt
-+_mips_AES_decrypt:
-+	.frame	$sp,0,$ra
-+	.set	reorder
-+	lw	$t0,0($key)
-+	lw	$t1,4($key)
-+	lw	$t2,8($key)
-+	lw	$t3,12($key)
-+	lw	$cnt,240($key)
-+	$PTR_ADD $key0,$key,16
-+
-+	xor	$s0,$t0
-+	xor	$s1,$t1
-+	xor	$s2,$t2
-+	xor	$s3,$t3
-+
-+	sub	$cnt,1
-+	_xtr	$i0,$s3,16-2
-+.Loop_dec:
-+	_xtr	$i1,$s0,16-2
-+	_xtr	$i2,$s1,16-2
-+	_xtr	$i3,$s2,16-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lwl	$t0,3($i0)		# Td1[s3>>16]
-+	lwl	$t1,3($i1)		# Td1[s0>>16]
-+	lwl	$t2,3($i2)		# Td1[s1>>16]
-+	lwl	$t3,3($i3)		# Td1[s2>>16]
-+	lwr	$t0,2($i0)		# Td1[s3>>16]
-+	lwr	$t1,2($i1)		# Td1[s0>>16]
-+	lwr	$t2,2($i2)		# Td1[s1>>16]
-+	lwr	$t3,2($i3)		# Td1[s2>>16]
-+
-+	_xtr	$i0,$s2,8-2
-+	_xtr	$i1,$s3,8-2
-+	_xtr	$i2,$s0,8-2
-+	_xtr	$i3,$s1,8-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lwl	$t4,2($i0)		# Td2[s2>>8]
-+	lwl	$t5,2($i1)		# Td2[s3>>8]
-+	lwl	$t6,2($i2)		# Td2[s0>>8]
-+	lwl	$t7,2($i3)		# Td2[s1>>8]
-+	lwr	$t4,1($i0)		# Td2[s2>>8]
-+	lwr	$t5,1($i1)		# Td2[s3>>8]
-+	lwr	$t6,1($i2)		# Td2[s0>>8]
-+	lwr	$t7,1($i3)		# Td2[s1>>8]
-+
-+	_xtr	$i0,$s1,0-2
-+	_xtr	$i1,$s2,0-2
-+	_xtr	$i2,$s3,0-2
-+	_xtr	$i3,$s0,0-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lwl	$t8,1($i0)		# Td3[s1]
-+	lwl	$t9,1($i1)		# Td3[s2]
-+	lwl	$t10,1($i2)		# Td3[s3]
-+	lwl	$t11,1($i3)		# Td3[s0]
-+	lwr	$t8,0($i0)		# Td3[s1]
-+	lwr	$t9,0($i1)		# Td3[s2]
-+	lwr	$t10,0($i2)		# Td3[s3]
-+	lwr	$t11,0($i3)		# Td3[s0]
-+
-+	_xtr	$i0,$s0,24-2
-+	_xtr	$i1,$s1,24-2
-+	_xtr	$i2,$s2,24-2
-+	_xtr	$i3,$s3,24-2
-+	and	$i0,0x3fc
-+	and	$i1,0x3fc
-+	and	$i2,0x3fc
-+	and	$i3,0x3fc
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+
-+
-+	lw	$t4,0($i0)		# Td0[s0>>24]
-+	lw	$t5,0($i1)		# Td0[s1>>24]
-+	lw	$t6,0($i2)		# Td0[s2>>24]
-+	lw	$t7,0($i3)		# Td0[s3>>24]
-+
-+	lw	$s0,0($key0)
-+	lw	$s1,4($key0)
-+	lw	$s2,8($key0)
-+	lw	$s3,12($key0)
-+
-+	xor	$t0,$t8
-+	xor	$t1,$t9
-+	xor	$t2,$t10
-+	xor	$t3,$t11
-+
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+
-+	sub	$cnt,1
-+	$PTR_ADD $key0,16
-+	xor	$s0,$t0
-+	xor	$s1,$t1
-+	xor	$s2,$t2
-+	xor	$s3,$t3
-+	.set	noreorder
-+	bnez	$cnt,.Loop_dec
-+	_xtr	$i0,$s3,16-2
-+
-+	.set	reorder
-+	lw	$t4,1024($Tbl)		# prefetch Td4
-+	lw	$t5,1024+32($Tbl)
-+	lw	$t6,1024+64($Tbl)
-+	lw	$t7,1024+96($Tbl)
-+	lw	$t8,1024+128($Tbl)
-+	lw	$t9,1024+160($Tbl)
-+	lw	$t10,1024+192($Tbl)
-+	lw	$t11,1024+224($Tbl)
-+
-+	_xtr	$i0,$s3,16
-+	_xtr	$i1,$s0,16
-+	_xtr	$i2,$s1,16
-+	_xtr	$i3,$s2,16
-+	and	$i0,0xff
-+	and	$i1,0xff
-+	and	$i2,0xff
-+	and	$i3,0xff
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t0,1024($i0)		# Td4[s3>>16]
-+	lbu	$t1,1024($i1)		# Td4[s0>>16]
-+	lbu	$t2,1024($i2)		# Td4[s1>>16]
-+	lbu	$t3,1024($i3)		# Td4[s2>>16]
-+
-+	_xtr	$i0,$s2,8
-+	_xtr	$i1,$s3,8
-+	_xtr	$i2,$s0,8
-+	_xtr	$i3,$s1,8
-+	and	$i0,0xff
-+	and	$i1,0xff
-+	and	$i2,0xff
-+	and	$i3,0xff
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t4,1024($i0)		# Td4[s2>>8]
-+	lbu	$t5,1024($i1)		# Td4[s3>>8]
-+	lbu	$t6,1024($i2)		# Td4[s0>>8]
-+	lbu	$t7,1024($i3)		# Td4[s1>>8]
-+
-+	_xtr	$i0,$s0,24
-+	_xtr	$i1,$s1,24
-+	_xtr	$i2,$s2,24
-+	_xtr	$i3,$s3,24
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t8,1024($i0)		# Td4[s0>>24]
-+	lbu	$t9,1024($i1)		# Td4[s1>>24]
-+	lbu	$t10,1024($i2)		# Td4[s2>>24]
-+	lbu	$t11,1024($i3)		# Td4[s3>>24]
-+
-+	_xtr	$i0,$s1,0
-+	_xtr	$i1,$s2,0
-+	_xtr	$i2,$s3,0
-+	_xtr	$i3,$s0,0
-+
-+	_ins	$t0,16
-+	_ins	$t1,16
-+	_ins	$t2,16
-+	_ins	$t3,16
-+
-+	_ins	$t4,8
-+	_ins	$t5,8
-+	_ins	$t6,8
-+	_ins	$t7,8
-+
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$t4,1024($i0)		# Td4[s1]
-+	lbu	$t5,1024($i1)		# Td4[s2]
-+	lbu	$t6,1024($i2)		# Td4[s3]
-+	lbu	$t7,1024($i3)		# Td4[s0]
-+
-+	_ins	$t8,24
-+	_ins	$t9,24
-+	_ins	$t10,24
-+	_ins	$t11,24
-+
-+	lw	$s0,0($key0)
-+	lw	$s1,4($key0)
-+	lw	$s2,8($key0)
-+	lw	$s3,12($key0)
-+
-+	_ins	$t4,0
-+	_ins	$t5,0
-+	_ins	$t6,0
-+	_ins	$t7,0
-+
-+
-+	xor	$t0,$t8
-+	xor	$t1,$t9
-+	xor	$t2,$t10
-+	xor	$t3,$t11
-+
-+	xor	$t0,$t4
-+	xor	$t1,$t5
-+	xor	$t2,$t6
-+	xor	$t3,$t7
-+
-+	xor	$s0,$t0
-+	xor	$s1,$t1
-+	xor	$s2,$t2
-+	xor	$s3,$t3
-+
-+	jr	$ra
-+.end	_mips_AES_decrypt
-+
-+.align	5
-+.globl	AES_decrypt
-+.ent	AES_decrypt
-+AES_decrypt:
-+	.frame	$sp,$FRAMESIZE,$ra
-+	.mask	$SAVED_REGS_MASK,-$SZREG
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
-+	.cpload	$pf
-+___
-+$code.=<<___;
-+	$PTR_SUB $sp,$FRAMESIZE
-+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
-+	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
-+	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
-+	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
-+	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
-+	$REG_S	\$15,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_S	\$14,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_S	\$13,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_S	\$12,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
-+	.cplocal	$Tbl
-+	.cpsetup	$pf,$zero,AES_decrypt
-+___
-+$code.=<<___;
-+	.set	reorder
-+	la	$Tbl,AES_Td		# PIC-ified 'load address'
-+
-+	lwl	$s0,0+$MSB($inp)
-+	lwl	$s1,4+$MSB($inp)
-+	lwl	$s2,8+$MSB($inp)
-+	lwl	$s3,12+$MSB($inp)
-+	lwr	$s0,0+$LSB($inp)
-+	lwr	$s1,4+$LSB($inp)
-+	lwr	$s2,8+$LSB($inp)
-+	lwr	$s3,12+$LSB($inp)
-+
-+	bal	_mips_AES_decrypt
-+
-+	swr	$s0,0+$LSB($out)
-+	swr	$s1,4+$LSB($out)
-+	swr	$s2,8+$LSB($out)
-+	swr	$s3,12+$LSB($out)
-+	swl	$s0,0+$MSB($out)
-+	swl	$s1,4+$MSB($out)
-+	swl	$s2,8+$MSB($out)
-+	swl	$s3,12+$MSB($out)
-+
-+	.set	noreorder
-+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
-+	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
-+	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
-+	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
-+	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	\$15,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_L	\$14,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_L	\$13,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_L	\$12,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___;
-+	jr	$ra
-+	$PTR_ADD $sp,$FRAMESIZE
-+.end	AES_decrypt
-+___
-+}}}
-+
-+{{{
-+my $FRAMESIZE=8*$SZREG;
-+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
-+
-+my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
-+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
-+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
-+my ($rcon,$cnt)=($gp,$fp);
-+
-+$code.=<<___;
-+.align	5
-+.ent	_mips_AES_set_encrypt_key
-+_mips_AES_set_encrypt_key:
-+	.frame	$sp,0,$ra
-+	.set	noreorder
-+	beqz	$inp,.Lekey_done
-+	li	$t0,-1
-+	beqz	$key,.Lekey_done
-+	$PTR_ADD $rcon,$Tbl,1024+256
-+
-+	.set	reorder
-+	lwl	$rk0,0+$MSB($inp)	# load 128 bits
-+	lwl	$rk1,4+$MSB($inp)
-+	lwl	$rk2,8+$MSB($inp)
-+	lwl	$rk3,12+$MSB($inp)
-+	li	$at,128
-+	lwr	$rk0,0+$LSB($inp)
-+	lwr	$rk1,4+$LSB($inp)
-+	lwr	$rk2,8+$LSB($inp)
-+	lwr	$rk3,12+$LSB($inp)
-+	.set	noreorder
-+	beq	$bits,$at,.L128bits
-+	li	$cnt,10
-+
-+	.set	reorder
-+	lwl	$rk4,16+$MSB($inp)	# load 192 bits
-+	lwl	$rk5,20+$MSB($inp)
-+	li	$at,192
-+	lwr	$rk4,16+$LSB($inp)
-+	lwr	$rk5,20+$LSB($inp)
-+	.set	noreorder
-+	beq	$bits,$at,.L192bits
-+	li	$cnt,8
-+
-+	.set	reorder
-+	lwl	$rk6,24+$MSB($inp)	# load 256 bits
-+	lwl	$rk7,28+$MSB($inp)
-+	li	$at,256
-+	lwr	$rk6,24+$LSB($inp)
-+	lwr	$rk7,28+$LSB($inp)
-+	.set	noreorder
-+	beq	$bits,$at,.L256bits
-+	li	$cnt,7
-+
-+	b	.Lekey_done
-+	li	$t0,-2
-+
-+.align	4
-+.L128bits:
-+	.set	reorder
-+	srl	$i0,$rk3,16
-+	srl	$i1,$rk3,8
-+	and	$i0,0xff
-+	and	$i1,0xff
-+	and	$i2,$rk3,0xff
-+	srl	$i3,$rk3,24
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$i0,1024($i0)
-+	lbu	$i1,1024($i1)
-+	lbu	$i2,1024($i2)
-+	lbu	$i3,1024($i3)
-+
-+	sw	$rk0,0($key)
-+	sw	$rk1,4($key)
-+	sw	$rk2,8($key)
-+	sw	$rk3,12($key)
-+	sub	$cnt,1
-+	$PTR_ADD $key,16
-+
-+	_bias	$i0,24
-+	_bias	$i1,16
-+	_bias	$i2,8
-+	_bias	$i3,0
-+
-+	xor	$rk0,$i0
-+	lw	$i0,0($rcon)
-+	xor	$rk0,$i1
-+	xor	$rk0,$i2
-+	xor	$rk0,$i3
-+	xor	$rk0,$i0
-+
-+	xor	$rk1,$rk0
-+	xor	$rk2,$rk1
-+	xor	$rk3,$rk2
-+
-+	.set	noreorder
-+	bnez	$cnt,.L128bits
-+	$PTR_ADD $rcon,4
-+
-+	sw	$rk0,0($key)
-+	sw	$rk1,4($key)
-+	sw	$rk2,8($key)
-+	li	$cnt,10
-+	sw	$rk3,12($key)
-+	li	$t0,0
-+	sw	$cnt,80($key)
-+	b	.Lekey_done
-+	$PTR_SUB $key,10*16
-+
-+.align	4
-+.L192bits:
-+	.set	reorder
-+	srl	$i0,$rk5,16
-+	srl	$i1,$rk5,8
-+	and	$i0,0xff
-+	and	$i1,0xff
-+	and	$i2,$rk5,0xff
-+	srl	$i3,$rk5,24
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$i0,1024($i0)
-+	lbu	$i1,1024($i1)
-+	lbu	$i2,1024($i2)
-+	lbu	$i3,1024($i3)
-+
-+	sw	$rk0,0($key)
-+	sw	$rk1,4($key)
-+	sw	$rk2,8($key)
-+	sw	$rk3,12($key)
-+	sw	$rk4,16($key)
-+	sw	$rk5,20($key)
-+	sub	$cnt,1
-+	$PTR_ADD $key,24
-+
-+	_bias	$i0,24
-+	_bias	$i1,16
-+	_bias	$i2,8
-+	_bias	$i3,0
-+
-+	xor	$rk0,$i0
-+	lw	$i0,0($rcon)
-+	xor	$rk0,$i1
-+	xor	$rk0,$i2
-+	xor	$rk0,$i3
-+	xor	$rk0,$i0
-+
-+	xor	$rk1,$rk0
-+	xor	$rk2,$rk1
-+	xor	$rk3,$rk2
-+	xor	$rk4,$rk3
-+	xor	$rk5,$rk4
-+
-+	.set	noreorder
-+	bnez	$cnt,.L192bits
-+	$PTR_ADD $rcon,4
-+
-+	sw	$rk0,0($key)
-+	sw	$rk1,4($key)
-+	sw	$rk2,8($key)
-+	li	$cnt,12
-+	sw	$rk3,12($key)
-+	li	$t0,0
-+	sw	$cnt,48($key)
-+	b	.Lekey_done
-+	$PTR_SUB $key,12*16
-+
-+.align	4
-+.L256bits:
-+	.set	reorder
-+	srl	$i0,$rk7,16
-+	srl	$i1,$rk7,8
-+	and	$i0,0xff
-+	and	$i1,0xff
-+	and	$i2,$rk7,0xff
-+	srl	$i3,$rk7,24
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$i0,1024($i0)
-+	lbu	$i1,1024($i1)
-+	lbu	$i2,1024($i2)
-+	lbu	$i3,1024($i3)
-+
-+	sw	$rk0,0($key)
-+	sw	$rk1,4($key)
-+	sw	$rk2,8($key)
-+	sw	$rk3,12($key)
-+	sw	$rk4,16($key)
-+	sw	$rk5,20($key)
-+	sw	$rk6,24($key)
-+	sw	$rk7,28($key)
-+	sub	$cnt,1
-+
-+	_bias	$i0,24
-+	_bias	$i1,16
-+	_bias	$i2,8
-+	_bias	$i3,0
-+
-+	xor	$rk0,$i0
-+	lw	$i0,0($rcon)
-+	xor	$rk0,$i1
-+	xor	$rk0,$i2
-+	xor	$rk0,$i3
-+	xor	$rk0,$i0
-+
-+	xor	$rk1,$rk0
-+	xor	$rk2,$rk1
-+	xor	$rk3,$rk2
-+	beqz	$cnt,.L256bits_done
-+
-+	srl	$i0,$rk3,24
-+	srl	$i1,$rk3,16
-+	srl	$i2,$rk3,8
-+	and	$i3,$rk3,0xff
-+	and	$i1,0xff
-+	and	$i2,0xff
-+	$PTR_ADD $i0,$Tbl
-+	$PTR_ADD $i1,$Tbl
-+	$PTR_ADD $i2,$Tbl
-+	$PTR_ADD $i3,$Tbl
-+	lbu	$i0,1024($i0)
-+	lbu	$i1,1024($i1)
-+	lbu	$i2,1024($i2)
-+	lbu	$i3,1024($i3)
-+	sll	$i0,24
-+	sll	$i1,16
-+	sll	$i2,8
-+
-+	xor	$rk4,$i0
-+	xor	$rk4,$i1
-+	xor	$rk4,$i2
-+	xor	$rk4,$i3
-+
-+	xor	$rk5,$rk4
-+	xor	$rk6,$rk5
-+	xor	$rk7,$rk6
-+
-+	$PTR_ADD $key,32
-+	.set	noreorder
-+	b	.L256bits
-+	$PTR_ADD $rcon,4
-+
-+.L256bits_done:
-+	sw	$rk0,32($key)
-+	sw	$rk1,36($key)
-+	sw	$rk2,40($key)
-+	li	$cnt,14
-+	sw	$rk3,44($key)
-+	li	$t0,0
-+	sw	$cnt,48($key)
-+	$PTR_SUB $key,12*16
-+
-+.Lekey_done:
-+	jr	$ra
-+	nop
-+.end	_mips_AES_set_encrypt_key
-+
-+.globl	AES_set_encrypt_key
-+.ent	AES_set_encrypt_key
-+AES_set_encrypt_key:
-+	.frame	$sp,$FRAMESIZE,$ra
-+	.mask	$SAVED_REGS_MASK,-$SZREG
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
-+	.cpload	$pf
-+___
-+$code.=<<___;
-+	$PTR_SUB $sp,$FRAMESIZE
-+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
-+	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
-+	.cplocal	$Tbl
-+	.cpsetup	$pf,$zero,AES_set_encrypt_key
-+___
-+$code.=<<___;
-+	.set	reorder
-+	la	$Tbl,AES_Te		# PIC-ified 'load address'
-+
-+	bal	_mips_AES_set_encrypt_key
-+
-+	.set	noreorder
-+	move	$a0,$t0
-+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___;
-+	jr	$ra
-+	$PTR_ADD $sp,$FRAMESIZE
-+.end	AES_set_encrypt_key
-+___
-+
-+my ($head,$tail)=($inp,$bits);
-+my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
-+my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
-+$code.=<<___;
-+.align	5
-+.globl	AES_set_decrypt_key
-+.ent	AES_set_decrypt_key
-+AES_set_decrypt_key:
-+	.frame	$sp,$FRAMESIZE,$ra
-+	.mask	$SAVED_REGS_MASK,-$SZREG
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
-+	.cpload	$pf
-+___
-+$code.=<<___;
-+	$PTR_SUB $sp,$FRAMESIZE
-+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
-+	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
-+	.cplocal	$Tbl
-+	.cpsetup	$pf,$zero,AES_set_decrypt_key
-+___
-+$code.=<<___;
-+	.set	reorder
-+	la	$Tbl,AES_Te		# PIC-ified 'load address'
-+
-+	bal	_mips_AES_set_encrypt_key
-+
-+	bltz	$t0,.Ldkey_done
-+
-+	sll	$at,$cnt,4
-+	$PTR_ADD $head,$key,0
-+	$PTR_ADD $tail,$key,$at
-+.align	4
-+.Lswap:
-+	lw	$rk0,0($head)
-+	lw	$rk1,4($head)
-+	lw	$rk2,8($head)
-+	lw	$rk3,12($head)
-+	lw	$rk4,0($tail)
-+	lw	$rk5,4($tail)
-+	lw	$rk6,8($tail)
-+	lw	$rk7,12($tail)
-+	sw	$rk0,0($tail)
-+	sw	$rk1,4($tail)
-+	sw	$rk2,8($tail)
-+	sw	$rk3,12($tail)
-+	$PTR_ADD $head,16
-+	$PTR_SUB $tail,16
-+	sw	$rk4,-16($head)
-+	sw	$rk5,-12($head)
-+	sw	$rk6,-8($head)
-+	sw	$rk7,-4($head)
-+	bne	$head,$tail,.Lswap
-+
-+	lw	$tp1,16($key)		# modulo-scheduled
-+	lui	$x80808080,0x8080
-+	sub	$cnt,1
-+	or	$x80808080,0x8080
-+	sll	$cnt,2
-+	$PTR_ADD $key,16
-+	lui	$x1b1b1b1b,0x1b1b
-+	nor	$x7f7f7f7f,$zero,$x80808080
-+	or	$x1b1b1b1b,0x1b1b
-+.align	4
-+.Lmix:
-+	and	$m,$tp1,$x80808080
-+	and	$tp2,$tp1,$x7f7f7f7f
-+	srl	$tp4,$m,7
-+	addu	$tp2,$tp2		# tp2<<1
-+	subu	$m,$tp4
-+	and	$m,$x1b1b1b1b
-+	xor	$tp2,$m
-+
-+	and	$m,$tp2,$x80808080
-+	and	$tp4,$tp2,$x7f7f7f7f
-+	srl	$tp8,$m,7
-+	addu	$tp4,$tp4		# tp4<<1
-+	subu	$m,$tp8
-+	and	$m,$x1b1b1b1b
-+	xor	$tp4,$m
-+
-+	and	$m,$tp4,$x80808080
-+	and	$tp8,$tp4,$x7f7f7f7f
-+	srl	$tp9,$m,7
-+	addu	$tp8,$tp8		# tp8<<1
-+	subu	$m,$tp9
-+	and	$m,$x1b1b1b1b
-+	xor	$tp8,$m
-+
-+	xor	$tp9,$tp8,$tp1
-+	xor	$tpe,$tp8,$tp4
-+	xor	$tpb,$tp9,$tp2
-+	xor	$tpd,$tp9,$tp4
-+
-+	_ror	$tp1,$tpd,16
-+	 xor	$tpe,$tp2
-+	_ror	$tp2,$tpd,-16
-+	xor	$tpe,$tp1
-+	_ror	$tp1,$tp9,8
-+	xor	$tpe,$tp2
-+	_ror	$tp2,$tp9,-24
-+	xor	$tpe,$tp1
-+	_ror	$tp1,$tpb,24
-+	xor	$tpe,$tp2
-+	_ror	$tp2,$tpb,-8
-+	xor	$tpe,$tp1
-+	lw	$tp1,4($key)		# modulo-scheduled
-+	xor	$tpe,$tp2
-+	sub	$cnt,1
-+	sw	$tpe,0($key)
-+	$PTR_ADD $key,4
-+	bnez	$cnt,.Lmix
-+
-+	li	$t0,0
-+.Ldkey_done:
-+	.set	noreorder
-+	move	$a0,$t0
-+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___;
-+	jr	$ra
-+	$PTR_ADD $sp,$FRAMESIZE
-+.end	AES_set_decrypt_key
-+___
-+}}}
-+
-+######################################################################
-+# Tables are kept in endian-neutral manner
-+$code.=<<___;
-+.rdata
-+.align	6
-+AES_Te:
-+.byte	0xc6,0x63,0x63,0xa5,	0xf8,0x7c,0x7c,0x84	# Te0
-+.byte	0xee,0x77,0x77,0x99,	0xf6,0x7b,0x7b,0x8d
-+.byte	0xff,0xf2,0xf2,0x0d,	0xd6,0x6b,0x6b,0xbd
-+.byte	0xde,0x6f,0x6f,0xb1,	0x91,0xc5,0xc5,0x54
-+.byte	0x60,0x30,0x30,0x50,	0x02,0x01,0x01,0x03
-+.byte	0xce,0x67,0x67,0xa9,	0x56,0x2b,0x2b,0x7d
-+.byte	0xe7,0xfe,0xfe,0x19,	0xb5,0xd7,0xd7,0x62
-+.byte	0x4d,0xab,0xab,0xe6,	0xec,0x76,0x76,0x9a
-+.byte	0x8f,0xca,0xca,0x45,	0x1f,0x82,0x82,0x9d
-+.byte	0x89,0xc9,0xc9,0x40,	0xfa,0x7d,0x7d,0x87
-+.byte	0xef,0xfa,0xfa,0x15,	0xb2,0x59,0x59,0xeb
-+.byte	0x8e,0x47,0x47,0xc9,	0xfb,0xf0,0xf0,0x0b
-+.byte	0x41,0xad,0xad,0xec,	0xb3,0xd4,0xd4,0x67
-+.byte	0x5f,0xa2,0xa2,0xfd,	0x45,0xaf,0xaf,0xea
-+.byte	0x23,0x9c,0x9c,0xbf,	0x53,0xa4,0xa4,0xf7
-+.byte	0xe4,0x72,0x72,0x96,	0x9b,0xc0,0xc0,0x5b
-+.byte	0x75,0xb7,0xb7,0xc2,	0xe1,0xfd,0xfd,0x1c
-+.byte	0x3d,0x93,0x93,0xae,	0x4c,0x26,0x26,0x6a
-+.byte	0x6c,0x36,0x36,0x5a,	0x7e,0x3f,0x3f,0x41
-+.byte	0xf5,0xf7,0xf7,0x02,	0x83,0xcc,0xcc,0x4f
-+.byte	0x68,0x34,0x34,0x5c,	0x51,0xa5,0xa5,0xf4
-+.byte	0xd1,0xe5,0xe5,0x34,	0xf9,0xf1,0xf1,0x08
-+.byte	0xe2,0x71,0x71,0x93,	0xab,0xd8,0xd8,0x73
-+.byte	0x62,0x31,0x31,0x53,	0x2a,0x15,0x15,0x3f
-+.byte	0x08,0x04,0x04,0x0c,	0x95,0xc7,0xc7,0x52
-+.byte	0x46,0x23,0x23,0x65,	0x9d,0xc3,0xc3,0x5e
-+.byte	0x30,0x18,0x18,0x28,	0x37,0x96,0x96,0xa1
-+.byte	0x0a,0x05,0x05,0x0f,	0x2f,0x9a,0x9a,0xb5
-+.byte	0x0e,0x07,0x07,0x09,	0x24,0x12,0x12,0x36
-+.byte	0x1b,0x80,0x80,0x9b,	0xdf,0xe2,0xe2,0x3d
-+.byte	0xcd,0xeb,0xeb,0x26,	0x4e,0x27,0x27,0x69
-+.byte	0x7f,0xb2,0xb2,0xcd,	0xea,0x75,0x75,0x9f
-+.byte	0x12,0x09,0x09,0x1b,	0x1d,0x83,0x83,0x9e
-+.byte	0x58,0x2c,0x2c,0x74,	0x34,0x1a,0x1a,0x2e
-+.byte	0x36,0x1b,0x1b,0x2d,	0xdc,0x6e,0x6e,0xb2
-+.byte	0xb4,0x5a,0x5a,0xee,	0x5b,0xa0,0xa0,0xfb
-+.byte	0xa4,0x52,0x52,0xf6,	0x76,0x3b,0x3b,0x4d
-+.byte	0xb7,0xd6,0xd6,0x61,	0x7d,0xb3,0xb3,0xce
-+.byte	0x52,0x29,0x29,0x7b,	0xdd,0xe3,0xe3,0x3e
-+.byte	0x5e,0x2f,0x2f,0x71,	0x13,0x84,0x84,0x97
-+.byte	0xa6,0x53,0x53,0xf5,	0xb9,0xd1,0xd1,0x68
-+.byte	0x00,0x00,0x00,0x00,	0xc1,0xed,0xed,0x2c
-+.byte	0x40,0x20,0x20,0x60,	0xe3,0xfc,0xfc,0x1f
-+.byte	0x79,0xb1,0xb1,0xc8,	0xb6,0x5b,0x5b,0xed
-+.byte	0xd4,0x6a,0x6a,0xbe,	0x8d,0xcb,0xcb,0x46
-+.byte	0x67,0xbe,0xbe,0xd9,	0x72,0x39,0x39,0x4b
-+.byte	0x94,0x4a,0x4a,0xde,	0x98,0x4c,0x4c,0xd4
-+.byte	0xb0,0x58,0x58,0xe8,	0x85,0xcf,0xcf,0x4a
-+.byte	0xbb,0xd0,0xd0,0x6b,	0xc5,0xef,0xef,0x2a
-+.byte	0x4f,0xaa,0xaa,0xe5,	0xed,0xfb,0xfb,0x16
-+.byte	0x86,0x43,0x43,0xc5,	0x9a,0x4d,0x4d,0xd7
-+.byte	0x66,0x33,0x33,0x55,	0x11,0x85,0x85,0x94
-+.byte	0x8a,0x45,0x45,0xcf,	0xe9,0xf9,0xf9,0x10
-+.byte	0x04,0x02,0x02,0x06,	0xfe,0x7f,0x7f,0x81
-+.byte	0xa0,0x50,0x50,0xf0,	0x78,0x3c,0x3c,0x44
-+.byte	0x25,0x9f,0x9f,0xba,	0x4b,0xa8,0xa8,0xe3
-+.byte	0xa2,0x51,0x51,0xf3,	0x5d,0xa3,0xa3,0xfe
-+.byte	0x80,0x40,0x40,0xc0,	0x05,0x8f,0x8f,0x8a
-+.byte	0x3f,0x92,0x92,0xad,	0x21,0x9d,0x9d,0xbc
-+.byte	0x70,0x38,0x38,0x48,	0xf1,0xf5,0xf5,0x04
-+.byte	0x63,0xbc,0xbc,0xdf,	0x77,0xb6,0xb6,0xc1
-+.byte	0xaf,0xda,0xda,0x75,	0x42,0x21,0x21,0x63
-+.byte	0x20,0x10,0x10,0x30,	0xe5,0xff,0xff,0x1a
-+.byte	0xfd,0xf3,0xf3,0x0e,	0xbf,0xd2,0xd2,0x6d
-+.byte	0x81,0xcd,0xcd,0x4c,	0x18,0x0c,0x0c,0x14
-+.byte	0x26,0x13,0x13,0x35,	0xc3,0xec,0xec,0x2f
-+.byte	0xbe,0x5f,0x5f,0xe1,	0x35,0x97,0x97,0xa2
-+.byte	0x88,0x44,0x44,0xcc,	0x2e,0x17,0x17,0x39
-+.byte	0x93,0xc4,0xc4,0x57,	0x55,0xa7,0xa7,0xf2
-+.byte	0xfc,0x7e,0x7e,0x82,	0x7a,0x3d,0x3d,0x47
-+.byte	0xc8,0x64,0x64,0xac,	0xba,0x5d,0x5d,0xe7
-+.byte	0x32,0x19,0x19,0x2b,	0xe6,0x73,0x73,0x95
-+.byte	0xc0,0x60,0x60,0xa0,	0x19,0x81,0x81,0x98
-+.byte	0x9e,0x4f,0x4f,0xd1,	0xa3,0xdc,0xdc,0x7f
-+.byte	0x44,0x22,0x22,0x66,	0x54,0x2a,0x2a,0x7e
-+.byte	0x3b,0x90,0x90,0xab,	0x0b,0x88,0x88,0x83
-+.byte	0x8c,0x46,0x46,0xca,	0xc7,0xee,0xee,0x29
-+.byte	0x6b,0xb8,0xb8,0xd3,	0x28,0x14,0x14,0x3c
-+.byte	0xa7,0xde,0xde,0x79,	0xbc,0x5e,0x5e,0xe2
-+.byte	0x16,0x0b,0x0b,0x1d,	0xad,0xdb,0xdb,0x76
-+.byte	0xdb,0xe0,0xe0,0x3b,	0x64,0x32,0x32,0x56
-+.byte	0x74,0x3a,0x3a,0x4e,	0x14,0x0a,0x0a,0x1e
-+.byte	0x92,0x49,0x49,0xdb,	0x0c,0x06,0x06,0x0a
-+.byte	0x48,0x24,0x24,0x6c,	0xb8,0x5c,0x5c,0xe4
-+.byte	0x9f,0xc2,0xc2,0x5d,	0xbd,0xd3,0xd3,0x6e
-+.byte	0x43,0xac,0xac,0xef,	0xc4,0x62,0x62,0xa6
-+.byte	0x39,0x91,0x91,0xa8,	0x31,0x95,0x95,0xa4
-+.byte	0xd3,0xe4,0xe4,0x37,	0xf2,0x79,0x79,0x8b
-+.byte	0xd5,0xe7,0xe7,0x32,	0x8b,0xc8,0xc8,0x43
-+.byte	0x6e,0x37,0x37,0x59,	0xda,0x6d,0x6d,0xb7
-+.byte	0x01,0x8d,0x8d,0x8c,	0xb1,0xd5,0xd5,0x64
-+.byte	0x9c,0x4e,0x4e,0xd2,	0x49,0xa9,0xa9,0xe0
-+.byte	0xd8,0x6c,0x6c,0xb4,	0xac,0x56,0x56,0xfa
-+.byte	0xf3,0xf4,0xf4,0x07,	0xcf,0xea,0xea,0x25
-+.byte	0xca,0x65,0x65,0xaf,	0xf4,0x7a,0x7a,0x8e
-+.byte	0x47,0xae,0xae,0xe9,	0x10,0x08,0x08,0x18
-+.byte	0x6f,0xba,0xba,0xd5,	0xf0,0x78,0x78,0x88
-+.byte	0x4a,0x25,0x25,0x6f,	0x5c,0x2e,0x2e,0x72
-+.byte	0x38,0x1c,0x1c,0x24,	0x57,0xa6,0xa6,0xf1
-+.byte	0x73,0xb4,0xb4,0xc7,	0x97,0xc6,0xc6,0x51
-+.byte	0xcb,0xe8,0xe8,0x23,	0xa1,0xdd,0xdd,0x7c
-+.byte	0xe8,0x74,0x74,0x9c,	0x3e,0x1f,0x1f,0x21
-+.byte	0x96,0x4b,0x4b,0xdd,	0x61,0xbd,0xbd,0xdc
-+.byte	0x0d,0x8b,0x8b,0x86,	0x0f,0x8a,0x8a,0x85
-+.byte	0xe0,0x70,0x70,0x90,	0x7c,0x3e,0x3e,0x42
-+.byte	0x71,0xb5,0xb5,0xc4,	0xcc,0x66,0x66,0xaa
-+.byte	0x90,0x48,0x48,0xd8,	0x06,0x03,0x03,0x05
-+.byte	0xf7,0xf6,0xf6,0x01,	0x1c,0x0e,0x0e,0x12
-+.byte	0xc2,0x61,0x61,0xa3,	0x6a,0x35,0x35,0x5f
-+.byte	0xae,0x57,0x57,0xf9,	0x69,0xb9,0xb9,0xd0
-+.byte	0x17,0x86,0x86,0x91,	0x99,0xc1,0xc1,0x58
-+.byte	0x3a,0x1d,0x1d,0x27,	0x27,0x9e,0x9e,0xb9
-+.byte	0xd9,0xe1,0xe1,0x38,	0xeb,0xf8,0xf8,0x13
-+.byte	0x2b,0x98,0x98,0xb3,	0x22,0x11,0x11,0x33
-+.byte	0xd2,0x69,0x69,0xbb,	0xa9,0xd9,0xd9,0x70
-+.byte	0x07,0x8e,0x8e,0x89,	0x33,0x94,0x94,0xa7
-+.byte	0x2d,0x9b,0x9b,0xb6,	0x3c,0x1e,0x1e,0x22
-+.byte	0x15,0x87,0x87,0x92,	0xc9,0xe9,0xe9,0x20
-+.byte	0x87,0xce,0xce,0x49,	0xaa,0x55,0x55,0xff
-+.byte	0x50,0x28,0x28,0x78,	0xa5,0xdf,0xdf,0x7a
-+.byte	0x03,0x8c,0x8c,0x8f,	0x59,0xa1,0xa1,0xf8
-+.byte	0x09,0x89,0x89,0x80,	0x1a,0x0d,0x0d,0x17
-+.byte	0x65,0xbf,0xbf,0xda,	0xd7,0xe6,0xe6,0x31
-+.byte	0x84,0x42,0x42,0xc6,	0xd0,0x68,0x68,0xb8
-+.byte	0x82,0x41,0x41,0xc3,	0x29,0x99,0x99,0xb0
-+.byte	0x5a,0x2d,0x2d,0x77,	0x1e,0x0f,0x0f,0x11
-+.byte	0x7b,0xb0,0xb0,0xcb,	0xa8,0x54,0x54,0xfc
-+.byte	0x6d,0xbb,0xbb,0xd6,	0x2c,0x16,0x16,0x3a
-+
-+.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5	# Te4
-+.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
-+.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
-+.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
-+.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
-+.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
-+.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
-+.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
-+.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
-+.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
-+.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
-+.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
-+.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
-+.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
-+.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
-+.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
-+.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
-+.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
-+.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
-+.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
-+.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
-+.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
-+.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
-+.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
-+.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
-+.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
-+.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
-+.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
-+.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
-+.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
-+.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
-+.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-+
-+.byte	0x01,0x00,0x00,0x00,	0x02,0x00,0x00,0x00	# rcon
-+.byte	0x04,0x00,0x00,0x00,	0x08,0x00,0x00,0x00
-+.byte	0x10,0x00,0x00,0x00,	0x20,0x00,0x00,0x00
-+.byte	0x40,0x00,0x00,0x00,	0x80,0x00,0x00,0x00
-+.byte	0x1B,0x00,0x00,0x00,	0x36,0x00,0x00,0x00
-+
-+.align	6
-+AES_Td:
-+.byte	0x51,0xf4,0xa7,0x50,	0x7e,0x41,0x65,0x53	# Td0
-+.byte	0x1a,0x17,0xa4,0xc3,	0x3a,0x27,0x5e,0x96
-+.byte	0x3b,0xab,0x6b,0xcb,	0x1f,0x9d,0x45,0xf1
-+.byte	0xac,0xfa,0x58,0xab,	0x4b,0xe3,0x03,0x93
-+.byte	0x20,0x30,0xfa,0x55,	0xad,0x76,0x6d,0xf6
-+.byte	0x88,0xcc,0x76,0x91,	0xf5,0x02,0x4c,0x25
-+.byte	0x4f,0xe5,0xd7,0xfc,	0xc5,0x2a,0xcb,0xd7
-+.byte	0x26,0x35,0x44,0x80,	0xb5,0x62,0xa3,0x8f
-+.byte	0xde,0xb1,0x5a,0x49,	0x25,0xba,0x1b,0x67
-+.byte	0x45,0xea,0x0e,0x98,	0x5d,0xfe,0xc0,0xe1
-+.byte	0xc3,0x2f,0x75,0x02,	0x81,0x4c,0xf0,0x12
-+.byte	0x8d,0x46,0x97,0xa3,	0x6b,0xd3,0xf9,0xc6
-+.byte	0x03,0x8f,0x5f,0xe7,	0x15,0x92,0x9c,0x95
-+.byte	0xbf,0x6d,0x7a,0xeb,	0x95,0x52,0x59,0xda
-+.byte	0xd4,0xbe,0x83,0x2d,	0x58,0x74,0x21,0xd3
-+.byte	0x49,0xe0,0x69,0x29,	0x8e,0xc9,0xc8,0x44
-+.byte	0x75,0xc2,0x89,0x6a,	0xf4,0x8e,0x79,0x78
-+.byte	0x99,0x58,0x3e,0x6b,	0x27,0xb9,0x71,0xdd
-+.byte	0xbe,0xe1,0x4f,0xb6,	0xf0,0x88,0xad,0x17
-+.byte	0xc9,0x20,0xac,0x66,	0x7d,0xce,0x3a,0xb4
-+.byte	0x63,0xdf,0x4a,0x18,	0xe5,0x1a,0x31,0x82
-+.byte	0x97,0x51,0x33,0x60,	0x62,0x53,0x7f,0x45
-+.byte	0xb1,0x64,0x77,0xe0,	0xbb,0x6b,0xae,0x84
-+.byte	0xfe,0x81,0xa0,0x1c,	0xf9,0x08,0x2b,0x94
-+.byte	0x70,0x48,0x68,0x58,	0x8f,0x45,0xfd,0x19
-+.byte	0x94,0xde,0x6c,0x87,	0x52,0x7b,0xf8,0xb7
-+.byte	0xab,0x73,0xd3,0x23,	0x72,0x4b,0x02,0xe2
-+.byte	0xe3,0x1f,0x8f,0x57,	0x66,0x55,0xab,0x2a
-+.byte	0xb2,0xeb,0x28,0x07,	0x2f,0xb5,0xc2,0x03
-+.byte	0x86,0xc5,0x7b,0x9a,	0xd3,0x37,0x08,0xa5
-+.byte	0x30,0x28,0x87,0xf2,	0x23,0xbf,0xa5,0xb2
-+.byte	0x02,0x03,0x6a,0xba,	0xed,0x16,0x82,0x5c
-+.byte	0x8a,0xcf,0x1c,0x2b,	0xa7,0x79,0xb4,0x92
-+.byte	0xf3,0x07,0xf2,0xf0,	0x4e,0x69,0xe2,0xa1
-+.byte	0x65,0xda,0xf4,0xcd,	0x06,0x05,0xbe,0xd5
-+.byte	0xd1,0x34,0x62,0x1f,	0xc4,0xa6,0xfe,0x8a
-+.byte	0x34,0x2e,0x53,0x9d,	0xa2,0xf3,0x55,0xa0
-+.byte	0x05,0x8a,0xe1,0x32,	0xa4,0xf6,0xeb,0x75
-+.byte	0x0b,0x83,0xec,0x39,	0x40,0x60,0xef,0xaa
-+.byte	0x5e,0x71,0x9f,0x06,	0xbd,0x6e,0x10,0x51
-+.byte	0x3e,0x21,0x8a,0xf9,	0x96,0xdd,0x06,0x3d
-+.byte	0xdd,0x3e,0x05,0xae,	0x4d,0xe6,0xbd,0x46
-+.byte	0x91,0x54,0x8d,0xb5,	0x71,0xc4,0x5d,0x05
-+.byte	0x04,0x06,0xd4,0x6f,	0x60,0x50,0x15,0xff
-+.byte	0x19,0x98,0xfb,0x24,	0xd6,0xbd,0xe9,0x97
-+.byte	0x89,0x40,0x43,0xcc,	0x67,0xd9,0x9e,0x77
-+.byte	0xb0,0xe8,0x42,0xbd,	0x07,0x89,0x8b,0x88
-+.byte	0xe7,0x19,0x5b,0x38,	0x79,0xc8,0xee,0xdb
-+.byte	0xa1,0x7c,0x0a,0x47,	0x7c,0x42,0x0f,0xe9
-+.byte	0xf8,0x84,0x1e,0xc9,	0x00,0x00,0x00,0x00
-+.byte	0x09,0x80,0x86,0x83,	0x32,0x2b,0xed,0x48
-+.byte	0x1e,0x11,0x70,0xac,	0x6c,0x5a,0x72,0x4e
-+.byte	0xfd,0x0e,0xff,0xfb,	0x0f,0x85,0x38,0x56
-+.byte	0x3d,0xae,0xd5,0x1e,	0x36,0x2d,0x39,0x27
-+.byte	0x0a,0x0f,0xd9,0x64,	0x68,0x5c,0xa6,0x21
-+.byte	0x9b,0x5b,0x54,0xd1,	0x24,0x36,0x2e,0x3a
-+.byte	0x0c,0x0a,0x67,0xb1,	0x93,0x57,0xe7,0x0f
-+.byte	0xb4,0xee,0x96,0xd2,	0x1b,0x9b,0x91,0x9e
-+.byte	0x80,0xc0,0xc5,0x4f,	0x61,0xdc,0x20,0xa2
-+.byte	0x5a,0x77,0x4b,0x69,	0x1c,0x12,0x1a,0x16
-+.byte	0xe2,0x93,0xba,0x0a,	0xc0,0xa0,0x2a,0xe5
-+.byte	0x3c,0x22,0xe0,0x43,	0x12,0x1b,0x17,0x1d
-+.byte	0x0e,0x09,0x0d,0x0b,	0xf2,0x8b,0xc7,0xad
-+.byte	0x2d,0xb6,0xa8,0xb9,	0x14,0x1e,0xa9,0xc8
-+.byte	0x57,0xf1,0x19,0x85,	0xaf,0x75,0x07,0x4c
-+.byte	0xee,0x99,0xdd,0xbb,	0xa3,0x7f,0x60,0xfd
-+.byte	0xf7,0x01,0x26,0x9f,	0x5c,0x72,0xf5,0xbc
-+.byte	0x44,0x66,0x3b,0xc5,	0x5b,0xfb,0x7e,0x34
-+.byte	0x8b,0x43,0x29,0x76,	0xcb,0x23,0xc6,0xdc
-+.byte	0xb6,0xed,0xfc,0x68,	0xb8,0xe4,0xf1,0x63
-+.byte	0xd7,0x31,0xdc,0xca,	0x42,0x63,0x85,0x10
-+.byte	0x13,0x97,0x22,0x40,	0x84,0xc6,0x11,0x20
-+.byte	0x85,0x4a,0x24,0x7d,	0xd2,0xbb,0x3d,0xf8
-+.byte	0xae,0xf9,0x32,0x11,	0xc7,0x29,0xa1,0x6d
-+.byte	0x1d,0x9e,0x2f,0x4b,	0xdc,0xb2,0x30,0xf3
-+.byte	0x0d,0x86,0x52,0xec,	0x77,0xc1,0xe3,0xd0
-+.byte	0x2b,0xb3,0x16,0x6c,	0xa9,0x70,0xb9,0x99
-+.byte	0x11,0x94,0x48,0xfa,	0x47,0xe9,0x64,0x22
-+.byte	0xa8,0xfc,0x8c,0xc4,	0xa0,0xf0,0x3f,0x1a
-+.byte	0x56,0x7d,0x2c,0xd8,	0x22,0x33,0x90,0xef
-+.byte	0x87,0x49,0x4e,0xc7,	0xd9,0x38,0xd1,0xc1
-+.byte	0x8c,0xca,0xa2,0xfe,	0x98,0xd4,0x0b,0x36
-+.byte	0xa6,0xf5,0x81,0xcf,	0xa5,0x7a,0xde,0x28
-+.byte	0xda,0xb7,0x8e,0x26,	0x3f,0xad,0xbf,0xa4
-+.byte	0x2c,0x3a,0x9d,0xe4,	0x50,0x78,0x92,0x0d
-+.byte	0x6a,0x5f,0xcc,0x9b,	0x54,0x7e,0x46,0x62
-+.byte	0xf6,0x8d,0x13,0xc2,	0x90,0xd8,0xb8,0xe8
-+.byte	0x2e,0x39,0xf7,0x5e,	0x82,0xc3,0xaf,0xf5
-+.byte	0x9f,0x5d,0x80,0xbe,	0x69,0xd0,0x93,0x7c
-+.byte	0x6f,0xd5,0x2d,0xa9,	0xcf,0x25,0x12,0xb3
-+.byte	0xc8,0xac,0x99,0x3b,	0x10,0x18,0x7d,0xa7
-+.byte	0xe8,0x9c,0x63,0x6e,	0xdb,0x3b,0xbb,0x7b
-+.byte	0xcd,0x26,0x78,0x09,	0x6e,0x59,0x18,0xf4
-+.byte	0xec,0x9a,0xb7,0x01,	0x83,0x4f,0x9a,0xa8
-+.byte	0xe6,0x95,0x6e,0x65,	0xaa,0xff,0xe6,0x7e
-+.byte	0x21,0xbc,0xcf,0x08,	0xef,0x15,0xe8,0xe6
-+.byte	0xba,0xe7,0x9b,0xd9,	0x4a,0x6f,0x36,0xce
-+.byte	0xea,0x9f,0x09,0xd4,	0x29,0xb0,0x7c,0xd6
-+.byte	0x31,0xa4,0xb2,0xaf,	0x2a,0x3f,0x23,0x31
-+.byte	0xc6,0xa5,0x94,0x30,	0x35,0xa2,0x66,0xc0
-+.byte	0x74,0x4e,0xbc,0x37,	0xfc,0x82,0xca,0xa6
-+.byte	0xe0,0x90,0xd0,0xb0,	0x33,0xa7,0xd8,0x15
-+.byte	0xf1,0x04,0x98,0x4a,	0x41,0xec,0xda,0xf7
-+.byte	0x7f,0xcd,0x50,0x0e,	0x17,0x91,0xf6,0x2f
-+.byte	0x76,0x4d,0xd6,0x8d,	0x43,0xef,0xb0,0x4d
-+.byte	0xcc,0xaa,0x4d,0x54,	0xe4,0x96,0x04,0xdf
-+.byte	0x9e,0xd1,0xb5,0xe3,	0x4c,0x6a,0x88,0x1b
-+.byte	0xc1,0x2c,0x1f,0xb8,	0x46,0x65,0x51,0x7f
-+.byte	0x9d,0x5e,0xea,0x04,	0x01,0x8c,0x35,0x5d
-+.byte	0xfa,0x87,0x74,0x73,	0xfb,0x0b,0x41,0x2e
-+.byte	0xb3,0x67,0x1d,0x5a,	0x92,0xdb,0xd2,0x52
-+.byte	0xe9,0x10,0x56,0x33,	0x6d,0xd6,0x47,0x13
-+.byte	0x9a,0xd7,0x61,0x8c,	0x37,0xa1,0x0c,0x7a
-+.byte	0x59,0xf8,0x14,0x8e,	0xeb,0x13,0x3c,0x89
-+.byte	0xce,0xa9,0x27,0xee,	0xb7,0x61,0xc9,0x35
-+.byte	0xe1,0x1c,0xe5,0xed,	0x7a,0x47,0xb1,0x3c
-+.byte	0x9c,0xd2,0xdf,0x59,	0x55,0xf2,0x73,0x3f
-+.byte	0x18,0x14,0xce,0x79,	0x73,0xc7,0x37,0xbf
-+.byte	0x53,0xf7,0xcd,0xea,	0x5f,0xfd,0xaa,0x5b
-+.byte	0xdf,0x3d,0x6f,0x14,	0x78,0x44,0xdb,0x86
-+.byte	0xca,0xaf,0xf3,0x81,	0xb9,0x68,0xc4,0x3e
-+.byte	0x38,0x24,0x34,0x2c,	0xc2,0xa3,0x40,0x5f
-+.byte	0x16,0x1d,0xc3,0x72,	0xbc,0xe2,0x25,0x0c
-+.byte	0x28,0x3c,0x49,0x8b,	0xff,0x0d,0x95,0x41
-+.byte	0x39,0xa8,0x01,0x71,	0x08,0x0c,0xb3,0xde
-+.byte	0xd8,0xb4,0xe4,0x9c,	0x64,0x56,0xc1,0x90
-+.byte	0x7b,0xcb,0x84,0x61,	0xd5,0x32,0xb6,0x70
-+.byte	0x48,0x6c,0x5c,0x74,	0xd0,0xb8,0x57,0x42
-+
-+.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38	# Td4
-+.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
-+.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
-+.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
-+.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
-+.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
-+.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
-+.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
-+.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
-+.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
-+.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
-+.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
-+.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
-+.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
-+.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
-+.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
-+.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
-+.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
-+.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
-+.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
-+.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
-+.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
-+.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
-+.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
-+.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
-+.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
-+.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
-+.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
-+.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
-+.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
-+.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
-+.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-+___
-+
-+foreach (split("\n",$code)) {
-+	s/\`([^\`]*)\`/eval $1/ge;
-+
-+	# made-up _instructions, _xtr, _ins, _ror and _bias, cope
-+	# with byte order dependencies...
-+	if (/^\s+_/) {
-+	    s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
-+
-+	    s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
-+		sprintf("srl\t$1,$2,%d",$big_endian ?	eval($3)
-+					:		eval("24-$3"))/e or
-+	    s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
-+		sprintf("sll\t$1,$2,%d",$big_endian ?	eval($3)
-+					:		eval("24-$3"))/e or
-+	    s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
-+		sprintf("srl\t$1,$2,%d",$big_endian ?	eval($3)
-+					:		eval("$3*-1"))/e or
-+	    s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
-+		sprintf("sll\t$1,$2,%d",$big_endian ?	eval($3)
-+					:		eval("($3-16)&31"))/e;
-+
-+	    s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
-+		sprintf("sll\t$1,$2,$3")/e				or
-+	    s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
-+		sprintf("and\t$1,$2,0xff")/e				or
-+	    s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
-+	}
-+
-+	# convert lwl/lwr and swr/swl to little-endian order
-+	if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
-+	    s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
-+		sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e	or
-+	    s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
-+		sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
-+	}
-+
-+	print $_,"\n";
-+}
-+
-+close STDOUT;
-diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl
-new file mode 100644
-index 0000000..b944a12
---- /dev/null
-+++ b/crypto/bn/asm/mips-mont.pl
-@@ -0,0 +1,426 @@
-+#!/usr/bin/env perl
-+#
-+# ====================================================================
-+# Written by Andy Polyakov <[email protected]> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+# ====================================================================
-+
-+# This module doesn't present direct interest for OpenSSL, because it
-+# doesn't provide better performance for longer keys, at least not on
-+# in-order-execution cores. While 512-bit RSA sign operations can be
-+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
-+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
-+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
-+# verify:-( All comparisons are against bn_mul_mont-free assembler.
-+# The module might be of interest to embedded system developers, as
-+# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
-+# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
-+# code.
-+
-+######################################################################
-+# There is a number of MIPS ABI in use, O32 and N32/64 are most
-+# widely used. Then there is a new contender: NUBI. It appears that if
-+# one picks the latter, it's possible to arrange code in ABI neutral
-+# manner. Therefore let's stick to NUBI register layout:
-+#
-+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
-+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
-+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
-+#
-+# The return value is placed in $a0. Following coding rules facilitate
-+# interoperability:
-+#
-+# - never ever touch $tp, "thread pointer", former $gp;
-+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
-+#   old code];
-+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
-+#
-+# For reference here is register layout for N32/64 MIPS ABIs:
-+#
-+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-+#
-+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
-+
-+if ($flavour =~ /64|n32/i) {
-+	$PTR_ADD="dadd";	# incidentally works even on n32
-+	$PTR_SUB="dsub";	# incidentally works even on n32
-+	$REG_S="sd";
-+	$REG_L="ld";
-+	$SZREG=8;
-+} else {
-+	$PTR_ADD="add";
-+	$PTR_SUB="sub";
-+	$REG_S="sw";
-+	$REG_L="lw";
-+	$SZREG=4;
-+}
-+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
-+#
-+# <[email protected]>
-+#
-+######################################################################
-+
-+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-+open STDOUT,">$output";
-+
-+if ($flavour =~ /64|n32/i) {
-+	$LD="ld";
-+	$ST="sd";
-+	$MULTU="dmultu";
-+	$ADDU="daddu";
-+	$SUBU="dsubu";
-+	$BNSZ=8;
-+} else {
-+	$LD="lw";
-+	$ST="sw";
-+	$MULTU="multu";
-+	$ADDU="addu";
-+	$SUBU="subu";
-+	$BNSZ=4;
-+}
-+
-+# int bn_mul_mont(
-+$rp=$a0;	# BN_ULONG *rp,
-+$ap=$a1;	# const BN_ULONG *ap,
-+$bp=$a2;	# const BN_ULONG *bp,
-+$np=$a3;	# const BN_ULONG *np,
-+$n0=$a4;	# const BN_ULONG *n0,
-+$num=$a5;	# int num);
-+
-+$lo0=$a6;
-+$hi0=$a7;
-+$lo1=$t1;
-+$hi1=$t2;
-+$aj=$s0;
-+$bi=$s1;
-+$nj=$s2;
-+$tp=$s3;
-+$alo=$s4;
-+$ahi=$s5;
-+$nlo=$s6;
-+$nhi=$s7;
-+$tj=$s8;
-+$i=$s9;
-+$j=$s10;
-+$m1=$s11;
-+
-+$FRAMESIZE=14;
-+
-+$code=<<___;
-+.text
-+
-+.set	noat
-+.set	noreorder
-+
-+.align	5
-+.globl	bn_mul_mont
-+.ent	bn_mul_mont
-+bn_mul_mont:
-+___
-+$code.=<<___ if ($flavour =~ /o32/i);
-+	lw	$n0,16($sp)
-+	lw	$num,20($sp)
-+___
-+$code.=<<___;
-+	slt	$at,$num,4
-+	bnez	$at,1f
-+	li	$t0,0
-+	slt	$at,$num,17	# on in-order CPU
-+	bnezl	$at,bn_mul_mont_internal
-+	nop
-+1:	jr	$ra
-+	li	$a0,0
-+.end	bn_mul_mont
-+
-+.align	5
-+.ent	bn_mul_mont_internal
-+bn_mul_mont_internal:
-+	.frame	$fp,$FRAMESIZE*$SZREG,$ra
-+	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
-+	$PTR_SUB $sp,$FRAMESIZE*$SZREG
-+	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
-+	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
-+	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
-+	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
-+	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
-+	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
-+	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
-+	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
-+	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
-+	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
-+	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
-+	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
-+___
-+$code.=<<___;
-+	move	$fp,$sp
-+
-+	.set	reorder
-+	$LD	$n0,0($n0)
-+	$LD	$bi,0($bp)	# bp[0]
-+	$LD	$aj,0($ap)	# ap[0]
-+	$LD	$nj,0($np)	# np[0]
-+
-+	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
-+	sll	$num,`log($BNSZ)/log(2)`
-+	li	$at,-4096
-+	$PTR_SUB $sp,$num
-+	and	$sp,$at
-+
-+	$MULTU	$aj,$bi
-+	$LD	$alo,$BNSZ($ap)
-+	$LD	$nlo,$BNSZ($np)
-+	mflo	$lo0
-+	mfhi	$hi0
-+	$MULTU	$lo0,$n0
-+	mflo	$m1
-+
-+	$MULTU	$alo,$bi
-+	mflo	$alo
-+	mfhi	$ahi
-+
-+	$MULTU	$nj,$m1
-+	mflo	$lo1
-+	mfhi	$hi1
-+	$MULTU	$nlo,$m1
-+	$ADDU	$lo1,$lo0
-+	sltu	$at,$lo1,$lo0
-+	$ADDU	$hi1,$at
-+	mflo	$nlo
-+	mfhi	$nhi
-+
-+	move	$tp,$sp
-+	li	$j,2*$BNSZ
-+.align	4
-+.L1st:
-+	.set	noreorder
-+	$PTR_ADD $aj,$ap,$j
-+	$PTR_ADD $nj,$np,$j
-+	$LD	$aj,($aj)
-+	$LD	$nj,($nj)
-+
-+	$MULTU	$aj,$bi
-+	$ADDU	$lo0,$alo,$hi0
-+	$ADDU	$lo1,$nlo,$hi1
-+	sltu	$at,$lo0,$hi0
-+	sltu	$t0,$lo1,$hi1
-+	$ADDU	$hi0,$ahi,$at
-+	$ADDU	$hi1,$nhi,$t0
-+	mflo	$alo
-+	mfhi	$ahi
-+
-+	$ADDU	$lo1,$lo0
-+	sltu	$at,$lo1,$lo0
-+	$MULTU	$nj,$m1
-+	$ADDU	$hi1,$at
-+	addu	$j,$BNSZ
-+	$ST	$lo1,($tp)
-+	sltu	$t0,$j,$num
-+	mflo	$nlo
-+	mfhi	$nhi
-+
-+	bnez	$t0,.L1st
-+	$PTR_ADD $tp,$BNSZ
-+	.set	reorder
-+
-+	$ADDU	$lo0,$alo,$hi0
-+	sltu	$at,$lo0,$hi0
-+	$ADDU	$hi0,$ahi,$at
-+
-+	$ADDU	$lo1,$nlo,$hi1
-+	sltu	$t0,$lo1,$hi1
-+	$ADDU	$hi1,$nhi,$t0
-+	$ADDU	$lo1,$lo0
-+	sltu	$at,$lo1,$lo0
-+	$ADDU	$hi1,$at
-+
-+	$ST	$lo1,($tp)
-+
-+	$ADDU	$hi1,$hi0
-+	sltu	$at,$hi1,$hi0
-+	$ST	$hi1,$BNSZ($tp)
-+	$ST	$at,2*$BNSZ($tp)
-+
-+	li	$i,$BNSZ
-+.align	4
-+.Louter:
-+	$PTR_ADD $bi,$bp,$i
-+	$LD	$bi,($bi)
-+	$LD	$aj,($ap)
-+	$LD	$alo,$BNSZ($ap)
-+	$LD	$tj,($sp)
-+
-+	$MULTU	$aj,$bi
-+	$LD	$nj,($np)
-+	$LD	$nlo,$BNSZ($np)
-+	mflo	$lo0
-+	mfhi	$hi0
-+	$ADDU	$lo0,$tj
-+	$MULTU	$lo0,$n0
-+	sltu	$at,$lo0,$tj
-+	$ADDU	$hi0,$at
-+	mflo	$m1
-+
-+	$MULTU	$alo,$bi
-+	mflo	$alo
-+	mfhi	$ahi
-+
-+	$MULTU	$nj,$m1
-+	mflo	$lo1
-+	mfhi	$hi1
-+
-+	$MULTU	$nlo,$m1
-+	$ADDU	$lo1,$lo0
-+	sltu	$at,$lo1,$lo0
-+	$ADDU	$hi1,$at
-+	mflo	$nlo
-+	mfhi	$nhi
-+
-+	move	$tp,$sp
-+	li	$j,2*$BNSZ
-+	$LD	$tj,$BNSZ($tp)
-+.align	4
-+.Linner:
-+	.set	noreorder
-+	$PTR_ADD $aj,$ap,$j
-+	$PTR_ADD $nj,$np,$j
-+	$LD	$aj,($aj)
-+	$LD	$nj,($nj)
-+
-+	$MULTU	$aj,$bi
-+	$ADDU	$lo0,$alo,$hi0
-+	$ADDU	$lo1,$nlo,$hi1
-+	sltu	$at,$lo0,$hi0
-+	sltu	$t0,$lo1,$hi1
-+	$ADDU	$hi0,$ahi,$at
-+	$ADDU	$hi1,$nhi,$t0
-+	mflo	$alo
-+	mfhi	$ahi
-+
-+	$ADDU	$lo0,$tj
-+	addu	$j,$BNSZ
-+	$MULTU	$nj,$m1
-+	sltu	$at,$lo0,$tj
-+	$ADDU	$lo1,$lo0
-+	$ADDU	$hi0,$at
-+	sltu	$t0,$lo1,$lo0
-+	$LD	$tj,2*$BNSZ($tp)
-+	$ADDU	$hi1,$t0
-+	sltu	$at,$j,$num
-+	mflo	$nlo
-+	mfhi	$nhi
-+	$ST	$lo1,($tp)
-+	bnez	$at,.Linner
-+	$PTR_ADD $tp,$BNSZ
-+	.set	reorder
-+
-+	$ADDU	$lo0,$alo,$hi0
-+	sltu	$at,$lo0,$hi0
-+	$ADDU	$hi0,$ahi,$at
-+	$ADDU	$lo0,$tj
-+	sltu	$t0,$lo0,$tj
-+	$ADDU	$hi0,$t0
-+
-+	$LD	$tj,2*$BNSZ($tp)
-+	$ADDU	$lo1,$nlo,$hi1
-+	sltu	$at,$lo1,$hi1
-+	$ADDU	$hi1,$nhi,$at
-+	$ADDU	$lo1,$lo0
-+	sltu	$t0,$lo1,$lo0
-+	$ADDU	$hi1,$t0
-+	$ST	$lo1,($tp)
-+
-+	$ADDU	$lo1,$hi1,$hi0
-+	sltu	$hi1,$lo1,$hi0
-+	$ADDU	$lo1,$tj
-+	sltu	$at,$lo1,$tj
-+	$ADDU	$hi1,$at
-+	$ST	$lo1,$BNSZ($tp)
-+	$ST	$hi1,2*$BNSZ($tp)
-+
-+	addu	$i,$BNSZ
-+	sltu	$t0,$i,$num
-+	bnez	$t0,.Louter
-+
-+	.set	noreorder
-+	$PTR_ADD $tj,$sp,$num	# &tp[num]
-+	move	$tp,$sp
-+	move	$ap,$sp
-+	li	$hi0,0		# clear borrow bit
-+
-+.align	4
-+.Lsub:	$LD	$lo0,($tp)
-+	$LD	$lo1,($np)
-+	$PTR_ADD $tp,$BNSZ
-+	$PTR_ADD $np,$BNSZ
-+	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
-+	sgtu	$at,$lo1,$lo0
-+	$SUBU	$lo0,$lo1,$hi0
-+	sgtu	$hi0,$lo0,$lo1
-+	$ST	$lo0,($rp)
-+	or	$hi0,$at
-+	sltu	$at,$tp,$tj
-+	bnez	$at,.Lsub
-+	$PTR_ADD $rp,$BNSZ
-+
-+	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
-+	move	$tp,$sp
-+	$PTR_SUB $rp,$num	# restore rp
-+	not	$hi1,$hi0
-+
-+	and	$ap,$hi0,$sp
-+	and	$bp,$hi1,$rp
-+	or	$ap,$ap,$bp	# ap=borrow?tp:rp
-+
-+.align	4
-+.Lcopy:	$LD	$aj,($ap)
-+	$PTR_ADD $ap,$BNSZ
-+	$ST	$zero,($tp)
-+	$PTR_ADD $tp,$BNSZ
-+	sltu	$at,$tp,$tj
-+	$ST	$aj,($rp)
-+	bnez	$at,.Lcopy
-+	$PTR_ADD $rp,$BNSZ
-+
-+	li	$a0,1
-+	li	$t0,1
-+
-+	.set	noreorder
-+	move	$sp,$fp
-+	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
-+	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
-+	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
-+	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
-+	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
-+	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
-+	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
-+	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
-+	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
-+	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
-+	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
-+	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
-+___
-+$code.=<<___;
-+	jr	$ra
-+	$PTR_ADD $sp,$FRAMESIZE*$SZREG
-+.end	bn_mul_mont_internal
-+.rdata
-+.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
-+___
-+
-+$code =~ s/\`([^\`]*)\`/eval $1/gem;
-+
-+print $code;
-+close STDOUT;
-diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
-new file mode 100644
-index 0000000..f04b3b9
---- /dev/null
-+++ b/crypto/bn/asm/mips.pl
-@@ -0,0 +1,2585 @@
-+#!/usr/bin/env perl
-+#
-+# ====================================================================
-+# Written by Andy Polyakov <[email protected]> for the OpenSSL
-+# project.
-+#
-+# Rights for redistribution and usage in source and binary forms are
-+# granted according to the OpenSSL license. Warranty of any kind is
-+# disclaimed.
-+# ====================================================================
-+
-+
-+# July 1999
-+#
-+# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
-+#
-+# The module is designed to work with either of the "new" MIPS ABI(5),
-+# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
-+# IRIX 5.x not only because it doesn't support new ABIs but also
-+# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
-+# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
-+# cause illegal instruction exception:-(
-+#
-+# In addition the code depends on preprocessor flags set up by MIPSpro
-+# compiler driver (either as or cc) and therefore (probably?) can't be
-+# compiled by the GNU assembler. GNU C driver manages fine though...
-+# I mean as long as -mmips-as is specified or is the default option,
-+# because then it simply invokes /usr/bin/as which in turn takes
-+# perfect care of the preprocessor definitions. Another neat feature
-+# offered by the MIPSpro assembler is an optimization pass. This gave
-+# me the opportunity to have the code looking more regular as all those
-+# architecture dependent instruction rescheduling details were left to
-+# the assembler. Cool, huh?
-+#
-+# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
-+# goes way over 3 times faster!
-+#
-+#					<[email protected]>
-+
-+# October 2010
-+#
-+# Adapt the module even for 32-bit ABIs and other OSes. The former was
-+# achieved by mechanical replacement of 64-bit arithmetic instructions
-+# such as dmultu, daddu, etc. with their 32-bit counterparts and
-+# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
-+# >3x performance improvement naturally does not apply to 32-bit code
-+# [because there is no instruction 32-bit compiler can't use], one
-+# has to content with 40-85% improvement depending on benchmark and
-+# key length, more for longer keys.
-+
-+$flavour = shift;
-+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-+open STDOUT,">$output";
-+
-+if ($flavour =~ /64|n32/i) {
-+	$LD="ld";
-+	$ST="sd";
-+	$MULTU="dmultu";
-+	$DIVU="ddivu";
-+	$ADDU="daddu";
-+	$SUBU="dsubu";
-+	$SRL="dsrl";
-+	$SLL="dsll";
-+	$BNSZ=8;
-+	$PTR_ADD="daddu";
-+	$PTR_SUB="dsubu";
-+	$SZREG=8;
-+	$REG_S="sd";
-+	$REG_L="ld";
-+} else {
-+	$LD="lw";
-+	$ST="sw";
-+	$MULTU="multu";
-+	$DIVU="divu";
-+	$ADDU="addu";
-+	$SUBU="subu";
-+	$SRL="srl";
-+	$SLL="sll";
-+	$BNSZ=4;
-+	$PTR_ADD="addu";
-+	$PTR_SUB="subu";
-+	$SZREG=4;
-+	$REG_S="sw";
-+	$REG_L="lw";
-+	$code=".set	mips2\n";
-+}
-+
-+# Below is N32/64 register layout used in the original module.
-+#
-+($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-+($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-+($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
-+#
-+# No special adaptation is required for O32. NUBI on the other hand
-+# is treated by saving/restoring ($v1,$t0..$t3).
-+
-+$gp=$v1 if ($flavour =~ /nubi/i);
-+
-+$minus4=$v1;
-+
-+$code.=<<___;
-+.rdata
-+.asciiz	"mips3.s, Version 1.2"
-+.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
-+
-+.text
-+.set	noat
-+
-+.align	5
-+.globl	bn_mul_add_words
-+.ent	bn_mul_add_words
-+bn_mul_add_words:
-+	.set	noreorder
-+	bgtz	$a2,bn_mul_add_words_internal
-+	move	$v0,$zero
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_mul_add_words
-+
-+.align	5
-+.ent	bn_mul_add_words_internal
-+bn_mul_add_words_internal:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	li	$minus4,-4
-+	and	$ta0,$a2,$minus4
-+	$LD	$t0,0($a1)
-+	beqz	$ta0,.L_bn_mul_add_words_tail
-+
-+.L_bn_mul_add_words_loop:
-+	$MULTU	$t0,$a3
-+	$LD	$t1,0($a0)
-+	$LD	$t2,$BNSZ($a1)
-+	$LD	$t3,$BNSZ($a0)
-+	$LD	$ta0,2*$BNSZ($a1)
-+	$LD	$ta1,2*$BNSZ($a0)
-+	$ADDU	$t1,$v0
-+	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
-+				# values", but it seems to work fine
-+				# even on 64-bit registers.
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$t1,$at
-+	$ADDU	$v0,$t0
-+	 $MULTU	$t2,$a3
-+	sltu	$at,$t1,$at
-+	$ST	$t1,0($a0)
-+	$ADDU	$v0,$at
-+
-+	$LD	$ta2,3*$BNSZ($a1)
-+	$LD	$ta3,3*$BNSZ($a0)
-+	$ADDU	$t3,$v0
-+	sltu	$v0,$t3,$v0
-+	mflo	$at
-+	mfhi	$t2
-+	$ADDU	$t3,$at
-+	$ADDU	$v0,$t2
-+	 $MULTU	$ta0,$a3
-+	sltu	$at,$t3,$at
-+	$ST	$t3,$BNSZ($a0)
-+	$ADDU	$v0,$at
-+
-+	subu	$a2,4
-+	$PTR_ADD $a0,4*$BNSZ
-+	$PTR_ADD $a1,4*$BNSZ
-+	$ADDU	$ta1,$v0
-+	sltu	$v0,$ta1,$v0
-+	mflo	$at
-+	mfhi	$ta0
-+	$ADDU	$ta1,$at
-+	$ADDU	$v0,$ta0
-+	 $MULTU	$ta2,$a3
-+	sltu	$at,$ta1,$at
-+	$ST	$ta1,-2*$BNSZ($a0)
-+	$ADDU	$v0,$at
-+
-+
-+	and	$ta0,$a2,$minus4
-+	$ADDU	$ta3,$v0
-+	sltu	$v0,$ta3,$v0
-+	mflo	$at
-+	mfhi	$ta2
-+	$ADDU	$ta3,$at
-+	$ADDU	$v0,$ta2
-+	sltu	$at,$ta3,$at
-+	$ST	$ta3,-$BNSZ($a0)
-+	$ADDU	$v0,$at
-+	.set	noreorder
-+	bgtzl	$ta0,.L_bn_mul_add_words_loop
-+	$LD	$t0,0($a1)
-+
-+	beqz	$a2,.L_bn_mul_add_words_return
-+	nop
-+
-+.L_bn_mul_add_words_tail:
-+	.set	reorder
-+	$LD	$t0,0($a1)
-+	$MULTU	$t0,$a3
-+	$LD	$t1,0($a0)
-+	subu	$a2,1
-+	$ADDU	$t1,$v0
-+	sltu	$v0,$t1,$v0
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$t1,$at
-+	$ADDU	$v0,$t0
-+	sltu	$at,$t1,$at
-+	$ST	$t1,0($a0)
-+	$ADDU	$v0,$at
-+	beqz	$a2,.L_bn_mul_add_words_return
-+
-+	$LD	$t0,$BNSZ($a1)
-+	$MULTU	$t0,$a3
-+	$LD	$t1,$BNSZ($a0)
-+	subu	$a2,1
-+	$ADDU	$t1,$v0
-+	sltu	$v0,$t1,$v0
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$t1,$at
-+	$ADDU	$v0,$t0
-+	sltu	$at,$t1,$at
-+	$ST	$t1,$BNSZ($a0)
-+	$ADDU	$v0,$at
-+	beqz	$a2,.L_bn_mul_add_words_return
-+
-+	$LD	$t0,2*$BNSZ($a1)
-+	$MULTU	$t0,$a3
-+	$LD	$t1,2*$BNSZ($a0)
-+	$ADDU	$t1,$v0
-+	sltu	$v0,$t1,$v0
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$t1,$at
-+	$ADDU	$v0,$t0
-+	sltu	$at,$t1,$at
-+	$ST	$t1,2*$BNSZ($a0)
-+	$ADDU	$v0,$at
-+
-+.L_bn_mul_add_words_return:
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_mul_add_words_internal
-+
-+.align	5
-+.globl	bn_mul_words
-+.ent	bn_mul_words
-+bn_mul_words:
-+	.set	noreorder
-+	bgtz	$a2,bn_mul_words_internal
-+	move	$v0,$zero
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_mul_words
-+
-+.align	5
-+.ent	bn_mul_words_internal
-+bn_mul_words_internal:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	li	$minus4,-4
-+	and	$ta0,$a2,$minus4
-+	$LD	$t0,0($a1)
-+	beqz	$ta0,.L_bn_mul_words_tail
-+
-+.L_bn_mul_words_loop:
-+	$MULTU	$t0,$a3
-+	$LD	$t2,$BNSZ($a1)
-+	$LD	$ta0,2*$BNSZ($a1)
-+	$LD	$ta2,3*$BNSZ($a1)
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$v0,$at
-+	sltu	$t1,$v0,$at
-+	 $MULTU	$t2,$a3
-+	$ST	$v0,0($a0)
-+	$ADDU	$v0,$t1,$t0
-+
-+	subu	$a2,4
-+	$PTR_ADD $a0,4*$BNSZ
-+	$PTR_ADD $a1,4*$BNSZ
-+	mflo	$at
-+	mfhi	$t2
-+	$ADDU	$v0,$at
-+	sltu	$t3,$v0,$at
-+	 $MULTU	$ta0,$a3
-+	$ST	$v0,-3*$BNSZ($a0)
-+	$ADDU	$v0,$t3,$t2
-+
-+	mflo	$at
-+	mfhi	$ta0
-+	$ADDU	$v0,$at
-+	sltu	$ta1,$v0,$at
-+	 $MULTU	$ta2,$a3
-+	$ST	$v0,-2*$BNSZ($a0)
-+	$ADDU	$v0,$ta1,$ta0
-+
-+	and	$ta0,$a2,$minus4
-+	mflo	$at
-+	mfhi	$ta2
-+	$ADDU	$v0,$at
-+	sltu	$ta3,$v0,$at
-+	$ST	$v0,-$BNSZ($a0)
-+	$ADDU	$v0,$ta3,$ta2
-+	.set	noreorder
-+	bgtzl	$ta0,.L_bn_mul_words_loop
-+	$LD	$t0,0($a1)
-+
-+	beqz	$a2,.L_bn_mul_words_return
-+	nop
-+
-+.L_bn_mul_words_tail:
-+	.set	reorder
-+	$LD	$t0,0($a1)
-+	$MULTU	$t0,$a3
-+	subu	$a2,1
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$v0,$at
-+	sltu	$t1,$v0,$at
-+	$ST	$v0,0($a0)
-+	$ADDU	$v0,$t1,$t0
-+	beqz	$a2,.L_bn_mul_words_return
-+
-+	$LD	$t0,$BNSZ($a1)
-+	$MULTU	$t0,$a3
-+	subu	$a2,1
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$v0,$at
-+	sltu	$t1,$v0,$at
-+	$ST	$v0,$BNSZ($a0)
-+	$ADDU	$v0,$t1,$t0
-+	beqz	$a2,.L_bn_mul_words_return
-+
-+	$LD	$t0,2*$BNSZ($a1)
-+	$MULTU	$t0,$a3
-+	mflo	$at
-+	mfhi	$t0
-+	$ADDU	$v0,$at
-+	sltu	$t1,$v0,$at
-+	$ST	$v0,2*$BNSZ($a0)
-+	$ADDU	$v0,$t1,$t0
-+
-+.L_bn_mul_words_return:
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_mul_words_internal
-+
-+.align	5
-+.globl	bn_sqr_words
-+.ent	bn_sqr_words
-+bn_sqr_words:
-+	.set	noreorder
-+	bgtz	$a2,bn_sqr_words_internal
-+	move	$v0,$zero
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_sqr_words
-+
-+.align	5
-+.ent	bn_sqr_words_internal
-+bn_sqr_words_internal:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	li	$minus4,-4
-+	and	$ta0,$a2,$minus4
-+	$LD	$t0,0($a1)
-+	beqz	$ta0,.L_bn_sqr_words_tail
-+
-+.L_bn_sqr_words_loop:
-+	$MULTU	$t0,$t0
-+	$LD	$t2,$BNSZ($a1)
-+	$LD	$ta0,2*$BNSZ($a1)
-+	$LD	$ta2,3*$BNSZ($a1)
-+	mflo	$t1
-+	mfhi	$t0
-+	$ST	$t1,0($a0)
-+	$ST	$t0,$BNSZ($a0)
-+
-+	$MULTU	$t2,$t2
-+	subu	$a2,4
-+	$PTR_ADD $a0,8*$BNSZ
-+	$PTR_ADD $a1,4*$BNSZ
-+	mflo	$t3
-+	mfhi	$t2
-+	$ST	$t3,-6*$BNSZ($a0)
-+	$ST	$t2,-5*$BNSZ($a0)
-+
-+	$MULTU	$ta0,$ta0
-+	mflo	$ta1
-+	mfhi	$ta0
-+	$ST	$ta1,-4*$BNSZ($a0)
-+	$ST	$ta0,-3*$BNSZ($a0)
-+
-+
-+	$MULTU	$ta2,$ta2
-+	and	$ta0,$a2,$minus4
-+	mflo	$ta3
-+	mfhi	$ta2
-+	$ST	$ta3,-2*$BNSZ($a0)
-+	$ST	$ta2,-$BNSZ($a0)
-+
-+	.set	noreorder
-+	bgtzl	$ta0,.L_bn_sqr_words_loop
-+	$LD	$t0,0($a1)
-+
-+	beqz	$a2,.L_bn_sqr_words_return
-+	nop
-+
-+.L_bn_sqr_words_tail:
-+	.set	reorder
-+	$LD	$t0,0($a1)
-+	$MULTU	$t0,$t0
-+	subu	$a2,1
-+	mflo	$t1
-+	mfhi	$t0
-+	$ST	$t1,0($a0)
-+	$ST	$t0,$BNSZ($a0)
-+	beqz	$a2,.L_bn_sqr_words_return
-+
-+	$LD	$t0,$BNSZ($a1)
-+	$MULTU	$t0,$t0
-+	subu	$a2,1
-+	mflo	$t1
-+	mfhi	$t0
-+	$ST	$t1,2*$BNSZ($a0)
-+	$ST	$t0,3*$BNSZ($a0)
-+	beqz	$a2,.L_bn_sqr_words_return
-+
-+	$LD	$t0,2*$BNSZ($a1)
-+	$MULTU	$t0,$t0
-+	mflo	$t1
-+	mfhi	$t0
-+	$ST	$t1,4*$BNSZ($a0)
-+	$ST	$t0,5*$BNSZ($a0)
-+
-+.L_bn_sqr_words_return:
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	move	$a0,$v0
-+
-+.end	bn_sqr_words_internal
-+
-+.align	5
-+.globl	bn_add_words
-+.ent	bn_add_words
-+bn_add_words:
-+	.set	noreorder
-+	bgtz	$a3,bn_add_words_internal
-+	move	$v0,$zero
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_add_words
-+
-+.align	5
-+.ent	bn_add_words_internal
-+bn_add_words_internal:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	li	$minus4,-4
-+	and	$at,$a3,$minus4
-+	$LD	$t0,0($a1)
-+	beqz	$at,.L_bn_add_words_tail
-+
-+.L_bn_add_words_loop:
-+	$LD	$ta0,0($a2)
-+	subu	$a3,4
-+	$LD	$t1,$BNSZ($a1)
-+	and	$at,$a3,$minus4
-+	$LD	$t2,2*$BNSZ($a1)
-+	$PTR_ADD $a2,4*$BNSZ
-+	$LD	$t3,3*$BNSZ($a1)
-+	$PTR_ADD $a0,4*$BNSZ
-+	$LD	$ta1,-3*$BNSZ($a2)
-+	$PTR_ADD $a1,4*$BNSZ
-+	$LD	$ta2,-2*$BNSZ($a2)
-+	$LD	$ta3,-$BNSZ($a2)
-+	$ADDU	$ta0,$t0
-+	sltu	$t8,$ta0,$t0
-+	$ADDU	$t0,$ta0,$v0
-+	sltu	$v0,$t0,$ta0
-+	$ST	$t0,-4*$BNSZ($a0)
-+	$ADDU	$v0,$t8
-+
-+	$ADDU	$ta1,$t1
-+	sltu	$t9,$ta1,$t1
-+	$ADDU	$t1,$ta1,$v0
-+	sltu	$v0,$t1,$ta1
-+	$ST	$t1,-3*$BNSZ($a0)
-+	$ADDU	$v0,$t9
-+
-+	$ADDU	$ta2,$t2
-+	sltu	$t8,$ta2,$t2
-+	$ADDU	$t2,$ta2,$v0
-+	sltu	$v0,$t2,$ta2
-+	$ST	$t2,-2*$BNSZ($a0)
-+	$ADDU	$v0,$t8
-+
-+	$ADDU	$ta3,$t3
-+	sltu	$t9,$ta3,$t3
-+	$ADDU	$t3,$ta3,$v0
-+	sltu	$v0,$t3,$ta3
-+	$ST	$t3,-$BNSZ($a0)
-+	$ADDU	$v0,$t9
-+
-+	.set	noreorder
-+	bgtzl	$at,.L_bn_add_words_loop
-+	$LD	$t0,0($a1)
-+
-+	beqz	$a3,.L_bn_add_words_return
-+	nop
-+
-+.L_bn_add_words_tail:
-+	.set	reorder
-+	$LD	$t0,0($a1)
-+	$LD	$ta0,0($a2)
-+	$ADDU	$ta0,$t0
-+	subu	$a3,1
-+	sltu	$t8,$ta0,$t0
-+	$ADDU	$t0,$ta0,$v0
-+	sltu	$v0,$t0,$ta0
-+	$ST	$t0,0($a0)
-+	$ADDU	$v0,$t8
-+	beqz	$a3,.L_bn_add_words_return
-+
-+	$LD	$t1,$BNSZ($a1)
-+	$LD	$ta1,$BNSZ($a2)
-+	$ADDU	$ta1,$t1
-+	subu	$a3,1
-+	sltu	$t9,$ta1,$t1
-+	$ADDU	$t1,$ta1,$v0
-+	sltu	$v0,$t1,$ta1
-+	$ST	$t1,$BNSZ($a0)
-+	$ADDU	$v0,$t9
-+	beqz	$a3,.L_bn_add_words_return
-+
-+	$LD	$t2,2*$BNSZ($a1)
-+	$LD	$ta2,2*$BNSZ($a2)
-+	$ADDU	$ta2,$t2
-+	sltu	$t8,$ta2,$t2
-+	$ADDU	$t2,$ta2,$v0
-+	sltu	$v0,$t2,$ta2
-+	$ST	$t2,2*$BNSZ($a0)
-+	$ADDU	$v0,$t8
-+
-+.L_bn_add_words_return:
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	move	$a0,$v0
-+
-+.end	bn_add_words_internal
-+
-+.align	5
-+.globl	bn_sub_words
-+.ent	bn_sub_words
-+bn_sub_words:
-+	.set	noreorder
-+	bgtz	$a3,bn_sub_words_internal
-+	move	$v0,$zero
-+	jr	$ra
-+	move	$a0,$zero
-+.end	bn_sub_words
-+
-+.align	5
-+.ent	bn_sub_words_internal
-+bn_sub_words_internal:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	li	$minus4,-4
-+	and	$at,$a3,$minus4
-+	$LD	$t0,0($a1)
-+	beqz	$at,.L_bn_sub_words_tail
-+
-+.L_bn_sub_words_loop:
-+	$LD	$ta0,0($a2)
-+	subu	$a3,4
-+	$LD	$t1,$BNSZ($a1)
-+	and	$at,$a3,$minus4
-+	$LD	$t2,2*$BNSZ($a1)
-+	$PTR_ADD $a2,4*$BNSZ
-+	$LD	$t3,3*$BNSZ($a1)
-+	$PTR_ADD $a0,4*$BNSZ
-+	$LD	$ta1,-3*$BNSZ($a2)
-+	$PTR_ADD $a1,4*$BNSZ
-+	$LD	$ta2,-2*$BNSZ($a2)
-+	$LD	$ta3,-$BNSZ($a2)
-+	sltu	$t8,$t0,$ta0
-+	$SUBU	$ta0,$t0,$ta0
-+	$SUBU	$t0,$ta0,$v0
-+	sgtu	$v0,$t0,$ta0
-+	$ST	$t0,-4*$BNSZ($a0)
-+	$ADDU	$v0,$t8
-+
-+	sltu	$t9,$t1,$ta1
-+	$SUBU	$ta1,$t1,$ta1
-+	$SUBU	$t1,$ta1,$v0
-+	sgtu	$v0,$t1,$ta1
-+	$ST	$t1,-3*$BNSZ($a0)
-+	$ADDU	$v0,$t9
-+
-+
-+	sltu	$t8,$t2,$ta2
-+	$SUBU	$ta2,$t2,$ta2
-+	$SUBU	$t2,$ta2,$v0
-+	sgtu	$v0,$t2,$ta2
-+	$ST	$t2,-2*$BNSZ($a0)
-+	$ADDU	$v0,$t8
-+
-+	sltu	$t9,$t3,$ta3
-+	$SUBU	$ta3,$t3,$ta3
-+	$SUBU	$t3,$ta3,$v0
-+	sgtu	$v0,$t3,$ta3
-+	$ST	$t3,-$BNSZ($a0)
-+	$ADDU	$v0,$t9
-+
-+	.set	noreorder
-+	bgtzl	$at,.L_bn_sub_words_loop
-+	$LD	$t0,0($a1)
-+
-+	beqz	$a3,.L_bn_sub_words_return
-+	nop
-+
-+.L_bn_sub_words_tail:
-+	.set	reorder
-+	$LD	$t0,0($a1)
-+	$LD	$ta0,0($a2)
-+	subu	$a3,1
-+	sltu	$t8,$t0,$ta0
-+	$SUBU	$ta0,$t0,$ta0
-+	$SUBU	$t0,$ta0,$v0
-+	sgtu	$v0,$t0,$ta0
-+	$ST	$t0,0($a0)
-+	$ADDU	$v0,$t8
-+	beqz	$a3,.L_bn_sub_words_return
-+
-+	$LD	$t1,$BNSZ($a1)
-+	subu	$a3,1
-+	$LD	$ta1,$BNSZ($a2)
-+	sltu	$t9,$t1,$ta1
-+	$SUBU	$ta1,$t1,$ta1
-+	$SUBU	$t1,$ta1,$v0
-+	sgtu	$v0,$t1,$ta1
-+	$ST	$t1,$BNSZ($a0)
-+	$ADDU	$v0,$t9
-+	beqz	$a3,.L_bn_sub_words_return
-+
-+	$LD	$t2,2*$BNSZ($a1)
-+	$LD	$ta2,2*$BNSZ($a2)
-+	sltu	$t8,$t2,$ta2
-+	$SUBU	$ta2,$t2,$ta2
-+	$SUBU	$t2,$ta2,$v0
-+	sgtu	$v0,$t2,$ta2
-+	$ST	$t2,2*$BNSZ($a0)
-+	$ADDU	$v0,$t8
-+
-+.L_bn_sub_words_return:
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_sub_words_internal
-+
-+.align 5
-+.globl	bn_div_3_words
-+.ent	bn_div_3_words
-+bn_div_3_words:
-+	.set	noreorder
-+	move	$a3,$a0		# we know that bn_div_words does not
-+				# touch $a3, $ta2, $ta3 and preserves $a2
-+				# so that we can save two arguments
-+				# and return address in registers
-+				# instead of stack:-)
-+
-+	$LD	$a0,($a3)
-+	move	$ta2,$a1
-+	bne	$a0,$a2,bn_div_3_words_internal
-+	$LD	$a1,-$BNSZ($a3)
-+	li	$v0,-1
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_div_3_words
-+
-+.align	5
-+.ent	bn_div_3_words_internal
-+bn_div_3_words_internal:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	move	$ta3,$ra
-+	bal	bn_div_words
-+	move	$ra,$ta3
-+	$MULTU	$ta2,$v0
-+	$LD	$t2,-2*$BNSZ($a3)
-+	move	$ta0,$zero
-+	mfhi	$t1
-+	mflo	$t0
-+	sltu	$t8,$t1,$a1
-+.L_bn_div_3_words_inner_loop:
-+	bnez	$t8,.L_bn_div_3_words_inner_loop_done
-+	sgeu	$at,$t2,$t0
-+	seq	$t9,$t1,$a1
-+	and	$at,$t9
-+	sltu	$t3,$t0,$ta2
-+	$ADDU	$a1,$a2
-+	$SUBU	$t1,$t3
-+	$SUBU	$t0,$ta2
-+	sltu	$t8,$t1,$a1
-+	sltu	$ta0,$a1,$a2
-+	or	$t8,$ta0
-+	.set	noreorder
-+	beqzl	$at,.L_bn_div_3_words_inner_loop
-+	$SUBU	$v0,1
-+	.set	reorder
-+.L_bn_div_3_words_inner_loop_done:
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_div_3_words_internal
-+
-+.align	5
-+.globl	bn_div_words
-+.ent	bn_div_words
-+bn_div_words:
-+	.set	noreorder
-+	bnez	$a2,bn_div_words_internal
-+	li	$v0,-1		# I would rather signal div-by-zero
-+				# which can be done with 'break 7'
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_div_words
-+
-+.align	5
-+.ent	bn_div_words_internal
-+bn_div_words_internal:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	move	$v1,$zero
-+	bltz	$a2,.L_bn_div_words_body
-+	move	$t9,$v1
-+	$SLL	$a2,1
-+	bgtz	$a2,.-4
-+	addu	$t9,1
-+
-+	.set	reorder
-+	negu	$t1,$t9
-+	li	$t2,-1
-+	$SLL	$t2,$t1
-+	and	$t2,$a0
-+	$SRL	$at,$a1,$t1
-+	.set	noreorder
-+	bnezl	$t2,.+8
-+	break	6		# signal overflow
-+	.set	reorder
-+	$SLL	$a0,$t9
-+	$SLL	$a1,$t9
-+	or	$a0,$at
-+___
-+$QT=$ta0;
-+$HH=$ta1;
-+$DH=$v1;
-+$code.=<<___;
-+.L_bn_div_words_body:
-+	$SRL	$DH,$a2,4*$BNSZ	# bits
-+	sgeu	$at,$a0,$a2
-+	.set	noreorder
-+	bnezl	$at,.+8
-+	$SUBU	$a0,$a2
-+	.set	reorder
-+
-+	li	$QT,-1
-+	$SRL	$HH,$a0,4*$BNSZ	# bits
-+	$SRL	$QT,4*$BNSZ	# q=0xffffffff
-+	beq	$DH,$HH,.L_bn_div_words_skip_div1
-+	$DIVU	$zero,$a0,$DH
-+	mflo	$QT
-+.L_bn_div_words_skip_div1:
-+	$MULTU	$a2,$QT
-+	$SLL	$t3,$a0,4*$BNSZ	# bits
-+	$SRL	$at,$a1,4*$BNSZ	# bits
-+	or	$t3,$at
-+	mflo	$t0
-+	mfhi	$t1
-+.L_bn_div_words_inner_loop1:
-+	sltu	$t2,$t3,$t0
-+	seq	$t8,$HH,$t1
-+	sltu	$at,$HH,$t1
-+	and	$t2,$t8
-+	sltu	$v0,$t0,$a2
-+	or	$at,$t2
-+	.set	noreorder
-+	beqz	$at,.L_bn_div_words_inner_loop1_done
-+	$SUBU	$t1,$v0
-+	$SUBU	$t0,$a2
-+	b	.L_bn_div_words_inner_loop1
-+	$SUBU	$QT,1
-+	.set	reorder
-+.L_bn_div_words_inner_loop1_done:
-+
-+	$SLL	$a1,4*$BNSZ	# bits
-+	$SUBU	$a0,$t3,$t0
-+	$SLL	$v0,$QT,4*$BNSZ	# bits
-+
-+	li	$QT,-1
-+	$SRL	$HH,$a0,4*$BNSZ	# bits
-+	$SRL	$QT,4*$BNSZ	# q=0xffffffff
-+	beq	$DH,$HH,.L_bn_div_words_skip_div2
-+	$DIVU	$zero,$a0,$DH
-+	mflo	$QT
-+.L_bn_div_words_skip_div2:
-+	$MULTU	$a2,$QT
-+	$SLL	$t3,$a0,4*$BNSZ	# bits
-+	$SRL	$at,$a1,4*$BNSZ	# bits
-+	or	$t3,$at
-+	mflo	$t0
-+	mfhi	$t1
-+.L_bn_div_words_inner_loop2:
-+	sltu	$t2,$t3,$t0
-+	seq	$t8,$HH,$t1
-+	sltu	$at,$HH,$t1
-+	and	$t2,$t8
-+	sltu	$v1,$t0,$a2
-+	or	$at,$t2
-+	.set	noreorder
-+	beqz	$at,.L_bn_div_words_inner_loop2_done
-+	$SUBU	$t1,$v1
-+	$SUBU	$t0,$a2
-+	b	.L_bn_div_words_inner_loop2
-+	$SUBU	$QT,1
-+	.set	reorder
-+.L_bn_div_words_inner_loop2_done:
-+
-+	$SUBU	$a0,$t3,$t0
-+	or	$v0,$QT
-+	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
-+	$SRL	$a2,$t9		# restore $a2
-+
-+	.set	noreorder
-+	move	$a1,$v1
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	move	$a0,$v0
-+.end	bn_div_words_internal
-+___
-+undef $HH; undef $QT; undef $DH;
-+
-+($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
-+($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
-+
-+($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
-+($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
-+
-+($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
-+
-+$code.=<<___;
-+
-+.align	5
-+.globl	bn_mul_comba8
-+.ent	bn_mul_comba8
-+bn_mul_comba8:
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,12*$SZREG,$ra
-+	.mask	0x803ff008,-$SZREG
-+	$PTR_SUB $sp,12*$SZREG
-+	$REG_S	$ra,11*$SZREG($sp)
-+	$REG_S	$s5,10*$SZREG($sp)
-+	$REG_S	$s4,9*$SZREG($sp)
-+	$REG_S	$s3,8*$SZREG($sp)
-+	$REG_S	$s2,7*$SZREG($sp)
-+	$REG_S	$s1,6*$SZREG($sp)
-+	$REG_S	$s0,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour !~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x003f0000,-$SZREG
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$s5,5*$SZREG($sp)
-+	$REG_S	$s4,4*$SZREG($sp)
-+	$REG_S	$s3,3*$SZREG($sp)
-+	$REG_S	$s2,2*$SZREG($sp)
-+	$REG_S	$s1,1*$SZREG($sp)
-+	$REG_S	$s0,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+
-+	.set	reorder
-+	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
-+				# R5000 box assembler barks on this
-+				# 1ine with "should not have mult/div
-+				# as last instruction in bb (R10K
-+				# bug)" warning. If anybody out there
-+				# has a clue about how to circumvent
-+				# this do send me a note.
-+				#		<appro\@fy.chalmers.se>
-+
-+	$LD	$b_0,0($a2)
-+	$LD	$a_1,$BNSZ($a1)
-+	$LD	$a_2,2*$BNSZ($a1)
-+	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
-+	$LD	$a_3,3*$BNSZ($a1)
-+	$LD	$b_1,$BNSZ($a2)
-+	$LD	$b_2,2*$BNSZ($a2)
-+	$LD	$b_3,3*$BNSZ($a2)
-+	mflo	$c_1
-+	mfhi	$c_2
-+
-+	$LD	$a_4,4*$BNSZ($a1)
-+	$LD	$a_5,5*$BNSZ($a1)
-+	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
-+	$LD	$a_6,6*$BNSZ($a1)
-+	$LD	$a_7,7*$BNSZ($a1)
-+	$LD	$b_4,4*$BNSZ($a2)
-+	$LD	$b_5,5*$BNSZ($a2)
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
-+	$ADDU	$c_3,$t_2,$at
-+	$LD	$b_6,6*$BNSZ($a2)
-+	$LD	$b_7,7*$BNSZ($a2)
-+	$ST	$c_1,0($a0)	# r[0]=c1;
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$c_1,$c_3,$t_2
-+	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$c_2,$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$c_3,$c_2,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$c_1,$c_3,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$c_2,$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$c_3,$c_2,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$c_1,$c_3,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$c_2,$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$c_3,$c_2,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$c_1,$c_3,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$c_2,$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$c_3,$c_2,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$c_1,$c_3,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
-+	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;
-+
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$s5,10*$SZREG($sp)
-+	$REG_L	$s4,9*$SZREG($sp)
-+	$REG_L	$s3,8*$SZREG($sp)
-+	$REG_L	$s2,7*$SZREG($sp)
-+	$REG_L	$s1,6*$SZREG($sp)
-+	$REG_L	$s0,5*$SZREG($sp)
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	jr	$ra
-+	$PTR_ADD $sp,12*$SZREG
-+___
-+$code.=<<___ if ($flavour !~ /nubi/i);
-+	$REG_L	$s5,5*$SZREG($sp)
-+	$REG_L	$s4,4*$SZREG($sp)
-+	$REG_L	$s3,3*$SZREG($sp)
-+	$REG_L	$s2,2*$SZREG($sp)
-+	$REG_L	$s1,1*$SZREG($sp)
-+	$REG_L	$s0,0*$SZREG($sp)
-+	jr	$ra
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+.end	bn_mul_comba8
-+
-+.align	5
-+.globl	bn_mul_comba4
-+.ent	bn_mul_comba4
-+bn_mul_comba4:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	$LD	$a_0,0($a1)
-+	$LD	$b_0,0($a2)
-+	$LD	$a_1,$BNSZ($a1)
-+	$LD	$a_2,2*$BNSZ($a1)
-+	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
-+	$LD	$a_3,3*$BNSZ($a1)
-+	$LD	$b_1,$BNSZ($a2)
-+	$LD	$b_2,2*$BNSZ($a2)
-+	$LD	$b_3,3*$BNSZ($a2)
-+	mflo	$c_1
-+	mfhi	$c_2
-+	$ST	$c_1,0($a0)
-+
-+	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
-+	$ADDU	$c_3,$t_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$c_1,$c_3,$t_2
-+	$ST	$c_2,$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$c_2,$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,2*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$c_3,$c_2,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,3*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$c_1,$c_3,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,4*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$c_2,$c_1,$t_2
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,5*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	$ST	$c_1,6*$BNSZ($a0)
-+	$ST	$c_2,7*$BNSZ($a0)
-+
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	nop
-+.end	bn_mul_comba4
-+___
-+
-+($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
-+
-+$code.=<<___;
-+
-+.align	5
-+.globl	bn_sqr_comba8
-+.ent	bn_sqr_comba8
-+bn_sqr_comba8:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	$LD	$a_0,0($a1)
-+	$LD	$a_1,$BNSZ($a1)
-+	$LD	$a_2,2*$BNSZ($a1)
-+	$LD	$a_3,3*$BNSZ($a1)
-+
-+	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
-+	$LD	$a_4,4*$BNSZ($a1)
-+	$LD	$a_5,5*$BNSZ($a1)
-+	$LD	$a_6,6*$BNSZ($a1)
-+	$LD	$a_7,7*$BNSZ($a1)
-+	mflo	$c_1
-+	mfhi	$c_2
-+	$ST	$c_1,0($a0)
-+
-+	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_1,$t_2,$zero
-+	$SLL	$t_2,1
-+	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$c_3,$t_2,$at
-+	$ST	$c_2,$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_2,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,2*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_3,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_3,$at
-+	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,3*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_1,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_1,$at
-+	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,4*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_2,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_2,$at
-+	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
-+	$ADDU	$c_2,$at
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,5*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_3,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_3,$at
-+	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_3,$at
-+	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	 $MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,6*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_1,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_1,$at
-+	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_1,$at
-+	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_1,$at
-+	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,7*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_2,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_2,$at
-+	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_2,$at
-+	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,8*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_3,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_3,$at
-+	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_3,$at
-+	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,9*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_1,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_1,$at
-+	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,10*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_2,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_2,$at
-+	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,11*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_3,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	 $MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,12*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_1,$t_2,$zero
-+	$SLL	$t_2,1
-+	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,13*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	$ST	$c_3,14*$BNSZ($a0)
-+	$ST	$c_1,15*$BNSZ($a0)
-+
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	nop
-+.end	bn_sqr_comba8
-+
-+.align	5
-+.globl	bn_sqr_comba4
-+.ent	bn_sqr_comba4
-+bn_sqr_comba4:
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	.frame	$sp,6*$SZREG,$ra
-+	.mask	0x8000f008,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,6*$SZREG
-+	$REG_S	$ra,5*$SZREG($sp)
-+	$REG_S	$t3,4*$SZREG($sp)
-+	$REG_S	$t2,3*$SZREG($sp)
-+	$REG_S	$t1,2*$SZREG($sp)
-+	$REG_S	$t0,1*$SZREG($sp)
-+	$REG_S	$gp,0*$SZREG($sp)
-+___
-+$code.=<<___;
-+	.set	reorder
-+	$LD	$a_0,0($a1)
-+	$LD	$a_1,$BNSZ($a1)
-+	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
-+	$LD	$a_2,2*$BNSZ($a1)
-+	$LD	$a_3,3*$BNSZ($a1)
-+	mflo	$c_1
-+	mfhi	$c_2
-+	$ST	$c_1,0($a0)
-+
-+	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_1,$t_2,$zero
-+	$SLL	$t_2,1
-+	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$c_3,$t_2,$at
-+	$ST	$c_2,$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_2,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,2*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_3,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_1,$a_2		# mul_add_c(a2[1],b[2],c1,c2,c3);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$at,$t_2,$zero
-+	$ADDU	$c_3,$at
-+	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
-+	$SLL	$t_2,1
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	sltu	$at,$c_2,$t_2
-+	$ADDU	$c_3,$at
-+	$ST	$c_1,3*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_1,$t_2,$zero
-+	$SLL	$t_2,1
-+	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_2,$t_1
-+	sltu	$at,$c_2,$t_1
-+	 $MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_3,$t_2
-+	sltu	$at,$c_3,$t_2
-+	$ADDU	$c_1,$at
-+	$ST	$c_2,4*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	slt	$c_2,$t_2,$zero
-+	$SLL	$t_2,1
-+	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-+	slt	$a2,$t_1,$zero
-+	$ADDU	$t_2,$a2
-+	$SLL	$t_1,1
-+	$ADDU	$c_3,$t_1
-+	sltu	$at,$c_3,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_1,$t_2
-+	sltu	$at,$c_1,$t_2
-+	$ADDU	$c_2,$at
-+	$ST	$c_3,5*$BNSZ($a0)
-+
-+	mflo	$t_1
-+	mfhi	$t_2
-+	$ADDU	$c_1,$t_1
-+	sltu	$at,$c_1,$t_1
-+	$ADDU	$t_2,$at
-+	$ADDU	$c_2,$t_2
-+	$ST	$c_1,6*$BNSZ($a0)
-+	$ST	$c_2,7*$BNSZ($a0)
-+
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$t3,4*$SZREG($sp)
-+	$REG_L	$t2,3*$SZREG($sp)
-+	$REG_L	$t1,2*$SZREG($sp)
-+	$REG_L	$t0,1*$SZREG($sp)
-+	$REG_L	$gp,0*$SZREG($sp)
-+	$PTR_ADD $sp,6*$SZREG
-+___
-+$code.=<<___;
-+	jr	$ra
-+	nop
-+.end	bn_sqr_comba4
-+___
-+print $code;
-+close STDOUT;
-diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl
-new file mode 100644
-index 0000000..f1a702f
---- /dev/null
-+++ b/crypto/sha/asm/sha1-mips.pl
-@@ -0,0 +1,354 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <[email protected]> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+# ====================================================================
-+
-+# SHA1 block procedure for MIPS.
-+
-+# Performance improvement is 30% on unaligned input. The "secret" is
-+# to deploy lwl/lwr pair to load unaligned input. One could have
-+# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
-+# compatible subroutine. There is room for minor optimization on
-+# little-endian platforms...
-+
-+######################################################################
-+# There is a number of MIPS ABI in use, O32 and N32/64 are most
-+# widely used. Then there is a new contender: NUBI. It appears that if
-+# one picks the latter, it's possible to arrange code in ABI neutral
-+# manner. Therefore let's stick to NUBI register layout:
-+#
-+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
-+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
-+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
-+#
-+# The return value is placed in $a0. Following coding rules facilitate
-+# interoperability:
-+#
-+# - never ever touch $tp, "thread pointer", former $gp;
-+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
-+#   old code];
-+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
-+#
-+# For reference here is register layout for N32/64 MIPS ABIs:
-+#
-+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-+#
-+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
-+
-+if ($flavour =~ /64|n32/i) {
-+	$PTR_ADD="dadd";	# incidentally works even on n32
-+	$PTR_SUB="dsub";	# incidentally works even on n32
-+	$REG_S="sd";
-+	$REG_L="ld";
-+	$PTR_SLL="dsll";	# incidentally works even on n32
-+	$SZREG=8;
-+} else {
-+	$PTR_ADD="add";
-+	$PTR_SUB="sub";
-+	$REG_S="sw";
-+	$REG_L="lw";
-+	$PTR_SLL="sll";
-+	$SZREG=4;
-+}
-+#
-+# <[email protected]>
-+#
-+######################################################################
-+
-+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
-+
-+for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);   }
-+open STDOUT,">$output";
-+
-+if (!defined($big_endian))
-+            {   $big_endian=(unpack('L',pack('N',1))==1);   }
-+
-+# offsets of the Most and Least Significant Bytes
-+$MSB=$big_endian?0:3;
-+$LSB=3&~$MSB;
-+
-+@X=map("\$$_",(8..23));	# a4-a7,s0-s11
-+
-+$ctx=$a0;
-+$inp=$a1;
-+$num=$a2;
-+$A="\$1";
-+$B="\$2";
-+$C="\$3";
-+$D="\$7";
-+$E="\$24";	@V=($A,$B,$C,$D,$E);
-+$t0="\$25";
-+$t1=$num;	# $num is offloaded to stack
-+$t2="\$30";	# fp
-+$K="\$31";	# ra
-+
-+sub BODY_00_14 {
-+my ($i,$a,$b,$c,$d,$e)=@_;
-+my $j=$i+1;
-+$code.=<<___	if (!$big_endian);
-+	srl	$t0,@X[$i],24	# byte swap($i)
-+	srl	$t1,@X[$i],8
-+	andi	$t2,@X[$i],0xFF00
-+	sll	@X[$i],@X[$i],24
-+	andi	$t1,0xFF00
-+	sll	$t2,$t2,8
-+	or	@X[$i],$t0
-+	or	$t1,$t2
-+	or	@X[$i],$t1
-+___
-+$code.=<<___;
-+	 lwl	@X[$j],$j*4+$MSB($inp)
-+	sll	$t0,$a,5	# $i
-+	addu	$e,$K
-+	 lwr	@X[$j],$j*4+$LSB($inp)
-+	srl	$t1,$a,27
-+	addu	$e,$t0
-+	xor	$t0,$c,$d
-+	addu	$e,$t1
-+	sll	$t2,$b,30
-+	and	$t0,$b
-+	srl	$b,$b,2
-+	xor	$t0,$d
-+	addu	$e,@X[$i]
-+	or	$b,$t2
-+	addu	$e,$t0
-+___
-+}
-+
-+sub BODY_15_19 {
-+my ($i,$a,$b,$c,$d,$e)=@_;
-+my $j=$i+1;
-+
-+$code.=<<___	if (!$big_endian && $i==15);
-+	srl	$t0,@X[$i],24	# byte swap($i)
-+	srl	$t1,@X[$i],8
-+	andi	$t2,@X[$i],0xFF00
-+	sll	@X[$i],@X[$i],24
-+	andi	$t1,0xFF00
-+	sll	$t2,$t2,8
-+	or	@X[$i],$t0
-+	or	@X[$i],$t1
-+	or	@X[$i],$t2
-+___
-+$code.=<<___;
-+	 xor	@X[$j%16],@X[($j+2)%16]
-+	sll	$t0,$a,5	# $i
-+	addu	$e,$K
-+	srl	$t1,$a,27
-+	addu	$e,$t0
-+	 xor	@X[$j%16],@X[($j+8)%16]
-+	xor	$t0,$c,$d
-+	addu	$e,$t1
-+	 xor	@X[$j%16],@X[($j+13)%16]
-+	sll	$t2,$b,30
-+	and	$t0,$b
-+	 srl	$t1,@X[$j%16],31
-+	 addu	@X[$j%16],@X[$j%16]
-+	srl	$b,$b,2
-+	xor	$t0,$d
-+	 or	@X[$j%16],$t1
-+	addu	$e,@X[$i%16]
-+	or	$b,$t2
-+	addu	$e,$t0
-+___
-+}
-+
-+sub BODY_20_39 {
-+my ($i,$a,$b,$c,$d,$e)=@_;
-+my $j=$i+1;
-+$code.=<<___ if ($i<79);
-+	 xor	@X[$j%16],@X[($j+2)%16]
-+	sll	$t0,$a,5	# $i
-+	addu	$e,$K
-+	srl	$t1,$a,27
-+	addu	$e,$t0
-+	 xor	@X[$j%16],@X[($j+8)%16]
-+	xor	$t0,$c,$d
-+	addu	$e,$t1
-+	 xor	@X[$j%16],@X[($j+13)%16]
-+	sll	$t2,$b,30
-+	xor	$t0,$b
-+	 srl	$t1,@X[$j%16],31
-+	 addu	@X[$j%16],@X[$j%16]
-+	srl	$b,$b,2
-+	addu	$e,@X[$i%16]
-+	 or	@X[$j%16],$t1
-+	or	$b,$t2
-+	addu	$e,$t0
-+___
-+$code.=<<___ if ($i==79);
-+	 lw	@X[0],0($ctx)
-+	sll	$t0,$a,5	# $i
-+	addu	$e,$K
-+	 lw	@X[1],4($ctx)
-+	srl	$t1,$a,27
-+	addu	$e,$t0
-+	 lw	@X[2],8($ctx)
-+	xor	$t0,$c,$d
-+	addu	$e,$t1
-+	 lw	@X[3],12($ctx)
-+	sll	$t2,$b,30
-+	xor	$t0,$b
-+	 lw	@X[4],16($ctx)
-+	srl	$b,$b,2
-+	addu	$e,@X[$i%16]
-+	or	$b,$t2
-+	addu	$e,$t0
-+___
-+}
-+
-+sub BODY_40_59 {
-+my ($i,$a,$b,$c,$d,$e)=@_;
-+my $j=$i+1;
-+$code.=<<___ if ($i<79);
-+	 xor	@X[$j%16],@X[($j+2)%16]
-+	sll	$t0,$a,5	# $i
-+	addu	$e,$K
-+	srl	$t1,$a,27
-+	addu	$e,$t0
-+	 xor	@X[$j%16],@X[($j+8)%16]
-+	and	$t0,$c,$d
-+	addu	$e,$t1
-+	 xor	@X[$j%16],@X[($j+13)%16]
-+	sll	$t2,$b,30
-+	addu	$e,$t0
-+	 srl	$t1,@X[$j%16],31
-+	xor	$t0,$c,$d
-+	 addu	@X[$j%16],@X[$j%16]
-+	and	$t0,$b
-+	srl	$b,$b,2
-+	 or	@X[$j%16],$t1
-+	addu	$e,@X[$i%16]
-+	or	$b,$t2
-+	addu	$e,$t0
-+___
-+}
-+
-+$FRAMESIZE=16;	# large enough to accomodate NUBI saved registers
-+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
-+
-+$code=<<___;
-+#ifdef OPENSSL_FIPSCANISTER
-+# include <openssl/fipssyms.h>
-+#endif
-+
-+.text
-+
-+.set	noat
-+.set	noreorder
-+.align	5
-+.globl	sha1_block_data_order
-+.ent	sha1_block_data_order
-+sha1_block_data_order:
-+	.frame	$sp,$FRAMESIZE*$SZREG,$ra
-+	.mask	$SAVED_REGS_MASK,-$SZREG
-+	.set	noreorder
-+	$PTR_SUB $sp,$FRAMESIZE*$SZREG
-+	$REG_S	$ra,($FRAMESIZE-1)*$SZREG($sp)
-+	$REG_S	$fp,($FRAMESIZE-2)*$SZREG($sp)
-+	$REG_S	$s11,($FRAMESIZE-3)*$SZREG($sp)
-+	$REG_S	$s10,($FRAMESIZE-4)*$SZREG($sp)
-+	$REG_S	$s9,($FRAMESIZE-5)*$SZREG($sp)
-+	$REG_S	$s8,($FRAMESIZE-6)*$SZREG($sp)
-+	$REG_S	$s7,($FRAMESIZE-7)*$SZREG($sp)
-+	$REG_S	$s6,($FRAMESIZE-8)*$SZREG($sp)
-+	$REG_S	$s5,($FRAMESIZE-9)*$SZREG($sp)
-+	$REG_S	$s4,($FRAMESIZE-10)*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
-+	$REG_S	$s3,($FRAMESIZE-11)*$SZREG($sp)
-+	$REG_S	$s2,($FRAMESIZE-12)*$SZREG($sp)
-+	$REG_S	$s1,($FRAMESIZE-13)*$SZREG($sp)
-+	$REG_S	$s0,($FRAMESIZE-14)*$SZREG($sp)
-+	$REG_S	$gp,($FRAMESIZE-15)*$SZREG($sp)
-+___
-+$code.=<<___;
-+	$PTR_SLL $num,6
-+	$PTR_ADD $num,$inp
-+	$REG_S	$num,0($sp)
-+	lw	$A,0($ctx)
-+	lw	$B,4($ctx)
-+	lw	$C,8($ctx)
-+	lw	$D,12($ctx)
-+	b	.Loop
-+	lw	$E,16($ctx)
-+.align	4
-+.Loop:
-+	.set	reorder
-+	lwl	@X[0],$MSB($inp)
-+	lui	$K,0x5a82
-+	lwr	@X[0],$LSB($inp)
-+	ori	$K,0x7999	# K_00_19
-+___
-+for ($i=0;$i<15;$i++)	{ &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
-+for (;$i<20;$i++)	{ &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
-+$code.=<<___;
-+	lui	$K,0x6ed9
-+	ori	$K,0xeba1	# K_20_39
-+___
-+for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
-+$code.=<<___;
-+	lui	$K,0x8f1b
-+	ori	$K,0xbcdc	# K_40_59
-+___
-+for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
-+$code.=<<___;
-+	lui	$K,0xca62
-+	ori	$K,0xc1d6	# K_60_79
-+___
-+for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
-+$code.=<<___;
-+	$PTR_ADD $inp,64
-+	$REG_L	$num,0($sp)
-+
-+	addu	$A,$X[0]
-+	addu	$B,$X[1]
-+	sw	$A,0($ctx)
-+	addu	$C,$X[2]
-+	addu	$D,$X[3]
-+	sw	$B,4($ctx)
-+	addu	$E,$X[4]
-+	sw	$C,8($ctx)
-+	sw	$D,12($ctx)
-+	sw	$E,16($ctx)
-+	.set	noreorder
-+	bne	$inp,$num,.Loop
-+	nop
-+
-+	.set	noreorder
-+	$REG_L	$ra,($FRAMESIZE-1)*$SZREG($sp)
-+	$REG_L	$fp,($FRAMESIZE-2)*$SZREG($sp)
-+	$REG_L	$s11,($FRAMESIZE-3)*$SZREG($sp)
-+	$REG_L	$s10,($FRAMESIZE-4)*$SZREG($sp)
-+	$REG_L	$s9,($FRAMESIZE-5)*$SZREG($sp)
-+	$REG_L	$s8,($FRAMESIZE-6)*$SZREG($sp)
-+	$REG_L	$s7,($FRAMESIZE-7)*$SZREG($sp)
-+	$REG_L	$s6,($FRAMESIZE-8)*$SZREG($sp)
-+	$REG_L	$s5,($FRAMESIZE-9)*$SZREG($sp)
-+	$REG_L	$s4,($FRAMESIZE-10)*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$s3,($FRAMESIZE-11)*$SZREG($sp)
-+	$REG_L	$s2,($FRAMESIZE-12)*$SZREG($sp)
-+	$REG_L	$s1,($FRAMESIZE-13)*$SZREG($sp)
-+	$REG_L	$s0,($FRAMESIZE-14)*$SZREG($sp)
-+	$REG_L	$gp,($FRAMESIZE-15)*$SZREG($sp)
-+___
-+$code.=<<___;
-+	jr	$ra
-+	$PTR_ADD $sp,$FRAMESIZE*$SZREG
-+.end	sha1_block_data_order
-+.rdata
-+.asciiz	"SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
-+___
-+print $code;
-+close STDOUT;
-diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl
-new file mode 100644
-index 0000000..ba5b250
---- /dev/null
-+++ b/crypto/sha/asm/sha512-mips.pl
-@@ -0,0 +1,455 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <[email protected]> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+# ====================================================================
-+
-+# SHA2 block procedures for MIPS.
-+
-+# October 2010.
-+#
-+# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
-+# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
-+# for now can only be compiled for MIPS64 ISA] improvement is modest
-+# ~17%, but it comes for free, because it's same instruction sequence.
-+# Improvement coefficients are for aligned input.
-+
-+######################################################################
-+# There is a number of MIPS ABI in use, O32 and N32/64 are most
-+# widely used. Then there is a new contender: NUBI. It appears that if
-+# one picks the latter, it's possible to arrange code in ABI neutral
-+# manner. Therefore let's stick to NUBI register layout:
-+#
-+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
-+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
-+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
-+#
-+# The return value is placed in $a0. Following coding rules facilitate
-+# interoperability:
-+#
-+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
-+#   excluded from the rule, because it's specified volatile];
-+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
-+#   old code];
-+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
-+#
-+# For reference here is register layout for N32/64 MIPS ABIs:
-+#
-+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-+#
-+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
-+
-+if ($flavour =~ /64|n32/i) {
-+	$PTR_ADD="dadd";	# incidentally works even on n32
-+	$PTR_SUB="dsub";	# incidentally works even on n32
-+	$REG_S="sd";
-+	$REG_L="ld";
-+	$PTR_SLL="dsll";	# incidentally works even on n32
-+	$SZREG=8;
-+} else {
-+	$PTR_ADD="add";
-+	$PTR_SUB="sub";
-+	$REG_S="sw";
-+	$REG_L="lw";
-+	$PTR_SLL="sll";
-+	$SZREG=4;
-+}
-+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
-+#
-+# <[email protected]>
-+#
-+######################################################################
-+
-+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
-+
-+for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
-+open STDOUT,">$output";
-+
-+if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
-+
-+if ($output =~ /512/) {
-+	$label="512";
-+	$SZ=8;
-+	$LD="ld";		# load from memory
-+	$ST="sd";		# store to memory
-+	$SLL="dsll";		# shift left logical
-+	$SRL="dsrl";		# shift right logical
-+	$ADDU="daddu";
-+	@Sigma0=(28,34,39);
-+	@Sigma1=(14,18,41);
-+	@sigma0=( 7, 1, 8);	# right shift first
-+	@sigma1=( 6,19,61);	# right shift first
-+	$lastK=0x817;
-+	$rounds=80;
-+} else {
-+	$label="256";
-+	$SZ=4;
-+	$LD="lw";		# load from memory
-+	$ST="sw";		# store to memory
-+	$SLL="sll";		# shift left logical
-+	$SRL="srl";		# shift right logical
-+	$ADDU="addu";
-+	@Sigma0=( 2,13,22);
-+	@Sigma1=( 6,11,25);
-+	@sigma0=( 3, 7,18);	# right shift first
-+	@sigma1=(10,17,19);	# right shift first
-+	$lastK=0x8f2;
-+	$rounds=64;
-+}
-+
-+$MSB = $big_endian ? 0 : ($SZ-1);
-+$LSB = ($SZ-1)&~$MSB;
-+
-+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
-+@X=map("\$$_",(8..23));
-+
-+$ctx=$a0;
-+$inp=$a1;
-+$len=$a2;	$Ktbl=$len;
-+
-+sub BODY_00_15 {
-+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
-+my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
-+
-+$code.=<<___ if ($i<15);
-+	${LD}l	@X[1],`($i+1)*$SZ+$MSB`($inp)
-+	${LD}r	@X[1],`($i+1)*$SZ+$LSB`($inp)
-+___
-+$code.=<<___	if (!$big_endian && $i<16 && $SZ==4);
-+	srl	$tmp0,@X[0],24		# byte swap($i)
-+	srl	$tmp1,@X[0],8
-+	andi	$tmp2,@X[0],0xFF00
-+	sll	@X[0],@X[0],24
-+	andi	$tmp1,0xFF00
-+	sll	$tmp2,$tmp2,8
-+	or	@X[0],$tmp0
-+	or	$tmp1,$tmp2
-+	or	@X[0],$tmp1
-+___
-+$code.=<<___	if (!$big_endian && $i<16 && $SZ==8);
-+	ori	$tmp0,$zero,0xFF
-+	dsll	$tmp2,$tmp0,32
-+	or	$tmp0,$tmp2		# 0x000000FF000000FF
-+	and	$tmp1,@X[0],$tmp0	# byte swap($i)
-+	dsrl	$tmp2,@X[0],24
-+	dsll	$tmp1,24
-+	and	$tmp2,$tmp0
-+	dsll	$tmp0,8			# 0x0000FF000000FF00
-+	or	$tmp1,$tmp2
-+	and	$tmp2,@X[0],$tmp0
-+	dsrl	@X[0],8
-+	dsll	$tmp2,8
-+	and	@X[0],$tmp0
-+	or	$tmp1,$tmp2
-+	or	@X[0],$tmp1
-+	dsrl	$tmp1,@X[0],32
-+	dsll	@X[0],32
-+	or	@X[0],$tmp1
-+___
-+$code.=<<___;
-+	$ADDU	$T1,$X[0],$h			# $i
-+	$SRL	$h,$e,@Sigma1[0]
-+	xor	$tmp2,$f,$g
-+	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[2]`
-+	and	$tmp2,$e
-+	$SRL	$tmp0,$e,@Sigma1[1]
-+	xor	$h,$tmp1
-+	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[1]`
-+	xor	$h,$tmp0
-+	$SRL	$tmp0,$e,@Sigma1[2]
-+	xor	$h,$tmp1
-+	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[0]`
-+	xor	$h,$tmp0
-+	xor	$tmp2,$g			# Ch(e,f,g)
-+	xor	$tmp0,$tmp1,$h			# Sigma1(e)
-+
-+	$SRL	$h,$a,@Sigma0[0]
-+	$ADDU	$T1,$tmp2
-+	$LD	$tmp2,`$i*$SZ`($Ktbl)		# K[$i]
-+	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[2]`
-+	$ADDU	$T1,$tmp0
-+	$SRL	$tmp0,$a,@Sigma0[1]
-+	xor	$h,$tmp1
-+	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[1]`
-+	xor	$h,$tmp0
-+	$SRL	$tmp0,$a,@Sigma0[2]
-+	xor	$h,$tmp1
-+	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[0]`
-+	xor	$h,$tmp0
-+	$ST	@X[0],`($i%16)*$SZ`($sp)	# offload to ring buffer
-+	xor	$h,$tmp1			# Sigma0(a)
-+
-+	or	$tmp0,$a,$b
-+	and	$tmp1,$a,$b
-+	and	$tmp0,$c
-+	or	$tmp1,$tmp0			# Maj(a,b,c)
-+	$ADDU	$T1,$tmp2			# +=K[$i]
-+	$ADDU	$h,$tmp1
-+
-+	$ADDU	$d,$T1
-+	$ADDU	$h,$T1
-+___
-+$code.=<<___ if ($i>=13);
-+	$LD	@X[3],`(($i+3)%16)*$SZ`($sp)	# prefetch from ring buffer
-+___
-+}
-+
-+sub BODY_16_XX {
-+my $i=@_[0];
-+my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
-+
-+$code.=<<___;
-+	$SRL	$tmp2,@X[1],@sigma0[0]		# Xupdate($i)
-+	$ADDU	@X[0],@X[9]			# +=X[i+9]
-+	$SLL	$tmp1,@X[1],`$SZ*8-@sigma0[2]`
-+	$SRL	$tmp0,@X[1],@sigma0[1]
-+	xor	$tmp2,$tmp1
-+	$SLL	$tmp1,`@sigma0[2]-@sigma0[1]`
-+	xor	$tmp2,$tmp0
-+	$SRL	$tmp0,@X[1],@sigma0[2]
-+	xor	$tmp2,$tmp1
-+
-+	$SRL	$tmp3,@X[14],@sigma1[0]
-+	xor	$tmp2,$tmp0			# sigma0(X[i+1])
-+	$SLL	$tmp1,@X[14],`$SZ*8-@sigma1[2]`
-+	$ADDU	@X[0],$tmp2
-+	$SRL	$tmp0,@X[14],@sigma1[1]
-+	xor	$tmp3,$tmp1
-+	$SLL	$tmp1,`@sigma1[2]-@sigma1[1]`
-+	xor	$tmp3,$tmp0
-+	$SRL	$tmp0,@X[14],@sigma1[2]
-+	xor	$tmp3,$tmp1
-+
-+	xor	$tmp3,$tmp0			# sigma1(X[i+14])
-+	$ADDU	@X[0],$tmp3
-+___
-+	&BODY_00_15(@_);
-+}
-+
-+$FRAMESIZE=16*$SZ+16*$SZREG;
-+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
-+
-+$code.=<<___;
-+#ifdef OPENSSL_FIPSCANISTER
-+# include <openssl/fipssyms.h>
-+#endif
-+
-+.text
-+.set	noat
-+#if !defined(__vxworks) || defined(__pic__)
-+.option	pic2
-+#endif
-+
-+.align	5
-+.globl	sha${label}_block_data_order
-+.ent	sha${label}_block_data_order
-+sha${label}_block_data_order:
-+	.frame	$sp,$FRAMESIZE,$ra
-+	.mask	$SAVED_REGS_MASK,-$SZREG
-+	.set	noreorder
-+___
-+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
-+	.cpload	$pf
-+___
-+$code.=<<___;
-+	$PTR_SUB $sp,$FRAMESIZE
-+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
-+	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
-+	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
-+	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
-+	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
-+	$REG_S	$s3,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_S	$s2,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_S	$s1,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_S	$s0,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___;
-+	$PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
-+___
-+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
-+	.cplocal	$Ktbl
-+	.cpsetup	$pf,$zero,sha${label}_block_data_order
-+___
-+$code.=<<___;
-+	.set	reorder
-+	la	$Ktbl,K${label}		# PIC-ified 'load address'
-+
-+	$LD	$A,0*$SZ($ctx)		# load context
-+	$LD	$B,1*$SZ($ctx)
-+	$LD	$C,2*$SZ($ctx)
-+	$LD	$D,3*$SZ($ctx)
-+	$LD	$E,4*$SZ($ctx)
-+	$LD	$F,5*$SZ($ctx)
-+	$LD	$G,6*$SZ($ctx)
-+	$LD	$H,7*$SZ($ctx)
-+
-+	$PTR_ADD @X[15],$inp		# pointer to the end of input
-+	$REG_S	@X[15],16*$SZ($sp)
-+	b	.Loop
-+
-+.align	5
-+.Loop:
-+	${LD}l	@X[0],$MSB($inp)
-+	${LD}r	@X[0],$LSB($inp)
-+___
-+for ($i=0;$i<16;$i++)
-+{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
-+$code.=<<___;
-+	b	.L16_xx
-+.align	4
-+.L16_xx:
-+___
-+for (;$i<32;$i++)
-+{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
-+$code.=<<___;
-+	and	@X[6],0xfff
-+	li	@X[7],$lastK
-+	.set	noreorder
-+	bne	@X[6],@X[7],.L16_xx
-+	$PTR_ADD $Ktbl,16*$SZ		# Ktbl+=16
-+
-+	$REG_L	@X[15],16*$SZ($sp)	# restore pointer to the end of input
-+	$LD	@X[0],0*$SZ($ctx)
-+	$LD	@X[1],1*$SZ($ctx)
-+	$LD	@X[2],2*$SZ($ctx)
-+	$PTR_ADD $inp,16*$SZ
-+	$LD	@X[3],3*$SZ($ctx)
-+	$ADDU	$A,@X[0]
-+	$LD	@X[4],4*$SZ($ctx)
-+	$ADDU	$B,@X[1]
-+	$LD	@X[5],5*$SZ($ctx)
-+	$ADDU	$C,@X[2]
-+	$LD	@X[6],6*$SZ($ctx)
-+	$ADDU	$D,@X[3]
-+	$LD	@X[7],7*$SZ($ctx)
-+	$ADDU	$E,@X[4]
-+	$ST	$A,0*$SZ($ctx)
-+	$ADDU	$F,@X[5]
-+	$ST	$B,1*$SZ($ctx)
-+	$ADDU	$G,@X[6]
-+	$ST	$C,2*$SZ($ctx)
-+	$ADDU	$H,@X[7]
-+	$ST	$D,3*$SZ($ctx)
-+	$ST	$E,4*$SZ($ctx)
-+	$ST	$F,5*$SZ($ctx)
-+	$ST	$G,6*$SZ($ctx)
-+	$ST	$H,7*$SZ($ctx)
-+
-+	bnel	$inp,@X[15],.Loop
-+	$PTR_SUB $Ktbl,`($rounds-16)*$SZ`	# rewind $Ktbl
-+
-+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
-+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
-+	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
-+	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
-+	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
-+	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
-+	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
-+	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
-+	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
-+	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i);
-+	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
-+	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
-+	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
-+	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
-+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
-+___
-+$code.=<<___;
-+	jr	$ra
-+	$PTR_ADD $sp,$FRAMESIZE
-+.end	sha${label}_block_data_order
-+
-+.rdata
-+.align	5
-+K${label}:
-+___
-+if ($SZ==4) {
-+$code.=<<___;
-+	.word	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
-+	.word	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
-+	.word	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
-+	.word	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
-+	.word	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
-+	.word	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
-+	.word	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
-+	.word	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
-+	.word	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
-+	.word	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
-+	.word	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
-+	.word	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
-+	.word	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
-+	.word	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
-+	.word	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
-+	.word	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-+___
-+} else {
-+$code.=<<___;
-+	.dword	0x428a2f98d728ae22, 0x7137449123ef65cd
-+	.dword	0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
-+	.dword	0x3956c25bf348b538, 0x59f111f1b605d019
-+	.dword	0x923f82a4af194f9b, 0xab1c5ed5da6d8118
-+	.dword	0xd807aa98a3030242, 0x12835b0145706fbe
-+	.dword	0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
-+	.dword	0x72be5d74f27b896f, 0x80deb1fe3b1696b1
-+	.dword	0x9bdc06a725c71235, 0xc19bf174cf692694
-+	.dword	0xe49b69c19ef14ad2, 0xefbe4786384f25e3
-+	.dword	0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
-+	.dword	0x2de92c6f592b0275, 0x4a7484aa6ea6e483
-+	.dword	0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
-+	.dword	0x983e5152ee66dfab, 0xa831c66d2db43210
-+	.dword	0xb00327c898fb213f, 0xbf597fc7beef0ee4
-+	.dword	0xc6e00bf33da88fc2, 0xd5a79147930aa725
-+	.dword	0x06ca6351e003826f, 0x142929670a0e6e70
-+	.dword	0x27b70a8546d22ffc, 0x2e1b21385c26c926
-+	.dword	0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
-+	.dword	0x650a73548baf63de, 0x766a0abb3c77b2a8
-+	.dword	0x81c2c92e47edaee6, 0x92722c851482353b
-+	.dword	0xa2bfe8a14cf10364, 0xa81a664bbc423001
-+	.dword	0xc24b8b70d0f89791, 0xc76c51a30654be30
-+	.dword	0xd192e819d6ef5218, 0xd69906245565a910
-+	.dword	0xf40e35855771202a, 0x106aa07032bbd1b8
-+	.dword	0x19a4c116b8d2d0c8, 0x1e376c085141ab53
-+	.dword	0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
-+	.dword	0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
-+	.dword	0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
-+	.dword	0x748f82ee5defb2fc, 0x78a5636f43172f60
-+	.dword	0x84c87814a1f0ab72, 0x8cc702081a6439ec
-+	.dword	0x90befffa23631e28, 0xa4506cebde82bde9
-+	.dword	0xbef9a3f7b2c67915, 0xc67178f2e372532b
-+	.dword	0xca273eceea26619c, 0xd186b8c721c0c207
-+	.dword	0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
-+	.dword	0x06f067aa72176fba, 0x0a637dc5a2c898a6
-+	.dword	0x113f9804bef90dae, 0x1b710b35131c471b
-+	.dword	0x28db77f523047d84, 0x32caab7b40c72493
-+	.dword	0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
-+	.dword	0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
-+	.dword	0x5fcb6fab3ad6faec, 0x6c44198c4a475817
-+___
-+}
-+$code.=<<___;
-+.asciiz	"SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
-+.align	5
-+
-+___
-+
-+$code =~ s/\`([^\`]*)\`/eval $1/gem;
-+print $code;
-+close STDOUT;

diff --git a/patches/npn.patch b/patches/npn.patch
deleted file mode 100644
index 46b7a7d..0000000
--- a/patches/npn.patch
+++ /dev/null

@@ -1,1293 +0,0 @@
---- openssl-1.0.0b.orig/apps/apps.c	2010-11-11 14:42:19.000000000 +0000
-+++ openssl-1.0.0b/apps/apps.c	2010-11-29 19:56:04.902465346 +0000
-@@ -3012,3 +3012,46 @@ int raw_write_stdout(const void *buf,int
- int raw_write_stdout(const void *buf,int siz)
- 	{	return write(fileno(stdout),buf,siz);	}
- #endif
-+
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+/* next_protos_parse parses a comma separated list of strings into a string
-+ * in a format suitable for passing to SSL_CTX_set_next_protos_advertised.
-+ *   outlen: (output) set to the length of the resulting buffer on success.
-+ *   in: a NUL termianted string like "abc,def,ghi"
-+ *
-+ *   returns: a malloced buffer or NULL on failure.
-+ */
-+unsigned char *next_protos_parse(unsigned short *outlen, const char *in)
-+	{
-+	size_t len;
-+	unsigned char *out;
-+	size_t i, start = 0;
-+
-+	len = strlen(in);
-+	if (len >= 65535)
-+		return NULL;
-+
-+	out = OPENSSL_malloc(strlen(in) + 1);
-+	if (!out)
-+		return NULL;
-+
-+	for (i = 0; i <= len; ++i)
-+		{
-+		if (i == len || in[i] == ',')
-+			{
-+			if (i - start > 255)
-+				{
-+				OPENSSL_free(out);
-+				return NULL;
-+				}
-+			out[start] = i - start;
-+			start = i + 1;
-+			}
-+		else
-+			out[i+1] = in[i];
-+		}
-+
-+	*outlen = len + 1;
-+	return out;
-+	}
-+#endif  /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
---- openssl-1.0.0b.orig/apps/apps.h	2009-10-31 13:34:19.000000000 +0000
-+++ openssl-1.0.0b/apps/apps.h	2010-11-29 19:56:04.902465346 +0000
-@@ -358,3 +358,7 @@ int raw_write_stdout(const void *,int);
- #define TM_STOP		1
- double app_tminterval (int stop,int usertime);
- #endif
-+
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+unsigned char *next_protos_parse(unsigned short *outlen, const char *in);
-+#endif
---- openssl-1.0.0b.orig/apps/s_client.c	2010-11-29 19:56:04.832465351 +0000
-+++ openssl-1.0.0b/apps/s_client.c	2010-11-29 19:56:04.902465346 +0000
-@@ -342,6 +342,9 @@ static void sc_usage(void)
- 	BIO_printf(bio_err," -tlsextdebug      - hex dump of all TLS extensions received\n");
- 	BIO_printf(bio_err," -status           - request certificate status from server\n");
- 	BIO_printf(bio_err," -no_ticket        - disable use of RFC4507bis session tickets\n");
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	BIO_printf(bio_err," -nextprotoneg arg - enable NPN extension, considering named protocols supported (comma-separated list)\n");
-+# endif
- 	BIO_printf(bio_err," -cutthrough       - enable 1-RTT full-handshake for strong ciphers\n");
- #endif
- 	BIO_printf(bio_err," -legacy_renegotiation - enable use of legacy renegotiation (dangerous)\n");
-@@ -367,6 +370,40 @@ static int MS_CALLBACK ssl_servername_cb
- 	
- 	return SSL_TLSEXT_ERR_OK;
- 	}
-+
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+/* This the context that we pass to next_proto_cb */
-+typedef struct tlsextnextprotoctx_st {
-+	unsigned char *data;
-+	unsigned short len;
-+	int status;
-+} tlsextnextprotoctx;
-+
-+static tlsextnextprotoctx next_proto;
-+
-+static int next_proto_cb(SSL *s, unsigned char **out, unsigned char *outlen, const unsigned char *in, unsigned int inlen, void *arg)
-+	{
-+	tlsextnextprotoctx *ctx = arg;
-+
-+	if (!c_quiet)
-+		{
-+		/* We can assume that |in| is syntactically valid. */
-+		unsigned i;
-+		BIO_printf(bio_c_out, "Protocols advertised by server: ");
-+		for (i = 0; i < inlen; )
-+			{
-+			if (i)
-+				BIO_write(bio_c_out, ", ", 2);
-+			BIO_write(bio_c_out, &in[i + 1], in[i]);
-+			i += in[i] + 1;
-+			}
-+		BIO_write(bio_c_out, "\n", 1);
-+		}
-+
-+	ctx->status = SSL_select_next_proto(out, outlen, in, inlen, ctx->data, ctx->len);
-+	return SSL_TLSEXT_ERR_OK;
-+	}
-+# endif  /* ndef OPENSSL_NO_NEXTPROTONEG */
- #endif
- 
- enum
-@@ -431,6 +468,9 @@ int MAIN(int argc, char **argv)
- 	char *servername = NULL; 
-         tlsextctx tlsextcbp = 
-         {NULL,0};
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	const char *next_proto_neg_in = NULL;
-+# endif
- #endif
- 	char *sess_in = NULL;
- 	char *sess_out = NULL;
-@@ -658,6 +698,13 @@ int MAIN(int argc, char **argv)
- #ifndef OPENSSL_NO_TLSEXT
- 		else if	(strcmp(*argv,"-no_ticket") == 0)
- 			{ off|=SSL_OP_NO_TICKET; }
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+		else if (strcmp(*argv,"-nextprotoneg") == 0)
-+			{
-+			if (--argc < 1) goto bad;
-+			next_proto_neg_in = *(++argv);
-+			}
-+# endif
- #endif
- 		else if (strcmp(*argv,"-cutthrough") == 0)
- 			cutthrough=1;
-@@ -766,6 +813,21 @@ bad:
- 	OpenSSL_add_ssl_algorithms();
- 	SSL_load_error_strings();
- 
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+	next_proto.status = -1;
-+	if (next_proto_neg_in)
-+		{
-+		next_proto.data = next_protos_parse(&next_proto.len, next_proto_neg_in);
-+		if (next_proto.data == NULL)
-+			{
-+			BIO_printf(bio_err, "Error parsing -nextprotoneg argument\n");
-+			goto end;
-+			}
-+		}
-+	else
-+		next_proto.data = NULL;
-+#endif
-+
- #ifndef OPENSSL_NO_ENGINE
-         e = setup_engine(bio_err, engine_id, 1);
- 	if (ssl_client_engine_id)
-@@ -896,6 +958,11 @@ bad:
- 		SSL_CTX_set_mode(ctx, ssl_mode);
- 		}
- 
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+	if (next_proto.data)
-+		SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto);
-+#endif
-+
- 	if (state) SSL_CTX_set_info_callback(ctx,apps_ssl_info_callback);
- 	if (cipher != NULL)
- 		if(!SSL_CTX_set_cipher_list(ctx,cipher)) {
-@@ -1755,6 +1822,18 @@ static void print_stuff(BIO *bio, SSL *s
- 	BIO_printf(bio,"Expansion: %s\n",
- 		expansion ? SSL_COMP_get_name(expansion) : "NONE");
- #endif
-+
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+	if (next_proto.status != -1) {
-+		const unsigned char *proto;
-+		unsigned int proto_len;
-+		SSL_get0_next_proto_negotiated(s, &proto, &proto_len);
-+		BIO_printf(bio, "Next protocol: (%d) ", next_proto.status);
-+		BIO_write(bio, proto, proto_len);
-+		BIO_write(bio, "\n", 1);
-+	}
-+#endif
-+
- 	SSL_SESSION_print(bio,SSL_get_session(s));
- 	BIO_printf(bio,"---\n");
- 	if (peer != NULL)
---- openssl-1.0.0b.orig/apps/s_server.c	2010-06-15 17:25:02.000000000 +0000
-+++ openssl-1.0.0b/apps/s_server.c	2010-11-29 19:56:04.902465346 +0000
-@@ -492,6 +492,9 @@ static void sv_usage(void)
- 	BIO_printf(bio_err," -tlsextdebug  - hex dump of all TLS extensions received\n");
- 	BIO_printf(bio_err," -no_ticket    - disable use of RFC4507bis session tickets\n");
- 	BIO_printf(bio_err," -legacy_renegotiation - enable use of legacy renegotiation (dangerous)\n");
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	BIO_printf(bio_err," -nextprotoneg arg - set the advertised protocols for the NPN extension (comma-separated list)\n");
-+# endif
- #endif
- 	}
- 
-@@ -826,6 +829,24 @@ BIO_printf(err, "cert_status: received %
- 	ret = SSL_TLSEXT_ERR_ALERT_FATAL;
- 	goto done;
- 	}
-+
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+/* This is the context that we pass to next_proto_cb */
-+typedef struct tlsextnextprotoctx_st {
-+	unsigned char *data;
-+	unsigned int len;
-+} tlsextnextprotoctx;
-+
-+static int next_proto_cb(SSL *s, const unsigned char **data, unsigned int *len, void *arg)
-+	{
-+	tlsextnextprotoctx *next_proto = arg;
-+
-+	*data = next_proto->data;
-+	*len = next_proto->len;
-+
-+	return SSL_TLSEXT_ERR_OK;
-+	}
-+# endif  /* ndef OPENSSL_NO_NPN */
- #endif
- 
- int MAIN(int, char **);
-@@ -867,6 +888,10 @@ int MAIN(int argc, char *argv[])
- #endif
- #ifndef OPENSSL_NO_TLSEXT
-         tlsextctx tlsextcbp = {NULL, NULL, SSL_TLSEXT_ERR_ALERT_WARNING};
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	const char *next_proto_neg_in = NULL;
-+	tlsextnextprotoctx next_proto;
-+# endif
- #endif
- #ifndef OPENSSL_NO_PSK
- 	/* by default do not send a PSK identity hint */
-@@ -1191,7 +1216,13 @@ int MAIN(int argc, char *argv[])
- 			if (--argc < 1) goto bad;
- 			s_key_file2= *(++argv);
- 			}
--			
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+		else if	(strcmp(*argv,"-nextprotoneg") == 0)
-+			{
-+			if (--argc < 1) goto bad;
-+			next_proto_neg_in = *(++argv);
-+			}
-+# endif
- #endif
- #if !defined(OPENSSL_NO_JPAKE) && !defined(OPENSSL_NO_PSK)
- 		else if (strcmp(*argv,"-jpake") == 0)
-@@ -1476,6 +1507,11 @@ bad:
- 		if (vpm)
- 			SSL_CTX_set1_param(ctx2, vpm);
- 		}
-+
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	if (next_proto.data)
-+		SSL_CTX_set_next_protos_advertised_cb(ctx, next_proto_cb, &next_proto);
-+# endif
- #endif 
- 
- #ifndef OPENSSL_NO_DH
-@@ -1617,6 +1653,21 @@ bad:
- 					goto end;
- 					}
- 				}
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+		if (next_proto_neg_in)
-+			{
-+			unsigned short len;
-+			next_proto.data = next_protos_parse(&len,
-+				next_proto_neg_in);
-+			if (next_proto.data == NULL)
-+				goto end;
-+			next_proto.len = len;
-+			}
-+		else
-+			{
-+			next_proto.data = NULL;
-+			}
-+# endif
- #endif
- 		RSA_free(rsa);
- 		BIO_printf(bio_s_out,"\n");
-@@ -2159,6 +2210,10 @@ static int init_ssl_connection(SSL *con)
- 	X509 *peer;
- 	long verify_error;
- 	MS_STATIC char buf[BUFSIZ];
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+	const unsigned char *next_proto_neg;
-+	unsigned next_proto_neg_len;
-+#endif
- 
- 	if ((i=SSL_accept(con)) <= 0)
- 		{
-@@ -2198,6 +2253,15 @@ static int init_ssl_connection(SSL *con)
- 		BIO_printf(bio_s_out,"Shared ciphers:%s\n",buf);
- 	str=SSL_CIPHER_get_name(SSL_get_current_cipher(con));
- 	BIO_printf(bio_s_out,"CIPHER is %s\n",(str != NULL)?str:"(NONE)");
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+	SSL_get0_next_proto_negotiated(con, &next_proto_neg, &next_proto_neg_len);
-+	if (next_proto_neg)
-+		{
-+		BIO_printf(bio_s_out,"NEXTPROTO is ");
-+		BIO_write(bio_s_out, next_proto_neg, next_proto_neg_len);
-+		BIO_printf(bio_s_out, "\n");
-+		}
-+#endif
- 	if (con->hit) BIO_printf(bio_s_out,"Reused session-id\n");
- 	if (SSL_ctrl(con,SSL_CTRL_GET_FLAGS,0,NULL) &
- 		TLS1_FLAGS_TLS_PADDING_BUG)
---- openssl-1.0.0b.orig/include/openssl/ssl.h	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/include/openssl/ssl.h	2010-11-29 19:56:04.965928855 +0000
-@@ -857,6 +857,25 @@ struct ssl_ctx_st
- 	/* draft-rescorla-tls-opaque-prf-input-00.txt information */
- 	int (*tlsext_opaque_prf_input_callback)(SSL *, void *peerinput, size_t len, void *arg);
- 	void *tlsext_opaque_prf_input_callback_arg;
-+
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	/* Next protocol negotiation information */
-+	/* (for experimental NPN extension). */
-+
-+	/* For a server, this contains a callback function by which the set of
-+	 * advertised protocols can be provided. */
-+	int (*next_protos_advertised_cb)(SSL *s, const unsigned char **buf,
-+			                 unsigned int *len, void *arg);
-+	void *next_protos_advertised_cb_arg;
-+	/* For a client, this contains a callback function that selects the
-+	 * next protocol from the list provided by the server. */
-+	int (*next_proto_select_cb)(SSL *s, unsigned char **out,
-+				    unsigned char *outlen,
-+				    const unsigned char *in,
-+				    unsigned int inlen,
-+				    void *arg);
-+	void *next_proto_select_cb_arg;
-+# endif
- #endif
- 
- #ifndef OPENSSL_NO_PSK
-@@ -928,6 +947,30 @@ int SSL_CTX_set_client_cert_engine(SSL_C
- #endif
- void SSL_CTX_set_cookie_generate_cb(SSL_CTX *ctx, int (*app_gen_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int *cookie_len));
- void SSL_CTX_set_cookie_verify_cb(SSL_CTX *ctx, int (*app_verify_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int cookie_len));
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+void SSL_CTX_set_next_protos_advertised_cb(SSL_CTX *s,
-+					   int (*cb) (SSL *ssl,
-+						      const unsigned char **out,
-+						      unsigned int *outlen,
-+						      void *arg), void *arg);
-+void SSL_CTX_set_next_proto_select_cb(SSL_CTX *s,
-+				      int (*cb) (SSL *ssl, unsigned char **out,
-+						 unsigned char *outlen,
-+						 const unsigned char *in,
-+						 unsigned int inlen, void *arg),
-+				      void *arg);
-+
-+int SSL_select_next_proto(unsigned char **out, unsigned char *outlen,
-+			  const unsigned char *in, unsigned int inlen,
-+			  const unsigned char *client, unsigned int client_len);
-+void SSL_get0_next_proto_negotiated(const SSL *s, const unsigned char **data,
-+				    unsigned *len);
-+
-+#define OPENSSL_NPN_UNSUPPORTED	0
-+#define OPENSSL_NPN_NEGOTIATED	1
-+#define OPENSSL_NPN_NO_OVERLAP	2
-+
-+#endif
- 
- #ifndef OPENSSL_NO_PSK
- /* the maximum length of the buffer given to callbacks containing the
-@@ -1187,6 +1230,19 @@ struct ssl_st
- 	void *tls_session_secret_cb_arg;
- 
- 	SSL_CTX * initial_ctx; /* initial ctx, used to store sessions */
-+
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	/* Next protocol negotiation. For the client, this is the protocol that
-+	 * we sent in NextProtocol and is set when handling ServerHello
-+	 * extensions.
-+	 *
-+	 * For a server, this is the client's selected_protocol from
-+	 * NextProtocol and is set when handling the NextProtocol message,
-+	 * before the Finished message. */
-+	unsigned char *next_proto_negotiated;
-+	unsigned char next_proto_negotiated_len;
-+#endif
-+
- #define session_ctx initial_ctx
- #else
- #define session_ctx ctx
-@@ -1919,6 +1975,7 @@ void ERR_load_SSL_strings(void);
- #define SSL_F_SSL3_GET_KEY_EXCHANGE			 141
- #define SSL_F_SSL3_GET_MESSAGE				 142
- #define SSL_F_SSL3_GET_NEW_SESSION_TICKET		 283
-+#define SSL_F_SSL3_GET_NEXT_PROTO			 304
- #define SSL_F_SSL3_GET_RECORD				 143
- #define SSL_F_SSL3_GET_SERVER_CERTIFICATE		 144
- #define SSL_F_SSL3_GET_SERVER_DONE			 145
-@@ -2117,6 +2174,8 @@ void ERR_load_SSL_strings(void);
- #define SSL_R_EXCESSIVE_MESSAGE_SIZE			 152
- #define SSL_R_EXTRA_DATA_IN_MESSAGE			 153
- #define SSL_R_GOT_A_FIN_BEFORE_A_CCS			 154
-+#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS		 346
-+#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION		 347
- #define SSL_R_HTTPS_PROXY_REQUEST			 155
- #define SSL_R_HTTP_REQUEST				 156
- #define SSL_R_ILLEGAL_PADDING				 283
---- openssl-1.0.0b.orig/include/openssl/ssl3.h	2010-11-29 19:56:04.832465351 +0000
-+++ openssl-1.0.0b/include/openssl/ssl3.h	2010-11-29 19:56:04.965928855 +0000
-@@ -465,6 +465,12 @@ typedef struct ssl3_state_st
- 	void *server_opaque_prf_input;
- 	size_t server_opaque_prf_input_len;
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	/* Set if we saw the Next Protocol Negotiation extension from
-+	   our peer. */
-+	int next_proto_neg_seen;
-+#endif
-+
- 	struct	{
- 		/* actually only needs to be 16+20 */
- 		unsigned char cert_verify_md[EVP_MAX_MD_SIZE*2];
-@@ -557,6 +563,10 @@ typedef struct ssl3_state_st
- #define SSL3_ST_CW_CERT_VRFY_B		(0x191|SSL_ST_CONNECT)
- #define SSL3_ST_CW_CHANGE_A		(0x1A0|SSL_ST_CONNECT)
- #define SSL3_ST_CW_CHANGE_B		(0x1A1|SSL_ST_CONNECT)
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+#define SSL3_ST_CW_NEXT_PROTO_A		(0x200|SSL_ST_CONNECT)
-+#define SSL3_ST_CW_NEXT_PROTO_B		(0x201|SSL_ST_CONNECT)
-+#endif
- #define SSL3_ST_CW_FINISHED_A		(0x1B0|SSL_ST_CONNECT)
- #define SSL3_ST_CW_FINISHED_B		(0x1B1|SSL_ST_CONNECT)
- /* read from server */
-@@ -602,6 +612,10 @@ typedef struct ssl3_state_st
- #define SSL3_ST_SR_CERT_VRFY_B		(0x1A1|SSL_ST_ACCEPT)
- #define SSL3_ST_SR_CHANGE_A		(0x1B0|SSL_ST_ACCEPT)
- #define SSL3_ST_SR_CHANGE_B		(0x1B1|SSL_ST_ACCEPT)
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+#define SSL3_ST_SR_NEXT_PROTO_A		(0x210|SSL_ST_ACCEPT)
-+#define SSL3_ST_SR_NEXT_PROTO_B		(0x211|SSL_ST_ACCEPT)
-+#endif
- #define SSL3_ST_SR_FINISHED_A		(0x1C0|SSL_ST_ACCEPT)
- #define SSL3_ST_SR_FINISHED_B		(0x1C1|SSL_ST_ACCEPT)
- /* write to client */
-@@ -626,6 +640,9 @@ typedef struct ssl3_state_st
- #define SSL3_MT_CLIENT_KEY_EXCHANGE		16
- #define SSL3_MT_FINISHED			20
- #define SSL3_MT_CERTIFICATE_STATUS		22
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+#define SSL3_MT_NEXT_PROTO			67
-+#endif
- #define DTLS1_MT_HELLO_VERIFY_REQUEST    3
- 
- 
---- openssl-1.0.0b.orig/include/openssl/tls1.h	2009-11-11 14:51:29.000000000 +0000
-+++ openssl-1.0.0b/include/openssl/tls1.h	2010-11-29 19:56:04.965928855 +0000
-@@ -204,6 +204,11 @@ extern "C" {
- /* Temporary extension type */
- #define TLSEXT_TYPE_renegotiate                 0xff01
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+/* This is not an IANA defined extension number */
-+#define TLSEXT_TYPE_next_proto_neg		13172
-+#endif
-+
- /* NameType value from RFC 3546 */
- #define TLSEXT_NAMETYPE_host_name 0
- /* status request value from RFC 3546 */
---- openssl-1.0.0b.orig/ssl/s3_both.c	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/ssl/s3_both.c	2010-11-29 19:56:04.965928855 +0000
-@@ -202,15 +202,40 @@ int ssl3_send_finished(SSL *s, int a, in
- 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
- 	}
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+/* ssl3_take_mac calculates the Finished MAC for the handshakes messages seen to far. */
-+static void ssl3_take_mac(SSL *s)
-+	{
-+	const char *sender;
-+	int slen;
-+
-+	if (s->state & SSL_ST_CONNECT)
-+		{
-+		sender=s->method->ssl3_enc->server_finished_label;
-+		slen=s->method->ssl3_enc->server_finished_label_len;
-+		}
-+	else
-+		{
-+		sender=s->method->ssl3_enc->client_finished_label;
-+		slen=s->method->ssl3_enc->client_finished_label_len;
-+		}
-+
-+	s->s3->tmp.peer_finish_md_len = s->method->ssl3_enc->final_finish_mac(s,
-+		sender,slen,s->s3->tmp.peer_finish_md);
-+	}
-+#endif
-+
- int ssl3_get_finished(SSL *s, int a, int b)
- 	{
- 	int al,i,ok;
- 	long n;
- 	unsigned char *p;
- 
-+#ifdef OPENSSL_NO_NEXTPROTONEG
- 	/* the mac has already been generated when we received the
- 	 * change cipher spec message and is in s->s3->tmp.peer_finish_md
- 	 */ 
-+#endif
- 
- 	n=s->method->ssl_get_message(s,
- 		a,
-@@ -521,6 +546,15 @@ long ssl3_get_message(SSL *s, int st1, i
- 		s->init_num += i;
- 		n -= i;
- 		}
-+
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	/* If receiving Finished, record MAC of prior handshake messages for
-+	 * Finished verification. */
-+	if (*s->init_buf->data == SSL3_MT_FINISHED)
-+		ssl3_take_mac(s);
-+#endif
-+
-+	/* Feed this message into MAC computation. */
- 	ssl3_finish_mac(s, (unsigned char *)s->init_buf->data, s->init_num + 4);
- 	if (s->msg_callback)
- 		s->msg_callback(0, s->version, SSL3_RT_HANDSHAKE, s->init_buf->data, (size_t)s->init_num + 4, s, s->msg_callback_arg);
---- openssl-1.0.0b.orig/ssl/s3_clnt.c	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/ssl/s3_clnt.c	2010-11-29 19:56:04.965928855 +0000
-@@ -435,7 +435,16 @@ int ssl3_connect(SSL *s)
- 			ret=ssl3_send_change_cipher_spec(s,
- 				SSL3_ST_CW_CHANGE_A,SSL3_ST_CW_CHANGE_B);
- 			if (ret <= 0) goto end;
-+
-+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
- 			s->state=SSL3_ST_CW_FINISHED_A;
-+#else
-+			if (s->next_proto_negotiated)
-+				s->state=SSL3_ST_CW_NEXT_PROTO_A;
-+			else
-+				s->state=SSL3_ST_CW_FINISHED_A;
-+#endif
-+
- 			s->init_num=0;
- 
- 			s->session->cipher=s->s3->tmp.new_cipher;
-@@ -463,6 +472,15 @@ int ssl3_connect(SSL *s)
- 
- 			break;
- 
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+		case SSL3_ST_CW_NEXT_PROTO_A:
-+		case SSL3_ST_CW_NEXT_PROTO_B:
-+			ret=ssl3_send_next_proto(s);
-+			if (ret <= 0) goto end;
-+			s->state=SSL3_ST_CW_FINISHED_A;
-+			break;
-+#endif
-+
- 		case SSL3_ST_CW_FINISHED_A:
- 		case SSL3_ST_CW_FINISHED_B:
- 			ret=ssl3_send_finished(s,
-@@ -3060,6 +3078,32 @@ err:
-  */
- 
- #ifndef OPENSSL_NO_TLSEXT
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+int ssl3_send_next_proto(SSL *s)
-+	{
-+	unsigned int len, padding_len;
-+	unsigned char *d;
-+
-+	if (s->state == SSL3_ST_CW_NEXT_PROTO_A)
-+		{
-+		len = s->next_proto_negotiated_len;
-+		padding_len = 32 - ((len + 2) % 32);
-+		d = (unsigned char *)s->init_buf->data;
-+		d[4] = len;
-+		memcpy(d + 5, s->next_proto_negotiated, len);
-+		d[5 + len] = padding_len;
-+		memset(d + 6 + len, 0, padding_len);
-+		*(d++)=SSL3_MT_NEXT_PROTO;
-+		l2n3(2 + len + padding_len, d);
-+		s->state = SSL3_ST_CW_NEXT_PROTO_B;
-+		s->init_num = 4 + 2 + len + padding_len;
-+		s->init_off = 0;
-+		}
-+
-+	return ssl3_do_write(s, SSL3_RT_HANDSHAKE);
-+	}
-+# endif
-+
- int ssl3_check_finished(SSL *s)
- 	{
- 	int ok;
---- openssl-1.0.0b.orig/ssl/s3_lib.c	2010-11-29 19:56:04.832465351 +0000
-+++ openssl-1.0.0b/ssl/s3_lib.c	2010-11-29 19:56:04.965928855 +0000
-@@ -2230,6 +2230,15 @@ void ssl3_clear(SSL *s)
- 	s->s3->num_renegotiations=0;
- 	s->s3->in_read_app_data=0;
- 	s->version=SSL3_VERSION;
-+
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+	if (s->next_proto_negotiated)
-+		{
-+		OPENSSL_free(s->next_proto_negotiated);
-+		s->next_proto_negotiated = NULL;
-+		s->next_proto_negotiated_len = 0;
-+		}
-+#endif
- 	}
- 
- long ssl3_ctrl(SSL *s, int cmd, long larg, void *parg)
---- openssl-1.0.0b.orig/ssl/s3_pkt.c	2010-11-29 19:56:04.832465351 +0000
-+++ openssl-1.0.0b/ssl/s3_pkt.c	2010-11-29 19:56:04.965928855 +0000
-@@ -1394,8 +1394,10 @@ err:
- int ssl3_do_change_cipher_spec(SSL *s)
- 	{
- 	int i;
-+#ifdef OPENSSL_NO_NEXTPROTONEG
- 	const char *sender;
- 	int slen;
-+#endif
- 
- 	if (s->state & SSL_ST_ACCEPT)
- 		i=SSL3_CHANGE_CIPHER_SERVER_READ;
-@@ -1418,6 +1420,7 @@ int ssl3_do_change_cipher_spec(SSL *s)
- 	if (!s->method->ssl3_enc->change_cipher_state(s,i))
- 		return(0);
- 
-+#ifdef OPENSSL_NO_NEXTPROTONEG
- 	/* we have to record the message digest at
- 	 * this point so we can get it before we read
- 	 * the finished message */
-@@ -1434,6 +1437,7 @@ int ssl3_do_change_cipher_spec(SSL *s)
- 
- 	s->s3->tmp.peer_finish_md_len = s->method->ssl3_enc->final_finish_mac(s,
- 		sender,slen,s->s3->tmp.peer_finish_md);
-+#endif
- 
- 	return(1);
- 	}
---- openssl-1.0.0b.orig/ssl/s3_srvr.c	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/ssl/s3_srvr.c	2010-11-29 19:56:04.965928855 +0000
-@@ -538,7 +538,14 @@ int ssl3_accept(SSL *s)
- 				 * the client uses its key from the certificate
- 				 * for key exchange.
- 				 */
-+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
- 				s->state=SSL3_ST_SR_FINISHED_A;
-+#else
-+				if (s->s3->next_proto_neg_seen)
-+					s->state=SSL3_ST_SR_NEXT_PROTO_A;
-+				else
-+					s->state=SSL3_ST_SR_FINISHED_A;
-+#endif
- 				s->init_num = 0;
- 				}
- 			else
-@@ -581,10 +588,27 @@ int ssl3_accept(SSL *s)
- 			ret=ssl3_get_cert_verify(s);
- 			if (ret <= 0) goto end;
- 
-+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
- 			s->state=SSL3_ST_SR_FINISHED_A;
-+#else
-+			if (s->s3->next_proto_neg_seen)
-+				s->state=SSL3_ST_SR_NEXT_PROTO_A;
-+			else
-+				s->state=SSL3_ST_SR_FINISHED_A;
-+#endif
- 			s->init_num=0;
- 			break;
- 
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+		case SSL3_ST_SR_NEXT_PROTO_A:
-+		case SSL3_ST_SR_NEXT_PROTO_B:
-+			ret=ssl3_get_next_proto(s);
-+			if (ret <= 0) goto end;
-+			s->init_num = 0;
-+			s->state=SSL3_ST_SR_FINISHED_A;
-+			break;
-+#endif
-+
- 		case SSL3_ST_SR_FINISHED_A:
- 		case SSL3_ST_SR_FINISHED_B:
- 			ret=ssl3_get_finished(s,SSL3_ST_SR_FINISHED_A,
-@@ -655,7 +679,16 @@ int ssl3_accept(SSL *s)
- 			if (ret <= 0) goto end;
- 			s->state=SSL3_ST_SW_FLUSH;
- 			if (s->hit)
-+				{
-+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
- 				s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A;
-+#else
-+				if (s->s3->next_proto_neg_seen)
-+					s->s3->tmp.next_state=SSL3_ST_SR_NEXT_PROTO_A;
-+				else
-+					s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A;
-+#endif
-+				}
- 			else
- 				s->s3->tmp.next_state=SSL_ST_OK;
- 			s->init_num=0;
-@@ -3196,4 +3229,72 @@ int ssl3_send_cert_status(SSL *s)
- 	/* SSL3_ST_SW_CERT_STATUS_B */
- 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
- 	}
-+
-+# ifndef OPENSSL_NO_NPN
-+/* ssl3_get_next_proto reads a Next Protocol Negotiation handshake message. It
-+ * sets the next_proto member in s if found */
-+int ssl3_get_next_proto(SSL *s)
-+	{
-+	int ok;
-+	unsigned proto_len, padding_len;
-+	long n;
-+	const unsigned char *p;
-+
-+	/* Clients cannot send a NextProtocol message if we didn't see the
-+	 * extension in their ClientHello */
-+	if (!s->s3->next_proto_neg_seen)
-+		{
-+		SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION);
-+		return -1;
-+		}
-+
-+	n=s->method->ssl_get_message(s,
-+		SSL3_ST_SR_NEXT_PROTO_A,
-+		SSL3_ST_SR_NEXT_PROTO_B,
-+		SSL3_MT_NEXT_PROTO,
-+		514,  /* See the payload format below */
-+		&ok);
-+
-+	if (!ok)
-+		return((int)n);
-+
-+	/* s->state doesn't reflect whether ChangeCipherSpec has been received
-+	 * in this handshake, but s->s3->change_cipher_spec does (will be reset
-+	 * by ssl3_get_finished). */
-+	if (!s->s3->change_cipher_spec)
-+		{
-+		SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS);
-+		return -1;
-+		}
-+
-+	if (n < 2)
-+		return 0;  /* The body must be > 1 bytes long */
-+
-+	p=(unsigned char *)s->init_msg;
-+
-+	/* The payload looks like:
-+	 *   uint8 proto_len;
-+	 *   uint8 proto[proto_len];
-+	 *   uint8 padding_len;
-+	 *   uint8 padding[padding_len];
-+	 */
-+	proto_len = p[0];
-+	if (proto_len + 2 > s->init_num)
-+		return 0;
-+	padding_len = p[proto_len + 1];
-+	if (proto_len + padding_len + 2 != s->init_num)
-+		return 0;
-+
-+	s->next_proto_negotiated = OPENSSL_malloc(proto_len);
-+	if (!s->next_proto_negotiated)
-+		{
-+		SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,ERR_R_MALLOC_FAILURE);
-+		return 0;
-+		}
-+	memcpy(s->next_proto_negotiated, p + 1, proto_len);
-+	s->next_proto_negotiated_len = proto_len;
-+
-+	return 1;
-+	}
-+# endif
- #endif
---- openssl-1.0.0b.orig/ssl/ssl.h	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/ssl/ssl.h	2010-11-29 19:56:04.965928855 +0000
-@@ -857,6 +857,25 @@ struct ssl_ctx_st
- 	/* draft-rescorla-tls-opaque-prf-input-00.txt information */
- 	int (*tlsext_opaque_prf_input_callback)(SSL *, void *peerinput, size_t len, void *arg);
- 	void *tlsext_opaque_prf_input_callback_arg;
-+
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	/* Next protocol negotiation information */
-+	/* (for experimental NPN extension). */
-+
-+	/* For a server, this contains a callback function by which the set of
-+	 * advertised protocols can be provided. */
-+	int (*next_protos_advertised_cb)(SSL *s, const unsigned char **buf,
-+			                 unsigned int *len, void *arg);
-+	void *next_protos_advertised_cb_arg;
-+	/* For a client, this contains a callback function that selects the
-+	 * next protocol from the list provided by the server. */
-+	int (*next_proto_select_cb)(SSL *s, unsigned char **out,
-+				    unsigned char *outlen,
-+				    const unsigned char *in,
-+				    unsigned int inlen,
-+				    void *arg);
-+	void *next_proto_select_cb_arg;
-+# endif
- #endif
- 
- #ifndef OPENSSL_NO_PSK
-@@ -928,6 +947,30 @@ int SSL_CTX_set_client_cert_engine(SSL_C
- #endif
- void SSL_CTX_set_cookie_generate_cb(SSL_CTX *ctx, int (*app_gen_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int *cookie_len));
- void SSL_CTX_set_cookie_verify_cb(SSL_CTX *ctx, int (*app_verify_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int cookie_len));
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+void SSL_CTX_set_next_protos_advertised_cb(SSL_CTX *s,
-+					   int (*cb) (SSL *ssl,
-+						      const unsigned char **out,
-+						      unsigned int *outlen,
-+						      void *arg), void *arg);
-+void SSL_CTX_set_next_proto_select_cb(SSL_CTX *s,
-+				      int (*cb) (SSL *ssl, unsigned char **out,
-+						 unsigned char *outlen,
-+						 const unsigned char *in,
-+						 unsigned int inlen, void *arg),
-+				      void *arg);
-+
-+int SSL_select_next_proto(unsigned char **out, unsigned char *outlen,
-+			  const unsigned char *in, unsigned int inlen,
-+			  const unsigned char *client, unsigned int client_len);
-+void SSL_get0_next_proto_negotiated(const SSL *s, const unsigned char **data,
-+				    unsigned *len);
-+
-+#define OPENSSL_NPN_UNSUPPORTED	0
-+#define OPENSSL_NPN_NEGOTIATED	1
-+#define OPENSSL_NPN_NO_OVERLAP	2
-+
-+#endif
- 
- #ifndef OPENSSL_NO_PSK
- /* the maximum length of the buffer given to callbacks containing the
-@@ -1187,6 +1230,19 @@ struct ssl_st
- 	void *tls_session_secret_cb_arg;
- 
- 	SSL_CTX * initial_ctx; /* initial ctx, used to store sessions */
-+
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	/* Next protocol negotiation. For the client, this is the protocol that
-+	 * we sent in NextProtocol and is set when handling ServerHello
-+	 * extensions.
-+	 *
-+	 * For a server, this is the client's selected_protocol from
-+	 * NextProtocol and is set when handling the NextProtocol message,
-+	 * before the Finished message. */
-+	unsigned char *next_proto_negotiated;
-+	unsigned char next_proto_negotiated_len;
-+#endif
-+
- #define session_ctx initial_ctx
- #else
- #define session_ctx ctx
-@@ -1919,6 +1975,7 @@ void ERR_load_SSL_strings(void);
- #define SSL_F_SSL3_GET_KEY_EXCHANGE			 141
- #define SSL_F_SSL3_GET_MESSAGE				 142
- #define SSL_F_SSL3_GET_NEW_SESSION_TICKET		 283
-+#define SSL_F_SSL3_GET_NEXT_PROTO			 304
- #define SSL_F_SSL3_GET_RECORD				 143
- #define SSL_F_SSL3_GET_SERVER_CERTIFICATE		 144
- #define SSL_F_SSL3_GET_SERVER_DONE			 145
-@@ -2117,6 +2174,8 @@ void ERR_load_SSL_strings(void);
- #define SSL_R_EXCESSIVE_MESSAGE_SIZE			 152
- #define SSL_R_EXTRA_DATA_IN_MESSAGE			 153
- #define SSL_R_GOT_A_FIN_BEFORE_A_CCS			 154
-+#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS		 346
-+#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION		 347
- #define SSL_R_HTTPS_PROXY_REQUEST			 155
- #define SSL_R_HTTP_REQUEST				 156
- #define SSL_R_ILLEGAL_PADDING				 283
---- openssl-1.0.0b.orig/ssl/ssl3.h	2010-11-29 19:56:04.832465351 +0000
-+++ openssl-1.0.0b/ssl/ssl3.h	2010-11-29 19:56:04.965928855 +0000
-@@ -465,6 +465,12 @@ typedef struct ssl3_state_st
- 	void *server_opaque_prf_input;
- 	size_t server_opaque_prf_input_len;
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	/* Set if we saw the Next Protocol Negotiation extension from
-+	   our peer. */
-+	int next_proto_neg_seen;
-+#endif
-+
- 	struct	{
- 		/* actually only needs to be 16+20 */
- 		unsigned char cert_verify_md[EVP_MAX_MD_SIZE*2];
-@@ -557,6 +563,10 @@ typedef struct ssl3_state_st
- #define SSL3_ST_CW_CERT_VRFY_B		(0x191|SSL_ST_CONNECT)
- #define SSL3_ST_CW_CHANGE_A		(0x1A0|SSL_ST_CONNECT)
- #define SSL3_ST_CW_CHANGE_B		(0x1A1|SSL_ST_CONNECT)
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+#define SSL3_ST_CW_NEXT_PROTO_A		(0x200|SSL_ST_CONNECT)
-+#define SSL3_ST_CW_NEXT_PROTO_B		(0x201|SSL_ST_CONNECT)
-+#endif
- #define SSL3_ST_CW_FINISHED_A		(0x1B0|SSL_ST_CONNECT)
- #define SSL3_ST_CW_FINISHED_B		(0x1B1|SSL_ST_CONNECT)
- /* read from server */
-@@ -602,6 +612,10 @@ typedef struct ssl3_state_st
- #define SSL3_ST_SR_CERT_VRFY_B		(0x1A1|SSL_ST_ACCEPT)
- #define SSL3_ST_SR_CHANGE_A		(0x1B0|SSL_ST_ACCEPT)
- #define SSL3_ST_SR_CHANGE_B		(0x1B1|SSL_ST_ACCEPT)
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+#define SSL3_ST_SR_NEXT_PROTO_A		(0x210|SSL_ST_ACCEPT)
-+#define SSL3_ST_SR_NEXT_PROTO_B		(0x211|SSL_ST_ACCEPT)
-+#endif
- #define SSL3_ST_SR_FINISHED_A		(0x1C0|SSL_ST_ACCEPT)
- #define SSL3_ST_SR_FINISHED_B		(0x1C1|SSL_ST_ACCEPT)
- /* write to client */
-@@ -626,6 +640,9 @@ typedef struct ssl3_state_st
- #define SSL3_MT_CLIENT_KEY_EXCHANGE		16
- #define SSL3_MT_FINISHED			20
- #define SSL3_MT_CERTIFICATE_STATUS		22
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+#define SSL3_MT_NEXT_PROTO			67
-+#endif
- #define DTLS1_MT_HELLO_VERIFY_REQUEST    3
- 
- 
---- openssl-1.0.0b.orig/ssl/ssl_err.c	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/ssl/ssl_err.c	2010-11-29 19:56:04.965928855 +0000
-@@ -155,6 +155,7 @@ static ERR_STRING_DATA SSL_str_functs[]=
- {ERR_FUNC(SSL_F_SSL3_GET_KEY_EXCHANGE),	"SSL3_GET_KEY_EXCHANGE"},
- {ERR_FUNC(SSL_F_SSL3_GET_MESSAGE),	"SSL3_GET_MESSAGE"},
- {ERR_FUNC(SSL_F_SSL3_GET_NEW_SESSION_TICKET),	"SSL3_GET_NEW_SESSION_TICKET"},
-+{ERR_FUNC(SSL_F_SSL3_GET_NEXT_PROTO),	"SSL3_GET_NEXT_PROTO"},
- {ERR_FUNC(SSL_F_SSL3_GET_RECORD),	"SSL3_GET_RECORD"},
- {ERR_FUNC(SSL_F_SSL3_GET_SERVER_CERTIFICATE),	"SSL3_GET_SERVER_CERTIFICATE"},
- {ERR_FUNC(SSL_F_SSL3_GET_SERVER_DONE),	"SSL3_GET_SERVER_DONE"},
-@@ -355,6 +356,8 @@ static ERR_STRING_DATA SSL_str_reasons[]
- {ERR_REASON(SSL_R_EXCESSIVE_MESSAGE_SIZE),"excessive message size"},
- {ERR_REASON(SSL_R_EXTRA_DATA_IN_MESSAGE) ,"extra data in message"},
- {ERR_REASON(SSL_R_GOT_A_FIN_BEFORE_A_CCS),"got a fin before a ccs"},
-+{ERR_REASON(SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS),"got next proto before a ccs"},
-+{ERR_REASON(SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION),"got next proto without seeing extension"},
- {ERR_REASON(SSL_R_HTTPS_PROXY_REQUEST)   ,"https proxy request"},
- {ERR_REASON(SSL_R_HTTP_REQUEST)          ,"http request"},
- {ERR_REASON(SSL_R_ILLEGAL_PADDING)       ,"illegal padding"},
---- openssl-1.0.0b.orig/ssl/ssl_lib.c	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/ssl/ssl_lib.c	2010-11-29 19:56:04.965928855 +0000
-@@ -354,6 +354,9 @@ SSL *SSL_new(SSL_CTX *ctx)
- 	s->tlsext_ocsp_resplen = -1;
- 	CRYPTO_add(&ctx->references,1,CRYPTO_LOCK_SSL_CTX);
- 	s->initial_ctx=ctx;
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	s->next_proto_negotiated = NULL;
-+# endif
- #endif
- 
- 	s->verify_result=X509_V_OK;
-@@ -587,6 +590,11 @@ void SSL_free(SSL *s)
- 		kssl_ctx_free(s->kssl_ctx);
- #endif	/* OPENSSL_NO_KRB5 */
- 
-+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-+	if (s->next_proto_negotiated)
-+		OPENSSL_free(s->next_proto_negotiated);
-+#endif
-+
- 	OPENSSL_free(s);
- 	}
- 
-@@ -1503,6 +1511,124 @@ int SSL_get_servername_type(const SSL *s
- 		return TLSEXT_NAMETYPE_host_name;
- 	return -1;
- 	}
-+
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+/* SSL_select_next_proto implements the standard protocol selection. It is
-+ * expected that this function is called from the callback set by
-+ * SSL_CTX_set_next_proto_select_cb.
-+ *
-+ * The protocol data is assumed to be a vector of 8-bit, length prefixed byte
-+ * strings. The length byte itself is not included in the length. A byte
-+ * string of length 0 is invalid. No byte string may be truncated.
-+ *
-+ * The current, but experimental algorithm for selecting the protocol is:
-+ *
-+ * 1) If the server doesn't support NPN then this is indicated to the
-+ * callback. In this case, the client application has to abort the connection
-+ * or have a default application level protocol.
-+ *
-+ * 2) If the server supports NPN, but advertises an empty list then the
-+ * client selects the first protcol in its list, but indicates via the
-+ * API that this fallback case was enacted.
-+ *
-+ * 3) Otherwise, the client finds the first protocol in the server's list
-+ * that it supports and selects this protocol. This is because it's
-+ * assumed that the server has better information about which protocol
-+ * a client should use.
-+ *
-+ * 4) If the client doesn't support any of the server's advertised
-+ * protocols, then this is treated the same as case 2.
-+ *
-+ * It returns either
-+ * OPENSSL_NPN_NEGOTIATED if a common protocol was found, or
-+ * OPENSSL_NPN_NO_OVERLAP if the fallback case was reached.
-+ */
-+int SSL_select_next_proto(unsigned char **out, unsigned char *outlen, const unsigned char *server, unsigned int server_len, const unsigned char *client, unsigned int client_len)
-+	{
-+	unsigned int i, j;
-+	const unsigned char *result;
-+	int status = OPENSSL_NPN_UNSUPPORTED;
-+
-+	/* For each protocol in server preference order, see if we support it. */
-+	for (i = 0; i < server_len; )
-+		{
-+		for (j = 0; j < client_len; )
-+			{
-+			if (server[i] == client[j] &&
-+			    memcmp(&server[i+1], &client[j+1], server[i]) == 0)
-+				{
-+				/* We found a match */
-+				result = &server[i];
-+				status = OPENSSL_NPN_NEGOTIATED;
-+				goto found;
-+				}
-+			j += client[j];
-+			j++;
-+			}
-+		i += server[i];
-+		i++;
-+		}
-+
-+	/* There's no overlap between our protocols and the server's list. */
-+	result = client;
-+	status = OPENSSL_NPN_NO_OVERLAP;
-+
-+	found:
-+	*out = (unsigned char *) result + 1;
-+	*outlen = result[0];
-+	return status;
-+	}
-+
-+/* SSL_get0_next_proto_negotiated sets *data and *len to point to the client's
-+ * requested protocol for this connection and returns 0. If the client didn't
-+ * request any protocol, then *data is set to NULL.
-+ *
-+ * Note that the client can request any protocol it chooses. The value returned
-+ * from this function need not be a member of the list of supported protocols
-+ * provided by the callback.
-+ */
-+void SSL_get0_next_proto_negotiated(const SSL *s, const unsigned char **data, unsigned *len)
-+	{
-+	*data = s->next_proto_negotiated;
-+	if (!*data) {
-+		*len = 0;
-+	} else {
-+		*len = s->next_proto_negotiated_len;
-+	}
-+}
-+
-+/* SSL_CTX_set_next_protos_advertised_cb sets a callback that is called when a
-+ * TLS server needs a list of supported protocols for Next Protocol
-+ * Negotiation. The returned list must be in wire format.  The list is returned
-+ * by setting |out| to point to it and |outlen| to its length. This memory will
-+ * not be modified, but one should assume that the SSL* keeps a reference to
-+ * it.
-+ *
-+ * The callback should return SSL_TLSEXT_ERR_OK if it wishes to advertise. Otherwise, no
-+ * such extension will be included in the ServerHello. */
-+void SSL_CTX_set_next_protos_advertised_cb(SSL_CTX *ctx, int (*cb) (SSL *ssl, const unsigned char **out, unsigned int *outlen, void *arg), void *arg)
-+	{
-+	ctx->next_protos_advertised_cb = cb;
-+	ctx->next_protos_advertised_cb_arg = arg;
-+	}
-+
-+/* SSL_CTX_set_next_proto_select_cb sets a callback that is called when a
-+ * client needs to select a protocol from the server's provided list. |out|
-+ * must be set to point to the selected protocol (which may be within |in|).
-+ * The length of the protocol name must be written into |outlen|. The server's
-+ * advertised protocols are provided in |in| and |inlen|. The callback can
-+ * assume that |in| is syntactically valid.
-+ *
-+ * The client must select a protocol. It is fatal to the connection if this
-+ * callback returns a value other than SSL_TLSEXT_ERR_OK.
-+ */
-+void SSL_CTX_set_next_proto_select_cb(SSL_CTX *ctx, int (*cb) (SSL *s, unsigned char **out, unsigned char *outlen, const unsigned char *in, unsigned int inlen, void *arg), void *arg)
-+	{
-+	ctx->next_proto_select_cb = cb;
-+	ctx->next_proto_select_cb_arg = arg;
-+	}
-+
-+# endif
- #endif
- 
- static unsigned long ssl_session_hash(const SSL_SESSION *a)
-@@ -1667,6 +1793,10 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *m
- 	ret->tlsext_status_cb = 0;
- 	ret->tlsext_status_arg = NULL;
- 
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+	ret->next_protos_advertised_cb = 0;
-+	ret->next_proto_select_cb = 0;
-+# endif
- #endif
- #ifndef OPENSSL_NO_PSK
- 	ret->psk_identity_hint=NULL;
---- openssl-1.0.0b.orig/ssl/ssl_locl.h	2010-11-29 19:56:04.846517045 +0000
-+++ openssl-1.0.0b/ssl/ssl_locl.h	2010-11-29 19:56:04.965928855 +0000
-@@ -968,6 +968,9 @@ int ssl3_get_server_certificate(SSL *s);
- int ssl3_check_cert_and_algorithm(SSL *s);
- #ifndef OPENSSL_NO_TLSEXT
- int ssl3_check_finished(SSL *s);
-+# ifndef OPENSSL_NO_NEXTPROTONEG
-+int ssl3_send_next_proto(SSL *s);
-+# endif
- #endif
- 
- int dtls1_client_hello(SSL *s);
-@@ -986,6 +989,9 @@ int ssl3_check_client_hello(SSL *s);
- int ssl3_get_client_certificate(SSL *s);
- int ssl3_get_client_key_exchange(SSL *s);
- int ssl3_get_cert_verify(SSL *s);
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+int ssl3_get_next_proto(SSL *s);
-+#endif
- 
- int dtls1_send_hello_request(SSL *s);
- int dtls1_send_server_hello(SSL *s);
---- openssl-1.0.0b.orig/ssl/t1_lib.c	2010-11-16 13:26:24.000000000 +0000
-+++ openssl-1.0.0b/ssl/t1_lib.c	2010-11-29 19:56:04.965928855 +0000
-@@ -494,6 +494,18 @@ unsigned char *ssl_add_clienthello_tlsex
- 			i2d_X509_EXTENSIONS(s->tlsext_ocsp_exts, &ret);
- 		}
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	if (s->ctx->next_proto_select_cb && !s->s3->tmp.finish_md_len)
-+		{
-+		/* The client advertises an emtpy extension to indicate its
-+		 * support for Next Protocol Negotiation */
-+		if (limit - ret - 4 < 0)
-+			return NULL;
-+		s2n(TLSEXT_TYPE_next_proto_neg,ret);
-+		s2n(0,ret);
-+		}
-+#endif
-+
- 	if ((extdatalen = ret-p-2)== 0) 
- 		return p;
- 
-@@ -505,6 +517,9 @@ unsigned char *ssl_add_serverhello_tlsex
- 	{
- 	int extdatalen=0;
- 	unsigned char *ret = p;
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	int next_proto_neg_seen;
-+#endif
- 
- 	/* don't add extensions for SSLv3, unless doing secure renegotiation */
- 	if (s->version == SSL3_VERSION && !s->s3->send_connection_binding)
-@@ -618,6 +633,28 @@ unsigned char *ssl_add_serverhello_tlsex
- 
- 		}
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+	next_proto_neg_seen = s->s3->next_proto_neg_seen;
-+	s->s3->next_proto_neg_seen = 0;
-+	if (next_proto_neg_seen && s->ctx->next_protos_advertised_cb)
-+		{
-+		const unsigned char *npa;
-+		unsigned int npalen;
-+		int r;
-+
-+		r = s->ctx->next_protos_advertised_cb(s, &npa, &npalen, s->ctx->next_protos_advertised_cb_arg);
-+		if (r == SSL_TLSEXT_ERR_OK)
-+			{
-+			if ((long)(limit - ret - 4 - npalen) < 0) return NULL;
-+			s2n(TLSEXT_TYPE_next_proto_neg,ret);
-+			s2n(npalen,ret);
-+			memcpy(ret, npa, npalen);
-+			ret += npalen;
-+			s->s3->next_proto_neg_seen = 1;
-+			}
-+		}
-+#endif
-+
- 	if ((extdatalen = ret-p-2)== 0) 
- 		return p;
- 
-@@ -982,6 +1019,28 @@ int ssl_parse_clienthello_tlsext(SSL *s,
- 				else
- 					s->tlsext_status_type = -1;
- 			}
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+		else if (type == TLSEXT_TYPE_next_proto_neg &&
-+                         s->s3->tmp.finish_md_len == 0)
-+			{
-+			/* We shouldn't accept this extension on a
-+			 * renegotiation.
-+			 *
-+			 * s->new_session will be set on renegotiation, but we
-+			 * probably shouldn't rely that it couldn't be set on
-+			 * the initial renegotation too in certain cases (when
-+			 * there's some other reason to disallow resuming an
-+			 * earlier session -- the current code won't be doing
-+			 * anything like that, but this might change).
-+
-+			 * A valid sign that there's been a previous handshake
-+			 * in this connection is if s->s3->tmp.finish_md_len >
-+			 * 0.  (We are talking about a check that will happen
-+			 * in the Hello protocol round, well before a new
-+			 * Finished message could have been computed.) */
-+			s->s3->next_proto_neg_seen = 1;
-+			}
-+#endif
- 
- 		/* session ticket processed earlier */
- 		data+=size;
-@@ -1005,6 +1064,26 @@ int ssl_parse_clienthello_tlsext(SSL *s,
- 	return 1;
- 	}
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+/* ssl_next_proto_validate validates a Next Protocol Negotiation block. No
-+ * elements of zero length are allowed and the set of elements must exactly fill
-+ * the length of the block. */
-+static int ssl_next_proto_validate(unsigned char *d, unsigned len)
-+	{
-+	unsigned int off = 0;
-+
-+	while (off < len)
-+		{
-+		if (d[off] == 0)
-+			return 0;
-+		off += d[off];
-+		off++;
-+		}
-+
-+	return off == len;
-+	}
-+#endif
-+
- int ssl_parse_serverhello_tlsext(SSL *s, unsigned char **p, unsigned char *d, int n, int *al)
- 	{
- 	unsigned short length;
-@@ -1139,6 +1218,39 @@ int ssl_parse_serverhello_tlsext(SSL *s,
- 			/* Set flag to expect CertificateStatus message */
- 			s->tlsext_status_expected = 1;
- 			}
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+		else if (type == TLSEXT_TYPE_next_proto_neg)
-+			{
-+			unsigned char *selected;
-+			unsigned char selected_len;
-+
-+			/* We must have requested it. */
-+			if ((s->ctx->next_proto_select_cb == NULL))
-+				{
-+				*al = TLS1_AD_UNSUPPORTED_EXTENSION;
-+				return 0;
-+				}
-+			/* The data must be valid */
-+			if (!ssl_next_proto_validate(data, size))
-+				{
-+				*al = TLS1_AD_DECODE_ERROR;
-+				return 0;
-+				}
-+			if (s->ctx->next_proto_select_cb(s, &selected, &selected_len, data, size, s->ctx->next_proto_select_cb_arg) != SSL_TLSEXT_ERR_OK)
-+				{
-+				*al = TLS1_AD_INTERNAL_ERROR;
-+				return 0;
-+				}
-+			s->next_proto_negotiated = OPENSSL_malloc(selected_len);
-+			if (!s->next_proto_negotiated)
-+				{
-+				*al = TLS1_AD_INTERNAL_ERROR;
-+				return 0;
-+				}
-+			memcpy(s->next_proto_negotiated, selected, selected_len);
-+			s->next_proto_negotiated_len = selected_len;
-+			}
-+#endif
- 		else if (type == TLSEXT_TYPE_renegotiate)
- 			{
- 			if(!ssl_parse_serverhello_renegotiate_ext(s, data, size, al))
---- openssl-1.0.0b.orig/ssl/tls1.h	2009-11-11 14:51:29.000000000 +0000
-+++ openssl-1.0.0b/ssl/tls1.h	2010-11-29 19:56:04.965928855 +0000
-@@ -204,6 +204,11 @@ extern "C" {
- /* Temporary extension type */
- #define TLSEXT_TYPE_renegotiate                 0xff01
- 
-+#ifndef OPENSSL_NO_NEXTPROTONEG
-+/* This is not an IANA defined extension number */
-+#define TLSEXT_TYPE_next_proto_neg		13172
-+#endif
-+
- /* NameType value from RFC 3546 */
- #define TLSEXT_NAMETYPE_host_name 0
- /* status request value from RFC 3546 */

diff --git a/patches/progs.patch b/patches/progs.patch
index 16fd9b0..f0879ae 100644
--- a/patches/progs.patch
+++ b/patches/progs.patch

@@ -20,8 +20,8 @@
 +#if 0 /* ANDROID */
  	{FUNC_TYPE_GENERAL,"ts",ts_main},
 +#endif
- #ifndef OPENSSL_NO_MD2
- 	{FUNC_TYPE_MD,"md2",dgst_main},
+ #ifndef OPENSSL_NO_SRP
+ 	{FUNC_TYPE_GENERAL,"srp",srp_main},
  #endif
 --- openssl-1.0.0.orig/apps/speed.c	2010-03-03 11:56:17.000000000 -0800
 +++ openssl-1.0.0/apps/speed.c	2010-05-18 14:05:57.000000000 -0700

diff --git a/patches/ssl_Android.mk b/patches/ssl_Android.mk
index 40641a3..619aede 100644
--- a/patches/ssl_Android.mk
+++ b/patches/ssl_Android.mk

@@ -6,43 +6,49 @@
 	external/openssl/crypto
 
 local_src_files:= \
-	s2_meth.c \
-	s2_srvr.c \
-	s2_clnt.c \
-	s2_lib.c \
-	s2_enc.c \
-	s2_pkt.c \
-	s3_meth.c \
-	s3_srvr.c \
-	s3_clnt.c \
-	s3_lib.c \
-	s3_enc.c \
-	s3_pkt.c \
-	s3_both.c \
-	s23_meth.c \
-	s23_srvr.c \
+	bio_ssl.c \
+	d1_both.c \
+	d1_enc.c \
+	d1_lib.c \
+	d1_pkt.c \
+	d1_srtp.c \
+	kssl.c \
 	s23_clnt.c \
 	s23_lib.c \
+	s23_meth.c \
 	s23_pkt.c \
-	t1_meth.c \
-	t1_srvr.c \
-	t1_clnt.c \
-	t1_lib.c \
-	t1_enc.c \
-	t1_reneg.c \
-	ssl_lib.c \
-	ssl_err2.c \
-	ssl_cert.c \
-	ssl_sess.c \
-	ssl_ciph.c \
-	ssl_stat.c \
-	ssl_rsa.c \
-	ssl_asn1.c \
-	ssl_txt.c \
+	s23_srvr.c \
+	s2_clnt.c \
+	s2_enc.c \
+	s2_lib.c \
+	s2_meth.c \
+	s2_pkt.c \
+	s2_srvr.c \
+	s3_both.c \
+	s3_clnt.c \
+	s3_enc.c \
+	s3_lib.c \
+	s3_meth.c \
+	s3_pkt.c \
+	s3_srvr.c \
 	ssl_algs.c \
-	bio_ssl.c \
+	ssl_asn1.c \
+	ssl_cert.c \
+	ssl_ciph.c \
 	ssl_err.c \
-	kssl.c
+	ssl_err2.c \
+	ssl_lib.c \
+	ssl_rsa.c \
+	ssl_sess.c \
+	ssl_stat.c \
+	ssl_txt.c \
+	t1_clnt.c \
+	t1_enc.c \
+	t1_lib.c \
+	t1_meth.c \
+	t1_reneg.c \
+	t1_srvr.c \
+	tls_srp.c
 
 #######################################
 # target static library

diff --git a/ssl/Android.mk b/ssl/Android.mk
index 40641a3..619aede 100644
--- a/ssl/Android.mk
+++ b/ssl/Android.mk

@@ -6,43 +6,49 @@
 	external/openssl/crypto
 
 local_src_files:= \
-	s2_meth.c \
-	s2_srvr.c \
-	s2_clnt.c \
-	s2_lib.c \
-	s2_enc.c \
-	s2_pkt.c \
-	s3_meth.c \
-	s3_srvr.c \
-	s3_clnt.c \
-	s3_lib.c \
-	s3_enc.c \
-	s3_pkt.c \
-	s3_both.c \
-	s23_meth.c \
-	s23_srvr.c \
+	bio_ssl.c \
+	d1_both.c \
+	d1_enc.c \
+	d1_lib.c \
+	d1_pkt.c \
+	d1_srtp.c \
+	kssl.c \
 	s23_clnt.c \
 	s23_lib.c \
+	s23_meth.c \
 	s23_pkt.c \
-	t1_meth.c \
-	t1_srvr.c \
-	t1_clnt.c \
-	t1_lib.c \
-	t1_enc.c \
-	t1_reneg.c \
-	ssl_lib.c \
-	ssl_err2.c \
-	ssl_cert.c \
-	ssl_sess.c \
-	ssl_ciph.c \
-	ssl_stat.c \
-	ssl_rsa.c \
-	ssl_asn1.c \
-	ssl_txt.c \
+	s23_srvr.c \
+	s2_clnt.c \
+	s2_enc.c \
+	s2_lib.c \
+	s2_meth.c \
+	s2_pkt.c \
+	s2_srvr.c \
+	s3_both.c \
+	s3_clnt.c \
+	s3_enc.c \
+	s3_lib.c \
+	s3_meth.c \
+	s3_pkt.c \
+	s3_srvr.c \
 	ssl_algs.c \
-	bio_ssl.c \
+	ssl_asn1.c \
+	ssl_cert.c \
+	ssl_ciph.c \
 	ssl_err.c \
-	kssl.c
+	ssl_err2.c \
+	ssl_lib.c \
+	ssl_rsa.c \
+	ssl_sess.c \
+	ssl_stat.c \
+	ssl_txt.c \
+	t1_clnt.c \
+	t1_enc.c \
+	t1_lib.c \
+	t1_meth.c \
+	t1_reneg.c \
+	t1_srvr.c \
+	tls_srp.c
 
 #######################################
 # target static library

diff --git a/ssl/Makefile b/ssl/Makefile
index 2b275fa..feaf3e3 100644
--- a/ssl/Makefile
+++ b/ssl/Makefile

@@ -26,26 +26,26 @@
 	s23_meth.c s23_srvr.c s23_clnt.c s23_lib.c          s23_pkt.c \
 	t1_meth.c   t1_srvr.c t1_clnt.c  t1_lib.c  t1_enc.c \
 	d1_meth.c   d1_srvr.c d1_clnt.c  d1_lib.c  d1_pkt.c \
-	d1_both.c d1_enc.c \
+	d1_both.c d1_enc.c d1_srtp.c \
 	ssl_lib.c ssl_err2.c ssl_cert.c ssl_sess.c \
 	ssl_ciph.c ssl_stat.c ssl_rsa.c \
 	ssl_asn1.c ssl_txt.c ssl_algs.c \
-	bio_ssl.c ssl_err.c kssl.c t1_reneg.c
+	bio_ssl.c ssl_err.c kssl.c tls_srp.c t1_reneg.c
 LIBOBJ= \
 	s2_meth.o  s2_srvr.o  s2_clnt.o  s2_lib.o  s2_enc.o s2_pkt.o \
 	s3_meth.o  s3_srvr.o  s3_clnt.o  s3_lib.o  s3_enc.o s3_pkt.o s3_both.o \
 	s23_meth.o s23_srvr.o s23_clnt.o s23_lib.o          s23_pkt.o \
 	t1_meth.o   t1_srvr.o t1_clnt.o  t1_lib.o  t1_enc.o \
 	d1_meth.o   d1_srvr.o d1_clnt.o  d1_lib.o  d1_pkt.o \
-	d1_both.o d1_enc.o \
+	d1_both.o d1_enc.o d1_srtp.o\
 	ssl_lib.o ssl_err2.o ssl_cert.o ssl_sess.o \
 	ssl_ciph.o ssl_stat.o ssl_rsa.o \
 	ssl_asn1.o ssl_txt.o ssl_algs.o \
-	bio_ssl.o ssl_err.o kssl.o t1_reneg.o
+	bio_ssl.o ssl_err.o kssl.o tls_srp.o t1_reneg.o
 
 SRC= $(LIBSRC)
 
-EXHEADER= ssl.h ssl2.h ssl3.h ssl23.h tls1.h dtls1.h kssl.h
+EXHEADER= ssl.h ssl2.h ssl3.h ssl23.h tls1.h dtls1.h kssl.h srtp.h
 HEADER=	$(EXHEADER) ssl_locl.h kssl_lcl.h
 
 ALL=    $(GENERAL) $(SRC) $(HEADER)
@@ -118,11 +118,11 @@
 bio_ssl.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 bio_ssl.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 bio_ssl.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-bio_ssl.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-bio_ssl.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-bio_ssl.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-bio_ssl.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-bio_ssl.o: ../include/openssl/x509_vfy.h bio_ssl.c
+bio_ssl.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+bio_ssl.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+bio_ssl.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+bio_ssl.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+bio_ssl.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h bio_ssl.c
 d1_both.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 d1_both.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 d1_both.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -137,12 +137,12 @@
 d1_both.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 d1_both.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 d1_both.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-d1_both.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-d1_both.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-d1_both.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-d1_both.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-d1_both.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_both.c
-d1_both.o: ssl_locl.h
+d1_both.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+d1_both.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+d1_both.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+d1_both.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+d1_both.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+d1_both.o: ../include/openssl/x509_vfy.h d1_both.c ssl_locl.h
 d1_clnt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 d1_clnt.o: ../include/openssl/bn.h ../include/openssl/buffer.h
 d1_clnt.o: ../include/openssl/comp.h ../include/openssl/crypto.h
@@ -159,11 +159,12 @@
 d1_clnt.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 d1_clnt.o: ../include/openssl/rand.h ../include/openssl/rsa.h
 d1_clnt.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-d1_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-d1_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-d1_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-d1_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-d1_clnt.o: ../include/openssl/x509_vfy.h d1_clnt.c kssl_lcl.h ssl_locl.h
+d1_clnt.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+d1_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+d1_clnt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+d1_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+d1_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_clnt.c
+d1_clnt.o: kssl_lcl.h ssl_locl.h
 d1_enc.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 d1_enc.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 d1_enc.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -179,11 +180,12 @@
 d1_enc.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 d1_enc.o: ../include/openssl/rand.h ../include/openssl/rsa.h
 d1_enc.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-d1_enc.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-d1_enc.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-d1_enc.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-d1_enc.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-d1_enc.o: ../include/openssl/x509_vfy.h d1_enc.c ssl_locl.h
+d1_enc.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+d1_enc.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+d1_enc.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+d1_enc.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+d1_enc.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_enc.c
+d1_enc.o: ssl_locl.h
 d1_lib.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 d1_lib.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 d1_lib.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -198,11 +200,12 @@
 d1_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 d1_lib.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 d1_lib.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-d1_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-d1_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-d1_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-d1_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-d1_lib.o: ../include/openssl/x509_vfy.h d1_lib.c ssl_locl.h
+d1_lib.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+d1_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+d1_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+d1_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+d1_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_lib.c
+d1_lib.o: ssl_locl.h
 d1_meth.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 d1_meth.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 d1_meth.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -217,11 +220,12 @@
 d1_meth.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 d1_meth.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 d1_meth.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-d1_meth.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-d1_meth.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-d1_meth.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-d1_meth.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-d1_meth.o: ../include/openssl/x509_vfy.h d1_meth.c ssl_locl.h
+d1_meth.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+d1_meth.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+d1_meth.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+d1_meth.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+d1_meth.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_meth.c
+d1_meth.o: ssl_locl.h
 d1_pkt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 d1_pkt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 d1_pkt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -236,12 +240,32 @@
 d1_pkt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 d1_pkt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 d1_pkt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-d1_pkt.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-d1_pkt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-d1_pkt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-d1_pkt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-d1_pkt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_pkt.c
-d1_pkt.o: ssl_locl.h
+d1_pkt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+d1_pkt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+d1_pkt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+d1_pkt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+d1_pkt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+d1_pkt.o: ../include/openssl/x509_vfy.h d1_pkt.c ssl_locl.h
+d1_srtp.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
+d1_srtp.o: ../include/openssl/buffer.h ../include/openssl/comp.h
+d1_srtp.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
+d1_srtp.o: ../include/openssl/dtls1.h ../include/openssl/e_os2.h
+d1_srtp.o: ../include/openssl/ec.h ../include/openssl/ecdh.h
+d1_srtp.o: ../include/openssl/ecdsa.h ../include/openssl/err.h
+d1_srtp.o: ../include/openssl/evp.h ../include/openssl/hmac.h
+d1_srtp.o: ../include/openssl/kssl.h ../include/openssl/lhash.h
+d1_srtp.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h
+d1_srtp.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
+d1_srtp.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
+d1_srtp.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
+d1_srtp.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
+d1_srtp.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+d1_srtp.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+d1_srtp.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+d1_srtp.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+d1_srtp.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+d1_srtp.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_srtp.c
+d1_srtp.o: srtp.h ssl_locl.h
 d1_srvr.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 d1_srvr.o: ../include/openssl/bn.h ../include/openssl/buffer.h
 d1_srvr.o: ../include/openssl/comp.h ../include/openssl/crypto.h
@@ -258,11 +282,12 @@
 d1_srvr.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 d1_srvr.o: ../include/openssl/rand.h ../include/openssl/rsa.h
 d1_srvr.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-d1_srvr.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-d1_srvr.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-d1_srvr.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-d1_srvr.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-d1_srvr.o: ../include/openssl/x509_vfy.h d1_srvr.c ssl_locl.h
+d1_srvr.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+d1_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+d1_srvr.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+d1_srvr.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+d1_srvr.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h d1_srvr.c
+d1_srvr.o: ssl_locl.h
 kssl.o: ../include/openssl/asn1.h ../include/openssl/bio.h
 kssl.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 kssl.o: ../include/openssl/crypto.h ../include/openssl/dtls1.h
@@ -276,11 +301,12 @@
 kssl.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 kssl.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 kssl.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-kssl.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-kssl.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-kssl.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-kssl.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-kssl.o: ../include/openssl/x509_vfy.h kssl.c kssl_lcl.h
+kssl.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+kssl.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+kssl.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+kssl.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+kssl.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h kssl.c
+kssl.o: kssl_lcl.h
 s23_clnt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s23_clnt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s23_clnt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -295,12 +321,12 @@
 s23_clnt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s23_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 s23_clnt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s23_clnt.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s23_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s23_clnt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s23_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s23_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s23_clnt.c
-s23_clnt.o: ssl_locl.h
+s23_clnt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s23_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s23_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s23_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s23_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s23_clnt.o: ../include/openssl/x509_vfy.h s23_clnt.c ssl_locl.h
 s23_lib.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s23_lib.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s23_lib.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -315,11 +341,12 @@
 s23_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s23_lib.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s23_lib.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s23_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s23_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s23_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s23_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s23_lib.o: ../include/openssl/x509_vfy.h s23_lib.c ssl_locl.h
+s23_lib.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s23_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s23_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s23_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s23_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s23_lib.c
+s23_lib.o: ssl_locl.h
 s23_meth.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s23_meth.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s23_meth.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -334,11 +361,12 @@
 s23_meth.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s23_meth.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s23_meth.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s23_meth.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s23_meth.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s23_meth.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s23_meth.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s23_meth.o: ../include/openssl/x509_vfy.h s23_meth.c ssl_locl.h
+s23_meth.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s23_meth.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s23_meth.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s23_meth.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s23_meth.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s23_meth.c
+s23_meth.o: ssl_locl.h
 s23_pkt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s23_pkt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s23_pkt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -353,11 +381,12 @@
 s23_pkt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s23_pkt.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s23_pkt.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s23_pkt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s23_pkt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s23_pkt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s23_pkt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s23_pkt.o: ../include/openssl/x509_vfy.h s23_pkt.c ssl_locl.h
+s23_pkt.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s23_pkt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s23_pkt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s23_pkt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s23_pkt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s23_pkt.c
+s23_pkt.o: ssl_locl.h
 s23_srvr.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s23_srvr.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s23_srvr.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -372,12 +401,12 @@
 s23_srvr.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s23_srvr.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 s23_srvr.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s23_srvr.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s23_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s23_srvr.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s23_srvr.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s23_srvr.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s23_srvr.c
-s23_srvr.o: ssl_locl.h
+s23_srvr.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s23_srvr.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s23_srvr.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s23_srvr.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s23_srvr.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s23_srvr.o: ../include/openssl/x509_vfy.h s23_srvr.c ssl_locl.h
 s2_clnt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_clnt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_clnt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -392,12 +421,12 @@
 s2_clnt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s2_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 s2_clnt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s2_clnt.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s2_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s2_clnt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s2_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s2_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_clnt.c
-s2_clnt.o: ssl_locl.h
+s2_clnt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s2_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s2_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s2_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s2_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s2_clnt.o: ../include/openssl/x509_vfy.h s2_clnt.c ssl_locl.h
 s2_enc.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_enc.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_enc.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -412,11 +441,12 @@
 s2_enc.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s2_enc.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s2_enc.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s2_enc.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s2_enc.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s2_enc.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s2_enc.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s2_enc.o: ../include/openssl/x509_vfy.h s2_enc.c ssl_locl.h
+s2_enc.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s2_enc.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s2_enc.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s2_enc.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s2_enc.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_enc.c
+s2_enc.o: ssl_locl.h
 s2_lib.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_lib.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_lib.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -431,12 +461,12 @@
 s2_lib.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 s2_lib.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 s2_lib.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s2_lib.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s2_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s2_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s2_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s2_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_lib.c
-s2_lib.o: ssl_locl.h
+s2_lib.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s2_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s2_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s2_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s2_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s2_lib.o: ../include/openssl/x509_vfy.h s2_lib.c ssl_locl.h
 s2_meth.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_meth.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_meth.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -451,11 +481,12 @@
 s2_meth.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s2_meth.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s2_meth.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s2_meth.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s2_meth.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s2_meth.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s2_meth.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s2_meth.o: ../include/openssl/x509_vfy.h s2_meth.c ssl_locl.h
+s2_meth.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s2_meth.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s2_meth.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s2_meth.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s2_meth.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_meth.c
+s2_meth.o: ssl_locl.h
 s2_pkt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_pkt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_pkt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -470,11 +501,12 @@
 s2_pkt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s2_pkt.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s2_pkt.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s2_pkt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s2_pkt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s2_pkt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s2_pkt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s2_pkt.o: ../include/openssl/x509_vfy.h s2_pkt.c ssl_locl.h
+s2_pkt.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s2_pkt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s2_pkt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s2_pkt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s2_pkt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_pkt.c
+s2_pkt.o: ssl_locl.h
 s2_srvr.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_srvr.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_srvr.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -489,12 +521,12 @@
 s2_srvr.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s2_srvr.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 s2_srvr.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s2_srvr.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s2_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s2_srvr.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s2_srvr.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s2_srvr.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_srvr.c
-s2_srvr.o: ssl_locl.h
+s2_srvr.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s2_srvr.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s2_srvr.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s2_srvr.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s2_srvr.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s2_srvr.o: ../include/openssl/x509_vfy.h s2_srvr.c ssl_locl.h
 s3_both.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s3_both.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s3_both.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -509,12 +541,12 @@
 s3_both.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s3_both.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 s3_both.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s3_both.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s3_both.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s3_both.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s3_both.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s3_both.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s3_both.c
-s3_both.o: ssl_locl.h
+s3_both.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s3_both.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s3_both.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s3_both.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s3_both.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s3_both.o: ../include/openssl/x509_vfy.h s3_both.c ssl_locl.h
 s3_clnt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s3_clnt.o: ../include/openssl/bn.h ../include/openssl/buffer.h
 s3_clnt.o: ../include/openssl/comp.h ../include/openssl/crypto.h
@@ -531,12 +563,12 @@
 s3_clnt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s3_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 s3_clnt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s3_clnt.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s3_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s3_clnt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s3_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s3_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h kssl_lcl.h
-s3_clnt.o: s3_clnt.c ssl_locl.h
+s3_clnt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s3_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s3_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s3_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s3_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s3_clnt.o: ../include/openssl/x509_vfy.h kssl_lcl.h s3_clnt.c ssl_locl.h
 s3_enc.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s3_enc.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s3_enc.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -551,12 +583,12 @@
 s3_enc.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 s3_enc.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 s3_enc.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s3_enc.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s3_enc.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s3_enc.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s3_enc.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s3_enc.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s3_enc.c
-s3_enc.o: ssl_locl.h
+s3_enc.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s3_enc.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s3_enc.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s3_enc.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s3_enc.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s3_enc.o: ../include/openssl/x509_vfy.h s3_enc.c ssl_locl.h
 s3_lib.o: ../crypto/ec/ec_lcl.h ../e_os.h ../include/openssl/asn1.h
 s3_lib.o: ../include/openssl/bio.h ../include/openssl/bn.h
 s3_lib.o: ../include/openssl/buffer.h ../include/openssl/comp.h
@@ -573,11 +605,12 @@
 s3_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s3_lib.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s3_lib.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s3_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s3_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s3_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s3_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s3_lib.o: ../include/openssl/x509_vfy.h kssl_lcl.h s3_lib.c ssl_locl.h
+s3_lib.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s3_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s3_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s3_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s3_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h kssl_lcl.h
+s3_lib.o: s3_lib.c ssl_locl.h
 s3_meth.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s3_meth.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s3_meth.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -592,11 +625,12 @@
 s3_meth.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s3_meth.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s3_meth.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-s3_meth.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s3_meth.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s3_meth.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s3_meth.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s3_meth.o: ../include/openssl/x509_vfy.h s3_meth.c ssl_locl.h
+s3_meth.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s3_meth.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s3_meth.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s3_meth.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s3_meth.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s3_meth.c
+s3_meth.o: ssl_locl.h
 s3_pkt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s3_pkt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s3_pkt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -609,8 +643,9 @@
 s3_pkt.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
 s3_pkt.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
 s3_pkt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
-s3_pkt.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
-s3_pkt.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+s3_pkt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
+s3_pkt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
+s3_pkt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
 s3_pkt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
 s3_pkt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
 s3_pkt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
@@ -632,12 +667,12 @@
 s3_srvr.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 s3_srvr.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 s3_srvr.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s3_srvr.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-s3_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-s3_srvr.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-s3_srvr.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-s3_srvr.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h kssl_lcl.h
-s3_srvr.o: s3_srvr.c ssl_locl.h
+s3_srvr.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+s3_srvr.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+s3_srvr.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+s3_srvr.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+s3_srvr.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+s3_srvr.o: ../include/openssl/x509_vfy.h kssl_lcl.h s3_srvr.c ssl_locl.h
 ssl_algs.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_algs.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_algs.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -652,11 +687,12 @@
 ssl_algs.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 ssl_algs.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 ssl_algs.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_algs.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_algs.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_algs.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_algs.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_algs.o: ../include/openssl/x509_vfy.h ssl_algs.c ssl_locl.h
+ssl_algs.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_algs.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_algs.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_algs.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_algs.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_algs.c
+ssl_algs.o: ssl_locl.h
 ssl_asn1.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/asn1_mac.h
 ssl_asn1.o: ../include/openssl/bio.h ../include/openssl/buffer.h
 ssl_asn1.o: ../include/openssl/comp.h ../include/openssl/crypto.h
@@ -671,12 +707,12 @@
 ssl_asn1.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 ssl_asn1.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 ssl_asn1.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-ssl_asn1.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-ssl_asn1.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-ssl_asn1.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-ssl_asn1.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-ssl_asn1.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_asn1.c
-ssl_asn1.o: ssl_locl.h
+ssl_asn1.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+ssl_asn1.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+ssl_asn1.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+ssl_asn1.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+ssl_asn1.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+ssl_asn1.o: ../include/openssl/x509_vfy.h ssl_asn1.c ssl_locl.h
 ssl_cert.o: ../crypto/o_dir.h ../e_os.h ../include/openssl/asn1.h
 ssl_cert.o: ../include/openssl/bio.h ../include/openssl/bn.h
 ssl_cert.o: ../include/openssl/buffer.h ../include/openssl/comp.h
@@ -693,12 +729,12 @@
 ssl_cert.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 ssl_cert.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 ssl_cert.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_cert.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_cert.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_cert.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_cert.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_cert.o: ../include/openssl/x509_vfy.h ../include/openssl/x509v3.h
-ssl_cert.o: ssl_cert.c ssl_locl.h
+ssl_cert.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_cert.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_cert.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_cert.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_cert.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h
+ssl_cert.o: ../include/openssl/x509v3.h ssl_cert.c ssl_locl.h
 ssl_ciph.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_ciph.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_ciph.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -713,12 +749,12 @@
 ssl_ciph.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 ssl_ciph.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 ssl_ciph.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-ssl_ciph.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-ssl_ciph.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-ssl_ciph.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-ssl_ciph.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-ssl_ciph.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_ciph.c
-ssl_ciph.o: ssl_locl.h
+ssl_ciph.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+ssl_ciph.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+ssl_ciph.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+ssl_ciph.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+ssl_ciph.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+ssl_ciph.o: ../include/openssl/x509_vfy.h ssl_ciph.c ssl_locl.h
 ssl_err.o: ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_err.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_err.o: ../include/openssl/crypto.h ../include/openssl/dtls1.h
@@ -732,11 +768,11 @@
 ssl_err.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 ssl_err.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 ssl_err.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_err.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_err.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_err.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_err.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_err.o: ../include/openssl/x509_vfy.h ssl_err.c
+ssl_err.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_err.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_err.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_err.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_err.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_err.c
 ssl_err2.o: ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_err2.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_err2.o: ../include/openssl/crypto.h ../include/openssl/dtls1.h
@@ -750,11 +786,11 @@
 ssl_err2.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 ssl_err2.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 ssl_err2.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_err2.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_err2.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_err2.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_err2.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_err2.o: ../include/openssl/x509_vfy.h ssl_err2.c
+ssl_err2.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_err2.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_err2.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_err2.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_err2.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_err2.c
 ssl_lib.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_lib.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_lib.o: ../include/openssl/conf.h ../include/openssl/crypto.h
@@ -771,12 +807,13 @@
 ssl_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 ssl_lib.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 ssl_lib.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-ssl_lib.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-ssl_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-ssl_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-ssl_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-ssl_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h
-ssl_lib.o: ../include/openssl/x509v3.h kssl_lcl.h ssl_lib.c ssl_locl.h
+ssl_lib.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+ssl_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+ssl_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+ssl_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+ssl_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+ssl_lib.o: ../include/openssl/x509_vfy.h ../include/openssl/x509v3.h kssl_lcl.h
+ssl_lib.o: ssl_lib.c ssl_locl.h
 ssl_rsa.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_rsa.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_rsa.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -791,11 +828,12 @@
 ssl_rsa.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 ssl_rsa.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 ssl_rsa.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_rsa.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_rsa.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_rsa.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_rsa.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_rsa.o: ../include/openssl/x509_vfy.h ssl_locl.h ssl_rsa.c
+ssl_rsa.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_rsa.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_rsa.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_rsa.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_rsa.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
+ssl_rsa.o: ssl_rsa.c
 ssl_sess.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_sess.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_sess.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -811,11 +849,12 @@
 ssl_sess.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
 ssl_sess.o: ../include/openssl/rand.h ../include/openssl/rsa.h
 ssl_sess.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_sess.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_sess.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_sess.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_sess.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_sess.o: ../include/openssl/x509_vfy.h ssl_locl.h ssl_sess.c
+ssl_sess.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_sess.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_sess.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_sess.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_sess.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
+ssl_sess.o: ssl_sess.c
 ssl_stat.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_stat.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_stat.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -830,11 +869,12 @@
 ssl_stat.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 ssl_stat.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 ssl_stat.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_stat.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_stat.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_stat.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_stat.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_stat.o: ../include/openssl/x509_vfy.h ssl_locl.h ssl_stat.c
+ssl_stat.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_stat.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_stat.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_stat.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_stat.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
+ssl_stat.o: ssl_stat.c
 ssl_txt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 ssl_txt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 ssl_txt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -849,11 +889,12 @@
 ssl_txt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 ssl_txt.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 ssl_txt.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-ssl_txt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-ssl_txt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-ssl_txt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-ssl_txt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-ssl_txt.o: ../include/openssl/x509_vfy.h ssl_locl.h ssl_txt.c
+ssl_txt.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+ssl_txt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+ssl_txt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+ssl_txt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+ssl_txt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
+ssl_txt.o: ssl_txt.c
 t1_clnt.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 t1_clnt.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 t1_clnt.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -868,12 +909,12 @@
 t1_clnt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 t1_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 t1_clnt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-t1_clnt.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-t1_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-t1_clnt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-t1_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-t1_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
-t1_clnt.o: t1_clnt.c
+t1_clnt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+t1_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+t1_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+t1_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+t1_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+t1_clnt.o: ../include/openssl/x509_vfy.h ssl_locl.h t1_clnt.c
 t1_enc.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 t1_enc.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 t1_enc.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -887,8 +928,9 @@
 t1_enc.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h
 t1_enc.o: ../include/openssl/pem.h ../include/openssl/pem2.h
 t1_enc.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
-t1_enc.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-t1_enc.o: ../include/openssl/sha.h ../include/openssl/ssl.h
+t1_enc.o: ../include/openssl/rand.h ../include/openssl/rsa.h
+t1_enc.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+t1_enc.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
 t1_enc.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
 t1_enc.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
 t1_enc.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
@@ -907,8 +949,9 @@
 t1_lib.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
 t1_lib.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
 t1_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
-t1_lib.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
-t1_lib.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+t1_lib.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
+t1_lib.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
+t1_lib.o: ../include/openssl/sha.h ../include/openssl/srtp.h
 t1_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
 t1_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
 t1_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
@@ -929,11 +972,12 @@
 t1_meth.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 t1_meth.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 t1_meth.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-t1_meth.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-t1_meth.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-t1_meth.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-t1_meth.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-t1_meth.o: ../include/openssl/x509_vfy.h ssl_locl.h t1_meth.c
+t1_meth.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+t1_meth.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+t1_meth.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+t1_meth.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+t1_meth.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
+t1_meth.o: t1_meth.c
 t1_reneg.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 t1_reneg.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 t1_reneg.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -948,11 +992,12 @@
 t1_reneg.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 t1_reneg.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 t1_reneg.o: ../include/openssl/safestack.h ../include/openssl/sha.h
-t1_reneg.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-t1_reneg.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-t1_reneg.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-t1_reneg.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-t1_reneg.o: ../include/openssl/x509_vfy.h ssl_locl.h t1_reneg.c
+t1_reneg.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+t1_reneg.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+t1_reneg.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+t1_reneg.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+t1_reneg.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
+t1_reneg.o: t1_reneg.c
 t1_srvr.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 t1_srvr.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 t1_srvr.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -967,9 +1012,30 @@
 t1_srvr.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
 t1_srvr.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
 t1_srvr.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-t1_srvr.o: ../include/openssl/sha.h ../include/openssl/ssl.h
-t1_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
-t1_srvr.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
-t1_srvr.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
-t1_srvr.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssl_locl.h
-t1_srvr.o: t1_srvr.c
+t1_srvr.o: ../include/openssl/sha.h ../include/openssl/srtp.h
+t1_srvr.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+t1_srvr.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+t1_srvr.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+t1_srvr.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+t1_srvr.o: ../include/openssl/x509_vfy.h ssl_locl.h t1_srvr.c
+tls_srp.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
+tls_srp.o: ../include/openssl/bn.h ../include/openssl/buffer.h
+tls_srp.o: ../include/openssl/comp.h ../include/openssl/crypto.h
+tls_srp.o: ../include/openssl/dsa.h ../include/openssl/dtls1.h
+tls_srp.o: ../include/openssl/e_os2.h ../include/openssl/ec.h
+tls_srp.o: ../include/openssl/ecdh.h ../include/openssl/ecdsa.h
+tls_srp.o: ../include/openssl/err.h ../include/openssl/evp.h
+tls_srp.o: ../include/openssl/hmac.h ../include/openssl/kssl.h
+tls_srp.o: ../include/openssl/lhash.h ../include/openssl/obj_mac.h
+tls_srp.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h
+tls_srp.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h
+tls_srp.o: ../include/openssl/pem.h ../include/openssl/pem2.h
+tls_srp.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
+tls_srp.o: ../include/openssl/rand.h ../include/openssl/rsa.h
+tls_srp.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+tls_srp.o: ../include/openssl/srp.h ../include/openssl/srtp.h
+tls_srp.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
+tls_srp.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
+tls_srp.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
+tls_srp.o: ../include/openssl/tls1.h ../include/openssl/x509.h
+tls_srp.o: ../include/openssl/x509_vfy.h ssl_locl.h tls_srp.c

diff --git a/ssl/d1_both.c b/ssl/d1_both.c
index 2f95880..de8bab8 100644
--- a/ssl/d1_both.c
+++ b/ssl/d1_both.c

@@ -1084,7 +1084,11 @@
 		return code;
 		}
 
-	if ( ! SSL_in_init(s))  /* done, no need to send a retransmit */
+#ifndef OPENSSL_NO_HEARTBEATS
+	if (!SSL_in_init(s) && !s->tlsext_hb_pending)  /* done, no need to send a retransmit */
+#else
+	if (!SSL_in_init(s))  /* done, no need to send a retransmit */
+#endif
 		{
 		BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ);
 		return code;
@@ -1417,3 +1421,171 @@
 
 	ccs_hdr->type = *(data++);
 	}
+
+int dtls1_shutdown(SSL *s)
+	{
+	int ret;
+#ifndef OPENSSL_NO_SCTP
+	if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+	    !(s->shutdown & SSL_SENT_SHUTDOWN))
+		{
+		ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s));
+		if (ret < 0) return -1;
+
+		if (ret == 0)
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 1, NULL);
+		}
+#endif
+	ret = ssl3_shutdown(s);
+#ifndef OPENSSL_NO_SCTP
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 0, NULL);
+#endif
+	return ret;
+	}
+
+#ifndef OPENSSL_NO_HEARTBEATS
+int
+dtls1_process_heartbeat(SSL *s)
+	{
+	unsigned char *p = &s->s3->rrec.data[0], *pl;
+	unsigned short hbtype;
+	unsigned int payload;
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Read type and payload length first */
+	hbtype = *p++;
+	n2s(p, payload);
+	pl = p;
+
+	if (s->msg_callback)
+		s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT,
+			&s->s3->rrec.data[0], s->s3->rrec.length,
+			s, s->msg_callback_arg);
+
+	if (hbtype == TLS1_HB_REQUEST)
+		{
+		unsigned char *buffer, *bp;
+		int r;
+
+		/* Allocate memory for the response, size is 1 byte
+		 * message type, plus 2 bytes payload length, plus
+		 * payload, plus padding
+		 */
+		buffer = OPENSSL_malloc(1 + 2 + payload + padding);
+		bp = buffer;
+
+		/* Enter response type, length and copy payload */
+		*bp++ = TLS1_HB_RESPONSE;
+		s2n(payload, bp);
+		memcpy(bp, pl, payload);
+		bp += payload;
+		/* Random padding */
+		RAND_pseudo_bytes(bp, padding);
+
+		r = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding);
+
+		if (r >= 0 && s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buffer, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		OPENSSL_free(buffer);
+
+		if (r < 0)
+			return r;
+		}
+	else if (hbtype == TLS1_HB_RESPONSE)
+		{
+		unsigned int seq;
+
+		/* We only send sequence numbers (2 bytes unsigned int),
+		 * and 16 random bytes, so we just try to read the
+		 * sequence number */
+		n2s(pl, seq);
+
+		if (payload == 18 && seq == s->tlsext_hb_seq)
+			{
+			dtls1_stop_timer(s);
+			s->tlsext_hb_seq++;
+			s->tlsext_hb_pending = 0;
+			}
+		}
+
+	return 0;
+	}
+
+int
+dtls1_heartbeat(SSL *s)
+	{
+	unsigned char *buf, *p;
+	int ret;
+	unsigned int payload = 18; /* Sequence number + random bytes */
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Only send if peer supports and accepts HB requests... */
+	if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) ||
+	    s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS)
+		{
+		SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT);
+		return -1;
+		}
+
+	/* ...and there is none in flight yet... */
+	if (s->tlsext_hb_pending)
+		{
+		SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING);
+		return -1;
+		}
+
+	/* ...and no handshake in progress. */
+	if (SSL_in_init(s) || s->in_handshake)
+		{
+		SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE);
+		return -1;
+		}
+
+	/* Check if padding is too long, payload and padding
+	 * must not exceed 2^14 - 3 = 16381 bytes in total.
+	 */
+	OPENSSL_assert(payload + padding <= 16381);
+
+	/* Create HeartBeat message, we just use a sequence number
+	 * as payload to distuingish different messages and add
+	 * some random stuff.
+	 *  - Message Type, 1 byte
+	 *  - Payload Length, 2 bytes (unsigned int)
+	 *  - Payload, the sequence number (2 bytes uint)
+	 *  - Payload, random bytes (16 bytes uint)
+	 *  - Padding
+	 */
+	buf = OPENSSL_malloc(1 + 2 + payload + padding);
+	p = buf;
+	/* Message Type */
+	*p++ = TLS1_HB_REQUEST;
+	/* Payload length (18 bytes here) */
+	s2n(payload, p);
+	/* Sequence number */
+	s2n(s->tlsext_hb_seq, p);
+	/* 16 random bytes */
+	RAND_pseudo_bytes(p, 16);
+	p += 16;
+	/* Random padding */
+	RAND_pseudo_bytes(p, padding);
+
+	ret = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding);
+	if (ret >= 0)
+		{
+		if (s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buf, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		dtls1_start_timer(s);
+		s->tlsext_hb_pending = 1;
+		}
+
+	OPENSSL_free(buf);
+
+	return ret;
+	}
+#endif

diff --git a/ssl/d1_clnt.c b/ssl/d1_clnt.c
index 02fc704..7e8077e 100644
--- a/ssl/d1_clnt.c
+++ b/ssl/d1_clnt.c

@@ -150,7 +150,11 @@
 	unsigned long Time=(unsigned long)time(NULL);
 	void (*cb)(const SSL *ssl,int type,int val)=NULL;
 	int ret= -1;
-	int new_state,state,skip=0;;
+	int new_state,state,skip=0;
+#ifndef OPENSSL_NO_SCTP
+	unsigned char sctpauthkey[64];
+	char labelbuffer[sizeof(DTLS1_SCTP_AUTH_LABEL)];
+#endif
 
 	RAND_add(&Time,sizeof(Time),0);
 	ERR_clear_error();
@@ -164,6 +168,27 @@
 	s->in_handshake++;
 	if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s); 
 
+#ifndef OPENSSL_NO_SCTP
+	/* Notify SCTP BIO socket to enter handshake
+	 * mode and prevent stream identifier other
+	 * than 0. Will be ignored if no SCTP is used.
+	 */
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		dtls1_stop_timer(s);
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -171,7 +196,7 @@
 		switch(s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			s->state=SSL_ST_CONNECT;
 			s->ctx->stats.sess_connect_renegotiate++;
 			/* break */
@@ -226,6 +251,42 @@
 			s->hit = 0;
 			break;
 
+#ifndef OPENSSL_NO_SCTP
+		case DTLS1_SCTP_ST_CR_READ_SOCK:
+
+			if (BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))
+			{
+				s->s3->in_read_app_data=2;
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				ret = -1;
+				goto end;
+			}
+
+			s->state=s->s3->tmp.next_state;
+			break;
+
+		case DTLS1_SCTP_ST_CW_WRITE_SOCK:
+			/* read app data until dry event */
+
+			ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s));
+			if (ret < 0) goto end;
+
+			if (ret == 0)
+			{
+				s->s3->in_read_app_data=2;
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				ret = -1;
+				goto end;
+			}
+
+			s->state=s->d1->next_state;
+			break;
+#endif
+
 		case SSL3_ST_CW_CLNT_HELLO_A:
 		case SSL3_ST_CW_CLNT_HELLO_B:
 
@@ -248,9 +309,17 @@
 
 			s->init_num=0;
 
-			/* turn on buffering for the next lot of output */
-			if (s->bbio != s->wbio)
-				s->wbio=BIO_push(s->bbio,s->wbio);
+#ifndef OPENSSL_NO_SCTP
+			/* Disable buffering for SCTP */
+			if (!BIO_dgram_is_sctp(SSL_get_wbio(s)))
+				{
+#endif
+				/* turn on buffering for the next lot of output */
+				if (s->bbio != s->wbio)
+					s->wbio=BIO_push(s->bbio,s->wbio);
+#ifndef OPENSSL_NO_SCTP
+				}
+#endif
 
 			break;
 
@@ -261,7 +330,24 @@
 			else
 				{
 				if (s->hit)
+					{
+#ifndef OPENSSL_NO_SCTP
+					/* Add new shared key for SCTP-Auth,
+					 * will be ignored if no SCTP used.
+					 */
+					snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+					         DTLS1_SCTP_AUTH_LABEL);
+
+					SSL_export_keying_material(s, sctpauthkey,
+					                           sizeof(sctpauthkey), labelbuffer,
+					                           sizeof(labelbuffer), NULL, 0, 0);
+
+					BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+							 sizeof(sctpauthkey), sctpauthkey);
+#endif
+
 					s->state=SSL3_ST_CR_FINISHED_A;
+					}
 				else
 					s->state=DTLS1_ST_CR_HELLO_VERIFY_REQUEST_A;
 				}
@@ -355,11 +441,18 @@
 			if (ret <= 0) goto end;
 			dtls1_stop_timer(s);
 			if (s->s3->tmp.cert_req)
-				s->state=SSL3_ST_CW_CERT_A;
+				s->s3->tmp.next_state=SSL3_ST_CW_CERT_A;
 			else
-				s->state=SSL3_ST_CW_KEY_EXCH_A;
+				s->s3->tmp.next_state=SSL3_ST_CW_KEY_EXCH_A;
 			s->init_num=0;
 
+#ifndef OPENSSL_NO_SCTP			
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+			    state == SSL_ST_RENEGOTIATE)
+				s->state=DTLS1_SCTP_ST_CR_READ_SOCK;
+			else
+#endif			
+			s->state=s->s3->tmp.next_state;
 			break;
 
 		case SSL3_ST_CW_CERT_A:
@@ -378,6 +471,22 @@
 			dtls1_start_timer(s);
 			ret=dtls1_send_client_key_exchange(s);
 			if (ret <= 0) goto end;
+
+#ifndef OPENSSL_NO_SCTP
+			/* Add new shared key for SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+			         DTLS1_SCTP_AUTH_LABEL);
+
+			SSL_export_keying_material(s, sctpauthkey,
+			                           sizeof(sctpauthkey), labelbuffer,
+			                           sizeof(labelbuffer), NULL, 0, 0);
+
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+					 sizeof(sctpauthkey), sctpauthkey);
+#endif
+
 			/* EAY EAY EAY need to check for DH fix cert
 			 * sent back */
 			/* For TLS, cert_req is set to 2, so a cert chain
@@ -388,7 +497,15 @@
 				}
 			else
 				{
-				s->state=SSL3_ST_CW_CHANGE_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state=SSL3_ST_CW_CHANGE_A;
+					s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+					}
+				else
+#endif
+					s->state=SSL3_ST_CW_CHANGE_A;
 				s->s3->change_cipher_spec=0;
 				}
 
@@ -400,7 +517,15 @@
 			dtls1_start_timer(s);
 			ret=dtls1_send_client_verify(s);
 			if (ret <= 0) goto end;
-			s->state=SSL3_ST_CW_CHANGE_A;
+#ifndef OPENSSL_NO_SCTP
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+			{
+				s->d1->next_state=SSL3_ST_CW_CHANGE_A;
+				s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+			}
+			else
+#endif
+				s->state=SSL3_ST_CW_CHANGE_A;
 			s->init_num=0;
 			s->s3->change_cipher_spec=0;
 			break;
@@ -412,6 +537,14 @@
 			ret=dtls1_send_change_cipher_spec(s,
 				SSL3_ST_CW_CHANGE_A,SSL3_ST_CW_CHANGE_B);
 			if (ret <= 0) goto end;
+
+#ifndef OPENSSL_NO_SCTP
+			/* Change to new shared key of SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY, 0, NULL);
+#endif
+
 			s->state=SSL3_ST_CW_FINISHED_A;
 			s->init_num=0;
 
@@ -457,9 +590,23 @@
 			if (s->hit)
 				{
 				s->s3->tmp.next_state=SSL_ST_OK;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+						s->d1->next_state = s->s3->tmp.next_state;
+						s->s3->tmp.next_state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+					}
+#endif
 				if (s->s3->flags & SSL3_FLAGS_DELAY_CLIENT_FINISHED)
 					{
 					s->state=SSL_ST_OK;
+#ifndef OPENSSL_NO_SCTP
+					if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+						{
+							s->d1->next_state = SSL_ST_OK;
+							s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+						}
+#endif
 					s->s3->flags|=SSL3_FLAGS_POP_BUFFER;
 					s->s3->delay_buf_pop_ret=0;
 					}
@@ -508,6 +655,16 @@
 				s->state=SSL3_ST_CW_CHANGE_A;
 			else
 				s->state=SSL_ST_OK;
+
+#ifndef OPENSSL_NO_SCTP
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+				state == SSL_ST_RENEGOTIATE)
+				{
+				s->d1->next_state=s->state;
+				s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+				}
+#endif
+
 			s->init_num=0;
 			break;
 
@@ -515,6 +672,13 @@
 			s->rwstate=SSL_WRITING;
 			if (BIO_flush(s->wbio) <= 0)
 				{
+				/* If the write error was fatal, stop trying */
+				if (!BIO_should_retry(s->wbio))
+					{
+					s->rwstate=SSL_NOTHING;
+					s->state=s->s3->tmp.next_state;
+					}
+				
 				ret= -1;
 				goto end;
 				}
@@ -541,6 +705,7 @@
 			/* else do it later in ssl3_write */
 
 			s->init_num=0;
+			s->renegotiate=0;
 			s->new_session=0;
 
 			ssl_update_cache(s,SSL_SESS_CACHE_CLIENT);
@@ -587,6 +752,15 @@
 		}
 end:
 	s->in_handshake--;
+	
+#ifndef OPENSSL_NO_SCTP
+	/* Notify SCTP BIO socket to leave handshake
+	 * mode and allow stream identifier other
+	 * than 0. Will be ignored if no SCTP is used.
+	 */
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
+
 	if (buf != NULL)
 		BUF_MEM_free(buf);
 	if (cb != NULL)

diff --git a/ssl/d1_lib.c b/ssl/d1_lib.c
index 8479932..56f6253 100644
--- a/ssl/d1_lib.c
+++ b/ssl/d1_lib.c

@@ -82,6 +82,7 @@
 	TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE,
 	TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE,
 	tls1_alert_code,
+	tls1_export_keying_material,
 	};
 
 long dtls1_default_timeout(void)
@@ -291,6 +292,15 @@
 
 void dtls1_start_timer(SSL *s)
 	{
+#ifndef OPENSSL_NO_SCTP
+	/* Disable timer for SCTP */
+	if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+		{
+		memset(&(s->d1->next_timeout), 0, sizeof(struct timeval));
+		return;
+		}
+#endif
+
 	/* If timer is not set, initialize duration with 1 second */
 	if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0)
 		{
@@ -402,7 +412,7 @@
 	if (s->d1->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT)
 		{
 		/* fail the connection, enough alerts have been sent */
-		SSLerr(SSL_F_DTLS1_CHECK_TIMEOUT_NUM,SSL_R_READ_TIMEOUT_EXPIRED);
+		SSLerr(SSL_F_DTLS1_HANDLE_TIMEOUT,SSL_R_READ_TIMEOUT_EXPIRED);
 		return -1;
 		}
 
@@ -428,6 +438,14 @@
 		s->d1->timeout.read_timeouts = 1;
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	if (s->tlsext_hb_pending)
+		{
+		s->tlsext_hb_pending = 0;
+		return dtls1_heartbeat(s);
+		}
+#endif
+
 	dtls1_start_timer(s);
 	return dtls1_retransmit_buffered_messages(s);
 	}

diff --git a/ssl/d1_pkt.c b/ssl/d1_pkt.c
index 7cc6973..e3e9864 100644
--- a/ssl/d1_pkt.c
+++ b/ssl/d1_pkt.c

@@ -231,6 +231,14 @@
 
 	item->data = rdata;
 
+#ifndef OPENSSL_NO_SCTP
+	/* Store bio_dgram_sctp_rcvinfo struct */
+	if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+	    (s->state == SSL3_ST_SR_FINISHED_A || s->state == SSL3_ST_CR_FINISHED_A)) {
+		BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SCTP_GET_RCVINFO, sizeof(rdata->recordinfo), &rdata->recordinfo);
+	}
+#endif
+
 	/* insert should not fail, since duplicates are dropped */
 	if (pqueue_insert(queue->q, item) == NULL)
 		{
@@ -658,20 +666,28 @@
 		goto again;   /* get another record */
 		}
 
-	/* Check whether this is a repeat, or aged record.
-	 * Don't check if we're listening and this message is
-	 * a ClientHello. They can look as if they're replayed,
-	 * since they arrive from different connections and
-	 * would be dropped unnecessarily.
-	 */
-	if (!(s->d1->listen && rr->type == SSL3_RT_HANDSHAKE &&
-		*p == SSL3_MT_CLIENT_HELLO) &&
-		!dtls1_record_replay_check(s, bitmap))
-		{
-		rr->length = 0;
-		s->packet_length=0; /* dump this record */
-		goto again;     /* get another record */
-		}
+#ifndef OPENSSL_NO_SCTP
+	/* Only do replay check if no SCTP bio */
+	if (!BIO_dgram_is_sctp(SSL_get_rbio(s)))
+  		{
+#endif
+		/* Check whether this is a repeat, or aged record.
+		 * Don't check if we're listening and this message is
+		 * a ClientHello. They can look as if they're replayed,
+		 * since they arrive from different connections and
+		 * would be dropped unnecessarily.
+		 */
+		if (!(s->d1->listen && rr->type == SSL3_RT_HANDSHAKE &&
+		    *p == SSL3_MT_CLIENT_HELLO) &&
+		    !dtls1_record_replay_check(s, bitmap))
+			{
+			rr->length = 0;
+			s->packet_length=0; /* dump this record */
+			goto again;     /* get another record */
+			}
+#ifndef OPENSSL_NO_SCTP
+  		}
+#endif
 
 	/* just read a 0 length packet */
 	if (rr->length == 0) goto again;
@@ -756,7 +772,17 @@
 
 	/* Now s->d1->handshake_fragment_len == 0 if type == SSL3_RT_HANDSHAKE. */
 
+#ifndef OPENSSL_NO_SCTP
+	/* Continue handshake if it had to be interrupted to read
+	 * app data with SCTP.
+	 */
+	if ((!s->in_handshake && SSL_in_init(s)) ||
+	    (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+	     (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK) &&
+	     s->s3->in_read_app_data != 2))
+#else
 	if (!s->in_handshake && SSL_in_init(s))
+#endif
 		{
 		/* type == SSL3_RT_APPLICATION_DATA */
 		i=s->handshake_func(s);
@@ -787,6 +813,15 @@
 		item = pqueue_pop(s->d1->buffered_app_data.q);
 		if (item)
 			{
+#ifndef OPENSSL_NO_SCTP
+			/* Restore bio_dgram_sctp_rcvinfo struct */
+			if (BIO_dgram_is_sctp(SSL_get_rbio(s)))
+				{
+				DTLS1_RECORD_DATA *rdata = (DTLS1_RECORD_DATA *) item->data;
+				BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SCTP_SET_RCVINFO, sizeof(rdata->recordinfo), &rdata->recordinfo);
+				}
+#endif
+
 			dtls1_copy_record(s, item);
 
 			OPENSSL_free(item->data);
@@ -869,6 +904,31 @@
 				rr->off=0;
 				}
 			}
+
+#ifndef OPENSSL_NO_SCTP
+			/* We were about to renegotiate but had to read
+			 * belated application data first, so retry.
+			 */
+			if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+			    rr->type == SSL3_RT_APPLICATION_DATA &&
+			    (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK))
+				{
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				}
+
+			/* We might had to delay a close_notify alert because
+			 * of reordered app data. If there was an alert and there
+			 * is no message to read anymore, finally set shutdown.
+			 */
+			if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+			    s->d1->shutdown_received && !BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))
+				{
+				s->shutdown |= SSL_RECEIVED_SHUTDOWN;
+				return(0);
+				}
+#endif			
 		return(n);
 		}
 
@@ -896,6 +956,19 @@
 			dest = s->d1->alert_fragment;
 			dest_len = &s->d1->alert_fragment_len;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (rr->type == TLS1_RT_HEARTBEAT)
+			{
+			dtls1_process_heartbeat(s);
+
+			/* Exit and notify application to read again */
+			rr->length = 0;
+			s->rwstate=SSL_READING;
+			BIO_clear_retry_flags(SSL_get_rbio(s));
+			BIO_set_retry_read(SSL_get_rbio(s));
+			return(-1);
+			}
+#endif
 		/* else it's a CCS message, or application data or wrong */
 		else if (rr->type != SSL3_RT_CHANGE_CIPHER_SPEC)
 			{
@@ -979,6 +1052,7 @@
 			!(s->s3->flags & SSL3_FLAGS_NO_RENEGOTIATE_CIPHERS) &&
 			!s->s3->renegotiate)
 			{
+			s->new_session = 1;
 			ssl3_renegotiate(s);
 			if (ssl3_renegotiate_check(s))
 				{
@@ -1040,6 +1114,21 @@
 			s->s3->warn_alert = alert_descr;
 			if (alert_descr == SSL_AD_CLOSE_NOTIFY)
 				{
+#ifndef OPENSSL_NO_SCTP
+				/* With SCTP and streams the socket may deliver app data
+				 * after a close_notify alert. We have to check this
+				 * first so that nothing gets discarded.
+				 */
+				if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+					BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))
+					{
+					s->d1->shutdown_received = 1;
+					s->rwstate=SSL_READING;
+					BIO_clear_retry_flags(SSL_get_rbio(s));
+					BIO_set_retry_read(SSL_get_rbio(s));
+					return -1;
+					}
+#endif
 				s->shutdown |= SSL_RECEIVED_SHUTDOWN;
 				return(0);
 				}
@@ -1146,6 +1235,15 @@
 		if (s->version == DTLS1_BAD_VER)
 			s->d1->handshake_read_seq++;
 
+#ifndef OPENSSL_NO_SCTP
+		/* Remember that a CCS has been received,
+		 * so that an old key of SCTP-Auth can be
+		 * deleted when a CCS is sent. Will be ignored
+		 * if no SCTP is used
+		 */
+		BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD, 1, NULL);
+#endif
+
 		goto start;
 		}
 
@@ -1188,6 +1286,7 @@
 #else
 			s->state = s->server ? SSL_ST_ACCEPT : SSL_ST_CONNECT;
 #endif
+			s->renegotiate=1;
 			s->new_session=1;
 			}
 		i=s->handshake_func(s);
@@ -1284,7 +1383,16 @@
 	{
 	int i;
 
-	if (SSL_in_init(s) && !s->in_handshake)
+#ifndef OPENSSL_NO_SCTP
+		/* Check if we have to continue an interrupted handshake
+		 * for reading belated app data with SCTP.
+		 */
+		if ((SSL_in_init(s) && !s->in_handshake) ||
+		    (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+		     (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK)))
+#else
+		if (SSL_in_init(s) && !s->in_handshake)
+#endif
 		{
 		i=s->handshake_func(s);
 		if (i < 0) return(i);

diff --git a/ssl/d1_srtp.c b/ssl/d1_srtp.c
new file mode 100644
index 0000000..928935b
--- /dev/null
+++ b/ssl/d1_srtp.c

@@ -0,0 +1,493 @@
+/* ssl/t1_lib.c */
+/* Copyright (C) 1995-1998 Eric Young ([email protected])
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young ([email protected]).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson ([email protected]).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young ([email protected])"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson ([email protected])"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+/*
+  DTLS code by Eric Rescorla <[email protected]>
+
+  Copyright (C) 2006, Network Resonance, Inc.
+  Copyright (C) 2011, RTFM, Inc.
+*/
+
+#ifndef OPENSSL_NO_SRTP
+
+#include <stdio.h>
+#include <openssl/objects.h>
+#include "ssl_locl.h"
+#include "srtp.h"
+
+
+static SRTP_PROTECTION_PROFILE srtp_known_profiles[]=
+    {
+    {
+    "SRTP_AES128_CM_SHA1_80",
+    SRTP_AES128_CM_SHA1_80,
+    },
+    {
+    "SRTP_AES128_CM_SHA1_32",
+    SRTP_AES128_CM_SHA1_32,
+    },
+#if 0
+    {
+    "SRTP_NULL_SHA1_80",
+    SRTP_NULL_SHA1_80,
+    },
+    {
+    "SRTP_NULL_SHA1_32",
+    SRTP_NULL_SHA1_32,
+    },
+#endif
+    {0}
+    };
+
+static int find_profile_by_name(char *profile_name,
+				SRTP_PROTECTION_PROFILE **pptr,unsigned len)
+	{
+	SRTP_PROTECTION_PROFILE *p;
+
+	p=srtp_known_profiles;
+	while(p->name)
+		{
+		if((len == strlen(p->name)) && !strncmp(p->name,profile_name,
+							len))
+			{
+			*pptr=p;
+			return 0;
+			}
+
+		p++;
+		}
+
+	return 1;
+	}
+
+static int find_profile_by_num(unsigned profile_num,
+			       SRTP_PROTECTION_PROFILE **pptr)
+	{
+	SRTP_PROTECTION_PROFILE *p;
+
+	p=srtp_known_profiles;
+	while(p->name)
+		{
+		if(p->id == profile_num)
+			{
+			*pptr=p;
+			return 0;
+			}
+		p++;
+		}
+
+	return 1;
+	}
+
+static int ssl_ctx_make_profiles(const char *profiles_string,STACK_OF(SRTP_PROTECTION_PROFILE) **out)
+	{
+	STACK_OF(SRTP_PROTECTION_PROFILE) *profiles;
+
+	char *col;
+	char *ptr=(char *)profiles_string;
+    
+	SRTP_PROTECTION_PROFILE *p;
+
+	if(!(profiles=sk_SRTP_PROTECTION_PROFILE_new_null()))
+		{
+		SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES, SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES);
+		return 1;
+		}
+    
+	do
+		{
+		col=strchr(ptr,':');
+
+		if(!find_profile_by_name(ptr,&p,
+					 col ? col-ptr : (int)strlen(ptr)))
+			{
+			sk_SRTP_PROTECTION_PROFILE_push(profiles,p);
+			}
+		else
+			{
+			SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES,SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE);
+			return 1;
+			}
+
+		if(col) ptr=col+1;
+		} while (col);
+
+	*out=profiles;
+    
+	return 0;
+	}
+    
+int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx,const char *profiles)
+	{
+	return ssl_ctx_make_profiles(profiles,&ctx->srtp_profiles);
+	}
+
+int SSL_set_tlsext_use_srtp(SSL *s,const char *profiles)
+	{
+	return ssl_ctx_make_profiles(profiles,&s->srtp_profiles);
+	}
+
+
+STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *s)
+	{
+	if(s != NULL)
+		{
+		if(s->srtp_profiles != NULL)
+			{
+			return s->srtp_profiles;
+			}
+		else if((s->ctx != NULL) &&
+			(s->ctx->srtp_profiles != NULL))
+			{
+			return s->ctx->srtp_profiles;
+			}
+		}
+
+	return NULL;
+	}
+
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s)
+	{
+	return s->srtp_profile;
+	}
+
+/* Note: this function returns 0 length if there are no 
+   profiles specified */
+int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen)
+	{
+	int ct=0;
+	int i;
+	STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0;
+	SRTP_PROTECTION_PROFILE *prof;
+    
+	clnt=SSL_get_srtp_profiles(s);    
+	ct=sk_SRTP_PROTECTION_PROFILE_num(clnt); /* -1 if clnt == 0 */
+
+	if(p)
+		{
+		if(ct==0)
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST);
+			return 1;
+			}
+
+		if((2 + ct*2 + 1) > maxlen)
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG);
+			return 1;
+			}
+
+                /* Add the length */
+                s2n(ct * 2, p);
+		for(i=0;i<ct;i++)
+			{
+			prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i);
+			s2n(prof->id,p);
+			}
+
+                /* Add an empty use_mki value */
+                *p++ = 0;
+		}
+
+	*len=2 + ct*2 + 1;
+    
+	return 0;
+	}
+
+
+int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al)
+	{
+	SRTP_PROTECTION_PROFILE *cprof,*sprof;
+	STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0,*srvr;
+        int ct;
+        int mki_len;
+	int i,j;
+	int id;
+	int ret;
+
+         /* Length value + the MKI length */
+        if(len < 3)
+		{            
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+                }
+
+        /* Pull off the length of the cipher suite list */
+        n2s(d, ct);
+        len -= 2;
+        
+        /* Check that it is even */
+	if(ct%2)
+		{
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+        
+        /* Check that lengths are consistent */
+	if(len < (ct + 1)) 
+		{
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+        
+	clnt=sk_SRTP_PROTECTION_PROFILE_new_null();
+
+	while(ct)
+		{
+		n2s(d,id);
+		ct-=2;
+                len-=2;
+
+		if(!find_profile_by_num(id,&cprof))
+			{
+			sk_SRTP_PROTECTION_PROFILE_push(clnt,cprof);
+			}
+		else
+			{
+			; /* Ignore */
+			}
+		}
+
+        /* Now extract the MKI value as a sanity check, but discard it for now */
+        mki_len = *d;
+        d++; len--;
+
+        if (mki_len != len)
+		{
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+	srvr=SSL_get_srtp_profiles(s);
+
+	/* Pick our most preferred profile. If no profiles have been
+	 configured then the outer loop doesn't run 
+	 (sk_SRTP_PROTECTION_PROFILE_num() = -1)
+	 and so we just return without doing anything */
+	for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(srvr);i++)
+		{
+		sprof=sk_SRTP_PROTECTION_PROFILE_value(srvr,i);
+
+		for(j=0;j<sk_SRTP_PROTECTION_PROFILE_num(clnt);j++)
+			{
+			cprof=sk_SRTP_PROTECTION_PROFILE_value(clnt,j);
+            
+			if(cprof->id==sprof->id)
+				{
+				s->srtp_profile=sprof;
+				*al=0;
+				ret=0;
+				goto done;
+				}
+			}
+		}
+
+	ret=0;
+    
+done:
+	if(clnt) sk_SRTP_PROTECTION_PROFILE_free(clnt);
+
+	return ret;
+	}
+
+int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen)
+	{
+	if(p)
+		{
+		if(maxlen < 5)
+			{
+			SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG);
+			return 1;
+			}
+
+		if(s->srtp_profile==0)
+			{
+			SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_USE_SRTP_NOT_NEGOTIATED);
+			return 1;
+			}
+                s2n(2, p);
+		s2n(s->srtp_profile->id,p);
+                *p++ = 0;
+		}
+	*len=5;
+    
+	return 0;
+	}
+    
+
+int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al)
+	{
+	unsigned id;
+	int i;
+        int ct;
+
+	STACK_OF(SRTP_PROTECTION_PROFILE) *clnt;
+	SRTP_PROTECTION_PROFILE *prof;
+
+	if(len!=5)
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+        n2s(d, ct);
+	if(ct!=2)
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+	n2s(d,id);
+        if (*d)  /* Must be no MKI, since we never offer one */
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE);
+		*al=SSL_AD_ILLEGAL_PARAMETER;
+		return 1;
+		}
+
+	clnt=SSL_get_srtp_profiles(s);
+
+	/* Throw an error if the server gave us an unsolicited extension */
+	if (clnt == NULL)
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_NO_SRTP_PROFILES);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+    
+	/* Check to see if the server gave us something we support
+	   (and presumably offered)
+	*/
+	for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(clnt);i++)
+		{
+		prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i);
+	    
+		if(prof->id == id)
+			{
+			s->srtp_profile=prof;
+			*al=0;
+			return 0;
+			}
+		}
+
+	SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+	*al=SSL_AD_DECODE_ERROR;
+	return 1;
+	}
+
+
+#endif

diff --git a/ssl/d1_srvr.c b/ssl/d1_srvr.c
index 17cc022..5822379 100644
--- a/ssl/d1_srvr.c
+++ b/ssl/d1_srvr.c

@@ -151,6 +151,10 @@
 	int ret= -1;
 	int new_state,state,skip=0;
 	int listen;
+#ifndef OPENSSL_NO_SCTP
+	unsigned char sctpauthkey[64];
+	char labelbuffer[sizeof(DTLS1_SCTP_AUTH_LABEL)];
+#endif
 
 	RAND_add(&Time,sizeof(Time),0);
 	ERR_clear_error();
@@ -168,6 +172,13 @@
 	if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s);
 
 	s->d1->listen = listen;
+#ifndef OPENSSL_NO_SCTP
+	/* Notify SCTP BIO socket to enter handshake
+	 * mode and prevent stream identifier other
+	 * than 0. Will be ignored if no SCTP is used.
+	 */
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
 
 	if (s->cert == NULL)
 		{
@@ -175,6 +186,19 @@
 		return(-1);
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		dtls1_stop_timer(s);
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -182,7 +206,7 @@
 		switch (s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			/* s->state=SSL_ST_ACCEPT; */
 
 		case SSL_ST_BEFORE:
@@ -227,8 +251,12 @@
 				{
 				/* Ok, we now need to push on a buffering BIO so that
 				 * the output is sent in a way that TCP likes :-)
+				 * ...but not with SCTP :-)
 				 */
-				if (!ssl_init_wbio_buffer(s,1)) { ret= -1; goto end; }
+#ifndef OPENSSL_NO_SCTP
+				if (!BIO_dgram_is_sctp(SSL_get_wbio(s)))
+#endif
+					if (!ssl_init_wbio_buffer(s,1)) { ret= -1; goto end; }
 
 				ssl3_init_finished_mac(s);
 				s->state=SSL3_ST_SR_CLNT_HELLO_A;
@@ -313,25 +341,75 @@
 				ssl3_init_finished_mac(s);
 			break;
 			
+#ifndef OPENSSL_NO_SCTP
+		case DTLS1_SCTP_ST_SR_READ_SOCK:
+			
+			if (BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))		
+				{
+				s->s3->in_read_app_data=2;
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				ret = -1;
+				goto end;
+				}
+			
+			s->state=SSL3_ST_SR_FINISHED_A;
+			break;
+			
+		case DTLS1_SCTP_ST_SW_WRITE_SOCK:
+			ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s));
+			if (ret < 0) goto end;
+			
+			if (ret == 0)
+				{
+				if (s->d1->next_state != SSL_ST_OK)
+					{
+					s->s3->in_read_app_data=2;
+					s->rwstate=SSL_READING;
+					BIO_clear_retry_flags(SSL_get_rbio(s));
+					BIO_set_retry_read(SSL_get_rbio(s));
+					ret = -1;
+					goto end;
+					}
+				}
+
+			s->state=s->d1->next_state;
+			break;
+#endif
+
 		case SSL3_ST_SW_SRVR_HELLO_A:
 		case SSL3_ST_SW_SRVR_HELLO_B:
-			s->new_session = 2;
+			s->renegotiate = 2;
 			dtls1_start_timer(s);
 			ret=dtls1_send_server_hello(s);
 			if (ret <= 0) goto end;
 
-#ifndef OPENSSL_NO_TLSEXT
 			if (s->hit)
 				{
+#ifndef OPENSSL_NO_SCTP
+				/* Add new shared key for SCTP-Auth,
+				 * will be ignored if no SCTP used.
+				 */
+				snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+				         DTLS1_SCTP_AUTH_LABEL);
+
+				SSL_export_keying_material(s, sctpauthkey,
+				                           sizeof(sctpauthkey), labelbuffer,
+				                           sizeof(labelbuffer), NULL, 0, 0);
+				
+				BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+                         sizeof(sctpauthkey), sctpauthkey);
+#endif
+#ifndef OPENSSL_NO_TLSEXT
 				if (s->tlsext_ticket_expected)
 					s->state=SSL3_ST_SW_SESSION_TICKET_A;
 				else
 					s->state=SSL3_ST_SW_CHANGE_A;
-				}
 #else
-			if (s->hit)
-					s->state=SSL3_ST_SW_CHANGE_A;
+				s->state=SSL3_ST_SW_CHANGE_A;
 #endif
+				}
 			else
 				s->state=SSL3_ST_SW_CERT_A;
 			s->init_num=0;
@@ -441,6 +519,13 @@
 				skip=1;
 				s->s3->tmp.cert_request=0;
 				s->state=SSL3_ST_SW_SRVR_DONE_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = SSL3_ST_SW_SRVR_DONE_A;
+					s->state = DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
 				}
 			else
 				{
@@ -450,9 +535,23 @@
 				if (ret <= 0) goto end;
 #ifndef NETSCAPE_HANG_BUG
 				s->state=SSL3_ST_SW_SRVR_DONE_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = SSL3_ST_SW_SRVR_DONE_A;
+					s->state = DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
 #else
 				s->state=SSL3_ST_SW_FLUSH;
 				s->s3->tmp.next_state=SSL3_ST_SR_CERT_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = s->s3->tmp.next_state;
+					s->s3->tmp.next_state=DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
 #endif
 				s->init_num=0;
 				}
@@ -472,6 +571,13 @@
 			s->rwstate=SSL_WRITING;
 			if (BIO_flush(s->wbio) <= 0)
 				{
+				/* If the write error was fatal, stop trying */
+				if (!BIO_should_retry(s->wbio))
+					{
+					s->rwstate=SSL_NOTHING;
+					s->state=s->s3->tmp.next_state;
+					}
+				
 				ret= -1;
 				goto end;
 				}
@@ -504,6 +610,21 @@
 		case SSL3_ST_SR_KEY_EXCH_B:
 			ret=ssl3_get_client_key_exchange(s);
 			if (ret <= 0) goto end;
+#ifndef OPENSSL_NO_SCTP
+			/* Add new shared key for SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			snprintf((char *) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+			         DTLS1_SCTP_AUTH_LABEL);
+
+			SSL_export_keying_material(s, sctpauthkey,
+			                           sizeof(sctpauthkey), labelbuffer,
+			                           sizeof(labelbuffer), NULL, 0, 0);
+
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+			         sizeof(sctpauthkey), sctpauthkey);
+#endif
+
 			s->state=SSL3_ST_SR_CERT_VRFY_A;
 			s->init_num=0;
 
@@ -540,8 +661,13 @@
 			/* we should decide if we expected this one */
 			ret=ssl3_get_cert_verify(s);
 			if (ret <= 0) goto end;
-
-			s->state=SSL3_ST_SR_FINISHED_A;
+#ifndef OPENSSL_NO_SCTP
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+			    state == SSL_ST_RENEGOTIATE)
+				s->state=DTLS1_SCTP_ST_SR_READ_SOCK;
+			else
+#endif			
+				s->state=SSL3_ST_SR_FINISHED_A;
 			s->init_num=0;
 			break;
 
@@ -593,6 +719,14 @@
 				SSL3_ST_SW_CHANGE_A,SSL3_ST_SW_CHANGE_B);
 
 			if (ret <= 0) goto end;
+
+#ifndef OPENSSL_NO_SCTP
+			/* Change to new shared key of SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY, 0, NULL);
+#endif
+
 			s->state=SSL3_ST_SW_FINISHED_A;
 			s->init_num=0;
 
@@ -617,7 +751,16 @@
 			if (s->hit)
 				s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A;
 			else
+				{
 				s->s3->tmp.next_state=SSL_ST_OK;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = s->s3->tmp.next_state;
+					s->s3->tmp.next_state=DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
+				}
 			s->init_num=0;
 			break;
 
@@ -635,11 +778,9 @@
 
 			s->init_num=0;
 
-			if (s->new_session == 2) /* skipped if we just sent a HelloRequest */
+			if (s->renegotiate == 2) /* skipped if we just sent a HelloRequest */
 				{
-				/* actually not necessarily a 'new' session unless
-				 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */
-				
+				s->renegotiate=0;
 				s->new_session=0;
 				
 				ssl_update_cache(s,SSL_SESS_CACHE_SERVER);
@@ -691,6 +832,14 @@
 	/* BIO_flush(s->wbio); */
 
 	s->in_handshake--;
+#ifndef OPENSSL_NO_SCTP
+		/* Notify SCTP BIO socket to leave handshake
+		 * mode and prevent stream identifier other
+		 * than 0. Will be ignored if no SCTP is used.
+		 */
+		BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
+
 	if (cb != NULL)
 		cb(s,SSL_CB_ACCEPT_EXIT,ret);
 	return(ret);
@@ -1146,7 +1295,7 @@
 		if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
 			&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK))
 			{
-			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher))
+			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher, NULL))
 				== NULL)
 				{
 				al=SSL_AD_DECODE_ERROR;

diff --git a/ssl/dtls1.h b/ssl/dtls1.h
index 2900d1d..5008bf6 100644
--- a/ssl/dtls1.h
+++ b/ssl/dtls1.h

@@ -105,6 +105,11 @@
 #define DTLS1_AL_HEADER_LENGTH                   2
 #endif
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_AUTH_LABEL	"EXPORTER_DTLS_OVER_SCTP"
+#endif
 
 typedef struct dtls1_bitmap_st
 	{
@@ -227,7 +232,7 @@
 
 	struct dtls1_timeout_st timeout;
 
-	/* Indicates when the last handshake msg sent will timeout */
+	/* Indicates when the last handshake msg or heartbeat sent will timeout */
 	struct timeval next_timeout;
 
 	/* Timeout duration */
@@ -243,6 +248,13 @@
 	unsigned int retransmitting;
 	unsigned int change_cipher_spec_ok;
 
+#ifndef OPENSSL_NO_SCTP
+	/* used when SSL_ST_XX_FLUSH is entered */
+	int next_state;
+
+	int shutdown_received;
+#endif
+
 	} DTLS1_STATE;
 
 typedef struct dtls1_record_data_st
@@ -251,8 +263,12 @@
 	unsigned int   packet_length;
 	SSL3_BUFFER    rbuf;
 	SSL3_RECORD    rrec;
+#ifndef OPENSSL_NO_SCTP
+	struct bio_dgram_sctp_rcvinfo recordinfo;
+#endif
 	} DTLS1_RECORD_DATA;
 
+#endif
 
 /* Timeout multipliers (timeout slice is defined in apps/timeouts.h */
 #define DTLS1_TMO_READ_COUNT                      2

diff --git a/ssl/kssl.c b/ssl/kssl.c
index b820e37..fd7c67b 100644
--- a/ssl/kssl.c
+++ b/ssl/kssl.c

@@ -2194,6 +2194,22 @@
 	return ENOMEM;
 	}
 
+void SSL_set0_kssl_ctx(SSL *s, KSSL_CTX *kctx)
+	{
+	s->kssl_ctx = kctx;
+	} 
+
+KSSL_CTX * SSL_get0_kssl_ctx(SSL *s)
+	{
+	return s->kssl_ctx;
+	}
+
+char *kssl_ctx_get0_client_princ(KSSL_CTX *kctx)
+	{
+	if (kctx)
+		return kctx->client_princ;
+	return NULL;
+	}
 
 #else /* !OPENSSL_NO_KRB5 */
 

diff --git a/ssl/kssl.h b/ssl/kssl.h
index a3d20e1..8242fd5 100644
--- a/ssl/kssl.h
+++ b/ssl/kssl.h

@@ -172,6 +172,10 @@
 			            krb5_timestamp *atimep, KSSL_ERR *kssl_err);
 unsigned char	*kssl_skip_confound(krb5_enctype enctype, unsigned char *authn);
 
+void SSL_set0_kssl_ctx(SSL *s, KSSL_CTX *kctx);
+KSSL_CTX * SSL_get0_kssl_ctx(SSL *s);
+char *kssl_ctx_get0_client_princ(KSSL_CTX *kctx);
+
 #ifdef  __cplusplus
 }
 #endif

diff --git a/ssl/s23_clnt.c b/ssl/s23_clnt.c
index f41fe3a..6a75843 100644
--- a/ssl/s23_clnt.c
+++ b/ssl/s23_clnt.c

@@ -129,6 +129,10 @@
 		return(SSLv3_client_method());
 	else if (ver == TLS1_VERSION)
 		return(TLSv1_client_method());
+	else if (ver == TLS1_1_VERSION)
+		return(TLSv1_1_client_method());
+	else if (ver == TLS1_2_VERSION)
+		return(TLSv1_2_client_method());
 	else
 		return(NULL);
 	}
@@ -284,7 +288,15 @@
 	if (ssl2_compat && ssl23_no_ssl2_ciphers(s))
 		ssl2_compat = 0;
 
-	if (!(s->options & SSL_OP_NO_TLSv1))
+	if (!(s->options & SSL_OP_NO_TLSv1_2))
+		{
+		version = TLS1_2_VERSION;
+		}
+	else if (!(s->options & SSL_OP_NO_TLSv1_1))
+		{
+		version = TLS1_1_VERSION;
+		}
+	else if (!(s->options & SSL_OP_NO_TLSv1))
 		{
 		version = TLS1_VERSION;
 		}
@@ -329,11 +341,29 @@
 		if (RAND_pseudo_bytes(p,SSL3_RANDOM_SIZE-4) <= 0)
 			return -1;
 
-		if (version == TLS1_VERSION)
+		if (version == TLS1_2_VERSION)
+			{
+			version_major = TLS1_2_VERSION_MAJOR;
+			version_minor = TLS1_2_VERSION_MINOR;
+			}
+		else if (version == TLS1_1_VERSION)
+			{
+			version_major = TLS1_1_VERSION_MAJOR;
+			version_minor = TLS1_1_VERSION_MINOR;
+			}
+		else if (version == TLS1_VERSION)
 			{
 			version_major = TLS1_VERSION_MAJOR;
 			version_minor = TLS1_VERSION_MINOR;
 			}
+#ifdef OPENSSL_FIPS
+		else if(FIPS_mode())
+			{
+			SSLerr(SSL_F_SSL23_CLIENT_HELLO,
+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+			return -1;
+			}
+#endif
 		else if (version == SSL3_VERSION)
 			{
 			version_major = SSL3_VERSION_MAJOR;
@@ -608,7 +638,7 @@
 #endif
 		}
 	else if (p[1] == SSL3_VERSION_MAJOR &&
-	         (p[2] == SSL3_VERSION_MINOR || p[2] == TLS1_VERSION_MINOR) &&
+	         p[2] <= TLS1_2_VERSION_MINOR &&
 	         ((p[0] == SSL3_RT_HANDSHAKE && p[5] == SSL3_MT_SERVER_HELLO) ||
 	          (p[0] == SSL3_RT_ALERT && p[3] == 0 && p[4] == 2)))
 		{
@@ -617,6 +647,14 @@
 		if ((p[2] == SSL3_VERSION_MINOR) &&
 			!(s->options & SSL_OP_NO_SSLv3))
 			{
+#ifdef OPENSSL_FIPS
+			if(FIPS_mode())
+				{
+				SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,
+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+				goto err;
+				}
+#endif
 			s->version=SSL3_VERSION;
 			s->method=SSLv3_client_method();
 			}
@@ -626,6 +664,18 @@
 			s->version=TLS1_VERSION;
 			s->method=TLSv1_client_method();
 			}
+		else if ((p[2] == TLS1_1_VERSION_MINOR) &&
+			!(s->options & SSL_OP_NO_TLSv1_1))
+			{
+			s->version=TLS1_1_VERSION;
+			s->method=TLSv1_1_client_method();
+			}
+		else if ((p[2] == TLS1_2_VERSION_MINOR) &&
+			!(s->options & SSL_OP_NO_TLSv1_2))
+			{
+			s->version=TLS1_2_VERSION;
+			s->method=TLSv1_2_client_method();
+			}
 		else
 			{
 			SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,SSL_R_UNSUPPORTED_PROTOCOL);

diff --git a/ssl/s23_meth.c b/ssl/s23_meth.c
index c6099ef..40eae0f 100644
--- a/ssl/s23_meth.c
+++ b/ssl/s23_meth.c

@@ -76,6 +76,10 @@
 #ifndef OPENSSL_NO_TLS1
 	if (ver == TLS1_VERSION)
 		return(TLSv1_method());
+	else if (ver == TLS1_1_VERSION)
+		return(TLSv1_1_method());
+	else if (ver == TLS1_2_VERSION)
+		return(TLSv1_2_method());
 	else
 #endif
 		return(NULL);

diff --git a/ssl/s23_srvr.c b/ssl/s23_srvr.c
index e22879c..4c4721f 100644
--- a/ssl/s23_srvr.c
+++ b/ssl/s23_srvr.c

@@ -115,6 +115,9 @@
 #include <openssl/rand.h>
 #include <openssl/objects.h>
 #include <openssl/evp.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 static const SSL_METHOD *ssl23_get_server_method(int ver);
 int ssl23_get_client_hello(SSL *s);
@@ -128,6 +131,10 @@
 		return(SSLv3_server_method());
 	else if (ver == TLS1_VERSION)
 		return(TLSv1_server_method());
+	else if (ver == TLS1_1_VERSION)
+		return(TLSv1_1_server_method());
+	else if (ver == TLS1_2_VERSION)
+		return(TLSv1_2_server_method());
 	else
 		return(NULL);
 	}
@@ -283,7 +290,20 @@
 				/* SSLv3/TLSv1 */
 				if (p[4] >= TLS1_VERSION_MINOR)
 					{
-					if (!(s->options & SSL_OP_NO_TLSv1))
+					if (p[4] >= TLS1_2_VERSION_MINOR &&
+					   !(s->options & SSL_OP_NO_TLSv1_2))
+						{
+						s->version=TLS1_2_VERSION;
+						s->state=SSL23_ST_SR_CLNT_HELLO_B;
+						}
+					else if (p[4] >= TLS1_1_VERSION_MINOR &&
+					   !(s->options & SSL_OP_NO_TLSv1_1))
+						{
+						s->version=TLS1_1_VERSION;
+						/* type=2; */ /* done later to survive restarts */
+						s->state=SSL23_ST_SR_CLNT_HELLO_B;
+						}
+					else if (!(s->options & SSL_OP_NO_TLSv1))
 						{
 						s->version=TLS1_VERSION;
 						/* type=2; */ /* done later to survive restarts */
@@ -350,7 +370,19 @@
 				v[1]=p[10]; /* minor version according to client_version */
 			if (v[1] >= TLS1_VERSION_MINOR)
 				{
-				if (!(s->options & SSL_OP_NO_TLSv1))
+				if (v[1] >= TLS1_2_VERSION_MINOR &&
+					!(s->options & SSL_OP_NO_TLSv1_2))
+					{
+					s->version=TLS1_2_VERSION;
+					type=3;
+					}
+				else if (v[1] >= TLS1_1_VERSION_MINOR &&
+					!(s->options & SSL_OP_NO_TLSv1_1))
+					{
+					s->version=TLS1_1_VERSION;
+					type=3;
+					}
+				else if (!(s->options & SSL_OP_NO_TLSv1))
 					{
 					s->version=TLS1_VERSION;
 					type=3;
@@ -393,6 +425,15 @@
 			}
 		}
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && (s->version < TLS1_VERSION))
+		{
+		SSLerr(SSL_F_SSL23_GET_CLIENT_HELLO,
+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+		goto err;
+		}
+#endif
+
 	if (s->state == SSL23_ST_SR_CLNT_HELLO_B)
 		{
 		/* we have SSLv3/TLSv1 in an SSLv2 header
@@ -572,8 +613,11 @@
 			s->s3->rbuf.left=0;
 			s->s3->rbuf.offset=0;
 			}
-
-		if (s->version == TLS1_VERSION)
+		if (s->version == TLS1_2_VERSION)
+			s->method = TLSv1_2_server_method();
+		else if (s->version == TLS1_1_VERSION)
+			s->method = TLSv1_1_server_method();
+		else if (s->version == TLS1_VERSION)
 			s->method = TLSv1_server_method();
 		else
 			s->method = SSLv3_server_method();

diff --git a/ssl/s3_both.c b/ssl/s3_both.c
index 508e390..af271d6 100644
--- a/ssl/s3_both.c
+++ b/ssl/s3_both.c

@@ -204,8 +204,7 @@
 
 #ifndef OPENSSL_NO_NEXTPROTONEG
 /* ssl3_take_mac calculates the Finished MAC for the handshakes messages seen to far. */
-static void ssl3_take_mac(SSL *s)
-	{
+static void ssl3_take_mac(SSL *s) {
 	const char *sender;
 	int slen;
 
@@ -222,7 +221,7 @@
 
 	s->s3->tmp.peer_finish_md_len = s->method->ssl3_enc->final_finish_mac(s,
 		sender,slen,s->s3->tmp.peer_finish_md);
-	}
+}
 #endif
 
 int ssl3_get_finished(SSL *s, int a, int b)
@@ -232,9 +231,8 @@
 	unsigned char *p;
 
 #ifdef OPENSSL_NO_NEXTPROTONEG
-	/* the mac has already been generated when we received the
-	 * change cipher spec message and is in s->s3->tmp.peer_finish_md
-	 */ 
+	/* the mac has already been generated when we received the change
+	 * cipher spec message and is in s->s3->tmp.peer_finish_md. */
 #endif
 
 	n=s->method->ssl_get_message(s,
@@ -546,14 +544,12 @@
 		s->init_num += i;
 		n -= i;
 		}
-
 #ifndef OPENSSL_NO_NEXTPROTONEG
 	/* If receiving Finished, record MAC of prior handshake messages for
 	 * Finished verification. */
 	if (*s->init_buf->data == SSL3_MT_FINISHED)
 		ssl3_take_mac(s);
 #endif
-
 	/* Feed this message into MAC computation. */
 	ssl3_finish_mac(s, (unsigned char *)s->init_buf->data, s->init_num + 4);
 	if (s->msg_callback)

diff --git a/ssl/s3_clnt.c b/ssl/s3_clnt.c
index 04d6e5b..b5b7f11 100644
--- a/ssl/s3_clnt.c
+++ b/ssl/s3_clnt.c

@@ -156,6 +156,9 @@
 #include <openssl/objects.h>
 #include <openssl/evp.h>
 #include <openssl/md5.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 #ifndef OPENSSL_NO_DH
 #include <openssl/dh.h>
 #endif
@@ -212,6 +215,18 @@
 		}
 #endif
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -219,7 +234,7 @@
 		switch(s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			s->state=SSL_ST_CONNECT;
 			s->ctx->stats.sess_connect_renegotiate++;
 			/* break */
@@ -292,7 +307,16 @@
 			if (ret <= 0) goto end;
 
 			if (s->hit)
+				{
 				s->state=SSL3_ST_CR_FINISHED_A;
+#ifndef OPENSSL_NO_TLSEXT
+				if (s->tlsext_ticket_expected)
+					{
+					/* receive renewed session ticket */
+					s->state=SSL3_ST_CR_SESSION_TICKET_A;
+					}
+#endif
+				}
 			else
 				s->state=SSL3_ST_CR_CERT_A;
 			s->init_num=0;
@@ -370,6 +394,17 @@
 		case SSL3_ST_CR_SRVR_DONE_B:
 			ret=ssl3_get_server_done(s);
 			if (ret <= 0) goto end;
+#ifndef OPENSSL_NO_SRP
+			if (s->s3->tmp.new_cipher->algorithm_mkey & SSL_kSRP)
+				{
+				if ((ret = SRP_Calc_A_param(s))<=0)
+					{
+					SSLerr(SSL_F_SSL3_CONNECT,SSL_R_SRP_A_CALC);
+					ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_INTERNAL_ERROR);
+					goto end;
+					}
+				}
+#endif
 			if (s->s3->tmp.cert_req)
 				s->state=SSL3_ST_CW_CERT_A;
 			else
@@ -436,15 +471,15 @@
 				SSL3_ST_CW_CHANGE_A,SSL3_ST_CW_CHANGE_B);
 			if (ret <= 0) goto end;
 
+
 #if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
 			s->state=SSL3_ST_CW_FINISHED_A;
 #else
-			if (s->next_proto_negotiated)
+			if (s->s3->next_proto_neg_seen)
 				s->state=SSL3_ST_CW_NEXT_PROTO_A;
 			else
 				s->state=SSL3_ST_CW_FINISHED_A;
 #endif
-
 			s->init_num=0;
 
 			s->session->cipher=s->s3->tmp.new_cipher;
@@ -611,6 +646,7 @@
 			/* else do it later in ssl3_write */
 
 			s->init_num=0;
+			s->renegotiate=0;
 			s->new_session=0;
 
 			ssl_update_cache(s,SSL_SESS_CACHE_CLIENT);
@@ -706,9 +742,43 @@
 		/* Do the message type and length last */
 		d=p= &(buf[4]);
 
+		/* version indicates the negotiated version: for example from
+		 * an SSLv2/v3 compatible client hello). The client_version
+		 * field is the maximum version we permit and it is also
+		 * used in RSA encrypted premaster secrets. Some servers can
+		 * choke if we initially report a higher version then
+		 * renegotiate to a lower one in the premaster secret. This
+		 * didn't happen with TLS 1.0 as most servers supported it
+		 * but it can with TLS 1.1 or later if the server only supports
+		 * 1.0.
+		 *
+		 * Possible scenario with previous logic:
+		 * 	1. Client hello indicates TLS 1.2
+		 * 	2. Server hello says TLS 1.0
+		 *	3. RSA encrypted premaster secret uses 1.2.
+		 * 	4. Handhaked proceeds using TLS 1.0.
+		 *	5. Server sends hello request to renegotiate.
+		 *	6. Client hello indicates TLS v1.0 as we now
+		 *	   know that is maximum server supports.
+		 *	7. Server chokes on RSA encrypted premaster secret
+		 *	   containing version 1.0.
+		 *
+		 * For interoperability it should be OK to always use the
+		 * maximum version we support in client hello and then rely
+		 * on the checking of version to ensure the servers isn't
+		 * being inconsistent: for example initially negotiating with
+		 * TLS 1.0 and renegotiating with TLS 1.2. We do this by using
+		 * client_version in client hello and not resetting it to
+		 * the negotiated version.
+		 */
+#if 0
 		*(p++)=s->version>>8;
 		*(p++)=s->version&0xff;
 		s->client_version=s->version;
+#else
+		*(p++)=s->client_version>>8;
+		*(p++)=s->client_version&0xff;
+#endif
 
 		/* Random stuff */
 		memcpy(p,s->s3->client_random,SSL3_RANDOM_SIZE);
@@ -924,6 +994,14 @@
 		SSLerr(SSL_F_SSL3_GET_SERVER_HELLO,SSL_R_UNKNOWN_CIPHER_RETURNED);
 		goto f_err;
 		}
+	/* TLS v1.2 only ciphersuites require v1.2 or later */
+	if ((c->algorithm_ssl & SSL_TLSV1_2) && 
+		(TLS1_get_version(s) < TLS1_2_VERSION))
+		{
+		al=SSL_AD_ILLEGAL_PARAMETER;
+		SSLerr(SSL_F_SSL3_GET_SERVER_HELLO,SSL_R_WRONG_CIPHER_RETURNED);
+		goto f_err;
+		}
 	p+=ssl_put_cipher_by_char(s,NULL,NULL);
 
 	sk=ssl_get_ciphers_by_id(s);
@@ -955,9 +1033,11 @@
 			}
 		}
 	s->s3->tmp.new_cipher=c;
-	if (!ssl3_digest_cached_records(s))
+	/* Don't digest cached records if TLS v1.2: we may need them for
+	 * client authentication.
+	 */
+	if (TLS1_get_version(s) < TLS1_2_VERSION && !ssl3_digest_cached_records(s))
 		goto f_err;
-
 	/* lets get the compression algorithm */
 	/* COMPRESSION */
 #ifdef OPENSSL_NO_COMP
@@ -1236,6 +1316,7 @@
 	int al,i,j,param_len,ok;
 	long n,alg_k,alg_a;
 	EVP_PKEY *pkey=NULL;
+	const EVP_MD *md = NULL;
 #ifndef OPENSSL_NO_RSA
 	RSA *rsa=NULL;
 #endif
@@ -1359,6 +1440,86 @@
 		}
 	else
 #endif /* !OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	if (alg_k & SSL_kSRP)
+		{
+		n2s(p,i);
+		param_len=i+2;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_N_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.N=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+
+		n2s(p,i);
+		param_len+=i+2;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_G_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.g=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+
+		i = (unsigned int)(p[0]);
+		p++;
+		param_len+=i+1;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_S_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.s=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+
+		n2s(p,i);
+		param_len+=i+2;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_B_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.B=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+		n-=param_len;
+
+/* We must check if there is a certificate */
+#ifndef OPENSSL_NO_RSA
+		if (alg_a & SSL_aRSA)
+			pkey=X509_get_pubkey(s->session->sess_cert->peer_pkeys[SSL_PKEY_RSA_ENC].x509);
+#else
+		if (0)
+			;
+#endif
+#ifndef OPENSSL_NO_DSA
+		else if (alg_a & SSL_aDSS)
+			pkey=X509_get_pubkey(s->session->sess_cert->peer_pkeys[SSL_PKEY_DSA_SIGN].x509);
+#endif
+		}
+	else
+#endif /* !OPENSSL_NO_SRP */
 #ifndef OPENSSL_NO_RSA
 	if (alg_k & SSL_kRSA)
 		{
@@ -1606,6 +1767,38 @@
 	/* if it was signed, check the signature */
 	if (pkey != NULL)
 		{
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			int sigalg = tls12_get_sigid(pkey);
+			/* Should never happen */
+			if (sigalg == -1)
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			/* Check key type is consistent with signature */
+			if (sigalg != (int)p[1])
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_WRONG_SIGNATURE_TYPE);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+			md = tls12_get_hash(p[0]);
+			if (md == NULL)
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_UNKNOWN_DIGEST);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+#ifdef SSL_DEBUG
+fprintf(stderr, "USING TLSv1.2 HASH %s\n", EVP_MD_name(md));
+#endif
+			p += 2;
+			n -= 2;
+			}
+		else
+			md = EVP_sha1();
+			
 		n2s(p,i);
 		n-=2;
 		j=EVP_PKEY_size(pkey);
@@ -1619,7 +1812,7 @@
 			}
 
 #ifndef OPENSSL_NO_RSA
-		if (pkey->type == EVP_PKEY_RSA)
+		if (pkey->type == EVP_PKEY_RSA && TLS1_get_version(s) < TLS1_2_VERSION)
 			{
 			int num;
 
@@ -1627,6 +1820,8 @@
 			q=md_buf;
 			for (num=2; num > 0; num--)
 				{
+				EVP_MD_CTX_set_flags(&md_ctx,
+					EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
 				EVP_DigestInit_ex(&md_ctx,(num == 2)
 					?s->ctx->md5:s->ctx->sha1, NULL);
 				EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
@@ -1654,11 +1849,8 @@
 			}
 		else
 #endif
-#ifndef OPENSSL_NO_DSA
-			if (pkey->type == EVP_PKEY_DSA)
 			{
-			/* lets do DSS */
-			EVP_VerifyInit_ex(&md_ctx,EVP_dss1(), NULL);
+			EVP_VerifyInit_ex(&md_ctx, md, NULL);
 			EVP_VerifyUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
 			EVP_VerifyUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
 			EVP_VerifyUpdate(&md_ctx,param,param_len);
@@ -1670,30 +1862,6 @@
 				goto f_err;
 				}
 			}
-		else
-#endif
-#ifndef OPENSSL_NO_ECDSA
-			if (pkey->type == EVP_PKEY_EC)
-			{
-			/* let's do ECDSA */
-			EVP_VerifyInit_ex(&md_ctx,EVP_ecdsa(), NULL);
-			EVP_VerifyUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-			EVP_VerifyUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
-			EVP_VerifyUpdate(&md_ctx,param,param_len);
-			if (EVP_VerifyFinal(&md_ctx,p,(int)n,pkey) <= 0)
-				{
-				/* bad signature */
-				al=SSL_AD_DECRYPT_ERROR;
-				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SIGNATURE);
-				goto f_err;
-				}
-			}
-		else
-#endif
-			{
-			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
 		}
 	else
 		{
@@ -1740,7 +1908,7 @@
 	{
 	int ok,ret=0;
 	unsigned long n,nc,l;
-	unsigned int llen,ctype_num,i;
+	unsigned int llen, ctype_num,i;
 	X509_NAME *xn=NULL;
 	const unsigned char *p,*q;
 	unsigned char *d;
@@ -1760,6 +1928,14 @@
 	if (s->s3->tmp.message_type == SSL3_MT_SERVER_DONE)
 		{
 		s->s3->tmp.reuse_message=1;
+		/* If we get here we don't need any cached handshake records
+		 * as we wont be doing client auth.
+		 */
+		if (s->s3->handshake_buffer)
+			{
+			if (!ssl3_digest_cached_records(s))
+				goto err;
+			}
 		return(1);
 		}
 
@@ -1796,6 +1972,26 @@
 	for (i=0; i<ctype_num; i++)
 		s->s3->tmp.ctype[i]= p[i];
 	p+=ctype_num;
+	if (TLS1_get_version(s) >= TLS1_2_VERSION)
+		{
+		n2s(p, llen);
+		/* Check we have enough room for signature algorithms and
+		 * following length value.
+		 */
+		if ((unsigned long)(p - d + llen + 2) > n)
+			{
+			ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR);
+			SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_DATA_LENGTH_TOO_LONG);
+			goto err;
+			}
+		if ((llen & 1) || !tls1_process_sigalgs(s, p, llen))
+			{
+			ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR);
+			SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_SIGNATURE_ALGORITHMS_ERROR);
+			goto err;
+			}
+		p += llen;
+		}
 
 	/* get the CA RDNs */
 	n2s(p,llen);
@@ -1808,7 +2004,7 @@
 }
 #endif
 
-	if ((llen+ctype_num+2+1) != n)
+	if ((unsigned long)(p - d + llen) != n)
 		{
 		ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR);
 		SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_LENGTH_MISMATCH);
@@ -2630,6 +2826,39 @@
 			EVP_PKEY_free(pub_key);
 
 			}
+#ifndef OPENSSL_NO_SRP
+		else if (alg_k & SSL_kSRP)
+			{
+			if (s->srp_ctx.A != NULL)
+				{
+				/* send off the data */
+				n=BN_num_bytes(s->srp_ctx.A);
+				s2n(n,p);
+				BN_bn2bin(s->srp_ctx.A,p);
+				n+=2;
+				}
+			else
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			if (s->session->srp_username != NULL)
+				OPENSSL_free(s->session->srp_username);
+			s->session->srp_username = BUF_strdup(s->srp_ctx.login);
+			if (s->session->srp_username == NULL)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+					ERR_R_MALLOC_FAILURE);
+				goto err;
+				}
+
+			if ((s->session->master_key_length = SRP_generate_client_master_secret(s,s->session->master_key))<0)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			}
+#endif
 #ifndef OPENSSL_NO_PSK
 		else if (alg_k & SSL_kPSK)
 			{
@@ -2749,12 +2978,13 @@
 	unsigned char data[MD5_DIGEST_LENGTH+SHA_DIGEST_LENGTH];
 	EVP_PKEY *pkey;
 	EVP_PKEY_CTX *pctx=NULL;
-#ifndef OPENSSL_NO_RSA
+	EVP_MD_CTX mctx;
 	unsigned u=0;
-#endif
 	unsigned long n;
 	int j;
 
+	EVP_MD_CTX_init(&mctx);
+
 	if (s->state == SSL3_ST_CW_CERT_VRFY_A)
 		{
 		d=(unsigned char *)s->init_buf->data;
@@ -2765,7 +2995,8 @@
 		EVP_PKEY_sign_init(pctx);
 		if (EVP_PKEY_CTX_set_signature_md(pctx, EVP_sha1())>0)
 			{
-			s->method->ssl3_enc->cert_verify_mac(s,
+			if (TLS1_get_version(s) < TLS1_2_VERSION)
+				s->method->ssl3_enc->cert_verify_mac(s,
 						NID_sha1,
 						&(data[MD5_DIGEST_LENGTH]));
 			}
@@ -2773,6 +3004,41 @@
 			{
 			ERR_clear_error();
 			}
+		/* For TLS v1.2 send signature algorithm and signature
+		 * using agreed digest and cached handshake records.
+		 */
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			long hdatalen = 0;
+			void *hdata;
+			const EVP_MD *md = s->cert->key->digest;
+			hdatalen = BIO_get_mem_data(s->s3->handshake_buffer,
+								&hdata);
+			if (hdatalen <= 0 || !tls12_get_sigandhash(p, pkey, md))
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_VERIFY,
+						ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			p += 2;
+#ifdef SSL_DEBUG
+			fprintf(stderr, "Using TLS 1.2 with client alg %s\n",
+							EVP_MD_name(md));
+#endif
+			if (!EVP_SignInit_ex(&mctx, md, NULL)
+				|| !EVP_SignUpdate(&mctx, hdata, hdatalen)
+				|| !EVP_SignFinal(&mctx, p + 2, &u, pkey))
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_VERIFY,
+						ERR_R_EVP_LIB);
+				goto err;
+				}
+			s2n(u,p);
+			n = u + 4;
+			if (!ssl3_digest_cached_records(s))
+				goto err;
+			}
+		else
 #ifndef OPENSSL_NO_RSA
 		if (pkey->type == EVP_PKEY_RSA)
 			{
@@ -2855,9 +3121,11 @@
 		s->init_num=(int)n+4;
 		s->init_off=0;
 		}
+	EVP_MD_CTX_cleanup(&mctx);
 	EVP_PKEY_CTX_free(pctx);
 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
 err:
+	EVP_MD_CTX_cleanup(&mctx);
 	EVP_PKEY_CTX_free(pctx);
 	return(-1);
 	}
@@ -2981,7 +3249,7 @@
 	if (idx == SSL_PKEY_ECC)
 		{
 		if (ssl_check_srvr_ecc_cert_and_alg(sc->peer_pkeys[idx].x509,
-		    s->s3->tmp.new_cipher) == 0) 
+		    						s) == 0) 
 			{ /* check failed */
 			SSLerr(SSL_F_SSL3_CHECK_CERT_AND_ALGORITHM,SSL_R_BAD_ECC_CERT);
 			goto f_err;
@@ -3077,13 +3345,7 @@
 	return(0);
 	}
 
-/* Check to see if handshake is full or resumed. Usually this is just a
- * case of checking to see if a cache hit has occurred. In the case of
- * session tickets we have to check the next message to be sure.
- */
-
-#ifndef OPENSSL_NO_TLSEXT
-# ifndef OPENSSL_NO_NEXTPROTONEG
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
 int ssl3_send_next_proto(SSL *s)
 	{
 	unsigned int len, padding_len;
@@ -3106,9 +3368,15 @@
 		}
 
 	return ssl3_do_write(s, SSL3_RT_HANDSHAKE);
-	}
-# endif
+}
+#endif  /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
 
+/* Check to see if handshake is full or resumed. Usually this is just a
+ * case of checking to see if a cache hit has occurred. In the case of
+ * session tickets we have to check the next message to be sure.
+ */
+
+#ifndef OPENSSL_NO_TLSEXT
 int ssl3_check_finished(SSL *s)
 	{
 	int ok;

diff --git a/ssl/s3_enc.c b/ssl/s3_enc.c
index b145970..c5df2cb 100644
--- a/ssl/s3_enc.c
+++ b/ssl/s3_enc.c

@@ -170,6 +170,7 @@
 #endif
 	k=0;
 	EVP_MD_CTX_init(&m5);
+	EVP_MD_CTX_set_flags(&m5, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
 	EVP_MD_CTX_init(&s1);
 	for (i=0; (int)i<num; i+=MD5_DIGEST_LENGTH)
 		{
@@ -571,12 +572,12 @@
 	OPENSSL_free(s->s3->handshake_dgst);
 	s->s3->handshake_dgst=NULL;
 	}	
-		
+
 
 
 void ssl3_finish_mac(SSL *s, const unsigned char *buf, int len)
 	{
-	if (s->s3->handshake_buffer) 
+	if (s->s3->handshake_buffer && !(s->s3->flags & TLS1_FLAGS_KEEP_HANDSHAKE)) 
 		{
 		BIO_write (s->s3->handshake_buffer,(void *)buf,len);
 		} 
@@ -613,9 +614,16 @@
 	/* Loop through bitso of algorithm2 field and create MD_CTX-es */
 	for (i=0;ssl_get_handshake_digest(i,&mask,&md); i++) 
 		{
-		if ((mask & s->s3->tmp.new_cipher->algorithm2) && md) 
+		if ((mask & ssl_get_algorithm2(s)) && md) 
 			{
 			s->s3->handshake_dgst[i]=EVP_MD_CTX_create();
+#ifdef OPENSSL_FIPS
+			if (EVP_MD_nid(md) == NID_md5)
+				{
+				EVP_MD_CTX_set_flags(s->s3->handshake_dgst[i],
+						EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+				}
+#endif
 			EVP_DigestInit_ex(s->s3->handshake_dgst[i],md,NULL);
 			EVP_DigestUpdate(s->s3->handshake_dgst[i],hdata,hdatalen);
 			} 
@@ -624,9 +632,12 @@
 			s->s3->handshake_dgst[i]=NULL;
 			}
 		}
-	/* Free handshake_buffer BIO */
-	BIO_free(s->s3->handshake_buffer);
-	s->s3->handshake_buffer = NULL;
+	if (!(s->s3->flags & TLS1_FLAGS_KEEP_HANDSHAKE))
+		{
+		/* Free handshake_buffer BIO */
+		BIO_free(s->s3->handshake_buffer);
+		s->s3->handshake_buffer = NULL;
+		}
 
 	return 1;
 	}
@@ -672,6 +683,7 @@
 		return 0;
 	}	
 	EVP_MD_CTX_init(&ctx);
+	EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
 	EVP_MD_CTX_copy_ex(&ctx,d);
 	n=EVP_MD_CTX_size(&ctx);
 	if (n < 0)

diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c
index 2d651ec..a2ea25a 100644
--- a/ssl/s3_lib.c
+++ b/ssl/s3_lib.c

@@ -1071,6 +1071,103 @@
 	256,
 	},
 
+	/* TLS v1.2 ciphersuites */
+	/* Cipher 3B */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_NULL_SHA256,
+	TLS1_CK_RSA_WITH_NULL_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_eNULL,
+	SSL_SHA256,
+	SSL_SSLV3,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	0,
+	0,
+	},
+
+	/* Cipher 3C */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_RSA_WITH_AES_128_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 3D */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_256_SHA256,
+	TLS1_CK_RSA_WITH_AES_256_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 3E */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_DSS_WITH_AES_128_SHA256,
+	TLS1_CK_DH_DSS_WITH_AES_128_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 3F */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_DH_RSA_WITH_AES_128_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 40 */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_128_SHA256,
+	TLS1_CK_DHE_DSS_WITH_AES_128_SHA256,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
 #ifndef OPENSSL_NO_CAMELLIA
 	/* Camellia ciphersuites from RFC4132 (128-bit portion) */
 
@@ -1287,6 +1384,122 @@
 	128,
 	},
 #endif
+
+	/* TLS v1.2 ciphersuites */
+	/* Cipher 67 */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_DHE_RSA_WITH_AES_128_SHA256,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 68 */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_DSS_WITH_AES_256_SHA256,
+	TLS1_CK_DH_DSS_WITH_AES_256_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 69 */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_RSA_WITH_AES_256_SHA256,
+	TLS1_CK_DH_RSA_WITH_AES_256_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 6A */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_256_SHA256,
+	TLS1_CK_DHE_DSS_WITH_AES_256_SHA256,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 6B */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_256_SHA256,
+	TLS1_CK_DHE_RSA_WITH_AES_256_SHA256,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 6C */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_128_SHA256,
+	TLS1_CK_ADH_WITH_AES_128_SHA256,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 6D */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_256_SHA256,
+	TLS1_CK_ADH_WITH_AES_256_SHA256,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* GOST Ciphersuites */
+
 	{
 	1,
 	"GOST94-GOST89-GOST89",
@@ -1610,6 +1823,200 @@
 
 #endif /* OPENSSL_NO_SEED */
 
+	/* GCM ciphersuites from RFC5288 */
+
+	/* Cipher 9C */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher 9D */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher 9E */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher 9F */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A0 */
+	{
+	0,
+	TLS1_TXT_DH_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DH_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A1 */
+	{
+	0,
+	TLS1_TXT_DH_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DH_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A2 */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DHE_DSS_WITH_AES_128_GCM_SHA256,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A3 */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DHE_DSS_WITH_AES_256_GCM_SHA384,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A4 */
+	{
+	0,
+	TLS1_TXT_DH_DSS_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DH_DSS_WITH_AES_128_GCM_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A5 */
+	{
+	0,
+	TLS1_TXT_DH_DSS_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DH_DSS_WITH_AES_256_GCM_SHA384,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A6 */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ADH_WITH_AES_128_GCM_SHA256,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A7 */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ADH_WITH_AES_256_GCM_SHA384,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
 #ifndef OPENSSL_NO_ECDH
 	/* Cipher C001 */
 	{
@@ -1621,7 +2028,7 @@
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1653,7 +2060,7 @@
 	SSL_3DES,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	168,
 	168,
@@ -1669,7 +2076,7 @@
 	SSL_AES128,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	128,
 	128,
@@ -1685,7 +2092,7 @@
 	SSL_AES256,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	256,
 	256,
@@ -1701,7 +2108,7 @@
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1733,7 +2140,7 @@
 	SSL_3DES,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	168,
 	168,
@@ -1749,7 +2156,7 @@
 	SSL_AES128,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	128,
 	128,
@@ -1765,7 +2172,7 @@
 	SSL_AES256,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	256,
 	256,
@@ -1781,7 +2188,7 @@
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1813,7 +2220,7 @@
 	SSL_3DES,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	168,
 	168,
@@ -1829,7 +2236,7 @@
 	SSL_AES128,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	128,
 	128,
@@ -1845,7 +2252,7 @@
 	SSL_AES256,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	256,
 	256,
@@ -1861,7 +2268,7 @@
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1893,7 +2300,7 @@
 	SSL_3DES,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	168,
 	168,
@@ -1909,7 +2316,7 @@
 	SSL_AES128,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	128,
 	128,
@@ -1925,7 +2332,7 @@
 	SSL_AES256,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	256,
 	256,
@@ -1941,7 +2348,7 @@
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1973,7 +2380,7 @@
 	SSL_3DES,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	168,
 	168,
@@ -1989,7 +2396,7 @@
 	SSL_AES128,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	128,
 	128,
@@ -2005,13 +2412,423 @@
 	SSL_AES256,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	256,
 	256,
 	},
 #endif	/* OPENSSL_NO_ECDH */
 
+#ifndef OPENSSL_NO_SRP
+	/* Cipher C01A */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_WITH_3DES_EDE_CBC_SHA,
+	TLS1_CK_SRP_SHA_WITH_3DES_EDE_CBC_SHA,
+	SSL_kSRP,
+	SSL_aNULL,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C01B */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA,
+	TLS1_CK_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA,
+	SSL_kSRP,
+	SSL_aRSA,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C01C */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA,
+	TLS1_CK_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA,
+	SSL_kSRP,
+	SSL_aDSS,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C01D */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_WITH_AES_128_CBC_SHA,
+	TLS1_CK_SRP_SHA_WITH_AES_128_CBC_SHA,
+	SSL_kSRP,
+	SSL_aNULL,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C01E */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_RSA_WITH_AES_128_CBC_SHA,
+	TLS1_CK_SRP_SHA_RSA_WITH_AES_128_CBC_SHA,
+	SSL_kSRP,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C01F */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_DSS_WITH_AES_128_CBC_SHA,
+	TLS1_CK_SRP_SHA_DSS_WITH_AES_128_CBC_SHA,
+	SSL_kSRP,
+	SSL_aDSS,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C020 */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_WITH_AES_256_CBC_SHA,
+	TLS1_CK_SRP_SHA_WITH_AES_256_CBC_SHA,
+	SSL_kSRP,
+	SSL_aNULL,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher C021 */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_RSA_WITH_AES_256_CBC_SHA,
+	TLS1_CK_SRP_SHA_RSA_WITH_AES_256_CBC_SHA,
+	SSL_kSRP,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher C022 */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_DSS_WITH_AES_256_CBC_SHA,
+	TLS1_CK_SRP_SHA_DSS_WITH_AES_256_CBC_SHA,
+	SSL_kSRP,
+	SSL_aDSS,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+#endif  /* OPENSSL_NO_SRP */
+#ifndef OPENSSL_NO_ECDH
+
+	/* HMAC based TLS v1.2 ciphersuites from RFC5289 */
+
+	/* Cipher C023 */
+	{
+	1,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_128_SHA256,
+	SSL_kEECDH,
+	SSL_aECDSA,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C024 */
+	{
+	1,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_256_SHA384,
+	SSL_kEECDH,
+	SSL_aECDSA,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C025 */
+	{
+	1,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_128_SHA256,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C026 */
+	{
+	1,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_256_SHA384,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C027 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDHE_RSA_WITH_AES_128_SHA256,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C028 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDHE_RSA_WITH_AES_256_SHA384,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C029 */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDH_RSA_WITH_AES_128_SHA256,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C02A */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDH_RSA_WITH_AES_256_SHA384,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* GCM based TLS v1.2 ciphersuites from RFC5289 */
+
+	/* Cipher C02B */
+	{
+	1,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
+	SSL_kEECDH,
+	SSL_aECDSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C02C */
+	{
+	1,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
+	SSL_kEECDH,
+	SSL_aECDSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C02D */
+	{
+	1,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_128_GCM_SHA256,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C02E */
+	{
+	1,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_256_GCM_SHA384,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C02F */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C030 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C031 */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C032 */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+#endif /* OPENSSL_NO_ECDH */
+
+
 #ifdef TEMP_GOST_TLS
 /* Cipher FF00 */
 	{
@@ -2087,6 +2904,9 @@
 	SSL3_MD_CLIENT_FINISHED_CONST,4,
 	SSL3_MD_SERVER_FINISHED_CONST,4,
 	ssl3_alert_code,
+	(int (*)(SSL *, unsigned char *, size_t, const char *,
+		 size_t, const unsigned char *, size_t,
+		 int use_context))ssl_undefined_function,
 	};
 
 long ssl3_default_timeout(void)
@@ -2128,6 +2948,9 @@
 
 	s->s3=s3;
 
+#ifndef OPENSSL_NO_SRP
+	SSL_SRP_CTX_init(s);
+#endif
 	s->method->ssl_clear(s);
 	return(1);
 err:
@@ -2168,6 +2991,9 @@
 		BIO_free(s->s3->handshake_buffer);
 	}
 	if (s->s3->handshake_dgst) ssl3_free_digest_list(s);
+#ifndef OPENSSL_NO_SRP
+	SSL_SRP_CTX_free(s);
+#endif
 	OPENSSL_cleanse(s->s3,sizeof *s->s3);
 	OPENSSL_free(s->s3);
 	s->s3=NULL;
@@ -2250,6 +3076,13 @@
 #endif
 	}
 
+#ifndef OPENSSL_NO_SRP
+static char * MS_CALLBACK srp_password_from_info_cb(SSL *s, void *arg)
+	{
+	return BUF_strdup(s->srp_ctx.info) ;
+	}
+#endif
+
 long ssl3_ctrl(SSL *s, int cmd, long larg, void *parg)
 	{
 	int ret=0;
@@ -2495,6 +3328,27 @@
 		ret = 1;
 		break;
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	case SSL_CTRL_TLS_EXT_SEND_HEARTBEAT:
+		if (SSL_version(s) == DTLS1_VERSION || SSL_version(s) == DTLS1_BAD_VER)
+			ret = dtls1_heartbeat(s);
+		else
+			ret = tls1_heartbeat(s);
+		break;
+
+	case SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING:
+		ret = s->tlsext_hb_pending;
+		break;
+
+	case SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS:
+		if (larg)
+			s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_RECV_REQUESTS;
+		else
+			s->tlsext_heartbeat &= ~SSL_TLSEXT_HB_DONT_RECV_REQUESTS;
+		ret = 1;
+		break;
+#endif
+
 #endif /* !OPENSSL_NO_TLSEXT */
 	default:
 		break;
@@ -2727,6 +3581,38 @@
 		return 1;
 		break;
 
+#ifndef OPENSSL_NO_SRP
+	case SSL_CTRL_SET_TLS_EXT_SRP_USERNAME:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		if (ctx->srp_ctx.login != NULL)
+			OPENSSL_free(ctx->srp_ctx.login);
+		ctx->srp_ctx.login = NULL;
+		if (parg == NULL)
+			break;
+		if (strlen((const char *)parg) > 255 || strlen((const char *)parg) < 1)
+			{
+			SSLerr(SSL_F_SSL3_CTX_CTRL, SSL_R_INVALID_SRP_USERNAME);
+			return 0;
+			} 
+		if ((ctx->srp_ctx.login = BUF_strdup((char *)parg)) == NULL)
+			{
+			SSLerr(SSL_F_SSL3_CTX_CTRL, ERR_R_INTERNAL_ERROR);
+			return 0;
+			}
+		break;
+	case SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD:
+		ctx->srp_ctx.SRP_give_srp_client_pwd_callback=srp_password_from_info_cb;
+		ctx->srp_ctx.info=parg;
+		break;
+	case SSL_CTRL_SET_SRP_ARG:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.SRP_cb_arg=parg;
+		break;
+
+	case SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH:
+		ctx->srp_ctx.strength=larg;
+		break;
+#endif
 #endif /* !OPENSSL_NO_TLSEXT */
 
 	/* A Thawte special :-) */
@@ -2739,6 +3625,18 @@
 		sk_X509_push(ctx->extra_certs,(X509 *)parg);
 		break;
 
+	case SSL_CTRL_GET_EXTRA_CHAIN_CERTS:
+		*(STACK_OF(X509) **)parg =  ctx->extra_certs;
+		break;
+
+	case SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS:
+		if (ctx->extra_certs)
+			{
+			sk_X509_pop_free(ctx->extra_certs, X509_free);
+			ctx->extra_certs = NULL;
+			}
+		break;
+
 	default:
 		return(0);
 		}
@@ -2796,6 +3694,20 @@
 						HMAC_CTX *, int))fp;
 		break;
 
+#ifndef OPENSSL_NO_SRP
+	case SSL_CTRL_SET_SRP_VERIFY_PARAM_CB:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.SRP_verify_param_callback=(int (*)(SSL *,void *))fp;
+		break;
+	case SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.TLS_ext_srp_username_callback=(int (*)(SSL *,int *,void *))fp;
+		break;
+	case SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.SRP_give_srp_client_pwd_callback=(char *(*)(SSL *,void *))fp;
+		break;
+#endif
 #endif
 	default:
 		return(0);
@@ -2814,6 +3726,9 @@
 	id=0x03000000L|((unsigned long)p[0]<<8L)|(unsigned long)p[1];
 	c.id=id;
 	cp = OBJ_bsearch_ssl_cipher_id(&c, ssl3_ciphers, SSL3_NUM_CIPHERS);
+#ifdef DEBUG_PRINT_UNKNOWN_CIPHERSUITES
+if (cp == NULL) fprintf(stderr, "Unknown cipher ID %x\n", (p[0] << 8) | p[1]);
+#endif
 	if (cp == NULL || cp->valid == 0)
 		return NULL;
 	else
@@ -2891,11 +3806,20 @@
 		{
 		c=sk_SSL_CIPHER_value(prio,i);
 
+		/* Skip TLS v1.2 only ciphersuites if lower than v1.2 */
+		if ((c->algorithm_ssl & SSL_TLSV1_2) && 
+			(TLS1_get_version(s) < TLS1_2_VERSION))
+			continue;
+
 		ssl_set_cert_masks(cert,c);
 		mask_k = cert->mask_k;
 		mask_a = cert->mask_a;
 		emask_k = cert->export_mask_k;
 		emask_a = cert->export_mask_a;
+#ifndef OPENSSL_NO_SRP
+		mask_k=cert->mask_k | s->srp_ctx.srp_Mask;
+		emask_k=cert->export_mask_k | s->srp_ctx.srp_Mask;
+#endif
 			
 #ifdef KSSL_DEBUG
 /*		printf("ssl3_choose_cipher %d alg= %lx\n", i,c->algorithms);*/
@@ -3357,4 +4281,15 @@
 		}
 	return(ret);
 	}
-
+/* If we are using TLS v1.2 or later and default SHA1+MD5 algorithms switch
+ * to new SHA256 PRF and handshake macs
+ */
+long ssl_get_algorithm2(SSL *s)
+	{
+	long alg2 = s->s3->tmp.new_cipher->algorithm2;
+	if (TLS1_get_version(s) >= TLS1_2_VERSION &&
+	    alg2 == (SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF))
+		return SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256;
+	return alg2;
+	}
+		

diff --git a/ssl/s3_pkt.c b/ssl/s3_pkt.c
index 0d3874a..bd0571f 100644
--- a/ssl/s3_pkt.c
+++ b/ssl/s3_pkt.c

@@ -115,6 +115,7 @@
 #include "ssl_locl.h"
 #include <openssl/evp.h>
 #include <openssl/buffer.h>
+#include <openssl/rand.h>
 
 static int do_ssl3_write(SSL *s, int type, const unsigned char *buf,
 			 unsigned int len, int create_empty_fragment);
@@ -657,6 +658,7 @@
 	unsigned char *p,*plen;
 	int i,mac_size,clear=0;
 	int prefix_len=0;
+	int eivlen;
 	long align=0;
 	SSL3_RECORD *wr;
 	SSL3_BUFFER *wb=&(s->s3->wbuf);
@@ -778,9 +780,27 @@
 	/* field where we are to write out packet length */
 	plen=p; 
 	p+=2;
+	/* Explicit IV length, block ciphers and TLS version 1.1 or later */
+	if (s->enc_write_ctx && s->version >= TLS1_1_VERSION)
+		{
+		int mode = EVP_CIPHER_CTX_mode(s->enc_write_ctx);
+		if (mode == EVP_CIPH_CBC_MODE)
+			{
+			eivlen = EVP_CIPHER_CTX_iv_length(s->enc_write_ctx);
+			if (eivlen <= 1)
+				eivlen = 0;
+			}
+		/* Need explicit part of IV for GCM mode */
+		else if (mode == EVP_CIPH_GCM_MODE)
+			eivlen = EVP_GCM_TLS_EXPLICIT_IV_LEN;
+		else
+			eivlen = 0;
+		}
+	else 
+		eivlen = 0;
 
 	/* lets setup the record stuff. */
-	wr->data=p;
+	wr->data=p + eivlen;
 	wr->length=(int)len;
 	wr->input=(unsigned char *)buf;
 
@@ -808,11 +828,19 @@
 
 	if (mac_size != 0)
 		{
-		if (s->method->ssl3_enc->mac(s,&(p[wr->length]),1) < 0)
+		if (s->method->ssl3_enc->mac(s,&(p[wr->length + eivlen]),1) < 0)
 			goto err;
 		wr->length+=mac_size;
-		wr->input=p;
-		wr->data=p;
+		}
+
+	wr->input=p;
+	wr->data=p;
+
+	if (eivlen)
+		{
+	/*	if (RAND_pseudo_bytes(p, eivlen) <= 0)
+			goto err; */
+		wr->length += eivlen;
 		}
 
 	/* ssl3_enc can only have an error on read */
@@ -1081,6 +1109,19 @@
 			dest = s->s3->alert_fragment;
 			dest_len = &s->s3->alert_fragment_len;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (rr->type == TLS1_RT_HEARTBEAT)
+			{
+			tls1_process_heartbeat(s);
+
+			/* Exit and notify application to read again */
+			rr->length = 0;
+			s->rwstate=SSL_READING;
+			BIO_clear_retry_flags(SSL_get_rbio(s));
+			BIO_set_retry_read(SSL_get_rbio(s));
+			return(-1);
+			}
+#endif
 
 		if (dest_maxlen > 0)
 			{
@@ -1224,6 +1265,10 @@
 				SSLerr(SSL_F_SSL3_READ_BYTES,SSL_R_NO_RENEGOTIATION);
 				goto f_err;
 				}
+#ifdef SSL_AD_MISSING_SRP_USERNAME
+			if (alert_descr == SSL_AD_MISSING_SRP_USERNAME)
+				return(0);
+#endif
 			}
 		else if (alert_level == 2) /* fatal */
 			{
@@ -1302,6 +1347,7 @@
 #else
 			s->state = s->server ? SSL_ST_ACCEPT : SSL_ST_CONNECT;
 #endif
+			s->renegotiate=1;
 			s->new_session=1;
 			}
 		i=s->handshake_func(s);
@@ -1335,8 +1381,10 @@
 		{
 	default:
 #ifndef OPENSSL_NO_TLS
-		/* TLS just ignores unknown message types */
-		if (s->version == TLS1_VERSION)
+		/* TLS up to v1.1 just ignores unknown message types:
+		 * TLS v1.2 give an unexpected message alert.
+		 */
+		if (s->version >= TLS1_VERSION && s->version <= TLS1_1_VERSION)
 			{
 			rr->length = 0;
 			goto start;
@@ -1396,10 +1444,8 @@
 int ssl3_do_change_cipher_spec(SSL *s)
 	{
 	int i;
-#ifdef OPENSSL_NO_NEXTPROTONEG
 	const char *sender;
 	int slen;
-#endif
 
 	if (s->state & SSL_ST_ACCEPT)
 		i=SSL3_CHANGE_CIPHER_SERVER_READ;
@@ -1422,7 +1468,6 @@
 	if (!s->method->ssl3_enc->change_cipher_state(s,i))
 		return(0);
 
-#ifdef OPENSSL_NO_NEXTPROTONEG
 	/* we have to record the message digest at
 	 * this point so we can get it before we read
 	 * the finished message */
@@ -1439,7 +1484,6 @@
 
 	s->s3->tmp.peer_finish_md_len = s->method->ssl3_enc->final_finish_mac(s,
 		sender,slen,s->s3->tmp.peer_finish_md);
-#endif
 
 	return(1);
 	}

diff --git a/ssl/s3_srvr.c b/ssl/s3_srvr.c
index ec28588..0dbddde 100644
--- a/ssl/s3_srvr.c
+++ b/ssl/s3_srvr.c

@@ -179,6 +179,31 @@
 		return(NULL);
 	}
 
+#ifndef OPENSSL_NO_SRP
+static int ssl_check_srp_ext_ClientHello(SSL *s, int *al)
+	{
+	int ret = SSL_ERROR_NONE;
+
+	*al = SSL_AD_UNRECOGNIZED_NAME;
+
+	if ((s->s3->tmp.new_cipher->algorithm_mkey & SSL_kSRP) &&
+	    (s->srp_ctx.TLS_ext_srp_username_callback != NULL))
+		{
+		if(s->srp_ctx.login == NULL)
+			{
+			/* There isn't any srp login extension !!! */
+			ret = SSL3_AL_FATAL;
+			*al = SSL_AD_UNKNOWN_PSK_IDENTITY;
+			}
+		else
+			{
+			ret = SSL_srp_server_param_with_username(s,al);
+			}
+		}
+	return ret;
+	}
+#endif
+
 IMPLEMENT_ssl3_meth_func(SSLv3_server_method,
 			ssl3_accept,
 			ssl_undefined_function,
@@ -211,6 +236,18 @@
 		return(-1);
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -218,7 +255,7 @@
 		switch (s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			/* s->state=SSL_ST_ACCEPT; */
 
 		case SSL_ST_BEFORE:
@@ -314,10 +351,34 @@
 		case SSL3_ST_SR_CLNT_HELLO_C:
 
 			s->shutdown=0;
-			ret=ssl3_get_client_hello(s);
-			if (ret <= 0) goto end;
-			
-			s->new_session = 2;
+			if (s->rwstate != SSL_X509_LOOKUP)
+			{
+				ret=ssl3_get_client_hello(s);
+				if (ret <= 0) goto end;
+			}
+#ifndef OPENSSL_NO_SRP
+			{
+			int al;
+			if ((ret = ssl_check_srp_ext_ClientHello(s,&al))  < 0)
+					{
+					/* callback indicates firther work to be done */
+					s->rwstate=SSL_X509_LOOKUP;
+					goto end;
+					}
+			if (ret != SSL_ERROR_NONE)
+				{
+				ssl3_send_alert(s,SSL3_AL_FATAL,al);	
+				/* This is not really an error but the only means to
+                                   for a client to detect whether srp is supported. */
+ 				   if (al != TLS1_AD_UNKNOWN_PSK_IDENTITY) 	
+					SSLerr(SSL_F_SSL3_ACCEPT,SSL_R_CLIENTHELLO_TLSEXT);			
+				ret = SSL_TLSEXT_ERR_ALERT_FATAL;			
+				ret= -1;
+				goto end;	
+				}
+			}
+#endif		
+			s->renegotiate = 2;
 			s->state=SSL3_ST_SW_SRVR_HELLO_A;
 			s->init_num=0;
 			break;
@@ -346,7 +407,7 @@
 		case SSL3_ST_SW_CERT_A:
 		case SSL3_ST_SW_CERT_B:
 			/* Check if it is anon DH or anon ECDH, */
-			/* normal PSK or KRB5 */
+			/* normal PSK or KRB5 or SRP */
 			if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
 				&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK)
 				&& !(s->s3->tmp.new_cipher->algorithm_auth & SSL_aKRB5))
@@ -411,6 +472,10 @@
 #ifndef OPENSSL_NO_PSK
 			    || ((alg_k & SSL_kPSK) && s->ctx->psk_identity_hint)
 #endif
+#ifndef OPENSSL_NO_SRP
+			    /* SRP: send ServerKeyExchange */
+			    || (alg_k & SSL_kSRP)
+#endif
 			    || (alg_k & (SSL_kDHr|SSL_kDHd|SSL_kEDH))
 			    || (alg_k & SSL_kEECDH)
 			    || ((alg_k & SSL_kRSA)
@@ -457,6 +522,9 @@
 				skip=1;
 				s->s3->tmp.cert_request=0;
 				s->state=SSL3_ST_SW_SRVR_DONE_A;
+				if (s->s3->handshake_buffer)
+					if (!ssl3_digest_cached_records(s))
+						return -1;
 				}
 			else
 				{
@@ -549,6 +617,24 @@
 #endif
 				s->init_num = 0;
 				}
+			else if (TLS1_get_version(s) >= TLS1_2_VERSION)
+				{
+				s->state=SSL3_ST_SR_CERT_VRFY_A;
+				s->init_num=0;
+				if (!s->session->peer)
+					break;
+				/* For TLS v1.2 freeze the handshake buffer
+				 * at this point and digest cached records.
+				 */
+				if (!s->s3->handshake_buffer)
+					{
+					SSLerr(SSL_F_SSL3_ACCEPT,ERR_R_INTERNAL_ERROR);
+					return -1;
+					}
+				s->s3->flags |= TLS1_FLAGS_KEEP_HANDSHAKE;
+				if (!ssl3_digest_cached_records(s))
+					return -1;
+				}
 			else
 				{
 				int offset=0;
@@ -615,14 +701,11 @@
 			ret=ssl3_get_finished(s,SSL3_ST_SR_FINISHED_A,
 				SSL3_ST_SR_FINISHED_B);
 			if (ret <= 0) goto end;
-#ifndef OPENSSL_NO_TLSEXT
-			if (s->tlsext_ticket_expected)
-				s->state=SSL3_ST_SW_SESSION_TICKET_A;
-			else if (s->hit)
-				s->state=SSL_ST_OK;
-#else
 			if (s->hit)
 				s->state=SSL_ST_OK;
+#ifndef OPENSSL_NO_TLSEXT
+			else if (s->tlsext_ticket_expected)
+				s->state=SSL3_ST_SW_SESSION_TICKET_A;
 #endif
 			else
 				s->state=SSL3_ST_SW_CHANGE_A;
@@ -707,11 +790,9 @@
 
 			s->init_num=0;
 
-			if (s->new_session == 2) /* skipped if we just sent a HelloRequest */
+			if (s->renegotiate == 2) /* skipped if we just sent a HelloRequest */
 				{
-				/* actually not necessarily a 'new' session unless
-				 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */
-				
+				s->renegotiate=0;
 				s->new_session=0;
 				
 				ssl_update_cache(s,SSL_SESS_CACHE_SERVER);
@@ -849,7 +930,8 @@
 	 * If we are SSLv3, we will respond with SSLv3, even if prompted with
 	 * TLSv1.
 	 */
-	if (s->state == SSL3_ST_SR_CLNT_HELLO_A)
+	if (s->state == SSL3_ST_SR_CLNT_HELLO_A
+		)
 		{
 		s->state=SSL3_ST_SR_CLNT_HELLO_B;
 		}
@@ -906,13 +988,16 @@
 	j= *(p++);
 
 	s->hit=0;
-	/* Versions before 0.9.7 always allow session reuse during renegotiation
-	 * (i.e. when s->new_session is true), option
-	 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is new with 0.9.7.
-	 * Maybe this optional behaviour should always have been the default,
-	 * but we cannot safely change the default behaviour (or new applications
-	 * might be written that become totally unsecure when compiled with
-	 * an earlier library version)
+	/* Versions before 0.9.7 always allow clients to resume sessions in renegotiation.
+	 * 0.9.7 and later allow this by default, but optionally ignore resumption requests
+	 * with flag SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION (it's a new flag rather
+	 * than a change to default behavior so that applications relying on this for security
+	 * won't even compile against older library versions).
+	 *
+	 * 1.0.1 and later also have a function SSL_renegotiate_abbreviated() to request
+	 * renegotiation but not a new session (s->new_session remains unset): for servers,
+	 * this essentially just means that the SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION
+	 * setting will be ignored.
 	 */
 	if ((s->new_session && (s->options & SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION)))
 		{
@@ -1313,8 +1398,11 @@
 		s->s3->tmp.new_cipher=s->session->cipher;
 		}
 
-	if (!ssl3_digest_cached_records(s))
-		goto f_err;
+	if (TLS1_get_version(s) < TLS1_2_VERSION || !(s->verify_mode & SSL_VERIFY_PEER))
+		{
+		if (!ssl3_digest_cached_records(s))
+			goto f_err;
+		}
 	
 	/* we now have the following setup. 
 	 * client_random
@@ -1369,20 +1457,20 @@
 		memcpy(p,s->s3->server_random,SSL3_RANDOM_SIZE);
 		p+=SSL3_RANDOM_SIZE;
 
-		/* now in theory we have 3 options to sending back the
-		 * session id.  If it is a re-use, we send back the
-		 * old session-id, if it is a new session, we send
-		 * back the new session-id or we send back a 0 length
-		 * session-id if we want it to be single use.
-		 * Currently I will not implement the '0' length session-id
-		 * 12-Jan-98 - I'll now support the '0' length stuff.
-		 *
-		 * We also have an additional case where stateless session
-		 * resumption is successful: we always send back the old
-		 * session id. In this case s->hit is non zero: this can
-		 * only happen if stateless session resumption is succesful
-		 * if session caching is disabled so existing functionality
-		 * is unaffected.
+		/* There are several cases for the session ID to send
+		 * back in the server hello:
+		 * - For session reuse from the session cache,
+		 *   we send back the old session ID.
+		 * - If stateless session reuse (using a session ticket)
+		 *   is successful, we send back the client's "session ID"
+		 *   (which doesn't actually identify the session).
+		 * - If it is a new session, we send back the new
+		 *   session ID.
+		 * - However, if we want the new session to be single-use,
+		 *   we send back a 0-length session ID.
+		 * s->hit is non-zero in either case of session reuse,
+		 * so the following won't overwrite an ID that we're supposed
+		 * to send back.
 		 */
 		if (!(s->ctx->session_cache_mode & SSL_SESS_CACHE_SERVER)
 			&& !s->hit)
@@ -1483,6 +1571,7 @@
 	BN_CTX *bn_ctx = NULL; 
 #endif
 	EVP_PKEY *pkey;
+	const EVP_MD *md = NULL;
 	unsigned char *p,*d;
 	int al,i;
 	unsigned long type;
@@ -1723,21 +1812,44 @@
 				}
 			else
 #endif /* !OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+		if (type & SSL_kSRP)
+			{
+			if ((s->srp_ctx.N == NULL) ||
+				(s->srp_ctx.g == NULL) ||
+				(s->srp_ctx.s == NULL) ||
+				(s->srp_ctx.B == NULL))
+				{
+				SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,SSL_R_MISSING_SRP_PARAM);
+				goto err;
+				}
+			r[0]=s->srp_ctx.N;
+			r[1]=s->srp_ctx.g;
+			r[2]=s->srp_ctx.s;
+			r[3]=s->srp_ctx.B;
+			}
+		else 
+#endif
 			{
 			al=SSL_AD_HANDSHAKE_FAILURE;
 			SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE);
 			goto f_err;
 			}
-		for (i=0; r[i] != NULL; i++)
+		for (i=0; r[i] != NULL && i<4; i++)
 			{
 			nr[i]=BN_num_bytes(r[i]);
+#ifndef OPENSSL_NO_SRP
+			if ((i == 2) && (type & SSL_kSRP))
+				n+=1+nr[i];
+			else
+#endif
 			n+=2+nr[i];
 			}
 
 		if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
 			&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK))
 			{
-			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher))
+			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher,&md))
 				== NULL)
 				{
 				al=SSL_AD_DECODE_ERROR;
@@ -1759,8 +1871,16 @@
 		d=(unsigned char *)s->init_buf->data;
 		p= &(d[4]);
 
-		for (i=0; r[i] != NULL; i++)
+		for (i=0; r[i] != NULL && i<4; i++)
 			{
+#ifndef OPENSSL_NO_SRP
+			if ((i == 2) && (type & SSL_kSRP))
+				{
+				*p = nr[i];
+				p++;
+				}
+			else
+#endif
 			s2n(nr[i],p);
 			BN_bn2bin(r[i],p);
 			p+=nr[i];
@@ -1808,12 +1928,15 @@
 			/* n is the length of the params, they start at &(d[4])
 			 * and p points to the space at the end. */
 #ifndef OPENSSL_NO_RSA
-			if (pkey->type == EVP_PKEY_RSA)
+			if (pkey->type == EVP_PKEY_RSA
+					&& TLS1_get_version(s) < TLS1_2_VERSION)
 				{
 				q=md_buf;
 				j=0;
 				for (num=2; num > 0; num--)
 					{
+					EVP_MD_CTX_set_flags(&md_ctx,
+						EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
 					EVP_DigestInit_ex(&md_ctx,(num == 2)
 						?s->ctx->md5:s->ctx->sha1, NULL);
 					EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
@@ -1835,44 +1958,41 @@
 				}
 			else
 #endif
-#if !defined(OPENSSL_NO_DSA)
-				if (pkey->type == EVP_PKEY_DSA)
+			if (md)
 				{
-				/* lets do DSS */
-				EVP_SignInit_ex(&md_ctx,EVP_dss1(), NULL);
+				/* For TLS1.2 and later send signature
+				 * algorithm */
+				if (TLS1_get_version(s) >= TLS1_2_VERSION)
+					{
+					if (!tls12_get_sigandhash(p, pkey, md))
+						{
+						/* Should never happen */
+						al=SSL_AD_INTERNAL_ERROR;
+						SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+						goto f_err;
+						}
+					p+=2;
+					}
+#ifdef SSL_DEBUG
+				fprintf(stderr, "Using hash %s\n",
+							EVP_MD_name(md));
+#endif
+				EVP_SignInit_ex(&md_ctx, md, NULL);
 				EVP_SignUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
 				EVP_SignUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
 				EVP_SignUpdate(&md_ctx,&(d[4]),n);
 				if (!EVP_SignFinal(&md_ctx,&(p[2]),
 					(unsigned int *)&i,pkey))
 					{
-					SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_DSA);
+					SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_EVP);
 					goto err;
 					}
 				s2n(i,p);
 				n+=i+2;
+				if (TLS1_get_version(s) >= TLS1_2_VERSION)
+					n+= 2;
 				}
 			else
-#endif
-#if !defined(OPENSSL_NO_ECDSA)
-				if (pkey->type == EVP_PKEY_EC)
-				{
-				/* let's do ECDSA */
-				EVP_SignInit_ex(&md_ctx,EVP_ecdsa(), NULL);
-				EVP_SignUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-				EVP_SignUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
-				EVP_SignUpdate(&md_ctx,&(d[4]),n);
-				if (!EVP_SignFinal(&md_ctx,&(p[2]),
-					(unsigned int *)&i,pkey))
-					{
-					SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_ECDSA);
-					goto err;
-					}
-				s2n(i,p);
-				n+=i+2;
-				}
-			else
-#endif
 				{
 				/* Is this error check actually needed? */
 				al=SSL_AD_HANDSHAKE_FAILURE;
@@ -1925,6 +2045,14 @@
 		p+=n;
 		n++;
 
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			nl = tls12_get_req_sig_algs(s, p + 2);
+			s2n(nl, p);
+			p += nl + 2;
+			n += nl + 2;
+			}
+
 		off=n;
 		p+=2;
 		n+=2;
@@ -2644,6 +2772,44 @@
 			}
 		else
 #endif
+#ifndef OPENSSL_NO_SRP
+		if (alg_k & SSL_kSRP)
+			{
+			int param_len;
+
+			n2s(p,i);
+			param_len=i+2;
+			if (param_len > n)
+				{
+				al=SSL_AD_DECODE_ERROR;
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_BAD_SRP_A_LENGTH);
+				goto f_err;
+				}
+			if (!(s->srp_ctx.A=BN_bin2bn(p,i,NULL)))
+				{
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_BN_LIB);
+				goto err;
+				}
+			if (s->session->srp_username != NULL)
+				OPENSSL_free(s->session->srp_username);
+			s->session->srp_username = BUF_strdup(s->srp_ctx.login);
+			if (s->session->srp_username == NULL)
+				{
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+					ERR_R_MALLOC_FAILURE);
+				goto err;
+				}
+
+			if ((s->session->master_key_length = SRP_generate_server_master_secret(s,s->session->master_key))<0)
+				{
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+
+			p+=i;
+			}
+		else
+#endif	/* OPENSSL_NO_SRP */
 		if (alg_k & SSL_kGOST) 
 			{
 			int ret = 0;
@@ -2727,7 +2893,7 @@
 	return(1);
 f_err:
 	ssl3_send_alert(s,SSL3_AL_FATAL,al);
-#if !defined(OPENSSL_NO_DH) || !defined(OPENSSL_NO_RSA) || !defined(OPENSSL_NO_ECDH)
+#if !defined(OPENSSL_NO_DH) || !defined(OPENSSL_NO_RSA) || !defined(OPENSSL_NO_ECDH) || defined(OPENSSL_NO_SRP)
 err:
 #endif
 #ifndef OPENSSL_NO_ECDH
@@ -2748,12 +2914,15 @@
 	long n;
 	int type=0,i,j;
 	X509 *peer;
+	const EVP_MD *md = NULL;
+	EVP_MD_CTX mctx;
+	EVP_MD_CTX_init(&mctx);
 
 	n=s->method->ssl_get_message(s,
 		SSL3_ST_SR_CERT_VRFY_A,
 		SSL3_ST_SR_CERT_VRFY_B,
 		-1,
-		514, /* 514? */
+		516, /* Enough for 4096 bit RSA key with TLS v1.2 */
 		&ok);
 
 	if (!ok) return((int)n);
@@ -2816,6 +2985,36 @@
 		} 
 	else 
 		{	
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			int sigalg = tls12_get_sigid(pkey);
+			/* Should never happen */
+			if (sigalg == -1)
+				{
+				SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,ERR_R_INTERNAL_ERROR);
+				al=SSL_AD_INTERNAL_ERROR;
+				goto f_err;
+				}
+			/* Check key type is consistent with signature */
+			if (sigalg != (int)p[1])
+				{
+				SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_WRONG_SIGNATURE_TYPE);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+			md = tls12_get_hash(p[0]);
+			if (md == NULL)
+				{
+				SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_UNKNOWN_DIGEST);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+#ifdef SSL_DEBUG
+fprintf(stderr, "USING TLSv1.2 HASH %s\n", EVP_MD_name(md));
+#endif
+			p += 2;
+			n -= 2;
+			}
 		n2s(p,i);
 		n-=2;
 		if (i > n)
@@ -2833,6 +3032,37 @@
 		goto f_err;
 		}
 
+	if (TLS1_get_version(s) >= TLS1_2_VERSION)
+		{
+		long hdatalen = 0;
+		void *hdata;
+		hdatalen = BIO_get_mem_data(s->s3->handshake_buffer, &hdata);
+		if (hdatalen <= 0)
+			{
+			SSLerr(SSL_F_SSL3_GET_CERT_VERIFY, ERR_R_INTERNAL_ERROR);
+			al=SSL_AD_INTERNAL_ERROR;
+			goto f_err;
+			}
+#ifdef SSL_DEBUG
+		fprintf(stderr, "Using TLS 1.2 with client verify alg %s\n",
+							EVP_MD_name(md));
+#endif
+		if (!EVP_VerifyInit_ex(&mctx, md, NULL)
+			|| !EVP_VerifyUpdate(&mctx, hdata, hdatalen))
+			{
+			SSLerr(SSL_F_SSL3_GET_CERT_VERIFY, ERR_R_EVP_LIB);
+			al=SSL_AD_INTERNAL_ERROR;
+			goto f_err;
+			}
+
+		if (EVP_VerifyFinal(&mctx, p , i, pkey) <= 0)
+			{
+			al=SSL_AD_DECRYPT_ERROR;
+			SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_BAD_SIGNATURE);
+			goto f_err;
+			}
+		}
+	else
 #ifndef OPENSSL_NO_RSA 
 	if (pkey->type == EVP_PKEY_RSA)
 		{
@@ -2923,6 +3153,13 @@
 		ssl3_send_alert(s,SSL3_AL_FATAL,al);
 		}
 end:
+	if (s->s3->handshake_buffer)
+		{
+		BIO_free(s->s3->handshake_buffer);
+		s->s3->handshake_buffer = NULL;
+		s->s3->flags &= ~TLS1_FLAGS_KEEP_HANDSHAKE;
+		}
+	EVP_MD_CTX_cleanup(&mctx);
 	EVP_PKEY_free(pkey);
 	return(ret);
 	}
@@ -3035,6 +3272,12 @@
 			al=SSL_AD_HANDSHAKE_FAILURE;
 			goto f_err;
 			}
+		/* No client certificate so digest cached records */
+		if (s->s3->handshake_buffer && !ssl3_digest_cached_records(s))
+			{
+			al=SSL_AD_INTERNAL_ERROR;
+			goto f_err;
+			}
 		}
 	else
 		{
@@ -3111,13 +3354,17 @@
 	/* SSL3_ST_SW_CERT_B */
 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
 	}
+
 #ifndef OPENSSL_NO_TLSEXT
+/* send a new session ticket (not necessarily for a new session) */
 int ssl3_send_newsession_ticket(SSL *s)
 	{
 	if (s->state == SSL3_ST_SW_SESSION_TICKET_A)
 		{
 		unsigned char *p, *senc, *macstart;
-		int len, slen;
+		const unsigned char *const_p;
+		int len, slen_full, slen;
+		SSL_SESSION *sess;
 		unsigned int hlen;
 		EVP_CIPHER_CTX ctx;
 		HMAC_CTX hctx;
@@ -3126,12 +3373,38 @@
 		unsigned char key_name[16];
 
 		/* get session encoding length */
-		slen = i2d_SSL_SESSION(s->session, NULL);
+		slen_full = i2d_SSL_SESSION(s->session, NULL);
 		/* Some length values are 16 bits, so forget it if session is
  		 * too long
  		 */
-		if (slen > 0xFF00)
+		if (slen_full > 0xFF00)
 			return -1;
+		senc = OPENSSL_malloc(slen_full);
+		if (!senc)
+			return -1;
+		p = senc;
+		i2d_SSL_SESSION(s->session, &p);
+
+		/* create a fresh copy (not shared with other threads) to clean up */
+		const_p = senc;
+		sess = d2i_SSL_SESSION(NULL, &const_p, slen_full);
+		if (sess == NULL)
+			{
+			OPENSSL_free(senc);
+			return -1;
+			}
+		sess->session_id_length = 0; /* ID is irrelevant for the ticket */
+
+		slen = i2d_SSL_SESSION(sess, NULL);
+		if (slen > slen_full) /* shouldn't ever happen */
+			{
+			OPENSSL_free(senc);
+			return -1;
+			}
+		p = senc;
+		i2d_SSL_SESSION(sess, &p);
+		SSL_SESSION_free(sess);
+
 		/* Grow buffer if need be: the length calculation is as
  		 * follows 1 (size of message name) + 3 (message length
  		 * bytes) + 4 (ticket lifetime hint) + 2 (ticket length) +
@@ -3143,11 +3416,6 @@
 			26 + EVP_MAX_IV_LENGTH + EVP_MAX_BLOCK_LENGTH +
 			EVP_MAX_MD_SIZE + slen))
 			return -1;
-		senc = OPENSSL_malloc(slen);
-		if (!senc)
-			return -1;
-		p = senc;
-		i2d_SSL_SESSION(s->session, &p);
 
 		p=(unsigned char *)s->init_buf->data;
 		/* do the header */
@@ -3178,7 +3446,13 @@
 					tlsext_tick_md(), NULL);
 			memcpy(key_name, tctx->tlsext_tick_key_name, 16);
 			}
-		l2n(s->session->tlsext_tick_lifetime_hint, p);
+
+		/* Ticket lifetime hint (advisory only):
+		 * We leave this unspecified for resumed session (for simplicity),
+		 * and guess that tickets for new sessions will live as long
+		 * as their sessions. */
+		l2n(s->hit ? 0 : s->session->timeout, p);
+
 		/* Skip ticket length for now */
 		p += 2;
 		/* Output key name */
@@ -3254,13 +3528,13 @@
 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
 	}
 
-# ifndef OPENSSL_NO_NPN
+# ifndef OPENSSL_NO_NEXTPROTONEG
 /* ssl3_get_next_proto reads a Next Protocol Negotiation handshake message. It
  * sets the next_proto member in s if found */
 int ssl3_get_next_proto(SSL *s)
 	{
 	int ok;
-	unsigned proto_len, padding_len;
+	int proto_len, padding_len;
 	long n;
 	const unsigned char *p;
 

diff --git a/ssl/srtp.h b/ssl/srtp.h
new file mode 100644
index 0000000..c0cf33e
--- /dev/null
+++ b/ssl/srtp.h

@@ -0,0 +1,145 @@
+/* ssl/tls1.h */
+/* Copyright (C) 1995-1998 Eric Young ([email protected])
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young ([email protected]).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson ([email protected]).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young ([email protected])"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson ([email protected])"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+/*
+  DTLS code by Eric Rescorla <[email protected]>
+
+  Copyright (C) 2006, Network Resonance, Inc.
+  Copyright (C) 2011, RTFM, Inc.
+*/
+
+#ifndef HEADER_D1_SRTP_H
+#define HEADER_D1_SRTP_H
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+     
+#define SRTP_AES128_CM_SHA1_80 0x0001
+#define SRTP_AES128_CM_SHA1_32 0x0002
+#define SRTP_AES128_F8_SHA1_80 0x0003
+#define SRTP_AES128_F8_SHA1_32 0x0004
+#define SRTP_NULL_SHA1_80      0x0005
+#define SRTP_NULL_SHA1_32      0x0006
+
+int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles);
+int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles);
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s);
+
+STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl);
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
+

diff --git a/ssl/ssl.h b/ssl/ssl.h
index d88d22d..90d5537 100644
--- a/ssl/ssl.h
+++ b/ssl/ssl.h

@@ -252,6 +252,7 @@
 #define SSL_TXT_kEECDH		"kEECDH"
 #define SSL_TXT_kPSK            "kPSK"
 #define SSL_TXT_kGOST		"kGOST"
+#define SSL_TXT_kSRP		"kSRP"
 
 #define	SSL_TXT_aRSA		"aRSA"
 #define	SSL_TXT_aDSS		"aDSS"
@@ -275,6 +276,7 @@
 #define SSL_TXT_ECDSA		"ECDSA"
 #define SSL_TXT_KRB5      	"KRB5"
 #define SSL_TXT_PSK             "PSK"
+#define SSL_TXT_SRP		"SRP"
 
 #define SSL_TXT_DES		"DES"
 #define SSL_TXT_3DES		"3DES"
@@ -285,6 +287,7 @@
 #define SSL_TXT_AES128		"AES128"
 #define SSL_TXT_AES256		"AES256"
 #define SSL_TXT_AES		"AES"
+#define SSL_TXT_AES_GCM		"AESGCM"
 #define SSL_TXT_CAMELLIA128	"CAMELLIA128"
 #define SSL_TXT_CAMELLIA256	"CAMELLIA256"
 #define SSL_TXT_CAMELLIA	"CAMELLIA"
@@ -294,10 +297,14 @@
 #define SSL_TXT_SHA		"SHA" /* same as "SHA1" */
 #define SSL_TXT_GOST94		"GOST94" 
 #define SSL_TXT_GOST89MAC		"GOST89MAC" 
+#define SSL_TXT_SHA256		"SHA256"
+#define SSL_TXT_SHA384		"SHA384"
 
 #define SSL_TXT_SSLV2		"SSLv2"
 #define SSL_TXT_SSLV3		"SSLv3"
 #define SSL_TXT_TLSV1		"TLSv1"
+#define SSL_TXT_TLSV1_1		"TLSv1.1"
+#define SSL_TXT_TLSV1_2		"TLSv1.2"
 
 #define SSL_TXT_EXP		"EXP"
 #define SSL_TXT_EXPORT		"EXPORT"
@@ -356,9 +363,29 @@
  * in SSL_CTX. */
 typedef struct ssl_st *ssl_crock_st;
 typedef struct tls_session_ticket_ext_st TLS_SESSION_TICKET_EXT;
+typedef struct ssl_method_st SSL_METHOD;
+typedef struct ssl_cipher_st SSL_CIPHER;
+typedef struct ssl_session_st SSL_SESSION;
+
+DECLARE_STACK_OF(SSL_CIPHER)
+
+/* SRTP protection profiles for use with the use_srtp extension (RFC 5764)*/
+typedef struct srtp_protection_profile_st
+       {
+       const char *name;
+       unsigned long id;
+       } SRTP_PROTECTION_PROFILE;
+
+DECLARE_STACK_OF(SRTP_PROTECTION_PROFILE)
+
+typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg);
+typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg);
+
+
+#ifndef OPENSSL_NO_SSL_INTERN
 
 /* used to hold info on the particular ciphers used */
-typedef struct ssl_cipher_st
+struct ssl_cipher_st
 	{
 	int valid;
 	const char *name;		/* text name */
@@ -375,15 +402,11 @@
 	unsigned long algorithm2;	/* Extra flags */
 	int strength_bits;		/* Number of bits really used */
 	int alg_bits;			/* Number of bits for algorithm */
-	} SSL_CIPHER;
+	};
 
-DECLARE_STACK_OF(SSL_CIPHER)
-
-typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg);
-typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg);
 
 /* Used to hold functions for SSLv2 or SSLv3/TLSv1 functions */
-typedef struct ssl_method_st
+struct ssl_method_st
 	{
 	int version;
 	int (*ssl_new)(SSL *s);
@@ -416,7 +439,7 @@
 	int (*ssl_version)(void);
 	long (*ssl_callback_ctrl)(SSL *s, int cb_id, void (*fp)(void));
 	long (*ssl_ctx_callback_ctrl)(SSL_CTX *s, int cb_id, void (*fp)(void));
-	} SSL_METHOD;
+	};
 
 /* Lets make this into an ASN.1 type structure as follows
  * SSL_SESSION_ID ::= SEQUENCE {
@@ -433,14 +456,17 @@
  *	Session_ID_context [ 4 ] EXPLICIT OCTET STRING,   -- the Session ID context
  *	Verify_result [ 5 ] EXPLICIT INTEGER,   -- X509_V_... code for `Peer'
  *	HostName [ 6 ] EXPLICIT OCTET STRING,   -- optional HostName from servername TLS extension 
- *	ECPointFormatList [ 7 ] OCTET STRING,     -- optional EC point format list from TLS extension
- *	PSK_identity_hint [ 8 ] EXPLICIT OCTET STRING, -- optional PSK identity hint
- *	PSK_identity [ 9 ] EXPLICIT OCTET STRING -- optional PSK identity
+ *	PSK_identity_hint [ 7 ] EXPLICIT OCTET STRING, -- optional PSK identity hint
+ *	PSK_identity [ 8 ] EXPLICIT OCTET STRING,  -- optional PSK identity
+ *	Ticket_lifetime_hint [9] EXPLICIT INTEGER, -- server's lifetime hint for session ticket
+ *	Ticket [10]             EXPLICIT OCTET STRING, -- session ticket (clients only)
+ *	Compression_meth [11]   EXPLICIT OCTET STRING, -- optional compression method
+ *	SRP_username [ 12 ] EXPLICIT OCTET STRING -- optional SRP username
  *	}
  * Look in ssl/ssl_asn1.c for more details
  * I'm using EXPLICIT tags so I can read the damn things using asn1parse :-).
  */
-typedef struct ssl_session_st
+struct ssl_session_st
 	{
 	int ssl_version;	/* what ssl version session info is
 				 * being kept in here? */
@@ -512,8 +538,12 @@
 	size_t	tlsext_ticklen;		/* Session ticket length */	
 	long tlsext_tick_lifetime_hint;	/* Session lifetime hint in seconds */
 #endif
-	} SSL_SESSION;
+#ifndef OPENSSL_NO_SRP
+	char *srp_username;
+#endif
+	};
 
+#endif
 
 #define SSL_OP_MICROSOFT_SESS_ID_BUG			0x00000001L
 #define SSL_OP_NETSCAPE_CHALLENGE_BUG			0x00000002L
@@ -526,6 +556,7 @@
 #define SSL_OP_SSLEAY_080_CLIENT_DH_BUG			0x00000080L
 #define SSL_OP_TLS_D5_BUG				0x00000100L
 #define SSL_OP_TLS_BLOCK_PADDING_BUG			0x00000200L
+#define SSL_OP_NO_TLSv1_1				0x00000400L
 
 /* Disable SSL 3.0/TLS 1.0 CBC vulnerability workaround that was added
  * in OpenSSL 0.9.6d.  Usually (depending on the application protocol)
@@ -536,7 +567,7 @@
 
 /* SSL_OP_ALL: various bug workarounds that should be rather harmless.
  *             This used to be 0x000FFFFFL before 0.9.7. */
-#define SSL_OP_ALL					0x80000FFFL
+#define SSL_OP_ALL					0x80000BFFL
 
 /* DTLS options */
 #define SSL_OP_NO_QUERY_MTU                 0x00001000L
@@ -572,11 +603,16 @@
 #define SSL_OP_NO_SSLv2					0x01000000L
 #define SSL_OP_NO_SSLv3					0x02000000L
 #define SSL_OP_NO_TLSv1					0x04000000L
+#define SSL_OP_NO_TLSv1_2				0x08000000L
 
+/* These next two were never actually used for anything since SSLeay
+ * zap so we have some more flags.
+ */
 /* The next flag deliberately changes the ciphertest, this is a check
  * for the PKCS#1 attack */
-#define SSL_OP_PKCS1_CHECK_1				0x08000000L
-#define SSL_OP_PKCS1_CHECK_2				0x10000000L
+#define SSL_OP_PKCS1_CHECK_1				0x0
+#define SSL_OP_PKCS1_CHECK_2				0x0
+
 #define SSL_OP_NETSCAPE_CA_DN_BUG			0x20000000L
 #define SSL_OP_NETSCAPE_DEMO_CIPHER_CHANGE_BUG		0x40000000L
 /* Make server add server-hello extension from early version of
@@ -644,12 +680,53 @@
 #define SSL_get_secure_renegotiation_support(ssl) \
 	SSL_ctrl((ssl), SSL_CTRL_GET_RI_SUPPORT, 0, NULL)
 
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_heartbeat(ssl) \
+        SSL_ctrl((ssl),SSL_CTRL_TLS_EXT_SEND_HEARTBEAT,0,NULL)
+#endif
+
 void SSL_CTX_set_msg_callback(SSL_CTX *ctx, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg));
 void SSL_set_msg_callback(SSL *ssl, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg));
 #define SSL_CTX_set_msg_callback_arg(ctx, arg) SSL_CTX_ctrl((ctx), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg))
 #define SSL_set_msg_callback_arg(ssl, arg) SSL_ctrl((ssl), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg))
 
+#ifndef OPENSSL_NO_SRP
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
+typedef struct srp_ctx_st
+	{
+	/* param for all the callbacks */
+	void *SRP_cb_arg;
+	/* set client Hello login callback */
+	int (*TLS_ext_srp_username_callback)(SSL *, int *, void *);
+	/* set SRP N/g param callback for verification */
+	int (*SRP_verify_param_callback)(SSL *, void *);
+	/* set SRP client passwd callback */
+	char *(*SRP_give_srp_client_pwd_callback)(SSL *, void *);
+
+	char *login;
+	BIGNUM *N,*g,*s,*B,*A;
+	BIGNUM *a,*b,*v;
+	char *info;
+	int strength;
+
+	unsigned long srp_Mask;
+	} SRP_CTX;
+
+#endif
+
+/* see tls_srp.c */
+int SSL_SRP_CTX_init(SSL *s);
+int SSL_CTX_SRP_CTX_init(SSL_CTX *ctx);
+int SSL_SRP_CTX_free(SSL *ctx);
+int SSL_CTX_SRP_CTX_free(SSL_CTX *ctx);
+int SSL_srp_server_param_with_username(SSL *s, int *ad);
+int SRP_generate_server_master_secret(SSL *s,unsigned char *master_key);
+int SRP_Calc_A_param(SSL *s);
+int SRP_generate_client_master_secret(SSL *s,unsigned char *master_key);
+
+#endif
 
 #if defined(OPENSSL_SYS_MSDOS) && !defined(OPENSSL_SYS_WIN32)
 #define SSL_MAX_CERT_LIST_DEFAULT 1024*30 /* 30k max cert list :-) */
@@ -675,7 +752,11 @@
 typedef int (*GEN_SESSION_CB)(const SSL *ssl, unsigned char *id,
 				unsigned int *id_len);
 
-typedef struct ssl_comp_st
+typedef struct ssl_comp_st SSL_COMP;
+
+#ifndef OPENSSL_NO_SSL_INTERN
+
+struct ssl_comp_st
 	{
 	int id;
 	const char *name;
@@ -684,7 +765,7 @@
 #else
 	char *method;
 #endif
-	} SSL_COMP;
+	};
 
 DECLARE_STACK_OF(SSL_COMP)
 DECLARE_LHASH_OF(SSL_SESSION);
@@ -853,11 +934,31 @@
 	/* Callback for status request */
 	int (*tlsext_status_cb)(SSL *ssl, void *arg);
 	void *tlsext_status_arg;
-
 	/* draft-rescorla-tls-opaque-prf-input-00.txt information */
 	int (*tlsext_opaque_prf_input_callback)(SSL *, void *peerinput, size_t len, void *arg);
 	void *tlsext_opaque_prf_input_callback_arg;
+#endif
 
+#ifndef OPENSSL_NO_PSK
+	char *psk_identity_hint;
+	unsigned int (*psk_client_callback)(SSL *ssl, const char *hint, char *identity,
+		unsigned int max_identity_len, unsigned char *psk,
+		unsigned int max_psk_len);
+	unsigned int (*psk_server_callback)(SSL *ssl, const char *identity,
+		unsigned char *psk, unsigned int max_psk_len);
+#endif
+
+#ifndef OPENSSL_NO_BUF_FREELISTS
+#define SSL_MAX_BUF_FREELIST_LEN_DEFAULT 32
+	unsigned int freelist_max_len;
+	struct ssl3_buf_freelist_st *wbuf_freelist;
+	struct ssl3_buf_freelist_st *rbuf_freelist;
+#endif
+#ifndef OPENSSL_NO_SRP
+	SRP_CTX srp_ctx; /* ctx for SRP authentication */
+#endif
+
+#ifndef OPENSSL_NO_TLSEXT
 # ifndef OPENSSL_NO_NEXTPROTONEG
 	/* Next protocol negotiation information */
 	/* (for experimental NPN extension). */
@@ -876,25 +977,13 @@
 				    void *arg);
 	void *next_proto_select_cb_arg;
 # endif
-#endif
-
-#ifndef OPENSSL_NO_PSK
-	char *psk_identity_hint;
-	unsigned int (*psk_client_callback)(SSL *ssl, const char *hint, char *identity,
-		unsigned int max_identity_len, unsigned char *psk,
-		unsigned int max_psk_len);
-	unsigned int (*psk_server_callback)(SSL *ssl, const char *identity,
-		unsigned char *psk, unsigned int max_psk_len);
-#endif
-
-#ifndef OPENSSL_NO_BUF_FREELISTS
-#define SSL_MAX_BUF_FREELIST_LEN_DEFAULT 32
-	unsigned int freelist_max_len;
-	struct ssl3_buf_freelist_st *wbuf_freelist;
-	struct ssl3_buf_freelist_st *rbuf_freelist;
+        /* SRTP profiles we are willing to do from RFC 5764 */
+        STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles;  
 #endif
 	};
 
+#endif
+
 #define SSL_SESS_CACHE_OFF			0x0000
 #define SSL_SESS_CACHE_CLIENT			0x0001
 #define SSL_SESS_CACHE_SERVER			0x0002
@@ -952,24 +1041,26 @@
 					   int (*cb) (SSL *ssl,
 						      const unsigned char **out,
 						      unsigned int *outlen,
-						      void *arg), void *arg);
+						      void *arg),
+					   void *arg);
 void SSL_CTX_set_next_proto_select_cb(SSL_CTX *s,
-				      int (*cb) (SSL *ssl, unsigned char **out,
+				      int (*cb) (SSL *ssl,
+						 unsigned char **out,
 						 unsigned char *outlen,
 						 const unsigned char *in,
-						 unsigned int inlen, void *arg),
+						 unsigned int inlen,
+						 void *arg),
 				      void *arg);
 
 int SSL_select_next_proto(unsigned char **out, unsigned char *outlen,
 			  const unsigned char *in, unsigned int inlen,
 			  const unsigned char *client, unsigned int client_len);
-void SSL_get0_next_proto_negotiated(const SSL *s, const unsigned char **data,
-				    unsigned *len);
+void SSL_get0_next_proto_negotiated(const SSL *s,
+				    const unsigned char **data, unsigned *len);
 
 #define OPENSSL_NPN_UNSUPPORTED	0
 #define OPENSSL_NPN_NEGOTIATED	1
 #define OPENSSL_NPN_NO_OVERLAP	2
-
 #endif
 
 #ifndef OPENSSL_NO_PSK
@@ -1011,6 +1102,8 @@
 #define SSL_MAC_FLAG_READ_MAC_STREAM 1
 #define SSL_MAC_FLAG_WRITE_MAC_STREAM 2
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 struct ssl_st
 	{
 	/* protocol version
@@ -1055,9 +1148,7 @@
 
 	int server;	/* are we the server side? - mostly used by SSL_clear*/
 
-	int new_session;/* 1 if we are to use a new session.
-	                 * 2 if we are a server and are inside a handshake
-	                 *   (i.e. not just sending a HelloRequest)
+	int new_session;/* Generate a new session or reuse an old one.
 	                 * NB: For servers, the 'new' session may actually be a previously
 	                 * cached session or even the previous session unless
 	                 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */
@@ -1244,11 +1335,32 @@
 #endif
 
 #define session_ctx initial_ctx
+
+	STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles;  /* What we'll do */
+	SRTP_PROTECTION_PROFILE *srtp_profile;            /* What's been chosen */
+
+	unsigned int tlsext_heartbeat;  /* Is use of the Heartbeat extension negotiated?
+	                                   0: disabled
+	                                   1: enabled
+	                                   2: enabled, but not allowed to send Requests
+	                                 */
+	unsigned int tlsext_hb_pending; /* Indicates if a HeartbeatRequest is in flight */
+	unsigned int tlsext_hb_seq;     /* HeartbeatRequest sequence number */
 #else
 #define session_ctx ctx
 #endif /* OPENSSL_NO_TLSEXT */
+
+	int renegotiate;/* 1 if we are renegotiating.
+	                 * 2 if we are a server and are inside a handshake
+	                 * (i.e. not just sending a HelloRequest) */
+
+#ifndef OPENSSL_NO_SRP
+	SRP_CTX srp_ctx; /* ctx for SRP authentication */
+#endif
 	};
 
+#endif
+
 #ifdef __cplusplus
 }
 #endif
@@ -1258,6 +1370,7 @@
 #include <openssl/tls1.h> /* This is mostly sslv3 with a few tweaks */
 #include <openssl/dtls1.h> /* Datagram TLS */
 #include <openssl/ssl23.h>
+#include <openssl/srtp.h>  /* Support for the use_srtp extension */
 
 #ifdef  __cplusplus
 extern "C" {
@@ -1476,6 +1589,20 @@
 #define SSL_CTRL_SET_TLSEXT_STATUS_REQ_OCSP_RESP	71
 
 #define SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB	72
+
+#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB	75
+#define SSL_CTRL_SET_SRP_VERIFY_PARAM_CB		76
+#define SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB		77
+
+#define SSL_CTRL_SET_SRP_ARG		78
+#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME		79
+#define SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH		80
+#define SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD		81
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_CTRL_TLS_EXT_SEND_HEARTBEAT				85
+#define SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING		86
+#define SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS	87
+#endif
 #endif
 
 #define DTLS_CTRL_GET_TIMEOUT		73
@@ -1486,6 +1613,9 @@
 #define SSL_CTRL_CLEAR_OPTIONS			77
 #define SSL_CTRL_CLEAR_MODE			78
 
+#define SSL_CTRL_GET_EXTRA_CHAIN_CERTS		82
+#define SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS	83
+
 #define DTLSv1_get_timeout(ssl, arg) \
 	SSL_ctrl(ssl,DTLS_CTRL_GET_TIMEOUT,0, (void *)arg)
 #define DTLSv1_handle_timeout(ssl) \
@@ -1522,6 +1652,10 @@
 
 #define SSL_CTX_add_extra_chain_cert(ctx,x509) \
 	SSL_CTX_ctrl(ctx,SSL_CTRL_EXTRA_CHAIN_CERT,0,(char *)x509)
+#define SSL_CTX_get_extra_chain_certs(ctx,px509) \
+	SSL_CTX_ctrl(ctx,SSL_CTRL_GET_EXTRA_CHAIN_CERTS,0,px509)
+#define SSL_CTX_clear_extra_chain_certs(ctx) \
+	SSL_CTX_ctrl(ctx,SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS,0,NULL)
 
 #ifndef OPENSSL_NO_BIO
 BIO_METHOD *BIO_f_ssl(void);
@@ -1549,7 +1683,7 @@
 int	SSL_CIPHER_get_bits(const SSL_CIPHER *c,int *alg_bits);
 char *	SSL_CIPHER_get_version(const SSL_CIPHER *c);
 const char *	SSL_CIPHER_get_name(const SSL_CIPHER *c);
-const char *	SSL_CIPHER_authentication_method(const SSL_CIPHER *c);
+unsigned long 	SSL_CIPHER_get_id(const SSL_CIPHER *c);
 
 int	SSL_get_fd(const SSL *s);
 int	SSL_get_rfd(const SSL *s);
@@ -1619,11 +1753,15 @@
 long	SSL_SESSION_get_timeout(const SSL_SESSION *s);
 long	SSL_SESSION_set_timeout(SSL_SESSION *s, long t);
 void	SSL_copy_session_id(SSL *to,const SSL *from);
+X509 *SSL_SESSION_get0_peer(SSL_SESSION *s);
+int SSL_SESSION_set1_id_context(SSL_SESSION *s,const unsigned char *sid_ctx,
+			       unsigned int sid_ctx_len);
 
 SSL_SESSION *SSL_SESSION_new(void);
 const unsigned char *SSL_SESSION_get_id(const SSL_SESSION *s,
 					unsigned int *len);
 const char *	SSL_SESSION_get_version(const SSL_SESSION *s);
+unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s);
 #ifndef OPENSSL_NO_FP_API
 int	SSL_SESSION_print_fp(FILE *fp,const SSL_SESSION *ses);
 #endif
@@ -1687,6 +1825,30 @@
 int SSL_CTX_set1_param(SSL_CTX *ctx, X509_VERIFY_PARAM *vpm);
 int SSL_set1_param(SSL *ssl, X509_VERIFY_PARAM *vpm);
 
+#ifndef OPENSSL_NO_SRP
+int SSL_CTX_set_srp_username(SSL_CTX *ctx,char *name);
+int SSL_CTX_set_srp_password(SSL_CTX *ctx,char *password);
+int SSL_CTX_set_srp_strength(SSL_CTX *ctx, int strength);
+int SSL_CTX_set_srp_client_pwd_callback(SSL_CTX *ctx,
+					char *(*cb)(SSL *,void *));
+int SSL_CTX_set_srp_verify_param_callback(SSL_CTX *ctx,
+					  int (*cb)(SSL *,void *));
+int SSL_CTX_set_srp_username_callback(SSL_CTX *ctx,
+				      int (*cb)(SSL *,int *,void *));
+int SSL_CTX_set_srp_cb_arg(SSL_CTX *ctx, void *arg);
+
+int SSL_set_srp_server_param(SSL *s, const BIGNUM *N, const BIGNUM *g,
+			     BIGNUM *sa, BIGNUM *v, char *info);
+int SSL_set_srp_server_param_pw(SSL *s, const char *user, const char *pass,
+				const char *grp);
+
+BIGNUM *SSL_get_srp_g(SSL *s);
+BIGNUM *SSL_get_srp_N(SSL *s);
+
+char *SSL_get_srp_username(SSL *s);
+char *SSL_get_srp_userinfo(SSL *s);
+#endif
+
 void	SSL_free(SSL *ssl);
 int 	SSL_accept(SSL *ssl);
 int 	SSL_connect(SSL *ssl);
@@ -1722,6 +1884,15 @@
 const SSL_METHOD *TLSv1_server_method(void);	/* TLSv1.0 */
 const SSL_METHOD *TLSv1_client_method(void);	/* TLSv1.0 */
 
+const SSL_METHOD *TLSv1_1_method(void);		/* TLSv1.1 */
+const SSL_METHOD *TLSv1_1_server_method(void);	/* TLSv1.1 */
+const SSL_METHOD *TLSv1_1_client_method(void);	/* TLSv1.1 */
+
+const SSL_METHOD *TLSv1_2_method(void);		/* TLSv1.2 */
+const SSL_METHOD *TLSv1_2_server_method(void);	/* TLSv1.2 */
+const SSL_METHOD *TLSv1_2_client_method(void);	/* TLSv1.2 */
+
+
 const SSL_METHOD *DTLSv1_method(void);		/* DTLSv1.0 */
 const SSL_METHOD *DTLSv1_server_method(void);	/* DTLSv1.0 */
 const SSL_METHOD *DTLSv1_client_method(void);	/* DTLSv1.0 */
@@ -1730,6 +1901,7 @@
 
 int SSL_do_handshake(SSL *s);
 int SSL_renegotiate(SSL *s);
+int SSL_renegotiate_abbreviated(SSL *s);
 int SSL_renegotiate_pending(SSL *s);
 int SSL_shutdown(SSL *s);
 
@@ -1781,6 +1953,7 @@
 			   void (*cb)(const SSL *ssl,int type,int val));
 void (*SSL_get_info_callback(const SSL *ssl))(const SSL *ssl,int type,int val);
 int SSL_state(const SSL *ssl);
+void SSL_set_state(SSL *ssl, int state);
 
 void SSL_set_verify_result(SSL *ssl,long v);
 long SSL_get_verify_result(const SSL *ssl);
@@ -1881,6 +2054,9 @@
 /* Pre-shared secret session resumption functions */
 int SSL_set_session_secret_cb(SSL *s, tls_session_secret_cb_fn tls_session_secret_cb, void *arg);
 
+void SSL_set_debug(SSL *s, int debug);
+int SSL_cache_hit(SSL *s);
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -1900,7 +2076,6 @@
 #define SSL_F_DTLS1_ACCEPT				 246
 #define SSL_F_DTLS1_ADD_CERT_TO_BUF			 295
 #define SSL_F_DTLS1_BUFFER_RECORD			 247
-#define SSL_F_DTLS1_CHECK_TIMEOUT_NUM			 305
 #define SSL_F_DTLS1_CLIENT_HELLO			 248
 #define SSL_F_DTLS1_CONNECT				 249
 #define SSL_F_DTLS1_ENC					 250
@@ -1909,6 +2084,7 @@
 #define SSL_F_DTLS1_GET_MESSAGE_FRAGMENT		 253
 #define SSL_F_DTLS1_GET_RECORD				 254
 #define SSL_F_DTLS1_HANDLE_TIMEOUT			 297
+#define SSL_F_DTLS1_HEARTBEAT				 305
 #define SSL_F_DTLS1_OUTPUT_CERT_CHAIN			 255
 #define SSL_F_DTLS1_PREPROCESS_FRAGMENT			 288
 #define SSL_F_DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE		 256
@@ -1977,7 +2153,7 @@
 #define SSL_F_SSL3_GET_KEY_EXCHANGE			 141
 #define SSL_F_SSL3_GET_MESSAGE				 142
 #define SSL_F_SSL3_GET_NEW_SESSION_TICKET		 283
-#define SSL_F_SSL3_GET_NEXT_PROTO			 304
+#define SSL_F_SSL3_GET_NEXT_PROTO			 306
 #define SSL_F_SSL3_GET_RECORD				 143
 #define SSL_F_SSL3_GET_SERVER_CERTIFICATE		 144
 #define SSL_F_SSL3_GET_SERVER_DONE			 145
@@ -2002,10 +2178,12 @@
 #define SSL_F_SSL3_WRITE_PENDING			 159
 #define SSL_F_SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT	 298
 #define SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT		 277
+#define SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT		 307
 #define SSL_F_SSL_ADD_DIR_CERT_SUBJECTS_TO_STACK	 215
 #define SSL_F_SSL_ADD_FILE_CERT_SUBJECTS_TO_STACK	 216
 #define SSL_F_SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT	 299
 #define SSL_F_SSL_ADD_SERVERHELLO_TLSEXT		 278
+#define SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT		 308
 #define SSL_F_SSL_BAD_METHOD				 160
 #define SSL_F_SSL_BYTES_TO_CIPHER_LIST			 161
 #define SSL_F_SSL_CERT_DUP				 221
@@ -2022,6 +2200,7 @@
 #define SSL_F_SSL_CREATE_CIPHER_LIST			 166
 #define SSL_F_SSL_CTRL					 232
 #define SSL_F_SSL_CTX_CHECK_PRIVATE_KEY			 168
+#define SSL_F_SSL_CTX_MAKE_PROFILES			 309
 #define SSL_F_SSL_CTX_NEW				 169
 #define SSL_F_SSL_CTX_SET_CIPHER_LIST			 269
 #define SSL_F_SSL_CTX_SET_CLIENT_CERT_ENGINE		 290
@@ -2050,8 +2229,10 @@
 #define SSL_F_SSL_NEW					 186
 #define SSL_F_SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT	 300
 #define SSL_F_SSL_PARSE_CLIENTHELLO_TLSEXT		 302
+#define SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT	 310
 #define SSL_F_SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT	 301
 #define SSL_F_SSL_PARSE_SERVERHELLO_TLSEXT		 303
+#define SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT	 311
 #define SSL_F_SSL_PEEK					 270
 #define SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT		 281
 #define SSL_F_SSL_PREPARE_SERVERHELLO_TLSEXT		 282
@@ -2060,6 +2241,7 @@
 #define SSL_F_SSL_RSA_PUBLIC_ENCRYPT			 188
 #define SSL_F_SSL_SESSION_NEW				 189
 #define SSL_F_SSL_SESSION_PRINT_FP			 190
+#define SSL_F_SSL_SESSION_SET1_ID_CONTEXT		 312
 #define SSL_F_SSL_SESS_CERT_NEW				 225
 #define SSL_F_SSL_SET_CERT				 191
 #define SSL_F_SSL_SET_CIPHER_LIST			 271
@@ -2073,6 +2255,7 @@
 #define SSL_F_SSL_SET_TRUST				 228
 #define SSL_F_SSL_SET_WFD				 196
 #define SSL_F_SSL_SHUTDOWN				 224
+#define SSL_F_SSL_SRP_CTX_INIT				 313
 #define SSL_F_SSL_UNDEFINED_CONST_FUNCTION		 243
 #define SSL_F_SSL_UNDEFINED_FUNCTION			 197
 #define SSL_F_SSL_UNDEFINED_VOID_FUNCTION		 244
@@ -2093,6 +2276,8 @@
 #define SSL_F_TLS1_CHANGE_CIPHER_STATE			 209
 #define SSL_F_TLS1_CHECK_SERVERHELLO_TLSEXT		 274
 #define SSL_F_TLS1_ENC					 210
+#define SSL_F_TLS1_EXPORT_KEYING_MATERIAL		 314
+#define SSL_F_TLS1_HEARTBEAT				 315
 #define SSL_F_TLS1_PREPARE_CLIENTHELLO_TLSEXT		 275
 #define SSL_F_TLS1_PREPARE_SERVERHELLO_TLSEXT		 276
 #define SSL_F_TLS1_PRF					 284
@@ -2132,6 +2317,13 @@
 #define SSL_R_BAD_RSA_MODULUS_LENGTH			 121
 #define SSL_R_BAD_RSA_SIGNATURE				 122
 #define SSL_R_BAD_SIGNATURE				 123
+#define SSL_R_BAD_SRP_A_LENGTH				 347
+#define SSL_R_BAD_SRP_B_LENGTH				 348
+#define SSL_R_BAD_SRP_G_LENGTH				 349
+#define SSL_R_BAD_SRP_N_LENGTH				 350
+#define SSL_R_BAD_SRP_S_LENGTH				 351
+#define SSL_R_BAD_SRTP_MKI_VALUE			 352
+#define SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST		 353
 #define SSL_R_BAD_SSL_FILETYPE				 124
 #define SSL_R_BAD_SSL_SESSION_ID_LENGTH			 125
 #define SSL_R_BAD_STATE					 126
@@ -2170,14 +2362,15 @@
 #define SSL_R_ECC_CERT_SHOULD_HAVE_RSA_SIGNATURE	 322
 #define SSL_R_ECC_CERT_SHOULD_HAVE_SHA1_SIGNATURE	 323
 #define SSL_R_ECGROUP_TOO_LARGE_FOR_CIPHER		 310
+#define SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST	 354
 #define SSL_R_ENCRYPTED_LENGTH_TOO_LONG			 150
 #define SSL_R_ERROR_GENERATING_TMP_RSA_KEY		 282
 #define SSL_R_ERROR_IN_RECEIVED_CIPHER_LIST		 151
 #define SSL_R_EXCESSIVE_MESSAGE_SIZE			 152
 #define SSL_R_EXTRA_DATA_IN_MESSAGE			 153
 #define SSL_R_GOT_A_FIN_BEFORE_A_CCS			 154
-#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS		 346
-#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION		 347
+#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS		 355
+#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION		 356
 #define SSL_R_HTTPS_PROXY_REQUEST			 155
 #define SSL_R_HTTP_REQUEST				 156
 #define SSL_R_ILLEGAL_PADDING				 283
@@ -2186,6 +2379,7 @@
 #define SSL_R_INVALID_COMMAND				 280
 #define SSL_R_INVALID_COMPRESSION_ALGORITHM		 341
 #define SSL_R_INVALID_PURPOSE				 278
+#define SSL_R_INVALID_SRP_USERNAME			 357
 #define SSL_R_INVALID_STATUS_RESPONSE			 328
 #define SSL_R_INVALID_TICKET_KEYS_LENGTH		 325
 #define SSL_R_INVALID_TRUST				 279
@@ -2215,6 +2409,7 @@
 #define SSL_R_MISSING_RSA_CERTIFICATE			 168
 #define SSL_R_MISSING_RSA_ENCRYPTING_CERT		 169
 #define SSL_R_MISSING_RSA_SIGNING_CERT			 170
+#define SSL_R_MISSING_SRP_PARAM				 358
 #define SSL_R_MISSING_TMP_DH_KEY			 171
 #define SSL_R_MISSING_TMP_ECDH_KEY			 311
 #define SSL_R_MISSING_TMP_RSA_KEY			 172
@@ -2244,6 +2439,7 @@
 #define SSL_R_NO_RENEGOTIATION				 339
 #define SSL_R_NO_REQUIRED_DIGEST			 324
 #define SSL_R_NO_SHARED_CIPHER				 193
+#define SSL_R_NO_SRTP_PROFILES				 359
 #define SSL_R_NO_VERIFY_CALLBACK			 194
 #define SSL_R_NULL_SSL_CTX				 195
 #define SSL_R_NULL_SSL_METHOD_PASSED			 196
@@ -2288,7 +2484,12 @@
 #define SSL_R_SESSION_ID_CONTEXT_UNINITIALIZED		 277
 #define SSL_R_SESSION_MAY_NOT_BE_CREATED		 2000
 #define SSL_R_SHORT_READ				 219
+#define SSL_R_SIGNATURE_ALGORITHMS_ERROR		 360
 #define SSL_R_SIGNATURE_FOR_NON_SIGNING_CERTIFICATE	 220
+#define SSL_R_SRP_A_CALC				 361
+#define SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES		 362
+#define SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG	 363
+#define SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE		 364
 #define SSL_R_SSL23_DOING_SESSION_ID_REUSE		 221
 #define SSL_R_SSL2_CONNECTION_ID_TOO_LONG		 299
 #define SSL_R_SSL3_EXT_INVALID_ECPOINTFORMAT		 321
@@ -2333,6 +2534,9 @@
 #define SSL_R_TLSV1_UNRECOGNIZED_NAME			 1112
 #define SSL_R_TLSV1_UNSUPPORTED_EXTENSION		 1110
 #define SSL_R_TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER	 232
+#define SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT		 365
+#define SSL_R_TLS_HEARTBEAT_PENDING			 366
+#define SSL_R_TLS_ILLEGAL_EXPORTER_LABEL		 367
 #define SSL_R_TLS_INVALID_ECPOINTFORMAT_LIST		 157
 #define SSL_R_TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST 233
 #define SSL_R_TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG	 234
@@ -2354,6 +2558,7 @@
 #define SSL_R_UNKNOWN_CERTIFICATE_TYPE			 247
 #define SSL_R_UNKNOWN_CIPHER_RETURNED			 248
 #define SSL_R_UNKNOWN_CIPHER_TYPE			 249
+#define SSL_R_UNKNOWN_DIGEST				 368
 #define SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE			 250
 #define SSL_R_UNKNOWN_PKEY_TYPE				 251
 #define SSL_R_UNKNOWN_PROTOCOL				 252
@@ -2368,12 +2573,14 @@
 #define SSL_R_UNSUPPORTED_PROTOCOL			 258
 #define SSL_R_UNSUPPORTED_SSL_VERSION			 259
 #define SSL_R_UNSUPPORTED_STATUS_TYPE			 329
+#define SSL_R_USE_SRTP_NOT_NEGOTIATED			 369
 #define SSL_R_WRITE_BIO_NOT_SET				 260
 #define SSL_R_WRONG_CIPHER_RETURNED			 261
 #define SSL_R_WRONG_MESSAGE_TYPE			 262
 #define SSL_R_WRONG_NUMBER_OF_KEY_BITS			 263
 #define SSL_R_WRONG_SIGNATURE_LENGTH			 264
 #define SSL_R_WRONG_SIGNATURE_SIZE			 265
+#define SSL_R_WRONG_SIGNATURE_TYPE			 370
 #define SSL_R_WRONG_SSL_VERSION				 266
 #define SSL_R_WRONG_VERSION_NUMBER			 267
 #define SSL_R_X509_LIB					 268

diff --git a/ssl/ssl2.h b/ssl/ssl2.h
index 99a52ea..eb25dcb 100644
--- a/ssl/ssl2.h
+++ b/ssl/ssl2.h

@@ -155,6 +155,8 @@
 #define  CERT		char
 #endif
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl2_state_st
 	{
 	int three_byte_header;
@@ -219,6 +221,8 @@
 		} tmp;
 	} SSL2_STATE;
 
+#endif
+
 /* SSLv2 */
 /* client */
 #define SSL2_ST_SEND_CLIENT_HELLO_A		(0x10|SSL_ST_CONNECT)

diff --git a/ssl/ssl3.h b/ssl/ssl3.h
index d6425e5..fb08e72 100644
--- a/ssl/ssl3.h
+++ b/ssl/ssl3.h

@@ -332,6 +332,7 @@
 #define SSL3_RT_ALERT			21
 #define SSL3_RT_HANDSHAKE		22
 #define SSL3_RT_APPLICATION_DATA	23
+#define TLS1_RT_HEARTBEAT		24
 
 #define SSL3_AL_WARNING			1
 #define SSL3_AL_FATAL			2
@@ -349,6 +350,11 @@
 #define SSL3_AD_CERTIFICATE_UNKNOWN	46
 #define SSL3_AD_ILLEGAL_PARAMETER	47	/* fatal */
 
+#define TLS1_HB_REQUEST		1
+#define TLS1_HB_RESPONSE	2
+	
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl3_record_st
 	{
 /*r */	int type;               /* type of record */
@@ -370,6 +376,8 @@
 	int left;               /* how many bytes left */
 	} SSL3_BUFFER;
 
+#endif
+
 #define SSL3_CT_RSA_SIGN			1
 #define SSL3_CT_DSS_SIGN			2
 #define SSL3_CT_RSA_FIXED_DH			3
@@ -389,6 +397,7 @@
 #define SSL3_FLAGS_POP_BUFFER			0x0004
 #define TLS1_FLAGS_TLS_PADDING_BUG		0x0008
 #define TLS1_FLAGS_SKIP_CERT_VERIFY		0x0010
+#define TLS1_FLAGS_KEEP_HANDSHAKE		0x0020
  
 /* SSL3_FLAGS_SGC_RESTART_DONE is set when we
  * restart a handshake because of MS SGC and so prevents us
@@ -401,6 +410,8 @@
  */
 #define SSL3_FLAGS_SGC_RESTART_DONE		0x0040
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl3_state_st
 	{
 	long flags;
@@ -476,12 +487,6 @@
 	void *server_opaque_prf_input;
 	size_t server_opaque_prf_input_len;
 
-#ifndef OPENSSL_NO_NEXTPROTONEG
-	/* Set if we saw the Next Protocol Negotiation extension from
-	   our peer. */
-	int next_proto_neg_seen;
-#endif
-
 	struct	{
 		/* actually only needs to be 16+20 */
 		unsigned char cert_verify_md[EVP_MAX_MD_SIZE*2];
@@ -491,7 +496,7 @@
 		int finish_md_len;
 		unsigned char peer_finish_md[EVP_MAX_MD_SIZE*2];
 		int peer_finish_md_len;
-		
+
 		unsigned long message_size;
 		int message_type;
 
@@ -539,14 +544,24 @@
         unsigned char previous_server_finished[EVP_MAX_MD_SIZE];
         unsigned char previous_server_finished_len;
         int send_connection_binding; /* TODOEKR */
+
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	/* Set if we saw the Next Protocol Negotiation extension from our peer. */
+	int next_proto_neg_seen;
+#endif
 	} SSL3_STATE;
 
+#endif
 
 /* SSLv3 */
 /*client */
 /* extra state */
 #define SSL3_ST_CW_FLUSH		(0x100|SSL_ST_CONNECT)
 #define SSL3_ST_CUTTHROUGH_COMPLETE	(0x101|SSL_ST_CONNECT)
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_ST_CW_WRITE_SOCK			(0x310|SSL_ST_CONNECT)
+#define DTLS1_SCTP_ST_CR_READ_SOCK			(0x320|SSL_ST_CONNECT)
+#endif	
 /* write to server */
 #define SSL3_ST_CW_CLNT_HELLO_A		(0x110|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CLNT_HELLO_B		(0x111|SSL_ST_CONNECT)
@@ -574,10 +589,8 @@
 #define SSL3_ST_CW_CERT_VRFY_B		(0x191|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CHANGE_A		(0x1A0|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CHANGE_B		(0x1A1|SSL_ST_CONNECT)
-#ifndef OPENSSL_NO_NEXTPROTONEG
 #define SSL3_ST_CW_NEXT_PROTO_A		(0x200|SSL_ST_CONNECT)
 #define SSL3_ST_CW_NEXT_PROTO_B		(0x201|SSL_ST_CONNECT)
-#endif
 #define SSL3_ST_CW_FINISHED_A		(0x1B0|SSL_ST_CONNECT)
 #define SSL3_ST_CW_FINISHED_B		(0x1B1|SSL_ST_CONNECT)
 /* read from server */
@@ -593,6 +606,10 @@
 /* server */
 /* extra state */
 #define SSL3_ST_SW_FLUSH		(0x100|SSL_ST_ACCEPT)
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_ST_SW_WRITE_SOCK			(0x310|SSL_ST_ACCEPT)
+#define DTLS1_SCTP_ST_SR_READ_SOCK			(0x320|SSL_ST_ACCEPT)
+#endif	
 /* read from client */
 /* Do not change the number values, they do matter */
 #define SSL3_ST_SR_CLNT_HELLO_A		(0x110|SSL_ST_ACCEPT)
@@ -623,10 +640,8 @@
 #define SSL3_ST_SR_CERT_VRFY_B		(0x1A1|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_CHANGE_A		(0x1B0|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_CHANGE_B		(0x1B1|SSL_ST_ACCEPT)
-#ifndef OPENSSL_NO_NEXTPROTONEG
 #define SSL3_ST_SR_NEXT_PROTO_A		(0x210|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_NEXT_PROTO_B		(0x211|SSL_ST_ACCEPT)
-#endif
 #define SSL3_ST_SR_FINISHED_A		(0x1C0|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_FINISHED_B		(0x1C1|SSL_ST_ACCEPT)
 /* write to client */
@@ -651,9 +666,7 @@
 #define SSL3_MT_CLIENT_KEY_EXCHANGE		16
 #define SSL3_MT_FINISHED			20
 #define SSL3_MT_CERTIFICATE_STATUS		22
-#ifndef OPENSSL_NO_NEXTPROTONEG
 #define SSL3_MT_NEXT_PROTO			67
-#endif
 #define DTLS1_MT_HELLO_VERIFY_REQUEST    3
 
 

diff --git a/ssl/ssl_algs.c b/ssl/ssl_algs.c
index 0967b2d..d443143 100644
--- a/ssl/ssl_algs.c
+++ b/ssl/ssl_algs.c

@@ -73,6 +73,9 @@
 #endif
 #ifndef OPENSSL_NO_RC4
 	EVP_add_cipher(EVP_rc4());
+#if !defined(OPENSSL_NO_MD5) && (defined(__x86_64) || defined(__x86_64__))
+	EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
 #endif  
 #ifndef OPENSSL_NO_RC2
 	EVP_add_cipher(EVP_rc2_cbc());
@@ -85,6 +88,12 @@
 	EVP_add_cipher(EVP_aes_128_cbc());
 	EVP_add_cipher(EVP_aes_192_cbc());
 	EVP_add_cipher(EVP_aes_256_cbc());
+	EVP_add_cipher(EVP_aes_128_gcm());
+	EVP_add_cipher(EVP_aes_256_gcm());
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+	EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+	EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
 	EVP_add_cipher(EVP_camellia_128_cbc());

diff --git a/ssl/ssl_asn1.c b/ssl/ssl_asn1.c
index d7f4c60..38540be 100644
--- a/ssl/ssl_asn1.c
+++ b/ssl/ssl_asn1.c

@@ -114,6 +114,9 @@
 	ASN1_OCTET_STRING psk_identity_hint;
 	ASN1_OCTET_STRING psk_identity;
 #endif /* OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	ASN1_OCTET_STRING srp_username;
+#endif /* OPENSSL_NO_SRP */
 	} SSL_SESSION_ASN1;
 
 int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp)
@@ -130,6 +133,9 @@
 	unsigned char cbuf;
 	int v11=0;
 #endif
+#ifndef OPENSSL_NO_SRP
+	int v12=0;
+#endif
 	long l;
 	SSL_SESSION_ASN1 a;
 	M_ASN1_I2D_vars(in);
@@ -267,6 +273,14 @@
 		a.psk_identity.data=(unsigned char *)(in->psk_identity);
 		}
 #endif /* OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	if (in->srp_username)
+		{
+		a.srp_username.length=strlen(in->srp_username);
+		a.srp_username.type=V_ASN1_OCTET_STRING;
+		a.srp_username.data=(unsigned char *)(in->srp_username);
+		}
+#endif /* OPENSSL_NO_SRP */
 
 	M_ASN1_I2D_len(&(a.version),		i2d_ASN1_INTEGER);
 	M_ASN1_I2D_len(&(a.ssl_version),	i2d_ASN1_INTEGER);
@@ -307,6 +321,10 @@
 	if (in->psk_identity)
         	M_ASN1_I2D_len_EXP_opt(&(a.psk_identity), i2d_ASN1_OCTET_STRING,8,v8);
 #endif /* OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	if (in->srp_username)
+        	M_ASN1_I2D_len_EXP_opt(&(a.srp_username), i2d_ASN1_OCTET_STRING,12,v12);
+#endif /* OPENSSL_NO_SRP */
 
 	M_ASN1_I2D_seq_total();
 
@@ -351,6 +369,10 @@
 	if (in->compress_meth)
         	M_ASN1_I2D_put_EXP_opt(&(a.comp_id), i2d_ASN1_OCTET_STRING,11,v11);
 #endif
+#ifndef OPENSSL_NO_SRP
+	if (in->srp_username)
+		M_ASN1_I2D_put_EXP_opt(&(a.srp_username), i2d_ASN1_OCTET_STRING,12,v12);
+#endif /* OPENSSL_NO_SRP */
 	M_ASN1_I2D_finish();
 	}
 
@@ -549,6 +571,19 @@
 		}
 	else
 		ret->psk_identity_hint=NULL;
+
+	os.length=0;
+	os.data=NULL;
+	M_ASN1_D2I_get_EXP_opt(osp,d2i_ASN1_OCTET_STRING,8);
+	if (os.data)
+		{
+		ret->psk_identity = BUF_strndup((char *)os.data, os.length);
+		OPENSSL_free(os.data);
+		os.data = NULL;
+		os.length = 0;
+		}
+	else
+		ret->psk_identity=NULL;
 #endif /* OPENSSL_NO_PSK */
 
 #ifndef OPENSSL_NO_TLSEXT
@@ -588,5 +623,20 @@
 		}
 #endif
 
+#ifndef OPENSSL_NO_SRP
+	os.length=0;
+	os.data=NULL;
+	M_ASN1_D2I_get_EXP_opt(osp,d2i_ASN1_OCTET_STRING,12);
+	if (os.data)
+		{
+		ret->srp_username = BUF_strndup((char *)os.data, os.length);
+		OPENSSL_free(os.data);
+		os.data = NULL;
+		os.length = 0;
+		}
+	else
+		ret->srp_username=NULL;
+#endif /* OPENSSL_NO_SRP */
+
 	M_ASN1_D2I_Finish(a,SSL_SESSION_free,SSL_F_D2I_SSL_SESSION);
 	}

diff --git a/ssl/ssl_cert.c b/ssl/ssl_cert.c
index 27256ee..917be31 100644
--- a/ssl/ssl_cert.c
+++ b/ssl/ssl_cert.c

@@ -160,6 +160,21 @@
 	return ssl_x509_store_ctx_idx;
 	}
 
+static void ssl_cert_set_default_md(CERT *cert)
+	{
+	/* Set digest values to defaults */
+#ifndef OPENSSL_NO_DSA
+	cert->pkeys[SSL_PKEY_DSA_SIGN].digest = EVP_dss1();
+#endif
+#ifndef OPENSSL_NO_RSA
+	cert->pkeys[SSL_PKEY_RSA_SIGN].digest = EVP_sha1();
+	cert->pkeys[SSL_PKEY_RSA_ENC].digest = EVP_sha1();
+#endif
+#ifndef OPENSSL_NO_ECDSA
+	cert->pkeys[SSL_PKEY_ECC].digest = EVP_ecdsa();
+#endif
+	}
+
 CERT *ssl_cert_new(void)
 	{
 	CERT *ret;
@@ -174,7 +189,7 @@
 
 	ret->key= &(ret->pkeys[SSL_PKEY_RSA_ENC]);
 	ret->references=1;
-
+	ssl_cert_set_default_md(ret);
 	return(ret);
 	}
 
@@ -307,6 +322,10 @@
 	 * chain is held inside SSL_CTX */
 
 	ret->references=1;
+	/* Set digests to defaults. NB: we don't copy existing values as they
+	 * will be set during handshake.
+	 */
+	ssl_cert_set_default_md(ret);
 
 	return(ret);
 	

diff --git a/ssl/ssl_ciph.c b/ssl/ssl_ciph.c
index 462c45a..1a143a7 100644
--- a/ssl/ssl_ciph.c
+++ b/ssl/ssl_ciph.c

@@ -162,11 +162,13 @@
 #define SSL_ENC_CAMELLIA256_IDX	9
 #define SSL_ENC_GOST89_IDX	10
 #define SSL_ENC_SEED_IDX    	11
-#define SSL_ENC_NUM_IDX		12
+#define SSL_ENC_AES128GCM_IDX	12
+#define SSL_ENC_AES256GCM_IDX	13
+#define SSL_ENC_NUM_IDX		14
 
 
 static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX]={
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
+	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL
 	};
 
 #define SSL_COMP_NULL_IDX	0
@@ -179,28 +181,32 @@
 #define SSL_MD_SHA1_IDX	1
 #define SSL_MD_GOST94_IDX 2
 #define SSL_MD_GOST89MAC_IDX 3
+#define SSL_MD_SHA256_IDX 4
+#define SSL_MD_SHA384_IDX 5
 /*Constant SSL_MAX_DIGEST equal to size of digests array should be 
  * defined in the
  * ssl_locl.h */
 #define SSL_MD_NUM_IDX	SSL_MAX_DIGEST 
 static const EVP_MD *ssl_digest_methods[SSL_MD_NUM_IDX]={
-	NULL,NULL,NULL,NULL
+	NULL,NULL,NULL,NULL,NULL,NULL
 	};
 /* PKEY_TYPE for GOST89MAC is known in advance, but, because
  * implementation is engine-provided, we'll fill it only if
  * corresponding EVP_PKEY_METHOD is found 
  */
 static int  ssl_mac_pkey_id[SSL_MD_NUM_IDX]={
-	EVP_PKEY_HMAC,EVP_PKEY_HMAC,EVP_PKEY_HMAC,NID_undef
+	EVP_PKEY_HMAC,EVP_PKEY_HMAC,EVP_PKEY_HMAC,NID_undef,
+	EVP_PKEY_HMAC,EVP_PKEY_HMAC
 	};
 
 static int ssl_mac_secret_size[SSL_MD_NUM_IDX]={
-	0,0,0,0
+	0,0,0,0,0,0
 	};
 
 static int ssl_handshake_digest_flag[SSL_MD_NUM_IDX]={
 	SSL_HANDSHAKE_MAC_MD5,SSL_HANDSHAKE_MAC_SHA,
-	SSL_HANDSHAKE_MAC_GOST94,0
+	SSL_HANDSHAKE_MAC_GOST94, 0, SSL_HANDSHAKE_MAC_SHA256,
+	SSL_HANDSHAKE_MAC_SHA384
 	};
 
 #define CIPHER_ADD	1
@@ -247,6 +253,7 @@
 	{0,SSL_TXT_ECDH,0,    SSL_kECDHr|SSL_kECDHe|SSL_kEECDH,0,0,0,0,0,0,0,0},
 
         {0,SSL_TXT_kPSK,0,    SSL_kPSK,  0,0,0,0,0,0,0,0},
+	{0,SSL_TXT_kSRP,0,    SSL_kSRP,  0,0,0,0,0,0,0,0},
 	{0,SSL_TXT_kGOST,0, SSL_kGOST,0,0,0,0,0,0,0,0},
 
 	/* server authentication aliases */
@@ -273,6 +280,7 @@
 	{0,SSL_TXT_ADH,0,     SSL_kEDH,SSL_aNULL,0,0,0,0,0,0,0},
 	{0,SSL_TXT_AECDH,0,   SSL_kEECDH,SSL_aNULL,0,0,0,0,0,0,0},
         {0,SSL_TXT_PSK,0,     SSL_kPSK,SSL_aPSK,0,0,0,0,0,0,0},
+	{0,SSL_TXT_SRP,0,     SSL_kSRP,0,0,0,0,0,0,0,0},
 
 
 	/* symmetric encryption aliases */
@@ -283,9 +291,10 @@
 	{0,SSL_TXT_IDEA,0,    0,0,SSL_IDEA,  0,0,0,0,0,0},
 	{0,SSL_TXT_SEED,0,    0,0,SSL_SEED,  0,0,0,0,0,0},
 	{0,SSL_TXT_eNULL,0,   0,0,SSL_eNULL, 0,0,0,0,0,0},
-	{0,SSL_TXT_AES128,0,  0,0,SSL_AES128,0,0,0,0,0,0},
-	{0,SSL_TXT_AES256,0,  0,0,SSL_AES256,0,0,0,0,0,0},
-	{0,SSL_TXT_AES,0,     0,0,SSL_AES128|SSL_AES256,0,0,0,0,0,0},
+	{0,SSL_TXT_AES128,0,  0,0,SSL_AES128|SSL_AES128GCM,0,0,0,0,0,0},
+	{0,SSL_TXT_AES256,0,  0,0,SSL_AES256|SSL_AES256GCM,0,0,0,0,0,0},
+	{0,SSL_TXT_AES,0,     0,0,SSL_AES,0,0,0,0,0,0},
+	{0,SSL_TXT_AES_GCM,0, 0,0,SSL_AES128GCM|SSL_AES256GCM,0,0,0,0,0,0},
 	{0,SSL_TXT_CAMELLIA128,0,0,0,SSL_CAMELLIA128,0,0,0,0,0,0},
 	{0,SSL_TXT_CAMELLIA256,0,0,0,SSL_CAMELLIA256,0,0,0,0,0,0},
 	{0,SSL_TXT_CAMELLIA   ,0,0,0,SSL_CAMELLIA128|SSL_CAMELLIA256,0,0,0,0,0,0},
@@ -296,6 +305,8 @@
 	{0,SSL_TXT_SHA,0,     0,0,0,SSL_SHA1,  0,0,0,0,0},
 	{0,SSL_TXT_GOST94,0,     0,0,0,SSL_GOST94,  0,0,0,0,0},
 	{0,SSL_TXT_GOST89MAC,0,     0,0,0,SSL_GOST89MAC,  0,0,0,0,0},
+	{0,SSL_TXT_SHA256,0,    0,0,0,SSL_SHA256,  0,0,0,0,0},
+	{0,SSL_TXT_SHA384,0,    0,0,0,SSL_SHA384,  0,0,0,0,0},
 
 	/* protocol version aliases */
 	{0,SSL_TXT_SSLV2,0,   0,0,0,0,SSL_SSLV2, 0,0,0,0},
@@ -379,6 +390,11 @@
 	ssl_cipher_methods[SSL_ENC_SEED_IDX]=
 	  EVP_get_cipherbyname(SN_seed_cbc);
 
+	ssl_cipher_methods[SSL_ENC_AES128GCM_IDX]=
+	  EVP_get_cipherbyname(SN_aes_128_gcm);
+	ssl_cipher_methods[SSL_ENC_AES256GCM_IDX]=
+	  EVP_get_cipherbyname(SN_aes_256_gcm);
+
 	ssl_digest_methods[SSL_MD_MD5_IDX]=
 		EVP_get_digestbyname(SN_md5);
 	ssl_mac_secret_size[SSL_MD_MD5_IDX]=
@@ -404,6 +420,14 @@
 			ssl_mac_secret_size[SSL_MD_GOST89MAC_IDX]=32;
 		}		
 
+	ssl_digest_methods[SSL_MD_SHA256_IDX]=
+		EVP_get_digestbyname(SN_sha256);
+	ssl_mac_secret_size[SSL_MD_SHA256_IDX]=
+		EVP_MD_size(ssl_digest_methods[SSL_MD_SHA256_IDX]);
+	ssl_digest_methods[SSL_MD_SHA384_IDX]=
+		EVP_get_digestbyname(SN_sha384);
+	ssl_mac_secret_size[SSL_MD_SHA384_IDX]=
+		EVP_MD_size(ssl_digest_methods[SSL_MD_SHA384_IDX]);
 	}
 #ifndef OPENSSL_NO_COMP
 
@@ -526,6 +550,12 @@
 	case SSL_SEED:
 		i=SSL_ENC_SEED_IDX;
 		break;
+	case SSL_AES128GCM:
+		i=SSL_ENC_AES128GCM_IDX;
+		break;
+	case SSL_AES256GCM:
+		i=SSL_ENC_AES256GCM_IDX;
+		break;
 	default:
 		i= -1;
 		break;
@@ -549,6 +579,12 @@
 	case SSL_SHA1:
 		i=SSL_MD_SHA1_IDX;
 		break;
+	case SSL_SHA256:
+		i=SSL_MD_SHA256_IDX;
+		break;
+	case SSL_SHA384:
+		i=SSL_MD_SHA384_IDX;
+		break;
 	case SSL_GOST94:
 		i = SSL_MD_GOST94_IDX;
 		break;
@@ -564,17 +600,39 @@
 		*md=NULL; 
 		if (mac_pkey_type!=NULL) *mac_pkey_type = NID_undef;
 		if (mac_secret_size!=NULL) *mac_secret_size = 0;
-
+		if (c->algorithm_mac == SSL_AEAD)
+			mac_pkey_type = NULL;
 	}
 	else
 	{
 		*md=ssl_digest_methods[i];
 		if (mac_pkey_type!=NULL) *mac_pkey_type = ssl_mac_pkey_id[i];
 		if (mac_secret_size!=NULL) *mac_secret_size = ssl_mac_secret_size[i];
-	}	
+	}
 
-	if ((*enc != NULL) && (*md != NULL) && (!mac_pkey_type||*mac_pkey_type != NID_undef))
+	if ((*enc != NULL) &&
+	    (*md != NULL || (EVP_CIPHER_flags(*enc)&EVP_CIPH_FLAG_AEAD_CIPHER)) &&
+	    (!mac_pkey_type||*mac_pkey_type != NID_undef))
+		{
+		const EVP_CIPHER *evp;
+
+		if	(s->ssl_version >= TLS1_VERSION &&
+			 c->algorithm_enc == SSL_RC4 &&
+			 c->algorithm_mac == SSL_MD5 &&
+			 (evp=EVP_get_cipherbyname("RC4-HMAC-MD5")))
+			*enc = evp, *md = NULL;
+		else if (s->ssl_version >= TLS1_VERSION &&
+			 c->algorithm_enc == SSL_AES128 &&
+			 c->algorithm_mac == SSL_SHA1 &&
+			 (evp=EVP_get_cipherbyname("AES-128-CBC-HMAC-SHA1")))
+			*enc = evp, *md = NULL;
+		else if (s->ssl_version >= TLS1_VERSION &&
+			 c->algorithm_enc == SSL_AES256 &&
+			 c->algorithm_mac == SSL_SHA1 &&
+			 (evp=EVP_get_cipherbyname("AES-256-CBC-HMAC-SHA1")))
+			*enc = evp, *md = NULL;
 		return(1);
+		}
 	else
 		return(0);
 	}
@@ -585,9 +643,11 @@
 		{
 		return 0;
 		}
-	if (ssl_handshake_digest_flag[idx]==0) return 0;
 	*mask = ssl_handshake_digest_flag[idx];
-	*md = ssl_digest_methods[idx];
+	if (*mask)
+		*md = ssl_digest_methods[idx];
+	else
+		*md = NULL;
 	return 1;
 }
 
@@ -662,6 +722,9 @@
 	*mkey |= SSL_kPSK;
 	*auth |= SSL_aPSK;
 #endif
+#ifdef OPENSSL_NO_SRP
+	*mkey |= SSL_kSRP;
+#endif
 	/* Check for presence of GOST 34.10 algorithms, and if they
 	 * do not present, disable  appropriate auth and key exchange */
 	if (!get_optional_pkey_id("gost94")) {
@@ -687,6 +750,8 @@
 	*enc |= (ssl_cipher_methods[SSL_ENC_IDEA_IDX] == NULL) ? SSL_IDEA:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_AES128_IDX] == NULL) ? SSL_AES128:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_AES256_IDX] == NULL) ? SSL_AES256:0;
+	*enc |= (ssl_cipher_methods[SSL_ENC_AES128GCM_IDX] == NULL) ? SSL_AES128GCM:0;
+	*enc |= (ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] == NULL) ? SSL_AES256GCM:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_CAMELLIA128_IDX] == NULL) ? SSL_CAMELLIA128:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_CAMELLIA256_IDX] == NULL) ? SSL_CAMELLIA256:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_GOST89_IDX] == NULL) ? SSL_eGOST2814789CNT:0;
@@ -694,6 +759,8 @@
 
 	*mac |= (ssl_digest_methods[SSL_MD_MD5_IDX ] == NULL) ? SSL_MD5 :0;
 	*mac |= (ssl_digest_methods[SSL_MD_SHA1_IDX] == NULL) ? SSL_SHA1:0;
+	*mac |= (ssl_digest_methods[SSL_MD_SHA256_IDX] == NULL) ? SSL_SHA256:0;
+	*mac |= (ssl_digest_methods[SSL_MD_SHA384_IDX] == NULL) ? SSL_SHA384:0;
 	*mac |= (ssl_digest_methods[SSL_MD_GOST94_IDX] == NULL) ? SSL_GOST94:0;
 	*mac |= (ssl_digest_methods[SSL_MD_GOST89MAC_IDX] == NULL || ssl_mac_pkey_id[SSL_MD_GOST89MAC_IDX]==NID_undef)? SSL_GOST89MAC:0;
 
@@ -724,6 +791,9 @@
 		c = ssl_method->get_cipher(i);
 		/* drop those that use any of that is not available */
 		if ((c != NULL) && c->valid &&
+#ifdef OPENSSL_FIPS
+		    (!FIPS_mode() || (c->algo_strength & SSL_FIPS)) &&
+#endif
 		    !(c->algorithm_mkey & disabled_mkey) &&
 		    !(c->algorithm_auth & disabled_auth) &&
 		    !(c->algorithm_enc & disabled_enc) &&
@@ -1423,7 +1493,11 @@
 	 */
 	for (curr = head; curr != NULL; curr = curr->next)
 		{
+#ifdef OPENSSL_FIPS
+		if (curr->active && (!FIPS_mode() || curr->cipher->algo_strength & SSL_FIPS))
+#else
 		if (curr->active)
+#endif
 			{
 			sk_SSL_CIPHER_push(cipherstack, curr->cipher);
 #ifdef CIPHER_DEBUG
@@ -1480,6 +1554,8 @@
 		ver="SSLv2";
 	else if (alg_ssl & SSL_SSLV3)
 		ver="SSLv3";
+	else if (alg_ssl & SSL_TLSV1_2)
+		ver="TLSv1.2";
 	else
 		ver="unknown";
 
@@ -1512,6 +1588,9 @@
 	case SSL_kPSK:
 		kx="PSK";
 		break;
+	case SSL_kSRP:
+		kx="SRP";
+		break;
 	default:
 		kx="unknown";
 		}
@@ -1574,6 +1653,12 @@
 	case SSL_AES256:
 		enc="AES(256)";
 		break;
+	case SSL_AES128GCM:
+		enc="AESGCM(128)";
+		break;
+	case SSL_AES256GCM:
+		enc="AESGCM(256)";
+		break;
 	case SSL_CAMELLIA128:
 		enc="Camellia(128)";
 		break;
@@ -1596,6 +1681,15 @@
 	case SSL_SHA1:
 		mac="SHA1";
 		break;
+	case SSL_SHA256:
+		mac="SHA256";
+		break;
+	case SSL_SHA384:
+		mac="SHA384";
+		break;
+	case SSL_AEAD:
+		mac="AEAD";
+		break;
 	default:
 		mac="unknown";
 		break;
@@ -1653,6 +1747,11 @@
 	return(ret);
 	}
 
+unsigned long SSL_CIPHER_get_id(const SSL_CIPHER *c)
+	{
+	return c->id;
+	}
+
 /* return string version of key exchange algorithm */
 const char* SSL_CIPHER_authentication_method(const SSL_CIPHER* cipher)
 	{

diff --git a/ssl/ssl_err.c b/ssl/ssl_err.c
index 0b8c127..2039a0c 100644
--- a/ssl/ssl_err.c
+++ b/ssl/ssl_err.c

@@ -80,7 +80,6 @@
 {ERR_FUNC(SSL_F_DTLS1_ACCEPT),	"DTLS1_ACCEPT"},
 {ERR_FUNC(SSL_F_DTLS1_ADD_CERT_TO_BUF),	"DTLS1_ADD_CERT_TO_BUF"},
 {ERR_FUNC(SSL_F_DTLS1_BUFFER_RECORD),	"DTLS1_BUFFER_RECORD"},
-{ERR_FUNC(SSL_F_DTLS1_CHECK_TIMEOUT_NUM),	"DTLS1_CHECK_TIMEOUT_NUM"},
 {ERR_FUNC(SSL_F_DTLS1_CLIENT_HELLO),	"DTLS1_CLIENT_HELLO"},
 {ERR_FUNC(SSL_F_DTLS1_CONNECT),	"DTLS1_CONNECT"},
 {ERR_FUNC(SSL_F_DTLS1_ENC),	"DTLS1_ENC"},
@@ -89,6 +88,7 @@
 {ERR_FUNC(SSL_F_DTLS1_GET_MESSAGE_FRAGMENT),	"DTLS1_GET_MESSAGE_FRAGMENT"},
 {ERR_FUNC(SSL_F_DTLS1_GET_RECORD),	"DTLS1_GET_RECORD"},
 {ERR_FUNC(SSL_F_DTLS1_HANDLE_TIMEOUT),	"DTLS1_HANDLE_TIMEOUT"},
+{ERR_FUNC(SSL_F_DTLS1_HEARTBEAT),	"DTLS1_HEARTBEAT"},
 {ERR_FUNC(SSL_F_DTLS1_OUTPUT_CERT_CHAIN),	"DTLS1_OUTPUT_CERT_CHAIN"},
 {ERR_FUNC(SSL_F_DTLS1_PREPROCESS_FRAGMENT),	"DTLS1_PREPROCESS_FRAGMENT"},
 {ERR_FUNC(SSL_F_DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE),	"DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE"},
@@ -182,10 +182,12 @@
 {ERR_FUNC(SSL_F_SSL3_WRITE_PENDING),	"SSL3_WRITE_PENDING"},
 {ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT),	"SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT),	"SSL_ADD_CLIENTHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT),	"SSL_ADD_CLIENTHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_ADD_DIR_CERT_SUBJECTS_TO_STACK),	"SSL_add_dir_cert_subjects_to_stack"},
 {ERR_FUNC(SSL_F_SSL_ADD_FILE_CERT_SUBJECTS_TO_STACK),	"SSL_add_file_cert_subjects_to_stack"},
 {ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT),	"SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_TLSEXT),	"SSL_ADD_SERVERHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT),	"SSL_ADD_SERVERHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_BAD_METHOD),	"SSL_BAD_METHOD"},
 {ERR_FUNC(SSL_F_SSL_BYTES_TO_CIPHER_LIST),	"SSL_BYTES_TO_CIPHER_LIST"},
 {ERR_FUNC(SSL_F_SSL_CERT_DUP),	"SSL_CERT_DUP"},
@@ -202,6 +204,7 @@
 {ERR_FUNC(SSL_F_SSL_CREATE_CIPHER_LIST),	"SSL_CREATE_CIPHER_LIST"},
 {ERR_FUNC(SSL_F_SSL_CTRL),	"SSL_ctrl"},
 {ERR_FUNC(SSL_F_SSL_CTX_CHECK_PRIVATE_KEY),	"SSL_CTX_check_private_key"},
+{ERR_FUNC(SSL_F_SSL_CTX_MAKE_PROFILES),	"SSL_CTX_MAKE_PROFILES"},
 {ERR_FUNC(SSL_F_SSL_CTX_NEW),	"SSL_CTX_new"},
 {ERR_FUNC(SSL_F_SSL_CTX_SET_CIPHER_LIST),	"SSL_CTX_set_cipher_list"},
 {ERR_FUNC(SSL_F_SSL_CTX_SET_CLIENT_CERT_ENGINE),	"SSL_CTX_set_client_cert_engine"},
@@ -230,8 +233,10 @@
 {ERR_FUNC(SSL_F_SSL_NEW),	"SSL_new"},
 {ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT),	"SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_TLSEXT),	"SSL_PARSE_CLIENTHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT),	"SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT),	"SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_TLSEXT),	"SSL_PARSE_SERVERHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT),	"SSL_PARSE_SERVERHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_PEEK),	"SSL_peek"},
 {ERR_FUNC(SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT),	"SSL_PREPARE_CLIENTHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_SSL_PREPARE_SERVERHELLO_TLSEXT),	"SSL_PREPARE_SERVERHELLO_TLSEXT"},
@@ -240,6 +245,7 @@
 {ERR_FUNC(SSL_F_SSL_RSA_PUBLIC_ENCRYPT),	"SSL_RSA_PUBLIC_ENCRYPT"},
 {ERR_FUNC(SSL_F_SSL_SESSION_NEW),	"SSL_SESSION_new"},
 {ERR_FUNC(SSL_F_SSL_SESSION_PRINT_FP),	"SSL_SESSION_print_fp"},
+{ERR_FUNC(SSL_F_SSL_SESSION_SET1_ID_CONTEXT),	"SSL_SESSION_set1_id_context"},
 {ERR_FUNC(SSL_F_SSL_SESS_CERT_NEW),	"SSL_SESS_CERT_NEW"},
 {ERR_FUNC(SSL_F_SSL_SET_CERT),	"SSL_SET_CERT"},
 {ERR_FUNC(SSL_F_SSL_SET_CIPHER_LIST),	"SSL_set_cipher_list"},
@@ -253,6 +259,7 @@
 {ERR_FUNC(SSL_F_SSL_SET_TRUST),	"SSL_set_trust"},
 {ERR_FUNC(SSL_F_SSL_SET_WFD),	"SSL_set_wfd"},
 {ERR_FUNC(SSL_F_SSL_SHUTDOWN),	"SSL_shutdown"},
+{ERR_FUNC(SSL_F_SSL_SRP_CTX_INIT),	"SSL_SRP_CTX_init"},
 {ERR_FUNC(SSL_F_SSL_UNDEFINED_CONST_FUNCTION),	"SSL_UNDEFINED_CONST_FUNCTION"},
 {ERR_FUNC(SSL_F_SSL_UNDEFINED_FUNCTION),	"SSL_UNDEFINED_FUNCTION"},
 {ERR_FUNC(SSL_F_SSL_UNDEFINED_VOID_FUNCTION),	"SSL_UNDEFINED_VOID_FUNCTION"},
@@ -272,6 +279,8 @@
 {ERR_FUNC(SSL_F_TLS1_CHANGE_CIPHER_STATE),	"TLS1_CHANGE_CIPHER_STATE"},
 {ERR_FUNC(SSL_F_TLS1_CHECK_SERVERHELLO_TLSEXT),	"TLS1_CHECK_SERVERHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_TLS1_ENC),	"TLS1_ENC"},
+{ERR_FUNC(SSL_F_TLS1_EXPORT_KEYING_MATERIAL),	"TLS1_EXPORT_KEYING_MATERIAL"},
+{ERR_FUNC(SSL_F_TLS1_HEARTBEAT),	"SSL_F_TLS1_HEARTBEAT"},
 {ERR_FUNC(SSL_F_TLS1_PREPARE_CLIENTHELLO_TLSEXT),	"TLS1_PREPARE_CLIENTHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_TLS1_PREPARE_SERVERHELLO_TLSEXT),	"TLS1_PREPARE_SERVERHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_TLS1_PRF),	"tls1_prf"},
@@ -314,6 +323,13 @@
 {ERR_REASON(SSL_R_BAD_RSA_MODULUS_LENGTH),"bad rsa modulus length"},
 {ERR_REASON(SSL_R_BAD_RSA_SIGNATURE)     ,"bad rsa signature"},
 {ERR_REASON(SSL_R_BAD_SIGNATURE)         ,"bad signature"},
+{ERR_REASON(SSL_R_BAD_SRP_A_LENGTH)      ,"bad srp a length"},
+{ERR_REASON(SSL_R_BAD_SRP_B_LENGTH)      ,"bad srp b length"},
+{ERR_REASON(SSL_R_BAD_SRP_G_LENGTH)      ,"bad srp g length"},
+{ERR_REASON(SSL_R_BAD_SRP_N_LENGTH)      ,"bad srp n length"},
+{ERR_REASON(SSL_R_BAD_SRP_S_LENGTH)      ,"bad srp s length"},
+{ERR_REASON(SSL_R_BAD_SRTP_MKI_VALUE)    ,"bad srtp mki value"},
+{ERR_REASON(SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST),"bad srtp protection profile list"},
 {ERR_REASON(SSL_R_BAD_SSL_FILETYPE)      ,"bad ssl filetype"},
 {ERR_REASON(SSL_R_BAD_SSL_SESSION_ID_LENGTH),"bad ssl session id length"},
 {ERR_REASON(SSL_R_BAD_STATE)             ,"bad state"},
@@ -352,6 +368,7 @@
 {ERR_REASON(SSL_R_ECC_CERT_SHOULD_HAVE_RSA_SIGNATURE),"ecc cert should have rsa signature"},
 {ERR_REASON(SSL_R_ECC_CERT_SHOULD_HAVE_SHA1_SIGNATURE),"ecc cert should have sha1 signature"},
 {ERR_REASON(SSL_R_ECGROUP_TOO_LARGE_FOR_CIPHER),"ecgroup too large for cipher"},
+{ERR_REASON(SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST),"empty srtp protection profile list"},
 {ERR_REASON(SSL_R_ENCRYPTED_LENGTH_TOO_LONG),"encrypted length too long"},
 {ERR_REASON(SSL_R_ERROR_GENERATING_TMP_RSA_KEY),"error generating tmp rsa key"},
 {ERR_REASON(SSL_R_ERROR_IN_RECEIVED_CIPHER_LIST),"error in received cipher list"},
@@ -368,6 +385,7 @@
 {ERR_REASON(SSL_R_INVALID_COMMAND)       ,"invalid command"},
 {ERR_REASON(SSL_R_INVALID_COMPRESSION_ALGORITHM),"invalid compression algorithm"},
 {ERR_REASON(SSL_R_INVALID_PURPOSE)       ,"invalid purpose"},
+{ERR_REASON(SSL_R_INVALID_SRP_USERNAME)  ,"invalid srp username"},
 {ERR_REASON(SSL_R_INVALID_STATUS_RESPONSE),"invalid status response"},
 {ERR_REASON(SSL_R_INVALID_TICKET_KEYS_LENGTH),"invalid ticket keys length"},
 {ERR_REASON(SSL_R_INVALID_TRUST)         ,"invalid trust"},
@@ -397,6 +415,7 @@
 {ERR_REASON(SSL_R_MISSING_RSA_CERTIFICATE),"missing rsa certificate"},
 {ERR_REASON(SSL_R_MISSING_RSA_ENCRYPTING_CERT),"missing rsa encrypting cert"},
 {ERR_REASON(SSL_R_MISSING_RSA_SIGNING_CERT),"missing rsa signing cert"},
+{ERR_REASON(SSL_R_MISSING_SRP_PARAM)     ,"can't find SRP server param"},
 {ERR_REASON(SSL_R_MISSING_TMP_DH_KEY)    ,"missing tmp dh key"},
 {ERR_REASON(SSL_R_MISSING_TMP_ECDH_KEY)  ,"missing tmp ecdh key"},
 {ERR_REASON(SSL_R_MISSING_TMP_RSA_KEY)   ,"missing tmp rsa key"},
@@ -426,6 +445,7 @@
 {ERR_REASON(SSL_R_NO_RENEGOTIATION)      ,"no renegotiation"},
 {ERR_REASON(SSL_R_NO_REQUIRED_DIGEST)    ,"digest requred for handshake isn't computed"},
 {ERR_REASON(SSL_R_NO_SHARED_CIPHER)      ,"no shared cipher"},
+{ERR_REASON(SSL_R_NO_SRTP_PROFILES)      ,"no srtp profiles"},
 {ERR_REASON(SSL_R_NO_VERIFY_CALLBACK)    ,"no verify callback"},
 {ERR_REASON(SSL_R_NULL_SSL_CTX)          ,"null ssl ctx"},
 {ERR_REASON(SSL_R_NULL_SSL_METHOD_PASSED),"null ssl method passed"},
@@ -470,7 +490,12 @@
 {ERR_REASON(SSL_R_SESSION_ID_CONTEXT_UNINITIALIZED),"session id context uninitialized"},
 {ERR_REASON(SSL_R_SESSION_MAY_NOT_BE_CREATED),"session may not be created"},
 {ERR_REASON(SSL_R_SHORT_READ)            ,"short read"},
+{ERR_REASON(SSL_R_SIGNATURE_ALGORITHMS_ERROR),"signature algorithms error"},
 {ERR_REASON(SSL_R_SIGNATURE_FOR_NON_SIGNING_CERTIFICATE),"signature for non signing certificate"},
+{ERR_REASON(SSL_R_SRP_A_CALC)            ,"error with the srp params"},
+{ERR_REASON(SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES),"srtp could not allocate profiles"},
+{ERR_REASON(SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG),"srtp protection profile list too long"},
+{ERR_REASON(SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE),"srtp unknown protection profile"},
 {ERR_REASON(SSL_R_SSL23_DOING_SESSION_ID_REUSE),"ssl23 doing session id reuse"},
 {ERR_REASON(SSL_R_SSL2_CONNECTION_ID_TOO_LONG),"ssl2 connection id too long"},
 {ERR_REASON(SSL_R_SSL3_EXT_INVALID_ECPOINTFORMAT),"ssl3 ext invalid ecpointformat"},
@@ -515,6 +540,9 @@
 {ERR_REASON(SSL_R_TLSV1_UNRECOGNIZED_NAME),"tlsv1 unrecognized name"},
 {ERR_REASON(SSL_R_TLSV1_UNSUPPORTED_EXTENSION),"tlsv1 unsupported extension"},
 {ERR_REASON(SSL_R_TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER),"tls client cert req with anon cipher"},
+{ERR_REASON(SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT),"peer does not accept heartbearts"},
+{ERR_REASON(SSL_R_TLS_HEARTBEAT_PENDING) ,"heartbeat request already pending"},
+{ERR_REASON(SSL_R_TLS_ILLEGAL_EXPORTER_LABEL),"tls illegal exporter label"},
 {ERR_REASON(SSL_R_TLS_INVALID_ECPOINTFORMAT_LIST),"tls invalid ecpointformat list"},
 {ERR_REASON(SSL_R_TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST),"tls peer did not respond with certificate list"},
 {ERR_REASON(SSL_R_TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG),"tls rsa encrypted value length is wrong"},
@@ -536,6 +564,7 @@
 {ERR_REASON(SSL_R_UNKNOWN_CERTIFICATE_TYPE),"unknown certificate type"},
 {ERR_REASON(SSL_R_UNKNOWN_CIPHER_RETURNED),"unknown cipher returned"},
 {ERR_REASON(SSL_R_UNKNOWN_CIPHER_TYPE)   ,"unknown cipher type"},
+{ERR_REASON(SSL_R_UNKNOWN_DIGEST)        ,"unknown digest"},
 {ERR_REASON(SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE),"unknown key exchange type"},
 {ERR_REASON(SSL_R_UNKNOWN_PKEY_TYPE)     ,"unknown pkey type"},
 {ERR_REASON(SSL_R_UNKNOWN_PROTOCOL)      ,"unknown protocol"},
@@ -550,12 +579,14 @@
 {ERR_REASON(SSL_R_UNSUPPORTED_PROTOCOL)  ,"unsupported protocol"},
 {ERR_REASON(SSL_R_UNSUPPORTED_SSL_VERSION),"unsupported ssl version"},
 {ERR_REASON(SSL_R_UNSUPPORTED_STATUS_TYPE),"unsupported status type"},
+{ERR_REASON(SSL_R_USE_SRTP_NOT_NEGOTIATED),"use srtp not negotiated"},
 {ERR_REASON(SSL_R_WRITE_BIO_NOT_SET)     ,"write bio not set"},
 {ERR_REASON(SSL_R_WRONG_CIPHER_RETURNED) ,"wrong cipher returned"},
 {ERR_REASON(SSL_R_WRONG_MESSAGE_TYPE)    ,"wrong message type"},
 {ERR_REASON(SSL_R_WRONG_NUMBER_OF_KEY_BITS),"wrong number of key bits"},
 {ERR_REASON(SSL_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"},
 {ERR_REASON(SSL_R_WRONG_SIGNATURE_SIZE)  ,"wrong signature size"},
+{ERR_REASON(SSL_R_WRONG_SIGNATURE_TYPE)  ,"wrong signature type"},
 {ERR_REASON(SSL_R_WRONG_SSL_VERSION)     ,"wrong ssl version"},
 {ERR_REASON(SSL_R_WRONG_VERSION_NUMBER)  ,"wrong version number"},
 {ERR_REASON(SSL_R_X509_LIB)              ,"x509 lib"},

diff --git a/ssl/ssl_lib.c b/ssl/ssl_lib.c
index be83a0b..8340854 100644
--- a/ssl/ssl_lib.c
+++ b/ssl/ssl_lib.c

@@ -176,7 +176,10 @@
 	0,	/* client_finished_label_len */
 	NULL,	/* server_finished_label */
 	0,	/* server_finished_label_len */
-	(int (*)(int))ssl_undefined_function
+	(int (*)(int))ssl_undefined_function,
+	(int (*)(SSL *, unsigned char *, size_t, const char *,
+		 size_t, const unsigned char *, size_t,
+		 int use_context)) ssl_undefined_function,
 	};
 
 int SSL_clear(SSL *s)
@@ -202,9 +205,9 @@
        * needed because SSL_clear is not called when doing renegotiation) */
 	/* This is set if we are doing dynamic renegotiation so keep
 	 * the old cipher.  It is sort of a SSL_clear_lite :-) */
-	if (s->new_session) return(1);
+	if (s->renegotiate) return(1);
 #else
-	if (s->new_session)
+	if (s->renegotiate)
 		{
 		SSLerr(SSL_F_SSL_CLEAR,ERR_R_INTERNAL_ERROR);
 		return 0;
@@ -595,6 +598,9 @@
 		OPENSSL_free(s->next_proto_negotiated);
 #endif
 
+        if (s->srtp_profiles)
+            sk_SRTP_PROTECTION_PROFILE_free(s->srtp_profiles);
+
 	OPENSSL_free(s);
 	}
 
@@ -1017,10 +1023,21 @@
 
 int SSL_renegotiate(SSL *s)
 	{
-	if (s->new_session == 0)
-		{
-		s->new_session=1;
-		}
+	if (s->renegotiate == 0)
+		s->renegotiate=1;
+
+	s->new_session=1;
+
+	return(s->method->ssl_renegotiate(s));
+	}
+
+int SSL_renegotiate_abbreviated(SSL *s)
+	{
+	if (s->renegotiate == 0)
+		s->renegotiate=1;
+
+	s->new_session=0;
+
 	return(s->method->ssl_renegotiate(s));
 	}
 
@@ -1028,7 +1045,7 @@
 	{
 	/* becomes true when negotiation is requested;
 	 * false again once a handshake has finished */
-	return (s->new_session != 0);
+	return (s->renegotiate != 0);
 	}
 
 long SSL_ctrl(SSL *s,int cmd,long larg,void *parg)
@@ -1395,6 +1412,10 @@
 	for (i=0; i<sk_SSL_CIPHER_num(sk); i++)
 		{
 		c=sk_SSL_CIPHER_value(sk,i);
+		/* Skip TLS v1.2 only ciphersuites if lower than v1.2 */
+		if ((c->algorithm_ssl & SSL_TLSV1_2) && 
+			(TLS1_get_client_version(s) < TLS1_2_VERSION))
+			continue;
 #ifndef OPENSSL_NO_KRB5
 		if (((c->algorithm_mkey & SSL_kKRB5) || (c->algorithm_auth & SSL_aKRB5)) &&
 		    nokrb5)
@@ -1412,7 +1433,7 @@
 	/* If p == q, no ciphers and caller indicates an error. Otherwise
 	 * add SCSV if not renegotiating.
 	 */
-	if (p != q && !s->new_session)
+	if (p != q && !s->renegotiate)
 		{
 		static SSL_CIPHER scsv =
 			{
@@ -1459,7 +1480,7 @@
 			(p[n-1] == (SSL3_CK_SCSV & 0xff)))
 			{
 			/* SCSV fatal if renegotiating */
-			if (s->new_session)
+			if (s->renegotiate)
 				{
 				SSLerr(SSL_F_SSL_BYTES_TO_CIPHER_LIST,SSL_R_SCSV_RECEIVED_WHEN_RENEGOTIATING);
 				ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_HANDSHAKE_FAILURE); 
@@ -1632,10 +1653,21 @@
 	ctx->next_proto_select_cb = cb;
 	ctx->next_proto_select_cb_arg = arg;
 	}
-
 # endif
 #endif
 
+int SSL_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	const char *label, size_t llen, const unsigned char *p, size_t plen,
+	int use_context)
+	{
+	if (s->version < TLS1_VERSION)
+		return -1;
+
+	return s->method->ssl3_enc->export_keying_material(s, out, olen, label,
+							   llen, p, plen,
+							   use_context);
+	}
+
 static unsigned long ssl_session_hash(const SSL_SESSION *a)
 	{
 	unsigned long l;
@@ -1679,6 +1711,14 @@
 		return(NULL);
 		}
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && (meth->version < TLS1_VERSION))	
+		{
+		SSLerr(SSL_F_SSL_CTX_NEW, SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+		return NULL;
+		}
+#endif
+
 	if (SSL_get_ex_data_X509_STORE_CTX_idx() < 0)
 		{
 		SSLerr(SSL_F_SSL_CTX_NEW,SSL_R_X509_VERIFICATION_SETUP_PROBLEMS);
@@ -1808,6 +1848,9 @@
 	ret->psk_client_callback=NULL;
 	ret->psk_server_callback=NULL;
 #endif
+#ifndef OPENSSL_NO_SRP
+	SSL_CTX_SRP_CTX_init(ret);
+#endif
 #ifndef OPENSSL_NO_BUF_FREELISTS
 	ret->freelist_max_len = SSL_MAX_BUF_FREELIST_LEN_DEFAULT;
 	ret->rbuf_freelist = OPENSSL_malloc(sizeof(SSL3_BUF_FREELIST));
@@ -1936,10 +1979,16 @@
 	a->comp_methods = NULL;
 #endif
 
+        if (a->srtp_profiles)
+                sk_SRTP_PROTECTION_PROFILE_free(a->srtp_profiles);
+
 #ifndef OPENSSL_NO_PSK
 	if (a->psk_identity_hint)
 		OPENSSL_free(a->psk_identity_hint);
 #endif
+#ifndef OPENSSL_NO_SRP
+	SSL_CTX_SRP_CTX_free(a);
+#endif
 #ifndef OPENSSL_NO_ENGINE
 	if (a->client_cert_engine)
 		ENGINE_finish(a->client_cert_engine);
@@ -2193,12 +2242,13 @@
 
 #ifndef OPENSSL_NO_EC
 
-int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs)
+int ssl_check_srvr_ecc_cert_and_alg(X509 *x, SSL *s)
 	{
 	unsigned long alg_k, alg_a;
 	EVP_PKEY *pkey = NULL;
 	int keysize = 0;
 	int signature_nid = 0, md_nid = 0, pk_nid = 0;
+	const SSL_CIPHER *cs = s->s3->tmp.new_cipher;
 
 	alg_k = cs->algorithm_mkey;
 	alg_a = cs->algorithm_auth;
@@ -2228,7 +2278,7 @@
 			SSLerr(SSL_F_SSL_CHECK_SRVR_ECC_CERT_AND_ALG, SSL_R_ECC_CERT_NOT_FOR_KEY_AGREEMENT);
 			return 0;
 			}
-		if (alg_k & SSL_kECDHe)
+		if ((alg_k & SSL_kECDHe) && TLS1_get_version(s) < TLS1_2_VERSION)
 			{
 			/* signature alg must be ECDSA */
 			if (pk_nid != NID_X9_62_id_ecPublicKey)
@@ -2237,7 +2287,7 @@
 				return 0;
 				}
 			}
-		if (alg_k & SSL_kECDHr)
+		if ((alg_k & SSL_kECDHr) && TLS1_get_version(s) < TLS1_2_VERSION)
 			{
 			/* signature alg must be RSA */
 
@@ -2327,34 +2377,36 @@
 	return(c->pkeys[i].x509);
 	}
 
-EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *cipher)
+EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *cipher, const EVP_MD **pmd)
 	{
 	unsigned long alg_a;
 	CERT *c;
+	int idx = -1;
 
 	alg_a = cipher->algorithm_auth;
 	c=s->cert;
 
 	if ((alg_a & SSL_aDSS) &&
 		(c->pkeys[SSL_PKEY_DSA_SIGN].privatekey != NULL))
-		return(c->pkeys[SSL_PKEY_DSA_SIGN].privatekey);
+		idx = SSL_PKEY_DSA_SIGN;
 	else if (alg_a & SSL_aRSA)
 		{
 		if (c->pkeys[SSL_PKEY_RSA_SIGN].privatekey != NULL)
-			return(c->pkeys[SSL_PKEY_RSA_SIGN].privatekey);
+			idx = SSL_PKEY_RSA_SIGN;
 		else if (c->pkeys[SSL_PKEY_RSA_ENC].privatekey != NULL)
-			return(c->pkeys[SSL_PKEY_RSA_ENC].privatekey);
-		else
-			return(NULL);
+			idx = SSL_PKEY_RSA_ENC;
 		}
 	else if ((alg_a & SSL_aECDSA) &&
 	         (c->pkeys[SSL_PKEY_ECC].privatekey != NULL))
-		return(c->pkeys[SSL_PKEY_ECC].privatekey);
-	else /* if (alg_a & SSL_aNULL) */
+		idx = SSL_PKEY_ECC;
+	if (idx == -1)
 		{
 		SSLerr(SSL_F_SSL_GET_SIGN_PKEY,ERR_R_INTERNAL_ERROR);
 		return(NULL);
 		}
+	if (pmd)
+		*pmd = c->pkeys[idx].digest;
+	return c->pkeys[idx].privatekey;
 	}
 
 void ssl_update_cache(SSL *s,int mode)
@@ -2579,6 +2631,10 @@
 
 static const char *ssl_get_version(int version)
 	{
+	if (version == TLS1_2_VERSION)
+		return("TLSv1.2");
+	else if (version == TLS1_1_VERSION)
+		return("TLSv1.1");
 	if (version == TLS1_VERSION)
 		return("TLSv1");
 	else if (version == SSL3_VERSION)
@@ -2607,12 +2663,8 @@
 		{
 	case SSL2_VERSION:
 		return SSL_TXT_RSA;
-	case SSL3_VERSION:
-	case TLS1_VERSION:
-	case DTLS1_VERSION:
-		return SSL_CIPHER_authentication_method(ssl->s3->tmp.new_cipher);
 	default:
-		return "UNKNOWN";
+		return SSL_CIPHER_authentication_method(ssl->s3->tmp.new_cipher);
 		}
 	}
 
@@ -2700,6 +2752,7 @@
 	ret->in_handshake = s->in_handshake;
 	ret->handshake_func = s->handshake_func;
 	ret->server = s->server;
+	ret->renegotiate = s->renegotiate;
 	ret->new_session = s->new_session;
 	ret->quiet_shutdown = s->quiet_shutdown;
 	ret->shutdown=s->shutdown;
@@ -2965,6 +3018,11 @@
 	return(ssl->state);
 	}
 
+void SSL_set_state(SSL *ssl, int state)
+	{
+	ssl->state = state;
+	}
+
 void SSL_set_verify_result(SSL *ssl,long arg)
 	{
 	ssl->verify_result=arg;
@@ -3236,6 +3294,16 @@
 	*hash=NULL;
 }
 
+void SSL_set_debug(SSL *s, int debug)
+	{
+	s->debug = debug;
+	}
+
+int SSL_cache_hit(SSL *s)
+	{
+	return s->hit;
+	}
+
 #if defined(_WINDLL) && defined(OPENSSL_SYS_WIN16)
 #include "../crypto/bio/bss_file.c"
 #endif

diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h
index c78ce64..9e8172a 100644
--- a/ssl/ssl_locl.h
+++ b/ssl/ssl_locl.h

@@ -170,7 +170,7 @@
 # define OPENSSL_EXTERN OPENSSL_EXPORT
 #endif
 
-#define PKCS1_CHECK
+#undef PKCS1_CHECK
 
 #define c2l(c,l)	(l = ((unsigned long)(*((c)++)))     , \
 			 l|=(((unsigned long)(*((c)++)))<< 8), \
@@ -289,6 +289,7 @@
 #define SSL_kEECDH		0x00000080L /* ephemeral ECDH */
 #define SSL_kPSK		0x00000100L /* PSK */
 #define SSL_kGOST       0x00000200L /* GOST key exchange */
+#define SSL_kSRP        0x00000400L /* SRP */
 
 /* Bits for algorithm_auth (server authentication) */
 #define SSL_aRSA		0x00000001L /* RSA auth */
@@ -316,21 +317,29 @@
 #define SSL_CAMELLIA256		0x00000200L
 #define SSL_eGOST2814789CNT	0x00000400L
 #define SSL_SEED		0x00000800L
+#define SSL_AES128GCM		0x00001000L
+#define SSL_AES256GCM		0x00002000L
 
-#define SSL_AES        		(SSL_AES128|SSL_AES256)
+#define SSL_AES        		(SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM)
 #define SSL_CAMELLIA		(SSL_CAMELLIA128|SSL_CAMELLIA256)
 
 
 /* Bits for algorithm_mac (symmetric authentication) */
+
 #define SSL_MD5			0x00000001L
 #define SSL_SHA1		0x00000002L
 #define SSL_GOST94      0x00000004L
 #define SSL_GOST89MAC   0x00000008L
+#define SSL_SHA256		0x00000010L
+#define SSL_SHA384		0x00000020L
+/* Not a real MAC, just an indication it is part of cipher */
+#define SSL_AEAD		0x00000040L
 
 /* Bits for algorithm_ssl (protocol version) */
 #define SSL_SSLV2		0x00000001L
 #define SSL_SSLV3		0x00000002L
 #define SSL_TLSV1		SSL_SSLV3	/* for now */
+#define SSL_TLSV1_2		0x00000004L
 
 
 /* Bits for algorithm2 (handshake digests and other extra flags) */
@@ -338,15 +347,21 @@
 #define SSL_HANDSHAKE_MAC_MD5 0x10
 #define SSL_HANDSHAKE_MAC_SHA 0x20
 #define SSL_HANDSHAKE_MAC_GOST94 0x40
+#define SSL_HANDSHAKE_MAC_SHA256 0x80
+#define SSL_HANDSHAKE_MAC_SHA384 0x100
 #define SSL_HANDSHAKE_MAC_DEFAULT (SSL_HANDSHAKE_MAC_MD5 | SSL_HANDSHAKE_MAC_SHA)
 
 /* When adding new digest in the ssl_ciph.c and increment SSM_MD_NUM_IDX
  * make sure to update this constant too */
-#define SSL_MAX_DIGEST 4
+#define SSL_MAX_DIGEST 6
 
-#define TLS1_PRF_DGST_SHIFT 8
+#define TLS1_PRF_DGST_MASK	(0xff << TLS1_PRF_DGST_SHIFT)
+
+#define TLS1_PRF_DGST_SHIFT 10
 #define TLS1_PRF_MD5 (SSL_HANDSHAKE_MAC_MD5 << TLS1_PRF_DGST_SHIFT)
 #define TLS1_PRF_SHA1 (SSL_HANDSHAKE_MAC_SHA << TLS1_PRF_DGST_SHIFT)
+#define TLS1_PRF_SHA256 (SSL_HANDSHAKE_MAC_SHA256 << TLS1_PRF_DGST_SHIFT)
+#define TLS1_PRF_SHA384 (SSL_HANDSHAKE_MAC_SHA384 << TLS1_PRF_DGST_SHIFT)
 #define TLS1_PRF_GOST94 (SSL_HANDSHAKE_MAC_GOST94 << TLS1_PRF_DGST_SHIFT)
 #define TLS1_PRF (TLS1_PRF_MD5 | TLS1_PRF_SHA1)
 
@@ -458,6 +473,8 @@
 	X509 *x509;
 	STACK_OF(X509) *cert_chain;
 	EVP_PKEY *privatekey;
+	/* Digest to use when signing */
+	const EVP_MD *digest;
 	} CERT_PKEY;
 
 typedef struct cert_st
@@ -555,6 +572,10 @@
 	const char *server_finished_label;
 	int server_finished_label_len;
 	int (*alert_value)(int);
+	int (*export_keying_material)(SSL *, unsigned char *, size_t,
+				      const char *, size_t,
+				      const unsigned char *, size_t,
+				      int use_context);
 	} SSL3_ENC_METHOD;
 
 #ifndef OPENSSL_NO_COMP
@@ -592,11 +613,12 @@
 extern SSL3_ENC_METHOD SSLv3_enc_data;
 extern SSL3_ENC_METHOD DTLSv1_enc_data;
 
-#define IMPLEMENT_tls1_meth_func(func_name, s_accept, s_connect, s_get_meth) \
+#define IMPLEMENT_tls_meth_func(version, func_name, s_accept, s_connect, \
+				s_get_meth) \
 const SSL_METHOD *func_name(void)  \
 	{ \
 	static const SSL_METHOD func_name##_data= { \
-		TLS1_VERSION, \
+		version, \
 		tls1_new, \
 		tls1_clear, \
 		tls1_free, \
@@ -670,7 +692,7 @@
 const SSL_METHOD *func_name(void)  \
 	{ \
 	static const SSL_METHOD func_name##_data= { \
-	TLS1_VERSION, \
+	TLS1_2_VERSION, \
 	tls1_new, \
 	tls1_clear, \
 	tls1_free, \
@@ -753,7 +775,7 @@
 		ssl3_read, \
 		ssl3_peek, \
 		ssl3_write, \
-		ssl3_shutdown, \
+		dtls1_shutdown, \
 		ssl3_renegotiate, \
 		ssl3_renegotiate_check, \
 		dtls1_get_message, \
@@ -810,7 +832,7 @@
 int ssl_undefined_void_function(void);
 int ssl_undefined_const_function(const SSL *s);
 X509 *ssl_get_server_send_cert(SSL *);
-EVP_PKEY *ssl_get_sign_pkey(SSL *,const SSL_CIPHER *);
+EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *c, const EVP_MD **pmd);
 int ssl_cert_type(X509 *x,EVP_PKEY *pkey);
 void ssl_set_cert_masks(CERT *c, const SSL_CIPHER *cipher);
 STACK_OF(SSL_CIPHER) *ssl_get_ciphers_by_id(SSL *s);
@@ -1021,6 +1043,7 @@
 void dtls1_free(SSL *s);
 void dtls1_clear(SSL *s);
 long dtls1_ctrl(SSL *s,int cmd, long larg, void *parg);
+int dtls1_shutdown(SSL *s);
 
 long dtls1_get_message(SSL *s, int st1, int stn, int mt, long max, int *ok);
 int dtls1_get_record(SSL *s);
@@ -1041,12 +1064,15 @@
 int tls1_mac(SSL *ssl, unsigned char *md, int snd);
 int tls1_generate_master_secret(SSL *s, unsigned char *out,
 	unsigned char *p, int len);
+int tls1_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	const char *label, size_t llen,
+	const unsigned char *p, size_t plen, int use_context);
 int tls1_alert_code(int code);
 int ssl3_alert_code(int code);
 int ssl_ok(SSL *s);
 
 #ifndef OPENSSL_NO_ECDH
-int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs);
+int ssl_check_srvr_ecc_cert_and_alg(X509 *x, SSL *s);
 #endif
 
 SSL_COMP *ssl3_comp_find(STACK_OF(SSL_COMP) *sk, int n);
@@ -1066,6 +1092,13 @@
 int ssl_check_clienthello_tlsext(SSL *s);
 int ssl_check_serverhello_tlsext(SSL *s);
 
+#ifndef OPENSSL_NO_HEARTBEATS
+int tls1_heartbeat(SSL *s);
+int dtls1_heartbeat(SSL *s);
+int tls1_process_heartbeat(SSL *s);
+int dtls1_process_heartbeat(SSL *s);
+#endif
+
 #ifdef OPENSSL_NO_SHA256
 #define tlsext_tick_md	EVP_sha1
 #else
@@ -1073,6 +1106,12 @@
 #endif
 int tls1_process_ticket(SSL *s, unsigned char *session_id, int len,
 				const unsigned char *limit, SSL_SESSION **ret);
+
+int tls12_get_sigandhash(unsigned char *p, const EVP_PKEY *pk,
+				const EVP_MD *md);
+int tls12_get_sigid(const EVP_PKEY *pk);
+const EVP_MD *tls12_get_hash(unsigned char hash_alg);
+
 #endif
 EVP_MD_CTX* ssl_replace_hash(EVP_MD_CTX **hash,const EVP_MD *md) ;
 void ssl_clear_hash_ctx(EVP_MD_CTX **hash);
@@ -1084,4 +1123,13 @@
 					int maxlen);
 int ssl_parse_clienthello_renegotiate_ext(SSL *s, unsigned char *d, int len,
 					  int *al);
+long ssl_get_algorithm2(SSL *s);
+int tls1_process_sigalgs(SSL *s, const unsigned char *data, int dsize);
+int tls12_get_req_sig_algs(SSL *s, unsigned char *p);
+
+int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen);
+int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al);
+int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen);
+int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al);
+
 #endif

diff --git a/ssl/ssl_sess.c b/ssl/ssl_sess.c
index 93954e4..920b763 100644
--- a/ssl/ssl_sess.c
+++ b/ssl/ssl_sess.c

@@ -218,6 +218,9 @@
 	ss->psk_identity_hint=NULL;
 	ss->psk_identity=NULL;
 #endif
+#ifndef OPENSSL_NO_SRP
+	ss->srp_username=NULL;
+#endif
 	return(ss);
 	}
 
@@ -228,6 +231,11 @@
 	return s->session_id;
 	}
 
+unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s)
+	{
+	return s->compress_meth;
+	}
+
 /* Even with SSLv2, we have 16 bytes (128 bits) of session ID space. SSLv3/TLSv1
  * has 32 bytes (256 bits). As such, filling the ID with random gunk repeatedly
  * until we have no conflict is going to complete in one iteration pretty much
@@ -307,6 +315,16 @@
 			ss->ssl_version=TLS1_VERSION;
 			ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH;
 			}
+		else if (s->version == TLS1_1_VERSION)
+			{
+			ss->ssl_version=TLS1_1_VERSION;
+			ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH;
+			}
+		else if (s->version == TLS1_2_VERSION)
+			{
+			ss->ssl_version=TLS1_2_VERSION;
+			ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH;
+			}
 		else if (s->version == DTLS1_BAD_VER)
 			{
 			ss->ssl_version=DTLS1_BAD_VER;
@@ -430,6 +448,25 @@
 	return(1);
 	}
 
+/* ssl_get_prev attempts to find an SSL_SESSION to be used to resume this
+ * connection. It is only called by servers.
+ *
+ *   session_id: points at the session ID in the ClientHello. This code will
+ *       read past the end of this in order to parse out the session ticket
+ *       extension, if any.
+ *   len: the length of the session ID.
+ *   limit: a pointer to the first byte after the ClientHello.
+ *
+ * Returns:
+ *   -1: error
+ *    0: a session may have been found.
+ *
+ * Side effects:
+ *   - If a session is found then s->session is pointed at it (after freeing an
+ *     existing session if need be) and s->verify_result is set from the session.
+ *   - Both for new and resumed sessions, s->tlsext_ticket_expected is set to 1
+ *     if the server should issue a new session ticket (to 0 otherwise).
+ */
 int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len,
 			const unsigned char *limit)
 	{
@@ -437,27 +474,39 @@
 
 	SSL_SESSION *ret=NULL;
 	int fatal = 0;
+	int try_session_cache = 1;
 #ifndef OPENSSL_NO_TLSEXT
 	int r;
 #endif
 
 	if (len > SSL_MAX_SSL_SESSION_ID_LENGTH)
 		goto err;
+
+	if (len == 0)
+		try_session_cache = 0;
+
 #ifndef OPENSSL_NO_TLSEXT
-	r = tls1_process_ticket(s, session_id, len, limit, &ret);
-	if (r == -1)
+	r = tls1_process_ticket(s, session_id, len, limit, &ret); /* sets s->tlsext_ticket_expected */
+	switch (r)
 		{
+	case -1: /* Error during processing */
 		fatal = 1;
 		goto err;
+	case 0: /* No ticket found */
+	case 1: /* Zero length ticket found */
+		break; /* Ok to carry on processing session id. */
+	case 2: /* Ticket found but not decrypted. */
+	case 3: /* Ticket decrypted, *ret has been set. */
+		try_session_cache = 0;
+		break;
+	default:
+		abort();
 		}
-	else if (r == 0 || (!ret && !len))
-		goto err;
-	else if (!ret && !(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP))
-#else
-	if (len == 0)
-		goto err;
-	if (!(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP))
 #endif
+
+	if (try_session_cache &&
+	    ret == NULL &&
+	    !(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP))
 		{
 		SSL_SESSION data;
 		data.ssl_version=s->version;
@@ -468,20 +517,22 @@
 		CRYPTO_r_lock(CRYPTO_LOCK_SSL_CTX);
 		ret=lh_SSL_SESSION_retrieve(s->session_ctx->sessions,&data);
 		if (ret != NULL)
-		    /* don't allow other threads to steal it: */
-		    CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION);
+			{
+			/* don't allow other threads to steal it: */
+			CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION);
+			}
 		CRYPTO_r_unlock(CRYPTO_LOCK_SSL_CTX);
+		if (ret == NULL)
+			s->session_ctx->stats.sess_miss++;
 		}
 
-	if (ret == NULL)
+	if (try_session_cache &&
+	    ret == NULL &&
+	    s->session_ctx->get_session_cb != NULL)
 		{
 		int copy=1;
 	
-		s->session_ctx->stats.sess_miss++;
-		ret=NULL;
-		if (s->session_ctx->get_session_cb != NULL
-		    && (ret=s->session_ctx->get_session_cb(s,session_id,len,&copy))
-		       != NULL)
+		if ((ret=s->session_ctx->get_session_cb(s,session_id,len,&copy)))
 			{
 			s->session_ctx->stats.sess_cb_hit++;
 
@@ -500,23 +551,18 @@
 				 * things are very strange */
 				SSL_CTX_add_session(s->session_ctx,ret);
 			}
-		if (ret == NULL)
-			goto err;
 		}
 
-	/* Now ret is non-NULL, and we own one of its reference counts. */
+	if (ret == NULL)
+		goto err;
+
+	/* Now ret is non-NULL and we own one of its reference counts. */
 
 	if (ret->sid_ctx_length != s->sid_ctx_length
 	    || memcmp(ret->sid_ctx,s->sid_ctx,ret->sid_ctx_length))
 		{
-		/* We've found the session named by the client, but we don't
+		/* We have the session requested by the client, but we don't
 		 * want to use it in this context. */
-
-#if 0 /* The client cannot always know when a session is not appropriate,
-       * so we shouldn't generate an error message. */
-
-		SSLerr(SSL_F_SSL_GET_PREV_SESSION,SSL_R_ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT);
-#endif
 		goto err; /* treat like cache miss */
 		}
 	
@@ -553,39 +599,38 @@
 			goto err;
 		}
 
-
-#if 0 /* This is way too late. */
-
-	/* If a thread got the session, then 'swaped', and another got
-	 * it and then due to a time-out decided to 'OPENSSL_free' it we could
-	 * be in trouble.  So I'll increment it now, then double decrement
-	 * later - am I speaking rubbish?. */
-	CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION);
-#endif
-
 	if (ret->timeout < (long)(time(NULL) - ret->time)) /* timeout */
 		{
 		s->session_ctx->stats.sess_timeout++;
-		/* remove it from the cache */
-		SSL_CTX_remove_session(s->session_ctx,ret);
+		if (try_session_cache)
+			{
+			/* session was from the cache, so remove it */
+			SSL_CTX_remove_session(s->session_ctx,ret);
+			}
 		goto err;
 		}
 
 	s->session_ctx->stats.sess_hit++;
 
-	/* ret->time=time(NULL); */ /* rezero timeout? */
-	/* again, just leave the session 
-	 * if it is the same session, we have just incremented and
-	 * then decremented the reference count :-) */
 	if (s->session != NULL)
 		SSL_SESSION_free(s->session);
 	s->session=ret;
 	s->verify_result = s->session->verify_result;
-	return(1);
+	return 1;
 
  err:
 	if (ret != NULL)
+		{
 		SSL_SESSION_free(ret);
+#ifndef OPENSSL_NO_TLSEXT
+		if (!try_session_cache)
+			{
+			/* The session was from a ticket, so we should
+			 * issue a ticket for the new session */
+			s->tlsext_ticket_expected = 1;
+			}
+#endif
+		}
 	if (fatal)
 		return -1;
 	else
@@ -736,6 +781,10 @@
 	if (ss->psk_identity != NULL)
 		OPENSSL_free(ss->psk_identity);
 #endif
+#ifndef OPENSSL_NO_SRP
+	if (ss->srp_username != NULL)
+		OPENSSL_free(ss->srp_username);
+#endif
 	OPENSSL_cleanse(ss,sizeof(*ss));
 	OPENSSL_free(ss);
 	}
@@ -760,10 +809,6 @@
 			{
 			if (!SSL_set_ssl_method(s,meth))
 				return(0);
-			if (s->ctx->session_timeout == 0)
-				session->timeout=SSL_get_default_timeout(s);
-			else
-				session->timeout=s->ctx->session_timeout;
 			}
 
 #ifndef OPENSSL_NO_KRB5
@@ -831,6 +876,25 @@
 	return(t);
 	}
 
+X509 *SSL_SESSION_get0_peer(SSL_SESSION *s)
+	{
+	return s->peer;
+	}
+
+int SSL_SESSION_set1_id_context(SSL_SESSION *s,const unsigned char *sid_ctx,
+			       unsigned int sid_ctx_len)
+	{
+	if(sid_ctx_len > SSL_MAX_SID_CTX_LENGTH)
+		{
+		SSLerr(SSL_F_SSL_SESSION_SET1_ID_CONTEXT,SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG);
+		return 0;
+		}
+	s->sid_ctx_length=sid_ctx_len;
+	memcpy(s->sid_ctx,sid_ctx,sid_ctx_len);
+
+	return 1;
+	}
+
 long SSL_CTX_set_timeout(SSL_CTX *s, long t)
 	{
 	long l;

diff --git a/ssl/ssl_txt.c b/ssl/ssl_txt.c
index 3122440..6479d52 100644
--- a/ssl/ssl_txt.c
+++ b/ssl/ssl_txt.c

@@ -115,6 +115,10 @@
 		s="SSLv2";
 	else if (x->ssl_version == SSL3_VERSION)
 		s="SSLv3";
+	else if (x->ssl_version == TLS1_2_VERSION)
+		s="TLSv1.2";
+	else if (x->ssl_version == TLS1_1_VERSION)
+		s="TLSv1.1";
 	else if (x->ssl_version == TLS1_VERSION)
 		s="TLSv1";
 	else if (x->ssl_version == DTLS1_VERSION)
@@ -187,6 +191,10 @@
 	if (BIO_puts(bp,"\n    PSK identity hint: ") <= 0) goto err;
 	if (BIO_printf(bp, "%s", x->psk_identity_hint ? x->psk_identity_hint : "None") <= 0) goto err;
 #endif
+#ifndef OPENSSL_NO_SRP
+	if (BIO_puts(bp,"\n    SRP username: ") <= 0) goto err;
+	if (BIO_printf(bp, "%s", x->srp_username ? x->srp_username : "None") <= 0) goto err;
+#endif
 #ifndef OPENSSL_NO_TLSEXT
 	if (x->tlsext_tick_lifetime_hint)
 		{

diff --git a/ssl/ssltest.c b/ssl/ssltest.c
index f6a2c79..a950b6e 100644
--- a/ssl/ssltest.c
+++ b/ssl/ssltest.c

@@ -181,6 +181,9 @@
 #ifndef OPENSSL_NO_DH
 #include <openssl/dh.h>
 #endif
+#ifndef OPENSSL_NO_SRP
+#include <openssl/srp.h>
+#endif
 #include <openssl/bn.h>
 
 #define _XOPEN_SOURCE_EXTENDED	1 /* Or gethostname won't be declared properly
@@ -246,6 +249,49 @@
 	unsigned int max_psk_len);
 #endif
 
+#ifndef OPENSSL_NO_SRP
+/* SRP client */
+/* This is a context that we pass to all callbacks */
+typedef struct srp_client_arg_st
+	{
+	char *srppassin;
+	char *srplogin;
+	} SRP_CLIENT_ARG;
+
+#define PWD_STRLEN 1024
+
+static char * MS_CALLBACK ssl_give_srp_client_pwd_cb(SSL *s, void *arg)
+	{
+	SRP_CLIENT_ARG *srp_client_arg = (SRP_CLIENT_ARG *)arg;
+	return BUF_strdup((char *)srp_client_arg->srppassin);
+	}
+
+/* SRP server */
+/* This is a context that we pass to SRP server callbacks */
+typedef struct srp_server_arg_st
+	{
+	char *expected_user;
+	char *pass;
+	} SRP_SERVER_ARG;
+
+static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
+	{
+	SRP_SERVER_ARG * p = (SRP_SERVER_ARG *) arg;
+
+	if (strcmp(p->expected_user, SSL_get_srp_username(s)) != 0)
+		{
+		fprintf(stderr, "User %s doesn't exist\n", SSL_get_srp_username(s));
+		return SSL3_AL_FATAL;
+		}
+	if (SSL_set_srp_server_param_pw(s,p->expected_user,p->pass,"1024")<0)
+		{
+		*ad = SSL_AD_INTERNAL_ERROR;
+		return SSL3_AL_FATAL;
+		}
+	return SSL_ERROR_NONE;
+	}
+#endif
+
 static BIO *bio_err=NULL;
 static BIO *bio_stdout=NULL;
 
@@ -268,6 +314,9 @@
 	{
 	fprintf(stderr,"usage: ssltest [args ...]\n");
 	fprintf(stderr,"\n");
+#ifdef OPENSSL_FIPS
+	fprintf(stderr,"-F             - run test in FIPS mode\n");
+#endif
 	fprintf(stderr," -server_auth  - check server certificate\n");
 	fprintf(stderr," -client_auth  - do client authentication\n");
 	fprintf(stderr," -proxy        - allow proxy certificates\n");
@@ -289,6 +338,10 @@
 #ifndef OPENSSL_NO_PSK
 	fprintf(stderr," -psk arg      - PSK in hex (without 0x)\n");
 #endif
+#ifndef OPENSSL_NO_SRP
+	fprintf(stderr," -srpuser user  - SRP username to use\n");
+	fprintf(stderr," -srppass arg   - password for 'user'\n");
+#endif
 #ifndef OPENSSL_NO_SSL2
 	fprintf(stderr," -ssl2         - use SSLv2\n");
 #endif
@@ -483,6 +536,12 @@
 #ifndef OPENSSL_NO_ECDH
 	EC_KEY *ecdh = NULL;
 #endif
+#ifndef OPENSSL_NO_SRP
+	/* client */
+	SRP_CLIENT_ARG srp_client_arg = {NULL,NULL};
+	/* server */
+	SRP_SERVER_ARG srp_server_arg = {NULL,NULL};
+#endif
 	int no_dhe = 0;
 	int no_ecdhe = 0;
 	int no_psk = 0;
@@ -494,6 +553,9 @@
 #endif
 	STACK_OF(SSL_COMP) *ssl_comp_methods = NULL;
 	int test_cipherlist = 0;
+#ifdef OPENSSL_FIPS
+	int fips_mode=0;
+#endif
 
 	verbose = 0;
 	debug = 0;
@@ -525,7 +587,16 @@
 
 	while (argc >= 1)
 		{
-		if	(strcmp(*argv,"-server_auth") == 0)
+		if(!strcmp(*argv,"-F"))
+			{
+#ifdef OPENSSL_FIPS
+			fips_mode=1;
+#else
+			fprintf(stderr,"not compiled with FIPS support, so exitting without running.\n");
+			EXIT(0);
+#endif
+			}
+		else if (strcmp(*argv,"-server_auth") == 0)
 			server_auth=1;
 		else if	(strcmp(*argv,"-client_auth") == 0)
 			client_auth=1;
@@ -579,6 +650,20 @@
 			no_psk=1;
 #endif
 			}
+#ifndef OPENSSL_NO_SRP
+		else if (strcmp(*argv,"-srpuser") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srp_server_arg.expected_user = srp_client_arg.srplogin= *(++argv);
+			tls1=1;
+			}
+		else if (strcmp(*argv,"-srppass") == 0)
+			{
+			if (--argc < 1) goto bad;
+			srp_server_arg.pass = srp_client_arg.srppassin= *(++argv);
+			tls1=1;
+			}
+#endif
 		else if	(strcmp(*argv,"-ssl2") == 0)
 			ssl2=1;
 		else if	(strcmp(*argv,"-tls1") == 0)
@@ -733,6 +818,20 @@
 		EXIT(1);
 		}
 
+#ifdef OPENSSL_FIPS
+	if(fips_mode)
+		{
+		if(!FIPS_mode_set(1))
+			{
+			ERR_load_crypto_strings();
+			ERR_print_errors(BIO_new_fp(stderr,BIO_NOCLOSE));
+			EXIT(1);
+			}
+		else
+			fprintf(stderr,"*** IN FIPS MODE ***\n");
+		}
+#endif
+
 	if (print_time)
 		{
 		if (!bio_pair)
@@ -878,7 +977,11 @@
 				}
 			}
 		else
+#ifdef OPENSSL_NO_EC2M
+			nid = NID_X9_62_prime256v1;
+#else
 			nid = NID_sect163r2;
+#endif
 
 		ecdh = EC_KEY_new_by_curve_name(nid);
 		if (ecdh == NULL)
@@ -981,6 +1084,26 @@
 			}
 #endif
 		}
+#ifndef OPENSSL_NO_SRP
+        if (srp_client_arg.srplogin)
+		{
+		if (!SSL_CTX_set_srp_username(c_ctx, srp_client_arg.srplogin))
+			{
+			BIO_printf(bio_err,"Unable to set SRP username\n");
+			goto end;
+			}
+		SSL_CTX_set_srp_cb_arg(c_ctx,&srp_client_arg);
+		SSL_CTX_set_srp_client_pwd_callback(c_ctx, ssl_give_srp_client_pwd_cb);
+		/*SSL_CTX_set_srp_strength(c_ctx, srp_client_arg.strength);*/
+		}
+
+	if (srp_server_arg.expected_user != NULL)
+		{
+		SSL_CTX_set_verify(s_ctx,SSL_VERIFY_NONE,verify_callback);
+		SSL_CTX_set_srp_cb_arg(s_ctx, &srp_server_arg);
+		SSL_CTX_set_srp_username_callback(s_ctx, ssl_srp_server_param_cb);
+		}
+#endif
 
 	c_ssl=SSL_new(c_ctx);
 	s_ssl=SSL_new(s_ctx);
@@ -2205,15 +2328,7 @@
 		}
 
 #ifndef OPENSSL_NO_X509_VERIFY
-# ifdef OPENSSL_FIPS
-	if(s->version == TLS1_VERSION)
-		FIPS_allow_md5(1);
-# endif
 	ok = X509_verify_cert(ctx);
-# ifdef OPENSSL_FIPS
-	if(s->version == TLS1_VERSION)
-		FIPS_allow_md5(0);
-# endif
 #endif
 
 	if (cb_arg->proxy_auth)

diff --git a/ssl/t1_clnt.c b/ssl/t1_clnt.c
index c87af17..578617e 100644
--- a/ssl/t1_clnt.c
+++ b/ssl/t1_clnt.c

@@ -66,13 +66,26 @@
 static const SSL_METHOD *tls1_get_client_method(int ver);
 static const SSL_METHOD *tls1_get_client_method(int ver)
 	{
+	if (ver == TLS1_2_VERSION)
+		return TLSv1_2_client_method();
+	if (ver == TLS1_1_VERSION)
+		return TLSv1_1_client_method();
 	if (ver == TLS1_VERSION)
-		return(TLSv1_client_method());
-	else
-		return(NULL);
+		return TLSv1_client_method();
+	return NULL;
 	}
 
-IMPLEMENT_tls1_meth_func(TLSv1_client_method,
+IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_client_method,
+			ssl_undefined_function,
+			ssl3_connect,
+			tls1_get_client_method)
+
+IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_client_method,
+			ssl_undefined_function,
+			ssl3_connect,
+			tls1_get_client_method)
+
+IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_client_method,
 			ssl_undefined_function,
 			ssl3_connect,
 			tls1_get_client_method)

diff --git a/ssl/t1_enc.c b/ssl/t1_enc.c
index 793ea43..201ca9a 100644
--- a/ssl/t1_enc.c
+++ b/ssl/t1_enc.c

@@ -143,6 +143,7 @@
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
 #include <openssl/md5.h>
+#include <openssl/rand.h>
 #ifdef KSSL_DEBUG
 #include <openssl/des.h>
 #endif
@@ -158,68 +159,75 @@
 			unsigned char *out, int olen)
 	{
 	int chunk;
-	unsigned int j;
-	HMAC_CTX ctx;
-	HMAC_CTX ctx_tmp;
+	size_t j;
+	EVP_MD_CTX ctx, ctx_tmp;
+	EVP_PKEY *mac_key;
 	unsigned char A1[EVP_MAX_MD_SIZE];
-	unsigned int A1_len;
+	size_t A1_len;
 	int ret = 0;
 	
 	chunk=EVP_MD_size(md);
 	OPENSSL_assert(chunk >= 0);
 
-	HMAC_CTX_init(&ctx);
-	HMAC_CTX_init(&ctx_tmp);
-	if (!HMAC_Init_ex(&ctx,sec,sec_len,md, NULL))
+	EVP_MD_CTX_init(&ctx);
+	EVP_MD_CTX_init(&ctx_tmp);
+	EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+	EVP_MD_CTX_set_flags(&ctx_tmp, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+	mac_key = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, NULL, sec, sec_len);
+	if (!mac_key)
 		goto err;
-	if (!HMAC_Init_ex(&ctx_tmp,sec,sec_len,md, NULL))
+	if (!EVP_DigestSignInit(&ctx,NULL,md, NULL, mac_key))
 		goto err;
-	if (seed1 != NULL && !HMAC_Update(&ctx,seed1,seed1_len))
+	if (!EVP_DigestSignInit(&ctx_tmp,NULL,md, NULL, mac_key))
 		goto err;
-	if (seed2 != NULL && !HMAC_Update(&ctx,seed2,seed2_len))
+	if (seed1 && !EVP_DigestSignUpdate(&ctx,seed1,seed1_len))
 		goto err;
-	if (seed3 != NULL && !HMAC_Update(&ctx,seed3,seed3_len))
+	if (seed2 && !EVP_DigestSignUpdate(&ctx,seed2,seed2_len))
 		goto err;
-	if (seed4 != NULL && !HMAC_Update(&ctx,seed4,seed4_len))
+	if (seed3 && !EVP_DigestSignUpdate(&ctx,seed3,seed3_len))
 		goto err;
-	if (seed5 != NULL && !HMAC_Update(&ctx,seed5,seed5_len))
+	if (seed4 && !EVP_DigestSignUpdate(&ctx,seed4,seed4_len))
 		goto err;
-	if (!HMAC_Final(&ctx,A1,&A1_len))
+	if (seed5 && !EVP_DigestSignUpdate(&ctx,seed5,seed5_len))
+		goto err;
+	if (!EVP_DigestSignFinal(&ctx,A1,&A1_len))
 		goto err;
 
 	for (;;)
 		{
-		if (!HMAC_Init_ex(&ctx,NULL,0,NULL,NULL)) /* re-init */
+		/* Reinit mac contexts */
+		if (!EVP_DigestSignInit(&ctx,NULL,md, NULL, mac_key))
 			goto err;
-		if (!HMAC_Init_ex(&ctx_tmp,NULL,0,NULL,NULL)) /* re-init */
+		if (!EVP_DigestSignInit(&ctx_tmp,NULL,md, NULL, mac_key))
 			goto err;
-		if (!HMAC_Update(&ctx,A1,A1_len))
+		if (!EVP_DigestSignUpdate(&ctx,A1,A1_len))
 			goto err;
-		if (!HMAC_Update(&ctx_tmp,A1,A1_len))
+		if (!EVP_DigestSignUpdate(&ctx_tmp,A1,A1_len))
 			goto err;
-		if (seed1 != NULL && !HMAC_Update(&ctx,seed1,seed1_len))
+		if (seed1 && !EVP_DigestSignUpdate(&ctx,seed1,seed1_len))
 			goto err;
-		if (seed2 != NULL && !HMAC_Update(&ctx,seed2,seed2_len))
+		if (seed2 && !EVP_DigestSignUpdate(&ctx,seed2,seed2_len))
 			goto err;
-		if (seed3 != NULL && !HMAC_Update(&ctx,seed3,seed3_len))
+		if (seed3 && !EVP_DigestSignUpdate(&ctx,seed3,seed3_len))
 			goto err;
-		if (seed4 != NULL && !HMAC_Update(&ctx,seed4,seed4_len))
+		if (seed4 && !EVP_DigestSignUpdate(&ctx,seed4,seed4_len))
 			goto err;
-		if (seed5 != NULL && !HMAC_Update(&ctx,seed5,seed5_len))
+		if (seed5 && !EVP_DigestSignUpdate(&ctx,seed5,seed5_len))
 			goto err;
 
 		if (olen > chunk)
 			{
-			if (!HMAC_Final(&ctx,out,&j))
+			if (!EVP_DigestSignFinal(&ctx,out,&j))
 				goto err;
 			out+=j;
 			olen-=j;
-			if (!HMAC_Final(&ctx_tmp,A1,&A1_len)) /* calc the next A1 value */
+			/* calc the next A1 value */
+			if (!EVP_DigestSignFinal(&ctx_tmp,A1,&A1_len))
 				goto err;
 			}
 		else	/* last one */
 			{
-			if (!HMAC_Final(&ctx,A1,&A1_len))
+			if (!EVP_DigestSignFinal(&ctx,A1,&A1_len))
 				goto err;
 			memcpy(out,A1,olen);
 			break;
@@ -227,8 +235,9 @@
 		}
 	ret = 1;
 err:
-	HMAC_CTX_cleanup(&ctx);
-	HMAC_CTX_cleanup(&ctx_tmp);
+	EVP_PKEY_free(mac_key);
+	EVP_MD_CTX_cleanup(&ctx);
+	EVP_MD_CTX_cleanup(&ctx_tmp);
 	OPENSSL_cleanse(A1,sizeof(A1));
 	return ret;
 	}
@@ -256,6 +265,8 @@
 		if ((m<<TLS1_PRF_DGST_SHIFT) & digest_mask) count++;
 	}	
 	len=slen/count;
+	if (count == 1)
+		slen = 0;
 	S1=sec;
 	memset(out1,0,olen);
 	for (idx=0;ssl_get_handshake_digest(idx,&m,&md);idx++) {
@@ -284,7 +295,7 @@
 	     unsigned char *tmp, int num)
 	{
 	int ret;
-	ret = tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+	ret = tls1_PRF(ssl_get_algorithm2(s),
 		 TLS_MD_KEY_EXPANSION_CONST,TLS_MD_KEY_EXPANSION_CONST_SIZE,
 		 s->s3->server_random,SSL3_RANDOM_SIZE,
 		 s->s3->client_random,SSL3_RANDOM_SIZE,
@@ -358,7 +369,7 @@
 		{
 		if (s->s3->tmp.new_cipher->algorithm2 & TLS1_STREAM_MAC)
 			s->mac_flags |= SSL_MAC_FLAG_READ_MAC_STREAM;
-			else
+		else
 			s->mac_flags &= ~SSL_MAC_FLAG_READ_MAC_STREAM;
 
 		if (s->enc_read_ctx != NULL)
@@ -445,7 +456,11 @@
 	j=is_export ? (cl < SSL_C_EXPORT_KEYLENGTH(s->s3->tmp.new_cipher) ?
 	               cl : SSL_C_EXPORT_KEYLENGTH(s->s3->tmp.new_cipher)) : cl;
 	/* Was j=(exp)?5:EVP_CIPHER_key_length(c); */
-	k=EVP_CIPHER_iv_length(c);
+	/* If GCM mode only part of IV comes from PRF */
+	if (EVP_CIPHER_mode(c) == EVP_CIPH_GCM_MODE)
+		k = EVP_GCM_TLS_FIXED_IV_LEN;
+	else
+		k=EVP_CIPHER_iv_length(c);
 	if (	(which == SSL3_CHANGE_CIPHER_CLIENT_WRITE) ||
 		(which == SSL3_CHANGE_CIPHER_SERVER_READ))
 		{
@@ -474,10 +489,14 @@
 		}
 
 	memcpy(mac_secret,ms,i);
-	mac_key = EVP_PKEY_new_mac_key(mac_type, NULL,
-			mac_secret,*mac_secret_size);
-	EVP_DigestSignInit(mac_ctx,NULL,m,NULL,mac_key);
-	EVP_PKEY_free(mac_key);
+
+	if (!(EVP_CIPHER_flags(c)&EVP_CIPH_FLAG_AEAD_CIPHER))
+		{
+		mac_key = EVP_PKEY_new_mac_key(mac_type, NULL,
+				mac_secret,*mac_secret_size);
+		EVP_DigestSignInit(mac_ctx,NULL,m,NULL,mac_key);
+		EVP_PKEY_free(mac_key);
+		}
 #ifdef TLS_DEBUG
 printf("which = %04X\nmac key=",which);
 { int z; for (z=0; z<i; z++) printf("%02X%c",ms[z],((z+1)%16)?' ':'\n'); }
@@ -487,7 +506,7 @@
 		/* In here I set both the read and write key/iv to the
 		 * same value since only the correct one will be used :-).
 		 */
-		if (!tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+		if (!tls1_PRF(ssl_get_algorithm2(s),
 				exp_label,exp_label_len,
 				s->s3->client_random,SSL3_RANDOM_SIZE,
 				s->s3->server_random,SSL3_RANDOM_SIZE,
@@ -498,7 +517,7 @@
 
 		if (k > 0)
 			{
-			if (!tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+			if (!tls1_PRF(ssl_get_algorithm2(s),
 					TLS_MD_IV_BLOCK_CONST,TLS_MD_IV_BLOCK_CONST_SIZE,
 					s->s3->client_random,SSL3_RANDOM_SIZE,
 					s->s3->server_random,SSL3_RANDOM_SIZE,
@@ -524,7 +543,19 @@
 	}
 #endif	/* KSSL_DEBUG */
 
-	EVP_CipherInit_ex(dd,c,NULL,key,iv,(which & SSL3_CC_WRITE));
+	if (EVP_CIPHER_mode(c) == EVP_CIPH_GCM_MODE)
+		{
+		EVP_CipherInit_ex(dd,c,NULL,key,NULL,(which & SSL3_CC_WRITE));
+		EVP_CIPHER_CTX_ctrl(dd, EVP_CTRL_GCM_SET_IV_FIXED, k, iv);
+		}
+	else	
+		EVP_CipherInit_ex(dd,c,NULL,key,iv,(which & SSL3_CC_WRITE));
+
+	/* Needed for "composite" AEADs, such as RC4-HMAC-MD5 */
+	if ((EVP_CIPHER_flags(c)&EVP_CIPH_FLAG_AEAD_CIPHER) && *mac_secret_size)
+		EVP_CIPHER_CTX_ctrl(dd,EVP_CTRL_AEAD_SET_MAC_KEY,
+				*mac_secret_size,mac_secret);
+
 #ifdef TLS_DEBUG
 printf("which = %04X\nkey=",which);
 { int z; for (z=0; z<EVP_CIPHER_key_length(c); z++) printf("%02X%c",key[z],((z+1)%16)?' ':'\n'); }
@@ -606,7 +637,8 @@
 { int z; for (z=0; z<num; z++) printf("%02X%c",p1[z],((z+1)%16)?' ':'\n'); }
 #endif
 
-	if (!(s->options & SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS))
+	if (!(s->options & SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS)
+		&& s->method->version <= TLS1_VERSION)
 		{
 		/* enable vulnerability countermeasure for CBC ciphers with
 		 * known-IV problem (http://www.openssl.org/~bodo/tls-cbc.txt)
@@ -640,14 +672,14 @@
 	SSL3_RECORD *rec;
 	EVP_CIPHER_CTX *ds;
 	unsigned long l;
-	int bs,i,ii,j,k,n=0;
+	int bs,i,ii,j,k,pad=0;
 	const EVP_CIPHER *enc;
 
 	if (send)
 		{
 		if (EVP_MD_CTX_md(s->write_hash))
 			{
-			n=EVP_MD_CTX_size(s->write_hash);
+			int n=EVP_MD_CTX_size(s->write_hash);
 			OPENSSL_assert(n >= 0);
 			}
 		ds=s->enc_write_ctx;
@@ -655,13 +687,34 @@
 		if (s->enc_write_ctx == NULL)
 			enc=NULL;
 		else
+			{
+			int ivlen;
 			enc=EVP_CIPHER_CTX_cipher(s->enc_write_ctx);
+			/* For TLSv1.1 and later explicit IV */
+			if (s->version >= TLS1_1_VERSION
+				&& EVP_CIPHER_mode(enc) == EVP_CIPH_CBC_MODE)
+				ivlen = EVP_CIPHER_iv_length(enc);
+			else
+				ivlen = 0;
+			if (ivlen > 1)
+				{
+				if ( rec->data != rec->input)
+					/* we can't write into the input stream:
+					 * Can this ever happen?? (steve)
+					 */
+					fprintf(stderr,
+						"%s:%d: rec->data != rec->input\n",
+						__FILE__, __LINE__);
+				else if (RAND_bytes(rec->input, ivlen) <= 0)
+					return -1;
+				}
+			}
 		}
 	else
 		{
 		if (EVP_MD_CTX_md(s->read_hash))
 			{
-			n=EVP_MD_CTX_size(s->read_hash);
+			int n=EVP_MD_CTX_size(s->read_hash);
 			OPENSSL_assert(n >= 0);
 			}
 		ds=s->enc_read_ctx;
@@ -687,7 +740,43 @@
 		l=rec->length;
 		bs=EVP_CIPHER_block_size(ds->cipher);
 
-		if ((bs != 1) && send)
+		if (EVP_CIPHER_flags(ds->cipher)&EVP_CIPH_FLAG_AEAD_CIPHER)
+			{
+			unsigned char buf[13],*seq;
+
+			seq = send?s->s3->write_sequence:s->s3->read_sequence;
+
+			if (s->version == DTLS1_VERSION || s->version == DTLS1_BAD_VER)
+				{
+				unsigned char dtlsseq[9],*p=dtlsseq;
+
+				s2n(send?s->d1->w_epoch:s->d1->r_epoch,p);
+				memcpy(p,&seq[2],6);
+				memcpy(buf,dtlsseq,8);
+				}
+			else
+				{
+				memcpy(buf,seq,8);
+				for (i=7; i>=0; i--)	/* increment */
+					{
+					++seq[i];
+					if (seq[i] != 0) break; 
+					}
+				}
+
+			buf[8]=rec->type;
+			buf[9]=(unsigned char)(s->version>>8);
+			buf[10]=(unsigned char)(s->version);
+			buf[11]=rec->length>>8;
+			buf[12]=rec->length&0xff;
+			pad=EVP_CIPHER_CTX_ctrl(ds,EVP_CTRL_AEAD_TLS1_AAD,13,buf);
+			if (send)
+				{
+				l+=pad;
+				rec->length+=pad;
+				}
+			}
+		else if ((bs != 1) && send)
 			{
 			i=bs-((int)l%bs);
 
@@ -728,13 +817,25 @@
 			{
 			if (l == 0 || l%bs != 0)
 				{
+				if (s->version >= TLS1_1_VERSION)
+					return -1;
 				SSLerr(SSL_F_TLS1_ENC,SSL_R_BLOCK_CIPHER_PAD_IS_WRONG);
 				ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECRYPTION_FAILED);
 				return 0;
 				}
 			}
 		
-		EVP_Cipher(ds,rec->data,rec->input,l);
+		i = EVP_Cipher(ds,rec->data,rec->input,l);
+		if ((EVP_CIPHER_flags(ds->cipher)&EVP_CIPH_FLAG_CUSTOM_CIPHER)
+						?(i<0)
+						:(i==0))
+			return -1;	/* AEAD can fail to verify MAC */
+		if (EVP_CIPHER_mode(enc) == EVP_CIPH_GCM_MODE && !send)
+			{
+			rec->data += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			rec->input += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			rec->length -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			}
 
 #ifdef KSSL_DEBUG
 		{
@@ -784,8 +885,17 @@
 					return -1;
 					}
 				}
-			rec->length-=i;
+			rec->length -=i;
+			if (s->version >= TLS1_1_VERSION
+				&& EVP_CIPHER_CTX_mode(ds) == EVP_CIPH_CBC_MODE)
+				{
+				rec->data += bs;    /* skip the explicit IV */
+				rec->input += bs;
+				rec->length -= bs;
+				}
 			}
+		if (pad && !send)
+			rec->length -= pad;
 		}
 	return(1);
 	}
@@ -841,7 +951,7 @@
 
 	for (idx=0;ssl_get_handshake_digest(idx,&mask,&md);idx++)
 		{
-		if (mask & s->s3->tmp.new_cipher->algorithm2)
+		if (mask & ssl_get_algorithm2(s))
 			{
 			int hashsize = EVP_MD_size(md);
 			if (hashsize < 0 || hashsize > (int)(sizeof buf - (size_t)(q-buf)))
@@ -860,7 +970,7 @@
 			}
 		}
 		
-	if (!tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+	if (!tls1_PRF(ssl_get_algorithm2(s),
 			str,slen, buf,(int)(q-buf), NULL,0, NULL,0, NULL,0,
 			s->session->master_key,s->session->master_key_length,
 			out,buf2,sizeof buf2))
@@ -970,6 +1080,7 @@
 	const void *co = NULL, *so = NULL;
 	int col = 0, sol = 0;
 
+
 #ifdef KSSL_DEBUG
 	printf ("tls1_generate_master_secret(%p,%p, %p, %d)\n", s,out, p,len);
 #endif	/* KSSL_DEBUG */
@@ -986,7 +1097,7 @@
 		}
 #endif
 
-	tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+	tls1_PRF(ssl_get_algorithm2(s),
 		TLS_MD_MASTER_SECRET_CONST,TLS_MD_MASTER_SECRET_CONST_SIZE,
 		s->s3->client_random,SSL3_RANDOM_SIZE,
 		co, col,
@@ -994,6 +1105,16 @@
 		so, sol,
 		p,len,
 		s->session->master_key,buff,sizeof buff);
+#ifdef SSL_DEBUG
+	fprintf(stderr, "Premaster Secret:\n");
+	BIO_dump_fp(stderr, (char *)p, len);
+	fprintf(stderr, "Client Random:\n");
+	BIO_dump_fp(stderr, (char *)s->s3->client_random, SSL3_RANDOM_SIZE);
+	fprintf(stderr, "Server Random:\n");
+	BIO_dump_fp(stderr, (char *)s->s3->server_random, SSL3_RANDOM_SIZE);
+	fprintf(stderr, "Master Secret:\n");
+	BIO_dump_fp(stderr, (char *)s->session->master_key, SSL3_MASTER_SECRET_SIZE);
+#endif
 
 #ifdef KSSL_DEBUG
 	printf ("tls1_generate_master_secret() complete\n");
@@ -1001,6 +1122,95 @@
 	return(SSL3_MASTER_SECRET_SIZE);
 	}
 
+int tls1_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	 const char *label, size_t llen, const unsigned char *context,
+	 size_t contextlen, int use_context)
+	{
+	unsigned char *buff;
+	unsigned char *val = NULL;
+	size_t vallen, currentvalpos;
+	int rv;
+
+#ifdef KSSL_DEBUG
+	printf ("tls1_export_keying_material(%p,%p,%d,%s,%d,%p,%d)\n", s, out, olen, label, llen, p, plen);
+#endif	/* KSSL_DEBUG */
+
+	buff = OPENSSL_malloc(olen);
+	if (buff == NULL) goto err2;
+
+	/* construct PRF arguments
+	 * we construct the PRF argument ourself rather than passing separate
+	 * values into the TLS PRF to ensure that the concatenation of values
+	 * does not create a prohibited label.
+	 */
+	vallen = llen + SSL3_RANDOM_SIZE * 2;
+	if (use_context)
+		{
+		vallen += 2 + contextlen;
+		}
+
+	val = OPENSSL_malloc(vallen);
+	if (val == NULL) goto err2;
+	currentvalpos = 0;
+	memcpy(val + currentvalpos, (unsigned char *) label, llen);
+	currentvalpos += llen;
+	memcpy(val + currentvalpos, s->s3->client_random, SSL3_RANDOM_SIZE);
+	currentvalpos += SSL3_RANDOM_SIZE;
+	memcpy(val + currentvalpos, s->s3->server_random, SSL3_RANDOM_SIZE);
+	currentvalpos += SSL3_RANDOM_SIZE;
+
+	if (use_context)
+		{
+		val[currentvalpos] = (contextlen >> 8) & 0xff;
+		currentvalpos++;
+		val[currentvalpos] = contextlen & 0xff;
+		currentvalpos++;
+		if ((contextlen > 0) || (context != NULL))
+			{
+			memcpy(val + currentvalpos, context, contextlen);
+			}
+		}
+
+	/* disallow prohibited labels
+	 * note that SSL3_RANDOM_SIZE > max(prohibited label len) =
+	 * 15, so size of val > max(prohibited label len) = 15 and the
+	 * comparisons won't have buffer overflow
+	 */
+	if (memcmp(val, TLS_MD_CLIENT_FINISH_CONST,
+		 TLS_MD_CLIENT_FINISH_CONST_SIZE) == 0) goto err1;
+	if (memcmp(val, TLS_MD_SERVER_FINISH_CONST,
+		 TLS_MD_SERVER_FINISH_CONST_SIZE) == 0) goto err1;
+	if (memcmp(val, TLS_MD_MASTER_SECRET_CONST,
+		 TLS_MD_MASTER_SECRET_CONST_SIZE) == 0) goto err1;
+	if (memcmp(val, TLS_MD_KEY_EXPANSION_CONST,
+		 TLS_MD_KEY_EXPANSION_CONST_SIZE) == 0) goto err1;
+
+	rv = tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+		      val, vallen,
+		      NULL, 0,
+		      NULL, 0,
+		      NULL, 0,
+		      NULL, 0,
+		      s->session->master_key,s->session->master_key_length,
+		      out,buff,olen);
+
+#ifdef KSSL_DEBUG
+	printf ("tls1_export_keying_material() complete\n");
+#endif	/* KSSL_DEBUG */
+	goto ret;
+err1:
+	SSLerr(SSL_F_TLS1_EXPORT_KEYING_MATERIAL, SSL_R_TLS_ILLEGAL_EXPORTER_LABEL);
+	rv = 0;
+	goto ret;
+err2:
+	SSLerr(SSL_F_TLS1_EXPORT_KEYING_MATERIAL, ERR_R_MALLOC_FAILURE);
+	rv = 0;
+ret:
+	if (buff != NULL) OPENSSL_free(buff);
+	if (val != NULL) OPENSSL_free(val);
+	return(rv);
+	}
+
 int tls1_alert_code(int code)
 	{
 	switch (code)
@@ -1042,4 +1252,3 @@
 	default:			return(-1);
 		}
 	}
-

diff --git a/ssl/t1_lib.c b/ssl/t1_lib.c
index 0e4c408..57d1107 100644
--- a/ssl/t1_lib.c
+++ b/ssl/t1_lib.c

@@ -114,6 +114,7 @@
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
 #include <openssl/ocsp.h>
+#include <openssl/rand.h>
 #include "ssl_locl.h"
 
 const char tls1_version_str[]="TLSv1" OPENSSL_VERSION_PTEXT;
@@ -136,6 +137,7 @@
 	TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE,
 	TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE,
 	tls1_alert_code,
+	tls1_export_keying_material,
 	};
 
 long tls1_default_timeout(void)
@@ -166,10 +168,11 @@
 void tls1_clear(SSL *s)
 	{
 	ssl3_clear(s);
-	s->version=TLS1_VERSION;
+	s->version = s->method->version;
 	}
 
 #ifndef OPENSSL_NO_EC
+
 static int nid_list[] =
 	{
 		NID_sect163k1, /* sect163k1 (1) */
@@ -198,7 +201,36 @@
 		NID_secp384r1, /* secp384r1 (24) */
 		NID_secp521r1  /* secp521r1 (25) */	
 	};
-	
+
+static int pref_list[] =
+	{
+		NID_sect571r1, /* sect571r1 (14) */ 
+		NID_sect571k1, /* sect571k1 (13) */ 
+		NID_secp521r1, /* secp521r1 (25) */	
+		NID_sect409k1, /* sect409k1 (11) */ 
+		NID_sect409r1, /* sect409r1 (12) */
+		NID_secp384r1, /* secp384r1 (24) */
+		NID_sect283k1, /* sect283k1 (9) */
+		NID_sect283r1, /* sect283r1 (10) */ 
+		NID_secp256k1, /* secp256k1 (22) */ 
+		NID_X9_62_prime256v1, /* secp256r1 (23) */ 
+		NID_sect239k1, /* sect239k1 (8) */ 
+		NID_sect233k1, /* sect233k1 (6) */
+		NID_sect233r1, /* sect233r1 (7) */ 
+		NID_secp224k1, /* secp224k1 (20) */ 
+		NID_secp224r1, /* secp224r1 (21) */
+		NID_sect193r1, /* sect193r1 (4) */ 
+		NID_sect193r2, /* sect193r2 (5) */ 
+		NID_secp192k1, /* secp192k1 (18) */
+		NID_X9_62_prime192v1, /* secp192r1 (19) */ 
+		NID_sect163k1, /* sect163k1 (1) */
+		NID_sect163r1, /* sect163r1 (2) */
+		NID_sect163r2, /* sect163r2 (3) */
+		NID_secp160k1, /* secp160k1 (15) */
+		NID_secp160r1, /* secp160r1 (16) */ 
+		NID_secp160r2, /* secp160r2 (17) */ 
+	};
+
 int tls1_ec_curve_id2nid(int curve_id)
 	{
 	/* ECC curves from draft-ietf-tls-ecc-12.txt (Oct. 17, 2005) */
@@ -270,6 +302,64 @@
 #endif /* OPENSSL_NO_EC */
 
 #ifndef OPENSSL_NO_TLSEXT
+
+/* List of supported signature algorithms and hashes. Should make this
+ * customisable at some point, for now include everything we support.
+ */
+
+#ifdef OPENSSL_NO_RSA
+#define tlsext_sigalg_rsa(md) /* */
+#else
+#define tlsext_sigalg_rsa(md) md, TLSEXT_signature_rsa,
+#endif
+
+#ifdef OPENSSL_NO_DSA
+#define tlsext_sigalg_dsa(md) /* */
+#else
+#define tlsext_sigalg_dsa(md) md, TLSEXT_signature_dsa,
+#endif
+
+#ifdef OPENSSL_NO_ECDSA
+#define tlsext_sigalg_ecdsa(md) /* */
+#else
+#define tlsext_sigalg_ecdsa(md) md, TLSEXT_signature_ecdsa,
+#endif
+
+#define tlsext_sigalg(md) \
+		tlsext_sigalg_rsa(md) \
+		tlsext_sigalg_dsa(md) \
+		tlsext_sigalg_ecdsa(md)
+
+static unsigned char tls12_sigalgs[] = {
+#ifndef OPENSSL_NO_SHA512
+	tlsext_sigalg(TLSEXT_hash_sha512)
+	tlsext_sigalg(TLSEXT_hash_sha384)
+#endif
+#ifndef OPENSSL_NO_SHA256
+	tlsext_sigalg(TLSEXT_hash_sha256)
+	tlsext_sigalg(TLSEXT_hash_sha224)
+#endif
+#ifndef OPENSSL_NO_SHA
+	tlsext_sigalg(TLSEXT_hash_sha1)
+#endif
+#ifndef OPENSSL_NO_MD5
+	tlsext_sigalg_rsa(TLSEXT_hash_md5)
+#endif
+};
+
+int tls12_get_req_sig_algs(SSL *s, unsigned char *p)
+	{
+	size_t slen = sizeof(tls12_sigalgs);
+#ifdef OPENSSL_FIPS
+	/* If FIPS mode don't include MD5 which is last */
+	if (FIPS_mode())
+		slen -= 2;
+#endif
+	if (p)
+		memcpy(p, tls12_sigalgs, slen);
+	return (int)slen;
+	}
+
 unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned char *limit)
 	{
 	int extdatalen=0;
@@ -317,7 +407,7 @@
 		}
 
         /* Add RI if renegotiating */
-        if (s->new_session)
+        if (s->renegotiate)
           {
           int el;
           
@@ -341,6 +431,34 @@
           ret += el;
         }
 
+#ifndef OPENSSL_NO_SRP
+	/* Add SRP username if there is one */
+	if (s->srp_ctx.login != NULL)
+		{ /* Add TLS extension SRP username to the Client Hello message */
+
+		int login_len = strlen(s->srp_ctx.login);	
+		if (login_len > 255 || login_len == 0)
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT, ERR_R_INTERNAL_ERROR);
+			return NULL;
+			} 
+
+		/* check for enough space.
+		   4 for the srp type type and entension length
+		   1 for the srp user identity
+		   + srp user identity length 
+		*/
+		if ((limit - ret - 5 - login_len) < 0) return NULL; 
+
+		/* fill in the extension */
+		s2n(TLSEXT_TYPE_srp,ret);
+		s2n(login_len+1,ret);
+		(*ret++) = (unsigned char) login_len;
+		memcpy(ret, s->srp_ctx.login, login_len);
+		ret+=login_len;
+		}
+#endif
+
 #ifndef OPENSSL_NO_EC
 	if (s->tlsext_ecpointformatlist != NULL &&
 	    s->version != DTLS1_VERSION)
@@ -426,6 +544,17 @@
 		}
 		skip_ext:
 
+	if (TLS1_get_version(s) >= TLS1_2_VERSION)
+		{
+		if ((size_t)(limit - ret) < sizeof(tls12_sigalgs) + 6)
+			return NULL; 
+		s2n(TLSEXT_TYPE_signature_algorithms,ret);
+		s2n(sizeof(tls12_sigalgs) + 2, ret);
+		s2n(sizeof(tls12_sigalgs), ret);
+		memcpy(ret, tls12_sigalgs, sizeof(tls12_sigalgs));
+		ret += sizeof(tls12_sigalgs);
+		}
+
 #ifdef TLSEXT_TYPE_opaque_prf_input
 	if (s->s3->client_opaque_prf_input != NULL &&
 	    s->version != DTLS1_VERSION)
@@ -494,6 +623,20 @@
 			i2d_X509_EXTENSIONS(s->tlsext_ocsp_exts, &ret);
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* Add Heartbeat extension */
+	s2n(TLSEXT_TYPE_heartbeat,ret);
+	s2n(1,ret);
+	/* Set mode:
+	 * 1: peer may send requests
+	 * 2: peer not allowed to send requests
+	 */
+	if (s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_RECV_REQUESTS)
+		*(ret++) = SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+	else
+		*(ret++) = SSL_TLSEXT_HB_ENABLED;
+#endif
+
 #ifndef OPENSSL_NO_NEXTPROTONEG
 	if (s->ctx->next_proto_select_cb && !s->s3->tmp.finish_md_len)
 		{
@@ -506,6 +649,25 @@
 		}
 #endif
 
+        if(SSL_get_srtp_profiles(s))
+                {
+                int el;
+
+                ssl_add_clienthello_use_srtp_ext(s, 0, &el, 0);
+                
+                if((limit - p - 4 - el) < 0) return NULL;
+
+                s2n(TLSEXT_TYPE_use_srtp,ret);
+                s2n(el,ret);
+
+                if(ssl_add_clienthello_use_srtp_ext(s, ret, &el, el))
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT, ERR_R_INTERNAL_ERROR);
+			return NULL;
+			}
+                ret += el;
+                }
+
 	if ((extdatalen = ret-p-2)== 0) 
 		return p;
 
@@ -618,6 +780,26 @@
 		ret += sol;
 		}
 #endif
+
+        if(s->srtp_profile)
+                {
+                int el;
+
+                ssl_add_serverhello_use_srtp_ext(s, 0, &el, 0);
+                
+                if((limit - p - 4 - el) < 0) return NULL;
+
+                s2n(TLSEXT_TYPE_use_srtp,ret);
+                s2n(el,ret);
+
+                if(ssl_add_serverhello_use_srtp_ext(s, ret, &el, el))
+			{
+			SSLerr(SSL_F_SSL_ADD_SERVERHELLO_TLSEXT, ERR_R_INTERNAL_ERROR);
+			return NULL;
+			}
+                ret+=el;
+                }
+
 	if (((s->s3->tmp.new_cipher->id & 0xFFFF)==0x80 || (s->s3->tmp.new_cipher->id & 0xFFFF)==0x81) 
 		&& (SSL_get_options(s) & SSL_OP_CRYPTOPRO_TLSEXT_BUG))
 		{ const unsigned char cryptopro_ext[36] = {
@@ -633,6 +815,24 @@
 
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* Add Heartbeat extension if we've received one */
+	if (s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED)
+		{
+		s2n(TLSEXT_TYPE_heartbeat,ret);
+		s2n(1,ret);
+		/* Set mode:
+		 * 1: peer may send requests
+		 * 2: peer not allowed to send requests
+		 */
+		if (s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_RECV_REQUESTS)
+			*(ret++) = SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+		else
+			*(ret++) = SSL_TLSEXT_HB_ENABLED;
+
+		}
+#endif
+
 #ifndef OPENSSL_NO_NEXTPROTONEG
 	next_proto_neg_seen = s->s3->next_proto_neg_seen;
 	s->s3->next_proto_neg_seen = 0;
@@ -669,9 +869,18 @@
 	unsigned short len;
 	unsigned char *data = *p;
 	int renegotiate_seen = 0;
+	int sigalg_seen = 0;
 
 	s->servername_done = 0;
 	s->tlsext_status_type = -1;
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	s->s3->next_proto_neg_seen = 0;
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+	s->tlsext_heartbeat &= ~(SSL_TLSEXT_HB_ENABLED |
+	                       SSL_TLSEXT_HB_DONT_SEND_REQUESTS);
+#endif
 
 	if (data >= (d+n-2))
 		goto ri_check;
@@ -799,6 +1008,31 @@
 				}
 
 			}
+#ifndef OPENSSL_NO_SRP
+		else if (type == TLSEXT_TYPE_srp)
+			{
+			if (size <= 0 || ((len = data[0])) != (size -1))
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			if (s->srp_ctx.login != NULL)
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			if ((s->srp_ctx.login = OPENSSL_malloc(len+1)) == NULL)
+				return -1;
+			memcpy(s->srp_ctx.login, &data[1], len);
+			s->srp_ctx.login[len]='\0';
+  
+			if (strlen(s->srp_ctx.login) != len) 
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			}
+#endif
 
 #ifndef OPENSSL_NO_EC
 		else if (type == TLSEXT_TYPE_ec_point_formats &&
@@ -919,6 +1153,28 @@
 				return 0;
 			renegotiate_seen = 1;
 			}
+		else if (type == TLSEXT_TYPE_signature_algorithms)
+			{
+			int dsize;
+			if (sigalg_seen || size < 2) 
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			sigalg_seen = 1;
+			n2s(data,dsize);
+			size -= 2;
+			if (dsize != size || dsize & 1) 
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			if (!tls1_process_sigalgs(s, data, dsize))
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			}
 		else if (type == TLSEXT_TYPE_status_request &&
 		         s->version != DTLS1_VERSION && s->ctx->tlsext_status_cb)
 			{
@@ -1031,9 +1287,26 @@
 				else
 					s->tlsext_status_type = -1;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (type == TLSEXT_TYPE_heartbeat)
+			{
+			switch(data[0])
+				{
+				case 0x01:	/* Client allows us to send HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							break;
+				case 0x02:	/* Client doesn't accept HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+							break;
+				default:	*al = SSL_AD_ILLEGAL_PARAMETER;
+							return 0;
+				}
+			}
+#endif
 #ifndef OPENSSL_NO_NEXTPROTONEG
 		else if (type == TLSEXT_TYPE_next_proto_neg &&
-                         s->s3->tmp.finish_md_len == 0)
+			 s->s3->tmp.finish_md_len == 0)
 			{
 			/* We shouldn't accept this extension on a
 			 * renegotiation.
@@ -1055,6 +1328,13 @@
 #endif
 
 		/* session ticket processed earlier */
+		else if (type == TLSEXT_TYPE_use_srtp)
+                        {
+			if(ssl_parse_clienthello_use_srtp_ext(s, data, size,
+							      al))
+				return 0;
+                        }
+
 		data+=size;
 		}
 				
@@ -1064,7 +1344,7 @@
 
 	/* Need RI if renegotiating */
 
-	if (!renegotiate_seen && s->new_session &&
+	if (!renegotiate_seen && s->renegotiate &&
 		!(s->options & SSL_OP_ALLOW_UNSAFE_LEGACY_RENEGOTIATION))
 		{
 		*al = SSL_AD_HANDSHAKE_FAILURE;
@@ -1080,7 +1360,7 @@
 /* ssl_next_proto_validate validates a Next Protocol Negotiation block. No
  * elements of zero length are allowed and the set of elements must exactly fill
  * the length of the block. */
-static int ssl_next_proto_validate(unsigned char *d, unsigned len)
+static char ssl_next_proto_validate(unsigned char *d, unsigned len)
 	{
 	unsigned int off = 0;
 
@@ -1105,6 +1385,15 @@
 	int tlsext_servername = 0;
 	int renegotiate_seen = 0;
 
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	s->s3->next_proto_neg_seen = 0;
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+	s->tlsext_heartbeat &= ~(SSL_TLSEXT_HB_ENABLED |
+	                       SSL_TLSEXT_HB_DONT_SEND_REQUESTS);
+#endif
+
 	if (data >= (d+n-2))
 		goto ri_check;
 
@@ -1231,7 +1520,8 @@
 			s->tlsext_status_expected = 1;
 			}
 #ifndef OPENSSL_NO_NEXTPROTONEG
-		else if (type == TLSEXT_TYPE_next_proto_neg)
+		else if (type == TLSEXT_TYPE_next_proto_neg &&
+			 s->s3->tmp.finish_md_len == 0)
 			{
 			unsigned char *selected;
 			unsigned char selected_len;
@@ -1261,6 +1551,7 @@
 				}
 			memcpy(s->next_proto_negotiated, selected, selected_len);
 			s->next_proto_negotiated_len = selected_len;
+			s->s3->next_proto_neg_seen = 1;
 			}
 #endif
 		else if (type == TLSEXT_TYPE_renegotiate)
@@ -1269,6 +1560,30 @@
 				return 0;
 			renegotiate_seen = 1;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (type == TLSEXT_TYPE_heartbeat)
+			{
+			switch(data[0])
+				{
+				case 0x01:	/* Server allows us to send HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							break;
+				case 0x02:	/* Server doesn't accept HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+							break;
+				default:	*al = SSL_AD_ILLEGAL_PARAMETER;
+							return 0;
+				}
+			}
+#endif
+		else if (type == TLSEXT_TYPE_use_srtp)
+                        {
+                        if(ssl_parse_serverhello_use_srtp_ext(s, data, size,
+							      al))
+                                return 0;
+                        }
+
 		data+=size;		
 		}
 
@@ -1348,7 +1663,7 @@
 			break;
 			}
 		}
-	using_ecc = using_ecc && (s->version == TLS1_VERSION);
+	using_ecc = using_ecc && (s->version >= TLS1_VERSION);
 	if (using_ecc)
 		{
 		if (s->tlsext_ecpointformatlist != NULL) OPENSSL_free(s->tlsext_ecpointformatlist);
@@ -1364,16 +1679,19 @@
 
 		/* we support all named elliptic curves in draft-ietf-tls-ecc-12 */
 		if (s->tlsext_ellipticcurvelist != NULL) OPENSSL_free(s->tlsext_ellipticcurvelist);
-		s->tlsext_ellipticcurvelist_length = sizeof(nid_list)/sizeof(nid_list[0]) * 2;
+		s->tlsext_ellipticcurvelist_length = sizeof(pref_list)/sizeof(pref_list[0]) * 2;
 		if ((s->tlsext_ellipticcurvelist = OPENSSL_malloc(s->tlsext_ellipticcurvelist_length)) == NULL)
 			{
 			s->tlsext_ellipticcurvelist_length = 0;
 			SSLerr(SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT,ERR_R_MALLOC_FAILURE);
 			return -1;
 			}
-		for (i = 1, j = s->tlsext_ellipticcurvelist; (unsigned int)i <=
-				sizeof(nid_list)/sizeof(nid_list[0]); i++)
-			s2n(i,j);
+		for (i = 0, j = s->tlsext_ellipticcurvelist; (unsigned int)i <
+				sizeof(pref_list)/sizeof(pref_list[0]); i++)
+			{
+			int id = tls1_ec_nid2curve_id(pref_list[i]);
+			s2n(id,j);
+			}
 		}
 #endif /* OPENSSL_NO_EC */
 
@@ -1682,26 +2000,56 @@
 		}
 	}
 
-/* Since the server cache lookup is done early on in the processing of client
- * hello and other operations depend on the result we need to handle any TLS
- * session ticket extension at the same time.
+/* Since the server cache lookup is done early on in the processing of the
+ * ClientHello, and other operations depend on the result, we need to handle
+ * any TLS session ticket extension at the same time.
+ *
+ *   session_id: points at the session ID in the ClientHello. This code will
+ *       read past the end of this in order to parse out the session ticket
+ *       extension, if any.
+ *   len: the length of the session ID.
+ *   limit: a pointer to the first byte after the ClientHello.
+ *   ret: (output) on return, if a ticket was decrypted, then this is set to
+ *       point to the resulting session.
+ *
+ * If s->tls_session_secret_cb is set then we are expecting a pre-shared key
+ * ciphersuite, in which case we have no use for session tickets and one will
+ * never be decrypted, nor will s->tlsext_ticket_expected be set to 1.
+ *
+ * Returns:
+ *   -1: fatal error, either from parsing or decrypting the ticket.
+ *    0: no ticket was found (or was ignored, based on settings).
+ *    1: a zero length extension was found, indicating that the client supports
+ *       session tickets but doesn't currently have one to offer.
+ *    2: either s->tls_session_secret_cb was set, or a ticket was offered but
+ *       couldn't be decrypted because of a non-fatal error.
+ *    3: a ticket was successfully decrypted and *ret was set.
+ *
+ * Side effects:
+ *   Sets s->tlsext_ticket_expected to 1 if the server will have to issue
+ *   a new session ticket to the client because the client indicated support
+ *   (and s->tls_session_secret_cb is NULL) but the client either doesn't have
+ *   a session ticket or we couldn't use the one it gave us, or if
+ *   s->ctx->tlsext_ticket_key_cb asked to renew the client's ticket.
+ *   Otherwise, s->tlsext_ticket_expected is set to 0.
  */
-
 int tls1_process_ticket(SSL *s, unsigned char *session_id, int len,
-				const unsigned char *limit, SSL_SESSION **ret)
+			const unsigned char *limit, SSL_SESSION **ret)
 	{
 	/* Point after session ID in client hello */
 	const unsigned char *p = session_id + len;
 	unsigned short i;
 
-	/* If tickets disabled behave as if no ticket present
- 	 * to permit stateful resumption.
- 	 */
-	if (SSL_get_options(s) & SSL_OP_NO_TICKET)
-		return 1;
+	*ret = NULL;
+	s->tlsext_ticket_expected = 0;
 
+	/* If tickets disabled behave as if no ticket present
+	 * to permit stateful resumption.
+	 */
+	if (SSL_get_options(s) & SSL_OP_NO_TICKET)
+		return 0;
 	if ((s->version <= SSL3_VERSION) || !limit)
-		return 1;
+		return 0;
 	if (p >= limit)
 		return -1;
 	/* Skip past DTLS cookie */
@@ -1724,7 +2072,7 @@
 		return -1;
 	/* Now at start of extensions */
 	if ((p + 2) >= limit)
-		return 1;
+		return 0;
 	n2s(p, i);
 	while ((p + 4) <= limit)
 		{
@@ -1732,39 +2080,61 @@
 		n2s(p, type);
 		n2s(p, size);
 		if (p + size > limit)
-			return 1;
+			return 0;
 		if (type == TLSEXT_TYPE_session_ticket)
 			{
-			/* If tickets disabled indicate cache miss which will
- 			 * trigger a full handshake
- 			 */
-			if (SSL_get_options(s) & SSL_OP_NO_TICKET)
-				return 1;
-			/* If zero length note client will accept a ticket
- 			 * and indicate cache miss to trigger full handshake
- 			 */
+			int r;
 			if (size == 0)
 				{
+				/* The client will accept a ticket but doesn't
+				 * currently have one. */
 				s->tlsext_ticket_expected = 1;
-				return 0;	/* Cache miss */
+				return 1;
 				}
 			if (s->tls_session_secret_cb)
 				{
-				/* Indicate cache miss here and instead of
-				 * generating the session from ticket now,
-				 * trigger abbreviated handshake based on
-				 * external mechanism to calculate the master
-				 * secret later. */
-				return 0;
+				/* Indicate that the ticket couldn't be
+				 * decrypted rather than generating the session
+				 * from ticket now, trigger abbreviated
+				 * handshake based on external mechanism to
+				 * calculate the master secret later. */
+				return 2;
 				}
-			return tls_decrypt_ticket(s, p, size, session_id, len,
-									ret);
+			r = tls_decrypt_ticket(s, p, size, session_id, len, ret);
+			switch (r)
+				{
+				case 2: /* ticket couldn't be decrypted */
+					s->tlsext_ticket_expected = 1;
+					return 2;
+				case 3: /* ticket was decrypted */
+					return r;
+				case 4: /* ticket decrypted but need to renew */
+					s->tlsext_ticket_expected = 1;
+					return 3;
+				default: /* fatal error */
+					return -1;
+				}
 			}
 		p += size;
 		}
-	return 1;
+	return 0;
 	}
 
+/* tls_decrypt_ticket attempts to decrypt a session ticket.
+ *
+ *   etick: points to the body of the session ticket extension.
+ *   eticklen: the length of the session tickets extenion.
+ *   sess_id: points at the session ID.
+ *   sesslen: the length of the session ID.
+ *   psess: (output) on return, if a ticket was decrypted, then this is set to
+ *       point to the resulting session.
+ *
+ * Returns:
+ *   -1: fatal error, either from parsing or decrypting the ticket.
+ *    2: the ticket couldn't be decrypted.
+ *    3: a ticket was successfully decrypted and *psess was set.
+ *    4: same as 3, but the ticket needs to be renewed.
+ */
 static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen,
 				const unsigned char *sess_id, int sesslen,
 				SSL_SESSION **psess)
@@ -1779,7 +2149,7 @@
 	SSL_CTX *tctx = s->initial_ctx;
 	/* Need at least keyname + iv + some encrypted data */
 	if (eticklen < 48)
-		goto tickerr;
+		return 2;
 	/* Initialize session ticket encryption and HMAC contexts */
 	HMAC_CTX_init(&hctx);
 	EVP_CIPHER_CTX_init(&ctx);
@@ -1791,7 +2161,7 @@
 		if (rv < 0)
 			return -1;
 		if (rv == 0)
-			goto tickerr;
+			return 2;
 		if (rv == 2)
 			renew_ticket = 1;
 		}
@@ -1799,15 +2169,15 @@
 		{
 		/* Check key name matches */
 		if (memcmp(etick, tctx->tlsext_tick_key_name, 16))
-			goto tickerr;
+			return 2;
 		HMAC_Init_ex(&hctx, tctx->tlsext_tick_hmac_key, 16,
 					tlsext_tick_md(), NULL);
 		EVP_DecryptInit_ex(&ctx, EVP_aes_128_cbc(), NULL,
 				tctx->tlsext_tick_aes_key, etick + 16);
 		}
 	/* Attempt to process session ticket, first conduct sanity and
- 	 * integrity checks on ticket.
- 	 */
+	 * integrity checks on ticket.
+	 */
 	mlen = HMAC_size(&hctx);
 	if (mlen < 0)
 		{
@@ -1820,7 +2190,7 @@
 	HMAC_Final(&hctx, tick_hmac, NULL);
 	HMAC_CTX_cleanup(&hctx);
 	if (memcmp(tick_hmac, etick + eticklen, mlen))
-		goto tickerr;
+		return 2;
 	/* Attempt to decrypt session data */
 	/* Move p after IV to start of encrypted ticket, update length */
 	p = etick + 16 + EVP_CIPHER_CTX_iv_length(&ctx);
@@ -1833,33 +2203,376 @@
 		}
 	EVP_DecryptUpdate(&ctx, sdec, &slen, p, eticklen);
 	if (EVP_DecryptFinal(&ctx, sdec + slen, &mlen) <= 0)
-		goto tickerr;
+		return 2;
 	slen += mlen;
 	EVP_CIPHER_CTX_cleanup(&ctx);
 	p = sdec;
-		
+
 	sess = d2i_SSL_SESSION(NULL, &p, slen);
 	OPENSSL_free(sdec);
 	if (sess)
 		{
-		/* The session ID if non-empty is used by some clients to
- 		 * detect that the ticket has been accepted. So we copy it to
- 		 * the session structure. If it is empty set length to zero
- 		 * as required by standard.
- 		 */
+		/* The session ID, if non-empty, is used by some clients to
+		 * detect that the ticket has been accepted. So we copy it to
+		 * the session structure. If it is empty set length to zero
+		 * as required by standard.
+		 */
 		if (sesslen)
 			memcpy(sess->session_id, sess_id, sesslen);
 		sess->session_id_length = sesslen;
 		*psess = sess;
-		s->tlsext_ticket_expected = renew_ticket;
-		return 1;
+		if (renew_ticket)
+			return 4;
+		else
+			return 3;
 		}
-	/* If session decrypt failure indicate a cache miss and set state to
- 	 * send a new ticket
- 	 */
-	tickerr:	
-	s->tlsext_ticket_expected = 1;
+        ERR_clear_error();
+	/* For session parse failure, indicate that we need to send a new
+	 * ticket. */
+	return 2;
+	}
+
+/* Tables to translate from NIDs to TLS v1.2 ids */
+
+typedef struct 
+	{
+	int nid;
+	int id;
+	} tls12_lookup;
+
+static tls12_lookup tls12_md[] = {
+#ifndef OPENSSL_NO_MD5
+	{NID_md5, TLSEXT_hash_md5},
+#endif
+#ifndef OPENSSL_NO_SHA
+	{NID_sha1, TLSEXT_hash_sha1},
+#endif
+#ifndef OPENSSL_NO_SHA256
+	{NID_sha224, TLSEXT_hash_sha224},
+	{NID_sha256, TLSEXT_hash_sha256},
+#endif
+#ifndef OPENSSL_NO_SHA512
+	{NID_sha384, TLSEXT_hash_sha384},
+	{NID_sha512, TLSEXT_hash_sha512}
+#endif
+};
+
+static tls12_lookup tls12_sig[] = {
+#ifndef OPENSSL_NO_RSA
+	{EVP_PKEY_RSA, TLSEXT_signature_rsa},
+#endif
+#ifndef OPENSSL_NO_DSA
+	{EVP_PKEY_DSA, TLSEXT_signature_dsa},
+#endif
+#ifndef OPENSSL_NO_ECDSA
+	{EVP_PKEY_EC, TLSEXT_signature_ecdsa}
+#endif
+};
+
+static int tls12_find_id(int nid, tls12_lookup *table, size_t tlen)
+	{
+	size_t i;
+	for (i = 0; i < tlen; i++)
+		{
+		if (table[i].nid == nid)
+			return table[i].id;
+		}
+	return -1;
+	}
+#if 0
+static int tls12_find_nid(int id, tls12_lookup *table, size_t tlen)
+	{
+	size_t i;
+	for (i = 0; i < tlen; i++)
+		{
+		if (table[i].id == id)
+			return table[i].nid;
+		}
+	return -1;
+	}
+#endif
+
+int tls12_get_sigandhash(unsigned char *p, const EVP_PKEY *pk, const EVP_MD *md)
+	{
+	int sig_id, md_id;
+	if (!md)
+		return 0;
+	md_id = tls12_find_id(EVP_MD_type(md), tls12_md,
+				sizeof(tls12_md)/sizeof(tls12_lookup));
+	if (md_id == -1)
+		return 0;
+	sig_id = tls12_get_sigid(pk);
+	if (sig_id == -1)
+		return 0;
+	p[0] = (unsigned char)md_id;
+	p[1] = (unsigned char)sig_id;
+	return 1;
+	}
+
+int tls12_get_sigid(const EVP_PKEY *pk)
+	{
+	return tls12_find_id(pk->type, tls12_sig,
+				sizeof(tls12_sig)/sizeof(tls12_lookup));
+	}
+
+const EVP_MD *tls12_get_hash(unsigned char hash_alg)
+	{
+	switch(hash_alg)
+		{
+#ifndef OPENSSL_NO_MD5
+		case TLSEXT_hash_md5:
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return NULL;
+#endif
+		return EVP_md5();
+#endif
+#ifndef OPENSSL_NO_SHA
+		case TLSEXT_hash_sha1:
+		return EVP_sha1();
+#endif
+#ifndef OPENSSL_NO_SHA256
+		case TLSEXT_hash_sha224:
+		return EVP_sha224();
+
+		case TLSEXT_hash_sha256:
+		return EVP_sha256();
+#endif
+#ifndef OPENSSL_NO_SHA512
+		case TLSEXT_hash_sha384:
+		return EVP_sha384();
+
+		case TLSEXT_hash_sha512:
+		return EVP_sha512();
+#endif
+		default:
+		return NULL;
+
+		}
+	}
+
+/* Set preferred digest for each key type */
+
+int tls1_process_sigalgs(SSL *s, const unsigned char *data, int dsize)
+	{
+	int i, idx;
+	const EVP_MD *md;
+	CERT *c = s->cert;
+	/* Extension ignored for TLS versions below 1.2 */
+	if (TLS1_get_version(s) < TLS1_2_VERSION)
+		return 1;
+	/* Should never happen */
+	if (!c)
+		return 0;
+
+	c->pkeys[SSL_PKEY_DSA_SIGN].digest = NULL;
+	c->pkeys[SSL_PKEY_RSA_SIGN].digest = NULL;
+	c->pkeys[SSL_PKEY_RSA_ENC].digest = NULL;
+	c->pkeys[SSL_PKEY_ECC].digest = NULL;
+
+	for (i = 0; i < dsize; i += 2)
+		{
+		unsigned char hash_alg = data[i], sig_alg = data[i+1];
+
+		switch(sig_alg)
+			{
+#ifndef OPENSSL_NO_RSA
+			case TLSEXT_signature_rsa:
+			idx = SSL_PKEY_RSA_SIGN;
+			break;
+#endif
+#ifndef OPENSSL_NO_DSA
+			case TLSEXT_signature_dsa:
+			idx = SSL_PKEY_DSA_SIGN;
+			break;
+#endif
+#ifndef OPENSSL_NO_ECDSA
+			case TLSEXT_signature_ecdsa:
+			idx = SSL_PKEY_ECC;
+			break;
+#endif
+			default:
+			continue;
+			}
+
+		if (c->pkeys[idx].digest == NULL)
+			{
+			md = tls12_get_hash(hash_alg);
+			if (md)
+				{
+				c->pkeys[idx].digest = md;
+				if (idx == SSL_PKEY_RSA_SIGN)
+					c->pkeys[SSL_PKEY_RSA_ENC].digest = md;
+				}
+			}
+
+		}
+
+
+	/* Set any remaining keys to default values. NOTE: if alg is not
+	 * supported it stays as NULL.
+	 */
+#ifndef OPENSSL_NO_DSA
+	if (!c->pkeys[SSL_PKEY_DSA_SIGN].digest)
+		c->pkeys[SSL_PKEY_DSA_SIGN].digest = EVP_dss1();
+#endif
+#ifndef OPENSSL_NO_RSA
+	if (!c->pkeys[SSL_PKEY_RSA_SIGN].digest)
+		{
+		c->pkeys[SSL_PKEY_RSA_SIGN].digest = EVP_sha1();
+		c->pkeys[SSL_PKEY_RSA_ENC].digest = EVP_sha1();
+		}
+#endif
+#ifndef OPENSSL_NO_ECDSA
+	if (!c->pkeys[SSL_PKEY_ECC].digest)
+		c->pkeys[SSL_PKEY_ECC].digest = EVP_ecdsa();
+#endif
+	return 1;
+	}
+
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+int
+tls1_process_heartbeat(SSL *s)
+	{
+	unsigned char *p = &s->s3->rrec.data[0], *pl;
+	unsigned short hbtype;
+	unsigned int payload;
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Read type and payload length first */
+	hbtype = *p++;
+	n2s(p, payload);
+	pl = p;
+
+	if (s->msg_callback)
+		s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT,
+			&s->s3->rrec.data[0], s->s3->rrec.length,
+			s, s->msg_callback_arg);
+
+	if (hbtype == TLS1_HB_REQUEST)
+		{
+		unsigned char *buffer, *bp;
+		int r;
+
+		/* Allocate memory for the response, size is 1 bytes
+		 * message type, plus 2 bytes payload length, plus
+		 * payload, plus padding
+		 */
+		buffer = OPENSSL_malloc(1 + 2 + payload + padding);
+		bp = buffer;
+		
+		/* Enter response type, length and copy payload */
+		*bp++ = TLS1_HB_RESPONSE;
+		s2n(payload, bp);
+		memcpy(bp, pl, payload);
+		bp += payload;
+		/* Random padding */
+		RAND_pseudo_bytes(bp, padding);
+
+		r = ssl3_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding);
+
+		if (r >= 0 && s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buffer, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		OPENSSL_free(buffer);
+
+		if (r < 0)
+			return r;
+		}
+	else if (hbtype == TLS1_HB_RESPONSE)
+		{
+		unsigned int seq;
+		
+		/* We only send sequence numbers (2 bytes unsigned int),
+		 * and 16 random bytes, so we just try to read the
+		 * sequence number */
+		n2s(pl, seq);
+		
+		if (payload == 18 && seq == s->tlsext_hb_seq)
+			{
+			s->tlsext_hb_seq++;
+			s->tlsext_hb_pending = 0;
+			}
+		}
+
 	return 0;
 	}
 
+int
+tls1_heartbeat(SSL *s)
+	{
+	unsigned char *buf, *p;
+	int ret;
+	unsigned int payload = 18; /* Sequence number + random bytes */
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Only send if peer supports and accepts HB requests... */
+	if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) ||
+	    s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS)
+		{
+		SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT);
+		return -1;
+		}
+
+	/* ...and there is none in flight yet... */
+	if (s->tlsext_hb_pending)
+		{
+		SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING);
+		return -1;
+		}
+		
+	/* ...and no handshake in progress. */
+	if (SSL_in_init(s) || s->in_handshake)
+		{
+		SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE);
+		return -1;
+		}
+		
+	/* Check if padding is too long, payload and padding
+	 * must not exceed 2^14 - 3 = 16381 bytes in total.
+	 */
+	OPENSSL_assert(payload + padding <= 16381);
+
+	/* Create HeartBeat message, we just use a sequence number
+	 * as payload to distuingish different messages and add
+	 * some random stuff.
+	 *  - Message Type, 1 byte
+	 *  - Payload Length, 2 bytes (unsigned int)
+	 *  - Payload, the sequence number (2 bytes uint)
+	 *  - Payload, random bytes (16 bytes uint)
+	 *  - Padding
+	 */
+	buf = OPENSSL_malloc(1 + 2 + payload + padding);
+	p = buf;
+	/* Message Type */
+	*p++ = TLS1_HB_REQUEST;
+	/* Payload length (18 bytes here) */
+	s2n(payload, p);
+	/* Sequence number */
+	s2n(s->tlsext_hb_seq, p);
+	/* 16 random bytes */
+	RAND_pseudo_bytes(p, 16);
+	p += 16;
+	/* Random padding */
+	RAND_pseudo_bytes(p, padding);
+
+	ret = ssl3_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding);
+	if (ret >= 0)
+		{
+		if (s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buf, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		s->tlsext_hb_pending = 1;
+		}
+		
+	OPENSSL_free(buf);
+
+	return ret;
+	}
 #endif

diff --git a/ssl/t1_meth.c b/ssl/t1_meth.c
index 6ce7c0b..53c807d 100644
--- a/ssl/t1_meth.c
+++ b/ssl/t1_meth.c

@@ -60,16 +60,28 @@
 #include <openssl/objects.h>
 #include "ssl_locl.h"
 
-static const SSL_METHOD *tls1_get_method(int ver);
 static const SSL_METHOD *tls1_get_method(int ver)
 	{
+	if (ver == TLS1_2_VERSION)
+		return TLSv1_2_method();
+	if (ver == TLS1_1_VERSION)
+		return TLSv1_1_method();
 	if (ver == TLS1_VERSION)
-		return(TLSv1_method());
-	else
-		return(NULL);
+		return TLSv1_method();
+	return NULL;
 	}
 
-IMPLEMENT_tls1_meth_func(TLSv1_method,
+IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_method,
+			ssl3_accept,
+			ssl3_connect,
+			tls1_get_method)
+
+IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_method,
+			ssl3_accept,
+			ssl3_connect,
+			tls1_get_method)
+
+IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_method,
 			ssl3_accept,
 			ssl3_connect,
 			tls1_get_method)

diff --git a/ssl/t1_srvr.c b/ssl/t1_srvr.c
index 42525e9..f1d1565 100644
--- a/ssl/t1_srvr.c
+++ b/ssl/t1_srvr.c

@@ -67,13 +67,26 @@
 static const SSL_METHOD *tls1_get_server_method(int ver);
 static const SSL_METHOD *tls1_get_server_method(int ver)
 	{
+	if (ver == TLS1_2_VERSION)
+		return TLSv1_2_server_method();
+	if (ver == TLS1_1_VERSION)
+		return TLSv1_1_server_method();
 	if (ver == TLS1_VERSION)
-		return(TLSv1_server_method());
-	else
-		return(NULL);
+		return TLSv1_server_method();
+	return NULL;
 	}
 
-IMPLEMENT_tls1_meth_func(TLSv1_server_method,
+IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_server_method,
+			ssl3_accept,
+			ssl_undefined_function,
+			tls1_get_server_method)
+
+IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_server_method,
+			ssl3_accept,
+			ssl_undefined_function,
+			tls1_get_server_method)
+
+IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_server_method,
 			ssl3_accept,
 			ssl_undefined_function,
 			tls1_get_server_method)

diff --git a/ssl/tls1.h b/ssl/tls1.h
index 76f368a..c39c267 100644
--- a/ssl/tls1.h
+++ b/ssl/tls1.h

@@ -159,10 +159,24 @@
 
 #define TLS1_ALLOW_EXPERIMENTAL_CIPHERSUITES	0
 
+#define TLS1_2_VERSION			0x0303
+#define TLS1_2_VERSION_MAJOR		0x03
+#define TLS1_2_VERSION_MINOR		0x03
+
+#define TLS1_1_VERSION			0x0302
+#define TLS1_1_VERSION_MAJOR		0x03
+#define TLS1_1_VERSION_MINOR		0x02
+
 #define TLS1_VERSION			0x0301
 #define TLS1_VERSION_MAJOR		0x03
 #define TLS1_VERSION_MINOR		0x01
 
+#define TLS1_get_version(s) \
+		((s->version >> 8) == TLS1_VERSION_MAJOR ? s->version : 0)
+
+#define TLS1_get_client_version(s) \
+		((s->client_version >> 8) == TLS1_VERSION_MAJOR ? s->client_version : 0)
+
 #define TLS1_AD_DECRYPTION_FAILED	21
 #define TLS1_AD_RECORD_OVERFLOW		22
 #define TLS1_AD_UNKNOWN_CA		48	/* fatal */
@@ -183,17 +197,42 @@
 #define TLS1_AD_BAD_CERTIFICATE_HASH_VALUE 114
 #define TLS1_AD_UNKNOWN_PSK_IDENTITY	115	/* fatal */
 
-/* ExtensionType values from RFC3546 / RFC4366 */
+/* ExtensionType values from RFC3546 / RFC4366 / RFC6066 */
 #define TLSEXT_TYPE_server_name			0
 #define TLSEXT_TYPE_max_fragment_length		1
 #define TLSEXT_TYPE_client_certificate_url	2
 #define TLSEXT_TYPE_trusted_ca_keys		3
 #define TLSEXT_TYPE_truncated_hmac		4
 #define TLSEXT_TYPE_status_request		5
+/* ExtensionType values from RFC4681 */
+#define TLSEXT_TYPE_user_mapping		6
+
+/* ExtensionType values from RFC5878 */
+#define TLSEXT_TYPE_client_authz		7
+#define TLSEXT_TYPE_server_authz		8
+
+/* ExtensionType values from RFC6091 */
+#define TLSEXT_TYPE_cert_type		9
+
 /* ExtensionType values from RFC4492 */
 #define TLSEXT_TYPE_elliptic_curves		10
 #define TLSEXT_TYPE_ec_point_formats		11
+
+/* ExtensionType value from RFC5054 */
+#define TLSEXT_TYPE_srp				12
+
+/* ExtensionType values from RFC5246 */
+#define TLSEXT_TYPE_signature_algorithms	13
+
+/* ExtensionType value from RFC5764 */
+#define TLSEXT_TYPE_use_srtp	14
+
+/* ExtensionType value from RFC5620 */
+#define TLSEXT_TYPE_heartbeat	15
+
+/* ExtensionType value from RFC4507 */
 #define TLSEXT_TYPE_session_ticket		35
+
 /* ExtensionType value from draft-rescorla-tls-opaque-prf-input-00.txt */
 #if 0 /* will have to be provided externally for now ,
        * i.e. build with -DTLSEXT_TYPE_opaque_prf_input=38183
@@ -221,12 +260,37 @@
 #define TLSEXT_ECPOINTFORMAT_ansiX962_compressed_char2	2
 #define TLSEXT_ECPOINTFORMAT_last			2
 
+/* Signature and hash algorithms from RFC 5246 */
+
+#define TLSEXT_signature_anonymous			0
+#define TLSEXT_signature_rsa				1
+#define TLSEXT_signature_dsa				2
+#define TLSEXT_signature_ecdsa				3
+
+#define TLSEXT_hash_none				0
+#define TLSEXT_hash_md5					1
+#define TLSEXT_hash_sha1				2
+#define TLSEXT_hash_sha224				3
+#define TLSEXT_hash_sha256				4
+#define TLSEXT_hash_sha384				5
+#define TLSEXT_hash_sha512				6
+
 #ifndef OPENSSL_NO_TLSEXT
 
 #define TLSEXT_MAXLEN_host_name 255
 
-const char *SSL_get_servername(const SSL *s, const int type) ;
-int SSL_get_servername_type(const SSL *s) ;
+const char *SSL_get_servername(const SSL *s, const int type);
+int SSL_get_servername_type(const SSL *s);
+/* SSL_export_keying_material exports a value derived from the master secret,
+ * as specified in RFC 5705. It writes |olen| bytes to |out| given a label and
+ * optional context. (Since a zero length context is allowed, the |use_context|
+ * flag controls whether a context is included.)
+ *
+ * It returns 1 on success and zero otherwise.
+ */
+int SSL_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	const char *label, size_t llen, const unsigned char *p, size_t plen,
+	int use_context);
 
 #define SSL_set_tlsext_host_name(s,name) \
 SSL_ctrl(s,SSL_CTRL_SET_TLSEXT_HOSTNAME,TLSEXT_NAMETYPE_host_name,(char *)name)
@@ -290,6 +354,16 @@
 #define SSL_CTX_set_tlsext_ticket_key_cb(ssl, cb) \
 SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_TLSEXT_HB_ENABLED				0x01
+#define SSL_TLSEXT_HB_DONT_SEND_REQUESTS	0x02
+#define SSL_TLSEXT_HB_DONT_RECV_REQUESTS	0x04
+
+#define SSL_get_tlsext_heartbeat_pending(ssl) \
+        SSL_ctrl((ssl),SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING,0,NULL)
+#define SSL_set_tlsext_heartbeat_no_requests(ssl, arg) \
+        SSL_ctrl((ssl),SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS,arg,NULL)
+#endif
 #endif
 
 /* PSK ciphersuites from 4279 */
@@ -327,6 +401,14 @@
 #define TLS1_CK_DHE_RSA_WITH_AES_256_SHA		0x03000039
 #define TLS1_CK_ADH_WITH_AES_256_SHA			0x0300003A
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_CK_RSA_WITH_NULL_SHA256			0x0300003B
+#define TLS1_CK_RSA_WITH_AES_128_SHA256			0x0300003C
+#define TLS1_CK_RSA_WITH_AES_256_SHA256			0x0300003D
+#define TLS1_CK_DH_DSS_WITH_AES_128_SHA256		0x0300003E
+#define TLS1_CK_DH_RSA_WITH_AES_128_SHA256		0x0300003F
+#define TLS1_CK_DHE_DSS_WITH_AES_128_SHA256		0x03000040
+
 /* Camellia ciphersuites from RFC4132 */
 #define TLS1_CK_RSA_WITH_CAMELLIA_128_CBC_SHA		0x03000041
 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_128_CBC_SHA	0x03000042
@@ -335,6 +417,16 @@
 #define TLS1_CK_DHE_RSA_WITH_CAMELLIA_128_CBC_SHA	0x03000045
 #define TLS1_CK_ADH_WITH_CAMELLIA_128_CBC_SHA		0x03000046
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_CK_DHE_RSA_WITH_AES_128_SHA256		0x03000067
+#define TLS1_CK_DH_DSS_WITH_AES_256_SHA256		0x03000068
+#define TLS1_CK_DH_RSA_WITH_AES_256_SHA256		0x03000069
+#define TLS1_CK_DHE_DSS_WITH_AES_256_SHA256		0x0300006A
+#define TLS1_CK_DHE_RSA_WITH_AES_256_SHA256		0x0300006B
+#define TLS1_CK_ADH_WITH_AES_128_SHA256			0x0300006C
+#define TLS1_CK_ADH_WITH_AES_256_SHA256			0x0300006D
+
+/* Camellia ciphersuites from RFC4132 */
 #define TLS1_CK_RSA_WITH_CAMELLIA_256_CBC_SHA		0x03000084
 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_256_CBC_SHA	0x03000085
 #define TLS1_CK_DH_RSA_WITH_CAMELLIA_256_CBC_SHA	0x03000086
@@ -350,6 +442,20 @@
 #define TLS1_CK_DHE_RSA_WITH_SEED_SHA                   0x0300009A
 #define TLS1_CK_ADH_WITH_SEED_SHA                	0x0300009B
 
+/* TLS v1.2 GCM ciphersuites from RFC5288 */
+#define TLS1_CK_RSA_WITH_AES_128_GCM_SHA256		0x0300009C
+#define TLS1_CK_RSA_WITH_AES_256_GCM_SHA384		0x0300009D
+#define TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256		0x0300009E
+#define TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384		0x0300009F
+#define TLS1_CK_DH_RSA_WITH_AES_128_GCM_SHA256		0x030000A0
+#define TLS1_CK_DH_RSA_WITH_AES_256_GCM_SHA384		0x030000A1
+#define TLS1_CK_DHE_DSS_WITH_AES_128_GCM_SHA256		0x030000A2
+#define TLS1_CK_DHE_DSS_WITH_AES_256_GCM_SHA384		0x030000A3
+#define TLS1_CK_DH_DSS_WITH_AES_128_GCM_SHA256		0x030000A4
+#define TLS1_CK_DH_DSS_WITH_AES_256_GCM_SHA384		0x030000A5
+#define TLS1_CK_ADH_WITH_AES_128_GCM_SHA256		0x030000A6
+#define TLS1_CK_ADH_WITH_AES_256_GCM_SHA384		0x030000A7
+
 /* ECC ciphersuites from draft-ietf-tls-ecc-12.txt with changes soon to be in draft 13 */
 #define TLS1_CK_ECDH_ECDSA_WITH_NULL_SHA                0x0300C001
 #define TLS1_CK_ECDH_ECDSA_WITH_RC4_128_SHA             0x0300C002
@@ -381,6 +487,38 @@
 #define TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA          0x0300C018
 #define TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA          0x0300C019
 
+/* SRP ciphersuites from RFC 5054 */
+#define TLS1_CK_SRP_SHA_WITH_3DES_EDE_CBC_SHA		0x0300C01A
+#define TLS1_CK_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA	0x0300C01B
+#define TLS1_CK_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA	0x0300C01C
+#define TLS1_CK_SRP_SHA_WITH_AES_128_CBC_SHA		0x0300C01D
+#define TLS1_CK_SRP_SHA_RSA_WITH_AES_128_CBC_SHA	0x0300C01E
+#define TLS1_CK_SRP_SHA_DSS_WITH_AES_128_CBC_SHA	0x0300C01F
+#define TLS1_CK_SRP_SHA_WITH_AES_256_CBC_SHA		0x0300C020
+#define TLS1_CK_SRP_SHA_RSA_WITH_AES_256_CBC_SHA	0x0300C021
+#define TLS1_CK_SRP_SHA_DSS_WITH_AES_256_CBC_SHA	0x0300C022
+
+/* ECDH HMAC based ciphersuites from RFC5289 */
+
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_SHA256         0x0300C023
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_SHA384         0x0300C024
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_SHA256          0x0300C025
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_SHA384          0x0300C026
+#define TLS1_CK_ECDHE_RSA_WITH_AES_128_SHA256           0x0300C027
+#define TLS1_CK_ECDHE_RSA_WITH_AES_256_SHA384           0x0300C028
+#define TLS1_CK_ECDH_RSA_WITH_AES_128_SHA256            0x0300C029
+#define TLS1_CK_ECDH_RSA_WITH_AES_256_SHA384            0x0300C02A
+
+/* ECDH GCM based ciphersuites from RFC5289 */
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256	0x0300C02B
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384	0x0300C02C
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_GCM_SHA256      0x0300C02D
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_GCM_SHA384      0x0300C02E
+#define TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256       0x0300C02F
+#define TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384       0x0300C030
+#define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256        0x0300C031
+#define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384        0x0300C032
+
 /* XXX
  * Inconsistency alert:
  * The OpenSSL names of ciphers with ephemeral DH here include the string
@@ -448,6 +586,17 @@
 #define TLS1_TXT_PSK_WITH_AES_128_CBC_SHA		"PSK-AES128-CBC-SHA"
 #define TLS1_TXT_PSK_WITH_AES_256_CBC_SHA		"PSK-AES256-CBC-SHA"
 
+/* SRP ciphersuite from RFC 5054 */
+#define TLS1_TXT_SRP_SHA_WITH_3DES_EDE_CBC_SHA		"SRP-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA	"SRP-RSA-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA	"SRP-DSS-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_WITH_AES_128_CBC_SHA		"SRP-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_128_CBC_SHA	"SRP-RSA-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_128_CBC_SHA	"SRP-DSS-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_WITH_AES_256_CBC_SHA		"SRP-AES-256-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_256_CBC_SHA	"SRP-RSA-AES-256-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_256_CBC_SHA	"SRP-DSS-AES-256-CBC-SHA"
+
 /* Camellia ciphersuites from RFC4132 */
 #define TLS1_TXT_RSA_WITH_CAMELLIA_128_CBC_SHA		"CAMELLIA128-SHA"
 #define TLS1_TXT_DH_DSS_WITH_CAMELLIA_128_CBC_SHA	"DH-DSS-CAMELLIA128-SHA"
@@ -471,6 +620,55 @@
 #define TLS1_TXT_DHE_RSA_WITH_SEED_SHA                  "DHE-RSA-SEED-SHA"
 #define TLS1_TXT_ADH_WITH_SEED_SHA                      "ADH-SEED-SHA"
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_TXT_RSA_WITH_NULL_SHA256			"NULL-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_128_SHA256		"AES128-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_256_SHA256		"AES256-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_128_SHA256		"DH-DSS-AES128-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_128_SHA256		"DH-RSA-AES128-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_128_SHA256		"DHE-DSS-AES128-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_128_SHA256		"DHE-RSA-AES128-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_256_SHA256		"DH-DSS-AES256-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_256_SHA256		"DH-RSA-AES256-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_256_SHA256		"DHE-DSS-AES256-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_256_SHA256		"DHE-RSA-AES256-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_128_SHA256		"ADH-AES128-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_256_SHA256		"ADH-AES256-SHA256"
+
+/* TLS v1.2 GCM ciphersuites from RFC5288 */
+#define TLS1_TXT_RSA_WITH_AES_128_GCM_SHA256		"AES128-GCM-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_256_GCM_SHA384		"AES256-GCM-SHA384"
+#define TLS1_TXT_DHE_RSA_WITH_AES_128_GCM_SHA256	"DHE-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_256_GCM_SHA384	"DHE-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_DH_RSA_WITH_AES_128_GCM_SHA256		"DH-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_256_GCM_SHA384		"DH-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_DHE_DSS_WITH_AES_128_GCM_SHA256	"DHE-DSS-AES128-GCM-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_256_GCM_SHA384	"DHE-DSS-AES256-GCM-SHA384"
+#define TLS1_TXT_DH_DSS_WITH_AES_128_GCM_SHA256		"DH-DSS-AES128-GCM-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_256_GCM_SHA384		"DH-DSS-AES256-GCM-SHA384"
+#define TLS1_TXT_ADH_WITH_AES_128_GCM_SHA256		"ADH-AES128-GCM-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_256_GCM_SHA384		"ADH-AES256-GCM-SHA384"
+
+/* ECDH HMAC based ciphersuites from RFC5289 */
+
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_SHA256    "ECDHE-ECDSA-AES128-SHA256"
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_SHA384    "ECDHE-ECDSA-AES256-SHA384"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_SHA256     "ECDH-ECDSA-AES128-SHA256"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_SHA384     "ECDH-ECDSA-AES256-SHA384"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_SHA256      "ECDHE-RSA-AES128-SHA256"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_SHA384      "ECDHE-RSA-AES256-SHA384"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_128_SHA256       "ECDH-RSA-AES128-SHA256"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_256_SHA384       "ECDH-RSA-AES256-SHA384"
+
+/* ECDH GCM based ciphersuites from RFC5289 */
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256    "ECDHE-ECDSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384    "ECDHE-ECDSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_GCM_SHA256     "ECDH-ECDSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_GCM_SHA384     "ECDH-ECDSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_GCM_SHA256      "ECDHE-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_GCM_SHA384      "ECDHE-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256       "ECDH-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384       "ECDH-RSA-AES256-GCM-SHA384"
 
 #define TLS_CT_RSA_SIGN			1
 #define TLS_CT_DSS_SIGN			2

diff --git a/ssl/tls_srp.c b/ssl/tls_srp.c
new file mode 100644
index 0000000..8512c4d
--- /dev/null
+++ b/ssl/tls_srp.c

@@ -0,0 +1,506 @@
+/* ssl/tls_srp.c */
+/* Written by Christophe Renou ([email protected]) with 
+ * the precious help of Peter Sylvester ([email protected]) 
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004-2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]).  This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+#include "ssl_locl.h"
+#ifndef OPENSSL_NO_SRP
+
+#include <openssl/rand.h>
+#include <openssl/srp.h>
+#include <openssl/err.h>
+
+int SSL_CTX_SRP_CTX_free(struct ssl_ctx_st *ctx)
+	{
+	if (ctx == NULL)
+		return 0;
+	OPENSSL_free(ctx->srp_ctx.login);
+	BN_free(ctx->srp_ctx.N);
+	BN_free(ctx->srp_ctx.g);
+	BN_free(ctx->srp_ctx.s);
+	BN_free(ctx->srp_ctx.B);
+	BN_free(ctx->srp_ctx.A);
+	BN_free(ctx->srp_ctx.a);
+	BN_free(ctx->srp_ctx.b);
+	BN_free(ctx->srp_ctx.v);
+	ctx->srp_ctx.TLS_ext_srp_username_callback = NULL;
+	ctx->srp_ctx.SRP_cb_arg = NULL;
+	ctx->srp_ctx.SRP_verify_param_callback = NULL;
+	ctx->srp_ctx.SRP_give_srp_client_pwd_callback = NULL;
+	ctx->srp_ctx.N = NULL;
+	ctx->srp_ctx.g = NULL;
+	ctx->srp_ctx.s = NULL;
+	ctx->srp_ctx.B = NULL;
+	ctx->srp_ctx.A = NULL;
+	ctx->srp_ctx.a = NULL;
+	ctx->srp_ctx.b = NULL;
+	ctx->srp_ctx.v = NULL;
+	ctx->srp_ctx.login = NULL;
+	ctx->srp_ctx.info = NULL;
+	ctx->srp_ctx.strength = SRP_MINIMAL_N;
+	ctx->srp_ctx.srp_Mask = 0;
+	return (1);
+	}
+
+int SSL_SRP_CTX_free(struct ssl_st *s)
+	{
+	if (s == NULL)
+		return 0;
+	OPENSSL_free(s->srp_ctx.login);
+	BN_free(s->srp_ctx.N);
+	BN_free(s->srp_ctx.g);
+	BN_free(s->srp_ctx.s);
+	BN_free(s->srp_ctx.B);
+	BN_free(s->srp_ctx.A);
+	BN_free(s->srp_ctx.a);
+	BN_free(s->srp_ctx.b);
+	BN_free(s->srp_ctx.v);
+	s->srp_ctx.TLS_ext_srp_username_callback = NULL;
+	s->srp_ctx.SRP_cb_arg = NULL;
+	s->srp_ctx.SRP_verify_param_callback = NULL;
+	s->srp_ctx.SRP_give_srp_client_pwd_callback = NULL;
+	s->srp_ctx.N = NULL;
+	s->srp_ctx.g = NULL;
+	s->srp_ctx.s = NULL;
+	s->srp_ctx.B = NULL;
+	s->srp_ctx.A = NULL;
+	s->srp_ctx.a = NULL;
+	s->srp_ctx.b = NULL;
+	s->srp_ctx.v = NULL;
+	s->srp_ctx.login = NULL;
+	s->srp_ctx.info = NULL;
+	s->srp_ctx.strength = SRP_MINIMAL_N;
+	s->srp_ctx.srp_Mask = 0;
+	return (1);
+	}
+
+int SSL_SRP_CTX_init(struct ssl_st *s)
+	{
+	SSL_CTX *ctx;
+
+	if ((s == NULL) || ((ctx = s->ctx) == NULL))
+		return 0;
+	s->srp_ctx.SRP_cb_arg = ctx->srp_ctx.SRP_cb_arg;
+	/* set client Hello login callback */
+	s->srp_ctx.TLS_ext_srp_username_callback = ctx->srp_ctx.TLS_ext_srp_username_callback;
+	/* set SRP N/g param callback for verification */
+	s->srp_ctx.SRP_verify_param_callback = ctx->srp_ctx.SRP_verify_param_callback;
+	/* set SRP client passwd callback */
+	s->srp_ctx.SRP_give_srp_client_pwd_callback = ctx->srp_ctx.SRP_give_srp_client_pwd_callback;
+
+	s->srp_ctx.N = NULL;
+	s->srp_ctx.g = NULL;
+	s->srp_ctx.s = NULL;
+	s->srp_ctx.B = NULL;
+	s->srp_ctx.A = NULL;
+	s->srp_ctx.a = NULL;
+	s->srp_ctx.b = NULL;
+	s->srp_ctx.v = NULL;
+	s->srp_ctx.login = NULL;
+	s->srp_ctx.info = ctx->srp_ctx.info;
+	s->srp_ctx.strength = ctx->srp_ctx.strength;
+
+	if (((ctx->srp_ctx.N != NULL) &&
+		 ((s->srp_ctx.N = BN_dup(ctx->srp_ctx.N)) == NULL)) ||
+		((ctx->srp_ctx.g != NULL) &&
+		 ((s->srp_ctx.g = BN_dup(ctx->srp_ctx.g)) == NULL)) ||
+		((ctx->srp_ctx.s != NULL) &&
+		 ((s->srp_ctx.s = BN_dup(ctx->srp_ctx.s)) == NULL)) ||
+		((ctx->srp_ctx.B != NULL) &&
+		 ((s->srp_ctx.B = BN_dup(ctx->srp_ctx.B)) == NULL)) ||
+		((ctx->srp_ctx.A != NULL) &&
+		 ((s->srp_ctx.A = BN_dup(ctx->srp_ctx.A)) == NULL)) ||
+		((ctx->srp_ctx.a != NULL) &&
+		 ((s->srp_ctx.a = BN_dup(ctx->srp_ctx.a)) == NULL)) ||
+		((ctx->srp_ctx.v != NULL) &&
+		 ((s->srp_ctx.v = BN_dup(ctx->srp_ctx.v)) == NULL)) ||
+		((ctx->srp_ctx.b != NULL) &&
+		 ((s->srp_ctx.b = BN_dup(ctx->srp_ctx.b)) == NULL)))
+		{
+		SSLerr(SSL_F_SSL_SRP_CTX_INIT,ERR_R_BN_LIB);
+		goto err;
+		}
+	if ((ctx->srp_ctx.login != NULL) && 
+		((s->srp_ctx.login = BUF_strdup(ctx->srp_ctx.login)) == NULL))
+		{
+		SSLerr(SSL_F_SSL_SRP_CTX_INIT,ERR_R_INTERNAL_ERROR);
+		goto err;
+		}
+	s->srp_ctx.srp_Mask = ctx->srp_ctx.srp_Mask;
+
+	return (1);
+err:
+	OPENSSL_free(s->srp_ctx.login);
+	BN_free(s->srp_ctx.N);
+	BN_free(s->srp_ctx.g);
+	BN_free(s->srp_ctx.s);
+	BN_free(s->srp_ctx.B);
+	BN_free(s->srp_ctx.A);
+	BN_free(s->srp_ctx.a);
+	BN_free(s->srp_ctx.b);
+	BN_free(s->srp_ctx.v);
+	return (0);
+	}
+
+int SSL_CTX_SRP_CTX_init(struct ssl_ctx_st *ctx)
+	{
+	if (ctx == NULL)
+		return 0;
+
+	ctx->srp_ctx.SRP_cb_arg = NULL;
+	/* set client Hello login callback */
+	ctx->srp_ctx.TLS_ext_srp_username_callback = NULL;
+	/* set SRP N/g param callback for verification */
+	ctx->srp_ctx.SRP_verify_param_callback = NULL;
+	/* set SRP client passwd callback */
+	ctx->srp_ctx.SRP_give_srp_client_pwd_callback = NULL;
+
+	ctx->srp_ctx.N = NULL;
+	ctx->srp_ctx.g = NULL;
+	ctx->srp_ctx.s = NULL;
+	ctx->srp_ctx.B = NULL;
+	ctx->srp_ctx.A = NULL;
+	ctx->srp_ctx.a = NULL;
+	ctx->srp_ctx.b = NULL;
+	ctx->srp_ctx.v = NULL;
+	ctx->srp_ctx.login = NULL;
+	ctx->srp_ctx.srp_Mask = 0;
+	ctx->srp_ctx.info = NULL;
+	ctx->srp_ctx.strength = SRP_MINIMAL_N;
+
+	return (1);
+	}
+
+/* server side */
+int SSL_srp_server_param_with_username(SSL *s, int *ad)
+	{
+	unsigned char b[SSL_MAX_MASTER_KEY_LENGTH];
+	int al;
+
+	*ad = SSL_AD_UNKNOWN_PSK_IDENTITY;
+	if ((s->srp_ctx.TLS_ext_srp_username_callback !=NULL) &&
+		((al = s->srp_ctx.TLS_ext_srp_username_callback(s, ad, s->srp_ctx.SRP_cb_arg))!=SSL_ERROR_NONE))
+			return al;
+
+	*ad = SSL_AD_INTERNAL_ERROR;
+	if ((s->srp_ctx.N == NULL) ||
+		(s->srp_ctx.g == NULL) ||
+		(s->srp_ctx.s == NULL) ||
+		(s->srp_ctx.v == NULL))
+		return SSL3_AL_FATAL;
+
+	RAND_bytes(b, sizeof(b));
+	s->srp_ctx.b = BN_bin2bn(b,sizeof(b),NULL);
+	OPENSSL_cleanse(b,sizeof(b));
+
+	/* Calculate:  B = (kv + g^b) % N  */
+
+	return ((s->srp_ctx.B = SRP_Calc_B(s->srp_ctx.b, s->srp_ctx.N, s->srp_ctx.g, s->srp_ctx.v)) != NULL)?
+			SSL_ERROR_NONE:SSL3_AL_FATAL;
+	}
+
+/* If the server just has the raw password, make up a verifier entry on the fly */
+int SSL_set_srp_server_param_pw(SSL *s, const char *user, const char *pass, const char *grp)
+	{
+	SRP_gN *GN = SRP_get_default_gN(grp);
+	if(GN == NULL) return -1;
+	s->srp_ctx.N = BN_dup(GN->N);
+	s->srp_ctx.g = BN_dup(GN->g);
+	if(s->srp_ctx.v != NULL)
+		{
+		BN_clear_free(s->srp_ctx.v);
+		s->srp_ctx.v = NULL;
+		}
+	if(s->srp_ctx.s != NULL)
+		{
+		BN_clear_free(s->srp_ctx.s);
+		s->srp_ctx.s = NULL;
+		}
+	if(!SRP_create_verifier_BN(user, pass, &s->srp_ctx.s, &s->srp_ctx.v, GN->N, GN->g)) return -1;
+
+	return 1;
+	}
+
+int SSL_set_srp_server_param(SSL *s, const BIGNUM *N, const BIGNUM *g,
+			     BIGNUM *sa, BIGNUM *v, char *info)
+	{
+	if (N!= NULL)
+		{
+		if (s->srp_ctx.N != NULL)
+			{
+			if (!BN_copy(s->srp_ctx.N,N))
+				{
+				BN_free(s->srp_ctx.N);
+				s->srp_ctx.N = NULL;
+				}
+			}
+		else
+			s->srp_ctx.N = BN_dup(N);
+		}
+	if (g!= NULL)
+		{
+		if (s->srp_ctx.g != NULL)
+			{
+			if (!BN_copy(s->srp_ctx.g,g))
+				{
+				BN_free(s->srp_ctx.g);
+				s->srp_ctx.g = NULL;
+				}
+			}
+		else
+			s->srp_ctx.g = BN_dup(g);
+		}
+	if (sa!= NULL)
+		{
+		if (s->srp_ctx.s != NULL)
+			{
+			if (!BN_copy(s->srp_ctx.s,sa))
+				{
+				BN_free(s->srp_ctx.s);
+				s->srp_ctx.s = NULL;
+				}
+			}
+		else
+			s->srp_ctx.s = BN_dup(sa);
+		}
+	if (v!= NULL)
+		{
+		if (s->srp_ctx.v != NULL)
+			{
+			if (!BN_copy(s->srp_ctx.v,v))
+				{
+				BN_free(s->srp_ctx.v);
+				s->srp_ctx.v = NULL;
+				}
+			}
+		else
+			s->srp_ctx.v = BN_dup(v);
+		}
+	s->srp_ctx.info = info;
+
+	if (!(s->srp_ctx.N) ||
+		!(s->srp_ctx.g) ||
+		!(s->srp_ctx.s) ||
+		!(s->srp_ctx.v))
+		return -1;
+
+	return 1;
+	}
+
+int SRP_generate_server_master_secret(SSL *s,unsigned char *master_key)
+	{
+	BIGNUM *K = NULL, *u = NULL;
+	int ret = -1, tmp_len;
+	unsigned char *tmp = NULL;
+
+	if (!SRP_Verify_A_mod_N(s->srp_ctx.A,s->srp_ctx.N))
+		goto err;
+	if (!(u = SRP_Calc_u(s->srp_ctx.A,s->srp_ctx.B,s->srp_ctx.N)))
+		goto err;
+	if (!(K = SRP_Calc_server_key(s->srp_ctx.A, s->srp_ctx.v, u, s->srp_ctx.b, s->srp_ctx.N)))
+		goto err;
+
+	tmp_len = BN_num_bytes(K);
+	if ((tmp = OPENSSL_malloc(tmp_len)) == NULL)
+		goto err;
+	BN_bn2bin(K, tmp);
+	ret = s->method->ssl3_enc->generate_master_secret(s,master_key,tmp,tmp_len);
+err:
+	if (tmp)
+		{
+		OPENSSL_cleanse(tmp,tmp_len) ;
+		OPENSSL_free(tmp);
+		}
+	BN_clear_free(K);
+	BN_clear_free(u);
+	return ret;
+	}
+
+/* client side */
+int SRP_generate_client_master_secret(SSL *s,unsigned char *master_key)
+	{
+	BIGNUM *x = NULL, *u = NULL, *K = NULL;
+	int ret = -1, tmp_len;
+	char *passwd = NULL;
+	unsigned char *tmp = NULL;
+
+	/* Checks if b % n == 0
+	 */
+	if (SRP_Verify_B_mod_N(s->srp_ctx.B,s->srp_ctx.N)==0) goto err;
+	if (!(u = SRP_Calc_u(s->srp_ctx.A,s->srp_ctx.B,s->srp_ctx.N))) goto err;
+	if (s->srp_ctx.SRP_give_srp_client_pwd_callback == NULL) goto err;
+	if (!(passwd = s->srp_ctx.SRP_give_srp_client_pwd_callback(s, s->srp_ctx.SRP_cb_arg))) goto err;
+	if (!(x = SRP_Calc_x(s->srp_ctx.s,s->srp_ctx.login,passwd))) goto err;
+	if (!(K = SRP_Calc_client_key(s->srp_ctx.N, s->srp_ctx.B, s->srp_ctx.g, x, s->srp_ctx.a, u))) goto err;
+
+	tmp_len = BN_num_bytes(K);
+	if ((tmp = OPENSSL_malloc(tmp_len)) == NULL) goto err;
+	BN_bn2bin(K, tmp);
+	ret = s->method->ssl3_enc->generate_master_secret(s,master_key,tmp,tmp_len);
+err:
+	if (tmp)
+		{
+		OPENSSL_cleanse(tmp,tmp_len) ;
+		OPENSSL_free(tmp);
+		}
+	BN_clear_free(K);
+	BN_clear_free(x);
+	if (passwd)
+		{
+		OPENSSL_cleanse(passwd,strlen(passwd)) ;
+		OPENSSL_free(passwd);
+		}
+	BN_clear_free(u);
+	return ret;
+	}
+
+int SRP_Calc_A_param(SSL *s)
+	{
+	unsigned char rnd[SSL_MAX_MASTER_KEY_LENGTH];
+
+	if (BN_num_bits(s->srp_ctx.N) < s->srp_ctx.strength)
+		return -1;
+
+	if (s->srp_ctx.SRP_verify_param_callback ==NULL && 
+		!SRP_check_known_gN_param(s->srp_ctx.g,s->srp_ctx.N))
+		return -1 ;
+
+	RAND_bytes(rnd, sizeof(rnd));
+	s->srp_ctx.a = BN_bin2bn(rnd, sizeof(rnd), s->srp_ctx.a);
+	OPENSSL_cleanse(rnd, sizeof(rnd));
+
+	if (!(s->srp_ctx.A = SRP_Calc_A(s->srp_ctx.a,s->srp_ctx.N,s->srp_ctx.g)))
+		return -1;
+
+	/* We can have a callback to verify SRP param!! */
+	if (s->srp_ctx.SRP_verify_param_callback !=NULL) 
+		return s->srp_ctx.SRP_verify_param_callback(s,s->srp_ctx.SRP_cb_arg);
+
+	return 1;
+	}
+
+BIGNUM *SSL_get_srp_g(SSL *s)
+	{
+	if (s->srp_ctx.g != NULL)
+		return s->srp_ctx.g;
+	return s->ctx->srp_ctx.g;
+	}
+
+BIGNUM *SSL_get_srp_N(SSL *s)
+	{
+	if (s->srp_ctx.N != NULL)
+		return s->srp_ctx.N;
+	return s->ctx->srp_ctx.N;
+	}
+
+char *SSL_get_srp_username(SSL *s)
+	{
+	if (s->srp_ctx.login != NULL)
+		return s->srp_ctx.login;
+	return s->ctx->srp_ctx.login;
+	}
+
+char *SSL_get_srp_userinfo(SSL *s)
+	{
+	if (s->srp_ctx.info != NULL)
+		return s->srp_ctx.info;
+	return s->ctx->srp_ctx.info;
+	}
+
+#define tls1_ctx_ctrl ssl3_ctx_ctrl
+#define tls1_ctx_callback_ctrl ssl3_ctx_callback_ctrl
+
+int SSL_CTX_set_srp_username(SSL_CTX *ctx,char *name)
+	{
+	return tls1_ctx_ctrl(ctx,SSL_CTRL_SET_TLS_EXT_SRP_USERNAME,0,name);
+	}
+
+int SSL_CTX_set_srp_password(SSL_CTX *ctx,char *password)
+	{
+	return tls1_ctx_ctrl(ctx,SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD,0,password);
+	}
+
+int SSL_CTX_set_srp_strength(SSL_CTX *ctx, int strength)
+	{
+	return tls1_ctx_ctrl(ctx, SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH, strength,
+			     NULL);
+	}
+
+int SSL_CTX_set_srp_verify_param_callback(SSL_CTX *ctx, int (*cb)(SSL *,void *))
+	{
+	return tls1_ctx_callback_ctrl(ctx,SSL_CTRL_SET_SRP_VERIFY_PARAM_CB,
+				      (void (*)(void))cb);
+	}
+
+int SSL_CTX_set_srp_cb_arg(SSL_CTX *ctx, void *arg)
+	{
+	return tls1_ctx_ctrl(ctx,SSL_CTRL_SET_SRP_ARG,0,arg);
+	}
+
+int SSL_CTX_set_srp_username_callback(SSL_CTX *ctx,
+				      int (*cb)(SSL *,int *,void *))
+	{
+	return tls1_ctx_callback_ctrl(ctx,SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB,
+				      (void (*)(void))cb);
+	}
+
+int SSL_CTX_set_srp_client_pwd_callback(SSL_CTX *ctx, char *(*cb)(SSL *,void *))
+	{
+	return tls1_ctx_callback_ctrl(ctx,SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB,
+				      (void (*)(void))cb);
+	}
+
+#endif
commit	392aa7cc7d2b122614c5393c3e357da07fd07af3	[log] [tgz]
author	Brian Carlstrom <[email protected]>	Thu Mar 15 16:03:43 2012 -0700
committer	Brian Carlstrom <[email protected]>	Wed Mar 21 11:09:32 2012 -0700
tree	69f0b217fb624fdc56abb9f659c9bdea1b1865aa
parent	7f1d63479ce92a2a4a0874b007e49f8acb13a0d9 [diff]