Import CRYPTOGAMS ASM code (from OpenSSL master for now)

author David Woodhouse <dwmw2@infradead.org>

Thu, 11 Apr 2019 21:44:48 +0000 (00:44 +0300)

committer David Woodhouse <dwmw2@infradead.org>

Mon, 15 Apr 2019 17:12:24 +0000 (18:12 +0100)
author David Woodhouse <dwmw2@infradead.org>
Thu, 11 Apr 2019 21:44:48 +0000 (00:44 +0300)
committer David Woodhouse <dwmw2@infradead.org>
Mon, 15 Apr 2019 17:12:24 +0000 (18:12 +0100)
diff --git a/Makefile.am b/Makefile.am

index b6d26309bedd1a8e34d504a82fc568c0762626db..bb43b8c38a4f098cf2aff38ccabe2ea39aa8b8fd 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -42,9 +42,10 @@ lib_srcs_yubikey = yubikey.c
  lib_srcs_stoken = stoken.c
  lib_srcs_esp = esp.c esp-seqno.c
  lib_srcs_dtls = dtls.c
+lib_srcs_aesni = aesni-esp.c aesni-esp.h
  
  POTFILES = $(openconnect_SOURCES) $(lib_srcs_cisco) $(lib_srcs_juniper) $(lib_srcs_globalprotect) \
-          gnutls-esp.c gnutls-dtls.c openssl-esp.c openssl-dtls.c \
+          gnutls-esp.c gnutls-dtls.c openssl-esp.c openssl-dtls.c $(lib_srcs_aesni) \
            $(lib_srcs_esp) $(lib_srcs_dtls) gnutls_tpm2_esys.c gnutls_tpm2_ibm.c \
            $(lib_srcs_openssl) $(lib_srcs_gnutls) $(library_srcs) \
            $(lib_srcs_win32) $(lib_srcs_posix) $(lib_srcs_gssapi) $(lib_srcs_iconv) \
@@ -79,6 +80,9 @@ endif
  if OPENCONNECT_DTLS
  lib_srcs_cisco += $(lib_srcs_dtls)
  endif
+if OPENCONNECT_AESNI
+lib_srcs_esp += x86_64cpuid.s aesni-sha1-x86_64.s aesni-x86_64.s sha1-x86_64.s $(lib_srcs_aesni)
+endif
  if OPENCONNECT_ESP
  lib_srcs_juniper += $(lib_srcs_esp)
  endif
@@ -139,6 +143,11 @@ pkglibexec_SCRIPTS = trojans/csd-post.sh trojans/csd-wrapper.sh trojans/hiprepor
  # main.c includes version.c
  openconnect-main.$(OBJEXT): version.c
  
+# s/OPENSSL/OPENCONNECT/ here so instead of doing it in the .pl files, so
+# that they can remain pristine copies from upstream.
+%.s: aesni/%.pl
+       CC=$(CC) $(PERL) $< | sed s/OPENSSL_ia32/OPENCONNECT_ia32/g > $@
+
  version.c: $(library_srcs) $(lib_openssl_srcs) $(lib_gnutls_srcs) \
            $(openconnect_SOURCES) Makefile.am configure.ac \
            openconnect.h openconnect-internal.h version.sh @GITVERSIONDEPS@
diff --git a/aesni-esp.c b/aesni-esp.c

new file mode 100644 (file)

index 0000000..a7b2362
--- /dev/null
+++ b/aesni-esp.c
@@ -0,0 +1,48 @@
+/*
+ * OpenConnect (SSL + DTLS) VPN client
+ *
+ * Copyright © 2019 David Woodhouse
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <config.h>
+
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "openconnect-internal.h"
+
+#include "aesni-esp.h"
+
+uint64_t OPENCONNECT_ia32cap_P[2];
+
+int aesni_init_esp_ciphers(struct openconnect_info *vpninfo,
+                          struct esp *esp_out, struct esp *esp_in)
+{
+       if (!(OPENCONNECT_ia32cap_P[0] & (1<<10))) {
+               uint64_t cap = OPENCONNECT_ia32_cpuid(OPENCONNECT_ia32cap_P);
+
+               OPENCONNECT_ia32cap_P[0] = cap | (1<<10);
+
+               vpn_progress(vpninfo, PRG_DEBUG,
+                            _("CPU capabilities: %08lx %08lx %08lx %08lx\n"),
+                            OPENCONNECT_ia32cap_P[0] & 0xffffffff,
+                            OPENCONNECT_ia32cap_P[0] >> 32,
+                            OPENCONNECT_ia32cap_P[1] & 0xffffffff,
+                            OPENCONNECT_ia32cap_P[1] >> 32);
+       }
+
+       return -EINVAL;
+}
diff --git a/aesni-esp.h b/aesni-esp.h

new file mode 100644 (file)

index 0000000..7589389
--- /dev/null
+++ b/aesni-esp.h
@@ -0,0 +1,61 @@
+/*
+ * OpenConnect (SSL + DTLS) VPN client
+ *
+ * Copyright © 2019 David Woodhouse
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#ifndef __AESNI_ESP_H__
+#define __AESNI_ESP_H__
+
+/* ABI definitions for the CRYPTOGAMS routines */
+
+#define AES_MAXKEYBITS 256
+#define AES_MAXROUNDS 14
+#define AES_BLOCK 16
+
+struct aesni_key {
+       uint32_t rd_key[4 * (AES_MAXROUNDS + 1)];
+       int rounds;
+};
+
+/* Not literally AES-NI but we are only using this in the context of the
+   stitched AES-NI + SHA1 routines. */
+
+#define SHA1_BLOCK 64
+
+struct aesni_sha1 {
+       uint32_t h0, h1, h2, h3, h4;
+       uint64_t N; /* The CRYPTOGAMS routines don't touch this */
+};
+
+
+int aesni_set_encrypt_key (const unsigned char *userKey, int bits,
+                          struct aesni_key *key);
+int aesni_set_decrypt_key (const unsigned char *userKey, int bits,
+                          struct aesni_key *key);
+
+void aesni_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const struct aesni_key *key,
+                      unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc(const void *inp, void *out, size_t blocks,
+                        const struct aesni_key *key, unsigned char iv[16],
+                        const struct aesni_sha1 *ctx, const void *in0);
+
+void sha1_block_data_order(struct aesni_sha1 *ctx, const void *p,
+                          size_t n_blocks);
+
+uint64_t OPENCONNECT_ia32_cpuid(uint64_t *cap);
+
+#endif /* AESNI_ESP_H__ */
diff --git a/aesni/aesni-sha1-x86_64.pl b/aesni/aesni-sha1-x86_64.pl

new file mode 100644 (file)

index 0000000..eb8364f
--- /dev/null
+++ b/aesni/aesni-sha1-x86_64.pl
@@ -0,0 +1,2140 @@
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibit *very* low instruction-level
+# parallelism, interleaving it with another algorithm would allow to
+# utilize processor resources better and achieve better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# AESNI code is weaved into it. Below are performance numbers in
+# cycles per processed byte, less is better, for standalone AESNI-CBC
+# encrypt, sum of the latter and standalone SHA1, and "stitched"
+# subroutine:
+#
+#              AES-128-CBC     +SHA1           stitch      gain
+# Westmere     3.77[+5.3]      9.07            6.55        +38%
+# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15)    5.98(7.05)  +68%(+58%)
+# Ivy Bridge   5.05[+4.6]      9.65            5.54        +74%
+# Haswell      4.43[+3.6(4.2)] 8.00(8.58)      4.55(5.21)  +75%(+65%)
+# Skylake      2.63[+3.5(4.1)] 6.17(6.69)      4.23(4.44)  +46%(+51%)
+# Bulldozer    5.77[+6.0]      11.72           6.37        +84%
+# Ryzen(**)    2.71[+1.93]     4.64            2.74        +69%
+# Goldmont(**) 3.82[+1.70]     5.52            4.20        +31%
+#
+#              AES-192-CBC
+# Westmere     4.51            9.81            6.80        +44%
+# Sandy Bridge 6.05            11.06(12.15)    6.11(7.19)  +81%(+69%)
+# Ivy Bridge   6.05            10.65           6.07        +75%
+# Haswell      5.29            8.86(9.44)      5.32(5.32)  +67%(+77%)
+# Bulldozer    6.89            12.84           6.96        +84%
+#
+#              AES-256-CBC
+# Westmere     5.25            10.55           7.21        +46%
+# Sandy Bridge 7.05            12.06(13.15)    7.12(7.72)  +69%(+70%)
+# Ivy Bridge   7.05            11.65           7.12        +64%
+# Haswell      6.19            9.76(10.34)     6.21(6.25)  +57%(+65%)
+# Skylake      3.62            7.16(7.68)      4.56(4.76)  +57%(+61%)
+# Bulldozer    8.00            13.95           8.25        +69%
+# Ryzen(**)    3.71            5.64            3.72        +52%
+# Goldmont(**) 5.35            7.05            5.76        +22%
+#
+# (*)  There are two code paths: SSSE3 and AVX. See sha1-568.pl for
+#      background information. Above numbers in parentheses are SSSE3
+#      results collected on AVX-capable CPU, i.e. apply on OSes that
+#      don't support AVX.
+# (**) SHAEXT results.
+#
+# Needless to mention that it makes no sense to implement "stitched"
+# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
+# fully utilize parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+#              AES-128-CBC     AES-192-CBC     AES-256-CBC
+# Westmere     1.25            1.50            1.75
+# Sandy Bridge 0.74            0.91            1.09
+# Ivy Bridge   0.74            0.90            1.11
+# Haswell      0.63            0.76            0.88
+# Bulldozer    0.70            0.85            0.99
+
+# And indeed:
+#
+#              AES-256-CBC     +SHA1           stitch      gain
+# Westmere     1.75            7.20            6.68        +7.8%
+# Sandy Bridge 1.09            6.09(7.22)      5.82(6.95)  +4.6%(+3.9%)
+# Ivy Bridge   1.11            5.70            5.45        +4.6%
+# Haswell      0.88            4.45(5.00)      4.39(4.69)  +1.4%(*)(+6.6%)
+# Bulldozer    0.99            6.95            5.95        +17%(**)
+#
+# (*)  Tiny improvement coefficient on Haswell is because we compare
+#      AVX1 stitch to sum with AVX2 SHA1.
+# (**) Execution is fully dominated by integer code sequence and
+#      SIMD still hardly shows [in single-process benchmark;-]
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+          $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+          $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+          `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+          $1>=10);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+
+$shaext=1;     ### set to zero if compiling for 1.0.1
+
+$stitched_decrypt=0;
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+# void aesni_cbc_sha1_enc(const void *inp,
+#                      void *out,
+#                      size_t length,
+#                      const AES_KEY *key,
+#                      unsigned char *iv,
+#                      SHA_CTX *ctx,
+#                      const void *in0);
+
+$code.=<<___;
+.text
+.extern        OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type  aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 32
+aesni_cbc_sha1_enc:
+       # caller should check for SSSE3 and AES-NI bits
+       mov     OPENSSL_ia32cap_P+0(%rip),%r10d
+       mov     OPENSSL_ia32cap_P+4(%rip),%r11
+___
+$code.=<<___ if ($shaext);
+       bt      \$61,%r11               # check SHA bit
+       jc      aesni_cbc_sha1_enc_shaext
+___
+$code.=<<___ if ($avx);
+       and     \$`1<<28`,%r11d         # mask AVX bit
+       and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
+       or      %r11d,%r10d
+       cmp     \$`1<<28|1<<30`,%r10d
+       je      aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+       jmp     aesni_cbc_sha1_enc_ssse3
+       ret
+.size  aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");   # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
+my $K_XX_XX="%r11";
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));                  # for enc
+my @rndkey=("%xmm14","%xmm15");                                        # for enc
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));   # for dec
+
+if (1) {       # reassign for Atom Silvermont
+    # The goal is to minimize amount of instructions with more than
+    # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
+    # SSSE3 instructions to upper half of the register bank.
+    @X=map("%xmm$_",(8..11,4..7));
+    @Tx=map("%xmm$_",(12,13,3));
+    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+    @rndkey=("%xmm0","%xmm1");
+}
+
+sub AUTOLOAD()         # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type  aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 32
+aesni_cbc_sha1_enc_ssse3:
+.cfi_startproc
+       mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
+       #shr    \$6,$len                        # debugging artefact
+       #jz     .Lepilogue_ssse3                # debugging artefact
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       push    %r15
+.cfi_push      %r15
+       lea     `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+       #mov    $in0,$inp                       # debugging artefact
+       #lea    64(%rsp),$ctx                   # debugging artefact
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,96+0(%rsp)
+       movaps  %xmm7,96+16(%rsp)
+       movaps  %xmm8,96+32(%rsp)
+       movaps  %xmm9,96+48(%rsp)
+       movaps  %xmm10,96+64(%rsp)
+       movaps  %xmm11,96+80(%rsp)
+       movaps  %xmm12,96+96(%rsp)
+       movaps  %xmm13,96+112(%rsp)
+       movaps  %xmm14,96+128(%rsp)
+       movaps  %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+       mov     $in0,%r12                       # reassign arguments
+       mov     $out,%r13
+       mov     $len,%r14
+       lea     112($key),%r15                  # size optimization
+       movdqu  ($ivp),$iv                      # load IV
+       mov     $ivp,88(%rsp)                   # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15));    # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+       shl     \$6,$len
+       sub     $in0,$out
+       mov     240-112($key),$rounds
+       add     $inp,$len               # end of input
+
+       lea     K_XX_XX(%rip),$K_XX_XX
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+       mov     $C,@T[1]
+       xor     $D,@T[1]
+       and     @T[1],@T[0]
+
+       movdqa  64($K_XX_XX),@Tx[2]     # pbswap mask
+       movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
+       movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       movdqu  16($inp),@X[-3&7]
+       movdqu  32($inp),@X[-2&7]
+       movdqu  48($inp),@X[-1&7]
+       pshufb  @Tx[2],@X[-4&7]         # byte swap
+       pshufb  @Tx[2],@X[-3&7]
+       pshufb  @Tx[2],@X[-2&7]
+       add     \$64,$inp
+       paddd   @Tx[1],@X[-4&7]         # add K_00_19
+       pshufb  @Tx[2],@X[-1&7]
+       paddd   @Tx[1],@X[-3&7]
+       paddd   @Tx[1],@X[-2&7]
+       movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
+       psubd   @Tx[1],@X[-4&7]         # restore X[]
+       movdqa  @X[-3&7],16(%rsp)
+       psubd   @Tx[1],@X[-3&7]
+       movdqa  @X[-2&7],32(%rsp)
+       psubd   @Tx[1],@X[-2&7]
+       movups  -112($key),$rndkey0     # $key[0]
+       movups  16-112($key),$rndkey[0] # forward reference
+       jmp     .Loop_ssse3
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+       movups          `16*$n`($in0),$in               # load input
+       xorps           $rndkey0,$in
+___
+      $code.=<<___ if ($n);
+       movups          $iv,`16*($n-1)`($out,$in0)      # write output
+___
+      $code.=<<___;
+       xorps           $in,$iv
+       movups          `32+16*$k-112`($key),$rndkey[1]
+       aesenc          $rndkey[0],$iv
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+       cmp             \$11,$rounds
+       jb              .Laesenclast$sn
+       movups          `32+16*($k+0)-112`($key),$rndkey[1]
+       aesenc          $rndkey[0],$iv
+       movups          `32+16*($k+1)-112`($key),$rndkey[0]
+       aesenc          $rndkey[1],$iv
+       je              .Laesenclast$sn
+       movups          `32+16*($k+2)-112`($key),$rndkey[1]
+       aesenc          $rndkey[0],$iv
+       movups          `32+16*($k+3)-112`($key),$rndkey[0]
+       aesenc          $rndkey[1],$iv
+.Laesenclast$sn:
+       aesenclast      $rndkey[0],$iv
+       movups          16-112($key),$rndkey[1]         # forward reference
+___
+    } else {
+      $code.=<<___;
+       movups          `32+16*$k-112`($key),$rndkey[1]
+       aesenc          $rndkey[0],$iv
+___
+    }
+    $r++;      unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_ssse3_16_31()              # recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));           # ror
+       &pshufd (@X[0],@X[-4&7],0xee);  # was &movdqa   (@X[0],@X[-3&7]);
+        eval(shift(@insns));
+       &movdqa (@Tx[0],@X[-1&7]);
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &punpcklqdq(@X[0],@X[-3&7]);    # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+       &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+       &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &movdqa (@Tx[2],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+       &movdqa (@Tx[0],@X[0]);
+        eval(shift(@insns));
+
+       &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
+       &paddd  (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &psrld  (@Tx[0],31);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+       &movdqa (@Tx[1],@Tx[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &psrld  (@Tx[2],30);
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+       &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pslld  (@Tx[1],2);
+       &pxor   (@X[0],@Tx[2]);
+        eval(shift(@insns));
+         &movdqa       (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
+       &pshufd (@Tx[1],@X[-1&7],0xee)  if ($Xi==7);    # was &movdqa   (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
+
+        foreach (@insns) { eval; }     # remaining instructions [if any]
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 to 44 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns))            if ($Xi==8);
+       &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
+        eval(shift(@insns))            if ($Xi==8);
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns))            if (@insns[1] =~ /_ror/);
+        eval(shift(@insns))            if (@insns[0] =~ /_ror/);
+       &punpcklqdq(@Tx[0],@X[-1&7]);   # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+       if ($Xi%5) {
+         &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+       } else {                        # ... or load next one
+         &movdqa       (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+       }
+        eval(shift(@insns));           # ror
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns))            if (@insns[0] =~ /_ror/);
+
+       &movdqa (@Tx[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+        eval(shift(@insns));           # body_20_39
+
+       &pslld  (@X[0],2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &psrld  (@Tx[0],30);
+        eval(shift(@insns))            if (@insns[0] =~ /_rol/);# rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+
+       &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
+        eval(shift(@insns));
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns))            if (@insns[1] =~ /_rol/);
+        eval(shift(@insns))            if (@insns[0] =~ /_rol/);
+         &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19);    # was &movdqa   (@Tx[1],@X[0])
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+
+        foreach (@insns) { eval; }     # remaining instructions
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+        foreach (@insns) { eval; }             # remaining instructions
+
+       &cmp    ($inp,$len);
+       &je     (shift);
+
+       unshift(@Tx,pop(@Tx));
+
+       &movdqa (@Tx[2],"64($K_XX_XX)");        # pbswap mask
+       &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
+       &movdqu (@X[-4&7],"0($inp)");           # load input
+       &movdqu (@X[-3&7],"16($inp)");
+       &movdqu (@X[-2&7],"32($inp)");
+       &movdqu (@X[-1&7],"48($inp)");
+       &pshufb (@X[-4&7],@Tx[2]);              # byte swap
+       &add    ($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &pshufb (@X[($Xi-3)&7],@Tx[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &paddd  (@X[($Xi-4)&7],@Tx[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &psubd  (@X[($Xi-4)&7],@Tx[1]);
+
+       foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+       foreach (@insns) { eval; }
+}
+
+my @body_00_19 = (
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&$_ror ($b,$j?7:2);',  # $b>>>2
+       '&xor   (@T[0],$d);',
+       '&mov   (@T[1],$a);',   # $b for next round
+
+       '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+       '&xor   ($b,$c);',      # $c^$d for next round
+
+       '&$_rol ($a,5);',
+       '&add   ($e,@T[0]);',
+       '&and   (@T[1],$b);',   # ($b&($c^$d)) for next round
+
+       '&xor   ($b,$c);',      # restore $b
+       '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+
+sub body_00_19 () {    # ((c^d)&b)^d
+    # on start @T[0]=(c^d)&b
+    return &body_20_39() if ($rx==19); $rx++;
+
+    use integer;
+    my ($k,$n);
+    my @r=@body_00_19;
+
+       $n = scalar(@r);
+       $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
+       @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);
+       $jj++;
+
+    return @r;
+}
+
+my @body_20_39 = (
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+       '&xor   (@T[0],$d)      if($j==19);'.
+       '&xor   (@T[0],$c)      if($j> 19);',   # ($b^$d^$c)
+       '&mov   (@T[1],$a);',   # $b for next round
+
+       '&$_rol ($a,5);',
+       '&add   ($e,@T[0]);',
+       '&xor   (@T[1],$c)      if ($j< 79);',  # $b^$d for next round
+
+       '&$_ror ($b,7);',       # $b>>>2
+       '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+
+sub body_20_39 () {    # b^d^c
+    # on entry @T[0]=b^d
+    return &body_40_59() if ($rx==39); $rx++;
+
+    use integer;
+    my ($k,$n);
+    my @r=@body_20_39;
+
+       $n = scalar(@r);
+       $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
+       @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=20);
+       $jj++;
+
+    return @r;
+}
+
+my @body_40_59 = (
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+       '&and   (@T[0],$c)      if ($j>=40);',  # (b^c)&(c^d)
+       '&xor   ($c,$d)         if ($j>=40);',  # restore $c
+
+       '&$_ror ($b,7);',       # $b>>>2
+       '&mov   (@T[1],$a);',   # $b for next round
+       '&xor   (@T[0],$c);',
+
+       '&$_rol ($a,5);',
+       '&add   ($e,@T[0]);',
+       '&xor   (@T[1],$c)      if ($j==59);'.
+       '&xor   (@T[1],$b)      if ($j< 59);',  # b^c for next round
+
+       '&xor   ($b,$c)         if ($j< 59);',  # c^d for next round
+       '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+
+sub body_40_59 () {    # ((b^c)&(c^d))^c
+    # on entry @T[0]=(b^c), (c^=d)
+    $rx++;
+
+    use integer;
+    my ($k,$n);
+    my @r=@body_40_59;
+
+       $n = scalar(@r);
+       $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
+       @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=40);
+       $jj++;
+
+    return @r;
+}
+$code.=<<___;
+.align 32
+.Loop_ssse3:
+___
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_32_79(\&body_00_19);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"
+
+                               $saved_j=$j; @saved_V=@V;
+                               $saved_r=$r; @saved_rndkey=@rndkey;
+
+       &Xloop_ssse3(\&body_20_39);
+       &Xloop_ssse3(\&body_20_39);
+       &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+       movups  $iv,48($out,$in0)               # write output
+       lea     64($in0),$in0
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $C,@T[1]
+       mov     $D,12($ctx)
+       xor     $D,@T[1]
+       mov     $E,16($ctx)
+       and     @T[1],@T[0]
+       jmp     .Loop_ssse3
+
+.Ldone_ssse3:
+___
+                               $jj=$j=$saved_j; @V=@saved_V;
+                               $r=$saved_r;     @rndkey=@saved_rndkey;
+
+       &Xtail_ssse3(\&body_20_39);
+       &Xtail_ssse3(\&body_20_39);
+       &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+       movups  $iv,48($out,$in0)               # write output
+       mov     88(%rsp),$ivp                   # restore $ivp
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       movups  $iv,($ivp)                      # write IV
+___
+$code.=<<___ if ($win64);
+       movaps  96+0(%rsp),%xmm6
+       movaps  96+16(%rsp),%xmm7
+       movaps  96+32(%rsp),%xmm8
+       movaps  96+48(%rsp),%xmm9
+       movaps  96+64(%rsp),%xmm10
+       movaps  96+80(%rsp),%xmm11
+       movaps  96+96(%rsp),%xmm12
+       movaps  96+112(%rsp),%xmm13
+       movaps  96+128(%rsp),%xmm14
+       movaps  96+144(%rsp),%xmm15
+___
+$code.=<<___;
+       lea     `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_def_cfa   %rsi,56
+       mov     0(%rsi),%r15
+.cfi_restore   %r15
+       mov     8(%rsi),%r14
+.cfi_restore   %r14
+       mov     16(%rsi),%r13
+.cfi_restore   %r13
+       mov     24(%rsi),%r12
+.cfi_restore   %r12
+       mov     32(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     40(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     48(%rsi),%rsp
+.cfi_def_cfa   %rsp,8
+.Lepilogue_ssse3:
+       ret
+.cfi_endproc
+.size  aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+                                               if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+# reassign for Atom Silvermont (see above)
+($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
+@X=map("%xmm$_",(8..13,6,7));
+@Tx=map("%xmm$_",(14,15,5));
+
+my @aes256_dec = (
+       '&movdqu($inout0,"0x00($in0)");',
+       '&movdqu($inout1,"0x10($in0)"); &pxor   ($inout0,$rndkey0);',
+       '&movdqu($inout2,"0x20($in0)"); &pxor   ($inout1,$rndkey0);',
+       '&movdqu($inout3,"0x30($in0)"); &pxor   ($inout2,$rndkey0);',
+
+       '&pxor  ($inout3,$rndkey0);     &movups ($rndkey0,"16-112($key)");',
+       '&movaps("64(%rsp)",@X[2]);',   # save IV, originally @X[3]
+       undef,undef
+       );
+for ($i=0;$i<13;$i++) {
+    push (@aes256_dec,(
+       '&aesdec        ($inout0,$rndkey0);',
+       '&aesdec        ($inout1,$rndkey0);',
+       '&aesdec        ($inout2,$rndkey0);',
+       '&aesdec        ($inout3,$rndkey0);     &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+       ));
+    push (@aes256_dec,(undef,undef))   if (($i>=3 && $i<=5) || $i>=11);
+    push (@aes256_dec,(undef,undef))   if ($i==5);
+}
+push(@aes256_dec,(
+       '&aesdeclast    ($inout0,$rndkey0);     &movups (@X[0],"0x00($in0)");',
+       '&aesdeclast    ($inout1,$rndkey0);     &movups (@X[1],"0x10($in0)");',
+       '&aesdeclast    ($inout2,$rndkey0);     &movups (@X[2],"0x20($in0)");',
+       '&aesdeclast    ($inout3,$rndkey0);     &movups (@X[3],"0x30($in0)");',
+
+       '&xorps         ($inout0,"64(%rsp)");   &movdqu ($rndkey0,"-112($key)");',
+       '&xorps         ($inout1,@X[0]);        &movups ("0x00($out,$in0)",$inout0);',
+       '&xorps         ($inout2,@X[1]);        &movups ("0x10($out,$in0)",$inout1);',
+       '&xorps         ($inout3,@X[2]);        &movups ("0x20($out,$in0)",$inout2);',
+
+       '&movups        ("0x30($out,$in0)",$inout3);'
+       ));
+
+sub body_00_19_dec () {        # ((c^d)&b)^d
+    # on start @T[0]=(c^d)&b
+    return &body_20_39_dec() if ($rx==19);
+
+    my @r=@body_00_19;
+
+       unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
+       $rx++;
+
+    return @r;
+}
+
+sub body_20_39_dec () {        # b^d^c
+    # on entry @T[0]=b^d
+    return &body_40_59_dec() if ($rx==39);
+
+    my @r=@body_20_39;
+
+       unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
+       $rx++;
+
+    return @r;
+}
+
+sub body_40_59_dec () {        # ((b^c)&(c^d))^c
+    # on entry @T[0]=(b^c), (c^=d)
+
+    my @r=@body_40_59;
+
+       unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
+       $rx++;
+
+    return @r;
+}
+
+$code.=<<___;
+.globl aesni256_cbc_sha1_dec
+.type  aesni256_cbc_sha1_dec,\@abi-omnipotent
+.align 32
+aesni256_cbc_sha1_dec:
+       # caller should check for SSSE3 and AES-NI bits
+       mov     OPENSSL_ia32cap_P+0(%rip),%r10d
+       mov     OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+       and     \$`1<<28`,%r11d         # mask AVX bit
+       and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
+       or      %r11d,%r10d
+       cmp     \$`1<<28|1<<30`,%r10d
+       je      aesni256_cbc_sha1_dec_avx
+___
+$code.=<<___;
+       jmp     aesni256_cbc_sha1_dec_ssse3
+       ret
+.size  aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
+
+.type  aesni256_cbc_sha1_dec_ssse3,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_ssse3:
+.cfi_startproc
+       mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       push    %r15
+.cfi_push      %r15
+       lea     `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,96+0(%rsp)
+       movaps  %xmm7,96+16(%rsp)
+       movaps  %xmm8,96+32(%rsp)
+       movaps  %xmm9,96+48(%rsp)
+       movaps  %xmm10,96+64(%rsp)
+       movaps  %xmm11,96+80(%rsp)
+       movaps  %xmm12,96+96(%rsp)
+       movaps  %xmm13,96+112(%rsp)
+       movaps  %xmm14,96+128(%rsp)
+       movaps  %xmm15,96+144(%rsp)
+.Lprologue_dec_ssse3:
+___
+$code.=<<___;
+       mov     $in0,%r12                       # reassign arguments
+       mov     $out,%r13
+       mov     $len,%r14
+       lea     112($key),%r15                  # size optimization
+       movdqu  ($ivp),@X[3]                    # load IV
+       #mov    $ivp,88(%rsp)                   # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15));    # reassign arguments
+$code.=<<___;
+       shl     \$6,$len
+       sub     $in0,$out
+       add     $inp,$len               # end of input
+
+       lea     K_XX_XX(%rip),$K_XX_XX
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+       mov     $C,@T[1]
+       xor     $D,@T[1]
+       and     @T[1],@T[0]
+
+       movdqa  64($K_XX_XX),@Tx[2]     # pbswap mask
+       movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
+       movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       movdqu  16($inp),@X[-3&7]
+       movdqu  32($inp),@X[-2&7]
+       movdqu  48($inp),@X[-1&7]
+       pshufb  @Tx[2],@X[-4&7]         # byte swap
+       add     \$64,$inp
+       pshufb  @Tx[2],@X[-3&7]
+       pshufb  @Tx[2],@X[-2&7]
+       pshufb  @Tx[2],@X[-1&7]
+       paddd   @Tx[1],@X[-4&7]         # add K_00_19
+       paddd   @Tx[1],@X[-3&7]
+       paddd   @Tx[1],@X[-2&7]
+       movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
+       psubd   @Tx[1],@X[-4&7]         # restore X[]
+       movdqa  @X[-3&7],16(%rsp)
+       psubd   @Tx[1],@X[-3&7]
+       movdqa  @X[-2&7],32(%rsp)
+       psubd   @Tx[1],@X[-2&7]
+       movdqu  -112($key),$rndkey0     # $key[0]
+       jmp     .Loop_dec_ssse3
+
+.align 32
+.Loop_dec_ssse3:
+___
+       &Xupdate_ssse3_16_31(\&body_00_19_dec);
+       &Xupdate_ssse3_16_31(\&body_00_19_dec);
+       &Xupdate_ssse3_16_31(\&body_00_19_dec);
+       &Xupdate_ssse3_16_31(\&body_00_19_dec);
+       &Xupdate_ssse3_32_79(\&body_00_19_dec);
+       &Xupdate_ssse3_32_79(\&body_20_39_dec);
+       &Xupdate_ssse3_32_79(\&body_20_39_dec);
+       &Xupdate_ssse3_32_79(\&body_20_39_dec);
+       &Xupdate_ssse3_32_79(\&body_20_39_dec);
+       &Xupdate_ssse3_32_79(\&body_20_39_dec);
+       &Xupdate_ssse3_32_79(\&body_40_59_dec);
+       &Xupdate_ssse3_32_79(\&body_40_59_dec);
+       &Xupdate_ssse3_32_79(\&body_40_59_dec);
+       &Xupdate_ssse3_32_79(\&body_40_59_dec);
+       &Xupdate_ssse3_32_79(\&body_40_59_dec);
+       &Xupdate_ssse3_32_79(\&body_20_39_dec);
+       &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
+
+                               $saved_j=$j;   @saved_V=@V;
+                               $saved_rx=$rx;
+
+       &Xloop_ssse3(\&body_20_39_dec);
+       &Xloop_ssse3(\&body_20_39_dec);
+       &Xloop_ssse3(\&body_20_39_dec);
+
+       eval(@aes256_dec[-1]);                  # last store
+$code.=<<___;
+       lea     64($in0),$in0
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $C,@T[1]
+       mov     $D,12($ctx)
+       xor     $D,@T[1]
+       mov     $E,16($ctx)
+       and     @T[1],@T[0]
+       jmp     .Loop_dec_ssse3
+
+.Ldone_dec_ssse3:
+___
+                               $jj=$j=$saved_j; @V=@saved_V;
+                               $rx=$saved_rx;
+
+       &Xtail_ssse3(\&body_20_39_dec);
+       &Xtail_ssse3(\&body_20_39_dec);
+       &Xtail_ssse3(\&body_20_39_dec);
+
+       eval(@aes256_dec[-1]);                  # last store
+$code.=<<___;
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       movups  @X[3],($ivp)                    # write IV
+___
+$code.=<<___ if ($win64);
+       movaps  96+0(%rsp),%xmm6
+       movaps  96+16(%rsp),%xmm7
+       movaps  96+32(%rsp),%xmm8
+       movaps  96+48(%rsp),%xmm9
+       movaps  96+64(%rsp),%xmm10
+       movaps  96+80(%rsp),%xmm11
+       movaps  96+96(%rsp),%xmm12
+       movaps  96+112(%rsp),%xmm13
+       movaps  96+128(%rsp),%xmm14
+       movaps  96+144(%rsp),%xmm15
+___
+$code.=<<___;
+       lea     `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_cfa_def   %rsi,56
+       mov     0(%rsi),%r15
+.cfi_restore   %r15
+       mov     8(%rsi),%r14
+.cfi_restore   %r14
+       mov     16(%rsi),%r13
+.cfi_restore   %r13
+       mov     24(%rsi),%r12
+.cfi_restore   %r12
+       mov     32(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     40(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     48(%rsi),%rsp
+.cfi_cfa_def   %rsp,8
+.Lepilogue_dec_ssse3:
+       ret
+.cfi_endproc
+.size  aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
+___
+                                               }}}
+$j=$jj=$r=$rx=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");   # size optimization
+my @T=("%esi","%edi");
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));   # for dec
+my $Kx=@Tx[2];
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type  aesni_cbc_sha1_enc_avx,\@function,6
+.align 32
+aesni_cbc_sha1_enc_avx:
+.cfi_startproc
+       mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
+       #shr    \$6,$len                        # debugging artefact
+       #jz     .Lepilogue_avx                  # debugging artefact
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       push    %r15
+.cfi_push      %r15
+       lea     `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+       #mov    $in0,$inp                       # debugging artefact
+       #lea    64(%rsp),$ctx                   # debugging artefact
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,96+0(%rsp)
+       movaps  %xmm7,96+16(%rsp)
+       movaps  %xmm8,96+32(%rsp)
+       movaps  %xmm9,96+48(%rsp)
+       movaps  %xmm10,96+64(%rsp)
+       movaps  %xmm11,96+80(%rsp)
+       movaps  %xmm12,96+96(%rsp)
+       movaps  %xmm13,96+112(%rsp)
+       movaps  %xmm14,96+128(%rsp)
+       movaps  %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+       vzeroall
+       mov     $in0,%r12                       # reassign arguments
+       mov     $out,%r13
+       mov     $len,%r14
+       lea     112($key),%r15                  # size optimization
+       vmovdqu ($ivp),$iv                      # load IV
+       mov     $ivp,88(%rsp)                   # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15));    # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+       shl     \$6,$len
+       sub     $in0,$out
+       mov     240-112($key),$rounds
+       add     $inp,$len               # end of input
+
+       lea     K_XX_XX(%rip),$K_XX_XX
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+       mov     $C,@T[1]
+       xor     $D,@T[1]
+       and     @T[1],@T[0]
+
+       vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
+       vmovdqa 0($K_XX_XX),$Kx         # K_00_19
+       vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       vmovdqu 16($inp),@X[-3&7]
+       vmovdqu 32($inp),@X[-2&7]
+       vmovdqu 48($inp),@X[-1&7]
+       vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+       add     \$64,$inp
+       vpshufb @X[2],@X[-3&7],@X[-3&7]
+       vpshufb @X[2],@X[-2&7],@X[-2&7]
+       vpshufb @X[2],@X[-1&7],@X[-1&7]
+       vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
+       vpaddd  $Kx,@X[-3&7],@X[1]
+       vpaddd  $Kx,@X[-2&7],@X[2]
+       vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
+       vmovdqa @X[1],16(%rsp)
+       vmovdqa @X[2],32(%rsp)
+       vmovups -112($key),$rndkey[1]   # $key[0]
+       vmovups 16-112($key),$rndkey[0] # forward reference
+       jmp     .Loop_avx
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+       vmovdqu         `16*$n`($in0),$in               # load input
+       vpxor           $rndkey[1],$in,$in
+___
+      $code.=<<___ if ($n);
+       vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
+___
+      $code.=<<___;
+       vpxor           $in,$iv,$iv
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*$k-112`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+       cmp             \$11,$rounds
+       jb              .Lvaesenclast$sn
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*($k+0)-112`($key),$rndkey[1]
+       vaesenc         $rndkey[1],$iv,$iv
+       vmovups         `32+16*($k+1)-112`($key),$rndkey[0]
+       je              .Lvaesenclast$sn
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*($k+2)-112`($key),$rndkey[1]
+       vaesenc         $rndkey[1],$iv,$iv
+       vmovups         `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+       vaesenclast     $rndkey[0],$iv,$iv
+       vmovups         -112($key),$rndkey[0]
+       vmovups         16-112($key),$rndkey[1]         # forward reference
+___
+    } else {
+      $code.=<<___;
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*$k-112`($key),$rndkey[1]
+___
+    }
+    $r++;      unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31()                # recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[0],@X[0],31);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslldq(@Tx[1],@X[0],12);              # "X[0]"<<96, extract one dword
+       &vpaddd (@X[0],@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
+       &vpsrld (@Tx[0],@Tx[1],30);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslld (@Tx[1],@Tx[1],2);
+       &vpxor  (@X[0],@X[0],@Tx[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[1]);           # "X[0]"^=("X[0]">>96)<<<2
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vmovdqa      ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")   if ($Xi%5==0);  # K_XX_XX
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+
+        foreach (@insns) { eval; }     # remaining instructions [if any]
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+       &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
+        eval(shift(@insns));
+        eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
+         &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
+         &vmovdqa      ($Kx,eval(16*($Xi/5))."($K_XX_XX)")     if ($Xi%5==0);
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &vpsrld (@Tx[0],@X[0],30);
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpslld (@X[0],@X[0],2);
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+
+        foreach (@insns) { eval; }     # remaining instructions
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+         &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+        foreach (@insns) { eval; }             # remaining instructions
+
+       &cmp    ($inp,$len);
+       &je     (shift);
+
+       &vmovdqa(@Tx[1],"64($K_XX_XX)");        # pbswap mask
+       &vmovdqa($Kx,"0($K_XX_XX)");            # K_00_19
+       &vmovdqu(@X[-4&7],"0($inp)");           # load input
+       &vmovdqu(@X[-3&7],"16($inp)");
+       &vmovdqu(@X[-2&7],"32($inp)");
+       &vmovdqu(@X[-1&7],"48($inp)");
+       &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);     # byte swap
+       &add    ($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+       foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 32
+.Loop_avx:
+___
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_32_79(\&body_00_19);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xuplast_avx_80(\&body_20_39,".Ldone_avx");     # can jump to "done"
+
+                               $saved_j=$j; @saved_V=@V;
+                               $saved_r=$r; @saved_rndkey=@rndkey;
+
+       &Xloop_avx(\&body_20_39);
+       &Xloop_avx(\&body_20_39);
+       &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+       vmovups $iv,48($out,$in0)               # write output
+       lea     64($in0),$in0
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $C,@T[1]
+       mov     $D,12($ctx)
+       xor     $D,@T[1]
+       mov     $E,16($ctx)
+       and     @T[1],@T[0]
+       jmp     .Loop_avx
+
+.Ldone_avx:
+___
+                               $jj=$j=$saved_j; @V=@saved_V;
+                               $r=$saved_r;     @rndkey=@saved_rndkey;
+
+       &Xtail_avx(\&body_20_39);
+       &Xtail_avx(\&body_20_39);
+       &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+       vmovups $iv,48($out,$in0)               # write output
+       mov     88(%rsp),$ivp                   # restore $ivp
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       vmovups $iv,($ivp)                      # write IV
+       vzeroall
+___
+$code.=<<___ if ($win64);
+       movaps  96+0(%rsp),%xmm6
+       movaps  96+16(%rsp),%xmm7
+       movaps  96+32(%rsp),%xmm8
+       movaps  96+48(%rsp),%xmm9
+       movaps  96+64(%rsp),%xmm10
+       movaps  96+80(%rsp),%xmm11
+       movaps  96+96(%rsp),%xmm12
+       movaps  96+112(%rsp),%xmm13
+       movaps  96+128(%rsp),%xmm14
+       movaps  96+144(%rsp),%xmm15
+___
+$code.=<<___;
+       lea     `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_def_cfa   %rsi,56
+       mov     0(%rsi),%r15
+.cfi_restore   %r15
+       mov     8(%rsi),%r14
+.cfi_restore   %r14
+       mov     16(%rsi),%r13
+.cfi_restore   %r13
+       mov     24(%rsi),%r12
+.cfi_restore   %r12
+       mov     32(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     40(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     48(%rsi),%rsp
+.cfi_def_cfa   %rsp,8
+.Lepilogue_avx:
+       ret
+.cfi_endproc
+.size  aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+
+                                               if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+@aes256_dec = (
+       '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
+       '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
+       '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
+       '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
+
+       '&vmovups($rndkey0,"16-112($key)");',
+       '&vmovups("64(%rsp)",@X[2]);',          # save IV, originally @X[3]
+       undef,undef
+       );
+for ($i=0;$i<13;$i++) {
+    push (@aes256_dec,(
+       '&vaesdec       ($inout0,$inout0,$rndkey0);',
+       '&vaesdec       ($inout1,$inout1,$rndkey0);',
+       '&vaesdec       ($inout2,$inout2,$rndkey0);',
+       '&vaesdec       ($inout3,$inout3,$rndkey0);     &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+       ));
+    push (@aes256_dec,(undef,undef))   if (($i>=3 && $i<=5) || $i>=11);
+    push (@aes256_dec,(undef,undef))   if ($i==5);
+}
+push(@aes256_dec,(
+       '&vaesdeclast   ($inout0,$inout0,$rndkey0);     &vmovups(@X[0],"0x00($in0)");',
+       '&vaesdeclast   ($inout1,$inout1,$rndkey0);     &vmovups(@X[1],"0x10($in0)");',
+       '&vaesdeclast   ($inout2,$inout2,$rndkey0);     &vmovups(@X[2],"0x20($in0)");',
+       '&vaesdeclast   ($inout3,$inout3,$rndkey0);     &vmovups(@X[3],"0x30($in0)");',
+
+       '&vxorps        ($inout0,$inout0,"64(%rsp)");   &vmovdqu($rndkey0,"-112($key)");',
+       '&vxorps        ($inout1,$inout1,@X[0]);        &vmovups("0x00($out,$in0)",$inout0);',
+       '&vxorps        ($inout2,$inout2,@X[1]);        &vmovups("0x10($out,$in0)",$inout1);',
+       '&vxorps        ($inout3,$inout3,@X[2]);        &vmovups("0x20($out,$in0)",$inout2);',
+
+       '&vmovups       ("0x30($out,$in0)",$inout3);'
+       ));
+
+$code.=<<___;
+.type  aesni256_cbc_sha1_dec_avx,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_avx:
+.cfi_startproc
+       mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       push    %r15
+.cfi_push      %r15
+       lea     `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,96+0(%rsp)
+       movaps  %xmm7,96+16(%rsp)
+       movaps  %xmm8,96+32(%rsp)
+       movaps  %xmm9,96+48(%rsp)
+       movaps  %xmm10,96+64(%rsp)
+       movaps  %xmm11,96+80(%rsp)
+       movaps  %xmm12,96+96(%rsp)
+       movaps  %xmm13,96+112(%rsp)
+       movaps  %xmm14,96+128(%rsp)
+       movaps  %xmm15,96+144(%rsp)
+.Lprologue_dec_avx:
+___
+$code.=<<___;
+       vzeroall
+       mov     $in0,%r12                       # reassign arguments
+       mov     $out,%r13
+       mov     $len,%r14
+       lea     112($key),%r15                  # size optimization
+       vmovdqu ($ivp),@X[3]                    # load IV
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15));    # reassign arguments
+$code.=<<___;
+       shl     \$6,$len
+       sub     $in0,$out
+       add     $inp,$len               # end of input
+
+       lea     K_XX_XX(%rip),$K_XX_XX
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+       mov     $C,@T[1]
+       xor     $D,@T[1]
+       and     @T[1],@T[0]
+
+       vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
+       vmovdqa 0($K_XX_XX),$Kx         # K_00_19
+       vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       vmovdqu 16($inp),@X[-3&7]
+       vmovdqu 32($inp),@X[-2&7]
+       vmovdqu 48($inp),@X[-1&7]
+       vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+       add     \$64,$inp
+       vpshufb @X[2],@X[-3&7],@X[-3&7]
+       vpshufb @X[2],@X[-2&7],@X[-2&7]
+       vpshufb @X[2],@X[-1&7],@X[-1&7]
+       vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
+       vpaddd  $Kx,@X[-3&7],@X[1]
+       vpaddd  $Kx,@X[-2&7],@X[2]
+       vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
+       vmovdqa @X[1],16(%rsp)
+       vmovdqa @X[2],32(%rsp)
+       vmovups -112($key),$rndkey0     # $key[0]
+       jmp     .Loop_dec_avx
+
+.align 32
+.Loop_dec_avx:
+___
+       &Xupdate_avx_16_31(\&body_00_19_dec);
+       &Xupdate_avx_16_31(\&body_00_19_dec);
+       &Xupdate_avx_16_31(\&body_00_19_dec);
+       &Xupdate_avx_16_31(\&body_00_19_dec);
+       &Xupdate_avx_32_79(\&body_00_19_dec);
+       &Xupdate_avx_32_79(\&body_20_39_dec);
+       &Xupdate_avx_32_79(\&body_20_39_dec);
+       &Xupdate_avx_32_79(\&body_20_39_dec);
+       &Xupdate_avx_32_79(\&body_20_39_dec);
+       &Xupdate_avx_32_79(\&body_20_39_dec);
+       &Xupdate_avx_32_79(\&body_40_59_dec);
+       &Xupdate_avx_32_79(\&body_40_59_dec);
+       &Xupdate_avx_32_79(\&body_40_59_dec);
+       &Xupdate_avx_32_79(\&body_40_59_dec);
+       &Xupdate_avx_32_79(\&body_40_59_dec);
+       &Xupdate_avx_32_79(\&body_20_39_dec);
+       &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");     # can jump to "done"
+
+                               $saved_j=$j; @saved_V=@V;
+                               $saved_rx=$rx;
+
+       &Xloop_avx(\&body_20_39_dec);
+       &Xloop_avx(\&body_20_39_dec);
+       &Xloop_avx(\&body_20_39_dec);
+
+       eval(@aes256_dec[-1]);                  # last store
+$code.=<<___;
+       lea     64($in0),$in0
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $C,@T[1]
+       mov     $D,12($ctx)
+       xor     $D,@T[1]
+       mov     $E,16($ctx)
+       and     @T[1],@T[0]
+       jmp     .Loop_dec_avx
+
+.Ldone_dec_avx:
+___
+                               $jj=$j=$saved_j; @V=@saved_V;
+                               $rx=$saved_rx;
+
+       &Xtail_avx(\&body_20_39_dec);
+       &Xtail_avx(\&body_20_39_dec);
+       &Xtail_avx(\&body_20_39_dec);
+
+       eval(@aes256_dec[-1]);                  # last store
+$code.=<<___;
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       vmovups @X[3],($ivp)                    # write IV
+       vzeroall
+___
+$code.=<<___ if ($win64);
+       movaps  96+0(%rsp),%xmm6
+       movaps  96+16(%rsp),%xmm7
+       movaps  96+32(%rsp),%xmm8
+       movaps  96+48(%rsp),%xmm9
+       movaps  96+64(%rsp),%xmm10
+       movaps  96+80(%rsp),%xmm11
+       movaps  96+96(%rsp),%xmm12
+       movaps  96+112(%rsp),%xmm13
+       movaps  96+128(%rsp),%xmm14
+       movaps  96+144(%rsp),%xmm15
+___
+$code.=<<___;
+       lea     `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_def_cfa   %rsi,56
+       mov     0(%rsi),%r15
+.cfi_restore   %r15
+       mov     8(%rsi),%r14
+.cfi_restore   %r14
+       mov     16(%rsi),%r13
+.cfi_restore   %r13
+       mov     24(%rsi),%r12
+.cfi_restore   %r12
+       mov     32(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     40(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     48(%rsi),%rsp
+.cfi_def_cfa   %rsp,8
+.Lepilogue_dec_avx:
+       ret
+.cfi_endproc
+.size  aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
+___
+                                               }}}
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
+.byte  0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+                                               if ($shaext) {{{
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$rounds="%r11d";
+
+($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+@rndkey=("%xmm0","%xmm1");
+$r=0;
+
+my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
+my @MSG=map("%xmm$_",(3..6));
+
+$code.=<<___;
+.type  aesni_cbc_sha1_enc_shaext,\@function,6
+.align 32
+aesni_cbc_sha1_enc_shaext:
+       mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
+___
+$code.=<<___ if ($win64);
+       lea     `-8-10*16`(%rsp),%rsp
+       movaps  %xmm6,-8-10*16(%rax)
+       movaps  %xmm7,-8-9*16(%rax)
+       movaps  %xmm8,-8-8*16(%rax)
+       movaps  %xmm9,-8-7*16(%rax)
+       movaps  %xmm10,-8-6*16(%rax)
+       movaps  %xmm11,-8-5*16(%rax)
+       movaps  %xmm12,-8-4*16(%rax)
+       movaps  %xmm13,-8-3*16(%rax)
+       movaps  %xmm14,-8-2*16(%rax)
+       movaps  %xmm15,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+       movdqu  ($ctx),$ABCD
+       movd    16($ctx),$E
+       movdqa  K_XX_XX+0x50(%rip),$BSWAP       # byte-n-word swap
+
+       mov     240($key),$rounds
+       sub     $in0,$out
+       movups  ($key),$rndkey0                 # $key[0]
+       movups  ($ivp),$iv                      # load IV
+       movups  16($key),$rndkey[0]             # forward reference
+       lea     112($key),$key                  # size optimization
+
+       pshufd  \$0b00011011,$ABCD,$ABCD        # flip word order
+       pshufd  \$0b00011011,$E,$E              # flip word order
+       jmp     .Loop_shaext
+
+.align 16
+.Loop_shaext:
+___
+       &$aesenc();
+$code.=<<___;
+       movdqu          ($inp),@MSG[0]
+       movdqa          $E,$E_SAVE              # offload $E
+       pshufb          $BSWAP,@MSG[0]
+       movdqu          0x10($inp),@MSG[1]
+       movdqa          $ABCD,$ABCD_SAVE        # offload $ABCD
+___
+       &$aesenc();
+$code.=<<___;
+       pshufb          $BSWAP,@MSG[1]
+
+       paddd           @MSG[0],$E
+       movdqu          0x20($inp),@MSG[2]
+       lea             0x40($inp),$inp
+       pxor            $E_SAVE,@MSG[0]         # black magic
+___
+       &$aesenc();
+$code.=<<___;
+       pxor            $E_SAVE,@MSG[0]         # black magic
+       movdqa          $ABCD,$E_
+       pshufb          $BSWAP,@MSG[2]
+       sha1rnds4       \$0,$E,$ABCD            # 0-3
+       sha1nexte       @MSG[1],$E_
+___
+       &$aesenc();
+$code.=<<___;
+       sha1msg1        @MSG[1],@MSG[0]
+       movdqu          -0x10($inp),@MSG[3]
+       movdqa          $ABCD,$E
+       pshufb          $BSWAP,@MSG[3]
+___
+       &$aesenc();
+$code.=<<___;
+       sha1rnds4       \$0,$E_,$ABCD           # 4-7
+       sha1nexte       @MSG[2],$E
+       pxor            @MSG[2],@MSG[0]
+       sha1msg1        @MSG[2],@MSG[1]
+___
+       &$aesenc();
+
+for($i=2;$i<20-4;$i++) {
+$code.=<<___;
+       movdqa          $ABCD,$E_
+       sha1rnds4       \$`int($i/5)`,$E,$ABCD  # 8-11
+       sha1nexte       @MSG[3],$E_
+___
+       &$aesenc();
+$code.=<<___;
+       sha1msg2        @MSG[3],@MSG[0]
+       pxor            @MSG[3],@MSG[1]
+       sha1msg1        @MSG[3],@MSG[2]
+___
+       ($E,$E_)=($E_,$E);
+       push(@MSG,shift(@MSG));
+
+       &$aesenc();
+}
+$code.=<<___;
+       movdqa          $ABCD,$E_
+       sha1rnds4       \$3,$E,$ABCD            # 64-67
+       sha1nexte       @MSG[3],$E_
+       sha1msg2        @MSG[3],@MSG[0]
+       pxor            @MSG[3],@MSG[1]
+___
+       &$aesenc();
+$code.=<<___;
+       movdqa          $ABCD,$E
+       sha1rnds4       \$3,$E_,$ABCD           # 68-71
+       sha1nexte       @MSG[0],$E
+       sha1msg2        @MSG[0],@MSG[1]
+___
+       &$aesenc();
+$code.=<<___;
+       movdqa          $E_SAVE,@MSG[0]
+       movdqa          $ABCD,$E_
+       sha1rnds4       \$3,$E,$ABCD            # 72-75
+       sha1nexte       @MSG[1],$E_
+___
+       &$aesenc();
+$code.=<<___;
+       movdqa          $ABCD,$E
+       sha1rnds4       \$3,$E_,$ABCD           # 76-79
+       sha1nexte       $MSG[0],$E
+___
+       while($r<40)    { &$aesenc(); }         # remaining aesenc's
+$code.=<<___;
+       dec             $len
+
+       paddd           $ABCD_SAVE,$ABCD
+       movups          $iv,48($out,$in0)       # write output
+       lea             64($in0),$in0
+       jnz             .Loop_shaext
+
+       pshufd  \$0b00011011,$ABCD,$ABCD
+       pshufd  \$0b00011011,$E,$E
+       movups  $iv,($ivp)                      # write IV
+       movdqu  $ABCD,($ctx)
+       movd    $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+       movaps  -8-10*16(%rax),%xmm6
+       movaps  -8-9*16(%rax),%xmm7
+       movaps  -8-8*16(%rax),%xmm8
+       movaps  -8-7*16(%rax),%xmm9
+       movaps  -8-6*16(%rax),%xmm10
+       movaps  -8-5*16(%rax),%xmm11
+       movaps  -8-4*16(%rax),%xmm12
+       movaps  -8-3*16(%rax),%xmm13
+       movaps  -8-2*16(%rax),%xmm14
+       movaps  -8-1*16(%rax),%xmm15
+       mov     %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+       ret
+.size  aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
+___
+                                               }}}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+___
+$code.=<<___ if ($shaext);
+       lea     aesni_cbc_sha1_enc_shaext(%rip),%r10
+       cmp     %r10,%rbx
+       jb      .Lseh_no_shaext
+
+       lea     (%rax),%rsi
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     168(%rax),%rax          # adjust stack pointer
+       jmp     .Lcommon_seh_tail
+.Lseh_no_shaext:
+___
+$code.=<<___;
+       lea     96(%rax),%rsi
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     `104+10*16`(%rax),%rax  # adjust stack pointer
+
+       mov     0(%rax),%r15
+       mov     8(%rax),%r14
+       mov     16(%rax),%r13
+       mov     24(%rax),%r12
+       mov     32(%rax),%rbp
+       mov     40(%rax),%rbx
+       lea     48(%rax),%rax
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+
+.Lcommon_seh_tail:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  ssse3_handler,.-ssse3_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+       .rva    .LSEH_end_aesni_cbc_sha1_enc_ssse3
+       .rva    .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+       .rva    .LSEH_begin_aesni_cbc_sha1_enc_avx
+       .rva    .LSEH_end_aesni_cbc_sha1_enc_avx
+       .rva    .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___ if ($shaext);
+       .rva    .LSEH_begin_aesni_cbc_sha1_enc_shaext
+       .rva    .LSEH_end_aesni_cbc_sha1_enc_shaext
+       .rva    .LSEH_info_aesni_cbc_sha1_enc_shaext
+___
+$code.=<<___;
+.section       .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_aesni_cbc_sha1_enc_shaext:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04                 if($dst>=8);
+    $rex|=0x01                 if($src>=8);
+    unshift @opcode,$rex|0x40  if($rex);
+}
+
+sub sha1rnds4 {
+    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x0f,0x3a,0xcc);
+       rex(\@opcode,$3,$2);
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       my $c=$1;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       return ".byte\t".join(',',@opcode);
+    } else {
+       return "sha1rnds4\t".@_[0];
+    }
+}
+
+sub sha1op38 {
+    my $instr = shift;
+    my %opcodelet = (
+               "sha1nexte" => 0xc8,
+               "sha1msg1"  => 0xc9,
+               "sha1msg2"  => 0xca     );
+
+    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x0f,0x38);
+       rex(\@opcode,$2,$1);
+       push @opcode,$opcodelet{$instr};
+       push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
+       return ".byte\t".join(',',@opcode);
+    } else {
+       return $instr."\t".@_[0];
+    }
+}
+
+sub aesni {
+  my $line=shift;
+  my @opcode=(0x0f,0x38);
+
+    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+       my %opcodelet = (
+               "aesenc" => 0xdc,       "aesenclast" => 0xdd,
+               "aesdec" => 0xde,       "aesdeclast" => 0xdf
+       );
+       return undef if (!defined($opcodelet{$1}));
+       rex(\@opcode,$3,$2);
+       push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);    # ModR/M
+       unshift @opcode,0x66;
+       return ".byte\t".join(',',@opcode);
+    }
+    return $line;
+}
+
+foreach (split("\n",$code)) {
+        s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo                or
+       s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo             or
+       s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;
+
+       print $_,"\n";
+}
+close STDOUT;
diff --git a/aesni/aesni-x86_64.pl b/aesni/aesni-x86_64.pl

new file mode 100755 (executable)

index 0000000..406377a
--- /dev/null
+++ b/aesni/aesni-x86_64.pl
@@ -0,0 +1,5141 @@
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
+# details].
+#
+# Performance.
+#
+# Given aes(enc|dec) instructions' latency asymptotic performance for
+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+# processed with 128-bit key. And given their throughput asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# asymptotic limit it's not something you commonly achieve in reality,
+# but how close does one get? Below are results collected for
+# different modes and block sized. Pairs of numbers are for en-/
+# decryption.
+#
+#      16-byte     64-byte     256-byte    1-KB        8-KB
+# ECB  4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
+# CTR  5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
+# CBC  4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
+# CCM  5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
+# OFB  5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
+# CFB  5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
+# The results were collected with specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All above results are consistently better. This
+# module also provides better performance for block sizes smaller than
+# 128 bytes in points *not* represented in the above table.
+#
+# Looking at the results for 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because implementation
+# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+# single-block aesni_encrypt, which is not the most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to AES unit the way it's done in CBC mode. There is
+# nothing one can do and the result appears optimal. CCM result is
+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
+# saving output. CCM CTR "stays invisible," because it's neatly
+# interleaved wih CBC-MAC. This provides ~30% improvement over
+# "straightforward" CCM implementation with CTR and CBC-MAC performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
+# CTR curve doesn't follow this pattern and is "slowest" changing one
+# with "256-byte" result being 87% of "8-KB." This is because overhead
+# in CTR mode is most computationally intensive. Small-block CCM
+# decrypt is slower than encrypt, because first CTR and last CBC-MAC
+# iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with number of
+# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
+# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
+# are a tad smaller, because the above mentioned penalty biases all
+# results by same constant value. In similar way function call
+# overhead affects small-block performance, as well as OFB and CFB
+# results. Differences are not large, most common coefficients are
+# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While Westmere processor features 6 cycles latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with
+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In which case asymptotic limit for such modes
+# can be obtained by dividing above mentioned numbers by AES
+# instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that optimal interleave factor is 3,
+# and that's where the "magic" number of 1.25 come from. "Optimal
+# interleave factor" means that increase of interleave factor does
+# not improve performance. The formula has proven to reflect reality
+# pretty well on Westmere... Sandy Bridge on the other hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor                3x      6x      8x
+# theoretical asymptotic limit         1.67    0.83    0.625
+# measured performance for 8KB block   1.05    0.86    0.84
+#
+# "as if" interleave factor            4.7x    5.8x    6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt                          1.16    0.93    0.74
+# CTR                                  1.14    0.91    0.74
+#
+# Well, given 3x column it's probably inappropriate to call the limit
+# asymptotic, if it can be surpassed, isn't it? What happens there?
+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+# magic is responsible for this. Processor overlaps not only the
+# additional instructions with AES ones, but even AES instructions
+# processing adjacent triplets of independent blocks. In the 6x case
+# additional instructions  still claim disproportionally small amount
+# of additional cycles, but in 8x case number of instructions must be
+# a tad too high for out-of-order logic to cope with, and AES unit
+# remains underutilized... As you can see 8x interleave is hardly
+# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
+# utilizes 6x interleave because of limited register bank capacity.
+#
+# Higher interleave factors do have negative impact on Westmere
+# performance. While for ECB mode it's negligible ~1.5%, other
+# parallelizables perform ~5% worse, which is outweighed by ~25%
+# improvement on Sandy Bridge. To balance regression on Westmere
+# CTR mode was implemented with 6x aesenc interleave factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
+# in CTR mode AES instruction interleave factor was chosen to be 6x.
+
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#              CBC en-/decrypt CTR     XTS     ECB     OCB
+# Westmere     3.77/1.25       1.25    1.25    1.26
+# * Bridge     5.07/0.74       0.75    0.90    0.85    0.98
+# Haswell      4.44/0.63       0.63    0.73    0.63    0.70
+# Skylake      2.62/0.63       0.63    0.63    0.63
+# Silvermont   5.75/3.54       3.56    4.12    3.87(*) 4.11
+# Knights L    2.54/0.77       0.78    0.85    -       1.50
+# Goldmont     3.82/1.26       1.26    1.29    1.29    1.50
+# Bulldozer    5.77/0.70       0.72    0.90    0.70    0.95
+# Ryzen                2.71/0.35       0.35    0.44    0.38    0.49
+#
+# (*)  Atom Silvermont ECB result is suboptimal because of penalties
+#      incurred by operations on %xmm8-15. As ECB is not considered
+#      critical, nothing was done to mitigate the problem.
+
+$PREFIX="aesni";       # if $PREFIX is set to "AES", the script
+                       # generates drop-in replacement for
+                       # crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+@_4args=$win64?        ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+               ("%rdi","%rsi","%rdx","%rcx");  # Unix order
+
+$code=".text\n";
+$code.=".extern        OPENSSL_ia32cap_P\n";
+
+$rounds="%eax";        # input to and changed by aesni_[en|de]cryptN !!!
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx";   # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8";    # cbc, ctr, ...
+
+$rnds_="%r10d";        # backup copy for $rounds
+$key_="%r11";  # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0";      $rndkey1="%xmm1";
+$inout0="%xmm2";       $inout1="%xmm3";
+$inout2="%xmm4";       $inout3="%xmm5";
+$inout4="%xmm6";       $inout5="%xmm7";
+$inout6="%xmm8";       $inout7="%xmm9";
+
+$in2="%xmm6";          $in1="%xmm7";   # used in CBC decrypt, CTR, ...
+$in0="%xmm8";          $iv="%xmm9";
+\f
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
+# cycles which take care of loop variables...
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_;  $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+       $movkey ($key),$rndkey0
+       $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+       xorps   $rndkey0,$ivec
+       lea     32($key),$key
+       xorps   $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+       lea     32($key),$key
+       xorps   $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+       aes${p} $rndkey1,$inout
+       dec     $rounds
+       $movkey ($key),$rndkey1
+       lea     16($key),$key
+       jnz     .Loop_${p}1_$sn # loop body is 16 bytes
+       aes${p}last     $rndkey1,$inout
+___
+}}
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type  ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+.cfi_startproc
+       movups  ($inp),$inout0          # load input
+       mov     240($key),$rounds       # key->rounds
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+        pxor   $rndkey0,$rndkey0       # clear register bank
+        pxor   $rndkey1,$rndkey1
+       movups  $inout0,($out)          # output
+        pxor   $inout0,$inout0
+       ret
+.cfi_endproc
+.size  ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type  ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+.cfi_startproc
+       movups  ($inp),$inout0          # load input
+       mov     240($key),$rounds       # key->rounds
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+        pxor   $rndkey0,$rndkey0       # clear register bank
+        pxor   $rndkey1,$rndkey1
+       movups  $inout0,($out)          # output
+        pxor   $inout0,$inout0
+       ret
+.cfi_endproc
+.size  ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
+\f
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge and "super-optimal" for other Intel CPUs...
+
+sub aesni_generate2 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt2,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt2:
+.cfi_startproc
+       $movkey ($key),$rndkey0
+       shl     \$4,$rounds
+       $movkey 16($key),$rndkey1
+       xorps   $rndkey0,$inout0
+       xorps   $rndkey0,$inout1
+       $movkey 32($key),$rndkey0
+       lea     32($key,$rounds),$key
+       neg     %rax                            # $rounds
+       add     \$16,%rax
+
+.L${dir}_loop2:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .L${dir}_loop2
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       ret
+.cfi_endproc
+.size  _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+.cfi_startproc
+       $movkey ($key),$rndkey0
+       shl     \$4,$rounds
+       $movkey 16($key),$rndkey1
+       xorps   $rndkey0,$inout0
+       xorps   $rndkey0,$inout1
+       xorps   $rndkey0,$inout2
+       $movkey 32($key),$rndkey0
+       lea     32($key,$rounds),$key
+       neg     %rax                            # $rounds
+       add     \$16,%rax
+
+.L${dir}_loop3:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       aes${dir}       $rndkey0,$inout2
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .L${dir}_loop3
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       ret
+.cfi_endproc
+.size  _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+.cfi_startproc
+       $movkey ($key),$rndkey0
+       shl     \$4,$rounds
+       $movkey 16($key),$rndkey1
+       xorps   $rndkey0,$inout0
+       xorps   $rndkey0,$inout1
+       xorps   $rndkey0,$inout2
+       xorps   $rndkey0,$inout3
+       $movkey 32($key),$rndkey0
+       lea     32($key,$rounds),$key
+       neg     %rax                            # $rounds
+       .byte   0x0f,0x1f,0x00
+       add     \$16,%rax
+
+.L${dir}_loop4:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       aes${dir}       $rndkey0,$inout2
+       aes${dir}       $rndkey0,$inout3
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .L${dir}_loop4
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       aes${dir}last   $rndkey0,$inout3
+       ret
+.cfi_endproc
+.size  _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+.cfi_startproc
+       $movkey         ($key),$rndkey0
+       shl             \$4,$rounds
+       $movkey         16($key),$rndkey1
+       xorps           $rndkey0,$inout0
+       pxor            $rndkey0,$inout1
+       pxor            $rndkey0,$inout2
+       aes${dir}       $rndkey1,$inout0
+       lea             32($key,$rounds),$key
+       neg             %rax                    # $rounds
+       aes${dir}       $rndkey1,$inout1
+       pxor            $rndkey0,$inout3
+       pxor            $rndkey0,$inout4
+       aes${dir}       $rndkey1,$inout2
+       pxor            $rndkey0,$inout5
+       $movkey         ($key,%rax),$rndkey0
+       add             \$16,%rax
+       jmp             .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+.L${dir}_loop6_enter:
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       aes${dir}       $rndkey0,$inout2
+       aes${dir}       $rndkey0,$inout3
+       aes${dir}       $rndkey0,$inout4
+       aes${dir}       $rndkey0,$inout5
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .L${dir}_loop6
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       aes${dir}last   $rndkey0,$inout3
+       aes${dir}last   $rndkey0,$inout4
+       aes${dir}last   $rndkey0,$inout5
+       ret
+.cfi_endproc
+.size  _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+.cfi_startproc
+       $movkey         ($key),$rndkey0
+       shl             \$4,$rounds
+       $movkey         16($key),$rndkey1
+       xorps           $rndkey0,$inout0
+       xorps           $rndkey0,$inout1
+       pxor            $rndkey0,$inout2
+       pxor            $rndkey0,$inout3
+       pxor            $rndkey0,$inout4
+       lea             32($key,$rounds),$key
+       neg             %rax                    # $rounds
+       aes${dir}       $rndkey1,$inout0
+       pxor            $rndkey0,$inout5
+       pxor            $rndkey0,$inout6
+       aes${dir}       $rndkey1,$inout1
+       pxor            $rndkey0,$inout7
+       $movkey         ($key,%rax),$rndkey0
+       add             \$16,%rax
+       jmp             .L${dir}_loop8_inner
+.align 16
+.L${dir}_loop8:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+.L${dir}_loop8_inner:
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+       aes${dir}       $rndkey1,$inout6
+       aes${dir}       $rndkey1,$inout7
+.L${dir}_loop8_enter:
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       aes${dir}       $rndkey0,$inout2
+       aes${dir}       $rndkey0,$inout3
+       aes${dir}       $rndkey0,$inout4
+       aes${dir}       $rndkey0,$inout5
+       aes${dir}       $rndkey0,$inout6
+       aes${dir}       $rndkey0,$inout7
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .L${dir}_loop8
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+       aes${dir}       $rndkey1,$inout6
+       aes${dir}       $rndkey1,$inout7
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       aes${dir}last   $rndkey0,$inout3
+       aes${dir}last   $rndkey0,$inout4
+       aes${dir}last   $rndkey0,$inout5
+       aes${dir}last   $rndkey0,$inout6
+       aes${dir}last   $rndkey0,$inout7
+       ret
+.cfi_endproc
+.size  _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+\f
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+#                        size_t length, const AES_KEY *key,
+#                        int enc);
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type  aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+.cfi_startproc
+___
+$code.=<<___ if ($win64);
+       lea     -0x58(%rsp),%rsp
+       movaps  %xmm6,(%rsp)            # offload $inout4..7
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+       and     \$-16,$len              # if ($len<16)
+       jz      .Lecb_ret               # return
+
+       mov     240($key),$rounds       # key->rounds
+       $movkey ($key),$rndkey0
+       mov     $key,$key_              # backup $key
+       mov     $rounds,$rnds_          # backup $rounds
+       test    %r8d,%r8d               # 5th argument
+       jz      .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+       cmp     \$0x80,$len             # if ($len<8*16)
+       jb      .Lecb_enc_tail          # short input
+
+       movdqu  ($inp),$inout0          # load 8 input blocks
+       movdqu  0x10($inp),$inout1
+       movdqu  0x20($inp),$inout2
+       movdqu  0x30($inp),$inout3
+       movdqu  0x40($inp),$inout4
+       movdqu  0x50($inp),$inout5
+       movdqu  0x60($inp),$inout6
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp         # $inp+=8*16
+       sub     \$0x80,$len             # $len-=8*16 (can be zero)
+       jmp     .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+       movups  $inout0,($out)          # store 8 output blocks
+       mov     $key_,$key              # restore $key
+       movdqu  ($inp),$inout0          # load 8 input blocks
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout1,0x10($out)
+       movdqu  0x10($inp),$inout1
+       movups  $inout2,0x20($out)
+       movdqu  0x20($inp),$inout2
+       movups  $inout3,0x30($out)
+       movdqu  0x30($inp),$inout3
+       movups  $inout4,0x40($out)
+       movdqu  0x40($inp),$inout4
+       movups  $inout5,0x50($out)
+       movdqu  0x50($inp),$inout5
+       movups  $inout6,0x60($out)
+       movdqu  0x60($inp),$inout6
+       movups  $inout7,0x70($out)
+       lea     0x80($out),$out         # $out+=8*16
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp         # $inp+=8*16
+.Lecb_enc_loop8_enter:
+
+       call    _aesni_encrypt8
+
+       sub     \$0x80,$len
+       jnc     .Lecb_enc_loop8         # loop if $len-=8*16 didn't borrow
+
+       movups  $inout0,($out)          # store 8 output blocks
+       mov     $key_,$key              # restore $key
+       movups  $inout1,0x10($out)
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       movups  $inout6,0x60($out)
+       movups  $inout7,0x70($out)
+       lea     0x80($out),$out         # $out+=8*16
+       add     \$0x80,$len             # restore real remaining $len
+       jz      .Lecb_ret               # done if ($len==0)
+
+.Lecb_enc_tail:                                # $len is less than 8*16
+       movups  ($inp),$inout0
+       cmp     \$0x20,$len
+       jb      .Lecb_enc_one
+       movups  0x10($inp),$inout1
+       je      .Lecb_enc_two
+       movups  0x20($inp),$inout2
+       cmp     \$0x40,$len
+       jb      .Lecb_enc_three
+       movups  0x30($inp),$inout3
+       je      .Lecb_enc_four
+       movups  0x40($inp),$inout4
+       cmp     \$0x60,$len
+       jb      .Lecb_enc_five
+       movups  0x50($inp),$inout5
+       je      .Lecb_enc_six
+       movdqu  0x60($inp),$inout6
+       xorps   $inout7,$inout7
+       call    _aesni_encrypt8
+       movups  $inout0,($out)          # store 7 output blocks
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       movups  $inout6,0x60($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       movups  $inout0,($out)          # store one output block
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_two:
+       call    _aesni_encrypt2
+       movups  $inout0,($out)          # store 2 output blocks
+       movups  $inout1,0x10($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_three:
+       call    _aesni_encrypt3
+       movups  $inout0,($out)          # store 3 output blocks
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_four:
+       call    _aesni_encrypt4
+       movups  $inout0,($out)          # store 4 output blocks
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_five:
+       xorps   $inout5,$inout5
+       call    _aesni_encrypt6
+       movups  $inout0,($out)          # store 5 output blocks
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_six:
+       call    _aesni_encrypt6
+       movups  $inout0,($out)          # store 6 output blocks
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       jmp     .Lecb_ret
+\f#--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+       cmp     \$0x80,$len             # if ($len<8*16)
+       jb      .Lecb_dec_tail          # short input
+
+       movdqu  ($inp),$inout0          # load 8 input blocks
+       movdqu  0x10($inp),$inout1
+       movdqu  0x20($inp),$inout2
+       movdqu  0x30($inp),$inout3
+       movdqu  0x40($inp),$inout4
+       movdqu  0x50($inp),$inout5
+       movdqu  0x60($inp),$inout6
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp         # $inp+=8*16
+       sub     \$0x80,$len             # $len-=8*16 (can be zero)
+       jmp     .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+       movups  $inout0,($out)          # store 8 output blocks
+       mov     $key_,$key              # restore $key
+       movdqu  ($inp),$inout0          # load 8 input blocks
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout1,0x10($out)
+       movdqu  0x10($inp),$inout1
+       movups  $inout2,0x20($out)
+       movdqu  0x20($inp),$inout2
+       movups  $inout3,0x30($out)
+       movdqu  0x30($inp),$inout3
+       movups  $inout4,0x40($out)
+       movdqu  0x40($inp),$inout4
+       movups  $inout5,0x50($out)
+       movdqu  0x50($inp),$inout5
+       movups  $inout6,0x60($out)
+       movdqu  0x60($inp),$inout6
+       movups  $inout7,0x70($out)
+       lea     0x80($out),$out         # $out+=8*16
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp         # $inp+=8*16
+.Lecb_dec_loop8_enter:
+
+       call    _aesni_decrypt8
+
+       $movkey ($key_),$rndkey0
+       sub     \$0x80,$len
+       jnc     .Lecb_dec_loop8         # loop if $len-=8*16 didn't borrow
+
+       movups  $inout0,($out)          # store 8 output blocks
+        pxor   $inout0,$inout0         # clear register bank
+       mov     $key_,$key              # restore $key
+       movups  $inout1,0x10($out)
+        pxor   $inout1,$inout1
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       movups  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       movups  $inout4,0x40($out)
+        pxor   $inout4,$inout4
+       movups  $inout5,0x50($out)
+        pxor   $inout5,$inout5
+       movups  $inout6,0x60($out)
+        pxor   $inout6,$inout6
+       movups  $inout7,0x70($out)
+        pxor   $inout7,$inout7
+       lea     0x80($out),$out         # $out+=8*16
+       add     \$0x80,$len             # restore real remaining $len
+       jz      .Lecb_ret               # done if ($len==0)
+
+.Lecb_dec_tail:
+       movups  ($inp),$inout0
+       cmp     \$0x20,$len
+       jb      .Lecb_dec_one
+       movups  0x10($inp),$inout1
+       je      .Lecb_dec_two
+       movups  0x20($inp),$inout2
+       cmp     \$0x40,$len
+       jb      .Lecb_dec_three
+       movups  0x30($inp),$inout3
+       je      .Lecb_dec_four
+       movups  0x40($inp),$inout4
+       cmp     \$0x60,$len
+       jb      .Lecb_dec_five
+       movups  0x50($inp),$inout5
+       je      .Lecb_dec_six
+       movups  0x60($inp),$inout6
+       $movkey ($key),$rndkey0
+       xorps   $inout7,$inout7
+       call    _aesni_decrypt8
+       movups  $inout0,($out)          # store 7 output blocks
+        pxor   $inout0,$inout0         # clear register bank
+       movups  $inout1,0x10($out)
+        pxor   $inout1,$inout1
+       movups  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       movups  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       movups  $inout4,0x40($out)
+        pxor   $inout4,$inout4
+       movups  $inout5,0x50($out)
+        pxor   $inout5,$inout5
+       movups  $inout6,0x60($out)
+        pxor   $inout6,$inout6
+        pxor   $inout7,$inout7
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       movups  $inout0,($out)          # store one output block
+        pxor   $inout0,$inout0         # clear register bank
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_two:
+       call    _aesni_decrypt2
+       movups  $inout0,($out)          # store 2 output blocks
+        pxor   $inout0,$inout0         # clear register bank
+       movups  $inout1,0x10($out)
+        pxor   $inout1,$inout1
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_three:
+       call    _aesni_decrypt3
+       movups  $inout0,($out)          # store 3 output blocks
+        pxor   $inout0,$inout0         # clear register bank
+       movups  $inout1,0x10($out)
+        pxor   $inout1,$inout1
+       movups  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_four:
+       call    _aesni_decrypt4
+       movups  $inout0,($out)          # store 4 output blocks
+        pxor   $inout0,$inout0         # clear register bank
+       movups  $inout1,0x10($out)
+        pxor   $inout1,$inout1
+       movups  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       movups  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_five:
+       xorps   $inout5,$inout5
+       call    _aesni_decrypt6
+       movups  $inout0,($out)          # store 5 output blocks
+        pxor   $inout0,$inout0         # clear register bank
+       movups  $inout1,0x10($out)
+        pxor   $inout1,$inout1
+       movups  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       movups  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       movups  $inout4,0x40($out)
+        pxor   $inout4,$inout4
+        pxor   $inout5,$inout5
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_six:
+       call    _aesni_decrypt6
+       movups  $inout0,($out)          # store 6 output blocks
+        pxor   $inout0,$inout0         # clear register bank
+       movups  $inout1,0x10($out)
+        pxor   $inout1,$inout1
+       movups  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       movups  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       movups  $inout4,0x40($out)
+        pxor   $inout4,$inout4
+       movups  $inout5,0x50($out)
+        pxor   $inout5,$inout5
+
+.Lecb_ret:
+       xorps   $rndkey0,$rndkey0       # %xmm0
+       pxor    $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  %xmm0,(%rsp)            # clear stack
+       movaps  0x10(%rsp),%xmm7
+       movaps  %xmm0,0x10(%rsp)
+       movaps  0x20(%rsp),%xmm8
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm9
+       movaps  %xmm0,0x30(%rsp)
+       lea     0x58(%rsp),%rsp
+.Lecb_enc_ret:
+___
+$code.=<<___;
+       ret
+.cfi_endproc
+.size  aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+\f
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{
+my $cmac="%r9";        # 6th argument
+
+my $increment="%xmm9";
+my $iv="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type  aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+       lea     -0x58(%rsp),%rsp
+       movaps  %xmm6,(%rsp)            # $iv
+       movaps  %xmm7,0x10(%rsp)        # $bswap_mask
+       movaps  %xmm8,0x20(%rsp)        # $in0
+       movaps  %xmm9,0x30(%rsp)        # $increment
+.Lccm64_enc_body:
+___
+$code.=<<___;
+       mov     240($key),$rounds               # key->rounds
+       movdqu  ($ivp),$iv
+       movdqa  .Lincrement64(%rip),$increment
+       movdqa  .Lbswap_mask(%rip),$bswap_mask
+
+       shl     \$4,$rounds
+       mov     \$16,$rnds_
+       lea     0($key),$key_
+       movdqu  ($cmac),$inout1
+       movdqa  $iv,$inout0
+       lea     32($key,$rounds),$key           # end of key schedule
+       pshufb  $bswap_mask,$iv
+       sub     %rax,%r10                       # twisted $rounds
+       jmp     .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+       $movkey ($key_),$rndkey0
+       mov     %r10,%rax
+       movups  ($inp),$in0                     # load inp
+
+       xorps   $rndkey0,$inout0                # counter
+       $movkey 16($key_),$rndkey1
+       xorps   $in0,$rndkey0
+       xorps   $rndkey0,$inout1                # cmac^=inp
+       $movkey 32($key_),$rndkey0
+
+.Lccm64_enc2_loop:
+       aesenc  $rndkey1,$inout0
+       aesenc  $rndkey1,$inout1
+       $movkey ($key,%rax),$rndkey1
+       add     \$32,%rax
+       aesenc  $rndkey0,$inout0
+       aesenc  $rndkey0,$inout1
+       $movkey -16($key,%rax),$rndkey0
+       jnz     .Lccm64_enc2_loop
+       aesenc  $rndkey1,$inout0
+       aesenc  $rndkey1,$inout1
+       paddq   $increment,$iv
+       dec     $len                            # $len-- ($len is in blocks)
+       aesenclast      $rndkey0,$inout0
+       aesenclast      $rndkey0,$inout1
+
+       lea     16($inp),$inp
+       xorps   $inout0,$in0                    # inp ^= E(iv)
+       movdqa  $iv,$inout0
+       movups  $in0,($out)                     # save output
+       pshufb  $bswap_mask,$inout0
+       lea     16($out),$out                   # $out+=16
+       jnz     .Lccm64_enc_outer               # loop if ($len!=0)
+
+        pxor   $rndkey0,$rndkey0               # clear register bank
+        pxor   $rndkey1,$rndkey1
+        pxor   $inout0,$inout0
+       movups  $inout1,($cmac)                 # store resulting mac
+        pxor   $inout1,$inout1
+        pxor   $in0,$in0
+        pxor   $iv,$iv
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  %xmm0,(%rsp)                    # clear stack
+       movaps  0x10(%rsp),%xmm7
+       movaps  %xmm0,0x10(%rsp)
+       movaps  0x20(%rsp),%xmm8
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm9
+       movaps  %xmm0,0x30(%rsp)
+       lea     0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+       ret
+.size  aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type  aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+       lea     -0x58(%rsp),%rsp
+       movaps  %xmm6,(%rsp)            # $iv
+       movaps  %xmm7,0x10(%rsp)        # $bswap_mask
+       movaps  %xmm8,0x20(%rsp)        # $in8
+       movaps  %xmm9,0x30(%rsp)        # $increment
+.Lccm64_dec_body:
+___
+$code.=<<___;
+       mov     240($key),$rounds               # key->rounds
+       movups  ($ivp),$iv
+       movdqu  ($cmac),$inout1
+       movdqa  .Lincrement64(%rip),$increment
+       movdqa  .Lbswap_mask(%rip),$bswap_mask
+
+       movaps  $iv,$inout0
+       mov     $rounds,$rnds_
+       mov     $key,$key_
+       pshufb  $bswap_mask,$iv
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       shl     \$4,$rnds_
+       mov     \$16,$rounds
+       movups  ($inp),$in0                     # load inp
+       paddq   $increment,$iv
+       lea     16($inp),$inp                   # $inp+=16
+       sub     %r10,%rax                       # twisted $rounds
+       lea     32($key_,$rnds_),$key           # end of key schedule
+       mov     %rax,%r10
+       jmp     .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+       xorps   $inout0,$in0                    # inp ^= E(iv)
+       movdqa  $iv,$inout0
+       movups  $in0,($out)                     # save output
+       lea     16($out),$out                   # $out+=16
+       pshufb  $bswap_mask,$inout0
+
+       sub     \$1,$len                        # $len-- ($len is in blocks)
+       jz      .Lccm64_dec_break               # if ($len==0) break
+
+       $movkey ($key_),$rndkey0
+       mov     %r10,%rax
+       $movkey 16($key_),$rndkey1
+       xorps   $rndkey0,$in0
+       xorps   $rndkey0,$inout0
+       xorps   $in0,$inout1                    # cmac^=out
+       $movkey 32($key_),$rndkey0
+       jmp     .Lccm64_dec2_loop
+.align 16
+.Lccm64_dec2_loop:
+       aesenc  $rndkey1,$inout0
+       aesenc  $rndkey1,$inout1
+       $movkey ($key,%rax),$rndkey1
+       add     \$32,%rax
+       aesenc  $rndkey0,$inout0
+       aesenc  $rndkey0,$inout1
+       $movkey -16($key,%rax),$rndkey0
+       jnz     .Lccm64_dec2_loop
+       movups  ($inp),$in0                     # load input
+       paddq   $increment,$iv
+       aesenc  $rndkey1,$inout0
+       aesenc  $rndkey1,$inout1
+       aesenclast      $rndkey0,$inout0
+       aesenclast      $rndkey0,$inout1
+       lea     16($inp),$inp                   # $inp+=16
+       jmp     .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+       #xorps  $in0,$inout1                    # cmac^=out
+       mov     240($key_),$rounds
+___
+       &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+        pxor   $rndkey0,$rndkey0               # clear register bank
+        pxor   $rndkey1,$rndkey1
+        pxor   $inout0,$inout0
+       movups  $inout1,($cmac)                 # store resulting mac
+        pxor   $inout1,$inout1
+        pxor   $in0,$in0
+        pxor   $iv,$iv
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  %xmm0,(%rsp)                    # clear stack
+       movaps  0x10(%rsp),%xmm7
+       movaps  %xmm0,0x10(%rsp)
+       movaps  0x20(%rsp),%xmm8
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm9
+       movaps  %xmm0,0x30(%rsp)
+       lea     0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+       ret
+.size  aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}\f
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
+#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
+{
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("%ebp","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type  aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+.cfi_startproc
+       cmp     \$1,$len
+       jne     .Lctr32_bulk
+
+       # handle single block without allocating stack frame,
+       # useful when handling edges
+       movups  ($ivp),$inout0
+       movups  ($inp),$inout1
+       mov     240($key),%edx                  # key->rounds
+___
+       &aesni_generate1("enc",$key,"%edx");
+$code.=<<___;
+        pxor   $rndkey0,$rndkey0               # clear register bank
+        pxor   $rndkey1,$rndkey1
+       xorps   $inout1,$inout0
+        pxor   $inout1,$inout1
+       movups  $inout0,($out)
+        xorps  $inout0,$inout0
+       jmp     .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+       lea     (%rsp),$key_                    # use $key_ as frame pointer
+.cfi_def_cfa_register  $key_
+       push    %rbp
+.cfi_push      %rbp
+       sub     \$$frame_size,%rsp
+       and     \$-16,%rsp      # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,-0xa8($key_)              # offload everything
+       movaps  %xmm7,-0x98($key_)
+       movaps  %xmm8,-0x88($key_)
+       movaps  %xmm9,-0x78($key_)
+       movaps  %xmm10,-0x68($key_)
+       movaps  %xmm11,-0x58($key_)
+       movaps  %xmm12,-0x48($key_)
+       movaps  %xmm13,-0x38($key_)
+       movaps  %xmm14,-0x28($key_)
+       movaps  %xmm15,-0x18($key_)
+.Lctr32_body:
+___
+$code.=<<___;
+
+       # 8 16-byte words on top of stack are counter values
+       # xor-ed with zero-round key
+
+       movdqu  ($ivp),$inout0
+       movdqu  ($key),$rndkey0
+       mov     12($ivp),$ctr                   # counter LSB
+       pxor    $rndkey0,$inout0
+       mov     12($key),$key0                  # 0-round key LSB
+       movdqa  $inout0,0x00(%rsp)              # populate counter block
+       bswap   $ctr
+       movdqa  $inout0,$inout1
+       movdqa  $inout0,$inout2
+       movdqa  $inout0,$inout3
+       movdqa  $inout0,0x40(%rsp)
+       movdqa  $inout0,0x50(%rsp)
+       movdqa  $inout0,0x60(%rsp)
+       mov     %rdx,%r10                       # about to borrow %rdx
+       movdqa  $inout0,0x70(%rsp)
+
+       lea     1($ctr),%rax
+        lea    2($ctr),%rdx
+       bswap   %eax
+        bswap  %edx
+       xor     $key0,%eax
+        xor    $key0,%edx
+       pinsrd  \$3,%eax,$inout1
+       lea     3($ctr),%rax
+       movdqa  $inout1,0x10(%rsp)
+        pinsrd \$3,%edx,$inout2
+       bswap   %eax
+        mov    %r10,%rdx                       # restore %rdx
+        lea    4($ctr),%r10
+        movdqa $inout2,0x20(%rsp)
+       xor     $key0,%eax
+        bswap  %r10d
+       pinsrd  \$3,%eax,$inout3
+        xor    $key0,%r10d
+       movdqa  $inout3,0x30(%rsp)
+       lea     5($ctr),%r9
+        mov    %r10d,0x40+12(%rsp)
+       bswap   %r9d
+        lea    6($ctr),%r10
+       mov     240($key),$rounds               # key->rounds
+       xor     $key0,%r9d
+        bswap  %r10d
+       mov     %r9d,0x50+12(%rsp)
+        xor    $key0,%r10d
+       lea     7($ctr),%r9
+        mov    %r10d,0x60+12(%rsp)
+       bswap   %r9d
+        mov    OPENSSL_ia32cap_P+4(%rip),%r10d
+       xor     $key0,%r9d
+        and    \$`1<<26|1<<22`,%r10d           # isolate XSAVE+MOVBE
+       mov     %r9d,0x70+12(%rsp)
+
+       $movkey 0x10($key),$rndkey1
+
+       movdqa  0x40(%rsp),$inout4
+       movdqa  0x50(%rsp),$inout5
+
+       cmp     \$8,$len                # $len is in blocks
+       jb      .Lctr32_tail            # short input if ($len<8)
+
+       sub     \$6,$len                # $len is biased by -6
+       cmp     \$`1<<22`,%r10d         # check for MOVBE without XSAVE
+       je      .Lctr32_6x              # [which denotes Atom Silvermont]
+
+       lea     0x80($key),$key         # size optimization
+       sub     \$2,$len                # $len is biased by -8
+       jmp     .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+       shl     \$4,$rounds
+       mov     \$48,$rnds_
+       bswap   $key0
+       lea     32($key,$rounds),$key   # end of key schedule
+       sub     %rax,%r10               # twisted $rounds
+       jmp     .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+        add    \$6,$ctr                # next counter value
+       $movkey -48($key,$rnds_),$rndkey0
+       aesenc  $rndkey1,$inout0
+        mov    $ctr,%eax
+        xor    $key0,%eax
+       aesenc  $rndkey1,$inout1
+        movbe  %eax,`0x00+12`(%rsp)    # store next counter value
+        lea    1($ctr),%eax
+       aesenc  $rndkey1,$inout2
+        xor    $key0,%eax
+        movbe  %eax,`0x10+12`(%rsp)
+       aesenc  $rndkey1,$inout3
+        lea    2($ctr),%eax
+        xor    $key0,%eax
+       aesenc  $rndkey1,$inout4
+        movbe  %eax,`0x20+12`(%rsp)
+        lea    3($ctr),%eax
+       aesenc  $rndkey1,$inout5
+       $movkey -32($key,$rnds_),$rndkey1
+        xor    $key0,%eax
+
+       aesenc  $rndkey0,$inout0
+        movbe  %eax,`0x30+12`(%rsp)
+        lea    4($ctr),%eax
+       aesenc  $rndkey0,$inout1
+        xor    $key0,%eax
+        movbe  %eax,`0x40+12`(%rsp)
+       aesenc  $rndkey0,$inout2
+        lea    5($ctr),%eax
+        xor    $key0,%eax
+       aesenc  $rndkey0,$inout3
+        movbe  %eax,`0x50+12`(%rsp)
+        mov    %r10,%rax               # mov   $rnds_,$rounds
+       aesenc  $rndkey0,$inout4
+       aesenc  $rndkey0,$inout5
+       $movkey -16($key,$rnds_),$rndkey0
+
+       call    .Lenc_loop6
+
+       movdqu  ($inp),$inout6          # load 6 input blocks
+       movdqu  0x10($inp),$inout7
+       movdqu  0x20($inp),$in0
+       movdqu  0x30($inp),$in1
+       movdqu  0x40($inp),$in2
+       movdqu  0x50($inp),$in3
+       lea     0x60($inp),$inp         # $inp+=6*16
+       $movkey -64($key,$rnds_),$rndkey1
+       pxor    $inout0,$inout6         # inp^=E(ctr)
+       movaps  0x00(%rsp),$inout0      # load next counter [xor-ed with 0 round]
+       pxor    $inout1,$inout7
+       movaps  0x10(%rsp),$inout1
+       pxor    $inout2,$in0
+       movaps  0x20(%rsp),$inout2
+       pxor    $inout3,$in1
+       movaps  0x30(%rsp),$inout3
+       pxor    $inout4,$in2
+       movaps  0x40(%rsp),$inout4
+       pxor    $inout5,$in3
+       movaps  0x50(%rsp),$inout5
+       movdqu  $inout6,($out)          # store 6 output blocks
+       movdqu  $inout7,0x10($out)
+       movdqu  $in0,0x20($out)
+       movdqu  $in1,0x30($out)
+       movdqu  $in2,0x40($out)
+       movdqu  $in3,0x50($out)
+       lea     0x60($out),$out         # $out+=6*16
+
+       sub     \$6,$len
+       jnc     .Lctr32_loop6           # loop if $len-=6 didn't borrow
+
+       add     \$6,$len                # restore real remaining $len
+       jz      .Lctr32_done            # done if ($len==0)
+
+       lea     -48($rnds_),$rounds
+       lea     -80($key,$rnds_),$key   # restore $key
+       neg     $rounds
+       shr     \$4,$rounds             # restore $rounds
+       jmp     .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+        add            \$8,$ctr                # next counter value
+       movdqa          0x60(%rsp),$inout6
+       aesenc          $rndkey1,$inout0
+        mov            $ctr,%r9d
+       movdqa          0x70(%rsp),$inout7
+       aesenc          $rndkey1,$inout1
+        bswap          %r9d
+       $movkey         0x20-0x80($key),$rndkey0
+       aesenc          $rndkey1,$inout2
+        xor            $key0,%r9d
+        nop
+       aesenc          $rndkey1,$inout3
+        mov            %r9d,0x00+12(%rsp)      # store next counter value
+        lea            1($ctr),%r9
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       aesenc          $rndkey1,$inout6
+       aesenc          $rndkey1,$inout7
+       $movkey         0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+        bswap          %r9d
+       aesenc          $rndkeyx,$inout0
+       aesenc          $rndkeyx,$inout1
+        xor            $key0,%r9d
+        .byte          0x66,0x90
+       aesenc          $rndkeyx,$inout2
+       aesenc          $rndkeyx,$inout3
+        mov            %r9d,`0x10*($i-1)`+12(%rsp)
+        lea            $i($ctr),%r9
+       aesenc          $rndkeyx,$inout4
+       aesenc          $rndkeyx,$inout5
+       aesenc          $rndkeyx,$inout6
+       aesenc          $rndkeyx,$inout7
+       $movkey         `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+        bswap          %r9d
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+        xor            $key0,%r9d
+        movdqu         0x00($inp),$in0         # start loading input
+       aesenc          $rndkey0,$inout3
+        mov            %r9d,0x70+12(%rsp)
+        cmp            \$11,$rounds
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       aesenc          $rndkey0,$inout6
+       aesenc          $rndkey0,$inout7
+       $movkey         0xa0-0x80($key),$rndkey0
+
+       jb              .Lctr32_enc_done
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       aesenc          $rndkey1,$inout6
+       aesenc          $rndkey1,$inout7
+       $movkey         0xb0-0x80($key),$rndkey1
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       aesenc          $rndkey0,$inout6
+       aesenc          $rndkey0,$inout7
+       $movkey         0xc0-0x80($key),$rndkey0
+       je              .Lctr32_enc_done
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       aesenc          $rndkey1,$inout6
+       aesenc          $rndkey1,$inout7
+       $movkey         0xd0-0x80($key),$rndkey1
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       aesenc          $rndkey0,$inout6
+       aesenc          $rndkey0,$inout7
+       $movkey         0xe0-0x80($key),$rndkey0
+       jmp             .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+       movdqu          0x10($inp),$in1
+       pxor            $rndkey0,$in0           # input^=round[last]
+       movdqu          0x20($inp),$in2
+       pxor            $rndkey0,$in1
+       movdqu          0x30($inp),$in3
+       pxor            $rndkey0,$in2
+       movdqu          0x40($inp),$in4
+       pxor            $rndkey0,$in3
+       movdqu          0x50($inp),$in5
+       pxor            $rndkey0,$in4
+       pxor            $rndkey0,$in5
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       aesenc          $rndkey1,$inout6
+       aesenc          $rndkey1,$inout7
+       movdqu          0x60($inp),$rndkey1     # borrow $rndkey1 for inp[6]
+       lea             0x80($inp),$inp         # $inp+=8*16
+
+       aesenclast      $in0,$inout0            # $inN is inp[N]^round[last]
+       pxor            $rndkey0,$rndkey1       # borrowed $rndkey
+       movdqu          0x70-0x80($inp),$in0
+       aesenclast      $in1,$inout1
+       pxor            $rndkey0,$in0
+       movdqa          0x00(%rsp),$in1         # load next counter block
+       aesenclast      $in2,$inout2
+       aesenclast      $in3,$inout3
+       movdqa          0x10(%rsp),$in2
+       movdqa          0x20(%rsp),$in3
+       aesenclast      $in4,$inout4
+       aesenclast      $in5,$inout5
+       movdqa          0x30(%rsp),$in4
+       movdqa          0x40(%rsp),$in5
+       aesenclast      $rndkey1,$inout6
+       movdqa          0x50(%rsp),$rndkey0
+       $movkey         0x10-0x80($key),$rndkey1#real 1st-round key
+       aesenclast      $in0,$inout7
+
+       movups          $inout0,($out)          # store 8 output blocks
+       movdqa          $in1,$inout0
+       movups          $inout1,0x10($out)
+       movdqa          $in2,$inout1
+       movups          $inout2,0x20($out)
+       movdqa          $in3,$inout2
+       movups          $inout3,0x30($out)
+       movdqa          $in4,$inout3
+       movups          $inout4,0x40($out)
+       movdqa          $in5,$inout4
+       movups          $inout5,0x50($out)
+       movdqa          $rndkey0,$inout5
+       movups          $inout6,0x60($out)
+       movups          $inout7,0x70($out)
+       lea             0x80($out),$out         # $out+=8*16
+
+       sub     \$8,$len
+       jnc     .Lctr32_loop8                   # loop if $len-=8 didn't borrow
+
+       add     \$8,$len                        # restore real remaining $len
+       jz      .Lctr32_done                    # done if ($len==0)
+       lea     -0x80($key),$key
+
+.Lctr32_tail:
+       # note that at this point $inout0..5 are populated with
+       # counter values xor-ed with 0-round key
+       lea     16($key),$key
+       cmp     \$4,$len
+       jb      .Lctr32_loop3
+       je      .Lctr32_loop4
+
+       # if ($len>4) compute 7 E(counter)
+       shl             \$4,$rounds
+       movdqa          0x60(%rsp),$inout6
+       pxor            $inout7,$inout7
+
+       $movkey         16($key),$rndkey0
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       lea             32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
+       neg             %rax
+       aesenc          $rndkey1,$inout2
+       add             \$16,%rax               # prepare for .Lenc_loop8_enter
+        movups         ($inp),$in0
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+        movups         0x10($inp),$in1         # pre-load input
+        movups         0x20($inp),$in2
+       aesenc          $rndkey1,$inout5
+       aesenc          $rndkey1,$inout6
+
+       call            .Lenc_loop8_enter
+
+       movdqu  0x30($inp),$in3
+       pxor    $in0,$inout0
+       movdqu  0x40($inp),$in0
+       pxor    $in1,$inout1
+       movdqu  $inout0,($out)                  # store output
+       pxor    $in2,$inout2
+       movdqu  $inout1,0x10($out)
+       pxor    $in3,$inout3
+       movdqu  $inout2,0x20($out)
+       pxor    $in0,$inout4
+       movdqu  $inout3,0x30($out)
+       movdqu  $inout4,0x40($out)
+       cmp     \$6,$len
+       jb      .Lctr32_done                    # $len was 5, stop store
+
+       movups  0x50($inp),$in1
+       xorps   $in1,$inout5
+       movups  $inout5,0x50($out)
+       je      .Lctr32_done                    # $len was 6, stop store
+
+       movups  0x60($inp),$in2
+       xorps   $in2,$inout6
+       movups  $inout6,0x60($out)
+       jmp     .Lctr32_done                    # $len was 7, stop store
+
+.align 32
+.Lctr32_loop4:
+       aesenc          $rndkey1,$inout0
+       lea             16($key),$key
+       dec             $rounds
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       $movkey         ($key),$rndkey1
+       jnz             .Lctr32_loop4
+       aesenclast      $rndkey1,$inout0
+       aesenclast      $rndkey1,$inout1
+        movups         ($inp),$in0             # load input
+        movups         0x10($inp),$in1
+       aesenclast      $rndkey1,$inout2
+       aesenclast      $rndkey1,$inout3
+        movups         0x20($inp),$in2
+        movups         0x30($inp),$in3
+
+       xorps   $in0,$inout0
+       movups  $inout0,($out)                  # store output
+       xorps   $in1,$inout1
+       movups  $inout1,0x10($out)
+       pxor    $in2,$inout2
+       movdqu  $inout2,0x20($out)
+       pxor    $in3,$inout3
+       movdqu  $inout3,0x30($out)
+       jmp     .Lctr32_done                    # $len was 4, stop store
+
+.align 32
+.Lctr32_loop3:
+       aesenc          $rndkey1,$inout0
+       lea             16($key),$key
+       dec             $rounds
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       $movkey         ($key),$rndkey1
+       jnz             .Lctr32_loop3
+       aesenclast      $rndkey1,$inout0
+       aesenclast      $rndkey1,$inout1
+       aesenclast      $rndkey1,$inout2
+
+       movups  ($inp),$in0                     # load input
+       xorps   $in0,$inout0
+       movups  $inout0,($out)                  # store output
+       cmp     \$2,$len
+       jb      .Lctr32_done                    # $len was 1, stop store
+
+       movups  0x10($inp),$in1
+       xorps   $in1,$inout1
+       movups  $inout1,0x10($out)
+       je      .Lctr32_done                    # $len was 2, stop store
+
+       movups  0x20($inp),$in2
+       xorps   $in2,$inout2
+       movups  $inout2,0x20($out)              # $len was 3, stop store
+
+.Lctr32_done:
+       xorps   %xmm0,%xmm0                     # clear register bank
+       xor     $key0,$key0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0x00(%rsp)                # clear stack
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,0x10(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,0x20(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,0x30(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,0x40(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,0x50(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,0x60(%rsp)
+       pxor    %xmm14,%xmm14
+       movaps  %xmm0,0x70(%rsp)
+       pxor    %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+       movaps  -0xa8($key_),%xmm6
+       movaps  %xmm0,-0xa8($key_)              # clear stack
+       movaps  -0x98($key_),%xmm7
+       movaps  %xmm0,-0x98($key_)
+       movaps  -0x88($key_),%xmm8
+       movaps  %xmm0,-0x88($key_)
+       movaps  -0x78($key_),%xmm9
+       movaps  %xmm0,-0x78($key_)
+       movaps  -0x68($key_),%xmm10
+       movaps  %xmm0,-0x68($key_)
+       movaps  -0x58($key_),%xmm11
+       movaps  %xmm0,-0x58($key_)
+       movaps  -0x48($key_),%xmm12
+       movaps  %xmm0,-0x48($key_)
+       movaps  -0x38($key_),%xmm13
+       movaps  %xmm0,-0x38($key_)
+       movaps  -0x28($key_),%xmm14
+       movaps  %xmm0,-0x28($key_)
+       movaps  -0x18($key_),%xmm15
+       movaps  %xmm0,-0x18($key_)
+       movaps  %xmm0,0x00(%rsp)
+       movaps  %xmm0,0x10(%rsp)
+       movaps  %xmm0,0x20(%rsp)
+       movaps  %xmm0,0x30(%rsp)
+       movaps  %xmm0,0x40(%rsp)
+       movaps  %xmm0,0x50(%rsp)
+       movaps  %xmm0,0x60(%rsp)
+       movaps  %xmm0,0x70(%rsp)
+___
+$code.=<<___;
+       mov     -8($key_),%rbp
+.cfi_restore   %rbp
+       lea     ($key_),%rsp
+.cfi_def_cfa_register  %rsp
+.Lctr32_epilogue:
+       ret
+.cfi_endproc
+.size  aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+\f
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#      const AES_KEY *key1, const AES_KEY *key2
+#      const unsigned char iv[16]);
+#
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x70 + ($win64?160:0);
+my $key_ = "%rbp";     # override so that we can use %r11 as FP
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type  aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+.cfi_startproc
+       lea     (%rsp),%r11                     # frame pointer
+.cfi_def_cfa_register  %r11
+       push    %rbp
+.cfi_push      %rbp
+       sub     \$$frame_size,%rsp
+       and     \$-16,%rsp      # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,-0xa8(%r11)               # offload everything
+       movaps  %xmm7,-0x98(%r11)
+       movaps  %xmm8,-0x88(%r11)
+       movaps  %xmm9,-0x78(%r11)
+       movaps  %xmm10,-0x68(%r11)
+       movaps  %xmm11,-0x58(%r11)
+       movaps  %xmm12,-0x48(%r11)
+       movaps  %xmm13,-0x38(%r11)
+       movaps  %xmm14,-0x28(%r11)
+       movaps  %xmm15,-0x18(%r11)
+.Lxts_enc_body:
+___
+$code.=<<___;
+       movups  ($ivp),$inout0                  # load clear-text tweak
+       mov     240(%r8),$rounds                # key2->rounds
+       mov     240($key),$rnds_                # key1->rounds
+___
+       # generate the tweak
+       &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+       $movkey ($key),$rndkey0                 # zero round key
+       mov     $key,$key_                      # backup $key
+       mov     $rnds_,$rounds                  # backup $rounds
+       shl     \$4,$rnds_
+       mov     $len,$len_                      # backup $len
+       and     \$-16,$len
+
+       $movkey 16($key,$rnds_),$rndkey1        # last round key
+
+       movdqa  .Lxts_magic(%rip),$twmask
+       movdqa  $inout0,@tweak[5]
+       pshufd  \$0x5f,$inout0,$twres
+       pxor    $rndkey0,$rndkey1
+___
+    # alternative tweak calculation algorithm is based on suggestions
+    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+    # and should help in the future...
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+       movdqa  $twres,$twtmp
+       paddd   $twres,$twres
+       movdqa  @tweak[5],@tweak[$i]
+       psrad   \$31,$twtmp                     # broadcast upper bits
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twtmp
+       pxor    $rndkey0,@tweak[$i]
+       pxor    $twtmp,@tweak[5]
+___
+    }
+$code.=<<___;
+       movdqa  @tweak[5],@tweak[4]
+       psrad   \$31,$twres
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twres
+       pxor    $rndkey0,@tweak[4]
+       pxor    $twres,@tweak[5]
+       movaps  $rndkey1,0x60(%rsp)             # save round[0]^round[last]
+
+       sub     \$16*6,$len
+       jc      .Lxts_enc_short                 # if $len-=6*16 borrowed
+
+       mov     \$16+96,$rounds
+       lea     32($key_,$rnds_),$key           # end of key schedule
+       sub     %r10,%rax                       # twisted $rounds
+       $movkey 16($key_),$rndkey1
+       mov     %rax,%r10                       # backup twisted $rounds
+       lea     .Lxts_magic(%rip),%r8
+       jmp     .Lxts_enc_grandloop
+
+.align 32
+.Lxts_enc_grandloop:
+       movdqu  `16*0`($inp),$inout0            # load input
+       movdqa  $rndkey0,$twmask
+       movdqu  `16*1`($inp),$inout1
+       pxor    @tweak[0],$inout0               # input^=tweak^round[0]
+       movdqu  `16*2`($inp),$inout2
+       pxor    @tweak[1],$inout1
+        aesenc         $rndkey1,$inout0
+       movdqu  `16*3`($inp),$inout3
+       pxor    @tweak[2],$inout2
+        aesenc         $rndkey1,$inout1
+       movdqu  `16*4`($inp),$inout4
+       pxor    @tweak[3],$inout3
+        aesenc         $rndkey1,$inout2
+       movdqu  `16*5`($inp),$inout5
+       pxor    @tweak[5],$twmask               # round[0]^=tweak[5]
+        movdqa 0x60(%rsp),$twres               # load round[0]^round[last]
+       pxor    @tweak[4],$inout4
+        aesenc         $rndkey1,$inout3
+       $movkey 32($key_),$rndkey0
+       lea     `16*6`($inp),$inp
+       pxor    $twmask,$inout5
+
+        pxor   $twres,@tweak[0]                # calculate tweaks^round[last]
+       aesenc          $rndkey1,$inout4
+        pxor   $twres,@tweak[1]
+        movdqa @tweak[0],`16*0`(%rsp)          # put aside tweaks^round[last]
+       aesenc          $rndkey1,$inout5
+       $movkey         48($key_),$rndkey1
+        pxor   $twres,@tweak[2]
+
+       aesenc          $rndkey0,$inout0
+        pxor   $twres,@tweak[3]
+        movdqa @tweak[1],`16*1`(%rsp)
+       aesenc          $rndkey0,$inout1
+        pxor   $twres,@tweak[4]
+        movdqa @tweak[2],`16*2`(%rsp)
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+        pxor   $twres,$twmask
+        movdqa @tweak[4],`16*4`(%rsp)
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         64($key_),$rndkey0
+        movdqa $twmask,`16*5`(%rsp)
+       pshufd  \$0x5f,@tweak[5],$twres
+       jmp     .Lxts_enc_loop6
+.align 32
+.Lxts_enc_loop6:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       $movkey         -64($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         -80($key,%rax),$rndkey0
+       jnz             .Lxts_enc_loop6
+
+       movdqa  (%r8),$twmask                   # start calculating next tweak
+       movdqa  $twres,$twtmp
+       paddd   $twres,$twres
+        aesenc         $rndkey1,$inout0
+       paddq   @tweak[5],@tweak[5]
+       psrad   \$31,$twtmp
+        aesenc         $rndkey1,$inout1
+       pand    $twmask,$twtmp
+       $movkey ($key_),@tweak[0]               # load round[0]
+        aesenc         $rndkey1,$inout2
+        aesenc         $rndkey1,$inout3
+        aesenc         $rndkey1,$inout4
+       pxor    $twtmp,@tweak[5]
+       movaps  @tweak[0],@tweak[1]             # copy round[0]
+        aesenc         $rndkey1,$inout5
+        $movkey        -64($key),$rndkey1
+
+       movdqa  $twres,$twtmp
+        aesenc         $rndkey0,$inout0
+       paddd   $twres,$twres
+       pxor    @tweak[5],@tweak[0]
+        aesenc         $rndkey0,$inout1
+       psrad   \$31,$twtmp
+       paddq   @tweak[5],@tweak[5]
+        aesenc         $rndkey0,$inout2
+        aesenc         $rndkey0,$inout3
+       pand    $twmask,$twtmp
+       movaps  @tweak[1],@tweak[2]
+        aesenc         $rndkey0,$inout4
+       pxor    $twtmp,@tweak[5]
+       movdqa  $twres,$twtmp
+        aesenc         $rndkey0,$inout5
+        $movkey        -48($key),$rndkey0
+
+       paddd   $twres,$twres
+        aesenc         $rndkey1,$inout0
+       pxor    @tweak[5],@tweak[1]
+       psrad   \$31,$twtmp
+        aesenc         $rndkey1,$inout1
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twtmp
+        aesenc         $rndkey1,$inout2
+        aesenc         $rndkey1,$inout3
+        movdqa @tweak[3],`16*3`(%rsp)
+       pxor    $twtmp,@tweak[5]
+        aesenc         $rndkey1,$inout4
+       movaps  @tweak[2],@tweak[3]
+       movdqa  $twres,$twtmp
+        aesenc         $rndkey1,$inout5
+        $movkey        -32($key),$rndkey1
+
+       paddd   $twres,$twres
+        aesenc         $rndkey0,$inout0
+       pxor    @tweak[5],@tweak[2]
+       psrad   \$31,$twtmp
+        aesenc         $rndkey0,$inout1
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twtmp
+        aesenc         $rndkey0,$inout2
+        aesenc         $rndkey0,$inout3
+        aesenc         $rndkey0,$inout4
+       pxor    $twtmp,@tweak[5]
+       movaps  @tweak[3],@tweak[4]
+        aesenc         $rndkey0,$inout5
+
+       movdqa  $twres,$rndkey0
+       paddd   $twres,$twres
+        aesenc         $rndkey1,$inout0
+       pxor    @tweak[5],@tweak[3]
+       psrad   \$31,$rndkey0
+        aesenc         $rndkey1,$inout1
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$rndkey0
+        aesenc         $rndkey1,$inout2
+        aesenc         $rndkey1,$inout3
+       pxor    $rndkey0,@tweak[5]
+       $movkey         ($key_),$rndkey0
+        aesenc         $rndkey1,$inout4
+        aesenc         $rndkey1,$inout5
+       $movkey         16($key_),$rndkey1
+
+       pxor    @tweak[5],@tweak[4]
+        aesenclast     `16*0`(%rsp),$inout0
+       psrad   \$31,$twres
+       paddq   @tweak[5],@tweak[5]
+        aesenclast     `16*1`(%rsp),$inout1
+        aesenclast     `16*2`(%rsp),$inout2
+       pand    $twmask,$twres
+       mov     %r10,%rax                       # restore $rounds
+        aesenclast     `16*3`(%rsp),$inout3
+        aesenclast     `16*4`(%rsp),$inout4
+        aesenclast     `16*5`(%rsp),$inout5
+       pxor    $twres,@tweak[5]
+
+       lea     `16*6`($out),$out               # $out+=6*16
+       movups  $inout0,`-16*6`($out)           # store 6 output blocks
+       movups  $inout1,`-16*5`($out)
+       movups  $inout2,`-16*4`($out)
+       movups  $inout3,`-16*3`($out)
+       movups  $inout4,`-16*2`($out)
+       movups  $inout5,`-16*1`($out)
+       sub     \$16*6,$len
+       jnc     .Lxts_enc_grandloop             # loop if $len-=6*16 didn't borrow
+
+       mov     \$16+96,$rounds
+       sub     $rnds_,$rounds
+       mov     $key_,$key                      # restore $key
+       shr     \$4,$rounds                     # restore original value
+
+.Lxts_enc_short:
+       # at the point @tweak[0..5] are populated with tweak values
+       mov     $rounds,$rnds_                  # backup $rounds
+       pxor    $rndkey0,@tweak[0]
+       add     \$16*6,$len                     # restore real remaining $len
+       jz      .Lxts_enc_done                  # done if ($len==0)
+
+       pxor    $rndkey0,@tweak[1]
+       cmp     \$0x20,$len
+       jb      .Lxts_enc_one                   # $len is 1*16
+       pxor    $rndkey0,@tweak[2]
+       je      .Lxts_enc_two                   # $len is 2*16
+
+       pxor    $rndkey0,@tweak[3]
+       cmp     \$0x40,$len
+       jb      .Lxts_enc_three                 # $len is 3*16
+       pxor    $rndkey0,@tweak[4]
+       je      .Lxts_enc_four                  # $len is 4*16
+
+       movdqu  ($inp),$inout0                  # $len is 5*16
+       movdqu  16*1($inp),$inout1
+       movdqu  16*2($inp),$inout2
+       pxor    @tweak[0],$inout0
+       movdqu  16*3($inp),$inout3
+       pxor    @tweak[1],$inout1
+       movdqu  16*4($inp),$inout4
+       lea     16*5($inp),$inp                 # $inp+=5*16
+       pxor    @tweak[2],$inout2
+       pxor    @tweak[3],$inout3
+       pxor    @tweak[4],$inout4
+       pxor    $inout5,$inout5
+
+       call    _aesni_encrypt6
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[5],@tweak[0]
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       movdqu  $inout0,($out)                  # store 5 output blocks
+       xorps   @tweak[3],$inout3
+       movdqu  $inout1,16*1($out)
+       xorps   @tweak[4],$inout4
+       movdqu  $inout2,16*2($out)
+       movdqu  $inout3,16*3($out)
+       movdqu  $inout4,16*4($out)
+       lea     16*5($out),$out                 # $out+=5*16
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+       movups  ($inp),$inout0
+       lea     16*1($inp),$inp                 # inp+=1*16
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[1],@tweak[0]
+       movups  $inout0,($out)                  # store one output block
+       lea     16*1($out),$out                 # $out+=1*16
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+       movups  ($inp),$inout0
+       movups  16($inp),$inout1
+       lea     32($inp),$inp                   # $inp+=2*16
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+
+       call    _aesni_encrypt2
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[2],@tweak[0]
+       xorps   @tweak[1],$inout1
+       movups  $inout0,($out)                  # store 2 output blocks
+       movups  $inout1,16*1($out)
+       lea     16*2($out),$out                 # $out+=2*16
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+       movups  ($inp),$inout0
+       movups  16*1($inp),$inout1
+       movups  16*2($inp),$inout2
+       lea     16*3($inp),$inp                 # $inp+=3*16
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+
+       call    _aesni_encrypt3
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[3],@tweak[0]
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       movups  $inout0,($out)                  # store 3 output blocks
+       movups  $inout1,16*1($out)
+       movups  $inout2,16*2($out)
+       lea     16*3($out),$out                 # $out+=3*16
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+       movups  ($inp),$inout0
+       movups  16*1($inp),$inout1
+       movups  16*2($inp),$inout2
+       xorps   @tweak[0],$inout0
+       movups  16*3($inp),$inout3
+       lea     16*4($inp),$inp                 # $inp+=4*16
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       xorps   @tweak[3],$inout3
+
+       call    _aesni_encrypt4
+
+       pxor    @tweak[0],$inout0
+       movdqa  @tweak[4],@tweak[0]
+       pxor    @tweak[1],$inout1
+       pxor    @tweak[2],$inout2
+       movdqu  $inout0,($out)                  # store 4 output blocks
+       pxor    @tweak[3],$inout3
+       movdqu  $inout1,16*1($out)
+       movdqu  $inout2,16*2($out)
+       movdqu  $inout3,16*3($out)
+       lea     16*4($out),$out                 # $out+=4*16
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+       and     \$15,$len_                      # see if $len%16 is 0
+       jz      .Lxts_enc_ret
+       mov     $len_,$len
+
+.Lxts_enc_steal:
+       movzb   ($inp),%eax                     # borrow $rounds ...
+       movzb   -16($out),%ecx                  # ... and $key
+       lea     1($inp),$inp
+       mov     %al,-16($out)
+       mov     %cl,0($out)
+       lea     1($out),$out
+       sub     \$1,$len
+       jnz     .Lxts_enc_steal
+
+       sub     $len_,$out                      # rewind $out
+       mov     $key_,$key                      # restore $key
+       mov     $rnds_,$rounds                  # restore $rounds
+
+       movups  -16($out),$inout0
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movups  $inout0,-16($out)
+
+.Lxts_enc_ret:
+       xorps   %xmm0,%xmm0                     # clear register bank
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0x00(%rsp)                # clear stack
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,0x10(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,0x20(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,0x30(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,0x40(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,0x50(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,0x60(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+       movaps  -0xa8(%r11),%xmm6
+       movaps  %xmm0,-0xa8(%r11)               # clear stack
+       movaps  -0x98(%r11),%xmm7
+       movaps  %xmm0,-0x98(%r11)
+       movaps  -0x88(%r11),%xmm8
+       movaps  %xmm0,-0x88(%r11)
+       movaps  -0x78(%r11),%xmm9
+       movaps  %xmm0,-0x78(%r11)
+       movaps  -0x68(%r11),%xmm10
+       movaps  %xmm0,-0x68(%r11)
+       movaps  -0x58(%r11),%xmm11
+       movaps  %xmm0,-0x58(%r11)
+       movaps  -0x48(%r11),%xmm12
+       movaps  %xmm0,-0x48(%r11)
+       movaps  -0x38(%r11),%xmm13
+       movaps  %xmm0,-0x38(%r11)
+       movaps  -0x28(%r11),%xmm14
+       movaps  %xmm0,-0x28(%r11)
+       movaps  -0x18(%r11),%xmm15
+       movaps  %xmm0,-0x18(%r11)
+       movaps  %xmm0,0x00(%rsp)
+       movaps  %xmm0,0x10(%rsp)
+       movaps  %xmm0,0x20(%rsp)
+       movaps  %xmm0,0x30(%rsp)
+       movaps  %xmm0,0x40(%rsp)
+       movaps  %xmm0,0x50(%rsp)
+       movaps  %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+       mov     -8(%r11),%rbp
+.cfi_restore   %rbp
+       lea     (%r11),%rsp
+.cfi_def_cfa_register  %rsp
+.Lxts_enc_epilogue:
+       ret
+.cfi_endproc
+.size  aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type  aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+.cfi_startproc
+       lea     (%rsp),%r11                     # frame pointer
+.cfi_def_cfa_register  %r11
+       push    %rbp
+.cfi_push      %rbp
+       sub     \$$frame_size,%rsp
+       and     \$-16,%rsp      # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,-0xa8(%r11)               # offload everything
+       movaps  %xmm7,-0x98(%r11)
+       movaps  %xmm8,-0x88(%r11)
+       movaps  %xmm9,-0x78(%r11)
+       movaps  %xmm10,-0x68(%r11)
+       movaps  %xmm11,-0x58(%r11)
+       movaps  %xmm12,-0x48(%r11)
+       movaps  %xmm13,-0x38(%r11)
+       movaps  %xmm14,-0x28(%r11)
+       movaps  %xmm15,-0x18(%r11)
+.Lxts_dec_body:
+___
+$code.=<<___;
+       movups  ($ivp),$inout0                  # load clear-text tweak
+       mov     240($key2),$rounds              # key2->rounds
+       mov     240($key),$rnds_                # key1->rounds
+___
+       # generate the tweak
+       &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+       xor     %eax,%eax                       # if ($len%16) len-=16;
+       test    \$15,$len
+       setnz   %al
+       shl     \$4,%rax
+       sub     %rax,$len
+
+       $movkey ($key),$rndkey0                 # zero round key
+       mov     $key,$key_                      # backup $key
+       mov     $rnds_,$rounds                  # backup $rounds
+       shl     \$4,$rnds_
+       mov     $len,$len_                      # backup $len
+       and     \$-16,$len
+
+       $movkey 16($key,$rnds_),$rndkey1        # last round key
+
+       movdqa  .Lxts_magic(%rip),$twmask
+       movdqa  $inout0,@tweak[5]
+       pshufd  \$0x5f,$inout0,$twres
+       pxor    $rndkey0,$rndkey1
+___
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+       movdqa  $twres,$twtmp
+       paddd   $twres,$twres
+       movdqa  @tweak[5],@tweak[$i]
+       psrad   \$31,$twtmp                     # broadcast upper bits
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twtmp
+       pxor    $rndkey0,@tweak[$i]
+       pxor    $twtmp,@tweak[5]
+___
+    }
+$code.=<<___;
+       movdqa  @tweak[5],@tweak[4]
+       psrad   \$31,$twres
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twres
+       pxor    $rndkey0,@tweak[4]
+       pxor    $twres,@tweak[5]
+       movaps  $rndkey1,0x60(%rsp)             # save round[0]^round[last]
+
+       sub     \$16*6,$len
+       jc      .Lxts_dec_short                 # if $len-=6*16 borrowed
+
+       mov     \$16+96,$rounds
+       lea     32($key_,$rnds_),$key           # end of key schedule
+       sub     %r10,%rax                       # twisted $rounds
+       $movkey 16($key_),$rndkey1
+       mov     %rax,%r10                       # backup twisted $rounds
+       lea     .Lxts_magic(%rip),%r8
+       jmp     .Lxts_dec_grandloop
+
+.align 32
+.Lxts_dec_grandloop:
+       movdqu  `16*0`($inp),$inout0            # load input
+       movdqa  $rndkey0,$twmask
+       movdqu  `16*1`($inp),$inout1
+       pxor    @tweak[0],$inout0               # intput^=tweak^round[0]
+       movdqu  `16*2`($inp),$inout2
+       pxor    @tweak[1],$inout1
+        aesdec         $rndkey1,$inout0
+       movdqu  `16*3`($inp),$inout3
+       pxor    @tweak[2],$inout2
+        aesdec         $rndkey1,$inout1
+       movdqu  `16*4`($inp),$inout4
+       pxor    @tweak[3],$inout3
+        aesdec         $rndkey1,$inout2
+       movdqu  `16*5`($inp),$inout5
+       pxor    @tweak[5],$twmask               # round[0]^=tweak[5]
+        movdqa 0x60(%rsp),$twres               # load round[0]^round[last]
+       pxor    @tweak[4],$inout4
+        aesdec         $rndkey1,$inout3
+       $movkey 32($key_),$rndkey0
+       lea     `16*6`($inp),$inp
+       pxor    $twmask,$inout5
+
+        pxor   $twres,@tweak[0]                # calculate tweaks^round[last]
+       aesdec          $rndkey1,$inout4
+        pxor   $twres,@tweak[1]
+        movdqa @tweak[0],`16*0`(%rsp)          # put aside tweaks^last round key
+       aesdec          $rndkey1,$inout5
+       $movkey         48($key_),$rndkey1
+        pxor   $twres,@tweak[2]
+
+       aesdec          $rndkey0,$inout0
+        pxor   $twres,@tweak[3]
+        movdqa @tweak[1],`16*1`(%rsp)
+       aesdec          $rndkey0,$inout1
+        pxor   $twres,@tweak[4]
+        movdqa @tweak[2],`16*2`(%rsp)
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+        pxor   $twres,$twmask
+        movdqa @tweak[4],`16*4`(%rsp)
+       aesdec          $rndkey0,$inout4
+       aesdec          $rndkey0,$inout5
+       $movkey         64($key_),$rndkey0
+        movdqa $twmask,`16*5`(%rsp)
+       pshufd  \$0x5f,@tweak[5],$twres
+       jmp     .Lxts_dec_loop6
+.align 32
+.Lxts_dec_loop6:
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+       $movkey         -64($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       aesdec          $rndkey0,$inout4
+       aesdec          $rndkey0,$inout5
+       $movkey         -80($key,%rax),$rndkey0
+       jnz             .Lxts_dec_loop6
+
+       movdqa  (%r8),$twmask                   # start calculating next tweak
+       movdqa  $twres,$twtmp
+       paddd   $twres,$twres
+        aesdec         $rndkey1,$inout0
+       paddq   @tweak[5],@tweak[5]
+       psrad   \$31,$twtmp
+        aesdec         $rndkey1,$inout1
+       pand    $twmask,$twtmp
+       $movkey ($key_),@tweak[0]               # load round[0]
+        aesdec         $rndkey1,$inout2
+        aesdec         $rndkey1,$inout3
+        aesdec         $rndkey1,$inout4
+       pxor    $twtmp,@tweak[5]
+       movaps  @tweak[0],@tweak[1]             # copy round[0]
+        aesdec         $rndkey1,$inout5
+        $movkey        -64($key),$rndkey1
+
+       movdqa  $twres,$twtmp
+        aesdec         $rndkey0,$inout0
+       paddd   $twres,$twres
+       pxor    @tweak[5],@tweak[0]
+        aesdec         $rndkey0,$inout1
+       psrad   \$31,$twtmp
+       paddq   @tweak[5],@tweak[5]
+        aesdec         $rndkey0,$inout2
+        aesdec         $rndkey0,$inout3
+       pand    $twmask,$twtmp
+       movaps  @tweak[1],@tweak[2]
+        aesdec         $rndkey0,$inout4
+       pxor    $twtmp,@tweak[5]
+       movdqa  $twres,$twtmp
+        aesdec         $rndkey0,$inout5
+        $movkey        -48($key),$rndkey0
+
+       paddd   $twres,$twres
+        aesdec         $rndkey1,$inout0
+       pxor    @tweak[5],@tweak[1]
+       psrad   \$31,$twtmp
+        aesdec         $rndkey1,$inout1
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twtmp
+        aesdec         $rndkey1,$inout2
+        aesdec         $rndkey1,$inout3
+        movdqa @tweak[3],`16*3`(%rsp)
+       pxor    $twtmp,@tweak[5]
+        aesdec         $rndkey1,$inout4
+       movaps  @tweak[2],@tweak[3]
+       movdqa  $twres,$twtmp
+        aesdec         $rndkey1,$inout5
+        $movkey        -32($key),$rndkey1
+
+       paddd   $twres,$twres
+        aesdec         $rndkey0,$inout0
+       pxor    @tweak[5],@tweak[2]
+       psrad   \$31,$twtmp
+        aesdec         $rndkey0,$inout1
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$twtmp
+        aesdec         $rndkey0,$inout2
+        aesdec         $rndkey0,$inout3
+        aesdec         $rndkey0,$inout4
+       pxor    $twtmp,@tweak[5]
+       movaps  @tweak[3],@tweak[4]
+        aesdec         $rndkey0,$inout5
+
+       movdqa  $twres,$rndkey0
+       paddd   $twres,$twres
+        aesdec         $rndkey1,$inout0
+       pxor    @tweak[5],@tweak[3]
+       psrad   \$31,$rndkey0
+        aesdec         $rndkey1,$inout1
+       paddq   @tweak[5],@tweak[5]
+       pand    $twmask,$rndkey0
+        aesdec         $rndkey1,$inout2
+        aesdec         $rndkey1,$inout3
+       pxor    $rndkey0,@tweak[5]
+       $movkey         ($key_),$rndkey0
+        aesdec         $rndkey1,$inout4
+        aesdec         $rndkey1,$inout5
+       $movkey         16($key_),$rndkey1
+
+       pxor    @tweak[5],@tweak[4]
+        aesdeclast     `16*0`(%rsp),$inout0
+       psrad   \$31,$twres
+       paddq   @tweak[5],@tweak[5]
+        aesdeclast     `16*1`(%rsp),$inout1
+        aesdeclast     `16*2`(%rsp),$inout2
+       pand    $twmask,$twres
+       mov     %r10,%rax                       # restore $rounds
+        aesdeclast     `16*3`(%rsp),$inout3
+        aesdeclast     `16*4`(%rsp),$inout4
+        aesdeclast     `16*5`(%rsp),$inout5
+       pxor    $twres,@tweak[5]
+
+       lea     `16*6`($out),$out               # $out+=6*16
+       movups  $inout0,`-16*6`($out)           # store 6 output blocks
+       movups  $inout1,`-16*5`($out)
+       movups  $inout2,`-16*4`($out)
+       movups  $inout3,`-16*3`($out)
+       movups  $inout4,`-16*2`($out)
+       movups  $inout5,`-16*1`($out)
+       sub     \$16*6,$len
+       jnc     .Lxts_dec_grandloop             # loop if $len-=6*16 didn't borrow
+
+       mov     \$16+96,$rounds
+       sub     $rnds_,$rounds
+       mov     $key_,$key                      # restore $key
+       shr     \$4,$rounds                     # restore original value
+
+.Lxts_dec_short:
+       # at the point @tweak[0..5] are populated with tweak values
+       mov     $rounds,$rnds_                  # backup $rounds
+       pxor    $rndkey0,@tweak[0]
+       pxor    $rndkey0,@tweak[1]
+       add     \$16*6,$len                     # restore real remaining $len
+       jz      .Lxts_dec_done                  # done if ($len==0)
+
+       pxor    $rndkey0,@tweak[2]
+       cmp     \$0x20,$len
+       jb      .Lxts_dec_one                   # $len is 1*16
+       pxor    $rndkey0,@tweak[3]
+       je      .Lxts_dec_two                   # $len is 2*16
+
+       pxor    $rndkey0,@tweak[4]
+       cmp     \$0x40,$len
+       jb      .Lxts_dec_three                 # $len is 3*16
+       je      .Lxts_dec_four                  # $len is 4*16
+
+       movdqu  ($inp),$inout0                  # $len is 5*16
+       movdqu  16*1($inp),$inout1
+       movdqu  16*2($inp),$inout2
+       pxor    @tweak[0],$inout0
+       movdqu  16*3($inp),$inout3
+       pxor    @tweak[1],$inout1
+       movdqu  16*4($inp),$inout4
+       lea     16*5($inp),$inp                 # $inp+=5*16
+       pxor    @tweak[2],$inout2
+       pxor    @tweak[3],$inout3
+       pxor    @tweak[4],$inout4
+
+       call    _aesni_decrypt6
+
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       movdqu  $inout0,($out)                  # store 5 output blocks
+       xorps   @tweak[3],$inout3
+       movdqu  $inout1,16*1($out)
+       xorps   @tweak[4],$inout4
+       movdqu  $inout2,16*2($out)
+        pxor           $twtmp,$twtmp
+       movdqu  $inout3,16*3($out)
+        pcmpgtd        @tweak[5],$twtmp
+       movdqu  $inout4,16*4($out)
+       lea     16*5($out),$out                 # $out+=5*16
+        pshufd         \$0x13,$twtmp,@tweak[1] # $twres
+       and     \$15,$len_
+       jz      .Lxts_dec_ret
+
+       movdqa  @tweak[5],@tweak[0]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+       pand    $twmask,@tweak[1]               # isolate carry and residue
+       pxor    @tweak[5],@tweak[1]
+       jmp     .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+       movups  ($inp),$inout0
+       lea     16*1($inp),$inp                 # $inp+=1*16
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[1],@tweak[0]
+       movups  $inout0,($out)                  # store one output block
+       movdqa  @tweak[2],@tweak[1]
+       lea     16*1($out),$out                 # $out+=1*16
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+       movups  ($inp),$inout0
+       movups  16($inp),$inout1
+       lea     32($inp),$inp                   # $inp+=2*16
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+
+       call    _aesni_decrypt2
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[2],@tweak[0]
+       xorps   @tweak[1],$inout1
+       movdqa  @tweak[3],@tweak[1]
+       movups  $inout0,($out)                  # store 2 output blocks
+       movups  $inout1,16*1($out)
+       lea     16*2($out),$out                 # $out+=2*16
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+       movups  ($inp),$inout0
+       movups  16*1($inp),$inout1
+       movups  16*2($inp),$inout2
+       lea     16*3($inp),$inp                 # $inp+=3*16
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+
+       call    _aesni_decrypt3
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[3],@tweak[0]
+       xorps   @tweak[1],$inout1
+       movdqa  @tweak[4],@tweak[1]
+       xorps   @tweak[2],$inout2
+       movups  $inout0,($out)                  # store 3 output blocks
+       movups  $inout1,16*1($out)
+       movups  $inout2,16*2($out)
+       lea     16*3($out),$out                 # $out+=3*16
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+       movups  ($inp),$inout0
+       movups  16*1($inp),$inout1
+       movups  16*2($inp),$inout2
+       xorps   @tweak[0],$inout0
+       movups  16*3($inp),$inout3
+       lea     16*4($inp),$inp                 # $inp+=4*16
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       xorps   @tweak[3],$inout3
+
+       call    _aesni_decrypt4
+
+       pxor    @tweak[0],$inout0
+       movdqa  @tweak[4],@tweak[0]
+       pxor    @tweak[1],$inout1
+       movdqa  @tweak[5],@tweak[1]
+       pxor    @tweak[2],$inout2
+       movdqu  $inout0,($out)                  # store 4 output blocks
+       pxor    @tweak[3],$inout3
+       movdqu  $inout1,16*1($out)
+       movdqu  $inout2,16*2($out)
+       movdqu  $inout3,16*3($out)
+       lea     16*4($out),$out                 # $out+=4*16
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+       and     \$15,$len_                      # see if $len%16 is 0
+       jz      .Lxts_dec_ret
+.Lxts_dec_done2:
+       mov     $len_,$len
+       mov     $key_,$key                      # restore $key
+       mov     $rnds_,$rounds                  # restore $rounds
+
+       movups  ($inp),$inout0
+       xorps   @tweak[1],$inout0
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[1],$inout0
+       movups  $inout0,($out)
+
+.Lxts_dec_steal:
+       movzb   16($inp),%eax                   # borrow $rounds ...
+       movzb   ($out),%ecx                     # ... and $key
+       lea     1($inp),$inp
+       mov     %al,($out)
+       mov     %cl,16($out)
+       lea     1($out),$out
+       sub     \$1,$len
+       jnz     .Lxts_dec_steal
+
+       sub     $len_,$out                      # rewind $out
+       mov     $key_,$key                      # restore $key
+       mov     $rnds_,$rounds                  # restore $rounds
+
+       movups  ($out),$inout0
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movups  $inout0,($out)
+
+.Lxts_dec_ret:
+       xorps   %xmm0,%xmm0                     # clear register bank
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0x00(%rsp)                # clear stack
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,0x10(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,0x20(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,0x30(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,0x40(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,0x50(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,0x60(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+       movaps  -0xa8(%r11),%xmm6
+       movaps  %xmm0,-0xa8(%r11)               # clear stack
+       movaps  -0x98(%r11),%xmm7
+       movaps  %xmm0,-0x98(%r11)
+       movaps  -0x88(%r11),%xmm8
+       movaps  %xmm0,-0x88(%r11)
+       movaps  -0x78(%r11),%xmm9
+       movaps  %xmm0,-0x78(%r11)
+       movaps  -0x68(%r11),%xmm10
+       movaps  %xmm0,-0x68(%r11)
+       movaps  -0x58(%r11),%xmm11
+       movaps  %xmm0,-0x58(%r11)
+       movaps  -0x48(%r11),%xmm12
+       movaps  %xmm0,-0x48(%r11)
+       movaps  -0x38(%r11),%xmm13
+       movaps  %xmm0,-0x38(%r11)
+       movaps  -0x28(%r11),%xmm14
+       movaps  %xmm0,-0x28(%r11)
+       movaps  -0x18(%r11),%xmm15
+       movaps  %xmm0,-0x18(%r11)
+       movaps  %xmm0,0x00(%rsp)
+       movaps  %xmm0,0x10(%rsp)
+       movaps  %xmm0,0x20(%rsp)
+       movaps  %xmm0,0x30(%rsp)
+       movaps  %xmm0,0x40(%rsp)
+       movaps  %xmm0,0x50(%rsp)
+       movaps  %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+       mov     -8(%r11),%rbp
+.cfi_restore   %rbp
+       lea     (%r11),%rsp
+.cfi_def_cfa_register  %rsp
+.Lxts_dec_epilogue:
+       ret
+.cfi_endproc
+.size  aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+}
+\f
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+#      const AES_KEY *key, unsigned int start_block_num,
+#      unsigned char offset_i[16], const unsigned char L_[][16],
+#      unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15));
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
+my ($block_num,$offset_p)=("%r8","%r9");               # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
+my $seventh_arg = $win64 ? 56 : 8;
+my $blocks = $len;
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type  aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt:
+.cfi_startproc
+       lea     (%rsp),%rax
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+___
+$code.=<<___ if ($win64);
+       lea     -0xa0(%rsp),%rsp
+       movaps  %xmm6,0x00(%rsp)                # offload everything
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+       movaps  %xmm10,0x40(%rsp)
+       movaps  %xmm11,0x50(%rsp)
+       movaps  %xmm12,0x60(%rsp)
+       movaps  %xmm13,0x70(%rsp)
+       movaps  %xmm14,0x80(%rsp)
+       movaps  %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+       mov     $seventh_arg(%rax),$L_p         # 7th argument
+       mov     $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+       mov     240($key),$rnds_
+       mov     $key,$key_
+       shl     \$4,$rnds_
+       $movkey ($key),$rndkey0l                # round[0]
+       $movkey 16($key,$rnds_),$rndkey1        # round[last]
+
+       movdqu  ($offset_p),@offset[5]          # load last offset_i
+       pxor    $rndkey1,$rndkey0l              # round[0] ^ round[last]
+       pxor    $rndkey1,@offset[5]             # offset_i ^ round[last]
+
+       mov     \$16+32,$rounds
+       lea     32($key_,$rnds_),$key
+       $movkey 16($key_),$rndkey1              # round[1]
+       sub     %r10,%rax                       # twisted $rounds
+       mov     %rax,%r10                       # backup twisted $rounds
+
+       movdqu  ($L_p),@offset[0]               # L_0 for all odd-numbered blocks
+       movdqu  ($checksum_p),$checksum         # load checksum
+
+       test    \$1,$block_num                  # is first block number odd?
+       jnz     .Locb_enc_odd
+
+       bsf     $block_num,$i1
+       add     \$1,$block_num
+       shl     \$4,$i1
+       movdqu  ($L_p,$i1),$inout5              # borrow
+       movdqu  ($inp),$inout0
+       lea     16($inp),$inp
+
+       call    __ocb_encrypt1
+
+       movdqa  $inout5,@offset[5]
+       movups  $inout0,($out)
+       lea     16($out),$out
+       sub     \$1,$blocks
+       jz      .Locb_enc_done
+
+.Locb_enc_odd:
+       lea     1($block_num),$i1               # even-numbered blocks
+       lea     3($block_num),$i3
+       lea     5($block_num),$i5
+       lea     6($block_num),$block_num
+       bsf     $i1,$i1                         # ntz(block)
+       bsf     $i3,$i3
+       bsf     $i5,$i5
+       shl     \$4,$i1                         # ntz(block) -> table offset
+       shl     \$4,$i3
+       shl     \$4,$i5
+
+       sub     \$6,$blocks
+       jc      .Locb_enc_short
+       jmp     .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+       movdqu  `16*0`($inp),$inout0            # load input
+       movdqu  `16*1`($inp),$inout1
+       movdqu  `16*2`($inp),$inout2
+       movdqu  `16*3`($inp),$inout3
+       movdqu  `16*4`($inp),$inout4
+       movdqu  `16*5`($inp),$inout5
+       lea     `16*6`($inp),$inp
+
+       call    __ocb_encrypt6
+
+       movups  $inout0,`16*0`($out)            # store output
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+       movups  $inout3,`16*3`($out)
+       movups  $inout4,`16*4`($out)
+       movups  $inout5,`16*5`($out)
+       lea     `16*6`($out),$out
+       sub     \$6,$blocks
+       jnc     .Locb_enc_grandloop
+
+.Locb_enc_short:
+       add     \$6,$blocks
+       jz      .Locb_enc_done
+
+       movdqu  `16*0`($inp),$inout0
+       cmp     \$2,$blocks
+       jb      .Locb_enc_one
+       movdqu  `16*1`($inp),$inout1
+       je      .Locb_enc_two
+
+       movdqu  `16*2`($inp),$inout2
+       cmp     \$4,$blocks
+       jb      .Locb_enc_three
+       movdqu  `16*3`($inp),$inout3
+       je      .Locb_enc_four
+
+       movdqu  `16*4`($inp),$inout4
+       pxor    $inout5,$inout5
+
+       call    __ocb_encrypt6
+
+       movdqa  @offset[4],@offset[5]
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+       movups  $inout3,`16*3`($out)
+       movups  $inout4,`16*4`($out)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+       movdqa  @offset[0],$inout5              # borrow
+
+       call    __ocb_encrypt1
+
+       movdqa  $inout5,@offset[5]
+       movups  $inout0,`16*0`($out)
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+       pxor    $inout2,$inout2
+       pxor    $inout3,$inout3
+
+       call    __ocb_encrypt4
+
+       movdqa  @offset[1],@offset[5]
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+       pxor    $inout3,$inout3
+
+       call    __ocb_encrypt4
+
+       movdqa  @offset[2],@offset[5]
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+       call    __ocb_encrypt4
+
+       movdqa  @offset[3],@offset[5]
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+       movups  $inout3,`16*3`($out)
+
+.Locb_enc_done:
+       pxor    $rndkey0,@offset[5]             # "remove" round[last]
+       movdqu  $checksum,($checksum_p)         # store checksum
+       movdqu  @offset[5],($offset_p)          # store last offset_i
+
+       xorps   %xmm0,%xmm0                     # clear register bank
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       lea     0x28(%rsp),%rax
+.cfi_def_cfa   %rax,8
+___
+$code.=<<___ if ($win64);
+       movaps  0x00(%rsp),%xmm6
+       movaps  %xmm0,0x00(%rsp)                # clear stack
+       movaps  0x10(%rsp),%xmm7
+       movaps  %xmm0,0x10(%rsp)
+       movaps  0x20(%rsp),%xmm8
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm9
+       movaps  %xmm0,0x30(%rsp)
+       movaps  0x40(%rsp),%xmm10
+       movaps  %xmm0,0x40(%rsp)
+       movaps  0x50(%rsp),%xmm11
+       movaps  %xmm0,0x50(%rsp)
+       movaps  0x60(%rsp),%xmm12
+       movaps  %xmm0,0x60(%rsp)
+       movaps  0x70(%rsp),%xmm13
+       movaps  %xmm0,0x70(%rsp)
+       movaps  0x80(%rsp),%xmm14
+       movaps  %xmm0,0x80(%rsp)
+       movaps  0x90(%rsp),%xmm15
+       movaps  %xmm0,0x90(%rsp)
+       lea     0xa0+0x28(%rsp),%rax
+.Locb_enc_pop:
+___
+$code.=<<___;
+       mov     -40(%rax),%r14
+.cfi_restore   %r14
+       mov     -32(%rax),%r13
+.cfi_restore   %r13
+       mov     -24(%rax),%r12
+.cfi_restore   %r12
+       mov     -16(%rax),%rbp
+.cfi_restore   %rbp
+       mov     -8(%rax),%rbx
+.cfi_restore   %rbx
+       lea     (%rax),%rsp
+.cfi_def_cfa_register  %rsp
+.Locb_enc_epilogue:
+       ret
+.cfi_endproc
+.size  aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type  __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6:
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]
+        movdqa         @offset[0],@offset[2]
+        movdqu         ($L_p,$i3),@offset[3]
+        movdqa         @offset[0],@offset[4]
+        pxor           @offset[5],@offset[0]
+        movdqu         ($L_p,$i5),@offset[5]
+        pxor           @offset[0],@offset[1]
+       pxor            $inout0,$checksum       # accumulate checksum
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]
+       pxor            $inout1,$checksum
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]
+       pxor            $inout2,$checksum
+       pxor            @offset[2],$inout2
+        pxor           @offset[3],@offset[4]
+       pxor            $inout3,$checksum
+       pxor            @offset[3],$inout3
+        pxor           @offset[4],@offset[5]
+       pxor            $inout4,$checksum
+       pxor            @offset[4],$inout4
+       pxor            $inout5,$checksum
+       pxor            @offset[5],$inout5
+       $movkey         32($key_),$rndkey0
+
+       lea             1($block_num),$i1       # even-numbered blocks
+       lea             3($block_num),$i3
+       lea             5($block_num),$i5
+       add             \$6,$block_num
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+       bsf             $i1,$i1                 # ntz(block)
+       bsf             $i3,$i3
+       bsf             $i5,$i5
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+        pxor           $rndkey0l,@offset[1]
+        pxor           $rndkey0l,@offset[2]
+       aesenc          $rndkey1,$inout4
+        pxor           $rndkey0l,@offset[3]
+        pxor           $rndkey0l,@offset[4]
+       aesenc          $rndkey1,$inout5
+       $movkey         48($key_),$rndkey1
+        pxor           $rndkey0l,@offset[5]
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         64($key_),$rndkey0
+       shl             \$4,$i1                 # ntz(block) -> table offset
+       shl             \$4,$i3
+       jmp             .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_enc_loop6
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       $movkey         16($key_),$rndkey1
+       shl             \$4,$i5
+
+       aesenclast      @offset[0],$inout0
+       movdqu          ($L_p),@offset[0]       # L_0 for all odd-numbered blocks
+       mov             %r10,%rax               # restore twisted rounds
+       aesenclast      @offset[1],$inout1
+       aesenclast      @offset[2],$inout2
+       aesenclast      @offset[3],$inout3
+       aesenclast      @offset[4],$inout4
+       aesenclast      @offset[5],$inout5
+       ret
+.size  __ocb_encrypt6,.-__ocb_encrypt6
+
+.type  __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4:
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]
+        movdqa         @offset[0],@offset[2]
+        movdqu         ($L_p,$i3),@offset[3]
+        pxor           @offset[5],@offset[0]
+        pxor           @offset[0],@offset[1]
+       pxor            $inout0,$checksum       # accumulate checksum
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]
+       pxor            $inout1,$checksum
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]
+       pxor            $inout2,$checksum
+       pxor            @offset[2],$inout2
+       pxor            $inout3,$checksum
+       pxor            @offset[3],$inout3
+       $movkey         32($key_),$rndkey0
+
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+        pxor           $rndkey0l,@offset[1]
+        pxor           $rndkey0l,@offset[2]
+        pxor           $rndkey0l,@offset[3]
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       $movkey         48($key_),$rndkey1
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       $movkey         64($key_),$rndkey0
+       jmp             .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_enc_loop4
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       $movkey         16($key_),$rndkey1
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesenclast      @offset[0],$inout0
+       aesenclast      @offset[1],$inout1
+       aesenclast      @offset[2],$inout2
+       aesenclast      @offset[3],$inout3
+       ret
+.size  __ocb_encrypt4,.-__ocb_encrypt4
+
+.type  __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1:
+        pxor           @offset[5],$inout5      # offset_i
+        pxor           $rndkey0l,$inout5       # offset_i ^ round[0]
+       pxor            $inout0,$checksum       # accumulate checksum
+       pxor            $inout5,$inout0         # input ^ round[0] ^ offset_i
+       $movkey         32($key_),$rndkey0
+
+       aesenc          $rndkey1,$inout0
+       $movkey         48($key_),$rndkey1
+       pxor            $rndkey0l,$inout5       # offset_i ^ round[last]
+
+       aesenc          $rndkey0,$inout0
+       $movkey         64($key_),$rndkey0
+       jmp             .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+       aesenc          $rndkey1,$inout0
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesenc          $rndkey0,$inout0
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_enc_loop1
+
+       aesenc          $rndkey1,$inout0
+       $movkey         16($key_),$rndkey1      # redundant in tail
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesenclast      $inout5,$inout0
+       ret
+.size  __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type  aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt:
+.cfi_startproc
+       lea     (%rsp),%rax
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+___
+$code.=<<___ if ($win64);
+       lea     -0xa0(%rsp),%rsp
+       movaps  %xmm6,0x00(%rsp)                # offload everything
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+       movaps  %xmm10,0x40(%rsp)
+       movaps  %xmm11,0x50(%rsp)
+       movaps  %xmm12,0x60(%rsp)
+       movaps  %xmm13,0x70(%rsp)
+       movaps  %xmm14,0x80(%rsp)
+       movaps  %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+       mov     $seventh_arg(%rax),$L_p         # 7th argument
+       mov     $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+       mov     240($key),$rnds_
+       mov     $key,$key_
+       shl     \$4,$rnds_
+       $movkey ($key),$rndkey0l                # round[0]
+       $movkey 16($key,$rnds_),$rndkey1        # round[last]
+
+       movdqu  ($offset_p),@offset[5]          # load last offset_i
+       pxor    $rndkey1,$rndkey0l              # round[0] ^ round[last]
+       pxor    $rndkey1,@offset[5]             # offset_i ^ round[last]
+
+       mov     \$16+32,$rounds
+       lea     32($key_,$rnds_),$key
+       $movkey 16($key_),$rndkey1              # round[1]
+       sub     %r10,%rax                       # twisted $rounds
+       mov     %rax,%r10                       # backup twisted $rounds
+
+       movdqu  ($L_p),@offset[0]               # L_0 for all odd-numbered blocks
+       movdqu  ($checksum_p),$checksum         # load checksum
+
+       test    \$1,$block_num                  # is first block number odd?
+       jnz     .Locb_dec_odd
+
+       bsf     $block_num,$i1
+       add     \$1,$block_num
+       shl     \$4,$i1
+       movdqu  ($L_p,$i1),$inout5              # borrow
+       movdqu  ($inp),$inout0
+       lea     16($inp),$inp
+
+       call    __ocb_decrypt1
+
+       movdqa  $inout5,@offset[5]
+       movups  $inout0,($out)
+       xorps   $inout0,$checksum               # accumulate checksum
+       lea     16($out),$out
+       sub     \$1,$blocks
+       jz      .Locb_dec_done
+
+.Locb_dec_odd:
+       lea     1($block_num),$i1               # even-numbered blocks
+       lea     3($block_num),$i3
+       lea     5($block_num),$i5
+       lea     6($block_num),$block_num
+       bsf     $i1,$i1                         # ntz(block)
+       bsf     $i3,$i3
+       bsf     $i5,$i5
+       shl     \$4,$i1                         # ntz(block) -> table offset
+       shl     \$4,$i3
+       shl     \$4,$i5
+
+       sub     \$6,$blocks
+       jc      .Locb_dec_short
+       jmp     .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+       movdqu  `16*0`($inp),$inout0            # load input
+       movdqu  `16*1`($inp),$inout1
+       movdqu  `16*2`($inp),$inout2
+       movdqu  `16*3`($inp),$inout3
+       movdqu  `16*4`($inp),$inout4
+       movdqu  `16*5`($inp),$inout5
+       lea     `16*6`($inp),$inp
+
+       call    __ocb_decrypt6
+
+       movups  $inout0,`16*0`($out)            # store output
+       pxor    $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       pxor    $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       pxor    $inout2,$checksum
+       movups  $inout3,`16*3`($out)
+       pxor    $inout3,$checksum
+       movups  $inout4,`16*4`($out)
+       pxor    $inout4,$checksum
+       movups  $inout5,`16*5`($out)
+       pxor    $inout5,$checksum
+       lea     `16*6`($out),$out
+       sub     \$6,$blocks
+       jnc     .Locb_dec_grandloop
+
+.Locb_dec_short:
+       add     \$6,$blocks
+       jz      .Locb_dec_done
+
+       movdqu  `16*0`($inp),$inout0
+       cmp     \$2,$blocks
+       jb      .Locb_dec_one
+       movdqu  `16*1`($inp),$inout1
+       je      .Locb_dec_two
+
+       movdqu  `16*2`($inp),$inout2
+       cmp     \$4,$blocks
+       jb      .Locb_dec_three
+       movdqu  `16*3`($inp),$inout3
+       je      .Locb_dec_four
+
+       movdqu  `16*4`($inp),$inout4
+       pxor    $inout5,$inout5
+
+       call    __ocb_decrypt6
+
+       movdqa  @offset[4],@offset[5]
+       movups  $inout0,`16*0`($out)            # store output
+       pxor    $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       pxor    $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       pxor    $inout2,$checksum
+       movups  $inout3,`16*3`($out)
+       pxor    $inout3,$checksum
+       movups  $inout4,`16*4`($out)
+       pxor    $inout4,$checksum
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+       movdqa  @offset[0],$inout5              # borrow
+
+       call    __ocb_decrypt1
+
+       movdqa  $inout5,@offset[5]
+       movups  $inout0,`16*0`($out)            # store output
+       xorps   $inout0,$checksum               # accumulate checksum
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+       pxor    $inout2,$inout2
+       pxor    $inout3,$inout3
+
+       call    __ocb_decrypt4
+
+       movdqa  @offset[1],@offset[5]
+       movups  $inout0,`16*0`($out)            # store output
+       xorps   $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       xorps   $inout1,$checksum
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+       pxor    $inout3,$inout3
+
+       call    __ocb_decrypt4
+
+       movdqa  @offset[2],@offset[5]
+       movups  $inout0,`16*0`($out)            # store output
+       xorps   $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       xorps   $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       xorps   $inout2,$checksum
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+       call    __ocb_decrypt4
+
+       movdqa  @offset[3],@offset[5]
+       movups  $inout0,`16*0`($out)            # store output
+       pxor    $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       pxor    $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       pxor    $inout2,$checksum
+       movups  $inout3,`16*3`($out)
+       pxor    $inout3,$checksum
+
+.Locb_dec_done:
+       pxor    $rndkey0,@offset[5]             # "remove" round[last]
+       movdqu  $checksum,($checksum_p)         # store checksum
+       movdqu  @offset[5],($offset_p)          # store last offset_i
+
+       xorps   %xmm0,%xmm0                     # clear register bank
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       lea     0x28(%rsp),%rax
+.cfi_def_cfa   %rax,8
+___
+$code.=<<___ if ($win64);
+       movaps  0x00(%rsp),%xmm6
+       movaps  %xmm0,0x00(%rsp)                # clear stack
+       movaps  0x10(%rsp),%xmm7
+       movaps  %xmm0,0x10(%rsp)
+       movaps  0x20(%rsp),%xmm8
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm9
+       movaps  %xmm0,0x30(%rsp)
+       movaps  0x40(%rsp),%xmm10
+       movaps  %xmm0,0x40(%rsp)
+       movaps  0x50(%rsp),%xmm11
+       movaps  %xmm0,0x50(%rsp)
+       movaps  0x60(%rsp),%xmm12
+       movaps  %xmm0,0x60(%rsp)
+       movaps  0x70(%rsp),%xmm13
+       movaps  %xmm0,0x70(%rsp)
+       movaps  0x80(%rsp),%xmm14
+       movaps  %xmm0,0x80(%rsp)
+       movaps  0x90(%rsp),%xmm15
+       movaps  %xmm0,0x90(%rsp)
+       lea     0xa0+0x28(%rsp),%rax
+.Locb_dec_pop:
+___
+$code.=<<___;
+       mov     -40(%rax),%r14
+.cfi_restore   %r14
+       mov     -32(%rax),%r13
+.cfi_restore   %r13
+       mov     -24(%rax),%r12
+.cfi_restore   %r12
+       mov     -16(%rax),%rbp
+.cfi_restore   %rbp
+       mov     -8(%rax),%rbx
+.cfi_restore   %rbx
+       lea     (%rax),%rsp
+.cfi_def_cfa_register  %rsp
+.Locb_dec_epilogue:
+       ret
+.cfi_endproc
+.size  aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type  __ocb_decrypt6,\@abi-omnipotent
+.align 32
+__ocb_decrypt6:
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]
+        movdqa         @offset[0],@offset[2]
+        movdqu         ($L_p,$i3),@offset[3]
+        movdqa         @offset[0],@offset[4]
+        pxor           @offset[5],@offset[0]
+        movdqu         ($L_p,$i5),@offset[5]
+        pxor           @offset[0],@offset[1]
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]
+       pxor            @offset[2],$inout2
+        pxor           @offset[3],@offset[4]
+       pxor            @offset[3],$inout3
+        pxor           @offset[4],@offset[5]
+       pxor            @offset[4],$inout4
+       pxor            @offset[5],$inout5
+       $movkey         32($key_),$rndkey0
+
+       lea             1($block_num),$i1       # even-numbered blocks
+       lea             3($block_num),$i3
+       lea             5($block_num),$i5
+       add             \$6,$block_num
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+       bsf             $i1,$i1                 # ntz(block)
+       bsf             $i3,$i3
+       bsf             $i5,$i5
+
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+        pxor           $rndkey0l,@offset[1]
+        pxor           $rndkey0l,@offset[2]
+       aesdec          $rndkey1,$inout4
+        pxor           $rndkey0l,@offset[3]
+        pxor           $rndkey0l,@offset[4]
+       aesdec          $rndkey1,$inout5
+       $movkey         48($key_),$rndkey1
+        pxor           $rndkey0l,@offset[5]
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       aesdec          $rndkey0,$inout4
+       aesdec          $rndkey0,$inout5
+       $movkey         64($key_),$rndkey0
+       shl             \$4,$i1                 # ntz(block) -> table offset
+       shl             \$4,$i3
+       jmp             .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       aesdec          $rndkey0,$inout4
+       aesdec          $rndkey0,$inout5
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_dec_loop6
+
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+       $movkey         16($key_),$rndkey1
+       shl             \$4,$i5
+
+       aesdeclast      @offset[0],$inout0
+       movdqu          ($L_p),@offset[0]       # L_0 for all odd-numbered blocks
+       mov             %r10,%rax               # restore twisted rounds
+       aesdeclast      @offset[1],$inout1
+       aesdeclast      @offset[2],$inout2
+       aesdeclast      @offset[3],$inout3
+       aesdeclast      @offset[4],$inout4
+       aesdeclast      @offset[5],$inout5
+       ret
+.size  __ocb_decrypt6,.-__ocb_decrypt6
+
+.type  __ocb_decrypt4,\@abi-omnipotent
+.align 32
+__ocb_decrypt4:
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]
+        movdqa         @offset[0],@offset[2]
+        movdqu         ($L_p,$i3),@offset[3]
+        pxor           @offset[5],@offset[0]
+        pxor           @offset[0],@offset[1]
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]
+       pxor            @offset[2],$inout2
+       pxor            @offset[3],$inout3
+       $movkey         32($key_),$rndkey0
+
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+        pxor           $rndkey0l,@offset[1]
+        pxor           $rndkey0l,@offset[2]
+        pxor           $rndkey0l,@offset[3]
+
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       $movkey         48($key_),$rndkey1
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       $movkey         64($key_),$rndkey0
+       jmp             .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_dec_loop4
+
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       $movkey         16($key_),$rndkey1
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesdeclast      @offset[0],$inout0
+       aesdeclast      @offset[1],$inout1
+       aesdeclast      @offset[2],$inout2
+       aesdeclast      @offset[3],$inout3
+       ret
+.size  __ocb_decrypt4,.-__ocb_decrypt4
+
+.type  __ocb_decrypt1,\@abi-omnipotent
+.align 32
+__ocb_decrypt1:
+        pxor           @offset[5],$inout5      # offset_i
+        pxor           $rndkey0l,$inout5       # offset_i ^ round[0]
+       pxor            $inout5,$inout0         # input ^ round[0] ^ offset_i
+       $movkey         32($key_),$rndkey0
+
+       aesdec          $rndkey1,$inout0
+       $movkey         48($key_),$rndkey1
+       pxor            $rndkey0l,$inout5       # offset_i ^ round[last]
+
+       aesdec          $rndkey0,$inout0
+       $movkey         64($key_),$rndkey0
+       jmp             .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+       aesdec          $rndkey1,$inout0
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+
+       aesdec          $rndkey0,$inout0
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_dec_loop1
+
+       aesdec          $rndkey1,$inout0
+       $movkey         16($key_),$rndkey1      # redundant in tail
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesdeclast      $inout5,$inout0
+       ret
+.size  __ocb_decrypt1,.-__ocb_decrypt1
+___
+} }}
+\f
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+#                          size_t length, const AES_KEY *key,
+#                          unsigned char *ivp,const int enc);
+{
+my $frame_size = 0x10 + ($win64?0xa0:0);       # used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type  ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+.cfi_startproc
+       test    $len,$len               # check length
+       jz      .Lcbc_ret
+
+       mov     240($key),$rnds_        # key->rounds
+       mov     $key,$key_              # backup $key
+       test    %r9d,%r9d               # 6th argument
+       jz      .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+       movups  ($ivp),$inout0          # load iv as initial state
+       mov     $rnds_,$rounds
+       cmp     \$16,$len
+       jb      .Lcbc_enc_tail
+       sub     \$16,$len
+       jmp     .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+       movups  ($inp),$inout1          # load input
+       lea     16($inp),$inp
+       #xorps  $inout1,$inout0
+___
+       &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+       mov     $rnds_,$rounds          # restore $rounds
+       mov     $key_,$key              # restore $key
+       movups  $inout0,0($out)         # store output
+       lea     16($out),$out
+       sub     \$16,$len
+       jnc     .Lcbc_enc_loop
+       add     \$16,$len
+       jnz     .Lcbc_enc_tail
+        pxor   $rndkey0,$rndkey0       # clear register bank
+        pxor   $rndkey1,$rndkey1
+       movups  $inout0,($ivp)
+        pxor   $inout0,$inout0
+        pxor   $inout1,$inout1
+       jmp     .Lcbc_ret
+
+.Lcbc_enc_tail:
+       mov     $len,%rcx       # zaps $key
+       xchg    $inp,$out       # $inp is %rsi and $out is %rdi now
+       .long   0x9066A4F3      # rep movsb
+       mov     \$16,%ecx       # zero tail
+       sub     $len,%rcx
+       xor     %eax,%eax
+       .long   0x9066AAF3      # rep stosb
+       lea     -16(%rdi),%rdi  # rewind $out by 1 block
+       mov     $rnds_,$rounds  # restore $rounds
+       mov     %rdi,%rsi       # $inp and $out are the same
+       mov     $key_,$key      # restore $key
+       xor     $len,$len       # len=16
+       jmp     .Lcbc_enc_loop  # one more spin
+\f#--------------------------- CBC DECRYPT ------------------------------#
+.align 16
+.Lcbc_decrypt:
+       cmp     \$16,$len
+       jne     .Lcbc_decrypt_bulk
+
+       # handle single block without allocating stack frame,
+       # useful in ciphertext stealing mode
+       movdqu  ($inp),$inout0          # load input
+       movdqu  ($ivp),$inout1          # load iv
+       movdqa  $inout0,$inout2         # future iv
+___
+       &aesni_generate1("dec",$key,$rnds_);
+$code.=<<___;
+        pxor   $rndkey0,$rndkey0       # clear register bank
+        pxor   $rndkey1,$rndkey1
+       movdqu  $inout2,($ivp)          # store iv
+       xorps   $inout1,$inout0         # ^=iv
+        pxor   $inout1,$inout1
+       movups  $inout0,($out)          # store output
+        pxor   $inout0,$inout0
+       jmp     .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+       lea     (%rsp),%r11             # frame pointer
+.cfi_def_cfa_register  %r11
+       push    %rbp
+.cfi_push      %rbp
+       sub     \$$frame_size,%rsp
+       and     \$-16,%rsp      # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,0x10(%rsp)
+       movaps  %xmm7,0x20(%rsp)
+       movaps  %xmm8,0x30(%rsp)
+       movaps  %xmm9,0x40(%rsp)
+       movaps  %xmm10,0x50(%rsp)
+       movaps  %xmm11,0x60(%rsp)
+       movaps  %xmm12,0x70(%rsp)
+       movaps  %xmm13,0x80(%rsp)
+       movaps  %xmm14,0x90(%rsp)
+       movaps  %xmm15,0xa0(%rsp)
+.Lcbc_decrypt_body:
+___
+
+my $inp_=$key_="%rbp";                 # reassign $key_
+
+$code.=<<___;
+       mov     $key,$key_              # [re-]backup $key [after reassignment]
+       movups  ($ivp),$iv
+       mov     $rnds_,$rounds
+       cmp     \$0x50,$len
+       jbe     .Lcbc_dec_tail
+
+       $movkey ($key),$rndkey0
+       movdqu  0x00($inp),$inout0      # load input
+       movdqu  0x10($inp),$inout1
+       movdqa  $inout0,$in0
+       movdqu  0x20($inp),$inout2
+       movdqa  $inout1,$in1
+       movdqu  0x30($inp),$inout3
+       movdqa  $inout2,$in2
+       movdqu  0x40($inp),$inout4
+       movdqa  $inout3,$in3
+       movdqu  0x50($inp),$inout5
+       movdqa  $inout4,$in4
+       mov     OPENSSL_ia32cap_P+4(%rip),%r9d
+       cmp     \$0x70,$len
+       jbe     .Lcbc_dec_six_or_seven
+
+       and     \$`1<<26|1<<22`,%r9d    # isolate XSAVE+MOVBE
+       sub     \$0x50,$len             # $len is biased by -5*16
+       cmp     \$`1<<22`,%r9d          # check for MOVBE without XSAVE
+       je      .Lcbc_dec_loop6_enter   # [which denotes Atom Silvermont]
+       sub     \$0x20,$len             # $len is biased by -7*16
+       lea     0x70($key),$key         # size optimization
+       jmp     .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+       movups  $inout7,($out)
+       lea     0x10($out),$out
+.Lcbc_dec_loop8_enter:
+       movdqu          0x60($inp),$inout6
+       pxor            $rndkey0,$inout0
+       movdqu          0x70($inp),$inout7
+       pxor            $rndkey0,$inout1
+       $movkey         0x10-0x70($key),$rndkey1
+       pxor            $rndkey0,$inout2
+       mov             \$-1,$inp_
+       cmp             \$0x70,$len     # is there at least 0x60 bytes ahead?
+       pxor            $rndkey0,$inout3
+       pxor            $rndkey0,$inout4
+       pxor            $rndkey0,$inout5
+       pxor            $rndkey0,$inout6
+
+       aesdec          $rndkey1,$inout0
+       pxor            $rndkey0,$inout7
+       $movkey         0x20-0x70($key),$rndkey0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+       aesdec          $rndkey1,$inout6
+       adc             \$0,$inp_
+       and             \$128,$inp_
+       aesdec          $rndkey1,$inout7
+       add             $inp,$inp_
+       $movkey         0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___   if ($i==7);
+       cmp             \$11,$rounds
+___
+$code.=<<___;
+       aesdec          $rndkeyx,$inout0
+       aesdec          $rndkeyx,$inout1
+       aesdec          $rndkeyx,$inout2
+       aesdec          $rndkeyx,$inout3
+       aesdec          $rndkeyx,$inout4
+       aesdec          $rndkeyx,$inout5
+       aesdec          $rndkeyx,$inout6
+       aesdec          $rndkeyx,$inout7
+       $movkey         `0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___   if ($i<6 || (!($i&1) && $i>7));
+       nop
+___
+$code.=<<___   if ($i==7);
+       jb              .Lcbc_dec_done
+___
+$code.=<<___   if ($i==9);
+       je              .Lcbc_dec_done
+___
+$code.=<<___   if ($i==11);
+       jmp             .Lcbc_dec_done
+___
+}
+$code.=<<___;
+.align 16
+.Lcbc_dec_done:
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       pxor            $rndkey0,$iv
+       pxor            $rndkey0,$in0
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       pxor            $rndkey0,$in1
+       pxor            $rndkey0,$in2
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+       pxor            $rndkey0,$in3
+       pxor            $rndkey0,$in4
+       aesdec          $rndkey1,$inout6
+       aesdec          $rndkey1,$inout7
+       movdqu          0x50($inp),$rndkey1
+
+       aesdeclast      $iv,$inout0
+       movdqu          0x60($inp),$iv          # borrow $iv
+       pxor            $rndkey0,$rndkey1
+       aesdeclast      $in0,$inout1
+       pxor            $rndkey0,$iv
+       movdqu          0x70($inp),$rndkey0     # next IV
+       aesdeclast      $in1,$inout2
+       lea             0x80($inp),$inp
+       movdqu          0x00($inp_),$in0
+       aesdeclast      $in2,$inout3
+       aesdeclast      $in3,$inout4
+       movdqu          0x10($inp_),$in1
+       movdqu          0x20($inp_),$in2
+       aesdeclast      $in4,$inout5
+       aesdeclast      $rndkey1,$inout6
+       movdqu          0x30($inp_),$in3
+       movdqu          0x40($inp_),$in4
+       aesdeclast      $iv,$inout7
+       movdqa          $rndkey0,$iv            # return $iv
+       movdqu          0x50($inp_),$rndkey1
+       $movkey         -0x70($key),$rndkey0
+
+       movups          $inout0,($out)          # store output
+       movdqa          $in0,$inout0
+       movups          $inout1,0x10($out)
+       movdqa          $in1,$inout1
+       movups          $inout2,0x20($out)
+       movdqa          $in2,$inout2
+       movups          $inout3,0x30($out)
+       movdqa          $in3,$inout3
+       movups          $inout4,0x40($out)
+       movdqa          $in4,$inout4
+       movups          $inout5,0x50($out)
+       movdqa          $rndkey1,$inout5
+       movups          $inout6,0x60($out)
+       lea             0x70($out),$out
+
+       sub     \$0x80,$len
+       ja      .Lcbc_dec_loop8
+
+       movaps  $inout7,$inout0
+       lea     -0x70($key),$key
+       add     \$0x70,$len
+       jle     .Lcbc_dec_clear_tail_collected
+       movups  $inout7,($out)
+       lea     0x10($out),$out
+       cmp     \$0x50,$len
+       jbe     .Lcbc_dec_tail
+
+       movaps  $in0,$inout0
+.Lcbc_dec_six_or_seven:
+       cmp     \$0x60,$len
+       ja      .Lcbc_dec_seven
+
+       movaps  $inout5,$inout6
+       call    _aesni_decrypt6
+       pxor    $iv,$inout0             # ^= IV
+       movaps  $inout6,$iv
+       pxor    $in0,$inout1
+       movdqu  $inout0,($out)
+       pxor    $in1,$inout2
+       movdqu  $inout1,0x10($out)
+        pxor   $inout1,$inout1         # clear register bank
+       pxor    $in2,$inout3
+       movdqu  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       pxor    $in3,$inout4
+       movdqu  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       pxor    $in4,$inout5
+       movdqu  $inout4,0x40($out)
+        pxor   $inout4,$inout4
+       lea     0x50($out),$out
+       movdqa  $inout5,$inout0
+        pxor   $inout5,$inout5
+       jmp     .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+       movups  0x60($inp),$inout6
+       xorps   $inout7,$inout7
+       call    _aesni_decrypt8
+       movups  0x50($inp),$inout7
+       pxor    $iv,$inout0             # ^= IV
+       movups  0x60($inp),$iv
+       pxor    $in0,$inout1
+       movdqu  $inout0,($out)
+       pxor    $in1,$inout2
+       movdqu  $inout1,0x10($out)
+        pxor   $inout1,$inout1         # clear register bank
+       pxor    $in2,$inout3
+       movdqu  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       pxor    $in3,$inout4
+       movdqu  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       pxor    $in4,$inout5
+       movdqu  $inout4,0x40($out)
+        pxor   $inout4,$inout4
+       pxor    $inout7,$inout6
+       movdqu  $inout5,0x50($out)
+        pxor   $inout5,$inout5
+       lea     0x60($out),$out
+       movdqa  $inout6,$inout0
+        pxor   $inout6,$inout6
+        pxor   $inout7,$inout7
+       jmp     .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_loop6:
+       movups  $inout5,($out)
+       lea     0x10($out),$out
+       movdqu  0x00($inp),$inout0      # load input
+       movdqu  0x10($inp),$inout1
+       movdqa  $inout0,$in0
+       movdqu  0x20($inp),$inout2
+       movdqa  $inout1,$in1
+       movdqu  0x30($inp),$inout3
+       movdqa  $inout2,$in2
+       movdqu  0x40($inp),$inout4
+       movdqa  $inout3,$in3
+       movdqu  0x50($inp),$inout5
+       movdqa  $inout4,$in4
+.Lcbc_dec_loop6_enter:
+       lea     0x60($inp),$inp
+       movdqa  $inout5,$inout6
+
+       call    _aesni_decrypt6
+
+       pxor    $iv,$inout0             # ^= IV
+       movdqa  $inout6,$iv
+       pxor    $in0,$inout1
+       movdqu  $inout0,($out)
+       pxor    $in1,$inout2
+       movdqu  $inout1,0x10($out)
+       pxor    $in2,$inout3
+       movdqu  $inout2,0x20($out)
+       pxor    $in3,$inout4
+       mov     $key_,$key
+       movdqu  $inout3,0x30($out)
+       pxor    $in4,$inout5
+       mov     $rnds_,$rounds
+       movdqu  $inout4,0x40($out)
+       lea     0x50($out),$out
+       sub     \$0x60,$len
+       ja      .Lcbc_dec_loop6
+
+       movdqa  $inout5,$inout0
+       add     \$0x50,$len
+       jle     .Lcbc_dec_clear_tail_collected
+       movups  $inout5,($out)
+       lea     0x10($out),$out
+
+.Lcbc_dec_tail:
+       movups  ($inp),$inout0
+       sub     \$0x10,$len
+       jbe     .Lcbc_dec_one           # $len is 1*16 or less
+
+       movups  0x10($inp),$inout1
+       movaps  $inout0,$in0
+       sub     \$0x10,$len
+       jbe     .Lcbc_dec_two           # $len is 2*16 or less
+
+       movups  0x20($inp),$inout2
+       movaps  $inout1,$in1
+       sub     \$0x10,$len
+       jbe     .Lcbc_dec_three         # $len is 3*16 or less
+
+       movups  0x30($inp),$inout3
+       movaps  $inout2,$in2
+       sub     \$0x10,$len
+       jbe     .Lcbc_dec_four          # $len is 4*16 or less
+
+       movups  0x40($inp),$inout4      # $len is 5*16 or less
+       movaps  $inout3,$in3
+       movaps  $inout4,$in4
+       xorps   $inout5,$inout5
+       call    _aesni_decrypt6
+       pxor    $iv,$inout0
+       movaps  $in4,$iv
+       pxor    $in0,$inout1
+       movdqu  $inout0,($out)
+       pxor    $in1,$inout2
+       movdqu  $inout1,0x10($out)
+        pxor   $inout1,$inout1         # clear register bank
+       pxor    $in2,$inout3
+       movdqu  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       pxor    $in3,$inout4
+       movdqu  $inout3,0x30($out)
+        pxor   $inout3,$inout3
+       lea     0x40($out),$out
+       movdqa  $inout4,$inout0
+        pxor   $inout4,$inout4
+        pxor   $inout5,$inout5
+       sub     \$0x10,$len
+       jmp     .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_one:
+       movaps  $inout0,$in0
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   $iv,$inout0
+       movaps  $in0,$iv
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+       movaps  $inout1,$in1
+       call    _aesni_decrypt2
+       pxor    $iv,$inout0
+       movaps  $in1,$iv
+       pxor    $in0,$inout1
+       movdqu  $inout0,($out)
+       movdqa  $inout1,$inout0
+        pxor   $inout1,$inout1         # clear register bank
+       lea     0x10($out),$out
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+       movaps  $inout2,$in2
+       call    _aesni_decrypt3
+       pxor    $iv,$inout0
+       movaps  $in2,$iv
+       pxor    $in0,$inout1
+       movdqu  $inout0,($out)
+       pxor    $in1,$inout2
+       movdqu  $inout1,0x10($out)
+        pxor   $inout1,$inout1         # clear register bank
+       movdqa  $inout2,$inout0
+        pxor   $inout2,$inout2
+       lea     0x20($out),$out
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+       movaps  $inout3,$in3
+       call    _aesni_decrypt4
+       pxor    $iv,$inout0
+       movaps  $in3,$iv
+       pxor    $in0,$inout1
+       movdqu  $inout0,($out)
+       pxor    $in1,$inout2
+       movdqu  $inout1,0x10($out)
+        pxor   $inout1,$inout1         # clear register bank
+       pxor    $in2,$inout3
+       movdqu  $inout2,0x20($out)
+        pxor   $inout2,$inout2
+       movdqa  $inout3,$inout0
+        pxor   $inout3,$inout3
+       lea     0x30($out),$out
+       jmp     .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_clear_tail_collected:
+       pxor    $inout1,$inout1         # clear register bank
+       pxor    $inout2,$inout2
+       pxor    $inout3,$inout3
+___
+$code.=<<___ if (!$win64);
+       pxor    $inout4,$inout4         # %xmm6..9
+       pxor    $inout5,$inout5
+       pxor    $inout6,$inout6
+       pxor    $inout7,$inout7
+___
+$code.=<<___;
+.Lcbc_dec_tail_collected:
+       movups  $iv,($ivp)
+       and     \$15,$len
+       jnz     .Lcbc_dec_tail_partial
+       movups  $inout0,($out)
+       pxor    $inout0,$inout0
+       jmp     .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+       movaps  $inout0,(%rsp)
+       pxor    $inout0,$inout0
+       mov     \$16,%rcx
+       mov     $out,%rdi
+       sub     $len,%rcx
+       lea     (%rsp),%rsi
+       .long   0x9066A4F3              # rep movsb
+       movdqa  $inout0,(%rsp)
+
+.Lcbc_dec_ret:
+       xorps   $rndkey0,$rndkey0       # %xmm0
+       pxor    $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+       movaps  0x10(%rsp),%xmm6
+       movaps  %xmm0,0x10(%rsp)        # clear stack
+       movaps  0x20(%rsp),%xmm7
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm8
+       movaps  %xmm0,0x30(%rsp)
+       movaps  0x40(%rsp),%xmm9
+       movaps  %xmm0,0x40(%rsp)
+       movaps  0x50(%rsp),%xmm10
+       movaps  %xmm0,0x50(%rsp)
+       movaps  0x60(%rsp),%xmm11
+       movaps  %xmm0,0x60(%rsp)
+       movaps  0x70(%rsp),%xmm12
+       movaps  %xmm0,0x70(%rsp)
+       movaps  0x80(%rsp),%xmm13
+       movaps  %xmm0,0x80(%rsp)
+       movaps  0x90(%rsp),%xmm14
+       movaps  %xmm0,0x90(%rsp)
+       movaps  0xa0(%rsp),%xmm15
+       movaps  %xmm0,0xa0(%rsp)
+___
+$code.=<<___;
+       mov     -8(%r11),%rbp
+.cfi_restore   %rbp
+       lea     (%r11),%rsp
+.cfi_def_cfa_register  %rsp
+.Lcbc_ret:
+       ret
+.cfi_endproc
+.size  ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+} \f
+# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
+#                              int bits, AES_KEY *key)
+#
+# input:       $inp    user-supplied key
+#              $bits   $inp length in bits
+#              $key    pointer to key schedule
+# output:      %eax    0 denoting success, -1 or -2 - failure (see C)
+#              *$key   key schedule
+#
+{ my ($inp,$bits,$key) = @_4args;
+  $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type  ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+.cfi_startproc
+       .byte   0x48,0x83,0xEC,0x08     # sub rsp,8
+.cfi_adjust_cfa_offset 8
+       call    __aesni_set_encrypt_key
+       shl     \$4,$bits               # rounds-1 after _aesni_set_encrypt_key
+       test    %eax,%eax
+       jnz     .Ldec_key_ret
+       lea     16($key,$bits),$inp     # points at the end of key schedule
+
+       $movkey ($key),%xmm0            # just swap
+       $movkey ($inp),%xmm1
+       $movkey %xmm0,($inp)
+       $movkey %xmm1,($key)
+       lea     16($key),$key
+       lea     -16($inp),$inp
+
+.Ldec_key_inverse:
+       $movkey ($key),%xmm0            # swap and inverse
+       $movkey ($inp),%xmm1
+       aesimc  %xmm0,%xmm0
+       aesimc  %xmm1,%xmm1
+       lea     16($key),$key
+       lea     -16($inp),$inp
+       $movkey %xmm0,16($inp)
+       $movkey %xmm1,-16($key)
+       cmp     $key,$inp
+       ja      .Ldec_key_inverse
+
+       $movkey ($key),%xmm0            # inverse middle
+       aesimc  %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       $movkey %xmm0,($inp)
+       pxor    %xmm0,%xmm0
+.Ldec_key_ret:
+       add     \$8,%rsp
+.cfi_adjust_cfa_offset -8
+       ret
+.cfi_endproc
+.LSEH_end_set_decrypt_key:
+.size  ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+\f
+# This is based on submission from Intel by
+#      Huang Ying
+#      Vinodh Gopal
+#      Kahraman Akdemir
+#
+# Aggressively optimized in respect to aeskeygenassist's critical path
+# and is contained in %xmm0-5 to meet Win64 ABI requirement.
+#
+# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
+#                              int bits, AES_KEY * const key);
+#
+# input:       $inp    user-supplied key
+#              $bits   $inp length in bits
+#              $key    pointer to key schedule
+# output:      %eax    0 denoting success, -1 or -2 - failure (see C)
+#              $bits   rounds-1 (used in aesni_set_decrypt_key)
+#              *$key   key schedule
+#              $key    pointer to key schedule (used in
+#                      aesni_set_decrypt_key)
+#
+# Subroutine is frame-less, which means that only volatile registers
+# are used. Note that it's declared "abi-omnipotent", which means that
+# amount of volatile registers is smaller on Windows.
+#
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type  ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+.cfi_startproc
+       .byte   0x48,0x83,0xEC,0x08     # sub rsp,8
+.cfi_adjust_cfa_offset 8
+       mov     \$-1,%rax
+       test    $inp,$inp
+       jz      .Lenc_key_ret
+       test    $key,$key
+       jz      .Lenc_key_ret
+
+       mov     \$`1<<28|1<<11`,%r10d   # AVX and XOP bits
+       movups  ($inp),%xmm0            # pull first 128 bits of *userKey
+       xorps   %xmm4,%xmm4             # low dword of xmm4 is assumed 0
+       and     OPENSSL_ia32cap_P+4(%rip),%r10d
+       lea     16($key),%rax           # %rax is used as modifiable copy of $key
+       cmp     \$256,$bits
+       je      .L14rounds
+       cmp     \$192,$bits
+       je      .L12rounds
+       cmp     \$128,$bits
+       jne     .Lbad_keybits
+
+.L10rounds:
+       mov     \$9,$bits                       # 10 rounds for 128-bit key
+       cmp     \$`1<<28`,%r10d                 # AVX, bit no XOP
+       je      .L10rounds_alt
+
+       $movkey %xmm0,($key)                    # round 0
+       aeskeygenassist \$0x1,%xmm0,%xmm1       # round 1
+       call            .Lkey_expansion_128_cold
+       aeskeygenassist \$0x2,%xmm0,%xmm1       # round 2
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x4,%xmm0,%xmm1       # round 3
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x8,%xmm0,%xmm1       # round 4
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x10,%xmm0,%xmm1      # round 5
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x20,%xmm0,%xmm1      # round 6
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x40,%xmm0,%xmm1      # round 7
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x80,%xmm0,%xmm1      # round 8
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x1b,%xmm0,%xmm1      # round 9
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x36,%xmm0,%xmm1      # round 10
+       call            .Lkey_expansion_128
+       $movkey %xmm0,(%rax)
+       mov     $bits,80(%rax)  # 240(%rdx)
+       xor     %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
+.L10rounds_alt:
+       movdqa  .Lkey_rotate(%rip),%xmm5
+       mov     \$8,%r10d
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,($key)
+       jmp     .Loop_key128
+
+.align 16
+.Loop_key128:
+       pshufb          %xmm5,%xmm0
+       aesenclast      %xmm4,%xmm0
+       pslld           \$1,%xmm4
+       lea             16(%rax),%rax
+
+       movdqa          %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm3,%xmm2
+
+       pxor            %xmm2,%xmm0
+       movdqu          %xmm0,-16(%rax)
+       movdqa          %xmm0,%xmm2
+
+       dec     %r10d
+       jnz     .Loop_key128
+
+       movdqa          .Lkey_rcon1b(%rip),%xmm4
+
+       pshufb          %xmm5,%xmm0
+       aesenclast      %xmm4,%xmm0
+       pslld           \$1,%xmm4
+
+       movdqa          %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm3,%xmm2
+
+       pxor            %xmm2,%xmm0
+       movdqu          %xmm0,(%rax)
+
+       movdqa          %xmm0,%xmm2
+       pshufb          %xmm5,%xmm0
+       aesenclast      %xmm4,%xmm0
+
+       movdqa          %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm2,%xmm3
+       pslldq          \$4,%xmm2
+       pxor            %xmm3,%xmm2
+
+       pxor            %xmm2,%xmm0
+       movdqu          %xmm0,16(%rax)
+
+       mov     $bits,96(%rax)  # 240($key)
+       xor     %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
+.L12rounds:
+       movq    16($inp),%xmm2                  # remaining 1/3 of *userKey
+       mov     \$11,$bits                      # 12 rounds for 192
+       cmp     \$`1<<28`,%r10d                 # AVX, but no XOP
+       je      .L12rounds_alt
+
+       $movkey %xmm0,($key)                    # round 0
+       aeskeygenassist \$0x1,%xmm2,%xmm1       # round 1,2
+       call            .Lkey_expansion_192a_cold
+       aeskeygenassist \$0x2,%xmm2,%xmm1       # round 2,3
+       call            .Lkey_expansion_192b
+       aeskeygenassist \$0x4,%xmm2,%xmm1       # round 4,5
+       call            .Lkey_expansion_192a
+       aeskeygenassist \$0x8,%xmm2,%xmm1       # round 5,6
+       call            .Lkey_expansion_192b
+       aeskeygenassist \$0x10,%xmm2,%xmm1      # round 7,8
+       call            .Lkey_expansion_192a
+       aeskeygenassist \$0x20,%xmm2,%xmm1      # round 8,9
+       call            .Lkey_expansion_192b
+       aeskeygenassist \$0x40,%xmm2,%xmm1      # round 10,11
+       call            .Lkey_expansion_192a
+       aeskeygenassist \$0x80,%xmm2,%xmm1      # round 11,12
+       call            .Lkey_expansion_192b
+       $movkey %xmm0,(%rax)
+       mov     $bits,48(%rax)  # 240(%rdx)
+       xor     %rax, %rax
+       jmp     .Lenc_key_ret
+
+.align 16
+.L12rounds_alt:
+       movdqa  .Lkey_rotate192(%rip),%xmm5
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       mov     \$8,%r10d
+       movdqu  %xmm0,($key)
+       jmp     .Loop_key192
+
+.align 16
+.Loop_key192:
+       movq            %xmm2,0(%rax)
+       movdqa          %xmm2,%xmm1
+       pshufb          %xmm5,%xmm2
+       aesenclast      %xmm4,%xmm2
+       pslld           \$1, %xmm4
+       lea             24(%rax),%rax
+
+       movdqa          %xmm0,%xmm3
+       pslldq          \$4,%xmm0
+       pxor            %xmm0,%xmm3
+       pslldq          \$4,%xmm0
+       pxor            %xmm0,%xmm3
+       pslldq          \$4,%xmm0
+       pxor            %xmm3,%xmm0
+
+       pshufd          \$0xff,%xmm0,%xmm3
+       pxor            %xmm1,%xmm3
+       pslldq          \$4,%xmm1
+       pxor            %xmm1,%xmm3
+
+       pxor            %xmm2,%xmm0
+       pxor            %xmm3,%xmm2
+       movdqu          %xmm0,-16(%rax)
+
+       dec     %r10d
+       jnz     .Loop_key192
+
+       mov     $bits,32(%rax)  # 240($key)
+       xor     %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
+.L14rounds:
+       movups  16($inp),%xmm2                  # remaining half of *userKey
+       mov     \$13,$bits                      # 14 rounds for 256
+       lea     16(%rax),%rax
+       cmp     \$`1<<28`,%r10d                 # AVX, but no XOP
+       je      .L14rounds_alt
+
+       $movkey %xmm0,($key)                    # round 0
+       $movkey %xmm2,16($key)                  # round 1
+       aeskeygenassist \$0x1,%xmm2,%xmm1       # round 2
+       call            .Lkey_expansion_256a_cold
+       aeskeygenassist \$0x1,%xmm0,%xmm1       # round 3
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x2,%xmm2,%xmm1       # round 4
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x2,%xmm0,%xmm1       # round 5
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x4,%xmm2,%xmm1       # round 6
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x4,%xmm0,%xmm1       # round 7
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x8,%xmm2,%xmm1       # round 8
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x8,%xmm0,%xmm1       # round 9
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x10,%xmm2,%xmm1      # round 10
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x10,%xmm0,%xmm1      # round 11
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x20,%xmm2,%xmm1      # round 12
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x20,%xmm0,%xmm1      # round 13
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x40,%xmm2,%xmm1      # round 14
+       call            .Lkey_expansion_256a
+       $movkey %xmm0,(%rax)
+       mov     $bits,16(%rax)  # 240(%rdx)
+       xor     %rax,%rax
+       jmp     .Lenc_key_ret
+
+.align 16
+.L14rounds_alt:
+       movdqa  .Lkey_rotate(%rip),%xmm5
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       mov     \$7,%r10d
+       movdqu  %xmm0,0($key)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,16($key)
+       jmp     .Loop_key256
+
+.align 16
+.Loop_key256:
+       pshufb          %xmm5,%xmm2
+       aesenclast      %xmm4,%xmm2
+
+       movdqa          %xmm0,%xmm3
+       pslldq          \$4,%xmm0
+       pxor            %xmm0,%xmm3
+       pslldq          \$4,%xmm0
+       pxor            %xmm0,%xmm3
+       pslldq          \$4,%xmm0
+       pxor            %xmm3,%xmm0
+       pslld           \$1,%xmm4
+
+       pxor            %xmm2,%xmm0
+       movdqu          %xmm0,(%rax)
+
+       dec     %r10d
+       jz      .Ldone_key256
+
+       pshufd          \$0xff,%xmm0,%xmm2
+       pxor            %xmm3,%xmm3
+       aesenclast      %xmm3,%xmm2
+
+       movdqa          %xmm1,%xmm3
+       pslldq          \$4,%xmm1
+       pxor            %xmm1,%xmm3
+       pslldq          \$4,%xmm1
+       pxor            %xmm1,%xmm3
+       pslldq          \$4,%xmm1
+       pxor            %xmm3,%xmm1
+
+       pxor            %xmm1,%xmm2
+       movdqu          %xmm2,16(%rax)
+       lea             32(%rax),%rax
+       movdqa          %xmm2,%xmm1
+
+       jmp     .Loop_key256
+
+.Ldone_key256:
+       mov     $bits,16(%rax)  # 240($key)
+       xor     %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+       mov     \$-2,%rax
+.Lenc_key_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       add     \$8,%rsp
+.cfi_adjust_cfa_offset -8
+       ret
+.cfi_endproc
+.LSEH_end_set_encrypt_key:
+\f
+.align 16
+.Lkey_expansion_128:
+       $movkey %xmm0,(%rax)
+       lea     16(%rax),%rax
+.Lkey_expansion_128_cold:
+       shufps  \$0b00010000,%xmm0,%xmm4
+       xorps   %xmm4, %xmm0
+       shufps  \$0b10001100,%xmm0,%xmm4
+       xorps   %xmm4, %xmm0
+       shufps  \$0b11111111,%xmm1,%xmm1        # critical path
+       xorps   %xmm1,%xmm0
+       ret
+
+.align 16
+.Lkey_expansion_192a:
+       $movkey %xmm0,(%rax)
+       lea     16(%rax),%rax
+.Lkey_expansion_192a_cold:
+       movaps  %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+       shufps  \$0b00010000,%xmm0,%xmm4
+       movdqa  %xmm2,%xmm3
+       xorps   %xmm4,%xmm0
+       shufps  \$0b10001100,%xmm0,%xmm4
+       pslldq  \$4,%xmm3
+       xorps   %xmm4,%xmm0
+       pshufd  \$0b01010101,%xmm1,%xmm1        # critical path
+       pxor    %xmm3,%xmm2
+       pxor    %xmm1,%xmm0
+       pshufd  \$0b11111111,%xmm0,%xmm3
+       pxor    %xmm3,%xmm2
+       ret
+
+.align 16
+.Lkey_expansion_192b:
+       movaps  %xmm0,%xmm3
+       shufps  \$0b01000100,%xmm0,%xmm5
+       $movkey %xmm5,(%rax)
+       shufps  \$0b01001110,%xmm2,%xmm3
+       $movkey %xmm3,16(%rax)
+       lea     32(%rax),%rax
+       jmp     .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+       $movkey %xmm2,(%rax)
+       lea     16(%rax),%rax
+.Lkey_expansion_256a_cold:
+       shufps  \$0b00010000,%xmm0,%xmm4
+       xorps   %xmm4,%xmm0
+       shufps  \$0b10001100,%xmm0,%xmm4
+       xorps   %xmm4,%xmm0
+       shufps  \$0b11111111,%xmm1,%xmm1        # critical path
+       xorps   %xmm1,%xmm0
+       ret
+
+.align 16
+.Lkey_expansion_256b:
+       $movkey %xmm0,(%rax)
+       lea     16(%rax),%rax
+
+       shufps  \$0b00010000,%xmm2,%xmm4
+       xorps   %xmm4,%xmm2
+       shufps  \$0b10001100,%xmm2,%xmm4
+       xorps   %xmm4,%xmm2
+       shufps  \$0b10101010,%xmm1,%xmm1        # critical path
+       xorps   %xmm1,%xmm2
+       ret
+.size  ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size  __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+\f
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+       .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+       .long   6,6,6,0
+.Lincrement64:
+       .long   1,0,0,0
+.Lxts_magic:
+       .long   0x87,0,1,0
+.Lincrement1:
+       .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+       .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+       .long   0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+       .long   1,1,1,1
+.Lkey_rcon1b:
+       .long   0x1b,0x1b,0x1b,0x1b
+
+.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type  ecb_ccm64_se_handler,\@abi-omnipotent
+.align 16
+ecb_ccm64_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       lea     0(%rax),%rsi            # %xmm save area
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$8,%ecx                # 4*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0x58(%rax),%rax         # adjust stack pointer
+
+       jmp     .Lcommon_seh_tail
+.size  ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
+
+.type  ctr_xts_se_handler,\@abi-omnipotent
+.align 16
+ctr_xts_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue lable
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       mov     208($context),%rax      # pull context->R11
+
+       lea     -0xa8(%rax),%rsi        # %xmm save area
+       lea     512($context),%rdi      # & context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     -8(%rax),%rbp           # restore saved %rbp
+       mov     %rbp,160($context)      # restore context->Rbp
+       jmp     .Lcommon_seh_tail
+.size  ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type  ocb_se_handler,\@abi-omnipotent
+.align 16
+ocb_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue lable
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       mov     8(%r11),%r10d           # HandlerData[2]
+       lea     (%rsi,%r10),%r10
+       cmp     %r10,%rbx               # context->Rip>=pop label
+       jae     .Locb_no_xmm
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     (%rax),%rsi             # %xmm save area
+       lea     512($context),%rdi      # & context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+
+       jmp     .Lcommon_seh_tail
+.size  ocb_se_handler,.-ocb_se_handler
+___
+$code.=<<___;
+.type  cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     152($context),%rax      # pull context->Rsp
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lcbc_decrypt_bulk(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<"prologue" label
+       jb      .Lcommon_seh_tail
+
+       mov     120($context),%rax      # pull context->Rax
+
+       lea     .Lcbc_decrypt_body(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<cbc_decrypt_body
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     .Lcbc_ret(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>="epilogue" label
+       jae     .Lcommon_seh_tail
+
+       lea     16(%rax),%rsi           # %xmm save area
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     208($context),%rax      # pull context->R11
+
+       mov     -8(%rax),%rbp           # restore saved %rbp
+       mov     %rbp,160($context)      # restore context->Rbp
+
+.Lcommon_seh_tail:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  cbc_se_handler,.-cbc_se_handler
+
+.section       .pdata
+.align 4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+       .rva    .LSEH_begin_aesni_ecb_encrypt
+       .rva    .LSEH_end_aesni_ecb_encrypt
+       .rva    .LSEH_info_ecb
+
+       .rva    .LSEH_begin_aesni_ccm64_encrypt_blocks
+       .rva    .LSEH_end_aesni_ccm64_encrypt_blocks
+       .rva    .LSEH_info_ccm64_enc
+
+       .rva    .LSEH_begin_aesni_ccm64_decrypt_blocks
+       .rva    .LSEH_end_aesni_ccm64_decrypt_blocks
+       .rva    .LSEH_info_ccm64_dec
+
+       .rva    .LSEH_begin_aesni_ctr32_encrypt_blocks
+       .rva    .LSEH_end_aesni_ctr32_encrypt_blocks
+       .rva    .LSEH_info_ctr32
+
+       .rva    .LSEH_begin_aesni_xts_encrypt
+       .rva    .LSEH_end_aesni_xts_encrypt
+       .rva    .LSEH_info_xts_enc
+
+       .rva    .LSEH_begin_aesni_xts_decrypt
+       .rva    .LSEH_end_aesni_xts_decrypt
+       .rva    .LSEH_info_xts_dec
+
+       .rva    .LSEH_begin_aesni_ocb_encrypt
+       .rva    .LSEH_end_aesni_ocb_encrypt
+       .rva    .LSEH_info_ocb_enc
+
+       .rva    .LSEH_begin_aesni_ocb_decrypt
+       .rva    .LSEH_end_aesni_ocb_decrypt
+       .rva    .LSEH_info_ocb_dec
+___
+$code.=<<___;
+       .rva    .LSEH_begin_${PREFIX}_cbc_encrypt
+       .rva    .LSEH_end_${PREFIX}_cbc_encrypt
+       .rva    .LSEH_info_cbc
+
+       .rva    ${PREFIX}_set_decrypt_key
+       .rva    .LSEH_end_set_decrypt_key
+       .rva    .LSEH_info_key
+
+       .rva    ${PREFIX}_set_encrypt_key
+       .rva    .LSEH_end_set_encrypt_key
+       .rva    .LSEH_info_key
+.section       .xdata
+.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+       .byte   9,0,0,0
+       .rva    ecb_ccm64_se_handler
+       .rva    .Lecb_enc_body,.Lecb_enc_ret            # HandlerData[]
+.LSEH_info_ccm64_enc:
+       .byte   9,0,0,0
+       .rva    ecb_ccm64_se_handler
+       .rva    .Lccm64_enc_body,.Lccm64_enc_ret        # HandlerData[]
+.LSEH_info_ccm64_dec:
+       .byte   9,0,0,0
+       .rva    ecb_ccm64_se_handler
+       .rva    .Lccm64_dec_body,.Lccm64_dec_ret        # HandlerData[]
+.LSEH_info_ctr32:
+       .byte   9,0,0,0
+       .rva    ctr_xts_se_handler
+       .rva    .Lctr32_body,.Lctr32_epilogue           # HandlerData[]
+.LSEH_info_xts_enc:
+       .byte   9,0,0,0
+       .rva    ctr_xts_se_handler
+       .rva    .Lxts_enc_body,.Lxts_enc_epilogue       # HandlerData[]
+.LSEH_info_xts_dec:
+       .byte   9,0,0,0
+       .rva    ctr_xts_se_handler
+       .rva    .Lxts_dec_body,.Lxts_dec_epilogue       # HandlerData[]
+.LSEH_info_ocb_enc:
+       .byte   9,0,0,0
+       .rva    ocb_se_handler
+       .rva    .Locb_enc_body,.Locb_enc_epilogue       # HandlerData[]
+       .rva    .Locb_enc_pop
+       .long   0
+.LSEH_info_ocb_dec:
+       .byte   9,0,0,0
+       .rva    ocb_se_handler
+       .rva    .Locb_dec_body,.Locb_dec_epilogue       # HandlerData[]
+       .rva    .Locb_dec_pop
+       .long   0
+___
+$code.=<<___;
+.LSEH_info_cbc:
+       .byte   9,0,0,0
+       .rva    cbc_se_handler
+.LSEH_info_key:
+       .byte   0x01,0x04,0x01,0x00
+       .byte   0x04,0x02,0x00,0x00     # sub rsp,8
+___
+}
+
+sub rex {
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04                 if($dst>=8);
+    $rex|=0x01                 if($src>=8);
+    push @opcode,$rex|0x40     if($rex);
+}
+
+sub aesni {
+  my $line=shift;
+  my @opcode=(0x66);
+
+    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+       rex(\@opcode,$4,$3);
+       push @opcode,0x0f,0x3a,0xdf;
+       push @opcode,0xc0|($3&7)|(($4&7)<<3);   # ModR/M
+       my $c=$2;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       return ".byte\t".join(',',@opcode);
+    }
+    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+       my %opcodelet = (
+               "aesimc" => 0xdb,
+               "aesenc" => 0xdc,       "aesenclast" => 0xdd,
+               "aesdec" => 0xde,       "aesdeclast" => 0xdf
+       );
+       return undef if (!defined($opcodelet{$1}));
+       rex(\@opcode,$3,$2);
+       push @opcode,0x0f,0x38,$opcodelet{$1};
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);   # ModR/M
+       return ".byte\t".join(',',@opcode);
+    }
+    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+       my %opcodelet = (
+               "aesenc" => 0xdc,       "aesenclast" => 0xdd,
+               "aesdec" => 0xde,       "aesdeclast" => 0xdf
+       );
+       return undef if (!defined($opcodelet{$1}));
+       my $off = $2;
+       push @opcode,0x44 if ($3>=8);
+       push @opcode,0x0f,0x38,$opcodelet{$1};
+       push @opcode,0x44|(($3&7)<<3),0x24;     # ModR/M
+       push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+       return ".byte\t".join(',',@opcode);
+    }
+    return $line;
+}
+
+sub movbe {
+       ".byte  0x0f,0x38,0xf1,0x44,0x24,".shift;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;    # debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/aesni/sha1-x86_64.pl b/aesni/sha1-x86_64.pl

new file mode 100755 (executable)

index 0000000..5352afc
--- /dev/null
+++ b/aesni/sha1-x86_64.pl
@@ -0,0 +1,2132 @@
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# sha1_block procedure for x86_64.
+#
+# It was brought to my attention that on EM64T compiler-generated code
+# was far behind 32-bit assembler implementation. This is unlike on
+# Opteron where compiler-generated code was only 15% behind 32-bit
+# assembler, which originally made it hard to motivate the effort.
+# There was suggestion to mechanically translate 32-bit code, but I
+# dismissed it, reasoning that x86_64 offers enough register bank
+# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+# implementation:-) However! While 64-bit code does perform better
+# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
+# x86_64 does offer larger *addressable* bank, but out-of-order core
+# reaches for even more registers through dynamic aliasing, and EM64T
+# core must have managed to run-time optimize even 32-bit code just as
+# good as 64-bit one. Performance improvement is summarized in the
+# following table:
+#
+#              gcc 3.4         32-bit asm      cycles/byte
+# Opteron      +45%            +20%            6.8
+# Xeon P4      +65%            +0%             9.9
+# Core2                +60%            +10%            7.0
+
+# August 2009.
+#
+# The code was revised to minimize code size and to maximize
+# "distance" between instructions producing input to 'lea'
+# instruction and the 'lea' instruction itself, which is essential
+# for Intel Atom core.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
+# for background and implementation details. The only difference from
+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
+# to free temporary registers.
+
+# April 2011.
+#
+# Add AVX code path. See sha1-586.pl for further information.
+
+# May 2013.
+#
+# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
+# and loading pair of consecutive blocks to 256-bit %ymm registers)
+# did not provide impressive performance improvement till a crucial
+# hint regarding the number of Xupdate iterations to pre-compute in
+# advance was provided by Ilya Albrekht of Intel Corp.
+
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+#              x86_64          SSSE3           AVX[2]
+# P4           9.05            -
+# Opteron      6.26            -
+# Core2                6.55            6.05/+8%        -
+# Westmere     6.73            5.30/+27%       -
+# Sandy Bridge 7.70            6.10/+26%       4.99/+54%
+# Ivy Bridge   6.06            4.67/+30%       4.60/+32%
+# Haswell      5.45            4.15/+31%       3.57/+53%
+# Skylake      5.18            4.06/+28%       3.54/+46%
+# Bulldozer    9.11            5.95/+53%
+# Ryzen                4.75            3.80/+24%       1.93/+150%(**)
+# VIA Nano     9.32            7.15/+30%
+# Atom         10.3            9.17/+12%
+# Silvermont   13.1(*)         9.37/+40%
+# Knights L    13.2(*)         9.68/+36%       8.30/+59%
+# Goldmont     8.13            6.42/+27%       1.70/+380%(**)
+#
+# (*)  obviously suboptimal result, nothing was done about it,
+#      because SSSE3 code is compiled unconditionally;
+# (**) SHAEXT result
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+       $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
+       $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=1;     ### set to zero if compiling for 1.0.1
+$avx=1         if (!$shaext && $avx);
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$ctx="%rdi";   # 1st arg
+$inp="%rsi";   # 2nd arg
+$num="%rdx";   # 3rd arg
+
+# reassign arguments in order to produce more compact code
+$ctx="%r8";
+$inp="%r9";
+$num="%r10";
+
+$t0="%eax";
+$t1="%ebx";
+$t2="%ecx";
+@xi=("%edx","%ebp","%r14d");
+$A="%esi";
+$B="%edi";
+$C="%r11d";
+$D="%r12d";
+$E="%r13d";
+
+@V=($A,$B,$C,$D,$E);
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+       mov     `4*$i`($inp),$xi[0]
+       bswap   $xi[0]
+___
+$code.=<<___ if ($i<15);
+       mov     `4*$j`($inp),$xi[1]
+       mov     $d,$t0
+       mov     $xi[0],`4*$i`(%rsp)
+       mov     $a,$t2
+       bswap   $xi[1]
+       xor     $c,$t0
+       rol     \$5,$t2
+       and     $b,$t0
+       lea     0x5a827999($xi[0],$e),$e
+       add     $t2,$e
+       xor     $d,$t0
+       rol     \$30,$b
+       add     $t0,$e
+___
+$code.=<<___ if ($i>=15);
+       xor     `4*($j%16)`(%rsp),$xi[1]
+       mov     $d,$t0
+       mov     $xi[0],`4*($i%16)`(%rsp)
+       mov     $a,$t2
+       xor     `4*(($j+2)%16)`(%rsp),$xi[1]
+       xor     $c,$t0
+       rol     \$5,$t2
+       xor     `4*(($j+8)%16)`(%rsp),$xi[1]
+       and     $b,$t0
+       lea     0x5a827999($xi[0],$e),$e
+       rol     \$30,$b
+       xor     $d,$t0
+       add     $t2,$e
+       rol     \$1,$xi[1]
+       add     $t0,$e
+___
+push(@xi,shift(@xi));
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
+$code.=<<___ if ($i<79);
+       xor     `4*($j%16)`(%rsp),$xi[1]
+       mov     $b,$t0
+       `"mov   $xi[0],".4*($i%16)."(%rsp)"     if ($i<72)`
+       mov     $a,$t2
+       xor     `4*(($j+2)%16)`(%rsp),$xi[1]
+       xor     $d,$t0
+       rol     \$5,$t2
+       xor     `4*(($j+8)%16)`(%rsp),$xi[1]
+       lea     $K($xi[0],$e),$e
+       xor     $c,$t0
+       add     $t2,$e
+       rol     \$30,$b
+       add     $t0,$e
+       rol     \$1,$xi[1]
+___
+$code.=<<___ if ($i==79);
+       mov     $b,$t0
+       mov     $a,$t2
+       xor     $d,$t0
+       lea     $K($xi[0],$e),$e
+       rol     \$5,$t2
+       xor     $c,$t0
+       add     $t2,$e
+       rol     \$30,$b
+       add     $t0,$e
+___
+push(@xi,shift(@xi));
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+       xor     `4*($j%16)`(%rsp),$xi[1]
+       mov     $d,$t0
+       mov     $xi[0],`4*($i%16)`(%rsp)
+       mov     $d,$t1
+       xor     `4*(($j+2)%16)`(%rsp),$xi[1]
+       and     $c,$t0
+       mov     $a,$t2
+       xor     `4*(($j+8)%16)`(%rsp),$xi[1]
+       lea     0x8f1bbcdc($xi[0],$e),$e
+       xor     $c,$t1
+       rol     \$5,$t2
+       add     $t0,$e
+       rol     \$1,$xi[1]
+       and     $b,$t1
+       add     $t2,$e
+       rol     \$30,$b
+       add     $t1,$e
+___
+push(@xi,shift(@xi));
+}
+
+$code.=<<___;
+.text
+.extern        OPENSSL_ia32cap_P
+
+.globl sha1_block_data_order
+.type  sha1_block_data_order,\@function,3
+.align 16
+sha1_block_data_order:
+.cfi_startproc
+       mov     OPENSSL_ia32cap_P+0(%rip),%r9d
+       mov     OPENSSL_ia32cap_P+4(%rip),%r8d
+       mov     OPENSSL_ia32cap_P+8(%rip),%r10d
+       test    \$`1<<9`,%r8d           # check SSSE3 bit
+       jz      .Lialu
+___
+$code.=<<___ if ($shaext);
+       test    \$`1<<29`,%r10d         # check SHA bit
+       jnz     _shaext_shortcut
+___
+$code.=<<___ if ($avx>1);
+       and     \$`1<<3|1<<5|1<<8`,%r10d        # check AVX2+BMI1+BMI2
+       cmp     \$`1<<3|1<<5|1<<8`,%r10d
+       je      _avx2_shortcut
+___
+$code.=<<___ if ($avx);
+       and     \$`1<<28`,%r8d          # mask AVX bit
+       and     \$`1<<30`,%r9d          # mask "Intel CPU" bit
+       or      %r9d,%r8d
+       cmp     \$`1<<28|1<<30`,%r8d
+       je      _avx_shortcut
+___
+$code.=<<___;
+       jmp     _ssse3_shortcut
+
+.align 16
+.Lialu:
+       mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       mov     %rdi,$ctx       # reassigned argument
+       sub     \$`8+16*4`,%rsp
+       mov     %rsi,$inp       # reassigned argument
+       and     \$-64,%rsp
+       mov     %rdx,$num       # reassigned argument
+       mov     %rax,`16*4`(%rsp)
+.cfi_cfa_expression    %rsp+64,deref,+8
+.Lprologue:
+
+       mov     0($ctx),$A
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     16($ctx),$E
+       jmp     .Lloop
+
+.align 16
+.Lloop:
+___
+for($i=0;$i<20;$i++)   { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++)       { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       add     0($ctx),$A
+       add     4($ctx),$B
+       add     8($ctx),$C
+       add     12($ctx),$D
+       add     16($ctx),$E
+       mov     $A,0($ctx)
+       mov     $B,4($ctx)
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+
+       sub     \$1,$num
+       lea     `16*4`($inp),$inp
+       jnz     .Lloop
+
+       mov     `16*4`(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       mov     -40(%rsi),%r14
+.cfi_restore   %r14
+       mov     -32(%rsi),%r13
+.cfi_restore   %r13
+       mov     -24(%rsi),%r12
+.cfi_restore   %r12
+       mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue:
+       ret
+.cfi_endproc
+.size  sha1_block_data_order,.-sha1_block_data_order
+___
+if ($shaext) {{{
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+#
+my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
+my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
+my @MSG=map("%xmm$_",(4..7));
+
+$code.=<<___;
+.type  sha1_block_data_order_shaext,\@function,3
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+.cfi_startproc
+___
+$code.=<<___ if ($win64);
+       lea     `-8-4*16`(%rsp),%rsp
+       movaps  %xmm6,-8-4*16(%rax)
+       movaps  %xmm7,-8-3*16(%rax)
+       movaps  %xmm8,-8-2*16(%rax)
+       movaps  %xmm9,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+       movdqu  ($ctx),$ABCD
+       movd    16($ctx),$E
+       movdqa  K_XX_XX+0xa0(%rip),$BSWAP       # byte-n-word swap
+
+       movdqu  ($inp),@MSG[0]
+       pshufd  \$0b00011011,$ABCD,$ABCD        # flip word order
+       movdqu  0x10($inp),@MSG[1]
+       pshufd  \$0b00011011,$E,$E              # flip word order
+       movdqu  0x20($inp),@MSG[2]
+       pshufb  $BSWAP,@MSG[0]
+       movdqu  0x30($inp),@MSG[3]
+       pshufb  $BSWAP,@MSG[1]
+       pshufb  $BSWAP,@MSG[2]
+       movdqa  $E,$E_SAVE                      # offload $E
+       pshufb  $BSWAP,@MSG[3]
+       jmp     .Loop_shaext
+
+.align 16
+.Loop_shaext:
+       dec             $num
+       lea             0x40($inp),%r8          # next input block
+       paddd           @MSG[0],$E
+       cmovne          %r8,$inp
+       movdqa          $ABCD,$ABCD_SAVE        # offload $ABCD
+___
+for($i=0;$i<20-4;$i+=2) {
+$code.=<<___;
+       sha1msg1        @MSG[1],@MSG[0]
+       movdqa          $ABCD,$E_
+       sha1rnds4       \$`int($i/5)`,$E,$ABCD  # 0-3...
+       sha1nexte       @MSG[1],$E_
+       pxor            @MSG[2],@MSG[0]
+       sha1msg1        @MSG[2],@MSG[1]
+       sha1msg2        @MSG[3],@MSG[0]
+
+       movdqa          $ABCD,$E
+       sha1rnds4       \$`int(($i+1)/5)`,$E_,$ABCD
+       sha1nexte       @MSG[2],$E
+       pxor            @MSG[3],@MSG[1]
+       sha1msg2        @MSG[0],@MSG[1]
+___
+       push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+       movdqu          ($inp),@MSG[0]
+       movdqa          $ABCD,$E_
+       sha1rnds4       \$3,$E,$ABCD            # 64-67
+       sha1nexte       @MSG[1],$E_
+       movdqu          0x10($inp),@MSG[1]
+       pshufb          $BSWAP,@MSG[0]
+
+       movdqa          $ABCD,$E
+       sha1rnds4       \$3,$E_,$ABCD           # 68-71
+       sha1nexte       @MSG[2],$E
+       movdqu          0x20($inp),@MSG[2]
+       pshufb          $BSWAP,@MSG[1]
+
+       movdqa          $ABCD,$E_
+       sha1rnds4       \$3,$E,$ABCD            # 72-75
+       sha1nexte       @MSG[3],$E_
+       movdqu          0x30($inp),@MSG[3]
+       pshufb          $BSWAP,@MSG[2]
+
+       movdqa          $ABCD,$E
+       sha1rnds4       \$3,$E_,$ABCD           # 76-79
+       sha1nexte       $E_SAVE,$E
+       pshufb          $BSWAP,@MSG[3]
+
+       paddd           $ABCD_SAVE,$ABCD
+       movdqa          $E,$E_SAVE              # offload $E
+
+       jnz             .Loop_shaext
+
+       pshufd  \$0b00011011,$ABCD,$ABCD
+       pshufd  \$0b00011011,$E,$E
+       movdqu  $ABCD,($ctx)
+       movd    $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+       movaps  -8-4*16(%rax),%xmm6
+       movaps  -8-3*16(%rax),%xmm7
+       movaps  -8-2*16(%rax),%xmm8
+       movaps  -8-1*16(%rax),%xmm9
+       mov     %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+.cfi_endproc
+       ret
+.size  sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+___
+}}}
+{{{
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my $Kx="%xmm11";
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");   # size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $rx=0;
+my $K_XX_XX="%r14";
+my $fp="%r11";
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+{ my $sn;
+sub align32() {
+  ++$sn;
+$code.=<<___;
+       jmp     .Lalign32_$sn   # see "Decoded ICache" in manual
+.align 32
+.Lalign32_$sn:
+___
+}
+}
+
+$code.=<<___;
+.type  sha1_block_data_order_ssse3,\@function,3
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+.cfi_startproc
+       mov     %rsp,$fp        # frame pointer
+.cfi_def_cfa_register  $fp
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13            # redundant, done to share Win64 SE handler
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       lea     `-64-($win64?6*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,-40-6*16($fp)
+       movaps  %xmm7,-40-5*16($fp)
+       movaps  %xmm8,-40-4*16($fp)
+       movaps  %xmm9,-40-3*16($fp)
+       movaps  %xmm10,-40-2*16($fp)
+       movaps  %xmm11,-40-1*16($fp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+       and     \$-64,%rsp
+       mov     %rdi,$ctx       # reassigned argument
+       mov     %rsi,$inp       # reassigned argument
+       mov     %rdx,$num       # reassigned argument
+
+       shl     \$6,$num
+       add     $inp,$num
+       lea     K_XX_XX+64(%rip),$K_XX_XX
+
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+       mov     $C,@T[1]
+       xor     $D,@T[1]
+       and     @T[1],@T[0]
+
+       movdqa  64($K_XX_XX),@X[2]      # pbswap mask
+       movdqa  -64($K_XX_XX),@Tx[1]    # K_00_19
+       movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       movdqu  16($inp),@X[-3&7]
+       movdqu  32($inp),@X[-2&7]
+       movdqu  48($inp),@X[-1&7]
+       pshufb  @X[2],@X[-4&7]          # byte swap
+       pshufb  @X[2],@X[-3&7]
+       pshufb  @X[2],@X[-2&7]
+       add     \$64,$inp
+       paddd   @Tx[1],@X[-4&7]         # add K_00_19
+       pshufb  @X[2],@X[-1&7]
+       paddd   @Tx[1],@X[-3&7]
+       paddd   @Tx[1],@X[-2&7]
+       movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
+       psubd   @Tx[1],@X[-4&7]         # restore X[]
+       movdqa  @X[-3&7],16(%rsp)
+       psubd   @Tx[1],@X[-3&7]
+       movdqa  @X[-2&7],32(%rsp)
+       psubd   @Tx[1],@X[-2&7]
+       jmp     .Loop_ssse3
+___
+
+sub AUTOLOAD()         # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+sub Xupdate_ssse3_16_31()              # recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));           # ror
+       &pshufd (@X[0],@X[-4&7],0xee);  # was &movdqa   (@X[0],@X[-3&7]);
+        eval(shift(@insns));
+       &movdqa (@Tx[0],@X[-1&7]);
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &punpcklqdq(@X[0],@X[-3&7]);    # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+       &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+       &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &movdqa (@Tx[2],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+       &movdqa (@Tx[0],@X[0]);
+        eval(shift(@insns));
+
+       &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
+       &paddd  (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &psrld  (@Tx[0],31);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+       &movdqa (@Tx[1],@Tx[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &psrld  (@Tx[2],30);
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+       &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pslld  (@Tx[1],2);
+       &pxor   (@X[0],@Tx[2]);
+        eval(shift(@insns));
+         &movdqa       (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");  # K_XX_XX
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
+       &pshufd (@Tx[1],@X[-1&7],0xee)  if ($Xi==7);    # was &movdqa   (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
+
+        foreach (@insns) { eval; }     # remaining instructions [if any]
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 to 44 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns))            if ($Xi==8);
+       &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
+        eval(shift(@insns))            if ($Xi==8);
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns))            if (@insns[1] =~ /_ror/);
+        eval(shift(@insns))            if (@insns[0] =~ /_ror/);
+       &punpcklqdq(@Tx[0],@X[-1&7]);   # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+       if ($Xi%5) {
+         &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+       } else {                        # ... or load next one
+         &movdqa       (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
+       }
+        eval(shift(@insns));           # ror
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns))            if (@insns[0] =~ /_ror/);
+
+       &movdqa (@Tx[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+        eval(shift(@insns));           # body_20_39
+
+       &pslld  (@X[0],2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &psrld  (@Tx[0],30);
+        eval(shift(@insns))            if (@insns[0] =~ /_rol/);# rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+
+       &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
+        eval(shift(@insns));
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns))            if (@insns[1] =~ /_rol/);
+        eval(shift(@insns))            if (@insns[0] =~ /_rol/);
+         &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19);    # was &movdqa   (@Tx[1],@X[0])
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+
+        foreach (@insns) { eval; }     # remaining instructions
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+        foreach (@insns) { eval; }             # remaining instructions
+
+       &cmp    ($inp,$num);
+       &je     (".Ldone_ssse3");
+
+       unshift(@Tx,pop(@Tx));
+
+       &movdqa (@X[2],"64($K_XX_XX)");         # pbswap mask
+       &movdqa (@Tx[1],"-64($K_XX_XX)");       # K_00_19
+       &movdqu (@X[-4&7],"0($inp)");           # load input
+       &movdqu (@X[-3&7],"16($inp)");
+       &movdqu (@X[-2&7],"32($inp)");
+       &movdqu (@X[-1&7],"48($inp)");
+       &pshufb (@X[-4&7],@X[2]);               # byte swap
+       &add    ($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &pshufb (@X[($Xi-3)&7],@X[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &paddd  (@X[($Xi-4)&7],@Tx[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &psubd  (@X[($Xi-4)&7],@Tx[1]);
+
+       foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+       foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {    # ((c^d)&b)^d
+       # on start @T[0]=(c^d)&b
+       return &body_20_39() if ($rx==19); $rx++;
+       (
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&$_ror ($b,$j?7:2)',   # $b>>>2
+       '&xor   (@T[0],$d)',
+       '&mov   (@T[1],$a)',    # $b for next round
+
+       '&add   ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+       '&xor   ($b,$c)',       # $c^$d for next round
+
+       '&$_rol ($a,5)',
+       '&add   ($e,@T[0])',
+       '&and   (@T[1],$b)',    # ($b&($c^$d)) for next round
+
+       '&xor   ($b,$c)',       # restore $b
+       '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+}
+
+sub body_20_39 () {    # b^d^c
+       # on entry @T[0]=b^d
+       return &body_40_59() if ($rx==39); $rx++;
+       (
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&add   ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+       '&xor   (@T[0],$d)      if($j==19);'.
+       '&xor   (@T[0],$c)      if($j> 19)',    # ($b^$d^$c)
+       '&mov   (@T[1],$a)',    # $b for next round
+
+       '&$_rol ($a,5)',
+       '&add   ($e,@T[0])',
+       '&xor   (@T[1],$c)      if ($j< 79)',   # $b^$d for next round
+
+       '&$_ror ($b,7)',        # $b>>>2
+       '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+}
+
+sub body_40_59 () {    # ((b^c)&(c^d))^c
+       # on entry @T[0]=(b^c), (c^=d)
+       $rx++;
+       (
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&add   ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+       '&and   (@T[0],$c)      if ($j>=40)',   # (b^c)&(c^d)
+       '&xor   ($c,$d)         if ($j>=40)',   # restore $c
+
+       '&$_ror ($b,7)',        # $b>>>2
+       '&mov   (@T[1],$a)',    # $b for next round
+       '&xor   (@T[0],$c)',
+
+       '&$_rol ($a,5)',
+       '&add   ($e,@T[0])',
+       '&xor   (@T[1],$c)      if ($j==59);'.
+       '&xor   (@T[1],$b)      if ($j< 59)',   # b^c for next round
+
+       '&xor   ($b,$c)         if ($j< 59)',   # c^d for next round
+       '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+}
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_32_79(\&body_00_19);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xuplast_ssse3_80(\&body_20_39);        # can jump to "done"
+
+                               $saved_j=$j; @saved_V=@V;
+
+       &Xloop_ssse3(\&body_20_39);
+       &Xloop_ssse3(\&body_20_39);
+       &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $C,@T[1]
+       mov     $D,12($ctx)
+       xor     $D,@T[1]
+       mov     $E,16($ctx)
+       and     @T[1],@T[0]
+       jmp     .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+                               $j=$saved_j; @V=@saved_V;
+
+       &Xtail_ssse3(\&body_20_39);
+       &Xtail_ssse3(\&body_20_39);
+       &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+       movaps  -40-6*16($fp),%xmm6
+       movaps  -40-5*16($fp),%xmm7
+       movaps  -40-4*16($fp),%xmm8
+       movaps  -40-3*16($fp),%xmm9
+       movaps  -40-2*16($fp),%xmm10
+       movaps  -40-1*16($fp),%xmm11
+___
+$code.=<<___;
+       mov     -40($fp),%r14
+.cfi_restore   %r14
+       mov     -32($fp),%r13
+.cfi_restore   %r13
+       mov     -24($fp),%r12
+.cfi_restore   %r12
+       mov     -16($fp),%rbp
+.cfi_restore   %rbp
+       mov     -8($fp),%rbx
+.cfi_restore   %rbx
+       lea     ($fp),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_ssse3:
+       ret
+.cfi_endproc
+.size  sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+___
+
+if ($avx) {
+$Xi=4;                         # reset variables
+@X=map("%xmm$_",(4..7,0..3));
+@Tx=map("%xmm$_",(8..10));
+$j=0;
+$rx=0;
+
+my $done_avx_label=".Ldone_avx";
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type  sha1_block_data_order_avx,\@function,3
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+.cfi_startproc
+       mov     %rsp,$fp
+.cfi_def_cfa_register  $fp
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13            # redundant, done to share Win64 SE handler
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       lea     `-64-($win64?6*16:0)`(%rsp),%rsp
+       vzeroupper
+___
+$code.=<<___ if ($win64);
+       vmovaps %xmm6,-40-6*16($fp)
+       vmovaps %xmm7,-40-5*16($fp)
+       vmovaps %xmm8,-40-4*16($fp)
+       vmovaps %xmm9,-40-3*16($fp)
+       vmovaps %xmm10,-40-2*16($fp)
+       vmovaps %xmm11,-40-1*16($fp)
+.Lprologue_avx:
+___
+$code.=<<___;
+       and     \$-64,%rsp
+       mov     %rdi,$ctx       # reassigned argument
+       mov     %rsi,$inp       # reassigned argument
+       mov     %rdx,$num       # reassigned argument
+
+       shl     \$6,$num
+       add     $inp,$num
+       lea     K_XX_XX+64(%rip),$K_XX_XX
+
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+       mov     $C,@T[1]
+       xor     $D,@T[1]
+       and     @T[1],@T[0]
+
+       vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
+       vmovdqa -64($K_XX_XX),$Kx       # K_00_19
+       vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       vmovdqu 16($inp),@X[-3&7]
+       vmovdqu 32($inp),@X[-2&7]
+       vmovdqu 48($inp),@X[-1&7]
+       vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+       add     \$64,$inp
+       vpshufb @X[2],@X[-3&7],@X[-3&7]
+       vpshufb @X[2],@X[-2&7],@X[-2&7]
+       vpshufb @X[2],@X[-1&7],@X[-1&7]
+       vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
+       vpaddd  $Kx,@X[-3&7],@X[1]
+       vpaddd  $Kx,@X[-2&7],@X[2]
+       vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
+       vmovdqa @X[1],16(%rsp)
+       vmovdqa @X[2],32(%rsp)
+       jmp     .Loop_avx
+___
+
+sub Xupdate_avx_16_31()                # recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[0],@X[0],31);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslldq(@Tx[2],@X[0],12);              # "X[0]"<<96, extract one dword
+       &vpaddd (@X[0],@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[1],@Tx[2],30);
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslld (@Tx[2],@Tx[2],2);
+       &vpxor  (@X[0],@X[0],@Tx[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vmovdqa      ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")      if ($Xi%5==0);  # K_XX_XX
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+
+        foreach (@insns) { eval; }     # remaining instructions [if any]
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 to 44 instructions
+  my ($a,$b,$c,$d,$e);
+
+       &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
+        eval(shift(@insns));
+        eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
+         &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
+         &vmovdqa      ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")        if ($Xi%5==0);
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &vpsrld (@Tx[0],@X[0],30);
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpslld (@X[0],@X[0],2);
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+
+        foreach (@insns) { eval; }     # remaining instructions
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+         &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+        foreach (@insns) { eval; }             # remaining instructions
+
+       &cmp    ($inp,$num);
+       &je     ($done_avx_label);
+
+       &vmovdqa(@X[2],"64($K_XX_XX)");         # pbswap mask
+       &vmovdqa($Kx,"-64($K_XX_XX)");          # K_00_19
+       &vmovdqu(@X[-4&7],"0($inp)");           # load input
+       &vmovdqu(@X[-3&7],"16($inp)");
+       &vmovdqu(@X[-2&7],"32($inp)");
+       &vmovdqu(@X[-1&7],"48($inp)");
+       &vpshufb(@X[-4&7],@X[-4&7],@X[2]);      # byte swap
+       &add    ($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);      # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+       foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_32_79(\&body_00_19);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xuplast_avx_80(\&body_20_39);  # can jump to "done"
+
+                               $saved_j=$j; @saved_V=@V;
+
+       &Xloop_avx(\&body_20_39);
+       &Xloop_avx(\&body_20_39);
+       &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $C,@T[1]
+       mov     $D,12($ctx)
+       xor     $D,@T[1]
+       mov     $E,16($ctx)
+       and     @T[1],@T[0]
+       jmp     .Loop_avx
+
+.align 16
+$done_avx_label:
+___
+                               $j=$saved_j; @V=@saved_V;
+
+       &Xtail_avx(\&body_20_39);
+       &Xtail_avx(\&body_20_39);
+       &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+       vzeroupper
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+       movaps  -40-6*16($fp),%xmm6
+       movaps  -40-5*16($fp),%xmm7
+       movaps  -40-4*16($fp),%xmm8
+       movaps  -40-3*16($fp),%xmm9
+       movaps  -40-2*16($fp),%xmm10
+       movaps  -40-1*16($fp),%xmm11
+___
+$code.=<<___;
+       mov     -40($fp),%r14
+.cfi_restore   %r14
+       mov     -32($fp),%r13
+.cfi_restore   %r13
+       mov     -24($fp),%r12
+.cfi_restore   %r12
+       mov     -16($fp),%rbp
+.cfi_restore   %rbp
+       mov     -8($fp),%rbx
+.cfi_restore   %rbx
+       lea     ($fp),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx:
+       ret
+.cfi_endproc
+.size  sha1_block_data_order_avx,.-sha1_block_data_order_avx
+___
+
+if ($avx>1) {
+use integer;
+$Xi=4;                                 # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+@Tx=map("%ymm$_",(8..10));
+$Kx="%ymm11";
+$j=0;
+
+my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
+my ($a5,$t0)=("%r12d","%edi");
+
+my ($A,$F,$B,$C,$D,$E)=@ROTX;
+my $rx=0;
+my $frame="%r13";
+
+$code.=<<___;
+.type  sha1_block_data_order_avx2,\@function,3
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc
+       mov     %rsp,$fp
+.cfi_def_cfa_register  $fp
+       push    %rbx
+.cfi_push      %rbx
+       push    %rbp
+.cfi_push      %rbp
+       push    %r12
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       vzeroupper
+___
+$code.=<<___ if ($win64);
+       lea     -6*16(%rsp),%rsp
+       vmovaps %xmm6,-40-6*16($fp)
+       vmovaps %xmm7,-40-5*16($fp)
+       vmovaps %xmm8,-40-4*16($fp)
+       vmovaps %xmm9,-40-3*16($fp)
+       vmovaps %xmm10,-40-2*16($fp)
+       vmovaps %xmm11,-40-1*16($fp)
+.Lprologue_avx2:
+___
+$code.=<<___;
+       mov     %rdi,$ctx               # reassigned argument
+       mov     %rsi,$inp               # reassigned argument
+       mov     %rdx,$num               # reassigned argument
+
+       lea     -640(%rsp),%rsp
+       shl     \$6,$num
+        lea    64($inp),$frame
+       and     \$-128,%rsp
+       add     $inp,$num
+       lea     K_XX_XX+64(%rip),$K_XX_XX
+
+       mov     0($ctx),$A              # load context
+        cmp    $num,$frame
+        cmovae $inp,$frame             # next or same block
+       mov     4($ctx),$F
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     16($ctx),$E
+       vmovdqu 64($K_XX_XX),@X[2]      # pbswap mask
+
+       vmovdqu         ($inp),%xmm0
+       vmovdqu         16($inp),%xmm1
+       vmovdqu         32($inp),%xmm2
+       vmovdqu         48($inp),%xmm3
+       lea             64($inp),$inp
+       vinserti128     \$1,($frame),@X[-4&7],@X[-4&7]
+       vinserti128     \$1,16($frame),@X[-3&7],@X[-3&7]
+       vpshufb         @X[2],@X[-4&7],@X[-4&7]
+       vinserti128     \$1,32($frame),@X[-2&7],@X[-2&7]
+       vpshufb         @X[2],@X[-3&7],@X[-3&7]
+       vinserti128     \$1,48($frame),@X[-1&7],@X[-1&7]
+       vpshufb         @X[2],@X[-2&7],@X[-2&7]
+       vmovdqu         -64($K_XX_XX),$Kx       # K_00_19
+       vpshufb         @X[2],@X[-1&7],@X[-1&7]
+
+       vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
+       vpaddd  $Kx,@X[-3&7],@X[1]
+       vmovdqu @X[0],0(%rsp)           # X[]+K xfer to IALU
+       vpaddd  $Kx,@X[-2&7],@X[2]
+       vmovdqu @X[1],32(%rsp)
+       vpaddd  $Kx,@X[-1&7],@X[3]
+       vmovdqu @X[2],64(%rsp)
+       vmovdqu @X[3],96(%rsp)
+___
+for (;$Xi<8;$Xi++) {   # Xupdate_avx2_16_31
+    use integer;
+
+       &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
+       &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
+       &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
+       &vpsrld (@Tx[0],@X[0],31);
+       &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")      if ($Xi%5==0);  # K_XX_XX
+       &vpslldq(@Tx[2],@X[0],12);              # "X[0]"<<96, extract one dword
+       &vpaddd (@X[0],@X[0],@X[0]);
+       &vpsrld (@Tx[1],@Tx[2],30);
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
+       &vpslld (@Tx[2],@Tx[2],2);
+       &vpxor  (@X[0],@X[0],@Tx[1]);
+       &vpxor  (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
+       &vpaddd (@Tx[1],@X[0],$Kx);
+       &vmovdqu("32*$Xi(%rsp)",@Tx[1]);        # X[]+K xfer to IALU
+
+       push(@X,shift(@X));     # "rotate" X[]
+}
+$code.=<<___;
+       lea     128(%rsp),$frame
+       jmp     .Loop_avx2
+.align 32
+.Loop_avx2:
+       rorx    \$2,$F,$B
+       andn    $D,$F,$t0
+       and     $C,$F
+       xor     $t0,$F
+___
+sub bodyx_00_19 () {   # 8 instructions, 3 cycles critical path
+       # at start $f=(b&c)^(~b&d), $b>>>=2
+       return &bodyx_20_39() if ($rx==19); $rx++;
+       (
+       '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+       '&add   ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.       # e+=X[i]+K
+        '&lea  ($frame,"256($frame)")  if ($j%32==31);',
+       '&andn  ($t0,$a,$c)',                   # ~b&d for next round
+
+       '&add   ($e,$f)',                       # e+=(b&c)^(~b&d)
+       '&rorx  ($a5,$a,27)',                   # a<<<5
+       '&rorx  ($f,$a,2)',                     # b>>>2 for next round
+       '&and   ($a,$b)',                       # b&c for next round
+
+       '&add   ($e,$a5)',                      # e+=a<<<5
+       '&xor   ($a,$t0);'.                     # f=(b&c)^(~b&d) for next round
+
+       'unshift(@ROTX,pop(@ROTX)); $j++;'
+       )
+}
+
+sub bodyx_20_39 () {   # 7 instructions, 2 cycles critical path
+       # on entry $f=b^c^d, $b>>>=2
+       return &bodyx_40_59() if ($rx==39); $rx++;
+       (
+       '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+       '&add   ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.       # e+=X[i]+K
+        '&lea  ($frame,"256($frame)")  if ($j%32==31);',
+
+       '&lea   ($e,"($e,$f)")',                # e+=b^c^d
+       '&rorx  ($a5,$a,27)',                   # a<<<5
+       '&rorx  ($f,$a,2)       if ($j<79)',    # b>>>2 in next round
+       '&xor   ($a,$b)         if ($j<79)',    # b^c for next round
+
+       '&add   ($e,$a5)',                      # e+=a<<<5
+       '&xor   ($a,$c)         if ($j<79);'.   # f=b^c^d for next round
+
+       'unshift(@ROTX,pop(@ROTX)); $j++;'
+       )
+}
+
+sub bodyx_40_59 () {   # 10 instructions, 3 cycles critical path
+       # on entry $f=((b^c)&(c^d)), $b>>>=2
+       $rx++;
+       (
+       '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+       '&add   ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.       # e+=X[i]+K
+        '&lea  ($frame,"256($frame)")  if ($j%32==31);',
+       '&xor   ($f,$c)         if ($j>39)',    # (b^c)&(c^d)^c
+       '&mov   ($t0,$b)        if ($j<59)',    # count on zero latency
+       '&xor   ($t0,$c)        if ($j<59)',    # c^d for next round
+
+       '&lea   ($e,"($e,$f)")',                # e+=(b^c)&(c^d)^c
+       '&rorx  ($a5,$a,27)',                   # a<<<5
+       '&rorx  ($f,$a,2)',                     # b>>>2 in next round
+       '&xor   ($a,$b)',                       # b^c for next round
+
+       '&add   ($e,$a5)',                      # e+=a<<<5
+       '&and   ($a,$t0)        if ($j< 59);'.  # f=(b^c)&(c^d) for next round
+       '&xor   ($a,$c)         if ($j==59);'.  # f=b^c^d for next round
+
+       'unshift(@ROTX,pop(@ROTX)); $j++;'
+       )
+}
+
+sub Xupdate_avx2_16_31()               # recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body,&$body);    # 35 instructions
+  my ($a,$b,$c,$d,$e);
+
+       &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
+       &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[0],@X[0],31);
+       &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")      if ($Xi%5==0);  # K_XX_XX
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslldq(@Tx[2],@X[0],12);              # "X[0]"<<96, extract one dword
+       &vpaddd (@X[0],@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[1],@Tx[2],30);
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslld (@Tx[2],@Tx[2],2);
+       &vpxor  (@X[0],@X[0],@Tx[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpaddd (@Tx[1],@X[0],$Kx);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);       # X[]+K xfer to IALU
+
+        foreach (@insns) { eval; }     # remaining instructions [if any]
+
+       $Xi++;
+       push(@X,shift(@X));     # "rotate" X[]
+}
+
+sub Xupdate_avx2_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body,&$body);    # 35 to 50 instructions
+  my ($a,$b,$c,$d,$e);
+
+       &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
+       &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")        if ($Xi%5==0);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[0],@X[0],30);
+       &vpslld (@X[0],@X[0],2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       #&vpslld        (@X[0],@X[0],2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpaddd (@Tx[1],@X[0],$Kx);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vmovdqu("32*$Xi(%rsp)",@Tx[1]);        # X[]+K xfer to IALU
+
+        foreach (@insns) { eval; }     # remaining instructions
+
+       $Xi++;
+       push(@X,shift(@X));     # "rotate" X[]
+}
+
+sub Xloop_avx2()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body,&$body);    # 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+        foreach (@insns) { eval; }
+}
+
+       &align32();
+       &Xupdate_avx2_32_79(\&bodyx_00_19);
+       &Xupdate_avx2_32_79(\&bodyx_00_19);
+       &Xupdate_avx2_32_79(\&bodyx_00_19);
+       &Xupdate_avx2_32_79(\&bodyx_00_19);
+
+       &Xupdate_avx2_32_79(\&bodyx_20_39);
+       &Xupdate_avx2_32_79(\&bodyx_20_39);
+       &Xupdate_avx2_32_79(\&bodyx_20_39);
+       &Xupdate_avx2_32_79(\&bodyx_20_39);
+
+       &align32();
+       &Xupdate_avx2_32_79(\&bodyx_40_59);
+       &Xupdate_avx2_32_79(\&bodyx_40_59);
+       &Xupdate_avx2_32_79(\&bodyx_40_59);
+       &Xupdate_avx2_32_79(\&bodyx_40_59);
+
+       &Xloop_avx2(\&bodyx_20_39);
+       &Xloop_avx2(\&bodyx_20_39);
+       &Xloop_avx2(\&bodyx_20_39);
+       &Xloop_avx2(\&bodyx_20_39);
+
+$code.=<<___;
+       lea     128($inp),$frame
+       lea     128($inp),%rdi                  # borrow $t0
+       cmp     $num,$frame
+       cmovae  $inp,$frame                     # next or previous block
+
+       # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+       add     0($ctx),@ROTX[0]                # update context
+       add     4($ctx),@ROTX[1]
+       add     8($ctx),@ROTX[3]
+       mov     @ROTX[0],0($ctx)
+       add     12($ctx),@ROTX[4]
+       mov     @ROTX[1],4($ctx)
+        mov    @ROTX[0],$A                     # A=d
+       add     16($ctx),@ROTX[5]
+        mov    @ROTX[3],$a5
+       mov     @ROTX[3],8($ctx)
+        mov    @ROTX[4],$D                     # D=b
+        #xchg  @ROTX[5],$F                     # F=c, C=f
+       mov     @ROTX[4],12($ctx)
+        mov    @ROTX[1],$F                     # F=e
+       mov     @ROTX[5],16($ctx)
+       #mov    $F,16($ctx)
+        mov    @ROTX[5],$E                     # E=c
+        mov    $a5,$C                          # C=f
+        #xchg  $F,$E                           # E=c, F=e
+
+       cmp     $num,$inp
+       je      .Ldone_avx2
+___
+
+$Xi=4;                         # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+
+$code.=<<___;
+       vmovdqu 64($K_XX_XX),@X[2]              # pbswap mask
+       cmp     $num,%rdi                       # borrowed $t0
+       ja      .Last_avx2
+
+       vmovdqu         -64(%rdi),%xmm0         # low part of @X[-4&7]
+       vmovdqu         -48(%rdi),%xmm1
+       vmovdqu         -32(%rdi),%xmm2
+       vmovdqu         -16(%rdi),%xmm3
+       vinserti128     \$1,0($frame),@X[-4&7],@X[-4&7]
+       vinserti128     \$1,16($frame),@X[-3&7],@X[-3&7]
+       vinserti128     \$1,32($frame),@X[-2&7],@X[-2&7]
+       vinserti128     \$1,48($frame),@X[-1&7],@X[-1&7]
+       jmp     .Last_avx2
+
+.align 32
+.Last_avx2:
+       lea     128+16(%rsp),$frame
+       rorx    \$2,$F,$B
+       andn    $D,$F,$t0
+       and     $C,$F
+       xor     $t0,$F
+       sub     \$-128,$inp
+___
+       $rx=$j=0;       @ROTX=($A,$F,$B,$C,$D,$E);
+
+       &Xloop_avx2     (\&bodyx_00_19);
+       &Xloop_avx2     (\&bodyx_00_19);
+       &Xloop_avx2     (\&bodyx_00_19);
+       &Xloop_avx2     (\&bodyx_00_19);
+
+       &Xloop_avx2     (\&bodyx_20_39);
+         &vmovdqu      ($Kx,"-64($K_XX_XX)");          # K_00_19
+         &vpshufb      (@X[-4&7],@X[-4&7],@X[2]);      # byte swap
+       &Xloop_avx2     (\&bodyx_20_39);
+         &vpshufb      (@X[-3&7],@X[-3&7],@X[2]);
+         &vpaddd       (@Tx[0],@X[-4&7],$Kx);          # add K_00_19
+       &Xloop_avx2     (\&bodyx_20_39);
+         &vmovdqu      ("0(%rsp)",@Tx[0]);
+         &vpshufb      (@X[-2&7],@X[-2&7],@X[2]);
+         &vpaddd       (@Tx[1],@X[-3&7],$Kx);
+       &Xloop_avx2     (\&bodyx_20_39);
+         &vmovdqu      ("32(%rsp)",@Tx[1]);
+         &vpshufb      (@X[-1&7],@X[-1&7],@X[2]);
+         &vpaddd       (@X[2],@X[-2&7],$Kx);
+
+       &Xloop_avx2     (\&bodyx_40_59);
+       &align32        ();
+         &vmovdqu      ("64(%rsp)",@X[2]);
+         &vpaddd       (@X[3],@X[-1&7],$Kx);
+       &Xloop_avx2     (\&bodyx_40_59);
+         &vmovdqu      ("96(%rsp)",@X[3]);
+       &Xloop_avx2     (\&bodyx_40_59);
+       &Xupdate_avx2_16_31(\&bodyx_40_59);
+
+       &Xupdate_avx2_16_31(\&bodyx_20_39);
+       &Xupdate_avx2_16_31(\&bodyx_20_39);
+       &Xupdate_avx2_16_31(\&bodyx_20_39);
+       &Xloop_avx2     (\&bodyx_20_39);
+
+$code.=<<___;
+       lea     128(%rsp),$frame
+
+       # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+       add     0($ctx),@ROTX[0]                # update context
+       add     4($ctx),@ROTX[1]
+       add     8($ctx),@ROTX[3]
+       mov     @ROTX[0],0($ctx)
+       add     12($ctx),@ROTX[4]
+       mov     @ROTX[1],4($ctx)
+        mov    @ROTX[0],$A                     # A=d
+       add     16($ctx),@ROTX[5]
+        mov    @ROTX[3],$a5
+       mov     @ROTX[3],8($ctx)
+        mov    @ROTX[4],$D                     # D=b
+        #xchg  @ROTX[5],$F                     # F=c, C=f
+       mov     @ROTX[4],12($ctx)
+        mov    @ROTX[1],$F                     # F=e
+       mov     @ROTX[5],16($ctx)
+       #mov    $F,16($ctx)
+        mov    @ROTX[5],$E                     # E=c
+        mov    $a5,$C                          # C=f
+        #xchg  $F,$E                           # E=c, F=e
+
+       cmp     $num,$inp
+       jbe     .Loop_avx2
+
+.Ldone_avx2:
+       vzeroupper
+___
+$code.=<<___ if ($win64);
+       movaps  -40-6*16($fp),%xmm6
+       movaps  -40-5*16($fp),%xmm7
+       movaps  -40-4*16($fp),%xmm8
+       movaps  -40-3*16($fp),%xmm9
+       movaps  -40-2*16($fp),%xmm10
+       movaps  -40-1*16($fp),%xmm11
+___
+$code.=<<___;
+       mov     -40($fp),%r14
+.cfi_restore   %r14
+       mov     -32($fp),%r13
+.cfi_restore   %r13
+       mov     -24($fp),%r12
+.cfi_restore   %r12
+       mov     -16($fp),%rbp
+.cfi_restore   %rbp
+       mov     -8($fp),%rbx
+.cfi_restore   %rbx
+       lea     ($fp),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx2:
+       ret
+.cfi_endproc
+.size  sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+___
+}
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
+.byte  0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+___
+}}}
+$code.=<<___;
+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lprologue(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<.Lprologue
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     .Lepilogue(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>=.Lepilogue
+       jae     .Lcommon_seh_tail
+
+       mov     `16*4`(%rax),%rax       # pull saved stack pointer
+
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+
+       jmp     .Lcommon_seh_tail
+.size  se_handler,.-se_handler
+___
+
+$code.=<<___ if ($shaext);
+.type  shaext_handler,\@abi-omnipotent
+.align 16
+shaext_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lprologue_shaext(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<.Lprologue
+       jb      .Lcommon_seh_tail
+
+       lea     .Lepilogue_shaext(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>=.Lepilogue
+       jae     .Lcommon_seh_tail
+
+       lea     -8-4*16(%rax),%rsi
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$8,%ecx
+       .long   0xa548f3fc              # cld; rep movsq
+
+       jmp     .Lcommon_seh_tail
+.size  shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
+.type  ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     208($context),%rax      # pull context->R11
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       lea     -40-6*16(%rax),%rsi
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$12,%ecx
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+
+.Lcommon_seh_tail:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  ssse3_handler,.-ssse3_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_sha1_block_data_order
+       .rva    .LSEH_end_sha1_block_data_order
+       .rva    .LSEH_info_sha1_block_data_order
+___
+$code.=<<___ if ($shaext);
+       .rva    .LSEH_begin_sha1_block_data_order_shaext
+       .rva    .LSEH_end_sha1_block_data_order_shaext
+       .rva    .LSEH_info_sha1_block_data_order_shaext
+___
+$code.=<<___;
+       .rva    .LSEH_begin_sha1_block_data_order_ssse3
+       .rva    .LSEH_end_sha1_block_data_order_ssse3
+       .rva    .LSEH_info_sha1_block_data_order_ssse3
+___
+$code.=<<___ if ($avx);
+       .rva    .LSEH_begin_sha1_block_data_order_avx
+       .rva    .LSEH_end_sha1_block_data_order_avx
+       .rva    .LSEH_info_sha1_block_data_order_avx
+___
+$code.=<<___ if ($avx>1);
+       .rva    .LSEH_begin_sha1_block_data_order_avx2
+       .rva    .LSEH_end_sha1_block_data_order_avx2
+       .rva    .LSEH_info_sha1_block_data_order_avx2
+___
+$code.=<<___;
+.section       .xdata
+.align 8
+.LSEH_info_sha1_block_data_order:
+       .byte   9,0,0,0
+       .rva    se_handler
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_sha1_block_data_order_shaext:
+       .byte   9,0,0,0
+       .rva    shaext_handler
+___
+$code.=<<___;
+.LSEH_info_sha1_block_data_order_ssse3:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_block_data_order_avx:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_sha1_block_data_order_avx2:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
+___
+}
+
+####################################################################
+
+sub sha1rnds4 {
+    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
+      my @opcode=(0x0f,0x3a,0xcc);
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       my $c=$1;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       return ".byte\t".join(',',@opcode);
+    } else {
+       return "sha1rnds4\t".@_[0];
+    }
+}
+
+sub sha1op38 {
+    my $instr = shift;
+    my %opcodelet = (
+               "sha1nexte" => 0xc8,
+               "sha1msg1"  => 0xc9,
+               "sha1msg2"  => 0xca     );
+
+    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x0f,0x38);
+      my $rex=0;
+       $rex|=0x04                      if ($2>=8);
+       $rex|=0x01                      if ($1>=8);
+       unshift @opcode,0x40|$rex       if ($rex);
+       push @opcode,$opcodelet{$instr};
+       push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
+       return ".byte\t".join(',',@opcode);
+    } else {
+       return $instr."\t".@_[0];
+    }
+}
+
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo        or
+       s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
+
+       print $_,"\n";
+}
+close STDOUT;
diff --git a/aesni/x86_64-xlate.pl b/aesni/x86_64-xlate.pl

new file mode 100755 (executable)

index 0000000..4a7ec7a
--- /dev/null
+++ b/aesni/x86_64-xlate.pl
@@ -0,0 +1,1447 @@
+#! /usr/bin/env perl
+# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# Ascetic x86_64 AT&T to MASM/NASM assembler translator by <appro>.
+#
+# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T
+# format is way easier to parse. Because it's simpler to "gear" from
+# Unix ABI to Windows one [see cross-reference "card" at the end of
+# file]. Because Linux targets were available first...
+#
+# In addition the script also "distills" code suitable for GNU
+# assembler, so that it can be compiled with more rigid assemblers,
+# such as Solaris /usr/ccs/bin/as.
+#
+# This translator is not designed to convert *arbitrary* assembler
+# code from AT&T format to MASM one. It's designed to convert just
+# enough to provide for dual-ABI OpenSSL modules development...
+# There *are* limitations and you might have to modify your assembler
+# code or this script to achieve the desired result...
+#
+# Currently recognized limitations:
+#
+# - can't use multiple ops per line;
+#
+# Dual-ABI styling rules.
+#
+# 1. Adhere to Unix register and stack layout [see cross-reference
+#    ABI "card" at the end for explanation].
+# 2. Forget about "red zone," stick to more traditional blended
+#    stack frame allocation. If volatile storage is actually required
+#    that is. If not, just leave the stack as is.
+# 3. Functions tagged with ".type name,@function" get crafted with
+#    unified Win64 prologue and epilogue automatically. If you want
+#    to take care of ABI differences yourself, tag functions as
+#    ".type name,@abi-omnipotent" instead.
+# 4. To optimize the Win64 prologue you can specify number of input
+#    arguments as ".type name,@function,N." Keep in mind that if N is
+#    larger than 6, then you *have to* write "abi-omnipotent" code,
+#    because >6 cases can't be addressed with unified prologue.
+# 5. Name local labels as .L*, do *not* use dynamic labels such as 1:
+#    (sorry about latter).
+# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is
+#    required to identify the spots, where to inject Win64 epilogue!
+#    But on the pros, it's then prefixed with rep automatically:-)
+# 7. Stick to explicit ip-relative addressing. If you have to use
+#    GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??.
+#    Both are recognized and translated to proper Win64 addressing
+#    modes.
+#
+# 8. In order to provide for structured exception handling unified
+#    Win64 prologue copies %rsp value to %rax. For further details
+#    see SEH paragraph at the end.
+# 9. .init segment is allowed to contain calls to functions only.
+# a. If function accepts more than 4 arguments *and* >4th argument
+#    is declared as non 64-bit value, do clear its upper part.
+\f
+
+use strict;
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+open STDOUT,">$output" || die "can't open $output: $!"
+       if (defined($output));
+
+my $gas=1;     $gas=0 if ($output =~ /\.asm$/);
+my $elf=1;     $elf=0 if (!$gas);
+my $win64=0;
+my $prefix="";
+my $decor=".L";
+
+my $masmref=8 + 50727*2**-32;  # 8.00.50727 shipped with VS2005
+my $masm=0;
+my $PTR=" PTR";
+
+my $nasmref=2.03;
+my $nasm=0;
+
+if    ($flavour eq "mingw64")  { $gas=1; $elf=0; $win64=1;
+                                 $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`;
+                                 $prefix =~ s|\R$||; # Better chomp
+                               }
+elsif ($flavour eq "macosx")   { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; }
+elsif ($flavour eq "masm")     { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; }
+elsif ($flavour eq "nasm")     { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; }
+elsif (!$gas)
+{   if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i)
+    {  $nasm = $1 + $2*0.01; $PTR="";  }
+    elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/)
+    {  $masm = $1 + $2*2**-16 + $4*2**-32;   }
+    die "no assembler found on %PATH%" if (!($nasm || $masm));
+    $win64=1;
+    $elf=0;
+    $decor="\$L\$";
+}
+
+my $current_segment;
+my $current_function;
+my %globals;
+
+{ package opcode;      # pick up opcodes
+    sub re {
+       my      ($class, $line) = @_;
+       my      $self = {};
+       my      $ret;
+
+       if ($$line =~ /^([a-z][a-z0-9]*)/i) {
+           bless $self,$class;
+           $self->{op} = $1;
+           $ret = $self;
+           $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+           undef $self->{sz};
+           if ($self->{op} =~ /^(movz)x?([bw]).*/) {   # movz is pain...
+               $self->{op} = $1;
+               $self->{sz} = $2;
+           } elsif ($self->{op} =~ /call|jmp/) {
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) {
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
+               $self->{op} = $1;
+               $self->{sz} = $2;
+           }
+       }
+       $ret;
+    }
+    sub size {
+       my ($self, $sz) = @_;
+       $self->{sz} = $sz if (defined($sz) && !defined($self->{sz}));
+       $self->{sz};
+    }
+    sub out {
+       my $self = shift;
+       if ($gas) {
+           if ($self->{op} eq "movz") {        # movz is pain...
+               sprintf "%s%s%s",$self->{op},$self->{sz},shift;
+           } elsif ($self->{op} =~ /^set/) {
+               "$self->{op}";
+           } elsif ($self->{op} eq "ret") {
+               my $epilogue = "";
+               if ($win64 && $current_function->{abi} eq "svr4") {
+                   $epilogue = "movq   8(%rsp),%rdi\n\t" .
+                               "movq   16(%rsp),%rsi\n\t";
+               }
+               $epilogue . ".byte      0xf3,0xc3";
+           } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") {
+               ".p2align\t3\n\t.quad";
+           } else {
+               "$self->{op}$self->{sz}";
+           }
+       } else {
+           $self->{op} =~ s/^movz/movzx/;
+           if ($self->{op} eq "ret") {
+               $self->{op} = "";
+               if ($win64 && $current_function->{abi} eq "svr4") {
+                   $self->{op} = "mov  rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t".
+                                 "mov  rsi,QWORD$PTR\[16+rsp\]\n\t";
+               }
+               $self->{op} .= "DB\t0F3h,0C3h\t\t;repret";
+           } elsif ($self->{op} =~ /^(pop|push)f/) {
+               $self->{op} .= $self->{sz};
+           } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
+               $self->{op} = "\tDQ";
+           }
+           $self->{op};
+       }
+    }
+    sub mnemonic {
+       my ($self, $op) = @_;
+       $self->{op}=$op if (defined($op));
+       $self->{op};
+    }
+}
+{ package const;       # pick up constants, which start with $
+    sub re {
+       my      ($class, $line) = @_;
+       my      $self = {};
+       my      $ret;
+
+       if ($$line =~ /^\$([^,]+)/) {
+           bless $self, $class;
+           $self->{value} = $1;
+           $ret = $self;
+           $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+       }
+       $ret;
+    }
+    sub out {
+       my $self = shift;
+
+       $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig;
+       if ($gas) {
+           # Solaris /usr/ccs/bin/as can't handle multiplications
+           # in $self->{value}
+           my $value = $self->{value};
+           no warnings;    # oct might complain about overflow, ignore here...
+           $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+           if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) {
+               $self->{value} = $value;
+           }
+           sprintf "\$%s",$self->{value};
+       } else {
+           my $value = $self->{value};
+           $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm);
+           sprintf "%s",$value;
+       }
+    }
+}
+{ package ea;          # pick up effective addresses: expr(%reg,%reg,scale)
+
+    my %szmap = (      b=>"BYTE$PTR",    w=>"WORD$PTR",
+                       l=>"DWORD$PTR",   d=>"DWORD$PTR",
+                       q=>"QWORD$PTR",   o=>"OWORD$PTR",
+                       x=>"XMMWORD$PTR", y=>"YMMWORD$PTR",
+                       z=>"ZMMWORD$PTR" ) if (!$gas);
+
+    sub re {
+       my      ($class, $line, $opcode) = @_;
+       my      $self = {};
+       my      $ret;
+
+       # optional * ----vvv--- appears in indirect jmp/call
+       if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) {
+           bless $self, $class;
+           $self->{asterisk} = $1;
+           $self->{label} = $2;
+           ($self->{base},$self->{index},$self->{scale})=split(/,/,$3);
+           $self->{scale} = 1 if (!defined($self->{scale}));
+           $self->{opmask} = $4;
+           $ret = $self;
+           $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+           if ($win64 && $self->{label} =~ s/\@GOTPCREL//) {
+               die if ($opcode->mnemonic() ne "mov");
+               $opcode->mnemonic("lea");
+           }
+           $self->{base}  =~ s/^%//;
+           $self->{index} =~ s/^%// if (defined($self->{index}));
+           $self->{opcode} = $opcode;
+       }
+       $ret;
+    }
+    sub size {}
+    sub out {
+       my ($self, $sz) = @_;
+
+       $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+       $self->{label} =~ s/\.L/$decor/g;
+
+       # Silently convert all EAs to 64-bit. This is required for
+       # elder GNU assembler and results in more compact code,
+       # *but* most importantly AES module depends on this feature!
+       $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
+       $self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
+
+       # Solaris /usr/ccs/bin/as can't handle multiplications
+       # in $self->{label}...
+       use integer;
+       $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+       $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg;
+
+       # Some assemblers insist on signed presentation of 32-bit
+       # offsets, but sign extension is a tricky business in perl...
+       if ((1<<31)<<1) {
+           $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg;
+       } else {
+           $self->{label} =~ s/\b([0-9]+)\b/$1>>0/eg;
+       }
+
+       # if base register is %rbp or %r13, see if it's possible to
+       # flip base and index registers [for better performance]
+       if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
+           $self->{base} =~ /(rbp|r13)/) {
+               $self->{base} = $self->{index}; $self->{index} = $1;
+       }
+
+       if ($gas) {
+           $self->{label} =~ s/^___imp_/__imp__/   if ($flavour eq "mingw64");
+
+           if (defined($self->{index})) {
+               sprintf "%s%s(%s,%%%s,%d)%s",
+                                       $self->{asterisk},$self->{label},
+                                       $self->{base}?"%$self->{base}":"",
+                                       $self->{index},$self->{scale},
+                                       $self->{opmask};
+           } else {
+               sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label},
+                                       $self->{base},$self->{opmask};
+           }
+       } else {
+           $self->{label} =~ s/\./\$/g;
+           $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
+           $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
+
+           my $mnemonic = $self->{opcode}->mnemonic();
+           ($self->{asterisk})                         && ($sz="q") ||
+           ($mnemonic =~ /^v?mov([qd])$/)              && ($sz=$1)  ||
+           ($mnemonic =~ /^v?pinsr([qdwb])$/)          && ($sz=$1)  ||
+           ($mnemonic =~ /^vpbroadcast([qdwb])$/)      && ($sz=$1)  ||
+           ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/)   && ($sz="x");
+
+           $self->{opmask}  =~ s/%(k[0-7])/$1/;
+
+           if (defined($self->{index})) {
+               sprintf "%s[%s%s*%d%s]%s",$szmap{$sz},
+                                       $self->{label}?"$self->{label}+":"",
+                                       $self->{index},$self->{scale},
+                                       $self->{base}?"+$self->{base}":"",
+                                       $self->{opmask};
+           } elsif ($self->{base} eq "rip") {
+               sprintf "%s[%s]",$szmap{$sz},$self->{label};
+           } else {
+               sprintf "%s[%s%s]%s",   $szmap{$sz},
+                                       $self->{label}?"$self->{label}+":"",
+                                       $self->{base},$self->{opmask};
+           }
+       }
+    }
+}
+{ package register;    # pick up registers, which start with %.
+    sub re {
+       my      ($class, $line, $opcode) = @_;
+       my      $self = {};
+       my      $ret;
+
+       # optional * ----vvv--- appears in indirect jmp/call
+       if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) {
+           bless $self,$class;
+           $self->{asterisk} = $1;
+           $self->{value} = $2;
+           $self->{opmask} = $3;
+           $opcode->size($self->size());
+           $ret = $self;
+           $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+       }
+       $ret;
+    }
+    sub size {
+       my      $self = shift;
+       my      $ret;
+
+       if    ($self->{value} =~ /^r[\d]+b$/i)  { $ret="b"; }
+       elsif ($self->{value} =~ /^r[\d]+w$/i)  { $ret="w"; }
+       elsif ($self->{value} =~ /^r[\d]+d$/i)  { $ret="l"; }
+       elsif ($self->{value} =~ /^r[\w]+$/i)   { $ret="q"; }
+       elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; }
+       elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; }
+       elsif ($self->{value} =~ /^[\w]{2}$/i)  { $ret="w"; }
+       elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; }
+
+       $ret;
+    }
+    sub out {
+       my $self = shift;
+       if ($gas)       { sprintf "%s%%%s%s",   $self->{asterisk},
+                                               $self->{value},
+                                               $self->{opmask}; }
+       else            { $self->{opmask} =~ s/%(k[0-7])/$1/;
+                         $self->{value}.$self->{opmask}; }
+    }
+}
+{ package label;       # pick up labels, which end with :
+    sub re {
+       my      ($class, $line) = @_;
+       my      $self = {};
+       my      $ret;
+
+       if ($$line =~ /(^[\.\w]+)\:/) {
+           bless $self,$class;
+           $self->{value} = $1;
+           $ret = $self;
+           $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+           $self->{value} =~ s/^\.L/$decor/;
+       }
+       $ret;
+    }
+    sub out {
+       my $self = shift;
+
+       if ($gas) {
+           my $func = ($globals{$self->{value}} or $self->{value}) . ":";
+           if ($win64  && $current_function->{name} eq $self->{value}
+                       && $current_function->{abi} eq "svr4") {
+               $func .= "\n";
+               $func .= "      movq    %rdi,8(%rsp)\n";
+               $func .= "      movq    %rsi,16(%rsp)\n";
+               $func .= "      movq    %rsp,%rax\n";
+               $func .= "${decor}SEH_begin_$current_function->{name}:\n";
+               my $narg = $current_function->{narg};
+               $narg=6 if (!defined($narg));
+               $func .= "      movq    %rcx,%rdi\n" if ($narg>0);
+               $func .= "      movq    %rdx,%rsi\n" if ($narg>1);
+               $func .= "      movq    %r8,%rdx\n"  if ($narg>2);
+               $func .= "      movq    %r9,%rcx\n"  if ($narg>3);
+               $func .= "      movq    40(%rsp),%r8\n" if ($narg>4);
+               $func .= "      movq    48(%rsp),%r9\n" if ($narg>5);
+           }
+           $func;
+       } elsif ($self->{value} ne "$current_function->{name}") {
+           # Make all labels in masm global.
+           $self->{value} .= ":" if ($masm);
+           $self->{value} . ":";
+       } elsif ($win64 && $current_function->{abi} eq "svr4") {
+           my $func =  "$current_function->{name}" .
+                       ($nasm ? ":" : "\tPROC $current_function->{scope}") .
+                       "\n";
+           $func .= "  mov     QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n";
+           $func .= "  mov     QWORD$PTR\[16+rsp\],rsi\n";
+           $func .= "  mov     rax,rsp\n";
+           $func .= "${decor}SEH_begin_$current_function->{name}:";
+           $func .= ":" if ($masm);
+           $func .= "\n";
+           my $narg = $current_function->{narg};
+           $narg=6 if (!defined($narg));
+           $func .= "  mov     rdi,rcx\n" if ($narg>0);
+           $func .= "  mov     rsi,rdx\n" if ($narg>1);
+           $func .= "  mov     rdx,r8\n"  if ($narg>2);
+           $func .= "  mov     rcx,r9\n"  if ($narg>3);
+           $func .= "  mov     r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4);
+           $func .= "  mov     r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5);
+           $func .= "\n";
+       } else {
+          "$current_function->{name}".
+                       ($nasm ? ":" : "\tPROC $current_function->{scope}");
+       }
+    }
+}
+{ package expr;                # pick up expressions
+    sub re {
+       my      ($class, $line, $opcode) = @_;
+       my      $self = {};
+       my      $ret;
+
+       if ($$line =~ /(^[^,]+)/) {
+           bless $self,$class;
+           $self->{value} = $1;
+           $ret = $self;
+           $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+           $self->{value} =~ s/\@PLT// if (!$elf);
+           $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+           $self->{value} =~ s/\.L/$decor/g;
+           $self->{opcode} = $opcode;
+       }
+       $ret;
+    }
+    sub out {
+       my $self = shift;
+       if ($nasm && $self->{opcode}->mnemonic()=~m/^j(?![re]cxz)/) {
+           "NEAR ".$self->{value};
+       } else {
+           $self->{value};
+       }
+    }
+}
+{ package cfi_directive;
+    # CFI directives annotate instructions that are significant for
+    # stack unwinding procedure compliant with DWARF specification,
+    # see http://dwarfstd.org/. Besides naturally expected for this
+    # script platform-specific filtering function, this module adds
+    # three auxiliary synthetic directives not recognized by [GNU]
+    # assembler:
+    #
+    # - .cfi_push to annotate push instructions in prologue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_offset;
+    # - .cfi_pop to annotate pop instructions in epilogue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_restore;
+    # - [and most notably] .cfi_cfa_expression which encodes
+    #   DW_CFA_def_cfa_expression and passes it to .cfi_escape as
+    #   byte vector;
+    #
+    # CFA expressions were introduced in DWARF specification version
+    # 3 and describe how to deduce CFA, Canonical Frame Address. This
+    # becomes handy if your stack frame is variable and you can't
+    # spare register for [previous] frame pointer. Suggested directive
+    # syntax is made-up mix of DWARF operator suffixes [subset of]
+    # and references to registers with optional bias. Following example
+    # describes offloaded *original* stack pointer at specific offset
+    # from *current* stack pointer:
+    #
+    #   .cfi_cfa_expression     %rsp+40,deref,+8
+    #
+    # Final +8 has everything to do with the fact that CFA is defined
+    # as reference to top of caller's stack, and on x86_64 call to
+    # subroutine pushes 8-byte return address. In other words original
+    # stack pointer upon entry to a subroutine is 8 bytes off from CFA.
+
+    # Below constants are taken from "DWARF Expressions" section of the
+    # DWARF specification, section is numbered 7.7 in versions 3 and 4.
+    my %DW_OP_simple = (       # no-arg operators, mapped directly
+       deref   => 0x06,        dup     => 0x12,
+       drop    => 0x13,        over    => 0x14,
+       pick    => 0x15,        swap    => 0x16,
+       rot     => 0x17,        xderef  => 0x18,
+
+       abs     => 0x19,        and     => 0x1a,
+       div     => 0x1b,        minus   => 0x1c,
+       mod     => 0x1d,        mul     => 0x1e,
+       neg     => 0x1f,        not     => 0x20,
+       or      => 0x21,        plus    => 0x22,
+       shl     => 0x24,        shr     => 0x25,
+       shra    => 0x26,        xor     => 0x27,
+       );
+
+    my %DW_OP_complex = (      # used in specific subroutines
+       constu          => 0x10,        # uleb128
+       consts          => 0x11,        # sleb128
+       plus_uconst     => 0x23,        # uleb128
+       lit0            => 0x30,        # add 0-31 to opcode
+       reg0            => 0x50,        # add 0-31 to opcode
+       breg0           => 0x70,        # add 0-31 to opcole, sleb128
+       regx            => 0x90,        # uleb28
+       fbreg           => 0x91,        # sleb128
+       bregx           => 0x92,        # uleb128, sleb128
+       piece           => 0x93,        # uleb128
+       );
+
+    # Following constants are defined in x86_64 ABI supplement, for
+    # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf,
+    # see section 3.7 "Stack Unwind Algorithm".
+    my %DW_reg_idx = (
+       "%rax"=>0,  "%rdx"=>1,  "%rcx"=>2,  "%rbx"=>3,
+       "%rsi"=>4,  "%rdi"=>5,  "%rbp"=>6,  "%rsp"=>7,
+       "%r8" =>8,  "%r9" =>9,  "%r10"=>10, "%r11"=>11,
+       "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+       );
+
+    my ($cfa_reg, $cfa_rsp);
+    my @cfa_stack;
+
+    # [us]leb128 format is variable-length integer representation base
+    # 2^128, with most significant bit of each byte being 0 denoting
+    # *last* most significant digit. See "Variable Length Data" in the
+    # DWARF specification, numbered 7.6 at least in versions 3 and 4.
+    sub sleb128 {
+       use integer;    # get right shift extend sign
+
+       my $val = shift;
+       my $sign = ($val < 0) ? -1 : 0;
+       my @ret = ();
+
+       while(1) {
+           push @ret, $val&0x7f;
+
+           # see if remaining bits are same and equal to most
+           # significant bit of the current digit, if so, it's
+           # last digit...
+           last if (($val>>6) == $sign);
+
+           @ret[-1] |= 0x80;
+           $val >>= 7;
+       }
+
+       return @ret;
+    }
+    sub uleb128 {
+       my $val = shift;
+       my @ret = ();
+
+       while(1) {
+           push @ret, $val&0x7f;
+
+           # see if it's last significant digit...
+           last if (($val >>= 7) == 0);
+
+           @ret[-1] |= 0x80;
+       }
+
+       return @ret;
+    }
+    sub const {
+       my $val = shift;
+
+       if ($val >= 0 && $val < 32) {
+            return ($DW_OP_complex{lit0}+$val);
+       }
+       return ($DW_OP_complex{consts}, sleb128($val));
+    }
+    sub reg {
+       my $val = shift;
+
+       return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/);
+
+       my $reg = $DW_reg_idx{$1};
+       my $off = eval ("0 $2 $3");
+
+       return (($DW_OP_complex{breg0} + $reg), sleb128($off));
+       # Yes, we use DW_OP_bregX+0 to push register value and not
+       # DW_OP_regX, because latter would require even DW_OP_piece,
+       # which would be a waste under the circumstances. If you have
+       # to use DWP_OP_reg, use "regx:N"...
+    }
+    sub cfa_expression {
+       my $line = shift;
+       my @ret;
+
+       foreach my $token (split(/,\s*/,$line)) {
+           if ($token =~ /^%r/) {
+               push @ret,reg($token);
+           } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) {
+               push @ret,reg("$2+$1");
+           } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) {
+               my $i = 1*eval($2);
+               push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i));
+           } elsif (my $i = 1*eval($token) or $token eq "0") {
+               if ($token =~ /^\+/) {
+                   push @ret,$DW_OP_complex{plus_uconst},uleb128($i);
+               } else {
+                   push @ret,const($i);
+               }
+           } else {
+               push @ret,$DW_OP_simple{$token};
+           }
+       }
+
+       # Finally we return DW_CFA_def_cfa_expression, 15, followed by
+       # length of the expression and of course the expression itself.
+       return (15,scalar(@ret),@ret);
+    }
+    sub re {
+       my      ($class, $line) = @_;
+       my      $self = {};
+       my      $ret;
+
+       if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) {
+           bless $self,$class;
+           $ret = $self;
+           undef $self->{value};
+           my $dir = $1;
+
+           SWITCH: for ($dir) {
+           # What is $cfa_rsp? Effectively it's difference between %rsp
+           # value and current CFA, Canonical Frame Address, which is
+           # why it starts with -8. Recall that CFA is top of caller's
+           # stack...
+           /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; };
+           /endproc/   && do { ($cfa_reg, $cfa_rsp) = ("%rsp",  0);
+                               # .cfi_remember_state directives that are not
+                               # matched with .cfi_restore_state are
+                               # unnecessary.
+                               die "unpaired .cfi_remember_state" if (@cfa_stack);
+                               last;
+                             };
+           /def_cfa_register/
+                       && do { $cfa_reg = $$line; last; };
+           /def_cfa_offset/
+                       && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp");
+                               last;
+                             };
+           /adjust_cfa_offset/
+                       && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp");
+                               last;
+                             };
+           /def_cfa/   && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) {
+                                   $cfa_reg = $1;
+                                   $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp");
+                               }
+                               last;
+                             };
+           /push/      && do { $dir = undef;
+                               $cfa_rsp -= 8;
+                               if ($cfa_reg eq "%rsp") {
+                                   $self->{value} = ".cfi_adjust_cfa_offset\t8\n";
+                               }
+                               $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp";
+                               last;
+                             };
+           /pop/       && do { $dir = undef;
+                               $cfa_rsp += 8;
+                               if ($cfa_reg eq "%rsp") {
+                                   $self->{value} = ".cfi_adjust_cfa_offset\t-8\n";
+                               }
+                               $self->{value} .= ".cfi_restore\t$$line";
+                               last;
+                             };
+           /cfa_expression/
+                       && do { $dir = undef;
+                               $self->{value} = ".cfi_escape\t" .
+                                       join(",", map(sprintf("0x%02x", $_),
+                                                     cfa_expression($$line)));
+                               last;
+                             };
+           /remember_state/
+                       && do { push @cfa_stack, [$cfa_reg, $cfa_rsp];
+                               last;
+                             };
+           /restore_state/
+                       && do { ($cfa_reg, $cfa_rsp) = @{pop @cfa_stack};
+                               last;
+                             };
+           }
+
+           $self->{value} = ".cfi_$dir\t$$line" if ($dir);
+
+           $$line = "";
+       }
+
+       return $ret;
+    }
+    sub out {
+       my $self = shift;
+       return ($elf ? $self->{value} : undef);
+    }
+}
+{ package directive;   # pick up directives, which start with .
+    sub re {
+       my      ($class, $line) = @_;
+       my      $self = {};
+       my      $ret;
+       my      $dir;
+
+       # chain-call to cfi_directive
+       $ret = cfi_directive->re($line) and return $ret;
+
+       if ($$line =~ /^\s*(\.\w+)/) {
+           bless $self,$class;
+           $dir = $1;
+           $ret = $self;
+           undef $self->{value};
+           $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+           SWITCH: for ($dir) {
+               /\.global|\.globl|\.extern/
+                           && do { $globals{$$line} = $prefix . $$line;
+                                   $$line = $globals{$$line} if ($prefix);
+                                   last;
+                                 };
+               /\.type/    && do { my ($sym,$type,$narg) = split(',',$$line);
+                                   if ($type eq "\@function") {
+                                       undef $current_function;
+                                       $current_function->{name} = $sym;
+                                       $current_function->{abi}  = "svr4";
+                                       $current_function->{narg} = $narg;
+                                       $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE";
+                                   } elsif ($type eq "\@abi-omnipotent") {
+                                       undef $current_function;
+                                       $current_function->{name} = $sym;
+                                       $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE";
+                                   }
+                                   $$line =~ s/\@abi\-omnipotent/\@function/;
+                                   $$line =~ s/\@function.*/\@function/;
+                                   last;
+                                 };
+               /\.asciz/   && do { if ($$line =~ /^"(.*)"$/) {
+                                       $dir  = ".byte";
+                                       $$line = join(",",unpack("C*",$1),0);
+                                   }
+                                   last;
+                                 };
+               /\.rva|\.long|\.quad/
+                           && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+                                   $$line =~ s/\.L/$decor/g;
+                                   last;
+                                 };
+           }
+
+           if ($gas) {
+               $self->{value} = $dir . "\t" . $$line;
+
+               if ($dir =~ /\.extern/) {
+                   $self->{value} = ""; # swallow extern
+               } elsif (!$elf && $dir =~ /\.type/) {
+                   $self->{value} = "";
+                   $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" .
+                               (defined($globals{$1})?".scl 2;":".scl 3;") .
+                               "\t.type 32;\t.endef"
+                               if ($win64 && $$line =~ /([^,]+),\@function/);
+               } elsif (!$elf && $dir =~ /\.size/) {
+                   $self->{value} = "";
+                   if (defined($current_function)) {
+                       $self->{value} .= "${decor}SEH_end_$current_function->{name}:"
+                               if ($win64 && $current_function->{abi} eq "svr4");
+                       undef $current_function;
+                   }
+               } elsif (!$elf && $dir =~ /\.align/) {
+                   $self->{value} = ".p2align\t" . (log($$line)/log(2));
+               } elsif ($dir eq ".section") {
+                   $current_segment=$$line;
+                   if (!$elf && $current_segment eq ".init") {
+                       if      ($flavour eq "macosx")  { $self->{value} = ".mod_init_func"; }
+                       elsif   ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; }
+                   }
+               } elsif ($dir =~ /\.(text|data)/) {
+                   $current_segment=".$1";
+               } elsif ($dir =~ /\.hidden/) {
+                   if    ($flavour eq "macosx")  { $self->{value} = ".private_extern\t$prefix$$line"; }
+                   elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+               } elsif ($dir =~ /\.comm/) {
+                   $self->{value} = "$dir\t$prefix$$line";
+                   $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
+               }
+               $$line = "";
+               return $self;
+           }
+
+           # non-gas case or nasm/masm
+           SWITCH: for ($dir) {
+               /\.text/    && do { my $v=undef;
+                                   if ($nasm) {
+                                       $v="section     .text code align=64\n";
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if ($current_segment);
+                                       $current_segment = ".text\$";
+                                       $v.="$current_segment\tSEGMENT ";
+                                       $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE";
+                                       $v.=" 'CODE'";
+                                   }
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+               /\.data/    && do { my $v=undef;
+                                   if ($nasm) {
+                                       $v="section     .data data align=8\n";
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if ($current_segment);
+                                       $current_segment = "_DATA";
+                                       $v.="$current_segment\tSEGMENT";
+                                   }
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+               /\.section/ && do { my $v=undef;
+                                   $$line =~ s/([^,]*).*/$1/;
+                                   $$line = ".CRT\$XCU" if ($$line eq ".init");
+                                   if ($nasm) {
+                                       $v="section     $$line";
+                                       if ($$line=~/\.([px])data/) {
+                                           $v.=" rdata align=";
+                                           $v.=$1 eq "p"? 4 : 8;
+                                       } elsif ($$line=~/\.CRT\$/i) {
+                                           $v.=" rdata align=8";
+                                       }
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if ($current_segment);
+                                       $v.="$$line\tSEGMENT";
+                                       if ($$line=~/\.([px])data/) {
+                                           $v.=" READONLY";
+                                           $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
+                                       } elsif ($$line=~/\.CRT\$/i) {
+                                           $v.=" READONLY ";
+                                           $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
+                                       }
+                                   }
+                                   $current_segment = $$line;
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+               /\.extern/  && do { $self->{value}  = "EXTERN\t".$$line;
+                                   $self->{value} .= ":NEAR" if ($masm);
+                                   last;
+                                 };
+               /\.globl|.global/
+                           && do { $self->{value}  = $masm?"PUBLIC":"global";
+                                   $self->{value} .= "\t".$$line;
+                                   last;
+                                 };
+               /\.size/    && do { if (defined($current_function)) {
+                                       undef $self->{value};
+                                       if ($current_function->{abi} eq "svr4") {
+                                           $self->{value}="${decor}SEH_end_$current_function->{name}:";
+                                           $self->{value}.=":\n" if($masm);
+                                       }
+                                       $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name});
+                                       undef $current_function;
+                                   }
+                                   last;
+                                 };
+               /\.align/   && do { my $max = ($masm && $masm>=$masmref) ? 256 : 4096;
+                                   $self->{value} = "ALIGN\t".($$line>$max?$max:$$line);
+                                   last;
+                                 };
+               /\.(value|long|rva|quad)/
+                           && do { my $sz  = substr($1,0,1);
+                                   my @arr = split(/,\s*/,$$line);
+                                   my $last = pop(@arr);
+                                   my $conv = sub  {   my $var=shift;
+                                                       $var=~s/^(0b[0-1]+)/oct($1)/eig;
+                                                       $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm);
+                                                       if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
+                                                       { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
+                                                       $var;
+                                                   };
+
+                                   $sz =~ tr/bvlrq/BWDDQ/;
+                                   $self->{value} = "\tD$sz\t";
+                                   for (@arr) { $self->{value} .= &$conv($_).","; }
+                                   $self->{value} .= &$conv($last);
+                                   last;
+                                 };
+               /\.byte/    && do { my @str=split(/,\s*/,$$line);
+                                   map(s/(0b[0-1]+)/oct($1)/eig,@str);
+                                   map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
+                                   while ($#str>15) {
+                                       $self->{value}.="DB\t"
+                                               .join(",",@str[0..15])."\n";
+                                       foreach (0..15) { shift @str; }
+                                   }
+                                   $self->{value}.="DB\t"
+                                               .join(",",@str) if (@str);
+                                   last;
+                                 };
+               /\.comm/    && do { my @str=split(/,\s*/,$$line);
+                                   my $v=undef;
+                                   if ($nasm) {
+                                       $v.="common     $prefix@str[0] @str[1]";
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if ($current_segment);
+                                       $current_segment = "_DATA";
+                                       $v.="$current_segment\tSEGMENT\n";
+                                       $v.="COMM       @str[0]:DWORD:".@str[1]/4;
+                                   }
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+           }
+           $$line = "";
+       }
+
+       $ret;
+    }
+    sub out {
+       my $self = shift;
+       $self->{value};
+    }
+}
+
+# Upon initial x86_64 introduction SSE>2 extensions were not introduced
+# yet. In order not to be bothered by tracing exact assembler versions,
+# but at the same time to provide a bare security minimum of AES-NI, we
+# hard-code some instructions. Extensions past AES-NI on the other hand
+# are traced by examining assembler version in individual perlasm
+# modules...
+
+my %regrm = (  "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+               "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7      );
+
+sub rex {
+ my $opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+   $rex|=0x04 if($dst>=8);
+   $rex|=0x01 if($src>=8);
+   push @$opcode,($rex|0x40) if ($rex);
+}
+
+my $movq = sub {       # elderly gas can't handle inter-register movq
+  my $arg = shift;
+  my @opcode=(0x66);
+    if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+       my ($src,$dst)=($1,$2);
+       if ($dst !~ /[0-9]+/)   { $dst = $regrm{"%e$dst"}; }
+       rex(\@opcode,$src,$dst,0x8);
+       push @opcode,0x0f,0x7e;
+       push @opcode,0xc0|(($src&7)<<3)|($dst&7);       # ModR/M
+       @opcode;
+    } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+       my ($src,$dst)=($2,$1);
+       if ($dst !~ /[0-9]+/)   { $dst = $regrm{"%e$dst"}; }
+       rex(\@opcode,$src,$dst,0x8);
+       push @opcode,0x0f,0x6e;
+       push @opcode,0xc0|(($src&7)<<3)|($dst&7);       # ModR/M
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $pextrd = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+      my @opcode=(0x66);
+       my $imm=$1;
+       my $src=$2;
+       my $dst=$3;
+       if ($dst =~ /%r([0-9]+)d/)      { $dst = $1; }
+       elsif ($dst =~ /%e/)            { $dst = $regrm{$dst}; }
+       rex(\@opcode,$src,$dst);
+       push @opcode,0x0f,0x3a,0x16;
+       push @opcode,0xc0|(($src&7)<<3)|($dst&7);       # ModR/M
+       push @opcode,$imm;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $pinsrd = sub {
+    if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+       my $imm=$1;
+       my $src=$2;
+       my $dst=$3;
+       if ($src =~ /%r([0-9]+)/)       { $src = $1; }
+       elsif ($src =~ /%e/)            { $src = $regrm{$src}; }
+       rex(\@opcode,$dst,$src);
+       push @opcode,0x0f,0x3a,0x22;
+       push @opcode,0xc0|(($dst&7)<<3)|($src&7);       # ModR/M
+       push @opcode,$imm;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $pshufb = sub {
+    if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+       rex(\@opcode,$2,$1);
+       push @opcode,0x0f,0x38,0x00;
+       push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $palignr = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+       rex(\@opcode,$3,$2);
+       push @opcode,0x0f,0x3a,0x0f;
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       push @opcode,$1;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $pclmulqdq = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+       rex(\@opcode,$3,$2);
+       push @opcode,0x0f,0x3a,0x44;
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       my $c=$1;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $rdrand = sub {
+    if (shift =~ /%[er](\w+)/) {
+      my @opcode=();
+      my $dst=$1;
+       if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+       rex(\@opcode,0,$dst,8);
+       push @opcode,0x0f,0xc7,0xf0|($dst&7);
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $rdseed = sub {
+    if (shift =~ /%[er](\w+)/) {
+      my @opcode=();
+      my $dst=$1;
+       if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+       rex(\@opcode,0,$dst,8);
+       push @opcode,0x0f,0xc7,0xf8|($dst&7);
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Not all AVX-capable assemblers recognize AMD XOP extension. Since we
+# are using only two instructions hand-code them in order to be excused
+# from chasing assembler versions...
+
+sub rxb {
+ my $opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+   $rxb|=0x7<<5;
+   $rxb&=~(0x04<<5) if($dst>=8);
+   $rxb&=~(0x01<<5) if($src1>=8);
+   $rxb&=~(0x02<<5) if($src2>=8);
+   push @$opcode,$rxb;
+}
+
+my $vprotd = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x8f);
+       rxb(\@opcode,$3,$2,-1,0x08);
+       push @opcode,0x78,0xc2;
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       my $c=$1;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+my $vprotq = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x8f);
+       rxb(\@opcode,$3,$2,-1,0x08);
+       push @opcode,0x78,0xc3;
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       my $c=$1;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Intel Control-flow Enforcement Technology extension. All functions and
+# indirect branch targets will have to start with this instruction...
+
+my $endbranch = sub {
+    (0xf3,0x0f,0x1e,0xfa);
+};
+
+########################################################################
+
+if ($nasm) {
+    print <<___;
+default        rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+___
+} elsif ($masm) {
+    print <<___;
+OPTION DOTNAME
+___
+}
+while(defined(my $line=<>)) {
+
+    $line =~ s|\R$||;           # Better chomp
+
+    $line =~ s|[#!].*$||;      # get rid of asm-style comments...
+    $line =~ s|/\*.*\*/||;     # ... and C-style comments...
+    $line =~ s|^\s+||;         # ... and skip white spaces in beginning
+    $line =~ s|\s+$||;         # ... and at the end
+
+    if (my $label=label->re(\$line))   { print $label->out(); }
+
+    if (my $directive=directive->re(\$line)) {
+       printf "%s",$directive->out();
+    } elsif (my $opcode=opcode->re(\$line)) {
+       my $asm = eval("\$".$opcode->mnemonic());
+
+       if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
+           print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+           next;
+       }
+
+       my @args;
+       ARGUMENT: while (1) {
+           my $arg;
+
+           ($arg=register->re(\$line, $opcode))||
+           ($arg=const->re(\$line))            ||
+           ($arg=ea->re(\$line, $opcode))      ||
+           ($arg=expr->re(\$line, $opcode))    ||
+           last ARGUMENT;
+
+           push @args,$arg;
+
+           last ARGUMENT if ($line !~ /^,/);
+
+           $line =~ s/^,\s*//;
+       } # ARGUMENT:
+
+       if ($#args>=0) {
+           my $insn;
+           my $sz=$opcode->size();
+
+           if ($gas) {
+               $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+               @args = map($_->out($sz),@args);
+               printf "\t%s\t%s",$insn,join(",",@args);
+           } else {
+               $insn = $opcode->out();
+               foreach (@args) {
+                   my $arg = $_->out();
+                   # $insn.=$sz compensates for movq, pinsrw, ...
+                   if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+                   if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; }
+                   if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; }
+                   if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
+               }
+               @args = reverse(@args);
+               undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+               printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
+           }
+       } else {
+           printf "\t%s",$opcode->out();
+       }
+    }
+
+    print $line,"\n";
+}
+
+print "\n$current_segment\tENDS\n"     if ($current_segment && $masm);
+print "END\n"                          if ($masm);
+
+close STDOUT;
+
+\f#################################################
+# Cross-reference x86_64 ABI "card"
+#
+#              Unix            Win64
+# %rax         *               *
+# %rbx         -               -
+# %rcx         #4              #1
+# %rdx         #3              #2
+# %rsi         #2              -
+# %rdi         #1              -
+# %rbp         -               -
+# %rsp         -               -
+# %r8          #5              #3
+# %r9          #6              #4
+# %r10         *               *
+# %r11         *               *
+# %r12         -               -
+# %r13         -               -
+# %r14         -               -
+# %r15         -               -
+#
+# (*)  volatile register
+# (-)  preserved by callee
+# (#)  Nth argument, volatile
+#
+# In Unix terms top of stack is argument transfer area for arguments
+# which could not be accommodated in registers. Or in other words 7th
+# [integer] argument resides at 8(%rsp) upon function entry point.
+# 128 bytes above %rsp constitute a "red zone" which is not touched
+# by signal handlers and can be used as temporal storage without
+# allocating a frame.
+#
+# In Win64 terms N*8 bytes on top of stack is argument transfer area,
+# which belongs to/can be overwritten by callee. N is the number of
+# arguments passed to callee, *but* not less than 4! This means that
+# upon function entry point 5th argument resides at 40(%rsp), as well
+# as that 32 bytes from 8(%rsp) can always be used as temporal
+# storage [without allocating a frame]. One can actually argue that
+# one can assume a "red zone" above stack pointer under Win64 as well.
+# Point is that at apparently no occasion Windows kernel would alter
+# the area above user stack pointer in true asynchronous manner...
+#
+# All the above means that if assembler programmer adheres to Unix
+# register and stack layout, but disregards the "red zone" existence,
+# it's possible to use following prologue and epilogue to "gear" from
+# Unix to Win64 ABI in leaf functions with not more than 6 arguments.
+#
+# omnipotent_function:
+# ifdef WIN64
+#      movq    %rdi,8(%rsp)
+#      movq    %rsi,16(%rsp)
+#      movq    %rcx,%rdi       ; if 1st argument is actually present
+#      movq    %rdx,%rsi       ; if 2nd argument is actually ...
+#      movq    %r8,%rdx        ; if 3rd argument is ...
+#      movq    %r9,%rcx        ; if 4th argument ...
+#      movq    40(%rsp),%r8    ; if 5th ...
+#      movq    48(%rsp),%r9    ; if 6th ...
+# endif
+#      ...
+# ifdef WIN64
+#      movq    8(%rsp),%rdi
+#      movq    16(%rsp),%rsi
+# endif
+#      ret
+#
+\f#################################################
+# Win64 SEH, Structured Exception Handling.
+#
+# Unlike on Unix systems(*) lack of Win64 stack unwinding information
+# has undesired side-effect at run-time: if an exception is raised in
+# assembler subroutine such as those in question (basically we're
+# referring to segmentation violations caused by malformed input
+# parameters), the application is briskly terminated without invoking
+# any exception handlers, most notably without generating memory dump
+# or any user notification whatsoever. This poses a problem. It's
+# possible to address it by registering custom language-specific
+# handler that would restore processor context to the state at
+# subroutine entry point and return "exception is not handled, keep
+# unwinding" code. Writing such handler can be a challenge... But it's
+# doable, though requires certain coding convention. Consider following
+# snippet:
+#
+# .type        function,@function
+# function:
+#      movq    %rsp,%rax       # copy rsp to volatile register
+#      pushq   %r15            # save non-volatile registers
+#      pushq   %rbx
+#      pushq   %rbp
+#      movq    %rsp,%r11
+#      subq    %rdi,%r11       # prepare [variable] stack frame
+#      andq    $-64,%r11
+#      movq    %rax,0(%r11)    # check for exceptions
+#      movq    %r11,%rsp       # allocate [variable] stack frame
+#      movq    %rax,0(%rsp)    # save original rsp value
+# magic_point:
+#      ...
+#      movq    0(%rsp),%rcx    # pull original rsp value
+#      movq    -24(%rcx),%rbp  # restore non-volatile registers
+#      movq    -16(%rcx),%rbx
+#      movq    -8(%rcx),%r15
+#      movq    %rcx,%rsp       # restore original rsp
+# magic_epilogue:
+#      ret
+# .size function,.-function
+#
+# The key is that up to magic_point copy of original rsp value remains
+# in chosen volatile register and no non-volatile register, except for
+# rsp, is modified. While past magic_point rsp remains constant till
+# the very end of the function. In this case custom language-specific
+# exception handler would look like this:
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+# {    ULONG64 *rsp = (ULONG64 *)context->Rax;
+#      ULONG64  rip = context->Rip;
+#
+#      if (rip >= magic_point)
+#      {   rsp = (ULONG64 *)context->Rsp;
+#          if (rip < magic_epilogue)
+#          {   rsp = (ULONG64 *)rsp[0];
+#              context->Rbp = rsp[-3];
+#              context->Rbx = rsp[-2];
+#              context->R15 = rsp[-1];
+#          }
+#      }
+#      context->Rsp = (ULONG64)rsp;
+#      context->Rdi = rsp[1];
+#      context->Rsi = rsp[2];
+#
+#      memcpy (disp->ContextRecord,context,sizeof(CONTEXT));
+#      RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase,
+#              dips->ControlPc,disp->FunctionEntry,disp->ContextRecord,
+#              &disp->HandlerData,&disp->EstablisherFrame,NULL);
+#      return ExceptionContinueSearch;
+# }
+#
+# It's appropriate to implement this handler in assembler, directly in
+# function's module. In order to do that one has to know members'
+# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant
+# values. Here they are:
+#
+#      CONTEXT.Rax                             120
+#      CONTEXT.Rcx                             128
+#      CONTEXT.Rdx                             136
+#      CONTEXT.Rbx                             144
+#      CONTEXT.Rsp                             152
+#      CONTEXT.Rbp                             160
+#      CONTEXT.Rsi                             168
+#      CONTEXT.Rdi                             176
+#      CONTEXT.R8                              184
+#      CONTEXT.R9                              192
+#      CONTEXT.R10                             200
+#      CONTEXT.R11                             208
+#      CONTEXT.R12                             216
+#      CONTEXT.R13                             224
+#      CONTEXT.R14                             232
+#      CONTEXT.R15                             240
+#      CONTEXT.Rip                             248
+#      CONTEXT.Xmm6                            512
+#      sizeof(CONTEXT)                         1232
+#      DISPATCHER_CONTEXT.ControlPc            0
+#      DISPATCHER_CONTEXT.ImageBase            8
+#      DISPATCHER_CONTEXT.FunctionEntry        16
+#      DISPATCHER_CONTEXT.EstablisherFrame     24
+#      DISPATCHER_CONTEXT.TargetIp             32
+#      DISPATCHER_CONTEXT.ContextRecord        40
+#      DISPATCHER_CONTEXT.LanguageHandler      48
+#      DISPATCHER_CONTEXT.HandlerData          56
+#      UNW_FLAG_NHANDLER                       0
+#      ExceptionContinueSearch                 1
+#
+# In order to tie the handler to the function one has to compose
+# couple of structures: one for .xdata segment and one for .pdata.
+#
+# UNWIND_INFO structure for .xdata segment would be
+#
+# function_unwind_info:
+#      .byte   9,0,0,0
+#      .rva    handler
+#
+# This structure designates exception handler for a function with
+# zero-length prologue, no stack frame or frame register.
+#
+# To facilitate composing of .pdata structures, auto-generated "gear"
+# prologue copies rsp value to rax and denotes next instruction with
+# .LSEH_begin_{function_name} label. This essentially defines the SEH
+# styling rule mentioned in the beginning. Position of this label is
+# chosen in such manner that possible exceptions raised in the "gear"
+# prologue would be accounted to caller and unwound from latter's frame.
+# End of function is marked with respective .LSEH_end_{function_name}
+# label. To summarize, .pdata segment would contain
+#
+#      .rva    .LSEH_begin_function
+#      .rva    .LSEH_end_function
+#      .rva    function_unwind_info
+#
+# Reference to function_unwind_info from .xdata segment is the anchor.
+# In case you wonder why references are 32-bit .rvas and not 64-bit
+# .quads. References put into these two segments are required to be
+# *relative* to the base address of the current binary module, a.k.a.
+# image base. No Win64 module, be it .exe or .dll, can be larger than
+# 2GB and thus such relative references can be and are accommodated in
+# 32 bits.
+#
+# Having reviewed the example function code, one can argue that "movq
+# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix
+# rax would contain an undefined value. If this "offends" you, use
+# another register and refrain from modifying rax till magic_point is
+# reached, i.e. as if it was a non-volatile register. If more registers
+# are required prior [variable] frame setup is completed, note that
+# nobody says that you can have only one "magic point." You can
+# "liberate" non-volatile registers by denoting last stack off-load
+# instruction and reflecting it in finer grade unwind logic in handler.
+# After all, isn't it why it's called *language-specific* handler...
+#
+# SE handlers are also involved in unwinding stack when executable is
+# profiled or debugged. Profiling implies additional limitations that
+# are too subtle to discuss here. For now it's sufficient to say that
+# in order to simplify handlers one should either a) offload original
+# %rsp to stack (like discussed above); or b) if you have a register to
+# spare for frame pointer, choose volatile one.
+#
+# (*)  Note that we're talking about run-time, not debug-time. Lack of
+#      unwind information makes debugging hard on both Windows and
+#      Unix. "Unlike" refers to the fact that on Unix signal handler
+#      will always be invoked, core dumped and appropriate exit code
+#      returned to parent (for user notification).
diff --git a/aesni/x86_64cpuid.pl b/aesni/x86_64cpuid.pl

new file mode 100644 (file)

index 0000000..cc50d43
--- /dev/null
+++ b/aesni/x86_64cpuid.pl
@@ -0,0 +1,201 @@
+#! /usr/bin/env perl
+# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :        # Win64 order
+                                ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+print<<___;
+.text
+
+.globl OPENSSL_ia32_cpuid
+.type  OPENSSL_ia32_cpuid,\@function,1
+.align 16
+OPENSSL_ia32_cpuid:
+.cfi_startproc
+       mov     %rbx,%r8                # save %rbx
+.cfi_register  %rbx,%r8
+
+       xor     %eax,%eax
+       mov     %rax,8(%rdi)            # clear extended feature flags
+       cpuid
+       mov     %eax,%r11d              # max value for standard query level
+
+       xor     %eax,%eax
+       cmp     \$0x756e6547,%ebx       # "Genu"
+       setne   %al
+       mov     %eax,%r9d
+       cmp     \$0x49656e69,%edx       # "ineI"
+       setne   %al
+       or      %eax,%r9d
+       cmp     \$0x6c65746e,%ecx       # "ntel"
+       setne   %al
+       or      %eax,%r9d               # 0 indicates Intel CPU
+       jz      .Lintel
+
+       cmp     \$0x68747541,%ebx       # "Auth"
+       setne   %al
+       mov     %eax,%r10d
+       cmp     \$0x69746E65,%edx       # "enti"
+       setne   %al
+       or      %eax,%r10d
+       cmp     \$0x444D4163,%ecx       # "cAMD"
+       setne   %al
+       or      %eax,%r10d              # 0 indicates AMD CPU
+       jnz     .Lintel
+
+       # AMD specific
+       mov     \$0x80000000,%eax
+       cpuid
+       cmp     \$0x80000001,%eax
+       jb      .Lintel
+       mov     %eax,%r10d
+       mov     \$0x80000001,%eax
+       cpuid
+       or      %ecx,%r9d
+       and     \$0x00000801,%r9d       # isolate AMD XOP bit, 1<<11
+
+       cmp     \$0x80000008,%r10d
+       jb      .Lintel
+
+       mov     \$0x80000008,%eax
+       cpuid
+       movzb   %cl,%r10                # number of cores - 1
+       inc     %r10                    # number of cores
+
+       mov     \$1,%eax
+       cpuid
+       bt      \$28,%edx               # test hyper-threading bit
+       jnc     .Lgeneric
+       shr     \$16,%ebx               # number of logical processors
+       cmp     %r10b,%bl
+       ja      .Lgeneric
+       and     \$0xefffffff,%edx       # ~(1<<28)
+       jmp     .Lgeneric
+
+.Lintel:
+       cmp     \$4,%r11d
+       mov     \$-1,%r10d
+       jb      .Lnocacheinfo
+
+       mov     \$4,%eax
+       mov     \$0,%ecx                # query L1D
+       cpuid
+       mov     %eax,%r10d
+       shr     \$14,%r10d
+       and     \$0xfff,%r10d           # number of cores -1 per L1D
+
+.Lnocacheinfo:
+       mov     \$1,%eax
+       cpuid
+       movd    %eax,%xmm0              # put aside processor id
+       and     \$0xbfefffff,%edx       # force reserved bits to 0
+       cmp     \$0,%r9d
+       jne     .Lnotintel
+       or      \$0x40000000,%edx       # set reserved bit#30 on Intel CPUs
+       and     \$15,%ah
+       cmp     \$15,%ah                # examine Family ID
+       jne     .LnotP4
+       or      \$0x00100000,%edx       # set reserved bit#20 to engage RC4_CHAR
+.LnotP4:
+       cmp     \$6,%ah
+       jne     .Lnotintel
+       and     \$0x0fff0ff0,%eax
+       cmp     \$0x00050670,%eax       # Knights Landing
+       je      .Lknights
+       cmp     \$0x00080650,%eax       # Knights Mill (according to sde)
+       jne     .Lnotintel
+.Lknights:
+       and     \$0xfbffffff,%ecx       # clear XSAVE flag to mimic Silvermont
+
+.Lnotintel:
+       bt      \$28,%edx               # test hyper-threading bit
+       jnc     .Lgeneric
+       and     \$0xefffffff,%edx       # ~(1<<28)
+       cmp     \$0,%r10d
+       je      .Lgeneric
+
+       or      \$0x10000000,%edx       # 1<<28
+       shr     \$16,%ebx
+       cmp     \$1,%bl                 # see if cache is shared
+       ja      .Lgeneric
+       and     \$0xefffffff,%edx       # ~(1<<28)
+.Lgeneric:
+       and     \$0x00000800,%r9d       # isolate AMD XOP flag
+       and     \$0xfffff7ff,%ecx
+       or      %ecx,%r9d               # merge AMD XOP flag
+
+       mov     %edx,%r10d              # %r9d:%r10d is copy of %ecx:%edx
+
+       cmp     \$7,%r11d
+       jb      .Lno_extended_info
+       mov     \$7,%eax
+       xor     %ecx,%ecx
+       cpuid
+       bt      \$26,%r9d               # check XSAVE bit, cleared on Knights
+       jc      .Lnotknights
+       and     \$0xfff7ffff,%ebx       # clear ADCX/ADOX flag
+.Lnotknights:
+       movd    %xmm0,%eax              # restore processor id
+       and     \$0x0fff0ff0,%eax
+       cmp     \$0x00050650,%eax       # Skylake-X
+       jne     .Lnotskylakex
+       and     \$0xfffeffff,%ebx       # ~(1<<16)
+                                       # suppress AVX512F flag on Skylake-X
+.Lnotskylakex:
+       mov     %ebx,8(%rdi)            # save extended feature flags
+       mov     %ecx,12(%rdi)
+.Lno_extended_info:
+
+       bt      \$27,%r9d               # check OSXSAVE bit
+       jnc     .Lclear_avx
+       xor     %ecx,%ecx               # XCR0
+       .byte   0x0f,0x01,0xd0          # xgetbv
+       and     \$0xe6,%eax             # isolate XMM, YMM and ZMM state support
+       cmp     \$0xe6,%eax
+       je      .Ldone
+       andl    \$0x3fdeffff,8(%rdi)    # ~(1<<31|1<<30|1<<21|1<<16)
+                                       # clear AVX512F+BW+VL+FIMA, all of
+                                       # them are EVEX-encoded, which requires
+                                       # ZMM state support even if one uses
+                                       # only XMM and YMM :-(
+       and     \$6,%eax                # isolate XMM and YMM state support
+       cmp     \$6,%eax
+       je      .Ldone
+.Lclear_avx:
+       mov     \$0xefffe7ff,%eax       # ~(1<<28|1<<12|1<<11)
+       and     %eax,%r9d               # clear AVX, FMA and AMD XOP bits
+       mov     \$0x3fdeffdf,%eax       # ~(1<<31|1<<30|1<<21|1<<16|1<<5)
+       and     %eax,8(%rdi)            # clear AVX2 and AVX512* bits
+.Ldone:
+       shl     \$32,%r9
+       mov     %r10d,%eax
+       mov     %r8,%rbx                # restore %rbx
+.cfi_restore   %rbx
+       or      %r9,%rax
+       ret
+.cfi_endproc
+.size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+___
+
+
+close STDOUT;  # flush
diff --git a/aesni/x86asm.pl b/aesni/x86asm.pl

new file mode 100644 (file)

index 0000000..be462ce
--- /dev/null
+++ b/aesni/x86asm.pl
@@ -0,0 +1,303 @@
+#! /usr/bin/env perl
+# Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# require 'x86asm.pl';
+# &asm_init(<flavor>[,$i386only]);
+# &function_begin("foo");
+# ...
+# &function_end("foo");
+# &asm_finish
+
+$out=();
+$i386=0;
+
+# AUTOLOAD is this context has quite unpleasant side effect, namely
+# that typos in function calls effectively go to assembler output,
+# but on the pros side we don't have to implement one subroutine per
+# each opcode...
+sub ::AUTOLOAD
+{ my $opcode = $AUTOLOAD;
+
+    die "more than 4 arguments passed to $opcode" if ($#_>3);
+
+    $opcode =~ s/.*:://;
+    if    ($opcode =~ /^push/) { $stack+=4; }
+    elsif ($opcode =~ /^pop/)  { $stack-=4; }
+
+    &generic($opcode,@_) or die "undefined subroutine \&$AUTOLOAD";
+}
+
+sub ::emit
+{ my $opcode=shift;
+
+    if ($#_==-1)    { push(@out,"\t$opcode\n");                                }
+    else            { push(@out,"\t$opcode\t".join(',',@_)."\n");      }
+}
+
+sub ::LB
+{   $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'low byte'";
+  $1."l";
+}
+sub ::HB
+{   $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'high byte'";
+  $1."h";
+}
+sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num);     }
+sub ::stack_pop        { my $num=$_[0]*4; $stack-=$num; &add("esp",$num);      }
+sub ::blindpop { &pop($_[0]); $stack+=4;                               }
+sub ::wparam   { &DWP($stack+4*$_[0],"esp");                           }
+sub ::swtmp    { &DWP(4*$_[0],"esp");                                  }
+
+sub ::bswap
+{   if ($i386) # emulate bswap for i386
+    {  &comment("bswap @_");
+       &xchg(&HB(@_),&LB(@_));
+       &ror (@_,16);
+       &xchg(&HB(@_),&LB(@_));
+    }
+    else
+    {  &generic("bswap",@_);   }
+}
+# These are made-up opcodes introduced over the years essentially
+# by ignorance, just alias them to real ones...
+sub ::movb     { &mov(@_);     }
+sub ::xorb     { &xor(@_);     }
+sub ::rotl     { &rol(@_);     }
+sub ::rotr     { &ror(@_);     }
+sub ::exch     { &xchg(@_);    }
+sub ::halt     { &hlt;         }
+sub ::movz     { &movzx(@_);   }
+sub ::pushf    { &pushfd;      }
+sub ::popf     { &popfd;       }
+
+# 3 argument instructions
+sub ::movq
+{ my($p1,$p2,$optimize)=@_;
+
+    if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/)
+    # movq between mmx registers can sink Intel CPUs
+    {  &::pshufw($p1,$p2,0xe4);                }
+    else
+    {  &::generic("movq",@_);                  }
+}
+
+# SSE>2 instructions
+my %regrm = (  "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+               "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7  );
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
+    else
+    {  &::generic("pextrd",@_);                }
+}
+
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
+    else
+    {  &::generic("pinsrd",@_);                }
+}
+
+sub ::pshufb
+{ my($dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {  &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);        }
+    else
+    {  &::generic("pshufb",@_);                }
+}
+
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
+    else
+    {  &::generic("palignr",@_);               }
+}
+
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
+    else
+    {  &::generic("pclmulqdq",@_);             }
+}
+
+sub ::rdrand
+{ my ($dst)=@_;
+    if ($dst =~ /(e[a-dsd][ixp])/)
+    {  &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst});      }
+    else
+    {  &::generic("rdrand",@_);        }
+}
+
+sub ::rdseed
+{ my ($dst)=@_;
+    if ($dst =~ /(e[a-dsd][ixp])/)
+    {  &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst});      }
+    else
+    {  &::generic("rdrand",@_);        }
+}
+
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+   $rxb|=0x7<<5;
+   $rxb&=~(0x04<<5) if($dst>=8);
+   $rxb&=~(0x01<<5) if($src1>=8);
+   $rxb&=~(0x02<<5) if($src2>=8);
+   push @opcode,$rxb;
+}
+
+sub ::vprotd
+{ my $args=join(',',@_);
+    if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/)
+    { my @opcode=(0x8f);
+       rxb(\@opcode,$1,$2,-1,0x08);
+       push @opcode,0x78,0xc2;
+       push @opcode,0xc0|($2&7)|(($1&7)<<3);           # ModR/M
+       my $c=$3;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       &::data_byte(@opcode);
+    }
+    else
+    {  &::generic("vprotd",@_);        }
+}
+
+sub ::endbranch
+{
+    &::data_byte(0xf3,0x0f,0x1e,0xfb);
+}
+
+# label management
+$lbdecor="L";          # local label decoration, set by package
+$label="000";
+
+sub ::islabel          # see is argument is a known label
+{ my $i;
+    foreach $i (values %label) { return $i if ($i eq $_[0]); }
+  $label{$_[0]};       # can be undef
+}
+
+sub ::label            # instantiate a function-scope label
+{   if (!defined($label{$_[0]}))
+    {  $label{$_[0]}="${lbdecor}${label}${_[0]}"; $label++;   }
+  $label{$_[0]};
+}
+
+sub ::LABEL            # instantiate a file-scope label
+{   $label{$_[0]}=$_[1] if (!defined($label{$_[0]}));
+  $label{$_[0]};
+}
+
+sub ::static_label     { &::LABEL($_[0],$lbdecor.$_[0]); }
+
+sub ::set_label_B      { push(@out,"@_:\n"); }
+sub ::set_label
+{ my $label=&::label($_[0]);
+    &::align($_[1]) if ($_[1]>1);
+    &::set_label_B($label);
+  $label;
+}
+
+sub ::wipe_labels      # wipes function-scope labels
+{   foreach $i (keys %label)
+    {  delete $label{$i} if ($label{$i} =~ /^\Q${lbdecor}\E[0-9]{3}/); }
+}
+
+# subroutine management
+sub ::function_begin
+{   &function_begin_B(@_);
+    $stack=4;
+    &push("ebp");
+    &push("ebx");
+    &push("esi");
+    &push("edi");
+}
+
+sub ::function_end
+{   &pop("edi");
+    &pop("esi");
+    &pop("ebx");
+    &pop("ebp");
+    &ret();
+    &function_end_B(@_);
+    $stack=0;
+    &wipe_labels();
+}
+
+sub ::function_end_A
+{   &pop("edi");
+    &pop("esi");
+    &pop("ebx");
+    &pop("ebp");
+    &ret();
+    $stack+=16;        # readjust esp as if we didn't pop anything
+}
+
+sub ::asciz
+{ my @str=unpack("C*",shift);
+    push @str,0;
+    while ($#str>15) {
+       &data_byte(@str[0..15]);
+       foreach (0..15) { shift @str; }
+    }
+    &data_byte(@str) if (@str);
+}
+
+sub ::asm_finish
+{   &file_end();
+    print @out;
+}
+
+sub ::asm_init
+{ my ($type,$cpu)=@_;
+
+    $i386=$cpu;
+
+    $elf=$cpp=$coff=$aout=$macosx=$win32=$mwerks=$android=0;
+    if    (($type eq "elf"))
+    {  $elf=1;                 require "x86gas.pl";    }
+    elsif (($type eq "elf-1"))
+    {  $elf=-1;                require "x86gas.pl";    }
+    elsif (($type eq "a\.out"))
+    {  $aout=1;                require "x86gas.pl";    }
+    elsif (($type eq "coff" or $type eq "gaswin"))
+    {  $coff=1;                require "x86gas.pl";    }
+    elsif (($type eq "win32n"))
+    {  $win32=1;               require "x86nasm.pl";   }
+    elsif (($type eq "win32"))
+    {  $win32=1;               require "x86masm.pl";   }
+    elsif (($type eq "macosx"))
+    {  $aout=1; $macosx=1;     require "x86gas.pl";    }
+    elsif (($type eq "android"))
+    {  $elf=1; $android=1;     require "x86gas.pl";    }
+    else
+    {  print STDERR <<"EOF";
+Pick one target type from
+       elf     - Linux, FreeBSD, Solaris x86, etc.
+       a.out   - DJGPP, elder OpenBSD, etc.
+       coff    - GAS/COFF such as Win32 targets
+       win32n  - Windows 95/Windows NT NASM format
+       macosx  - Mac OS X
+EOF
+       exit(1);
+    }
+
+    $pic=0;
+    for (@ARGV) { $pic=1 if (/\-[fK]PIC/i); }
+
+    &file();
+}
+
+sub ::hidden {}
+
+1;
diff --git a/aesni/x86nasm.pl b/aesni/x86nasm.pl

new file mode 100644 (file)

index 0000000..7017b88
--- /dev/null
+++ b/aesni/x86nasm.pl
@@ -0,0 +1,186 @@
+#! /usr/bin/env perl
+# Copyright 1999-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+package x86nasm;
+
+*out=\@::out;
+
+$::lbdecor="L\$";              # local label decoration
+$nmdecor="_";                  # external name decoration
+$drdecor=$::mwerks?".":"";     # directive decoration
+
+$initseg="";
+
+sub ::generic
+{ my $opcode=shift;
+  my $tmp;
+
+    if (!$::mwerks)
+    {   if    ($opcode =~ m/^j/o && $#_==0) # optimize jumps
+       {   $_[0] = "NEAR $_[0]";       }
+       elsif ($opcode eq "lea" && $#_==1)  # wipe storage qualifier from lea
+       {   $_[1] =~ s/^[^\[]*\[/\[/o;  }
+       elsif ($opcode eq "clflush" && $#_==0)
+       {   $_[0] =~ s/^[^\[]*\[/\[/o;  }
+    }
+    &::emit($opcode,@_);
+  1;
+}
+#
+# opcodes not covered by ::generic above, mostly inconsistent namings...
+#
+sub ::call     { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); }
+sub ::call_ptr { &::emit("call",@_);   }
+sub ::jmp_ptr  { &::emit("jmp",@_);    }
+
+sub get_mem
+{ my($size,$addr,$reg1,$reg2,$idx)=@_;
+  my($post,$ret);
+
+    if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
+    if ($size ne "")
+    {  $ret .= "$size";
+       $ret .= " PTR" if ($::mwerks);
+       $ret .= " ";
+    }
+    $ret .= "[";
+
+    $addr =~ s/^\s+//;
+    # prepend global references with optional underscore
+    $addr =~ s/^([^\+\-0-9][^\+\-]*)/::islabel($1) or "$nmdecor$1"/ige;
+    # put address arithmetic expression in parenthesis
+    $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/);
+
+    if (($addr ne "") && ($addr ne 0))
+    {  if ($addr !~ /^-/)      { $ret .= "$addr+"; }
+       else                    { $post=$addr;      }
+    }
+
+    if ($reg2 ne "")
+    {  $idx!=0 or $idx=1;
+       $ret .= "$reg2*$idx";
+       $ret .= "+$reg1" if ($reg1 ne "");
+    }
+    else
+    {  $ret .= "$reg1";   }
+
+    $ret .= "$post]";
+    $ret =~ s/\+\]/]/; # in case $addr was the only argument
+
+  $ret;
+}
+sub ::BP       { &get_mem("BYTE",@_);  }
+sub ::DWP      { &get_mem("DWORD",@_); }
+sub ::WP       { &get_mem("WORD",@_);  }
+sub ::QWP      { &get_mem("",@_);      }
+sub ::BC       { (($::mwerks)?"":"BYTE ")."@_";  }
+sub ::DWC      { (($::mwerks)?"":"DWORD ")."@_"; }
+
+sub ::file
+{   if ($::mwerks)     { push(@out,".section\t.text,64\n"); }
+    else
+    { my $tmp=<<___;
+%ifidn __OUTPUT_FORMAT__,obj
+section        code    use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+\$\@feat.00 equ 1
+section        .text   code align=64
+%else
+section        .text   code
+%endif
+___
+       push(@out,$tmp);
+    }
+}
+
+sub ::function_begin_B
+{ my $func=shift;
+  my $global=($func !~ /^_/);
+  my $begin="${::lbdecor}_${func}_begin";
+
+    $begin =~ s/^\@/./ if ($::mwerks); # the torture never stops
+
+    &::LABEL($func,$global?"$begin":"$nmdecor$func");
+    $func=$nmdecor.$func;
+
+    push(@out,"${drdecor}global        $func\n")       if ($global);
+    push(@out,"${drdecor}align 16\n");
+    push(@out,"$func:\n");
+    push(@out,"$begin:\n")                     if ($global);
+    $::stack=4;
+}
+
+sub ::function_end_B
+{   $::stack=0;
+    &::wipe_labels();
+}
+
+sub ::file_end
+{   if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
+    {  my $comm=<<___;
+${drdecor}segment      .bss
+${drdecor}common       ${nmdecor}OPENSSL_ia32cap_P 16
+___
+       # comment out OPENSSL_ia32cap_P declarations
+       grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
+       push (@out,$comm)
+    }
+    push (@out,$initseg) if ($initseg);
+}
+
+sub ::comment {   foreach (@_) { push(@out,"\t; $_\n"); }   }
+
+sub ::external_label
+{   foreach(@_)
+    {  push(@out,"${drdecor}extern\t".&::LABEL($_,$nmdecor.$_)."\n");   }
+}
+
+sub ::public_label
+{   push(@out,"${drdecor}global\t".&::LABEL($_[0],$nmdecor.$_[0])."\n");  }
+
+sub ::data_byte
+{   push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n");       }
+sub ::data_short
+{   push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n");       }
+sub ::data_word
+{   push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n");       }
+
+sub ::align
+{   push(@out,"${drdecor}align\t$_[0]\n");     }
+
+sub ::picmeup
+{ my($dst,$sym)=@_;
+    &::lea($dst,&::DWP($sym));
+}
+
+sub ::initseg
+{ my $f=$nmdecor.shift;
+    if ($::win32)
+    {  $initseg=<<___;
+segment        .CRT\$XCU data align=4
+extern $f
+dd     $f
+___
+    }
+}
+
+sub ::dataseg
+{   if ($mwerks)       { push(@out,".section\t.data,4\n");   }
+    else               { push(@out,"section\t.data align=4\n"); }
+}
+
+sub ::safeseh
+{ my $nm=shift;
+    push(@out,"%if     __NASM_VERSION_ID__ >= 0x02030000\n");
+    push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n");
+    push(@out,"%endif\n");
+}
+
+1;
diff --git a/aesni/x86unix.pl b/aesni/x86unix.pl

new file mode 100644 (file)

index 0000000..e3f24f8
--- /dev/null
+++ b/aesni/x86unix.pl
@@ -0,0 +1,311 @@
+#!/usr/bin/env perl
+
+package x86unix;       # GAS actually...
+
+*out=\@::out;
+
+$label="L000";
+$const="";
+$constl=0;
+
+$align=($::aout)?"4":"16";
+$under=($::aout or $::coff)?"_":"";
+$dot=($::aout)?"":".";
+$com_start="#" if ($::aout or $::coff);
+
+sub opsize()
+{ my $reg=shift;
+    if    ($reg =~ m/^%e/o)            { "l"; }
+    elsif ($reg =~ m/^%[a-d][hl]$/o)   { "b"; }
+    elsif ($reg =~ m/^%[xm]/o)         { undef; }
+    else                               { "w"; }
+}
+
+# swap arguments;
+# expand opcode with size suffix;
+# prefix numeric constants with $;
+sub ::generic
+{ my($opcode,$dst,$src)=@_;
+  my($tmp,$suffix,@arg);
+
+    if (defined($src))
+    {  $src =~ s/^(e?[a-dsixphl]{2})$/%$1/o;
+       $src =~ s/^(x?mm[0-7])$/%$1/o;
+       $src =~ s/^(\-?[0-9]+)$/\$$1/o;
+       $src =~ s/^(\-?0x[0-9a-f]+)$/\$$1/o;
+       push(@arg,$src);
+    }
+    if (defined($dst))
+    {  $dst =~ s/^(\*?)(e?[a-dsixphl]{2})$/$1%$2/o;
+       $dst =~ s/^(x?mm[0-7])$/%$1/o;
+       $dst =~ s/^(\-?[0-9]+)$/\$$1/o          if(!defined($src));
+       $dst =~ s/^(\-?0x[0-9a-f]+)$/\$$1/o     if(!defined($src));
+       push(@arg,$dst);
+    }
+
+    if    ($dst =~ m/^%/o)     { $suffix=&opsize($dst); }
+    elsif ($src =~ m/^%/o)     { $suffix=&opsize($src); }
+    else                       { $suffix="l";           }
+    undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
+
+    if ($#_==0)                                { &::emit($opcode);             }
+    elsif ($opcode =~ m/^j/o && $#_==1)        { &::emit($opcode,@arg);        }
+    elsif ($opcode eq "call" && $#_==1)        { &::emit($opcode,@arg);        }
+    else                               { &::emit($opcode.$suffix,@arg);}
+
+  1;
+}
+#
+# opcodes not covered by ::generic above, mostly inconsistent namings...
+#
+sub ::movz     { &::movzb(@_);                 }
+sub ::pushf    { &::pushfl;                    }
+sub ::popf     { &::popfl;                     }
+sub ::cpuid    { &::emit(".byte\t0x0f,0xa2");  }
+sub ::rdtsc    { &::emit(".byte\t0x0f,0x31");  }
+
+sub ::call     { &::emit("call",(&islabel($_[0]) or "$under$_[0]")); }
+sub ::call_ptr { &::generic("call","*$_[0]");  }
+sub ::jmp_ptr  { &::generic("jmp","*$_[0]");   }
+
+*::bswap = sub { &::emit("bswap","%$_[0]");    } if (!$::i386);
+
+# chosen SSE instructions
+sub ::movq
+{ my($p1,$p2,$optimize)=@_;
+    if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/)
+    # movq between mmx registers can sink Intel CPUs
+    {  &::pshufw($p1,$p2,0xe4);        }
+    else
+    {  &::generic("movq",@_);  }
+}
+sub ::pshufw
+{ my($dst,$src,$magic)=@_;
+    &::emit("pshufw","\$$magic","%$src","%$dst");
+}
+
+sub ::DWP
+{ my($addr,$reg1,$reg2,$idx)=@_;
+  my $ret="";
+
+    $addr =~ s/^\s+//;
+    # prepend global references with optional underscore
+    $addr =~ s/^([^\+\-0-9][^\+\-]*)/islabel($1) or "$under$1"/ige;
+
+    $reg1 = "%$reg1" if ($reg1);
+    $reg2 = "%$reg2" if ($reg2);
+
+    $ret .= $addr if (($addr ne "") && ($addr ne 0));
+
+    if ($reg2)
+    {  $idx!= 0 or $idx=1;
+       $ret .= "($reg1,$reg2,$idx)";
+    }
+    elsif ($reg1)
+    {  $ret .= "($reg1)";      }
+
+  $ret;
+}
+sub ::QWP      { &::DWP(@_);   }
+sub ::BP       { &::DWP(@_);   }
+sub ::BC       { @_;           }
+sub ::DWC      { @_;           }
+
+sub ::file
+{   push(@out,".file\t\"$_[0].s\"\n"); }
+
+sub ::function_begin_B
+{ my($func,$extra)=@_;
+  my $tmp;
+
+    &::external_label($func);
+    $func=$under.$func;
+
+    push(@out,".text\n.globl\t$func\n");
+    if ($::coff)
+    {  push(@out,".def\t$func;\t.scl\t2;\t.type\t32;\t.endef\n"); }
+    elsif ($::aout and !$::pic)
+    { }
+    else
+    {  push(@out,".type        $func,\@function\n"); }
+    push(@out,".align\t$align\n");
+    push(@out,"$func:\n");
+    $::stack=4;
+}
+
+sub ::function_end_B
+{ my($func)=@_;
+
+    $func=$under.$func;
+    push(@out,"${dot}L_${func}_end:\n");
+    if ($::elf)
+    {  push(@out,".size\t$func,${dot}L_${func}_end-$func\n"); }
+    $::stack=0;
+    %label=();
+}
+
+sub ::comment
+       {
+       if (!defined($com_start) or $::elf)
+               {       # Regarding $::elf above...
+                       # GNU and SVR4 as'es use different comment delimiters,
+               push(@out,"\n");        # so we just skip ELF comments...
+               return;
+               }
+       foreach (@_)
+               {
+               if (/^\s*$/)
+                       { push(@out,"\n"); }
+               else
+                       { push(@out,"\t$com_start $_ $com_end\n"); }
+               }
+       }
+
+sub islabel    # see is argument is a known label
+{ my $i;
+    foreach $i (%label) { return $label{$i} if ($label{$i} eq $_[0]); }
+  undef;
+}
+
+sub ::external_label { push(@labels,@_); }
+
+sub ::public_label
+{   $label{$_[0]}="${under}${_[0]}"    if (!defined($label{$_[0]}));
+    push(@out,".globl\t$label{$_[0]}\n");
+}
+
+sub ::label
+{   if (!defined($label{$_[0]}))
+    {  $label{$_[0]}="${dot}${label}${_[0]}"; $label++;   }
+  $label{$_[0]};
+}
+
+sub ::set_label
+{ my $label=&::label($_[0]);
+    &::align($_[1]) if ($_[1]>1);
+    push(@out,"$label:\n");
+}
+
+sub ::file_end
+{   # try to detect if SSE2 or MMX extensions were used on ELF platform...
+    if ($::elf && grep {/%[x]?mm[0-7]/i} @out){
+       my $tmp;
+
+       push (@out,"\n.section\t.bss\n");
+       push (@out,".comm\t${under}OPENSSL_ia32cap_P,4,4\n");
+
+       push (@out,".section\t.init\n");
+       # One can argue that it's wasteful to craft every
+       # SSE/MMX module with this snippet... Well, it's 72
+       # bytes long and for the moment we have two modules.
+       # Let's argue when we have 7 modules or so...
+       #
+       # $1<<10 sets a reserved bit to signal that variable
+       # was initialized already...
+       &::picmeup("edx","OPENSSL_ia32cap_P");
+       $tmp=<<___;
+       cmpl    \$0,(%edx)
+       jne     1f
+       movl    \$1<<10,(%edx)
+       pushf
+       popl    %eax
+       movl    %eax,%ecx
+       xorl    \$1<<21,%eax
+       pushl   %eax
+       popf
+       pushf
+       popl    %eax
+       xorl    %ecx,%eax
+       btl     \$21,%eax
+       jnc     1f
+       pushl   %edi
+       pushl   %ebx
+       movl    %edx,%edi
+       movl    \$1,%eax
+       .byte   0x0f,0xa2
+       orl     \$1<<10,%edx
+       movl    %edx,0(%edi)
+       popl    %ebx
+       popl    %edi
+       jmp     1f
+       .align  $align
+       1:
+___
+       push (@out,$tmp);
+    }
+
+    if ($const ne "")
+    {  push(@out,".section .rodata\n");
+       push(@out,$const);
+       $const="";
+    }
+}
+
+sub ::data_byte        {   push(@out,".byte\t".join(',',@_)."\n");   }
+sub ::data_word {   push(@out,".long\t".join(',',@_)."\n");   }
+
+sub ::align
+{ my $val=$_[0],$p2,$i;
+    if ($::aout)
+    {  for ($p2=0;$val!=0;$val>>=1) { $p2++; }
+       $val=$p2-1;
+       $val.=",0x90";
+    }
+    push(@out,".align\t$val\n");
+}
+
+sub ::picmeup
+{ my($dst,$sym,$base,$reflabel)=@_;
+
+    if ($::pic && ($::elf || $::aout))
+    {  if (!defined($base))
+       {   &::call(&::label("PIC_me_up"));
+           &::set_label("PIC_me_up");
+           &::blindpop($dst);
+           &::add($dst,"\$${under}_GLOBAL_OFFSET_TABLE_+[.-".
+                           &::label("PIC_me_up") . "]");
+       }
+       else
+       {   &::lea($dst,&::DWP("${under}_GLOBAL_OFFSET_TABLE_+[.-$reflabel]",
+                           $base));
+       }
+       &::mov($dst,&::DWP($under.$sym."\@GOT",$dst));
+    }
+    else
+    {  &::lea($dst,&::DWP($sym));      }
+}
+
+sub ::initseg
+{ my($f)=@_;
+  my($tmp,$ctor);
+
+    if ($::elf)
+    {  $tmp=<<___;
+.section       .init
+       call    $under$f
+       jmp     .Linitalign
+.align $align
+.Linitalign:
+___
+    }
+    elsif ($::coff)
+    {   $tmp=<<___;    # applies to both Cygwin and Mingw
+.section       .ctors
+.long  $under$f
+___
+    }
+    elsif ($::aout)
+    {  $ctor="${under}_GLOBAL_\$I\$$f";
+       $tmp=".text\n";
+       $tmp.=".type    $ctor,\@function\n" if ($::pic);
+       $tmp.=<<___;    # OpenBSD way...
+.globl $ctor
+.align 2
+$ctor:
+       jmp     $under$f
+___
+    }
+    push(@out,$tmp) if ($tmp);
+}
+
+1;
diff --git a/configure.ac b/configure.ac

index dc78bbc5639c8150fccc2eceff9477a1966f7ae7..fc000e8cc1cf21119a36b8f391ada0decc64cd69 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -3,6 +3,7 @@ AC_CONFIG_HEADERS([config.h])
  
  PKG_PROG_PKG_CONFIG
  AC_LANG_C
+AM_PROG_AS
  AC_CANONICAL_HOST
  AM_MAINTAINER_MODE([enable])
  AM_INIT_AUTOMAKE([foreign tar-ustar])
@@ -226,6 +227,16 @@ if test "$ac_cv_func_nl_langinfo" = "yes"; then
  fi
  AM_CONDITIONAL(OPENCONNECT_ICONV, [test "$am_cv_func_iconv" = "yes"])
  
+aesni=no
+if test "$host_cpu" = "x86_64"; then
+   AC_PATH_PROG(PERL, perl)
+   if test "$PERL" != ""; then
+      AC_DEFINE(AESNI_ASM, 1, [Have AES-NI assembler])
+      aesni=yes
+   fi
+fi
+AM_CONDITIONAL(OPENCONNECT_AESNI, [test "$aesni" = "yes"])
+
  AC_ARG_ENABLE([nls],
         AS_HELP_STRING([--disable-nls], [Do not use Native Language Support]),
         [USE_NLS=$enableval], [USE_NLS=yes])
diff --git a/esp.c b/esp.c

index 48884b798c13424b023758e21fb9083560cca054..f77bcc97e5fc3aa43165aaf5171bfac5c16f25f1 100644 (file)
--- a/esp.c
+++ b/esp.c
@@ -389,9 +389,15 @@ int openconnect_setup_esp_keys(struct openconnect_info *vpninfo, int new_keys)
         vpninfo->esp_out.seq = vpninfo->esp_out.seq_backlog = 0;
         esp_in->seq = esp_in->seq_backlog = 0;
  
-       ret = init_esp_ciphers(vpninfo, &vpninfo->esp_out, esp_in);
-       if (ret)
-               return ret;
+#ifdef AESNI_ASM
+       if (vpninfo->esp_hmac != HMAC_SHA1 ||
+           aesni_init_esp_ciphers(vpninfo, &vpninfo->esp_out, esp_in))
+#endif
+       {
+               ret = init_esp_ciphers(vpninfo, &vpninfo->esp_out, esp_in);
+               if (ret)
+                       return ret;
+       }
  
         if (vpninfo->dtls_state == DTLS_NOSECRET)
                 vpninfo->dtls_state = DTLS_SECRET;
diff --git a/openconnect-internal.h b/openconnect-internal.h

index 2e5e76d99ac62f57641a51133b0d9b6594080a53..cfee3a3de5b2a4aac3e2580cf34cbb63e179bfeb 100644 (file)
--- a/openconnect-internal.h
+++ b/openconnect-internal.h
@@ -938,6 +938,7 @@ int openconnect_setup_esp_keys(struct openconnect_info *vpninfo, int new_keys);
  
  /* {gnutls,openssl}-esp.c */
  int init_esp_ciphers(struct openconnect_info *vpninfo, struct esp *out, struct esp *in);
+int aesni_init_esp_ciphers(struct openconnect_info *vpninfo, struct esp *out, struct esp *in);
  
  /* {gnutls,openssl}.c */
  int ssl_nonblock_read(struct openconnect_info *vpninfo, void *buf, int maxlen);
author	David Woodhouse <dwmw2@infradead.org>
	Thu, 11 Apr 2019 21:44:48 +0000 (00:44 +0300)
committer	David Woodhouse <dwmw2@infradead.org>
	Mon, 15 Apr 2019 17:12:24 +0000 (18:12 +0100)
Makefile.am		patch \| blob \| history
aesni-esp.c	[new file with mode: 0644]	patch \| blob
aesni-esp.h	[new file with mode: 0644]	patch \| blob
aesni/aesni-sha1-x86_64.pl	[new file with mode: 0644]	patch \| blob
aesni/aesni-x86_64.pl	[new file with mode: 0755]	patch \| blob
aesni/sha1-x86_64.pl	[new file with mode: 0755]	patch \| blob
aesni/x86_64-xlate.pl	[new file with mode: 0755]	patch \| blob
aesni/x86_64cpuid.pl	[new file with mode: 0644]	patch \| blob
aesni/x86asm.pl	[new file with mode: 0644]	patch \| blob
aesni/x86nasm.pl	[new file with mode: 0644]	patch \| blob
aesni/x86unix.pl	[new file with mode: 0644]	patch \| blob
configure.ac		patch \| blob \| history
esp.c		patch \| blob \| history
openconnect-internal.h		patch \| blob \| history