lib_srcs_stoken = stoken.c
lib_srcs_esp = esp.c esp-seqno.c
lib_srcs_dtls = dtls.c
+lib_srcs_aesni = aesni-esp.c aesni-esp.h
POTFILES = $(openconnect_SOURCES) $(lib_srcs_cisco) $(lib_srcs_juniper) $(lib_srcs_globalprotect) \
- gnutls-esp.c gnutls-dtls.c openssl-esp.c openssl-dtls.c \
+ gnutls-esp.c gnutls-dtls.c openssl-esp.c openssl-dtls.c $(lib_srcs_aesni) \
$(lib_srcs_esp) $(lib_srcs_dtls) gnutls_tpm2_esys.c gnutls_tpm2_ibm.c \
$(lib_srcs_openssl) $(lib_srcs_gnutls) $(library_srcs) \
$(lib_srcs_win32) $(lib_srcs_posix) $(lib_srcs_gssapi) $(lib_srcs_iconv) \
if OPENCONNECT_DTLS
lib_srcs_cisco += $(lib_srcs_dtls)
endif
+if OPENCONNECT_AESNI
+lib_srcs_esp += x86_64cpuid.s aesni-sha1-x86_64.s aesni-x86_64.s sha1-x86_64.s $(lib_srcs_aesni)
+endif
if OPENCONNECT_ESP
lib_srcs_juniper += $(lib_srcs_esp)
endif
# main.c includes version.c
openconnect-main.$(OBJEXT): version.c
+# s/OPENSSL/OPENCONNECT/ here instead of doing it in the .pl files, so
+# that they can remain pristine copies from upstream.
+%.s: aesni/%.pl
+ CC=$(CC) $(PERL) $< | sed s/OPENSSL_ia32/OPENCONNECT_ia32/g > $@
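+# e.g. aesni-sha1-x86_64.s is regenerated from aesni/aesni-sha1-x86_64.pl
+# whenever the script changes, with the OPENSSL_ia32* symbols renamed in
+# transit.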
+
version.c: $(library_srcs) $(lib_openssl_srcs) $(lib_gnutls_srcs) \
$(openconnect_SOURCES) Makefile.am configure.ac \
openconnect.h openconnect-internal.h version.sh @GITVERSIONDEPS@
--- /dev/null
+++ b/aesni-esp.c
+/*
+ * OpenConnect (SSL + DTLS) VPN client
+ *
+ * Copyright © 2019 David Woodhouse
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <config.h>
+
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "openconnect-internal.h"
+
+#include "aesni-esp.h"
+
+uint64_t OPENCONNECT_ia32cap_P[2];
+
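+/*
+ * OPENCONNECT_ia32_cpuid() populates the OpenSSL-style capability words
+ * above. Bit 10 of the first word corresponds to a reserved CPUID.1:EDX
+ * bit which real CPUs report as zero, so it is borrowed here as a flag
+ * that the capability vector has already been probed.
+ */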
+int aesni_init_esp_ciphers(struct openconnect_info *vpninfo,
+ struct esp *esp_out, struct esp *esp_in)
+{
+ if (!(OPENCONNECT_ia32cap_P[0] & (1<<10))) {
+ uint64_t cap = OPENCONNECT_ia32_cpuid(OPENCONNECT_ia32cap_P);
+
+ OPENCONNECT_ia32cap_P[0] = cap | (1<<10);
+
+ vpn_progress(vpninfo, PRG_DEBUG,
+			     _("CPU capabilities: %08lx %08lx %08lx %08lx\n"),
+			     (unsigned long)(OPENCONNECT_ia32cap_P[0] & 0xffffffff),
+			     (unsigned long)(OPENCONNECT_ia32cap_P[0] >> 32),
+			     (unsigned long)(OPENCONNECT_ia32cap_P[1] & 0xffffffff),
+			     (unsigned long)(OPENCONNECT_ia32cap_P[1] >> 32));
+ }
+
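+	/* Key scheduling and the actual packet paths are not wired up
+	 * yet; returning -EINVAL should make the caller fall back to the
+	 * TLS library's generic ESP implementation. */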
+ return -EINVAL;
+}
--- /dev/null
+++ b/aesni-esp.h
+/*
+ * OpenConnect (SSL + DTLS) VPN client
+ *
+ * Copyright © 2019 David Woodhouse
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#ifndef __AESNI_ESP_H__
+#define __AESNI_ESP_H__
+
+/* ABI definitions for the CRYPTOGAMS routines */
+
+#define AES_MAXKEYBITS 256
+#define AES_MAXROUNDS 14
+#define AES_BLOCK 16
+
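+/* This matches OpenSSL's AES_KEY layout: the expanded round keys (four
+   32-bit words per round, plus the initial key) followed by the round
+   count, which is what the CRYPTOGAMS routines expect. */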
+struct aesni_key {
+ uint32_t rd_key[4 * (AES_MAXROUNDS + 1)];
+ int rounds;
+};
+
+/* Not literally AES-NI but we are only using this in the context of the
+ stitched AES-NI + SHA1 routines. */
+
+#define SHA1_BLOCK 64
+
+struct aesni_sha1 {
+ uint32_t h0, h1, h2, h3, h4;
+ uint64_t N; /* The CRYPTOGAMS routines don't touch this */
+};
+
+
+int aesni_set_encrypt_key (const unsigned char *userKey, int bits,
+ struct aesni_key *key);
+int aesni_set_decrypt_key (const unsigned char *userKey, int bits,
+ struct aesni_key *key);
+
+void aesni_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const struct aesni_key *key,
+ unsigned char *ivec, int enc);
+
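+/* Note that "blocks" counts 64-byte SHA1 blocks: the routine CBC-encrypts
+   that many blocks from "inp" to "out" while simultaneously hashing "in0"
+   into "ctx". */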
+void aesni_cbc_sha1_enc(const void *inp, void *out, size_t blocks,
+ const struct aesni_key *key, unsigned char iv[16],
+ const struct aesni_sha1 *ctx, const void *in0);
+
+void sha1_block_data_order(struct aesni_sha1 *ctx, const void *p,
+ size_t n_blocks);
+
+uint64_t OPENCONNECT_ia32_cpuid(uint64_t *cap);
+
+#endif /* __AESNI_ESP_H__ */
--- /dev/null
+++ b/aesni/aesni-sha1-x86_64.pl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encryption exhibits *very* low instruction-level
+# parallelism, interleaving it with another algorithm allows processor
+# resources to be utilized better, achieving higher performance. The
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and the
+# AESNI code is woven into them. Below are performance numbers, in
+# cycles per processed byte (lower is better), for standalone AESNI-CBC
+# encrypt, for the sum of the latter and standalone SHA1, and for the
+# "stitched" subroutine:
+#
+# AES-128-CBC +SHA1 stitch gain
+# Westmere 3.77[+5.3] 9.07 6.55 +38%
+# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
+# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
+# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
+# Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%)
+# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
+# Ryzen(**) 2.71[+1.93] 4.64 2.74 +69%
+# Goldmont(**) 3.82[+1.70] 5.52 4.20 +31%
+#
+# AES-192-CBC
+# Westmere 4.51 9.81 6.80 +44%
+# Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
+# Ivy Bridge 6.05 10.65 6.07 +75%
+# Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
+# Bulldozer 6.89 12.84 6.96 +84%
+#
+# AES-256-CBC
+# Westmere 5.25 10.55 7.21 +46%
+# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
+# Ivy Bridge 7.05 11.65 7.12 +64%
+# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
+# Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%)
+# Bulldozer 8.00 13.95 8.25 +69%
+# Ryzen(**) 3.71 5.64 3.72 +52%
+# Goldmont(**) 5.35 7.05 5.76 +22%
+#
+# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
+#	background information. The numbers in parentheses above are SSSE3
+#	results collected on an AVX-capable CPU, i.e. they apply to OSes
+#	that don't support AVX.
+# (**) SHAEXT results.
+#
+# Needless to say, it makes little sense to implement a "stitched"
+# *decrypt* subroutine: because *both* AESNI-CBC decrypt and SHA1
+# already fully utilize instruction-level parallelism, stitching would
+# not give any gain anyway. Well, there might be some, e.g. because of
+# better cache locality... For reference, here are performance results
+# for standalone AESNI-CBC decrypt:
+#
+# AES-128-CBC AES-192-CBC AES-256-CBC
+# Westmere 1.25 1.50 1.75
+# Sandy Bridge 0.74 0.91 1.09
+# Ivy Bridge 0.74 0.90 1.11
+# Haswell 0.63 0.76 0.88
+# Bulldozer 0.70 0.85 0.99
+
+# And indeed:
+#
+# AES-256-CBC +SHA1 stitch gain
+# Westmere 1.75 7.20 6.68 +7.8%
+# Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%)
+# Ivy Bridge 1.11 5.70 5.45 +4.6%
+# Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%)
+# Bulldozer 0.99 6.95 5.95 +17%(**)
+#
+# (*)	The tiny improvement coefficient on Haswell is because we compare
+#	the AVX1 stitch to a sum that includes AVX2 SHA1.
+# (**)	Execution is fully dominated by the integer code sequence, and
+#	SIMD still hardly shows [in a single-process benchmark;-]
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+
+$shaext=1; ### set to zero if compiling for 1.0.1
+
+$stitched_decrypt=0;
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+# void aesni_cbc_sha1_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA_CTX *ctx,
+# const void *in0);
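+#
+# (This prototype is mirrored in aesni-esp.h, with struct aesni_key and
+# struct aesni_sha1 standing in for AES_KEY and SHA_CTX.)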
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 32
+aesni_cbc_sha1_enc:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11
+___
+$code.=<<___ if ($shaext);
+ bt \$61,%r11 # check SHA bit
+ jc aesni_cbc_sha1_enc_shaext
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+ jmp aesni_cbc_sha1_enc_ssse3
+ ret
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
+my $K_XX_XX="%r11";
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc
+my @rndkey=("%xmm14","%xmm15"); # for enc
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
+
+if (1) { # reassign for Atom Silvermont
+	# The goal is to minimize the number of instructions with more than
+	# 3 prefix bytes. Or, in more practical terms, to keep AES-NI *and*
+	# SSSE3 instructions in the upper half of the register bank.
+ @X=map("%xmm$_",(8..11,4..7));
+ @Tx=map("%xmm$_",(12,13,3));
+ ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+ @rndkey=("%xmm0","%xmm1");
+}
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 32
+aesni_cbc_sha1_enc_ssse3:
+.cfi_startproc
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_ssse3 # debugging artefact
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ movdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240-112($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @Tx[2],@X[-4&7] # byte swap
+ pshufb @Tx[2],@X[-3&7]
+ pshufb @Tx[2],@X[-2&7]
+ add \$64,$inp
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ pshufb @Tx[2],@X[-1&7]
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movups -112($key),$rndkey0 # $key[0]
+ movups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_ssse3
+___
+
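+# Each invocation of $aesenc emits one slot of the AES-CBC encrypt pipeline
+# for interleaving with the SHA1 rounds: slot 0 of every 10 additionally
+# loads the next input block and xors it into the CBC chain, slot 9
+# finishes the block (with extra rounds for 192/256-bit keys, hence the
+# cmp-with-11 ladder), and each slot issues one aesenc (or aesenclast).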
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns)); # ror
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns)) if ($Xi==8);
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_ror/);
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+ &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ eval(shift(@insns)); # ror
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
+
+ &pslld (@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
+ &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (shift);
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@Tx[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+my @body_00_19 = (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&xor (@T[0],$d);',
+ '&mov (@T[1],$a);', # $b for next round
+
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&xor ($b,$c);', # $c^$d for next round
+
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&and (@T[1],$b);', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c);', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_00_19;
+
+ $n = scalar(@r);
+ $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+
+ return @r;
+}
+
+my @body_20_39 = (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c)
+ '&mov (@T[1],$a);', # $b for next round
+
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round
+
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_20_39;
+
+ $n = scalar(@r);
+ $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20);
+ $jj++;
+
+ return @r;
+}
+
+my @body_40_59 = (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40);', # restore $c
+
+ '&$_ror ($b,7);', # $b>>>2
+ '&mov (@T[1],$a);', # $b for next round
+ '&xor (@T[0],$c);',
+
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor (@T[1],$c) if ($j==59);'.
+ '&xor (@T[1],$b) if ($j< 59);', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59);', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_40_59;
+
+ $n = scalar(@r);
+ $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40);
+ $jj++;
+
+ return @r;
+}
+$code.=<<___;
+.align 32
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_ssse3
+
+.Ldone_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups $iv,($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_def_cfa %rsi,56
+ mov 0(%rsi),%r15
+.cfi_restore %r15
+ mov 8(%rsi),%r14
+.cfi_restore %r14
+ mov 16(%rsi),%r13
+.cfi_restore %r13
+ mov 24(%rsi),%r12
+.cfi_restore %r12
+ mov 32(%rsi),%rbp
+.cfi_restore %rbp
+ mov 40(%rsi),%rbx
+.cfi_restore %rbx
+ lea 48(%rsi),%rsp
+.cfi_def_cfa %rsp,8
+.Lepilogue_ssse3:
+ ret
+.cfi_endproc
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+ if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+# reassign for Atom Silvermont (see above)
+($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
+@X=map("%xmm$_",(8..13,6,7));
+@Tx=map("%xmm$_",(14,15,5));
+
+my @aes256_dec = (
+ '&movdqu($inout0,"0x00($in0)");',
+ '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);',
+ '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);',
+ '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);',
+
+ '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");',
+ '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3]
+ undef,undef
+ );
+for ($i=0;$i<13;$i++) {
+ push (@aes256_dec,(
+ '&aesdec ($inout0,$rndkey0);',
+ '&aesdec ($inout1,$rndkey0);',
+ '&aesdec ($inout2,$rndkey0);',
+ '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+ ));
+ push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
+ push (@aes256_dec,(undef,undef)) if ($i==5);
+}
+push(@aes256_dec,(
+ '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");',
+ '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");',
+ '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");',
+ '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");',
+
+ '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");',
+ '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);',
+ '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);',
+ '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);',
+
+ '&movups ("0x30($out,$in0)",$inout3);'
+ ));
+
+sub body_00_19_dec () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39_dec() if ($rx==19);
+
+ my @r=@body_00_19;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+sub body_20_39_dec () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59_dec() if ($rx==39);
+
+ my @r=@body_20_39;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+sub body_40_59_dec () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+
+ my @r=@body_40_59;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+$code.=<<___;
+.globl aesni256_cbc_sha1_dec
+.type aesni256_cbc_sha1_dec,\@abi-omnipotent
+.align 32
+aesni256_cbc_sha1_dec:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni256_cbc_sha1_dec_avx
+___
+$code.=<<___;
+ jmp aesni256_cbc_sha1_dec_ssse3
+ ret
+.size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
+
+.type aesni256_cbc_sha1_dec_ssse3,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_ssse3:
+.cfi_startproc
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_dec_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ movdqu ($ivp),@X[3] # load IV
+ #mov $ivp,88(%rsp) # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @Tx[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @Tx[2],@X[-3&7]
+ pshufb @Tx[2],@X[-2&7]
+ pshufb @Tx[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movdqu -112($key),$rndkey0 # $key[0]
+ jmp .Loop_dec_ssse3
+
+.align 32
+.Loop_dec_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_32_79(\&body_00_19_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_rx=$rx;
+
+ &Xloop_ssse3(\&body_20_39_dec);
+ &Xloop_ssse3(\&body_20_39_dec);
+ &Xloop_ssse3(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_dec_ssse3
+
+.Ldone_dec_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $rx=$saved_rx;
+
+ &Xtail_ssse3(\&body_20_39_dec);
+ &Xtail_ssse3(\&body_20_39_dec);
+ &Xtail_ssse3(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups @X[3],($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_def_cfa	%rsi,56
+ mov 0(%rsi),%r15
+.cfi_restore %r15
+ mov 8(%rsi),%r14
+.cfi_restore %r14
+ mov 16(%rsi),%r13
+.cfi_restore %r13
+ mov 24(%rsi),%r12
+.cfi_restore %r12
+ mov 32(%rsi),%rbp
+.cfi_restore %rbp
+ mov 40(%rsi),%rbx
+.cfi_restore %rbx
+ lea 48(%rsi),%rsp
+.cfi_def_cfa	%rsp,8
+.Lepilogue_dec_ssse3:
+ ret
+.cfi_endproc
+.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
+___
+ }}}
+$j=$jj=$r=$rx=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
+my $Kx=@Tx[2];
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_avx,\@function,6
+.align 32
+aesni_cbc_sha1_enc_avx:
+.cfi_startproc
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_avx # debugging artefact
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ vmovdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240-112($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),$Kx # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey[1] # $key[0]
+ vmovups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_avx
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ vmovdqu `16*$n`($in0),$in # load input
+ vpxor $rndkey[1],$in,$in
+___
+ $code.=<<___ if ($n);
+ vmovups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ vpxor $in,$iv,$iv
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+0)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+1)-112`($key),$rndkey[0]
+ je .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+2)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+ vaesenclast $rndkey[0],$iv,$iv
+ vmovups -112($key),$rndkey[0]
+ vmovups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ &vpsrld (@Tx[0],@Tx[1],30);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[1],@Tx[1],2);
+ &vpxor (@X[0],@X[0],@Tx[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (shift);
+
+ &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 32
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_avx
+
+.Ldone_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups $iv,($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_def_cfa %rsi,56
+ mov 0(%rsi),%r15
+.cfi_restore %r15
+ mov 8(%rsi),%r14
+.cfi_restore %r14
+ mov 16(%rsi),%r13
+.cfi_restore %r13
+ mov 24(%rsi),%r12
+.cfi_restore %r12
+ mov 32(%rsi),%rbp
+.cfi_restore %rbp
+ mov 40(%rsi),%rbx
+.cfi_restore %rbx
+ lea 48(%rsi),%rsp
+.cfi_def_cfa %rsp,8
+.Lepilogue_avx:
+ ret
+.cfi_endproc
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+
+ if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+@aes256_dec = (
+ '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
+ '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
+ '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
+ '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
+
+ '&vmovups($rndkey0,"16-112($key)");',
+ '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3]
+ undef,undef
+ );
+for ($i=0;$i<13;$i++) {
+ push (@aes256_dec,(
+ '&vaesdec ($inout0,$inout0,$rndkey0);',
+ '&vaesdec ($inout1,$inout1,$rndkey0);',
+ '&vaesdec ($inout2,$inout2,$rndkey0);',
+ '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+ ));
+ push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
+ push (@aes256_dec,(undef,undef)) if ($i==5);
+}
+push(@aes256_dec,(
+ '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
+ '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
+ '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
+ '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',
+
+ '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");',
+ '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);',
+ '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);',
+ '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);',
+
+ '&vmovups ("0x30($out,$in0)",$inout3);'
+ ));
+
+$code.=<<___;
+.type aesni256_cbc_sha1_dec_avx,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_avx:
+.cfi_startproc
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_dec_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ vmovdqu ($ivp),@X[3] # load IV
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),$Kx # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey0 # $key[0]
+ jmp .Loop_dec_avx
+
+.align 32
+.Loop_dec_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_32_79(\&body_00_19_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_rx=$rx;
+
+ &Xloop_avx(\&body_20_39_dec);
+ &Xloop_avx(\&body_20_39_dec);
+ &Xloop_avx(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_dec_avx
+
+.Ldone_dec_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $rx=$saved_rx;
+
+ &Xtail_avx(\&body_20_39_dec);
+ &Xtail_avx(\&body_20_39_dec);
+ &Xtail_avx(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups @X[3],($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+.cfi_def_cfa %rsi,56
+ mov 0(%rsi),%r15
+.cfi_restore %r15
+ mov 8(%rsi),%r14
+.cfi_restore %r14
+ mov 16(%rsi),%r13
+.cfi_restore %r13
+ mov 24(%rsi),%r12
+.cfi_restore %r12
+ mov 32(%rsi),%rbp
+.cfi_restore %rbp
+ mov 40(%rsi),%rbx
+.cfi_restore %rbx
+ lea 48(%rsi),%rsp
+.cfi_def_cfa %rsp,8
+.Lepilogue_dec_avx:
+ ret
+.cfi_endproc
+.size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
+___
+ }}}
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+ if ($shaext) {{{
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$rounds="%r11d";
+
+($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+@rndkey=("%xmm0","%xmm1");
+$r=0;
+
+my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
+my @MSG=map("%xmm$_",(3..6));
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_shaext,\@function,6
+.align 32
+aesni_cbc_sha1_enc_shaext:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+___
+$code.=<<___ if ($win64);
+ lea `-8-10*16`(%rsp),%rsp
+ movaps %xmm6,-8-10*16(%rax)
+ movaps %xmm7,-8-9*16(%rax)
+ movaps %xmm8,-8-8*16(%rax)
+ movaps %xmm9,-8-7*16(%rax)
+ movaps %xmm10,-8-6*16(%rax)
+ movaps %xmm11,-8-5*16(%rax)
+ movaps %xmm12,-8-4*16(%rax)
+ movaps %xmm13,-8-3*16(%rax)
+ movaps %xmm14,-8-2*16(%rax)
+ movaps %xmm15,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ movdqu ($ctx),$ABCD
+ movd 16($ctx),$E
+ movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap
+
+ mov 240($key),$rounds
+ sub $in0,$out
+ movups ($key),$rndkey0 # $key[0]
+ movups ($ivp),$iv # load IV
+ movups 16($key),$rndkey[0] # forward reference
+ lea 112($key),$key # size optimization
+
+ pshufd \$0b00011011,$ABCD,$ABCD # flip word order
+ pshufd \$0b00011011,$E,$E # flip word order
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+___
+ &$aesenc();
+$code.=<<___;
+ movdqu ($inp),@MSG[0]
+ movdqa $E,$E_SAVE # offload $E
+ pshufb $BSWAP,@MSG[0]
+ movdqu 0x10($inp),@MSG[1]
+ movdqa $ABCD,$ABCD_SAVE # offload $ABCD
+___
+ &$aesenc();
+$code.=<<___;
+ pshufb $BSWAP,@MSG[1]
+
+ paddd @MSG[0],$E
+ movdqu 0x20($inp),@MSG[2]
+ lea 0x40($inp),$inp
+ pxor $E_SAVE,@MSG[0] # black magic
+___
+ &$aesenc();
+$code.=<<___;
+ pxor $E_SAVE,@MSG[0] # black magic
+ movdqa $ABCD,$E_
+ pshufb $BSWAP,@MSG[2]
+ sha1rnds4 \$0,$E,$ABCD # 0-3
+ sha1nexte @MSG[1],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ sha1msg1 @MSG[1],@MSG[0]
+ movdqu -0x10($inp),@MSG[3]
+ movdqa $ABCD,$E
+ pshufb $BSWAP,@MSG[3]
+___
+ &$aesenc();
+$code.=<<___;
+ sha1rnds4 \$0,$E_,$ABCD # 4-7
+ sha1nexte @MSG[2],$E
+ pxor @MSG[2],@MSG[0]
+ sha1msg1 @MSG[2],@MSG[1]
+___
+ &$aesenc();
+
+for($i=2;$i<20-4;$i++) {
+$code.=<<___;
+ movdqa $ABCD,$E_
+ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11
+ sha1nexte @MSG[3],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ sha1msg2 @MSG[3],@MSG[0]
+ pxor @MSG[3],@MSG[1]
+ sha1msg1 @MSG[3],@MSG[2]
+___
+ ($E,$E_)=($E_,$E);
+ push(@MSG,shift(@MSG));
+
+ &$aesenc();
+}
+$code.=<<___;
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 64-67
+ sha1nexte @MSG[3],$E_
+ sha1msg2 @MSG[3],@MSG[0]
+ pxor @MSG[3],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 68-71
+ sha1nexte @MSG[0],$E
+ sha1msg2 @MSG[0],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $E_SAVE,@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 72-75
+ sha1nexte @MSG[1],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 76-79
+ sha1nexte $MSG[0],$E
+___
+ while($r<40) { &$aesenc(); } # remaining aesenc's
+$code.=<<___;
+ dec $len
+
+ paddd $ABCD_SAVE,$ABCD
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+ jnz .Loop_shaext
+
+ pshufd \$0b00011011,$ABCD,$ABCD
+ pshufd \$0b00011011,$E,$E
+ movups $iv,($ivp) # write IV
+ movdqu $ABCD,($ctx)
+ movd $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -8-10*16(%rax),%xmm6
+ movaps -8-9*16(%rax),%xmm7
+ movaps -8-8*16(%rax),%xmm8
+ movaps -8-7*16(%rax),%xmm9
+ movaps -8-6*16(%rax),%xmm10
+ movaps -8-5*16(%rax),%xmm11
+ movaps -8-4*16(%rax),%xmm12
+ movaps -8-3*16(%rax),%xmm13
+ movaps -8-2*16(%rax),%xmm14
+ movaps -8-1*16(%rax),%xmm15
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
+___
+ }}}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha1_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lseh_no_shaext
+
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+.Lseh_no_shaext:
+___
+$code.=<<___;
+ lea 96(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `104+10*16`(%rax),%rax # adjust stack pointer
+
+ mov 0(%rax),%r15
+ mov 8(%rax),%r14
+ mov 16(%rax),%r13
+ mov 24(%rax),%r12
+ mov 32(%rax),%rbp
+ mov 40(%rax),%rbx
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ssse3_handler,.-ssse3_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_end_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_end_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_info_aesni_cbc_sha1_enc_shaext
+___
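+# Each UNWIND_INFO record emitted below begins with .byte 9,0,0,0,
+# i.e. version 1 with UNW_FLAG_EHANDLER set and no unwind codes,
+# followed by the RVA of ssse3_handler and, as its HandlerData, the
+# prologue/epilogue labels which the handler compares context->Rip
+# against.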
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_aesni_cbc_sha1_enc_shaext:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
+}
+
+####################################################################
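+# The subs below hand-assemble the SHA and AES-NI instructions as
+# .byte sequences, so the output assembles even with toolchains that
+# predate these extensions. rex() prepends a REX prefix whenever
+# either %xmm operand is numbered 8 or above.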
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ unshift @opcode,$rex|0x40 if($rex);
+}
+
+sub sha1rnds4 {
+ if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x3a,0xcc);
+ rex(\@opcode,$3,$2);
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return "sha1rnds4\t".@_[0];
+ }
+}
+
+sub sha1op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha1nexte" => 0xc8,
+ "sha1msg1" => 0xc9,
+ "sha1msg2" => 0xca );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ rex(\@opcode,$2,$1);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x0f,0x38);
+
+ if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ unshift @opcode,0x66;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
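+# For instance (editorial example): "aesenc %xmm1,%xmm2" matches the
+# pattern above with $2=1 and $3=2, neither operand needs a REX
+# prefix, the ModR/M byte is 0xc0|(1&7)|((2&7)<<3) = 0xd1, and after
+# the 0x66 prefix is prepended the emitted line is
+#	.byte	102,15,56,220,209
+# which is exactly the encoding of aesenc %xmm1,%xmm2.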
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
+ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
+ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;
+
+ print $_,"\n";
+}
+close STDOUT;
--- /dev/null
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
+# [see below for details].
+#
+# Performance.
+#
+# Given the aes[enc|dec] instructions' latency, the asymptotic
+# performance for non-parallelizable modes such as CBC encrypt is 3.75
+# cycles per byte processed with a 128-bit key; given their
+# throughput, the asymptotic performance for parallelizable modes is
+# 1.25 cycles per byte. Being an asymptotic limit, it's not something
+# you commonly achieve in reality, but how close does one get? Below
+# are results collected for different modes and block sizes. Pairs of
+# numbers are for en-/decryption.
+#
+#          16-byte     64-byte     256-byte    1-KB        8-KB
+# ECB      4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
+# CTR      5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
+# CBC      4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
+# CCM      5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
+# OFB      5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
+# CFB      5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that the otherwise-used 'openssl speed -evp aes-128-??? -engine
+# aesni [-decrypt]' will exhibit 10-15% worse results for smaller
+# blocks. The results were collected with a specially crafted speed.c
+# benchmark in order to compare them with the results reported in the
+# "Intel Advanced Encryption Standard (AES) New Instruction Set" White
+# Paper, Revision 3.0, dated May 2010. All of the above results are
+# consistently better. This module also provides better performance
+# for block sizes smaller than 128 bytes at points *not* represented
+# in the above table.
+#
+# Looking at the results for the 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because the
+# implementation uses the "generic" CRYPTO_[c|o]fb128_encrypt
+# interfaces relying on single-block aesni_encrypt, which is not the
+# most optimal way to go. The CBC encrypt result is unexpectedly high
+# and there is no documented explanation for it. Seemingly there is a
+# small penalty for feeding the result back to the AES unit the way
+# it's done in CBC mode. There is nothing one can do about it and the
+# result appears optimal. The CCM result is identical to CBC, because
+# CBC-MAC is essentially CBC encrypt without saving the output. CCM's
+# CTR half "stays invisible," because it's neatly interleaved with
+# CBC-MAC. This provides ~30% improvement over a "straightforward"
+# CCM implementation with CTR and CBC-MAC performed disjointly.
+# Parallelizable modes practically achieve the theoretical limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90%, of the "8-KB"
+# one. The CTR curve doesn't follow this pattern and is the slowest
+# changing one, with the "256-byte" result being 87% of "8-KB." This
+# is because the overhead in CTR mode is the most computationally
+# intensive. Small-block CCM decrypt is slower than encrypt, because
+# the first CTR and last CBC-MAC iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with the number of
+# rounds for larger block sizes, i.e. the 192-bit result being 10/12
+# times lower and the 256-bit one 10/14. Well, in the CBC encrypt case
+# the differences are a tad smaller, because the above-mentioned
+# penalty biases all results by the same constant value. In a similar
+# way function-call overhead affects small-block performance, as well
+# as OFB and CFB results. The differences are not large; the most
+# common coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0
+# and 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR,
+# OFB, CFB)...
+
+# January 2011
+#
+# While the Westmere processor features a 6-cycle latency for
+# aes[enc|dec] instructions, which can be scheduled only every second
+# cycle, Sandy Bridge spends 8 cycles per instruction but can schedule
+# them every cycle. This means that code targeting Westmere would
+# perform suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. The relative improvement might appear modest, 8% on
+# Westmere, but in absolute terms it's 3.77 cycles per byte encrypted
+# with a 128-bit key on Westmere, and 5.07 on Sandy Bridge. These
+# numbers should be compared to the asymptotic limits of 3.75 for
+# Westmere and 5.00 for Sandy Bridge. Actually, the fact that they get
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
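+#
+# (To spell the limit calculation out: latency x rounds / block size,
+# i.e. 6 x 10 / 16 = 3.75 cycles per byte on Westmere and
+# 8 x 10 / 16 = 5.00 on Sandy Bridge, the figures quoted above.)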
+#
+# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In that case the asymptotic limit for such modes
+# can be obtained by dividing the above-mentioned numbers by the AES
+# instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that the optimal interleave factor
+# is 3, and that's where the "magic" number of 1.25 comes from.
+# "Optimal interleave factor" means that increasing the interleave
+# factor further does not improve performance. The formula has proven
+# to reflect reality pretty well on Westmere... Sandy Bridge on the
+# other hand can execute up to 8 AES instructions at a time, so how
+# does varying the interleave factor affect performance? Here is a
+# table for ECB (numbers are cycles per byte with a 128-bit key):
+#
+# instruction interleave factor         3x      6x      8x
+# theoretical asymptotic limit          1.67    0.83    0.625
+# measured performance for 8KB block    1.05    0.86    0.84
+#
+# "as if" interleave factor             4.7x    5.8x    6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt                           1.16    0.93    0.74
+# CTR                                   1.14    0.91    0.74
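+#
+# (The theoretical limits above are simply 5.00 cycles per byte
+# divided by the interleave factor, 5.00/3=1.67, 5.00/6=0.83,
+# 5.00/8=0.625, and the "as if" factors are 5.00 divided by the
+# measured figures.)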
+#
+# Well, given the 3x column it's probably inappropriate to call the
+# limit asymptotic if it can be surpassed, isn't it? What happens
+# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
+# execution magic is responsible for this. The processor overlaps not
+# only the additional instructions with the AES ones, but even AES
+# instructions processing adjacent triplets of independent blocks. In
+# the 6x case the additional instructions still claim a
+# disproportionately small amount of additional cycles, but in the 8x
+# case the number of instructions must be a tad too high for the
+# out-of-order logic to cope with, and the AES unit remains
+# underutilized... As you can see, 8x interleave is hardly
+# justifiable, so there is no need to feel bad that 32-bit
+# aesni-x86.pl utilizes 6x interleave because of limited register bank
+# capacity.
+#
+# Higher interleave factors do have a negative impact on Westmere
+# performance. While for ECB mode it's a negligible ~1.5%, other
+# parallelizable modes perform ~5% worse, which is outweighed by the
+# ~25% improvement on Sandy Bridge. To balance the regression on
+# Westmere, CTR mode was implemented with a 6x aesenc interleave
+# factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with a 128-bit key, Sandy Bridge 0.90. Just like
+# in CTR mode, the AES instruction interleave factor was chosen to be
+# 6x.
+
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. The AES instruction interleave factor
+# was chosen to be 6x.
+
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#                   CBC en-/decrypt  CTR    XTS    ECB     OCB
+# Westmere          3.77/1.25        1.25   1.25   1.26
+# Sandy/Ivy Bridge  5.07/0.74        0.75   0.90   0.85    0.98
+# Haswell           4.44/0.63        0.63   0.73   0.63    0.70
+# Skylake           2.62/0.63        0.63   0.63   0.63
+# Silvermont        5.75/3.54        3.56   4.12   3.87(*) 4.11
+# Knights L         2.54/0.77        0.78   0.85   -       1.50
+# Goldmont          3.82/1.26        1.26   1.29   1.29    1.50
+# Bulldozer         5.77/0.70        0.72   0.90   0.70    0.95
+# Ryzen             2.71/0.35        0.35   0.44   0.38    0.49
+#
+# (*) Atom Silvermont ECB result is suboptimal because of penalties
+# incurred by operations on %xmm8-15. As ECB is not considered
+# critical, nothing was done to mitigate the problem.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups"; # note: both arms are currently the same
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+$code=".text\n";
+$code.=".extern OPENSSL_ia32cap_P\n";
+
+$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8"; # cbc, ctr, ...
+
+$rnds_="%r10d"; # backup copy for $rounds
+$key_="%r11"; # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0"; $rndkey1="%xmm1";
+$inout0="%xmm2"; $inout1="%xmm3";
+$inout2="%xmm4"; $inout3="%xmm5";
+$inout4="%xmm6"; $inout5="%xmm7";
+$inout6="%xmm8"; $inout7="%xmm9";
+
+$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+$in0="%xmm8"; $iv="%xmm9";
+\f
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why a folded loop? Because aes[enc|dec] is slow enough to hide the
+# cycles which take care of the loop variables...
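+#
+# Arguments: $p is "enc" or "dec"; $key and $rounds name the registers
+# holding the key-schedule pointer and the loop count (both are
+# clobbered); $inout is the data register, defaulting to $inout0; the
+# optional $ivec is xor-ed into the block together with the 0-round
+# key.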
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+___
+}}
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+.cfi_startproc
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+.cfi_endproc
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+.cfi_startproc
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+.cfi_endproc
+.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
+\f
+# _aesni_[en|de]cryptN are private interfaces, where N denotes the
+# interleave factor. Why were 3x subroutines originally used in loops?
+# Even though aes[enc|dec] latency was originally 6, it could be
+# scheduled only every *2nd* cycle. Thus 3x interleave was the one
+# providing optimal utilization, i.e. the point where the subroutine's
+# throughput is virtually the same as that of the non-interleaved
+# subroutine [for up to 3 input blocks]. This is why it originally
+# made no sense to implement a 2x subroutine. But times change, and it
+# became appropriate to spend an extra 192 bytes on a 2x subroutine on
+# account of Atom Silvermont. For processors that can schedule
+# aes[enc|dec] every cycle the optimal interleave factor equals the
+# instruction latency. 8x is optimal for Sandy/Ivy Bridge and
+# "super-optimal" for other Intel CPUs...
+
+sub aesni_generate2 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text...
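+# The "twisted rounds" setup below turns the round count into a byte
+# offset: shl \$4 scales it, $key is advanced past the end of the key
+# schedule, and %rax is negated so that the add \$32 inside the loop
+# sets ZF exactly when the last pair of round keys has been fetched.
+# For a 128-bit key the rounds field used here is 9 (the aesni
+# key-setup routines store 9/11/13, which is what makes the arithmetic
+# come out): shl gives 144, $key lands at schedule+176, %rax steps
+# through -128,-96,-64,-32,0, and the tail is left holding the round
+# keys at offsets 144 and 160 for the final aes${dir}/aes${dir}last.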
+$code.=<<___;
+.type _aesni_${dir}rypt2,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt2:
+.cfi_startproc
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+.L${dir}_loop2:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop2
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ ret
+.cfi_endproc
+.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+.cfi_startproc
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+.L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+.cfi_endproc
+.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small-block performance,
+# most notably [and naturally] the 4-block case, by ~30%. One can
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%, so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+.cfi_startproc
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ .byte 0x0f,0x1f,0x00 # 3-byte nop
+ add \$16,%rax
+
+.L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+.cfi_endproc
+.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+.cfi_startproc
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout5
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+.L${dir}_loop6_enter:
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+.cfi_endproc
+.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+.cfi_startproc
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout7
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop8_inner
+.align 16
+.L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+.L${dir}_loop8_inner:
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+.L${dir}_loop8_enter:
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+.cfi_endproc
+.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+\f
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+.cfi_startproc
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # offload $inout4..7
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ and \$-16,$len # if ($len<16)
+ jz .Lecb_ret # return
+
+ mov 240($key),$rounds # key->rounds
+ $movkey ($key),$rndkey0
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ test %r8d,%r8d # 5th argument
+ jz .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_enc_tail # short input
+
+ movdqu ($inp),$inout0 # load 8 input blocks
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups $inout0,($out) # store 8 output blocks
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0 # load 8 input blocks
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ sub \$0x80,$len
+ jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
+
+ movups $inout0,($out) # store 8 output blocks
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
+
+.Lecb_enc_tail: # $len is less than 8*16
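+ # dispatch ladder on the remaining block count: each cmp below
+ # serves both the jb and the je that follow it (the intervening
+ # movups don't touch flags), steering 1..7 blocks to the right path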
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_enc_one
+ movups 0x10($inp),$inout1
+ je .Lecb_enc_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_enc_three
+ movups 0x30($inp),$inout3
+ je .Lecb_enc_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_enc_five
+ movups 0x50($inp),$inout5
+ je .Lecb_enc_six
+ movdqu 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_encrypt8
+ movups $inout0,($out) # store 7 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # store one output block
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ call _aesni_encrypt2
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups $inout0,($out) # store 4 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps $inout5,$inout5
+ call _aesni_encrypt6
+ movups $inout0,($out) # store 5 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups $inout0,($out) # store 6 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ jmp .Lecb_ret
+\f#--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_dec_tail # short input
+
+ movdqu ($inp),$inout0 # load 8 input blocks
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups $inout0,($out) # store 8 output blocks
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0 # load 8 input blocks
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ $movkey ($key_),$rndkey0
+ sub \$0x80,$len
+ jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
+
+ movups $inout0,($out) # store 8 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ movups $inout5,0x50($out)
+ pxor $inout5,$inout5
+ movups $inout6,0x60($out)
+ pxor $inout6,$inout6
+ movups $inout7,0x70($out)
+ pxor $inout7,$inout7
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
+
+.Lecb_dec_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_dec_one
+ movups 0x10($inp),$inout1
+ je .Lecb_dec_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_dec_three
+ movups 0x30($inp),$inout3
+ je .Lecb_dec_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_dec_five
+ movups 0x50($inp),$inout5
+ je .Lecb_dec_six
+ movups 0x60($inp),$inout6
+ $movkey ($key),$rndkey0
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups $inout0,($out) # store 7 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ movups $inout5,0x50($out)
+ pxor $inout5,$inout5
+ movups $inout6,0x60($out)
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # store one output block
+ pxor $inout0,$inout0 # clear register bank
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ call _aesni_decrypt2
+ movups $inout0,($out) # store 2 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups $inout0,($out) # store 3 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups $inout0,($out) # store 4 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups $inout0,($out) # store 5 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups $inout0,($out) # store 6 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ movups $inout5,0x50($out)
+ pxor $inout5,$inout5
+
+.Lecb_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lecb_enc_ret:
+___
+$code.=<<___;
+ ret
+.cfi_endproc
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+\f
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on a 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details).
+#
+{
+my $cmac="%r9"; # 6th argument
+
+my $increment="%xmm9";
+my $iv="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
+.Lccm64_enc_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shl \$4,$rounds
+ mov \$16,$rnds_
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ lea 32($key,$rounds),$key # end of key schedule
+ pshufb $bswap_mask,$iv
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey 32($key_),$rndkey0
+
+.Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ dec $len # $len-- ($len is in blocks)
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ pshufb $bswap_mask,$inout0
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
+
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in8
+ movaps %xmm9,0x30(%rsp) # $increment
+.Lccm64_dec_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ shl \$4,$rnds_
+ mov \$16,$rounds
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp # $inp+=16
+ sub %r10,%rax # twisted $rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ mov %rax,%r10
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out # $out+=16
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
+
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey 32($key_),$rndkey0
+ jmp .Lccm64_dec2_loop
+.align 16
+.Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load input
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ lea 16($inp),$inp # $inp+=16
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out (folded into the aesni_generate1 call below via its $ivec argument)
+ mov 240($key_),$rounds
+___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}\f
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
+#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
+{
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("%ebp","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+.cfi_startproc
+ cmp \$1,$len
+ jne .Lctr32_bulk
+
+ # handle single block without allocating stack frame,
+ # useful when handling edges
+ movups ($ivp),$inout0
+ movups ($inp),$inout1
+ mov 240($key),%edx # key->rounds
+___
+ &aesni_generate1("enc",$key,"%edx");
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ xorps $inout1,$inout0
+ pxor $inout1,$inout1
+ movups $inout0,($out)
+ xorps $inout0,$inout0
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+ lea (%rsp),$key_ # use $key_ as frame pointer
+.cfi_def_cfa_register $key_
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8($key_) # offload everything
+ movaps %xmm7,-0x98($key_)
+ movaps %xmm8,-0x88($key_)
+ movaps %xmm9,-0x78($key_)
+ movaps %xmm10,-0x68($key_)
+ movaps %xmm11,-0x58($key_)
+ movaps %xmm12,-0x48($key_)
+ movaps %xmm13,-0x38($key_)
+ movaps %xmm14,-0x28($key_)
+ movaps %xmm15,-0x18($key_)
+.Lctr32_body:
+___
+$code.=<<___;
+
+ # 8 16-byte words on top of stack are counter values
+ # xor-ed with zero-round key
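+ # (only the last dword differs between the eight blocks: each
+ # counter+i is byte-swapped, xor-ed with the last dword of the
+ # 0-round key and patched into its block via pinsrd or a plain
+ # 32-bit store, so the full 128-bit xor is done only once)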
+
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ mov %rdx,%r10 # about to borrow %rdx
+ movdqa $inout0,0x70(%rsp)
+
+ lea 1($ctr),%rax
+ lea 2($ctr),%rdx
+ bswap %eax
+ bswap %edx
+ xor $key0,%eax
+ xor $key0,%edx
+ pinsrd \$3,%eax,$inout1
+ lea 3($ctr),%rax
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%edx,$inout2
+ bswap %eax
+ mov %r10,%rdx # restore %rdx
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%eax
+ bswap %r10d
+ pinsrd \$3,%eax,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
+ mov 240($key),$rounds # key->rounds
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len # $len is in blocks
+ jb .Lctr32_tail # short input if ($len<8)
+
+ sub \$6,$len # $len is biased by -6
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x # [which denotes Atom Silvermont]
+
+ lea 0x80($key),$key # size optimization
+ sub \$2,$len # $len is biased by -8
+ jmp .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ add \$6,$ctr # next counter value
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp) # store next counter value
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
+
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6 # load 6 input blocks
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp # $inp+=6*16
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6 # inp^=E(ctr)
+ movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out) # store 6 output blocks
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out # $out+=6*16
+
+ sub \$6,$len
+ jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
+
+ add \$6,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+ add \$8,$ctr # next counter value
+ movdqa 0x60(%rsp),$inout6
+ aesenc $rndkey1,$inout0
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
+ aesenc $rndkey1,$inout1
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
+ aesenc $rndkey1,$inout2
+ xor $key0,%r9d
+ nop
+ aesenc $rndkey1,$inout3
+ mov %r9d,0x00+12(%rsp) # store next counter value
+ lea 1($ctr),%r9
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ xor $key0,%r9d
+ .byte 0x66,0x90
+ aesenc $rndkeyx,$inout2
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ movdqu 0x00($inp),$in0 # start loading input
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ cmp \$11,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ jb .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0 # input^=round[last]
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
+ lea 0x80($inp),$inp # $inp+=8*16
+
+ aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
+ pxor $rndkey0,$rndkey1 # borrowed $rndkey
+ movdqu 0x70-0x80($inp),$in0
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ aesenclast $in3,$inout3
+ movdqa 0x10(%rsp),$in2
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ aesenclast $in5,$inout5
+ movdqa 0x30(%rsp),$in4
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+ $movkey 0x10-0x80($key),$rndkey1 # real 1st-round key
+ aesenclast $in0,$inout7
+
+ movups $inout0,($out) # store 8 output blocks
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+
+ sub \$8,$len
+ jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
+
+ add \$8,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+ lea -0x80($key),$key
+
+.Lctr32_tail:
+ # note that at this point $inout0..5 are populated with
+ # counter values xor-ed with 0-round key
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ # if ($len>4) compute 7 E(counter)
+ shl \$4,$rounds
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 32-16($key,$rounds),$key # prepare for .Lenc_loop8_enter
+ neg %rax
+ aesenc $rndkey1,$inout2
+ add \$16,%rax # prepare for .Lenc_loop8_enter
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1 # pre-load input
+ movups 0x20($inp),$in2
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out) # store output
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done # $len was 5, stop store
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done # $len was 6, stop store
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done # $len was 7, stop store
+
+.align 32
+.Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ movups ($inp),$in0 # load input
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ aesenclast $rndkey1,$inout3
+ movups 0x20($inp),$in2
+ movups 0x30($inp),$in3
+
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done # $len was 4, stop store
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
+
+ movups ($inp),$in0 # load input
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ cmp \$2,$len
+ jb .Lctr32_done # $len was 1, stop store
+
+ movups 0x10($inp),$in1
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done # $len was 2, stop store
+
+ movups 0x20($inp),$in2
+ xorps $in2,$inout2
+ movups $inout2,0x20($out) # $len was 3, stop store
+
+.Lctr32_done:
+ xorps %xmm0,%xmm0 # clear register bank
+ xor $key0,$key0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,0x70(%rsp)
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa8($key_),%xmm6
+ movaps %xmm0,-0xa8($key_) # clear stack
+ movaps -0x98($key_),%xmm7
+ movaps %xmm0,-0x98($key_)
+ movaps -0x88($key_),%xmm8
+ movaps %xmm0,-0x88($key_)
+ movaps -0x78($key_),%xmm9
+ movaps %xmm0,-0x78($key_)
+ movaps -0x68($key_),%xmm10
+ movaps %xmm0,-0x68($key_)
+ movaps -0x58($key_),%xmm11
+ movaps %xmm0,-0x58($key_)
+ movaps -0x48($key_),%xmm12
+ movaps %xmm0,-0x48($key_)
+ movaps -0x38($key_),%xmm13
+ movaps %xmm0,-0x38($key_)
+ movaps -0x28($key_),%xmm14
+ movaps %xmm0,-0x28($key_)
+ movaps -0x18($key_),%xmm15
+ movaps %xmm0,-0x18($key_)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+ movaps %xmm0,0x70(%rsp)
+___
+$code.=<<___;
+ mov -8($key_),%rbp
+.cfi_restore %rbp
+ lea ($key_),%rsp
+.cfi_def_cfa_register %rsp
+.Lctr32_epilogue:
+ ret
+.cfi_endproc
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+\f
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x70 + ($win64?160:0);
+my $key_ = "%rbp"; # override so that we can use %r11 as FP
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+.cfi_startproc
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ movups ($ivp),$inout0 # load clear-text tweak
+ mov 240(%r8),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+ $movkey ($key),$rndkey0 # zero round key
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
+ movdqa .Lxts_magic(%rip),$twmask
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
+___
+ # alternative tweak calculation algorithm is based on suggestions
+ # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+ # and should help in the future...
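+ # Each iteration multiplies the tweak by x in GF(2^128): pshufd
+ # with immediate 0x5f replicates the top dword of each 64-bit half,
+ # psrad \$31 turns their sign bits into all-ones masks, and the
+ # pand with .Lxts_magic (0x87,0,1,0) yields the word that is
+ # xor-ed in after paddq doubles both halves, folding bit 127 back
+ # as 0x87 into the low dword and carrying bit 63 into the upper
+ # half.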
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ movdqa @tweak[5],@tweak[$i]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
+___
+ }
+$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
+ sub \$16*6,$len
+ jc .Lxts_enc_short # if $len-=6*16 borrowed
+
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
+ jmp .Lxts_enc_grandloop
+
+.align 32
+.Lxts_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqa $rndkey0,$twmask
+ movdqu `16*1`($inp),$inout1
+ pxor @tweak[0],$inout0 # input^=tweak^round[0]
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[1],$inout1
+ aesenc $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[2],$inout2
+ aesenc $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[3],$inout3
+ aesenc $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
+ pxor @tweak[4],$inout4
+ aesenc $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
+
+ pxor $twres,@tweak[0] # calculate tweaks^round[last]
+ aesenc $rndkey1,$inout4
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
+
+ aesenc $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_enc_loop6
+.align 32
+.Lxts_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -80($key,%rax),$rndkey0
+ jnz .Lxts_enc_loop6
+
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
+ aesenc $rndkey1,$inout1
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
+ aesenc $rndkey1,$inout5
+ $movkey -64($key),$rndkey1
+
+ movdqa $twres,$twtmp
+ aesenc $rndkey0,$inout0
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
+ aesenc $rndkey0,$inout1
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
+ aesenc $rndkey0,$inout5
+ $movkey -48($key),$rndkey0
+
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
+ aesenc $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
+ aesenc $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
+
+ paddd $twres,$twres
+ aesenc $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesenc $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesenc $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesenclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesenclast `16*1`(%rsp),$inout1
+ aesenclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesenclast `16*3`(%rsp),$inout3
+ aesenclast `16*4`(%rsp),$inout4
+ aesenclast `16*5`(%rsp),$inout5
+ pxor $twres,@tweak[5]
+
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
+ sub \$16*6,$len
+ jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
+
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
+ mov $key_,$key # restore $key
+ shr \$4,$rounds # restore original value
+
+.Lxts_enc_short:
+	# at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_enc_done # done if ($len==0)
+
+ pxor $rndkey0,@tweak[1]
+ cmp \$0x20,$len
+ jb .Lxts_enc_one # $len is 1*16
+ pxor $rndkey0,@tweak[2]
+ je .Lxts_enc_two # $len is 2*16
+
+ pxor $rndkey0,@tweak[3]
+ cmp \$0x40,$len
+ jb .Lxts_enc_three # $len is 3*16
+ pxor $rndkey0,@tweak[4]
+ je .Lxts_enc_four # $len is 4*16
+
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp # $inp+=5*16
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+ pxor $inout5,$inout5
+
+ call _aesni_encrypt6
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out) # store 5 output blocks
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out # $out+=5*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp # inp+=1*16
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out) # store one output block
+ lea 16*1($out),$out # $out+=1*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp # $inp+=2*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_encrypt2
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out # $out+=2*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp # $inp+=3*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out # $out+=3*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp # $inp+=4*16
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_encrypt4
+
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ and \$15,$len_ # see if $len%16 is 0
+ jz .Lxts_enc_ret
+ mov $len_,$len
+
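+	# Ciphertext stealing, byte by byte (descriptive note, not upstream
+	# text): each iteration moves one remaining plaintext byte into the
+	# last full ciphertext block and parks the byte it displaces in the
+	# final partial output block; the patched block at $out-16 is then
+	# re-encrypted with the final tweak.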
+.Lxts_enc_steal:
+ movzb ($inp),%eax # borrow $rounds ...
+ movzb -16($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,-16($out)
+ mov %cl,0($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_enc_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups -16($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,-16($out)
+
+.Lxts_enc_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lxts_enc_epilogue:
+ ret
+.cfi_endproc
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+.cfi_startproc
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ movups ($ivp),$inout0 # load clear-text tweak
+ mov 240($key2),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+ xor %eax,%eax # if ($len%16) len-=16;
+ test \$15,$len
+ setnz %al
+ shl \$4,%rax
+ sub %rax,$len
+
+ $movkey ($key),$rndkey0 # zero round key
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
+ movdqa .Lxts_magic(%rip),$twmask
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ movdqa @tweak[5],@tweak[$i]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
+___
+ }
+$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
+ sub \$16*6,$len
+ jc .Lxts_dec_short # if $len-=6*16 borrowed
+
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
+ jmp .Lxts_dec_grandloop
+
+.align 32
+.Lxts_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqa $rndkey0,$twmask
+ movdqu `16*1`($inp),$inout1
+	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[1],$inout1
+ aesdec $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[2],$inout2
+ aesdec $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[3],$inout3
+ aesdec $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
+ pxor @tweak[4],$inout4
+ aesdec $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
+
+ pxor $twres,@tweak[0] # calculate tweaks^round[last]
+ aesdec $rndkey1,$inout4
+ pxor $twres,@tweak[1]
+	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
+
+ aesdec $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_dec_loop6
+.align 32
+.Lxts_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -80($key,%rax),$rndkey0
+ jnz .Lxts_dec_loop6
+
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
+ aesdec $rndkey1,$inout1
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
+ aesdec $rndkey1,$inout5
+ $movkey -64($key),$rndkey1
+
+ movdqa $twres,$twtmp
+ aesdec $rndkey0,$inout0
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
+ aesdec $rndkey0,$inout1
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
+ aesdec $rndkey0,$inout5
+ $movkey -48($key),$rndkey0
+
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
+ aesdec $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
+ aesdec $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
+
+ paddd $twres,$twres
+ aesdec $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesdec $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesdec $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesdeclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesdeclast `16*1`(%rsp),$inout1
+ aesdeclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesdeclast `16*3`(%rsp),$inout3
+ aesdeclast `16*4`(%rsp),$inout4
+ aesdeclast `16*5`(%rsp),$inout5
+ pxor $twres,@tweak[5]
+
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
+ sub \$16*6,$len
+ jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
+
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
+ mov $key_,$key # restore $key
+ shr \$4,$rounds # restore original value
+
+.Lxts_dec_short:
+	# at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ pxor $rndkey0,@tweak[1]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_dec_done # done if ($len==0)
+
+ pxor $rndkey0,@tweak[2]
+ cmp \$0x20,$len
+ jb .Lxts_dec_one # $len is 1*16
+ pxor $rndkey0,@tweak[3]
+ je .Lxts_dec_two # $len is 2*16
+
+ pxor $rndkey0,@tweak[4]
+ cmp \$0x40,$len
+ jb .Lxts_dec_three # $len is 3*16
+ je .Lxts_dec_four # $len is 4*16
+
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp # $inp+=5*16
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_decrypt6
+
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out) # store 5 output blocks
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ pxor $twtmp,$twtmp
+ movdqu $inout3,16*3($out)
+ pcmpgtd @tweak[5],$twtmp
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out # $out+=5*16
+ pshufd \$0x13,$twtmp,@tweak[1] # $twres
+ and \$15,$len_
+ jz .Lxts_dec_ret
+
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,@tweak[1] # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp # $inp+=1*16
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out) # store one output block
+ movdqa @tweak[2],@tweak[1]
+ lea 16*1($out),$out # $out+=1*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp # $inp+=2*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_decrypt2
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[3],@tweak[1]
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out # $out+=2*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp # $inp+=3*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[4],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out # $out+=3*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp # $inp+=4*16
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_decrypt4
+
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ and \$15,$len_ # see if $len%16 is 0
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ mov $len_,$len
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($inp),$inout0
+ xorps @tweak[1],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[1],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp),%eax # borrow $rounds ...
+ movzb ($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,($out)
+ mov %cl,16($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_dec_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lxts_dec_epilogue:
+ ret
+.cfi_endproc
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+}
+\f
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
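+# A rough C model of the per-block work (illustrative sketch, not
+# upstream text; L_[] is the caller's precomputed table and ntz() is
+# count-trailing-zeros):
+#
+#	for (i = start_block_num; blocks--; i++) {
+#		offset_i ^= L_[ntz(i)];
+#		checksum ^= P;			/* plaintext checksum */
+#		C = AES_encrypt(P ^ offset_i, key) ^ offset_i;
+#	}
+#
+# The 7th and 8th arguments (L_ and checksum) don't fit in registers
+# and are picked up from the stack below.
+#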
+{
+my @offset=map("%xmm$_",(10..15));
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
+my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
+my $seventh_arg = $win64 ? 56 : 8;
+my $blocks = $len;
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt:
+.cfi_startproc
+ lea (%rsp),%rax
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+	mov	$seventh_arg+8(%rax),$checksum_p # 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_enc_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_enc_done
+
+.Locb_enc_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_enc_short
+ jmp .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_encrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_enc_grandloop
+
+.Locb_enc_short:
+ add \$6,$blocks
+ jz .Locb_enc_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_enc_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_enc_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_enc_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_enc_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_encrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out)
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+
+.Locb_enc_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
+.cfi_def_cfa %rax,8
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_enc_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14
+.cfi_restore %r14
+ mov -32(%rax),%r13
+.cfi_restore %r13
+ mov -24(%rax),%r12
+.cfi_restore %r12
+ mov -16(%rax),%rbp
+.cfi_restore %rbp
+ mov -8(%rax),%rbx
+.cfi_restore %rbx
+ lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Locb_enc_epilogue:
+ ret
+.cfi_endproc
+.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor $inout4,$checksum
+ pxor @offset[4],$inout4
+ pxor $inout5,$checksum
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesenc $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesenclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ aesenclast @offset[4],$inout4
+ aesenclast @offset[5],$inout5
+ ret
+.size __ocb_encrypt6,.-__ocb_encrypt6
+
+.type __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop4
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast @offset[0],$inout0
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ ret
+.size __ocb_encrypt4,.-__ocb_encrypt4
+
+.type __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesenc $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesenc $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+ aesenc $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop1
+
+ aesenc $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast $inout5,$inout0
+ ret
+.size __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt:
+.cfi_startproc
+ lea (%rsp),%rax
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+	mov	$seventh_arg+8(%rax),$checksum_p # 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_dec_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ xorps $inout0,$checksum # accumulate checksum
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_dec_done
+
+.Locb_dec_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_dec_short
+ jmp .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_decrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+ movups $inout5,`16*5`($out)
+ pxor $inout5,$checksum
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_dec_grandloop
+
+.Locb_dec_short:
+ add \$6,$blocks
+ jz .Locb_dec_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_dec_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_dec_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_dec_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_dec_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_decrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ xorps $inout2,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+
+.Locb_dec_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
+.cfi_def_cfa %rax,8
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_dec_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14
+.cfi_restore %r14
+ mov -32(%rax),%r13
+.cfi_restore %r13
+ mov -24(%rax),%r12
+.cfi_restore %r12
+ mov -16(%rax),%rbp
+.cfi_restore %rbp
+ mov -8(%rax),%rbx
+.cfi_restore %rbx
+ lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Locb_dec_epilogue:
+ ret
+.cfi_endproc
+.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type __ocb_decrypt6,\@abi-omnipotent
+.align 32
+__ocb_decrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor @offset[4],$inout4
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop6
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesdeclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ aesdeclast @offset[4],$inout4
+ aesdeclast @offset[5],$inout5
+ ret
+.size __ocb_decrypt6,.-__ocb_decrypt6
+
+.type __ocb_decrypt4,\@abi-omnipotent
+.align 32
+__ocb_decrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop4
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast @offset[0],$inout0
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ ret
+.size __ocb_decrypt4,.-__ocb_decrypt4
+
+.type __ocb_decrypt1,\@abi-omnipotent
+.align 32
+__ocb_decrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesdec $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesdec $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+ aesdec $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop1
+
+ aesdec $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast $inout5,$inout0
+ ret
+.size __ocb_decrypt1,.-__ocb_decrypt1
+___
+} }}
+\f
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+#			    unsigned char *ivp, const int enc);
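+#
+# The usual CBC recurrences, for reference (not upstream text):
+#
+#	C[i] = AES_encrypt(P[i] ^ C[i-1], key);	/* inherently serial */
+#	P[i] = AES_decrypt(C[i], key) ^ C[i-1];	/* parallelizable */
+#
+# which is why encryption below runs one block per spin while
+# decryption has wide 6- and 8-block bulk loops.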
+{
+my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+.cfi_startproc
+ test $len,$len # check length
+ jz .Lcbc_ret
+
+ mov 240($key),$rnds_ # key->rounds
+ mov $key,$key_ # backup $key
+ test %r9d,%r9d # 6th argument
+ jz .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+ movups ($ivp),$inout0 # load iv as initial state
+ mov $rnds_,$rounds
+ cmp \$16,$len
+ jb .Lcbc_enc_tail
+ sub \$16,$len
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups ($inp),$inout1 # load input
+ lea 16($inp),$inp
+ #xorps $inout1,$inout0
+___
+ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+ mov $rnds_,$rounds # restore $rounds
+ mov $key_,$key # restore $key
+ movups $inout0,0($out) # store output
+ lea 16($out),$out
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ add \$16,$len
+ jnz .Lcbc_enc_tail
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($ivp)
+ pxor $inout0,$inout0
+ pxor $inout1,$inout1
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ mov $len,%rcx # zaps $key
+ xchg $inp,$out # $inp is %rsi and $out is %rdi now
+ .long 0x9066A4F3 # rep movsb
+ mov \$16,%ecx # zero tail
+ sub $len,%rcx
+ xor %eax,%eax
+ .long 0x9066AAF3 # rep stosb
+ lea -16(%rdi),%rdi # rewind $out by 1 block
+ mov $rnds_,$rounds # restore $rounds
+ mov %rdi,%rsi # $inp and $out are the same
+ mov $key_,$key # restore $key
+ xor $len,$len # len=16
+ jmp .Lcbc_enc_loop # one more spin
+\f#--------------------------- CBC DECRYPT ------------------------------#
+.align 16
+.Lcbc_decrypt:
+ cmp \$16,$len
+ jne .Lcbc_decrypt_bulk
+
+	# handle a single block without allocating a stack frame,
+	# useful in ciphertext stealing mode
+ movdqu ($inp),$inout0 # load input
+ movdqu ($ivp),$inout1 # load iv
+ movdqa $inout0,$inout2 # future iv
+___
+ &aesni_generate1("dec",$key,$rnds_);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movdqu $inout2,($ivp) # store iv
+ xorps $inout1,$inout0 # ^=iv
+ pxor $inout1,$inout1
+ movups $inout0,($out) # store output
+ pxor $inout0,$inout0
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_decrypt_body:
+___
+
+my $inp_=$key_="%rbp"; # reassign $key_
+
+$code.=<<___;
+ mov $key,$key_ # [re-]backup $key [after reassignment]
+ movups ($ivp),$iv
+ mov $rnds_,$rounds
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ $movkey ($key),$rndkey0
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+ mov OPENSSL_ia32cap_P+4(%rip),%r9d
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_six_or_seven
+
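+	# Descriptive note (not upstream text): %r9d now holds the CPUID
+	# ECX feature word; bit 22 is MOVBE and bit 26 is XSAVE.  MOVBE
+	# present without XSAVE denotes Atom Silvermont, which prefers the
+	# narrower 6-block loop to the 8-block one.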
+ and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
+ sub \$0x50,$len # $len is biased by -5*16
+ cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
+ je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
+ sub \$0x20,$len # $len is biased by -7*16
+ lea 0x70($key),$key # size optimization
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movups $inout7,($out)
+ lea 0x10($out),$out
+.Lcbc_dec_loop8_enter:
+ movdqu 0x60($inp),$inout6
+ pxor $rndkey0,$inout0
+ movdqu 0x70($inp),$inout7
+ pxor $rndkey0,$inout1
+ $movkey 0x10-0x70($key),$rndkey1
+ pxor $rndkey0,$inout2
+ mov \$-1,$inp_
+	cmp	\$0x70,$len		# are there at least 0x60 bytes ahead?
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
+
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$inout7
+ $movkey 0x20-0x70($key),$rndkey0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ aesdec $rndkey1,$inout6
+ adc \$0,$inp_
+ and \$128,$inp_
+ aesdec $rndkey1,$inout7
+ add $inp,$inp_
+ $movkey 0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___ if ($i==7);
+ cmp \$11,$rounds
+___
+$code.=<<___;
+ aesdec $rndkeyx,$inout0
+ aesdec $rndkeyx,$inout1
+ aesdec $rndkeyx,$inout2
+ aesdec $rndkeyx,$inout3
+ aesdec $rndkeyx,$inout4
+ aesdec $rndkeyx,$inout5
+ aesdec $rndkeyx,$inout6
+ aesdec $rndkeyx,$inout7
+ $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
+ nop
+___
+$code.=<<___ if ($i==7);
+ jb .Lcbc_dec_done
+___
+$code.=<<___ if ($i==9);
+ je .Lcbc_dec_done
+___
+$code.=<<___ if ($i==11);
+ jmp .Lcbc_dec_done
+___
+}
+$code.=<<___;
+.align 16
+.Lcbc_dec_done:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$iv
+ pxor $rndkey0,$in0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$in1
+ pxor $rndkey0,$in2
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$in3
+ pxor $rndkey0,$in4
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ movdqu 0x50($inp),$rndkey1
+
+ aesdeclast $iv,$inout0
+ movdqu 0x60($inp),$iv # borrow $iv
+ pxor $rndkey0,$rndkey1
+ aesdeclast $in0,$inout1
+ pxor $rndkey0,$iv
+ movdqu 0x70($inp),$rndkey0 # next IV
+ aesdeclast $in1,$inout2
+ lea 0x80($inp),$inp
+ movdqu 0x00($inp_),$in0
+ aesdeclast $in2,$inout3
+ aesdeclast $in3,$inout4
+ movdqu 0x10($inp_),$in1
+ movdqu 0x20($inp_),$in2
+ aesdeclast $in4,$inout5
+ aesdeclast $rndkey1,$inout6
+ movdqu 0x30($inp_),$in3
+ movdqu 0x40($inp_),$in4
+ aesdeclast $iv,$inout7
+ movdqa $rndkey0,$iv # return $iv
+ movdqu 0x50($inp_),$rndkey1
+ $movkey -0x70($key),$rndkey0
+
+ movups $inout0,($out) # store output
+ movdqa $in0,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in1,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in2,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in3,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in4,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey1,$inout5
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
+
+ sub \$0x80,$len
+ ja .Lcbc_dec_loop8
+
+ movaps $inout7,$inout0
+ lea -0x70($key),$key
+ add \$0x70,$len
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout7,($out)
+ lea 0x10($out),$out
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ movaps $in0,$inout0
+.Lcbc_dec_six_or_seven:
+ cmp \$0x60,$len
+ ja .Lcbc_dec_seven
+
+ movaps $inout5,$inout6
+ call _aesni_decrypt6
+ pxor $iv,$inout0 # ^= IV
+ movaps $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ lea 0x50($out),$out
+ movdqa $inout5,$inout0
+ pxor $inout5,$inout5
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups 0x50($inp),$inout7
+ pxor $iv,$inout0 # ^= IV
+ movups 0x60($inp),$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout7,$inout6
+ movdqu $inout5,0x50($out)
+ pxor $inout5,$inout5
+ lea 0x60($out),$out
+ movdqa $inout6,$inout0
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_loop6:
+ movups $inout5,($out)
+ lea 0x10($out),$out
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+.Lcbc_dec_loop6_enter:
+ lea 0x60($inp),$inp
+ movdqa $inout5,$inout6
+
+ call _aesni_decrypt6
+
+ pxor $iv,$inout0 # ^= IV
+ movdqa $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ mov $key_,$key
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ mov $rnds_,$rounds
+ movdqu $inout4,0x40($out)
+ lea 0x50($out),$out
+ sub \$0x60,$len
+ ja .Lcbc_dec_loop6
+
+ movdqa $inout5,$inout0
+ add \$0x50,$len
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout5,($out)
+ lea 0x10($out),$out
+
+.Lcbc_dec_tail:
+ movups ($inp),$inout0
+ sub \$0x10,$len
+ jbe .Lcbc_dec_one # $len is 1*16 or less
+
+ movups 0x10($inp),$inout1
+ movaps $inout0,$in0
+ sub \$0x10,$len
+ jbe .Lcbc_dec_two # $len is 2*16 or less
+
+ movups 0x20($inp),$inout2
+ movaps $inout1,$in1
+ sub \$0x10,$len
+ jbe .Lcbc_dec_three # $len is 3*16 or less
+
+ movups 0x30($inp),$inout3
+ movaps $inout2,$in2
+ sub \$0x10,$len
+ jbe .Lcbc_dec_four # $len is 4*16 or less
+
+ movups 0x40($inp),$inout4 # $len is 5*16 or less
+ movaps $inout3,$in3
+ movaps $inout4,$in4
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ pxor $iv,$inout0
+ movaps $in4,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ lea 0x40($out),$out
+ movdqa $inout4,$inout0
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
+ sub \$0x10,$len
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_one:
+ movaps $inout0,$in0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps $iv,$inout0
+ movaps $in0,$iv
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ movaps $inout1,$in1
+ call _aesni_decrypt2
+ pxor $iv,$inout0
+ movaps $in1,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ movdqa $inout1,$inout0
+ pxor $inout1,$inout1 # clear register bank
+ lea 0x10($out),$out
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ movaps $inout2,$in2
+ call _aesni_decrypt3
+ pxor $iv,$inout0
+ movaps $in2,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ movdqa $inout2,$inout0
+ pxor $inout2,$inout2
+ lea 0x20($out),$out
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ movaps $inout3,$in3
+ call _aesni_decrypt4
+ pxor $iv,$inout0
+ movaps $in3,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movdqa $inout3,$inout0
+ pxor $inout3,$inout3
+ lea 0x30($out),$out
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor $inout1,$inout1 # clear register bank
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+___
+$code.=<<___ if (!$win64);
+ pxor $inout4,$inout4 # %xmm6..9
+ pxor $inout5,$inout5
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+___
+$code.=<<___;
+.Lcbc_dec_tail_collected:
+ movups $iv,($ivp)
+ and \$15,$len
+ jnz .Lcbc_dec_tail_partial
+ movups $inout0,($out)
+ pxor $inout0,$inout0
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps $inout0,(%rsp)
+ pxor $inout0,$inout0
+ mov \$16,%rcx
+ mov $out,%rdi
+ sub $len,%rcx
+ lea (%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+ movdqa $inout0,(%rsp)
+
+.Lcbc_dec_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps %xmm0,0x10(%rsp) # clear stack
+ movaps 0x20(%rsp),%xmm7
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm8
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm9
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm10
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm11
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm12
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm13
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm14
+ movaps %xmm0,0x90(%rsp)
+ movaps 0xa0(%rsp),%xmm15
+ movaps %xmm0,0xa0(%rsp)
+___
+$code.=<<___;
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lcbc_ret:
+ ret
+.cfi_endproc
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+} \f
+# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
+# int bits, AES_KEY *key)
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output:	%eax		0 on success, -1 or -2 on failure (see C)
+# *$key key schedule
+#
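+# A hedged usage sketch (annotation, not upstream text), in C, borrowing
+# the aesni_key container from aesni-esp.h as a stand-in for AES_KEY and
+# assuming the "aesni" prefix used by this build:
+#
+#	struct aesni_key dkey;
+#	int rv = aesni_set_decrypt_key(secret, 256, (void *)&dkey);
+#	if (rv)		/* -1: NULL argument, -2: unsupported bit count */
+#		return -EINVAL;
+#	/* decrypt in CBC mode: enc = 0, len a multiple of AES_BLOCK */
+#	aesni_cbc_encrypt(ct, pt, len, (void *)&dkey, iv, 0);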
+{ my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+.cfi_startproc
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+.cfi_adjust_cfa_offset 8
+ call __aesni_set_encrypt_key
+ shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
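+	# (Annotation, not upstream text:) this is FIPS-197's "Equivalent
+	# Inverse Cipher" setup: the encryption schedule just built is
+	# reversed end-for-end, and aesimc (InvMixColumns) is applied to
+	# every round key except the two outermost ones, so that decryption
+	# can walk the schedule in ascending order.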
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+.Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ $movkey %xmm0,($inp)
+ pxor %xmm0,%xmm0
+.Ldec_key_ret:
+ add \$8,%rsp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_endproc
+.LSEH_end_set_decrypt_key:
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+\f
+# This is based on a submission from Intel by
+# Huang Ying
+# Vinodh Gopal
+# Kahraman Akdemir
+#
+# Aggressively optimized with respect to aeskeygenassist's critical path,
+# and the working set is contained in %xmm0-5 to meet the Win64 ABI
+# requirement.
+#
+# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
+# int bits, AES_KEY * const key);
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output:	%eax		0 on success, -1 or -2 on failure (see C)
+# $bits rounds-1 (used in aesni_set_decrypt_key)
+# *$key key schedule
+# $key pointer to key schedule (used in
+# aesni_set_decrypt_key)
+#
+# Subroutine is frame-less, which means that only volatile registers
+# are used. Note that it's declared "abi-omnipotent", which means that
+# the set of volatile registers is smaller on Windows.
+#
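+# Size recap (annotation, not upstream text): the schedule written below
+# occupies 16*(rounds+1) bytes, and the value stored at 240($key) is
+# rounds-1, per the convention documented above:
+#
+#	bits	AES rounds	240($key)	schedule bytes
+#	128	10		9		176
+#	192	12		11		208
+#	256	14		13		240
+#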
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+.cfi_startproc
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+.cfi_adjust_cfa_offset 8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ and OPENSSL_ia32cap_P+4(%rip),%r10d
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+.L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
+ je .L10rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
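+	# (Annotation, not upstream text:) the _alt path below avoids
+	# aeskeygenassist on CPUs that advertise AVX but not XOP, where the
+	# pshufb/aesenclast expansion is reportedly faster. .Lkey_rotate
+	# broadcasts RotWord of the last schedule word into every dword lane;
+	# with all four columns equal, the ShiftRows step of aesenclast is a
+	# no-op, leaving just SubBytes plus the round constant kept in %xmm4,
+	# which is doubled each iteration to step through rcon.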
+.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ mov \$8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,($key)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+ lea 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ dec %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ mov $bits,96(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L12rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax, %rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$8,%r10d
+ movdqu %xmm0,($key)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+ pslld \$1, %xmm4
+ lea 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd \$0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ dec %r10d
+ jnz .Loop_key192
+
+ mov $bits,32(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16($inp),%xmm2 # remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L14rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$7,%r10d
+ movdqu %xmm0,0($key)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16($key)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld \$1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ dec %r10d
+ jz .Ldone_key256
+
+ pshufd \$0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+ aesenclast %xmm3,%xmm2
+
+ movdqa %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ lea 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ mov $bits,16(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ mov \$-2,%rax
+.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ add \$8,%rsp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_endproc
+.LSEH_end_set_encrypt_key:
+\f
+.align 16
+.Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+\f
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+ .long 6,6,6,0
+.Lincrement64:
+ .long 1,0,0,0
+.Lxts_magic:
+ .long 0x87,0,1,0
+.Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+ .long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+ .long 1,1,1,1
+.Lkey_rcon1b:
+ .long 0x1b,0x1b,0x1b,0x1b
+
+.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
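+# (Annotation, not upstream text:) the handlers below drive Win64 stack
+# unwinding: each compares the faulting Rip against the prologue/epilogue
+# labels recorded in HandlerData, recovers the pre-prologue %rsp, copies
+# the %xmm6+ save area back into the CONTEXT record, restores the
+# non-volatile GPRs it knows about, and falls through to
+# .Lcommon_seh_tail, which finishes the job with RtlVirtualUnwind and
+# returns ExceptionContinueSearch.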
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type ecb_ccm64_se_handler,\@abi-omnipotent
+.align 16
+ecb_ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
+
+.type ctr_xts_se_handler,\@abi-omnipotent
+.align 16
+ctr_xts_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+	lea	(%rsi,%r10),%r10		# prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 208($context),%rax # pull context->R11
+
+ lea -0xa8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
+ jmp .Lcommon_seh_tail
+.size ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type ocb_se_handler,\@abi-omnipotent
+.align 16
+ocb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+	lea	(%rsi,%r10),%r10		# prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10
+ cmp %r10,%rbx # context->Rip>=pop label
+ jae .Locb_no_xmm
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea (%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size ocb_se_handler,.-ocb_se_handler
+___
+$code.=<<___;
+.type cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_decrypt_bulk(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ mov 120($context),%rax # pull context->Rax
+
+ lea .Lcbc_decrypt_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<cbc_decrypt_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lcbc_ret(%rip),%r10
+ cmp %r10,%rbx # context->Rip>="epilogue" label
+ jae .Lcommon_seh_tail
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov 208($context),%rax # pull context->R11
+
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size cbc_se_handler,.-cbc_se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ecb_encrypt
+ .rva .LSEH_end_aesni_ecb_encrypt
+ .rva .LSEH_info_ecb
+
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+
+ .rva .LSEH_begin_aesni_xts_encrypt
+ .rva .LSEH_end_aesni_xts_encrypt
+ .rva .LSEH_info_xts_enc
+
+ .rva .LSEH_begin_aesni_xts_decrypt
+ .rva .LSEH_end_aesni_xts_decrypt
+ .rva .LSEH_info_xts_dec
+
+ .rva .LSEH_begin_aesni_ocb_encrypt
+ .rva .LSEH_end_aesni_ocb_encrypt
+ .rva .LSEH_info_ocb_enc
+
+ .rva .LSEH_begin_aesni_ocb_decrypt
+ .rva .LSEH_end_aesni_ocb_decrypt
+ .rva .LSEH_info_ocb_dec
+___
+$code.=<<___;
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_cbc
+
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+ .byte 9,0,0,0
+ .rva ecb_ccm64_se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
+.LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ecb_ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+.LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ecb_ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+.LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr_xts_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
+.LSEH_info_xts_enc:
+ .byte 9,0,0,0
+ .rva ctr_xts_se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.LSEH_info_xts_dec:
+ .byte 9,0,0,0
+ .rva ctr_xts_se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+.LSEH_info_ocb_enc:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
+ .rva .Locb_enc_pop
+ .long 0
+.LSEH_info_ocb_dec:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
+ .rva .Locb_dec_pop
+ .long 0
+___
+$code.=<<___;
+.LSEH_info_cbc:
+ .byte 9,0,0,0
+ .rva cbc_se_handler
+.LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
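+#	(Annotation:) hand-rolled UNWIND_INFO: version 1, no flags, 4-byte
+#	prologue, one unwind code {offset 4, UWOP_ALLOC_SMALL, info 0},
+#	i.e. undo the 8-byte stack allocation commented above.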
+___
+}
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
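+# Worked example (annotation, not upstream text): with the two helpers
+# above, "aesenc %xmm10,%xmm3" is rewritten as
+#
+#	.byte	0x66,0x41,0x0f,0x38,0xdc,0xda
+#
+# i.e. the mandatory 0x66 prefix, REX.B (0x41) because the source index
+# is >= 8, the three-byte 0f 38 dc opcode, and ModR/M
+# 0xc0|(10&7)|((3&7)<<3) = 0xda.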
+
+sub movbe {
+ ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
+
+print $code;
+
+close STDOUT;
--- /dev/null
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# sha1_block procedure for x86_64.
+#
+# It was brought to my attention that on EM64T, compiler-generated code
+# was far behind the 32-bit assembler implementation. This is unlike
+# Opteron, where compiler-generated code was only 15% behind the 32-bit
+# assembler, which originally made it hard to motivate the effort.
+# There was a suggestion to mechanically translate the 32-bit code, but I
+# dismissed it, reasoning that x86_64 offers enough register bank
+# capacity to fully utilize SHA-1 parallelism. Hence this fresh
+# implementation:-) However! While the 64-bit code does perform better
+# on Opteron, I failed to beat the 32-bit assembler on an EM64T core.
+# x86_64 does offer a larger *addressable* register bank, but the
+# out-of-order core reaches for even more registers through dynamic
+# aliasing, and the EM64T core must have managed to run-time optimize
+# even 32-bit code just as well as the 64-bit code. The performance
+# improvement is summarized in the following table:
+#
+#			gcc 3.4		32-bit asm	cycles/byte
+# Opteron		+45%		+20%		6.8
+# Xeon P4		+65%		+0%		9.9
+# Core2		+60%		+10%		7.0
+
+# August 2009.
+#
+# The code was revised to minimize code size and to maximize
+# "distance" between instructions producing input to 'lea'
+# instruction and the 'lea' instruction itself, which is essential
+# for Intel Atom core.
+
+# October 2010.
+#
+# Add an SSSE3 (Supplemental[!] SSE3) implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
+# for background and implementation details. The only difference from
+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
+# to free temporary registers.
+
+# April 2011.
+#
+# Add AVX code path. See sha1-586.pl for further information.
+
+# May 2013.
+#
+# Add AVX2+BMI code path. The initial attempt (utilizing BMI instructions
+# and loading a pair of consecutive blocks into 256-bit %ymm registers)
+# did not provide an impressive performance improvement until a crucial
+# hint regarding the number of Xupdate iterations to pre-compute in
+# advance was provided by Ilya Albrekht of Intel Corp.
+
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+# x86_64 SSSE3 AVX[2]
+# P4 9.05 -
+# Opteron 6.26 -
+# Core2 6.55 6.05/+8% -
+# Westmere 6.73 5.30/+27% -
+# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
+# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
+# Haswell 5.45 4.15/+31% 3.57/+53%
+# Skylake 5.18 4.06/+28% 3.54/+46%
+# Bulldozer 9.11 5.95/+53%
+# Ryzen 4.75 3.80/+24% 1.93/+150%(**)
+# VIA Nano 9.32 7.15/+30%
+# Atom 10.3 9.17/+12%
+# Silvermont 13.1(*) 9.37/+40%
+# Knights L 13.2(*) 9.68/+36% 8.30/+59%
+# Goldmont 8.13 6.42/+27% 1.70/+380%(**)
+#
+# (*) obviously suboptimal result, nothing was done about it,
+# because SSSE3 code is compiled unconditionally;
+# (**) SHAEXT result
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
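+# (Annotation, not upstream text:) the probes above only gauge what the
+# assembler can encode: $avx ends up 0, 1 or 2 (gas 2.19/2.22, nasm
+# 2.09/2.10, ml64 10/11, clang 3.0/newer), where 1 unlocks the AVX code
+# path below and 2 additionally the AVX2+BMI one; runtime CPU detection
+# still happens separately via OPENSSL_ia32cap_P.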
+
+$shaext=1; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$ctx="%rdi"; # 1st arg
+$inp="%rsi"; # 2nd arg
+$num="%rdx"; # 3rd arg
+
+# reassign arguments in order to produce more compact code
+$ctx="%r8";
+$inp="%r9";
+$num="%r10";
+
+$t0="%eax";
+$t1="%ebx";
+$t2="%ecx";
+@xi=("%edx","%ebp","%r14d");
+$A="%esi";
+$B="%edi";
+$C="%r11d";
+$D="%r12d";
+$E="%r13d";
+
+@V=($A,$B,$C,$D,$E);
+
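+# Reference for the three generators below (annotation, not upstream
+# text): every SHA-1 round computes
+#
+#	e += rol(a,5) + F(b,c,d) + K + W[i];  b = rol(b,30);
+#
+# BODY_00_19 evaluates Ch(b,c,d) as ((c^d)&b)^d, BODY_20_39 uses the
+# parity b^c^d, and BODY_40_59 splits Maj(b,c,d) into (c&d) + (b&(c^d)):
+# the two terms can never set the same bit, so the additions are
+# equivalent to ors.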
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+ mov `4*$i`($inp),$xi[0]
+ bswap $xi[0]
+___
+$code.=<<___ if ($i<15);
+ mov `4*$j`($inp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*$i`(%rsp)
+ mov $a,$t2
+ bswap $xi[1]
+ xor $c,$t0
+ rol \$5,$t2
+ and $b,$t0
+ lea 0x5a827999($xi[0],$e),$e
+ add $t2,$e
+ xor $d,$t0
+ rol \$30,$b
+ add $t0,$e
+___
+$code.=<<___ if ($i>=15);
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
+ mov $a,$t2
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
+ xor $c,$t0
+ rol \$5,$t2
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ and $b,$t0
+ lea 0x5a827999($xi[0],$e),$e
+ rol \$30,$b
+ xor $d,$t0
+ add $t2,$e
+ rol \$1,$xi[1]
+ add $t0,$e
+___
+push(@xi,shift(@xi));
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
+$code.=<<___ if ($i<79);
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $b,$t0
+ `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)`
+ mov $a,$t2
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
+ xor $d,$t0
+ rol \$5,$t2
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ lea $K($xi[0],$e),$e
+ xor $c,$t0
+ add $t2,$e
+ rol \$30,$b
+ add $t0,$e
+ rol \$1,$xi[1]
+___
+$code.=<<___ if ($i==79);
+ mov $b,$t0
+ mov $a,$t2
+ xor $d,$t0
+ lea $K($xi[0],$e),$e
+ rol \$5,$t2
+ xor $c,$t0
+ add $t2,$e
+ rol \$30,$b
+ add $t0,$e
+___
+push(@xi,shift(@xi));
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
+ mov $d,$t1
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
+ and $c,$t0
+ mov $a,$t2
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ lea 0x8f1bbcdc($xi[0],$e),$e
+ xor $c,$t1
+ rol \$5,$t2
+ add $t0,$e
+ rol \$1,$xi[1]
+ and $b,$t1
+ add $t2,$e
+ rol \$30,$b
+ add $t1,$e
+___
+push(@xi,shift(@xi));
+}
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl sha1_block_data_order
+.type sha1_block_data_order,\@function,3
+.align 16
+sha1_block_data_order:
+.cfi_startproc
+ mov OPENSSL_ia32cap_P+0(%rip),%r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r8d
+ mov OPENSSL_ia32cap_P+8(%rip),%r10d
+ test \$`1<<9`,%r8d # check SSSE3 bit
+ jz .Lialu
+___
+$code.=<<___ if ($shaext);
+ test \$`1<<29`,%r10d # check SHA bit
+ jnz _shaext_shortcut
+___
+$code.=<<___ if ($avx>1);
+ and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2
+ cmp \$`1<<3|1<<5|1<<8`,%r10d
+ je _avx2_shortcut
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r8d # mask AVX bit
+ and \$`1<<30`,%r9d # mask "Intel CPU" bit
+ or %r9d,%r8d
+ cmp \$`1<<28|1<<30`,%r8d
+ je _avx_shortcut
+___
+$code.=<<___;
+ jmp _ssse3_shortcut
+
+.align 16
+.Lialu:
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ mov %rdi,$ctx # reassigned argument
+ sub \$`8+16*4`,%rsp
+ mov %rsi,$inp # reassigned argument
+ and \$-64,%rsp
+ mov %rdx,$num # reassigned argument
+ mov %rax,`16*4`(%rsp)
+.cfi_cfa_expression %rsp+64,deref,+8
+.Lprologue:
+
+ mov 0($ctx),$A
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov 16($ctx),$E
+ jmp .Lloop
+
+.align 16
+.Lloop:
+___
+for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ add 0($ctx),$A
+ add 4($ctx),$B
+ add 8($ctx),$C
+ add 12($ctx),$D
+ add 16($ctx),$E
+ mov $A,0($ctx)
+ mov $B,4($ctx)
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+
+ sub \$1,$num
+ lea `16*4`($inp),$inp
+ jnz .Lloop
+
+ mov `16*4`(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ ret
+.cfi_endproc
+.size sha1_block_data_order,.-sha1_block_data_order
+___
+if ($shaext) {{{
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+#
+my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
+my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
+my @MSG=map("%xmm$_",(4..7));
+
+$code.=<<___;
+.type sha1_block_data_order_shaext,\@function,3
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+.cfi_startproc
+___
+$code.=<<___ if ($win64);
+ lea `-8-4*16`(%rsp),%rsp
+ movaps %xmm6,-8-4*16(%rax)
+ movaps %xmm7,-8-3*16(%rax)
+ movaps %xmm8,-8-2*16(%rax)
+ movaps %xmm9,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ movdqu ($ctx),$ABCD
+ movd 16($ctx),$E
+ movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap
+
+ movdqu ($inp),@MSG[0]
+ pshufd \$0b00011011,$ABCD,$ABCD # flip word order
+ movdqu 0x10($inp),@MSG[1]
+ pshufd \$0b00011011,$E,$E # flip word order
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[0]
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[1]
+ pshufb $BSWAP,@MSG[2]
+ movdqa $E,$E_SAVE # offload $E
+ pshufb $BSWAP,@MSG[3]
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ dec $num
+ lea 0x40($inp),%r8 # next input block
+ paddd @MSG[0],$E
+ cmovne %r8,$inp
+ movdqa $ABCD,$ABCD_SAVE # offload $ABCD
+___
+for($i=0;$i<20-4;$i+=2) {
+$code.=<<___;
+ sha1msg1 @MSG[1],@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
+ sha1nexte @MSG[1],$E_
+ pxor @MSG[2],@MSG[0]
+ sha1msg1 @MSG[2],@MSG[1]
+ sha1msg2 @MSG[3],@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
+ sha1nexte @MSG[2],$E
+ pxor @MSG[3],@MSG[1]
+ sha1msg2 @MSG[0],@MSG[1]
+___
+ push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ movdqu ($inp),@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 64-67
+ sha1nexte @MSG[1],$E_
+ movdqu 0x10($inp),@MSG[1]
+ pshufb $BSWAP,@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 68-71
+ sha1nexte @MSG[2],$E
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[1]
+
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 72-75
+ sha1nexte @MSG[3],$E_
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[2]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 76-79
+ sha1nexte $E_SAVE,$E
+ pshufb $BSWAP,@MSG[3]
+
+ paddd $ABCD_SAVE,$ABCD
+ movdqa $E,$E_SAVE # offload $E
+
+ jnz .Loop_shaext
+
+ pshufd \$0b00011011,$ABCD,$ABCD
+ pshufd \$0b00011011,$E,$E
+ movdqu $ABCD,($ctx)
+ movd $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -8-4*16(%rax),%xmm6
+ movaps -8-3*16(%rax),%xmm7
+ movaps -8-2*16(%rax),%xmm8
+ movaps -8-1*16(%rax),%xmm9
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+	ret
+.cfi_endproc
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+___
+}}}
+{{{
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my $Kx="%xmm11";
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $rx=0;
+my $K_XX_XX="%r14";
+my $fp="%r11";
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+{ my $sn;
+sub align32() {
+ ++$sn;
+$code.=<<___;
+ jmp .Lalign32_$sn # see "Decoded ICache" in manual
+.align 32
+.Lalign32_$sn:
+___
+}
+}
+
+$code.=<<___;
+.type sha1_block_data_order_ssse3,\@function,3
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+.cfi_startproc
+ mov %rsp,$fp # frame pointer
+.cfi_def_cfa_register $fp
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ lea `-64-($win64?6*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-40-6*16($fp)
+ movaps %xmm7,-40-5*16($fp)
+ movaps %xmm8,-40-4*16($fp)
+ movaps %xmm9,-40-3*16($fp)
+ movaps %xmm10,-40-2*16($fp)
+ movaps %xmm11,-40-1*16($fp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ and \$-64,%rsp
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+
+ shl \$6,$num
+ add $inp,$num
+ lea K_XX_XX+64(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa -64($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ add \$64,$inp
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ jmp .Loop_ssse3
+___
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
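+# (Annotation, not upstream text:) AUTOLOAD is what lets the Xupdate_*
+# subs below write SIMD statements as bare Perl calls: any sub that is
+# not otherwise defined, e.g. &pshufd(@X[0],@X[-4&7],0xee), lands here
+# and is appended to $code as a text line, with the popped last argument
+# turned into an immediate when numeric (0xee is rendered as $238) and
+# the remaining operands reversed into AT&T source,destination order.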
+
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns)); # ror
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
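+# (Annotation, not upstream text:) the net effect of the sub above is
+# four schedule words at once,
+#
+#	W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
+#
+# with one wrinkle: lane 3's W[i-3] input is produced by the same vector,
+# so it is first taken as zero (the psrldq above leaves that dword clear)
+# and the missing contribution is patched in at the end as the
+# pre-rotation lane-0 value rotated left by 2, assembled from the
+# psrld 30 / pslld 2 pair, which is the "<<<2" correction the comments
+# refer to.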
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns)) if ($Xi==8);
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_ror/);
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+ &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
+ }
+ eval(shift(@insns)); # ror
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
+
+ &pslld (@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
+ &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$num);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&$_ror ($b,$j?7:2)', # $b>>>2
+ '&xor (@T[0],$d)',
+ '&mov (@T[1],$a)', # $b for next round
+
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&xor ($b,$c)', # $c^$d for next round
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&and (@T[1],$b)', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c)', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c)
+ '&mov (@T[1],$a)', # $b for next round
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round
+
+ '&$_ror ($b,7)', # $b>>>2
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40)', # restore $c
+
+ '&$_ror ($b,7)', # $b>>>2
+ '&mov (@T[1],$a)', # $b for next round
+ '&xor (@T[0],$c)',
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&xor (@T[1],$c) if ($j==59);'.
+ '&xor (@T[1],$b) if ($j< 59)', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59)', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
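+# (Annotation, not upstream text:) each body_* above returns a list of
+# strings, one scalar instruction per element; the Xupdate_* and Xloop_*
+# subs queue four bodies' worth via
+#
+#	my @insns = (&$body,&$body,&$body,&$body);
+#
+# and then eval(shift(@insns)) between SIMD statements, so the integer
+# rounds and the vector message-schedule update issue interleaved rather
+# than back to back.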
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
+___
+$code.=<<___;
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ ret
+.cfi_endproc
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+___
+
+if ($avx) {
+$Xi=4; # reset variables
+@X=map("%xmm$_",(4..7,0..3));
+@Tx=map("%xmm$_",(8..10));
+$j=0;
+$rx=0;
+
+my $done_avx_label=".Ldone_avx";
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type sha1_block_data_order_avx,\@function,3
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+.cfi_startproc
+ mov %rsp,$fp
+.cfi_def_cfa_register $fp
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ lea `-64-($win64?6*16:0)`(%rsp),%rsp
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ and \$-64,%rsp
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+
+ shl \$6,$num
+ add $inp,$num
+ lea K_XX_XX+64(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa -64($K_XX_XX),$Kx # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ jmp .Loop_avx
+___
+
+sub Xupdate_avx_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$num);
+ &je ($done_avx_label);
+
+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_avx
+
+.align 16
+$done_avx_label:
+___
+ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vzeroupper
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
+___
+$code.=<<___;
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ ret
+.cfi_endproc
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+___
+
+if ($avx>1) {
+use integer;
+$Xi=4; # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+@Tx=map("%ymm$_",(8..10));
+$Kx="%ymm11";
+$j=0;
+
+my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
+my ($a5,$t0)=("%r12d","%edi");
+
+my ($A,$F,$B,$C,$D,$E)=@ROTX;
+my $rx=0;
+my $frame="%r13";
+
+$code.=<<___;
+.type sha1_block_data_order_avx2,\@function,3
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc
+ mov %rsp,$fp
+.cfi_def_cfa_register $fp
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -6*16(%rsp),%rsp
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
+.Lprologue_avx2:
+___
+$code.=<<___;
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+
+ lea -640(%rsp),%rsp
+ shl \$6,$num
+ lea 64($inp),$frame
+ and \$-128,%rsp
+ add $inp,$num
+ lea K_XX_XX+64(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ cmp $num,$frame
+ cmovae $inp,$frame # next or same block
+ mov 4($ctx),$F
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov 16($ctx),$E
+ vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
+
+ vmovdqu ($inp),%xmm0
+ vmovdqu 16($inp),%xmm1
+ vmovdqu 32($inp),%xmm2
+ vmovdqu 48($inp),%xmm3
+ lea 64($inp),$inp
+ vinserti128 \$1,($frame),@X[-4&7],@X[-4&7]
+ vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7]
+ vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vmovdqu -64($K_XX_XX),$Kx # K_00_19
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqu @X[1],32(%rsp)
+ vpaddd $Kx,@X[-1&7],@X[3]
+ vmovdqu @X[2],64(%rsp)
+ vmovdqu @X[3],96(%rsp)
+___
+for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31
+ use integer;
+
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ &vpsrld (@Tx[0],@X[0],31);
+ &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+$code.=<<___;
+ lea 128(%rsp),$frame
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorx \$2,$F,$B
+ andn $D,$F,$t0
+ and $C,$F
+ xor $t0,$F
+___
+sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path
+ # at start $f=(b&c)^(~b&d), $b>>>=2
+ return &bodyx_20_39() if ($rx==19); $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+ '&andn ($t0,$a,$c)', # ~b&d for next round
+
+ '&add ($e,$f)', # e+=(b&c)^(~b&d)
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2)', # b>>>2 for next round
+ '&and ($a,$b)', # b&c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
+
+sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path
+ # on entry $f=b^c^d, $b>>>=2
+ return &bodyx_40_59() if ($rx==39); $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+
+ '&lea ($e,"($e,$f)")', # e+=b^c^d
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round
+ '&xor ($a,$b) if ($j<79)', # b^c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
+
+sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
+ # on entry $f=((b^c)&(c^d)), $b>>>=2
+ $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+ '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c
+ '&mov ($t0,$b) if ($j<59)', # count on zero latency
+ '&xor ($t0,$c) if ($j<59)', # c^d for next round
+
+ '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2)', # b>>>2 in next round
+ '&xor ($a,$b)', # b^c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round
+ '&xor ($a,$c) if ($j==59);'. # f=b^c^d for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
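+
+# Note: each round body above ends with 'unshift(@ROTX,pop(@ROTX))',
+# i.e. it is the register *roles* that rotate by one position per
+# round; no data ever moves between registers. 80 rounds modulo the 6
+# rotating registers leaves the roles permuted by two positions, which
+# is why the loop epilogue below is annotated "output is d-e-[a]-f-b-c"
+# and restores the canonical mapping with plain movs.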
+
+sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++;
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx2_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ #&vpslld (@X[0],@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++;
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xloop_avx2()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+ &align32();
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+
+ &align32();
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+
+$code.=<<___;
+ lea 128($inp),$frame
+ lea 128($inp),%rdi # borrow $t0
+ cmp $num,$frame
+ cmovae $inp,$frame # next or previous block
+
+ # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+ add 0($ctx),@ROTX[0] # update context
+ add 4($ctx),@ROTX[1]
+ add 8($ctx),@ROTX[3]
+ mov @ROTX[0],0($ctx)
+ add 12($ctx),@ROTX[4]
+ mov @ROTX[1],4($ctx)
+ mov @ROTX[0],$A # A=d
+ add 16($ctx),@ROTX[5]
+ mov @ROTX[3],$a5
+ mov @ROTX[3],8($ctx)
+ mov @ROTX[4],$D # D=b
+ #xchg @ROTX[5],$F # F=c, C=f
+ mov @ROTX[4],12($ctx)
+ mov @ROTX[1],$F # F=e
+ mov @ROTX[5],16($ctx)
+ #mov $F,16($ctx)
+ mov @ROTX[5],$E # E=c
+ mov $a5,$C # C=f
+ #xchg $F,$E # E=c, F=e
+
+ cmp $num,$inp
+ je .Ldone_avx2
+___
+
+$Xi=4; # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+
+$code.=<<___;
+ vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
+ cmp $num,%rdi # borrowed $t0
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7]
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7]
+ vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
+ vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
+ vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ lea 128+16(%rsp),$frame
+ rorx \$2,$F,$B
+ andn $D,$F,$t0
+ and $C,$F
+ xor $t0,$F
+ sub \$-128,$inp
+___
+ $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E);
+
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19
+ &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vpshufb (@X[-3&7],@X[-3&7],@X[2]);
+ &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ("0(%rsp)",@Tx[0]);
+ &vpshufb (@X[-2&7],@X[-2&7],@X[2]);
+ &vpaddd (@Tx[1],@X[-3&7],$Kx);
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ("32(%rsp)",@Tx[1]);
+ &vpshufb (@X[-1&7],@X[-1&7],@X[2]);
+ &vpaddd (@X[2],@X[-2&7],$Kx);
+
+ &Xloop_avx2 (\&bodyx_40_59);
+ &align32 ();
+ &vmovdqu ("64(%rsp)",@X[2]);
+ &vpaddd (@X[3],@X[-1&7],$Kx);
+ &Xloop_avx2 (\&bodyx_40_59);
+ &vmovdqu ("96(%rsp)",@X[3]);
+ &Xloop_avx2 (\&bodyx_40_59);
+ &Xupdate_avx2_16_31(\&bodyx_40_59);
+
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xloop_avx2 (\&bodyx_20_39);
+
+$code.=<<___;
+ lea 128(%rsp),$frame
+
+ # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+ add 0($ctx),@ROTX[0] # update context
+ add 4($ctx),@ROTX[1]
+ add 8($ctx),@ROTX[3]
+ mov @ROTX[0],0($ctx)
+ add 12($ctx),@ROTX[4]
+ mov @ROTX[1],4($ctx)
+ mov @ROTX[0],$A # A=d
+ add 16($ctx),@ROTX[5]
+ mov @ROTX[3],$a5
+ mov @ROTX[3],8($ctx)
+ mov @ROTX[4],$D # D=b
+ #xchg @ROTX[5],$F # F=c, C=f
+ mov @ROTX[4],12($ctx)
+ mov @ROTX[1],$F # F=e
+ mov @ROTX[5],16($ctx)
+ #mov $F,16($ctx)
+ mov @ROTX[5],$E # E=c
+ mov $a5,$C # C=f
+ #xchg $F,$E # E=c, F=e
+
+ cmp $num,$inp
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
+___
+$code.=<<___;
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ ret
+.cfi_endproc
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+___
+}
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+___
+}}}
+$code.=<<___;
+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lcommon_seh_tail
+
+ mov `16*4`(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size se_handler,.-se_handler
+___
+
+$code.=<<___ if ($shaext);
+.type shaext_handler,\@abi-omnipotent
+.align 16
+shaext_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue_shaext(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
+ jb .Lcommon_seh_tail
+
+ lea .Lepilogue_shaext(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
+ jae .Lcommon_seh_tail
+
+ lea -8-4*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lcommon_seh_tail
+.size shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 208($context),%rax # pull context->R11
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea -40-6*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$12,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ssse3_handler,.-ssse3_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_sha1_block_data_order
+ .rva .LSEH_end_sha1_block_data_order
+ .rva .LSEH_info_sha1_block_data_order
+___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_sha1_block_data_order_shaext
+ .rva .LSEH_end_sha1_block_data_order_shaext
+ .rva .LSEH_info_sha1_block_data_order_shaext
+___
+$code.=<<___;
+ .rva .LSEH_begin_sha1_block_data_order_ssse3
+ .rva .LSEH_end_sha1_block_data_order_ssse3
+ .rva .LSEH_info_sha1_block_data_order_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_sha1_block_data_order_avx
+ .rva .LSEH_end_sha1_block_data_order_avx
+ .rva .LSEH_info_sha1_block_data_order_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_sha1_block_data_order_avx2
+ .rva .LSEH_end_sha1_block_data_order_avx2
+ .rva .LSEH_info_sha1_block_data_order_avx2
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_sha1_block_data_order:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_sha1_block_data_order_shaext:
+ .byte 9,0,0,0
+ .rva shaext_handler
+___
+$code.=<<___;
+.LSEH_info_sha1_block_data_order_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_block_data_order_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_sha1_block_data_order_avx2:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
+___
+}
+
+####################################################################
+
+sub sha1rnds4 {
+ if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
+ my @opcode=(0x0f,0x3a,0xcc);
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return "sha1rnds4\t".@_[0];
+ }
+}
+
+sub sha1op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha1nexte" => 0xc8,
+ "sha1msg1" => 0xc9,
+ "sha1msg2" => 0xca );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ my $rex=0;
+ $rex|=0x04 if ($2>=8);
+ $rex|=0x01 if ($1>=8);
+ unshift @opcode,0x40|$rex if ($rex);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+}
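+
+# For illustration, the two helpers above resolve e.g.
+#
+#	sha1rnds4	$3,%xmm4,%xmm0	->	0x0f,0x3a,0xcc,0xc4,0x03
+#	sha1msg1	%xmm1,%xmm2	->	0x0f,0x38,0xc9,0xd1
+#
+# (byte values shown in hex; the generated file carries them in decimal
+# .byte directives), so the module still assembles even with assemblers
+# that know nothing about the SHA extension.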
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
+ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
+
+ print $_,"\n";
+}
+close STDOUT;
--- /dev/null
+#! /usr/bin/env perl
+# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# Ascetic x86_64 AT&T to MASM/NASM assembler translator by <appro>.
+#
+# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T
+# format is way easier to parse. Because it's simpler to "gear" from
+# Unix ABI to Windows one [see cross-reference "card" at the end of
+# file]. Because Linux targets were available first...
+#
+# In addition the script also "distills" code suitable for GNU
+# assembler, so that it can be compiled with more rigid assemblers,
+# such as Solaris /usr/ccs/bin/as.
+#
+# This translator is not designed to convert *arbitrary* assembler
+# code from AT&T format to MASM one. It's designed to convert just
+# enough to provide for dual-ABI OpenSSL modules development...
+# There *are* limitations and you might have to modify your assembler
+# code or this script to achieve the desired result...
+#
+# Currently recognized limitations:
+#
+# - can't use multiple ops per line;
+#
+# Dual-ABI styling rules.
+#
+# 1. Adhere to Unix register and stack layout [see cross-reference
+# ABI "card" at the end for explanation].
+# 2. Forget about "red zone," stick to more traditional blended
+# stack frame allocation. If volatile storage is actually required
+# that is. If not, just leave the stack as is.
+# 3. Functions tagged with ".type name,@function" get crafted with
+# unified Win64 prologue and epilogue automatically. If you want
+# to take care of ABI differences yourself, tag functions as
+# ".type name,@abi-omnipotent" instead.
+# 4. To optimize the Win64 prologue you can specify number of input
+# arguments as ".type name,@function,N." Keep in mind that if N is
+# larger than 6, then you *have to* write "abi-omnipotent" code,
+# because >6 cases can't be addressed with unified prologue.
+# 5. Name local labels as .L*, do *not* use dynamic labels such as 1:
+#    (sorry about the latter).
+# 6. Don't use [or hand-code with .byte] "rep ret". The "ret" mnemonic
+#    is required to identify the spots where the Win64 epilogue is to
+#    be injected! On the plus side, it's then prefixed with rep
+#    automatically:-)
+# 7. Stick to explicit ip-relative addressing. If you have to use
+# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??.
+# Both are recognized and translated to proper Win64 addressing
+# modes.
+#
+# 8. In order to provide for structured exception handling unified
+# Win64 prologue copies %rsp value to %rax. For further details
+# see SEH paragraph at the end.
+# 9. .init segment is allowed to contain calls to functions only.
+# a. If function accepts more than 4 arguments *and* >4th argument
+# is declared as non 64-bit value, do clear its upper part.
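+#
+# As a short illustration of rules 3 and 4, a function declared as
+#
+#	.type	foo,@function,2
+#
+# comes out for Win64 with an automatic prologue along the lines of
+#
+#	foo:
+#		movq	%rdi,8(%rsp)
+#		movq	%rsi,16(%rsp)
+#		movq	%rsp,%rax
+#	.LSEH_begin_foo:
+#		movq	%rcx,%rdi
+#		movq	%rdx,%rsi
+#
+# while every "ret" grows the matching epilogue restoring %rdi/%rsi.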
+\f
+
+use strict;
+
+my $flavour = shift;
+my $output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+open STDOUT,">$output" or die "can't open $output: $!"
+ if (defined($output));
+
+my $gas=1; $gas=0 if ($output =~ /\.asm$/);
+my $elf=1; $elf=0 if (!$gas);
+my $win64=0;
+my $prefix="";
+my $decor=".L";
+
+my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005
+my $masm=0;
+my $PTR=" PTR";
+
+my $nasmref=2.03;
+my $nasm=0;
+
+if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1;
+ $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`;
+ $prefix =~ s|\R$||; # Better chomp
+ }
+elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; }
+elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; }
+elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; }
+elsif (!$gas)
+{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i)
+ { $nasm = $1 + $2*0.01; $PTR=""; }
+ elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/)
+ { $masm = $1 + $2*2**-16 + $4*2**-32; }
+ die "no assembler found on %PATH%" if (!($nasm || $masm));
+ $win64=1;
+ $elf=0;
+ $decor="\$L\$";
+}
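+
+# Typical invocations, as implied by the flavour handling above:
+#
+#	perl x86_64-xlate.pl foo.s		# gas/ELF output to foo.s
+#	perl x86_64-xlate.pl macosx foo.s	# Mach-O flavoured gas output
+#	perl x86_64-xlate.pl nasm foo.asm	# NASM output for Win64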
+
+my $current_segment;
+my $current_function;
+my %globals;
+
+{ package opcode; # pick up opcodes
+ sub re {
+ my ($class, $line) = @_;
+ my $self = {};
+ my $ret;
+
+ if ($$line =~ /^([a-z][a-z0-9]*)/i) {
+ bless $self,$class;
+ $self->{op} = $1;
+ $ret = $self;
+ $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+ undef $self->{sz};
+ if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain...
+ $self->{op} = $1;
+ $self->{sz} = $2;
+ } elsif ($self->{op} =~ /call|jmp/) {
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) {
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
+ $self->{op} = $1;
+ $self->{sz} = $2;
+ }
+ }
+ $ret;
+ }
+ sub size {
+ my ($self, $sz) = @_;
+ $self->{sz} = $sz if (defined($sz) && !defined($self->{sz}));
+ $self->{sz};
+ }
+ sub out {
+ my $self = shift;
+ if ($gas) {
+ if ($self->{op} eq "movz") { # movz is pain...
+ sprintf "%s%s%s",$self->{op},$self->{sz},shift;
+ } elsif ($self->{op} =~ /^set/) {
+ "$self->{op}";
+ } elsif ($self->{op} eq "ret") {
+ my $epilogue = "";
+ if ($win64 && $current_function->{abi} eq "svr4") {
+ $epilogue = "movq 8(%rsp),%rdi\n\t" .
+ "movq 16(%rsp),%rsi\n\t";
+ }
+ $epilogue . ".byte 0xf3,0xc3";
+ } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") {
+ ".p2align\t3\n\t.quad";
+ } else {
+ "$self->{op}$self->{sz}";
+ }
+ } else {
+ $self->{op} =~ s/^movz/movzx/;
+ if ($self->{op} eq "ret") {
+ $self->{op} = "";
+ if ($win64 && $current_function->{abi} eq "svr4") {
+ $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t".
+ "mov rsi,QWORD$PTR\[16+rsp\]\n\t";
+ }
+ $self->{op} .= "DB\t0F3h,0C3h\t\t;repret";
+ } elsif ($self->{op} =~ /^(pop|push)f/) {
+ $self->{op} .= $self->{sz};
+ } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
+ $self->{op} = "\tDQ";
+ }
+ $self->{op};
+ }
+ }
+ sub mnemonic {
+ my ($self, $op) = @_;
+ $self->{op}=$op if (defined($op));
+ $self->{op};
+ }
+}
+{ package const; # pick up constants, which start with $
+ sub re {
+ my ($class, $line) = @_;
+ my $self = {};
+ my $ret;
+
+ if ($$line =~ /^\$([^,]+)/) {
+ bless $self, $class;
+ $self->{value} = $1;
+ $ret = $self;
+ $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+ }
+ $ret;
+ }
+ sub out {
+ my $self = shift;
+
+ $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig;
+ if ($gas) {
+ # Solaris /usr/ccs/bin/as can't handle multiplications
+ # in $self->{value}
+ my $value = $self->{value};
+ no warnings; # oct might complain about overflow, ignore here...
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+ if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) {
+ $self->{value} = $value;
+ }
+ sprintf "\$%s",$self->{value};
+ } else {
+ my $value = $self->{value};
+ $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm);
+ sprintf "%s",$value;
+ }
+ }
+}
+{ package ea; # pick up effective addresses: expr(%reg,%reg,scale)
+
+ my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR",
+ l=>"DWORD$PTR", d=>"DWORD$PTR",
+ q=>"QWORD$PTR", o=>"OWORD$PTR",
+ x=>"XMMWORD$PTR", y=>"YMMWORD$PTR",
+ z=>"ZMMWORD$PTR" ) if (!$gas);
+
+ sub re {
+ my ($class, $line, $opcode) = @_;
+ my $self = {};
+ my $ret;
+
+ # optional * ----vvv--- appears in indirect jmp/call
+ if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) {
+ bless $self, $class;
+ $self->{asterisk} = $1;
+ $self->{label} = $2;
+ ($self->{base},$self->{index},$self->{scale})=split(/,/,$3);
+ $self->{scale} = 1 if (!defined($self->{scale}));
+ $self->{opmask} = $4;
+ $ret = $self;
+ $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+ if ($win64 && $self->{label} =~ s/\@GOTPCREL//) {
+ die if ($opcode->mnemonic() ne "mov");
+ $opcode->mnemonic("lea");
+ }
+ $self->{base} =~ s/^%//;
+ $self->{index} =~ s/^%// if (defined($self->{index}));
+ $self->{opcode} = $opcode;
+ }
+ $ret;
+ }
+ sub size {}
+ sub out {
+ my ($self, $sz) = @_;
+
+ $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+ $self->{label} =~ s/\.L/$decor/g;
+
+ # Silently convert all EAs to 64-bit. This is required for
+ # elder GNU assembler and results in more compact code,
+ # *but* most importantly AES module depends on this feature!
+ $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
+ $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
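+		# (e.g. "eax" -> "rax", "r13d" -> "r13")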
+
+ # Solaris /usr/ccs/bin/as can't handle multiplications
+ # in $self->{label}...
+ use integer;
+ $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+ $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg;
+
+ # Some assemblers insist on signed presentation of 32-bit
+ # offsets, but sign extension is a tricky business in perl...
+ if ((1<<31)<<1) {
+ $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg;
+ } else {
+ $self->{label} =~ s/\b([0-9]+)\b/$1>>0/eg;
+ }
+
+ # if base register is %rbp or %r13, see if it's possible to
+ # flip base and index registers [for better performance]
+ if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
+ $self->{base} =~ /(rbp|r13)/) {
+ $self->{base} = $self->{index}; $self->{index} = $1;
+ }
+
+ if ($gas) {
+ $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64");
+
+ if (defined($self->{index})) {
+ sprintf "%s%s(%s,%%%s,%d)%s",
+ $self->{asterisk},$self->{label},
+ $self->{base}?"%$self->{base}":"",
+ $self->{index},$self->{scale},
+ $self->{opmask};
+ } else {
+ sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label},
+ $self->{base},$self->{opmask};
+ }
+ } else {
+ $self->{label} =~ s/\./\$/g;
+ $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
+ $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
+
+ my $mnemonic = $self->{opcode}->mnemonic();
+ ($self->{asterisk}) && ($sz="q") ||
+ ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) ||
+ ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) ||
+ ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) ||
+ ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/) && ($sz="x");
+
+ $self->{opmask} =~ s/%(k[0-7])/$1/;
+
+ if (defined($self->{index})) {
+ sprintf "%s[%s%s*%d%s]%s",$szmap{$sz},
+ $self->{label}?"$self->{label}+":"",
+ $self->{index},$self->{scale},
+ $self->{base}?"+$self->{base}":"",
+ $self->{opmask};
+ } elsif ($self->{base} eq "rip") {
+ sprintf "%s[%s]",$szmap{$sz},$self->{label};
+ } else {
+ sprintf "%s[%s%s]%s", $szmap{$sz},
+ $self->{label}?"$self->{label}+":"",
+ $self->{base},$self->{opmask};
+ }
+ }
+ }
+}
+{ package register; # pick up registers, which start with %.
+ sub re {
+ my ($class, $line, $opcode) = @_;
+ my $self = {};
+ my $ret;
+
+ # optional * ----vvv--- appears in indirect jmp/call
+ if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) {
+ bless $self,$class;
+ $self->{asterisk} = $1;
+ $self->{value} = $2;
+ $self->{opmask} = $3;
+ $opcode->size($self->size());
+ $ret = $self;
+ $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+ }
+ $ret;
+ }
+ sub size {
+ my $self = shift;
+ my $ret;
+
+ if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; }
+ elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; }
+ elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; }
+ elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; }
+ elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; }
+ elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; }
+ elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; }
+ elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; }
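+	# e.g. b: %r10b/%al/%sil, w: %r10w/%ax, l: %r10d/%eax, q: %r10/%rax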
+
+ $ret;
+ }
+ sub out {
+ my $self = shift;
+ if ($gas) { sprintf "%s%%%s%s", $self->{asterisk},
+ $self->{value},
+ $self->{opmask}; }
+ else { $self->{opmask} =~ s/%(k[0-7])/$1/;
+ $self->{value}.$self->{opmask}; }
+ }
+}
+{ package label; # pick up labels, which end with :
+ sub re {
+ my ($class, $line) = @_;
+ my $self = {};
+ my $ret;
+
+ if ($$line =~ /(^[\.\w]+)\:/) {
+ bless $self,$class;
+ $self->{value} = $1;
+ $ret = $self;
+ $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+ $self->{value} =~ s/^\.L/$decor/;
+ }
+ $ret;
+ }
+ sub out {
+ my $self = shift;
+
+ if ($gas) {
+ my $func = ($globals{$self->{value}} or $self->{value}) . ":";
+ if ($win64 && $current_function->{name} eq $self->{value}
+ && $current_function->{abi} eq "svr4") {
+ $func .= "\n";
+ $func .= " movq %rdi,8(%rsp)\n";
+ $func .= " movq %rsi,16(%rsp)\n";
+ $func .= " movq %rsp,%rax\n";
+ $func .= "${decor}SEH_begin_$current_function->{name}:\n";
+ my $narg = $current_function->{narg};
+ $narg=6 if (!defined($narg));
+ $func .= " movq %rcx,%rdi\n" if ($narg>0);
+ $func .= " movq %rdx,%rsi\n" if ($narg>1);
+ $func .= " movq %r8,%rdx\n" if ($narg>2);
+ $func .= " movq %r9,%rcx\n" if ($narg>3);
+ $func .= " movq 40(%rsp),%r8\n" if ($narg>4);
+ $func .= " movq 48(%rsp),%r9\n" if ($narg>5);
+ }
+ $func;
+ } elsif ($self->{value} ne "$current_function->{name}") {
+ # Make all labels in masm global.
+ $self->{value} .= ":" if ($masm);
+ $self->{value} . ":";
+ } elsif ($win64 && $current_function->{abi} eq "svr4") {
+ my $func = "$current_function->{name}" .
+ ($nasm ? ":" : "\tPROC $current_function->{scope}") .
+ "\n";
+ $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n";
+ $func .= " mov QWORD$PTR\[16+rsp\],rsi\n";
+ $func .= " mov rax,rsp\n";
+ $func .= "${decor}SEH_begin_$current_function->{name}:";
+ $func .= ":" if ($masm);
+ $func .= "\n";
+ my $narg = $current_function->{narg};
+ $narg=6 if (!defined($narg));
+ $func .= " mov rdi,rcx\n" if ($narg>0);
+ $func .= " mov rsi,rdx\n" if ($narg>1);
+ $func .= " mov rdx,r8\n" if ($narg>2);
+ $func .= " mov rcx,r9\n" if ($narg>3);
+ $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4);
+ $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5);
+ $func .= "\n";
+ } else {
+ "$current_function->{name}".
+ ($nasm ? ":" : "\tPROC $current_function->{scope}");
+ }
+ }
+}
+{ package expr; # pick up expressions
+ sub re {
+ my ($class, $line, $opcode) = @_;
+ my $self = {};
+ my $ret;
+
+ if ($$line =~ /(^[^,]+)/) {
+ bless $self,$class;
+ $self->{value} = $1;
+ $ret = $self;
+ $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+ $self->{value} =~ s/\@PLT// if (!$elf);
+ $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+ $self->{value} =~ s/\.L/$decor/g;
+ $self->{opcode} = $opcode;
+ }
+ $ret;
+ }
+ sub out {
+ my $self = shift;
+ if ($nasm && $self->{opcode}->mnemonic()=~m/^j(?![re]cxz)/) {
+ "NEAR ".$self->{value};
+ } else {
+ $self->{value};
+ }
+ }
+}
+{ package cfi_directive;
+ # CFI directives annotate instructions that are significant for
+ # stack unwinding procedure compliant with DWARF specification,
+  # see http://dwarfstd.org/. Besides the platform-specific filtering
+  # naturally expected of this script, this module adds three
+  # auxiliary synthetic directives not recognized by the [GNU]
+  # assembler:
+ #
+ # - .cfi_push to annotate push instructions in prologue, which
+ # translates to .cfi_adjust_cfa_offset (if needed) and
+ # .cfi_offset;
+ # - .cfi_pop to annotate pop instructions in epilogue, which
+ # translates to .cfi_adjust_cfa_offset (if needed) and
+ # .cfi_restore;
+ # - [and most notably] .cfi_cfa_expression which encodes
+ # DW_CFA_def_cfa_expression and passes it to .cfi_escape as
+ # byte vector;
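+  #
+  # For instance, while the CFA is still tracked as %rsp-relative with
+  # offset -8, a synthetic ".cfi_push %rbx" is expanded below into
+  #
+  #	.cfi_adjust_cfa_offset	8
+  #	.cfi_offset	%rbx,-16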
+ #
+ # CFA expressions were introduced in DWARF specification version
+ # 3 and describe how to deduce CFA, Canonical Frame Address. This
+ # becomes handy if your stack frame is variable and you can't
+ # spare register for [previous] frame pointer. Suggested directive
+ # syntax is made-up mix of DWARF operator suffixes [subset of]
+ # and references to registers with optional bias. Following example
+ # describes offloaded *original* stack pointer at specific offset
+ # from *current* stack pointer:
+ #
+ # .cfi_cfa_expression %rsp+40,deref,+8
+ #
+ # Final +8 has everything to do with the fact that CFA is defined
+ # as reference to top of caller's stack, and on x86_64 call to
+ # subroutine pushes 8-byte return address. In other words original
+ # stack pointer upon entry to a subroutine is 8 bytes off from CFA.
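+  #
+  # Concretely, cfa_expression() below encodes that example as
+  # DW_CFA_def_cfa_expression (0x0f) followed by the expression length
+  # and the expression itself:
+  #
+  #	.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+  #
+  # i.e. DW_OP_breg7(%rsp)+40, DW_OP_deref, DW_OP_plus_uconst 8.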
+
+ # Below constants are taken from "DWARF Expressions" section of the
+ # DWARF specification, section is numbered 7.7 in versions 3 and 4.
+ my %DW_OP_simple = ( # no-arg operators, mapped directly
+ deref => 0x06, dup => 0x12,
+ drop => 0x13, over => 0x14,
+ pick => 0x15, swap => 0x16,
+ rot => 0x17, xderef => 0x18,
+
+ abs => 0x19, and => 0x1a,
+ div => 0x1b, minus => 0x1c,
+ mod => 0x1d, mul => 0x1e,
+ neg => 0x1f, not => 0x20,
+ or => 0x21, plus => 0x22,
+ shl => 0x24, shr => 0x25,
+ shra => 0x26, xor => 0x27,
+ );
+
+ my %DW_OP_complex = ( # used in specific subroutines
+ constu => 0x10, # uleb128
+ consts => 0x11, # sleb128
+ plus_uconst => 0x23, # uleb128
+ lit0 => 0x30, # add 0-31 to opcode
+ reg0 => 0x50, # add 0-31 to opcode
+	breg0		=> 0x70,	# add 0-31 to opcode, sleb128
+	regx		=> 0x90,	# uleb128
+ fbreg => 0x91, # sleb128
+ bregx => 0x92, # uleb128, sleb128
+ piece => 0x93, # uleb128
+ );
+
+ # Following constants are defined in x86_64 ABI supplement, for
+ # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf,
+ # see section 3.7 "Stack Unwind Algorithm".
+ my %DW_reg_idx = (
+ "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3,
+ "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7,
+ "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11,
+ "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+ );
+
+ my ($cfa_reg, $cfa_rsp);
+ my @cfa_stack;
+
+ # [us]leb128 format is variable-length integer representation base
+  # 128, with most significant bit of each byte being 0 denoting
+ # *last* most significant digit. See "Variable Length Data" in the
+ # DWARF specification, numbered 7.6 at least in versions 3 and 4.
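+  #
+  # A worked example: uleb128(624485) encodes as 0xe5,0x8e,0x26, and
+  # sleb128(-8) is the single byte 0x78.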
+ sub sleb128 {
+ use integer; # get right shift extend sign
+
+ my $val = shift;
+ my $sign = ($val < 0) ? -1 : 0;
+ my @ret = ();
+
+ while(1) {
+ push @ret, $val&0x7f;
+
+ # see if remaining bits are same and equal to most
+ # significant bit of the current digit, if so, it's
+ # last digit...
+ last if (($val>>6) == $sign);
+
+ @ret[-1] |= 0x80;
+ $val >>= 7;
+ }
+
+ return @ret;
+ }
+ sub uleb128 {
+ my $val = shift;
+ my @ret = ();
+
+ while(1) {
+ push @ret, $val&0x7f;
+
+ # see if it's last significant digit...
+ last if (($val >>= 7) == 0);
+
+ @ret[-1] |= 0x80;
+ }
+
+ return @ret;
+ }
+ sub const {
+ my $val = shift;
+
+ if ($val >= 0 && $val < 32) {
+ return ($DW_OP_complex{lit0}+$val);
+ }
+ return ($DW_OP_complex{consts}, sleb128($val));
+ }
+ sub reg {
+ my $val = shift;
+
+ return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/);
+
+ my $reg = $DW_reg_idx{$1};
+ my $off = eval ("0 $2 $3");
+
+ return (($DW_OP_complex{breg0} + $reg), sleb128($off));
+ # Yes, we use DW_OP_bregX+0 to push register value and not
+ # DW_OP_regX, because latter would require even DW_OP_piece,
+ # which would be a waste under the circumstances. If you have
+ # to use DWP_OP_reg, use "regx:N"...
+	# to use DW_OP_regX, use "regx:N"...
+ sub cfa_expression {
+ my $line = shift;
+ my @ret;
+
+ foreach my $token (split(/,\s*/,$line)) {
+ if ($token =~ /^%r/) {
+ push @ret,reg($token);
+ } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) {
+ push @ret,reg("$2+$1");
+ } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) {
+ my $i = 1*eval($2);
+ push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i));
+ } elsif (my $i = 1*eval($token) or $token eq "0") {
+ if ($token =~ /^\+/) {
+ push @ret,$DW_OP_complex{plus_uconst},uleb128($i);
+ } else {
+ push @ret,const($i);
+ }
+ } else {
+ push @ret,$DW_OP_simple{$token};
+ }
+ }
+
+ # Finally we return DW_CFA_def_cfa_expression, 15, followed by
+ # length of the expression and of course the expression itself.
+ return (15,scalar(@ret),@ret);
+ }
+ sub re {
+ my ($class, $line) = @_;
+ my $self = {};
+ my $ret;
+
+ if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) {
+ bless $self,$class;
+ $ret = $self;
+ undef $self->{value};
+ my $dir = $1;
+
+ SWITCH: for ($dir) {
+ # What is $cfa_rsp? Effectively it's difference between %rsp
+ # value and current CFA, Canonical Frame Address, which is
+ # why it starts with -8. Recall that CFA is top of caller's
+ # stack...
+ /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; };
+ /endproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", 0);
+ # .cfi_remember_state directives that are not
+ # matched with .cfi_restore_state are
+ # unnecessary.
+ die "unpaired .cfi_remember_state" if (@cfa_stack);
+ last;
+ };
+ /def_cfa_register/
+ && do { $cfa_reg = $$line; last; };
+ /def_cfa_offset/
+ && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp");
+ last;
+ };
+ /adjust_cfa_offset/
+ && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp");
+ last;
+ };
+ /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) {
+ $cfa_reg = $1;
+ $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp");
+ }
+ last;
+ };
+ /push/ && do { $dir = undef;
+ $cfa_rsp -= 8;
+ if ($cfa_reg eq "%rsp") {
+ $self->{value} = ".cfi_adjust_cfa_offset\t8\n";
+ }
+ $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp";
+ last;
+ };
+ /pop/ && do { $dir = undef;
+ $cfa_rsp += 8;
+ if ($cfa_reg eq "%rsp") {
+ $self->{value} = ".cfi_adjust_cfa_offset\t-8\n";
+ }
+ $self->{value} .= ".cfi_restore\t$$line";
+ last;
+ };
+ /cfa_expression/
+ && do { $dir = undef;
+ $self->{value} = ".cfi_escape\t" .
+ join(",", map(sprintf("0x%02x", $_),
+ cfa_expression($$line)));
+ last;
+ };
+ /remember_state/
+ && do { push @cfa_stack, [$cfa_reg, $cfa_rsp];
+ last;
+ };
+ /restore_state/
+ && do { ($cfa_reg, $cfa_rsp) = @{pop @cfa_stack};
+ last;
+ };
+ }
+
+ $self->{value} = ".cfi_$dir\t$$line" if ($dir);
+
+ $$line = "";
+ }
+
+ return $ret;
+ }
+ sub out {
+ my $self = shift;
+ return ($elf ? $self->{value} : undef);
+ }
+}
+{ package directive; # pick up directives, which start with .
+ sub re {
+ my ($class, $line) = @_;
+ my $self = {};
+ my $ret;
+ my $dir;
+
+ # chain-call to cfi_directive
+ $ret = cfi_directive->re($line) and return $ret;
+
+ if ($$line =~ /^\s*(\.\w+)/) {
+ bless $self,$class;
+ $dir = $1;
+ $ret = $self;
+ undef $self->{value};
+ $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+ SWITCH: for ($dir) {
+ /\.global|\.globl|\.extern/
+ && do { $globals{$$line} = $prefix . $$line;
+ $$line = $globals{$$line} if ($prefix);
+ last;
+ };
+ /\.type/ && do { my ($sym,$type,$narg) = split(',',$$line);
+ if ($type eq "\@function") {
+ undef $current_function;
+ $current_function->{name} = $sym;
+ $current_function->{abi} = "svr4";
+ $current_function->{narg} = $narg;
+ $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE";
+ } elsif ($type eq "\@abi-omnipotent") {
+ undef $current_function;
+ $current_function->{name} = $sym;
+ $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE";
+ }
+ $$line =~ s/\@abi\-omnipotent/\@function/;
+ $$line =~ s/\@function.*/\@function/;
+ last;
+ };
+ /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) {
+ $dir = ".byte";
+ $$line = join(",",unpack("C*",$1),0);
+ }
+ last;
+ };
+ /\.rva|\.long|\.quad/
+ && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+ $$line =~ s/\.L/$decor/g;
+ last;
+ };
+ }
+
+ if ($gas) {
+ $self->{value} = $dir . "\t" . $$line;
+
+ if ($dir =~ /\.extern/) {
+ $self->{value} = ""; # swallow extern
+ } elsif (!$elf && $dir =~ /\.type/) {
+ $self->{value} = "";
+ $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" .
+ (defined($globals{$1})?".scl 2;":".scl 3;") .
+ "\t.type 32;\t.endef"
+ if ($win64 && $$line =~ /([^,]+),\@function/);
+ } elsif (!$elf && $dir =~ /\.size/) {
+ $self->{value} = "";
+ if (defined($current_function)) {
+ $self->{value} .= "${decor}SEH_end_$current_function->{name}:"
+ if ($win64 && $current_function->{abi} eq "svr4");
+ undef $current_function;
+ }
+ } elsif (!$elf && $dir =~ /\.align/) {
+ $self->{value} = ".p2align\t" . (log($$line)/log(2));
+ } elsif ($dir eq ".section") {
+ $current_segment=$$line;
+ if (!$elf && $current_segment eq ".init") {
+ if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; }
+ elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; }
+ }
+ } elsif ($dir =~ /\.(text|data)/) {
+ $current_segment=".$1";
+ } elsif ($dir =~ /\.hidden/) {
+ if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; }
+ elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+ } elsif ($dir =~ /\.comm/) {
+ $self->{value} = "$dir\t$prefix$$line";
+ $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
+ }
+ $$line = "";
+ return $self;
+ }
+
+ # non-gas case or nasm/masm
+ SWITCH: for ($dir) {
+ /\.text/ && do { my $v=undef;
+ if ($nasm) {
+ $v="section .text code align=64\n";
+ } else {
+ $v="$current_segment\tENDS\n" if ($current_segment);
+ $current_segment = ".text\$";
+ $v.="$current_segment\tSEGMENT ";
+ $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE";
+ $v.=" 'CODE'";
+ }
+ $self->{value} = $v;
+ last;
+ };
+ /\.data/ && do { my $v=undef;
+ if ($nasm) {
+ $v="section .data data align=8\n";
+ } else {
+ $v="$current_segment\tENDS\n" if ($current_segment);
+ $current_segment = "_DATA";
+ $v.="$current_segment\tSEGMENT";
+ }
+ $self->{value} = $v;
+ last;
+ };
+ /\.section/ && do { my $v=undef;
+ $$line =~ s/([^,]*).*/$1/;
+ $$line = ".CRT\$XCU" if ($$line eq ".init");
+ if ($nasm) {
+ $v="section $$line";
+ if ($$line=~/\.([px])data/) {
+ $v.=" rdata align=";
+ $v.=$1 eq "p"? 4 : 8;
+ } elsif ($$line=~/\.CRT\$/i) {
+ $v.=" rdata align=8";
+ }
+ } else {
+ $v="$current_segment\tENDS\n" if ($current_segment);
+ $v.="$$line\tSEGMENT";
+ if ($$line=~/\.([px])data/) {
+ $v.=" READONLY";
+ $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
+ } elsif ($$line=~/\.CRT\$/i) {
+ $v.=" READONLY ";
+ $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
+ }
+ }
+ $current_segment = $$line;
+ $self->{value} = $v;
+ last;
+ };
+ /\.extern/ && do { $self->{value} = "EXTERN\t".$$line;
+ $self->{value} .= ":NEAR" if ($masm);
+ last;
+ };
+ /\.globl|.global/
+ && do { $self->{value} = $masm?"PUBLIC":"global";
+ $self->{value} .= "\t".$$line;
+ last;
+ };
+ /\.size/ && do { if (defined($current_function)) {
+ undef $self->{value};
+ if ($current_function->{abi} eq "svr4") {
+ $self->{value}="${decor}SEH_end_$current_function->{name}:";
+ $self->{value}.=":\n" if($masm);
+ }
+ $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name});
+ undef $current_function;
+ }
+ last;
+ };
+ /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 256 : 4096;
+ $self->{value} = "ALIGN\t".($$line>$max?$max:$$line);
+ last;
+ };
+ /\.(value|long|rva|quad)/
+ && do { my $sz = substr($1,0,1);
+ my @arr = split(/,\s*/,$$line);
+ my $last = pop(@arr);
+ my $conv = sub { my $var=shift;
+ $var=~s/^(0b[0-1]+)/oct($1)/eig;
+ $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm);
+ if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
+ { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
+ $var;
+ };
+
+ $sz =~ tr/bvlrq/BWDDQ/;
+ $self->{value} = "\tD$sz\t";
+ for (@arr) { $self->{value} .= &$conv($_).","; }
+ $self->{value} .= &$conv($last);
+ last;
+ };
+ /\.byte/ && do { my @str=split(/,\s*/,$$line);
+ map(s/(0b[0-1]+)/oct($1)/eig,@str);
+ map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
+ while ($#str>15) {
+ $self->{value}.="DB\t"
+ .join(",",@str[0..15])."\n";
+ foreach (0..15) { shift @str; }
+ }
+ $self->{value}.="DB\t"
+ .join(",",@str) if (@str);
+ last;
+ };
+ /\.comm/ && do { my @str=split(/,\s*/,$$line);
+ my $v=undef;
+ if ($nasm) {
+ $v.="common $prefix@str[0] @str[1]";
+ } else {
+ $v="$current_segment\tENDS\n" if ($current_segment);
+ $current_segment = "_DATA";
+ $v.="$current_segment\tSEGMENT\n";
+ $v.="COMM @str[0]:DWORD:".@str[1]/4;
+ }
+ $self->{value} = $v;
+ last;
+ };
+ }
+ $$line = "";
+ }
+
+ $ret;
+ }
+ sub out {
+ my $self = shift;
+ $self->{value};
+ }
+}
+
+# When x86_64 was initially introduced, SSE>2 extensions were not yet
+# available. In order not to be bothered by tracing exact assembler versions,
+# but at the same time to provide a bare security minimum of AES-NI, we
+# hard-code some instructions. Extensions past AES-NI on the other hand
+# are traced by examining assembler version in individual perlasm
+# modules...
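+
+# For instance, the $pshufb handler below resolves
+#
+#	pshufb	%xmm1,%xmm0
+#
+# to its raw encoding 0x66,0x0f,0x38,0x00,0xc1, which the main loop
+# then emits as a .byte (gas) or DB (masm/nasm) sequence.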
+
+my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+ "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 );
+
+sub rex {
+ my $opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @$opcode,($rex|0x40) if ($rex);
+}
+
+my $movq = sub { # elderly gas can't handle inter-register movq
+ my $arg = shift;
+ my @opcode=(0x66);
+ if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+ my ($src,$dst)=($1,$2);
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,$src,$dst,0x8);
+ push @opcode,0x0f,0x7e;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ @opcode;
+ } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+ my ($src,$dst)=($2,$1);
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,$src,$dst,0x8);
+ push @opcode,0x0f,0x6e;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pextrd = sub {
+ if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+ my @opcode=(0x66);
+ my $imm=$1;
+ my $src=$2;
+ my $dst=$3;
+ if ($dst =~ /%r([0-9]+)d/) { $dst = $1; }
+ elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; }
+ rex(\@opcode,$src,$dst);
+ push @opcode,0x0f,0x3a,0x16;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ push @opcode,$imm;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pinsrd = sub {
+ if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ my $imm=$1;
+ my $src=$2;
+ my $dst=$3;
+ if ($src =~ /%r([0-9]+)/) { $src = $1; }
+ elsif ($src =~ /%e/) { $src = $regrm{$src}; }
+ rex(\@opcode,$dst,$src);
+ push @opcode,0x0f,0x3a,0x22;
+ push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M
+ push @opcode,$imm;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pshufb = sub {
+ if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$2,$1);
+ push @opcode,0x0f,0x38,0x00;
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $palignr = sub {
+ if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x3a,0x0f;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ push @opcode,$1;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pclmulqdq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x3a,0x44;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $rdrand = sub {
+ if (shift =~ /%[er](\w+)/) {
+ my @opcode=();
+ my $dst=$1;
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,0,$dst,8);
+ push @opcode,0x0f,0xc7,0xf0|($dst&7);
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $rdseed = sub {
+ if (shift =~ /%[er](\w+)/) {
+ my @opcode=();
+ my $dst=$1;
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,0,$dst,8);
+ push @opcode,0x0f,0xc7,0xf8|($dst&7);
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+# Not all AVX-capable assemblers recognize the AMD XOP extension. Since
+# we are using only two instructions, we hand-code them in order to be
+# excused from chasing assembler versions...
+
+sub rxb {
+ my $opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @$opcode,$rxb;
+}
+
+my $vprotd = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $vprotq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc3;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+# Intel Control-flow Enforcement Technology extension. All functions and
+# indirect branch targets will have to start with this instruction...
+
+my $endbranch = sub {
+ (0xf3,0x0f,0x1e,0xfa);
+};
+
+########################################################################
+
+if ($nasm) {
+ print <<___;
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+___
+} elsif ($masm) {
+ print <<___;
+OPTION DOTNAME
+___
+}
+while(defined(my $line=<>)) {
+
+ $line =~ s|\R$||; # Better chomp
+
+ $line =~ s|[#!].*$||; # get rid of asm-style comments...
+ $line =~ s|/\*.*\*/||; # ... and C-style comments...
+ $line =~ s|^\s+||; # ... and skip white spaces in beginning
+ $line =~ s|\s+$||; # ... and at the end
+
+ if (my $label=label->re(\$line)) { print $label->out(); }
+
+ if (my $directive=directive->re(\$line)) {
+ printf "%s",$directive->out();
+ } elsif (my $opcode=opcode->re(\$line)) {
+ my $asm = eval("\$".$opcode->mnemonic());
+
+ if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
+ print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+ next;
+ }
+
+ my @args;
+ ARGUMENT: while (1) {
+ my $arg;
+
+ ($arg=register->re(\$line, $opcode))||
+ ($arg=const->re(\$line)) ||
+ ($arg=ea->re(\$line, $opcode)) ||
+ ($arg=expr->re(\$line, $opcode)) ||
+ last ARGUMENT;
+
+ push @args,$arg;
+
+ last ARGUMENT if ($line !~ /^,/);
+
+ $line =~ s/^,\s*//;
+ } # ARGUMENT:
+
+ if ($#args>=0) {
+ my $insn;
+ my $sz=$opcode->size();
+
+ if ($gas) {
+ $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+ @args = map($_->out($sz),@args);
+ printf "\t%s\t%s",$insn,join(",",@args);
+ } else {
+ $insn = $opcode->out();
+ foreach (@args) {
+ my $arg = $_->out();
+ # $insn.=$sz compensates for movq, pinsrw, ...
+ if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+ if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; }
+ if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; }
+ if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
+ }
+ @args = reverse(@args);
+ undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+ printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
+ }
+ } else {
+ printf "\t%s",$opcode->out();
+ }
+ }
+
+ print $line,"\n";
+}
+
+print "\n$current_segment\tENDS\n" if ($current_segment && $masm);
+print "END\n" if ($masm);
+
+close STDOUT;
+
+\f#################################################
+# Cross-reference x86_64 ABI "card"
+#
+# Unix Win64
+# %rax * *
+# %rbx - -
+# %rcx #4 #1
+# %rdx #3 #2
+# %rsi #2 -
+# %rdi #1 -
+# %rbp - -
+# %rsp - -
+# %r8 #5 #3
+# %r9 #6 #4
+# %r10 * *
+# %r11 * *
+# %r12 - -
+# %r13 - -
+# %r14 - -
+# %r15 - -
+#
+# (*) volatile register
+# (-) preserved by callee
+# (#) Nth argument, volatile
+#
+# In Unix terms top of stack is argument transfer area for arguments
+# which could not be accommodated in registers. Or in other words 7th
+# [integer] argument resides at 8(%rsp) upon function entry point.
+# 128 bytes above %rsp constitute a "red zone" which is not touched
+# by signal handlers and can be used as temporary storage without
+# allocating a frame.
+#
+# In Win64 terms N*8 bytes on top of stack is argument transfer area,
+# which belongs to/can be overwritten by callee. N is the number of
+# arguments passed to callee, *but* not less than 4! This means that
+# upon function entry point 5th argument resides at 40(%rsp), as well
+# as that 32 bytes from 8(%rsp) can always be used as temporary
+# storage [without allocating a frame]. One can actually argue that
+# one can assume a "red zone" above stack pointer under Win64 as well.
+# Point is that at apparently no occasion Windows kernel would alter
+# the area above user stack pointer in true asynchronous manner...
+#
+# All the above means that if assembler programmer adheres to Unix
+# register and stack layout, but disregards the "red zone" existence,
+# it's possible to use following prologue and epilogue to "gear" from
+# Unix to Win64 ABI in leaf functions with not more than 6 arguments.
+#
+# omnipotent_function:
+# ifdef WIN64
+# movq %rdi,8(%rsp)
+# movq %rsi,16(%rsp)
+# movq %rcx,%rdi ; if 1st argument is actually present
+# movq %rdx,%rsi ; if 2nd argument is actually ...
+# movq %r8,%rdx ; if 3rd argument is ...
+# movq %r9,%rcx ; if 4th argument ...
+# movq 40(%rsp),%r8 ; if 5th ...
+# movq 48(%rsp),%r9 ; if 6th ...
+# endif
+# ...
+# ifdef WIN64
+# movq 8(%rsp),%rdi
+# movq 16(%rsp),%rsi
+# endif
+# ret
+#
+\f#################################################
+# Win64 SEH, Structured Exception Handling.
+#
+# Unlike on Unix systems(*), lack of Win64 stack unwinding information
+# has an undesired side-effect at run-time: if an exception is raised in
+# assembler subroutine such as those in question (basically we're
+# referring to segmentation violations caused by malformed input
+# parameters), the application is briskly terminated without invoking
+# any exception handlers, most notably without generating memory dump
+# or any user notification whatsoever. This poses a problem. It's
+# possible to address it by registering custom language-specific
+# handler that would restore processor context to the state at
+# subroutine entry point and return "exception is not handled, keep
+# unwinding" code. Writing such handler can be a challenge... But it's
+# doable, though requires certain coding convention. Consider following
+# snippet:
+#
+# .type function,@function
+# function:
+# movq %rsp,%rax # copy rsp to volatile register
+# pushq %r15 # save non-volatile registers
+# pushq %rbx
+# pushq %rbp
+# movq %rsp,%r11
+# subq %rdi,%r11 # prepare [variable] stack frame
+# andq $-64,%r11
+# movq %rax,0(%r11) # check for exceptions
+# movq %r11,%rsp # allocate [variable] stack frame
+# movq %rax,0(%rsp) # save original rsp value
+# magic_point:
+# ...
+# movq 0(%rsp),%rcx # pull original rsp value
+# movq -24(%rcx),%rbp # restore non-volatile registers
+# movq -16(%rcx),%rbx
+# movq -8(%rcx),%r15
+# movq %rcx,%rsp # restore original rsp
+# magic_epilogue:
+# ret
+# .size function,.-function
+#
+# The key is that up to magic_point a copy of the original rsp value
+# remains in the chosen volatile register, and no non-volatile
+# register, except for rsp, is modified; while past magic_point rsp
+# remains constant till the very end of the function. In this case
+# the custom language-specific exception handler would look like:
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+# { ULONG64 *rsp = (ULONG64 *)context->Rax;
+# ULONG64 rip = context->Rip;
+#
+# if (rip >= magic_point)
+# { rsp = (ULONG64 *)context->Rsp;
+# if (rip < magic_epilogue)
+# { rsp = (ULONG64 *)rsp[0];
+# context->Rbp = rsp[-3];
+# context->Rbx = rsp[-2];
+# context->R15 = rsp[-1];
+# }
+# }
+# context->Rsp = (ULONG64)rsp;
+# context->Rdi = rsp[1];
+# context->Rsi = rsp[2];
+#
+# memcpy (disp->ContextRecord,context,sizeof(CONTEXT));
+# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase,
+# disp->ControlPc,disp->FunctionEntry,disp->ContextRecord,
+# &disp->HandlerData,&disp->EstablisherFrame,NULL);
+# return ExceptionContinueSearch;
+# }
+#
+# It's appropriate to implement this handler in assembler, directly
+# in the function's module. In order to do that one has to know the
+# members' offsets in the CONTEXT and DISPATCHER_CONTEXT structures
+# and some constant values. Here they are:
+#
+# CONTEXT.Rax 120
+# CONTEXT.Rcx 128
+# CONTEXT.Rdx 136
+# CONTEXT.Rbx 144
+# CONTEXT.Rsp 152
+# CONTEXT.Rbp 160
+# CONTEXT.Rsi 168
+# CONTEXT.Rdi 176
+# CONTEXT.R8 184
+# CONTEXT.R9 192
+# CONTEXT.R10 200
+# CONTEXT.R11 208
+# CONTEXT.R12 216
+# CONTEXT.R13 224
+# CONTEXT.R14 232
+# CONTEXT.R15 240
+# CONTEXT.Rip 248
+# CONTEXT.Xmm6 512
+# sizeof(CONTEXT) 1232
+# DISPATCHER_CONTEXT.ControlPc 0
+# DISPATCHER_CONTEXT.ImageBase 8
+# DISPATCHER_CONTEXT.FunctionEntry 16
+# DISPATCHER_CONTEXT.EstablisherFrame 24
+# DISPATCHER_CONTEXT.TargetIp 32
+# DISPATCHER_CONTEXT.ContextRecord 40
+# DISPATCHER_CONTEXT.LanguageHandler 48
+# DISPATCHER_CONTEXT.HandlerData 56
+# UNW_FLAG_NHANDLER 0
+# ExceptionContinueSearch 1
+#
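+# A minimal sketch of how such a handler might start in assembler,
+# using the offsets above (label and register usage is illustrative
+# only, not translator output; per the Win64 convention %rcx holds
+# rec, %rdx frame, %r8 context and %r9 disp on entry):
+#
+# handler:
+#	mov	120(%r8),%rax		# context->Rax, copy of original rsp
+#	mov	248(%r8),%r10		# context->Rip
+#	lea	magic_point(%rip),%r11
+#	cmp	%r11,%r10
+#	jb	common_tail		# rip < magic_point, keep Rax value
+#	mov	152(%r8),%rax		# context->Rsp
+#	...
+#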
+# In order to tie the handler to the function one has to compose
+# couple of structures: one for .xdata segment and one for .pdata.
+#
+# UNWIND_INFO structure for .xdata segment would be
+#
+# function_unwind_info:
+# .byte 9,0,0,0
+# .rva handler
+#
+# This structure designates exception handler for a function with
+# zero-length prologue, no stack frame or frame register.
+#
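+# (For reference: the first byte packs Version=1 into its low 3 bits
+# and UNW_FLAG_EHANDLER=1 into the remaining 5, hence 9; the other
+# three zero bytes are the prologue size, the count of unwind codes
+# and the frame register/offset. The .rva that follows is the
+# handler's address.)
+#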
+# To facilitate the composing of .pdata structures, the auto-generated
+# "gear" prologue copies the rsp value to rax and denotes the next
+# instruction with a .LSEH_begin_{function_name} label. This
+# essentially defines the SEH styling rule mentioned in the beginning.
+# The position of this label is chosen in such a manner that possible
+# exceptions raised in the "gear" prologue are accounted to the caller
+# and unwound from the latter's frame. The end of the function is
+# marked with a corresponding .LSEH_end_{function_name} label. To
+# summarize, the .pdata segment would contain
+#
+# .rva .LSEH_begin_function
+# .rva .LSEH_end_function
+# .rva function_unwind_info
+#
+# The reference to function_unwind_info from the .xdata segment is the
+# anchor. In case you wonder why the references are 32-bit .rvas and
+# not 64-bit .quads: references put into these two segments are
+# required to be *relative* to the base address of the current binary
+# module, a.k.a. the image base. No Win64 module, be it .exe or .dll,
+# can be larger than 2GB, and thus such relative references can be and
+# are accommodated in 32 bits.
+#
+# Having reviewed the example function code, one can argue that "movq
+# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix
+# rax would contain an undefined value. If this "offends" you, use
+# another register and refrain from modifying rax till magic_point is
+# reached, i.e. as if it were a non-volatile register. If more
+# registers are required before [variable] frame setup is completed,
+# note that nobody says you can have only one "magic point." You can
+# "liberate" non-volatile registers by denoting the last stack
+# off-load instruction and reflecting it in finer-grained unwind logic
+# in the handler. After all, isn't that why it's called a
+# *language-specific* handler...
+#
+# SE handlers are also involved in unwinding the stack when the
+# executable is profiled or debugged. Profiling implies additional
+# limitations that are too subtle to discuss here. For now it's
+# sufficient to say that in order to simplify handlers one should
+# either a) offload the original %rsp to the stack (as discussed
+# above); or b) if a register can be spared for a frame pointer,
+# choose a volatile one.
+#
+# (*) Note that we're talking about run-time, not debug-time. Lack of
+# unwind information makes debugging hard on both Windows and
+# Unix. "Unlike" refers to the fact that on Unix a signal handler
+# will always be invoked, a core dumped and the appropriate exit
+# code returned to the parent (for user notification).
--- /dev/null
+#! /usr/bin/env perl
+# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+print<<___;
+.text
+
+.globl OPENSSL_ia32_cpuid
+.type OPENSSL_ia32_cpuid,\@function,1
+.align 16
+OPENSSL_ia32_cpuid:
+.cfi_startproc
+ mov %rbx,%r8 # save %rbx
+.cfi_register %rbx,%r8
+
+ xor %eax,%eax
+ mov %rax,8(%rdi) # clear extended feature flags
+ cpuid
+ mov %eax,%r11d # max value for standard query level
+
+ xor %eax,%eax
+ cmp \$0x756e6547,%ebx # "Genu"
+ setne %al
+ mov %eax,%r9d
+ cmp \$0x49656e69,%edx # "ineI"
+ setne %al
+ or %eax,%r9d
+ cmp \$0x6c65746e,%ecx # "ntel"
+ setne %al
+ or %eax,%r9d # 0 indicates Intel CPU
+ jz .Lintel
+
+ cmp \$0x68747541,%ebx # "Auth"
+ setne %al
+ mov %eax,%r10d
+ cmp \$0x69746E65,%edx # "enti"
+ setne %al
+ or %eax,%r10d
+ cmp \$0x444D4163,%ecx # "cAMD"
+ setne %al
+ or %eax,%r10d # 0 indicates AMD CPU
+ jnz .Lintel
+
+ # AMD specific
+ mov \$0x80000000,%eax
+ cpuid
+ cmp \$0x80000001,%eax
+ jb .Lintel
+ mov %eax,%r10d
+ mov \$0x80000001,%eax
+ cpuid
+ or %ecx,%r9d
+ and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
+
+ cmp \$0x80000008,%r10d
+ jb .Lintel
+
+ mov \$0x80000008,%eax
+ cpuid
+ movzb %cl,%r10 # number of cores - 1
+ inc %r10 # number of cores
+
+ mov \$1,%eax
+ cpuid
+ bt \$28,%edx # test hyper-threading bit
+ jnc .Lgeneric
+ shr \$16,%ebx # number of logical processors
+ cmp %r10b,%bl
+ ja .Lgeneric
+ and \$0xefffffff,%edx # ~(1<<28)
+ jmp .Lgeneric
+
+.Lintel:
+ cmp \$4,%r11d
+ mov \$-1,%r10d
+ jb .Lnocacheinfo
+
+ mov \$4,%eax
+ mov \$0,%ecx # query L1D
+ cpuid
+ mov %eax,%r10d
+ shr \$14,%r10d
+ and \$0xfff,%r10d # number of cores -1 per L1D
+
+.Lnocacheinfo:
+ mov \$1,%eax
+ cpuid
+ movd %eax,%xmm0 # put aside processor id
+ and \$0xbfefffff,%edx # force reserved bits to 0
+ cmp \$0,%r9d
+ jne .Lnotintel
+ or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs
+ and \$15,%ah
+ cmp \$15,%ah # examine Family ID
+ jne .LnotP4
+ or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR
+.LnotP4:
+ cmp \$6,%ah
+ jne .Lnotintel
+ and \$0x0fff0ff0,%eax
+ cmp \$0x00050670,%eax # Knights Landing
+ je .Lknights
+ cmp \$0x00080650,%eax # Knights Mill (according to sde)
+ jne .Lnotintel
+.Lknights:
+ and \$0xfbffffff,%ecx # clear XSAVE flag to mimic Silvermont
+
+.Lnotintel:
+ bt \$28,%edx # test hyper-threading bit
+ jnc .Lgeneric
+ and \$0xefffffff,%edx # ~(1<<28)
+ cmp \$0,%r10d
+ je .Lgeneric
+
+ or \$0x10000000,%edx # 1<<28
+ shr \$16,%ebx
+ cmp \$1,%bl # see if cache is shared
+ ja .Lgeneric
+ and \$0xefffffff,%edx # ~(1<<28)
+.Lgeneric:
+ and \$0x00000800,%r9d # isolate AMD XOP flag
+ and \$0xfffff7ff,%ecx
+ or %ecx,%r9d # merge AMD XOP flag
+
+ mov %edx,%r10d # %r9d:%r10d is copy of %ecx:%edx
+
+ cmp \$7,%r11d
+ jb .Lno_extended_info
+ mov \$7,%eax
+ xor %ecx,%ecx
+ cpuid
+ bt \$26,%r9d # check XSAVE bit, cleared on Knights
+ jc .Lnotknights
+ and \$0xfff7ffff,%ebx # clear ADCX/ADOX flag
+.Lnotknights:
+ movd %xmm0,%eax # restore processor id
+ and \$0x0fff0ff0,%eax
+ cmp \$0x00050650,%eax # Skylake-X
+ jne .Lnotskylakex
+ and \$0xfffeffff,%ebx # ~(1<<16)
+ # suppress AVX512F flag on Skylake-X
+.Lnotskylakex:
+ mov %ebx,8(%rdi) # save extended feature flags
+ mov %ecx,12(%rdi)
+.Lno_extended_info:
+
+ bt \$27,%r9d # check OSXSAVE bit
+ jnc .Lclear_avx
+ xor %ecx,%ecx # XCR0
+ .byte 0x0f,0x01,0xd0 # xgetbv
+ and \$0xe6,%eax # isolate XMM, YMM and ZMM state support
+ cmp \$0xe6,%eax
+ je .Ldone
+ andl \$0x3fdeffff,8(%rdi) # ~(1<<31|1<<30|1<<21|1<<16)
+ # clear AVX512F+BW+VL+IFMA, all of
+ # them are EVEX-encoded, which requires
+ # ZMM state support even if one uses
+ # only XMM and YMM :-(
+ and \$6,%eax # isolate XMM and YMM state support
+ cmp \$6,%eax
+ je .Ldone
+.Lclear_avx:
+ mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
+ and %eax,%r9d # clear AVX, FMA and AMD XOP bits
+ mov \$0x3fdeffdf,%eax # ~(1<<31|1<<30|1<<21|1<<16|1<<5)
+ and %eax,8(%rdi) # clear AVX2 and AVX512* bits
+.Ldone:
+ shl \$32,%r9
+ mov %r10d,%eax
+ mov %r8,%rbx # restore %rbx
+.cfi_restore %rbx
+ or %r9,%rax
+ ret
+.cfi_endproc
+.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+___
+
+
+close STDOUT; # flush
--- /dev/null
+#! /usr/bin/env perl
+# Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# require 'x86asm.pl';
+# &asm_init(<flavor>[,$i386only]);
+# &function_begin("foo");
+# ...
+# &function_end("foo");
+# &asm_finish
+
+$out=();
+$i386=0;
+
+# AUTOLOAD in this context has a quite unpleasant side effect, namely
+# that typos in function calls effectively go to the assembler output;
+# but on the plus side we don't have to implement one subroutine per
+# opcode...
+sub ::AUTOLOAD
+{ my $opcode = $AUTOLOAD;
+
+ die "more than 4 arguments passed to $opcode" if ($#_>3);
+
+ $opcode =~ s/.*:://;
+ if ($opcode =~ /^push/) { $stack+=4; }
+ elsif ($opcode =~ /^pop/) { $stack-=4; }
+
+ &generic($opcode,@_) or die "undefined subroutine \&$AUTOLOAD";
+}
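+# So, for example, &::mov("eax","ebx") gets routed through AUTOLOAD to
+# &::generic("mov","eax","ebx"), while a typo like &::mvo("eax","ebx")
+# is emitted verbatim as "mvo" and only caught by the assembler.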
+
+sub ::emit
+{ my $opcode=shift;
+
+ if ($#_==-1) { push(@out,"\t$opcode\n"); }
+ else { push(@out,"\t$opcode\t".join(',',@_)."\n"); }
+}
+
+sub ::LB
+{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'low byte'";
+ $1."l";
+}
+sub ::HB
+{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'high byte'";
+ $1."h";
+}
+sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num); }
+sub ::stack_pop { my $num=$_[0]*4; $stack-=$num; &add("esp",$num); }
+sub ::blindpop { &pop($_[0]); $stack+=4; }
+sub ::wparam { &DWP($stack+4*$_[0],"esp"); }
+sub ::swtmp { &DWP(4*$_[0],"esp"); }
+
+sub ::bswap
+{ if ($i386) # emulate bswap for i386
+ { &comment("bswap @_");
+ &xchg(&HB(@_),&LB(@_));
+ &ror (@_,16);
+ &xchg(&HB(@_),&LB(@_));
+ }
+ else
+ { &generic("bswap",@_); }
+}
+# These are made-up opcodes introduced over the years essentially
+# by ignorance, just alias them to real ones...
+sub ::movb { &mov(@_); }
+sub ::xorb { &xor(@_); }
+sub ::rotl { &rol(@_); }
+sub ::rotr { &ror(@_); }
+sub ::exch { &xchg(@_); }
+sub ::halt { &hlt; }
+sub ::movz { &movzx(@_); }
+sub ::pushf { &pushfd; }
+sub ::popf { &popfd; }
+
+# 3 argument instructions
+sub ::movq
+{ my($p1,$p2,$optimize)=@_;
+
+ if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/)
+ # movq between mmx registers can sink Intel CPUs
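+ # (0xe4 == 0b11100100 encodes the identity word permutation
+ # 3,2,1,0, so the pshufw is just a register-to-register copy)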
+ { &::pshufw($p1,$p2,0xe4); }
+ else
+ { &::generic("movq",@_); }
+}
+
+# SSE>2 instructions
+my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+ "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 );
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
+ else
+ { &::generic("pextrd",@_); }
+}
+
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
+ else
+ { &::generic("pinsrd",@_); }
+}
+
+sub ::pshufb
+{ my($dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); }
+ else
+ { &::generic("pshufb",@_); }
+}
+
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("palignr",@_); }
+}
+
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("pclmulqdq",@_); }
+}
+
+sub ::rdrand
+{ my ($dst)=@_;
+ if ($dst =~ /(e[a-dsd][ixp])/)
+ { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); }
+ else
+ { &::generic("rdrand",@_); }
+}
+
+sub ::rdseed
+{ my ($dst)=@_;
+ if ($dst =~ /(e[a-dsd][ixp])/)
+ { &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst}); }
+ else
+ { &::generic("rdrand",@_); }
+}
+
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}
+
+sub ::vprotd
+{ my $args=join(',',@_);
+ if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/)
+ { my @opcode=(0x8f);
+ rxb(\@opcode,$1,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M
+ my $c=$3;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ &::data_byte(@opcode);
+ }
+ else
+ { &::generic("vprotd",@_); }
+}
+
+sub ::endbranch
+{
+ &::data_byte(0xf3,0x0f,0x1e,0xfb);
+}
+
+# label management
+$lbdecor="L"; # local label decoration, set by package
+$label="000";
+
+sub ::islabel # see if argument is a known label
+{ my $i;
+ foreach $i (values %label) { return $i if ($i eq $_[0]); }
+ $label{$_[0]}; # can be undef
+}
+
+sub ::label # instantiate a function-scope label
+{ if (!defined($label{$_[0]}))
+ { $label{$_[0]}="${lbdecor}${label}${_[0]}"; $label++; }
+ $label{$_[0]};
+}
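+# e.g. the first &::label("exit") in a function instantiates and
+# returns "${lbdecor}000exit" ("L000exit" with the default GAS
+# decoration); repeated calls return the same decorated name until
+# &::wipe_labels resets the function scope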
+
+sub ::LABEL # instantiate a file-scope label
+{ $label{$_[0]}=$_[1] if (!defined($label{$_[0]}));
+ $label{$_[0]};
+}
+
+sub ::static_label { &::LABEL($_[0],$lbdecor.$_[0]); }
+
+sub ::set_label_B { push(@out,"@_:\n"); }
+sub ::set_label
+{ my $label=&::label($_[0]);
+ &::align($_[1]) if ($_[1]>1);
+ &::set_label_B($label);
+ $label;
+}
+
+sub ::wipe_labels # wipes function-scope labels
+{ foreach $i (keys %label)
+ { delete $label{$i} if ($label{$i} =~ /^\Q${lbdecor}\E[0-9]{3}/); }
+}
+
+# subroutine management
+sub ::function_begin
+{ &function_begin_B(@_);
+ $stack=4;
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+}
+
+sub ::function_end
+{ &pop("edi");
+ &pop("esi");
+ &pop("ebx");
+ &pop("ebp");
+ &ret();
+ &function_end_B(@_);
+ $stack=0;
+ &wipe_labels();
+}
+
+sub ::function_end_A
+{ &pop("edi");
+ &pop("esi");
+ &pop("ebx");
+ &pop("ebp");
+ &ret();
+ $stack+=16; # readjust esp as if we didn't pop anything
+}
+
+sub ::asciz
+{ my @str=unpack("C*",shift);
+ push @str,0;
+ while ($#str>15) {
+ &data_byte(@str[0..15]);
+ foreach (0..15) { shift @str; }
+ }
+ &data_byte(@str) if (@str);
+}
+
+sub ::asm_finish
+{ &file_end();
+ print @out;
+}
+
+sub ::asm_init
+{ my ($type,$cpu)=@_;
+
+ $i386=$cpu;
+
+ $elf=$cpp=$coff=$aout=$macosx=$win32=$mwerks=$android=0;
+ if (($type eq "elf"))
+ { $elf=1; require "x86gas.pl"; }
+ elsif (($type eq "elf-1"))
+ { $elf=-1; require "x86gas.pl"; }
+ elsif (($type eq "a\.out"))
+ { $aout=1; require "x86gas.pl"; }
+ elsif (($type eq "coff" or $type eq "gaswin"))
+ { $coff=1; require "x86gas.pl"; }
+ elsif (($type eq "win32n"))
+ { $win32=1; require "x86nasm.pl"; }
+ elsif (($type eq "win32"))
+ { $win32=1; require "x86masm.pl"; }
+ elsif (($type eq "macosx"))
+ { $aout=1; $macosx=1; require "x86gas.pl"; }
+ elsif (($type eq "android"))
+ { $elf=1; $android=1; require "x86gas.pl"; }
+ else
+ { print STDERR <<"EOF";
+Pick one target type from
+ elf - Linux, FreeBSD, Solaris x86, etc.
+ a.out - DJGPP, elder OpenBSD, etc.
+ coff - GAS/COFF such as Win32 targets
+ win32n - Windows 95/Windows NT NASM format
+ macosx - Mac OS X
+EOF
+ exit(1);
+ }
+
+ $pic=0;
+ for (@ARGV) { $pic=1 if (/\-[fK]PIC/i); }
+
+ &file();
+}
+
+sub ::hidden {}
+
+1;
--- /dev/null
+#! /usr/bin/env perl
+# Copyright 1999-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+package x86nasm;
+
+*out=\@::out;
+
+$::lbdecor="L\$"; # local label decoration
+$nmdecor="_"; # external name decoration
+$drdecor=$::mwerks?".":""; # directive decoration
+
+$initseg="";
+
+sub ::generic
+{ my $opcode=shift;
+ my $tmp;
+
+ if (!$::mwerks)
+ { if ($opcode =~ m/^j/o && $#_==0) # optimize jumps
+ { $_[0] = "NEAR $_[0]"; }
+ elsif ($opcode eq "lea" && $#_==1) # wipe storage qualifier from lea
+ { $_[1] =~ s/^[^\[]*\[/\[/o; }
+ elsif ($opcode eq "clflush" && $#_==0)
+ { $_[0] =~ s/^[^\[]*\[/\[/o; }
+ }
+ &::emit($opcode,@_);
+ 1;
+}
+#
+# opcodes not covered by ::generic above, mostly inconsistent namings...
+#
+sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); }
+sub ::call_ptr { &::emit("call",@_); }
+sub ::jmp_ptr { &::emit("jmp",@_); }
+
+sub get_mem
+{ my($size,$addr,$reg1,$reg2,$idx)=@_;
+ my($post,$ret);
+
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
+ if ($size ne "")
+ { $ret .= "$size";
+ $ret .= " PTR" if ($::mwerks);
+ $ret .= " ";
+ }
+ $ret .= "[";
+
+ $addr =~ s/^\s+//;
+ # prepend global references with optional underscore
+ $addr =~ s/^([^\+\-0-9][^\+\-]*)/::islabel($1) or "$nmdecor$1"/ige;
+ # put address arithmetic expression in parentheses
+ $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/);
+
+ if (($addr ne "") && ($addr ne 0))
+ { if ($addr !~ /^-/) { $ret .= "$addr+"; }
+ else { $post=$addr; }
+ }
+
+ if ($reg2 ne "")
+ { $idx!=0 or $idx=1;
+ $ret .= "$reg2*$idx";
+ $ret .= "+$reg1" if ($reg1 ne "");
+ }
+ else
+ { $ret .= "$reg1"; }
+
+ $ret .= "$post]";
+ $ret =~ s/\+\]/]/; # in case $addr was the only argument
+
+ $ret;
+}
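+# e.g. &::DWP(4,"esp") yields "DWORD [4+esp]" and
+# &::DWP(0,"esi","ecx",4) yields "DWORD [ecx*4+esi]"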
+sub ::BP { &get_mem("BYTE",@_); }
+sub ::DWP { &get_mem("DWORD",@_); }
+sub ::WP { &get_mem("WORD",@_); }
+sub ::QWP { &get_mem("",@_); }
+sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; }
+sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; }
+
+sub ::file
+{ if ($::mwerks) { push(@out,".section\t.text,64\n"); }
+ else
+ { my $tmp=<<___;
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+\$\@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+___
+ push(@out,$tmp);
+ }
+}
+
+sub ::function_begin_B
+{ my $func=shift;
+ my $global=($func !~ /^_/);
+ my $begin="${::lbdecor}_${func}_begin";
+
+ $begin =~ s/^\@/./ if ($::mwerks); # the torture never stops
+
+ &::LABEL($func,$global?"$begin":"$nmdecor$func");
+ $func=$nmdecor.$func;
+
+ push(@out,"${drdecor}global $func\n") if ($global);
+ push(@out,"${drdecor}align 16\n");
+ push(@out,"$func:\n");
+ push(@out,"$begin:\n") if ($global);
+ $::stack=4;
+}
+
+sub ::function_end_B
+{ $::stack=0;
+ &::wipe_labels();
+}
+
+sub ::file_end
+{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
+ { my $comm=<<___;
+${drdecor}segment .bss
+${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16
+___
+ # comment out OPENSSL_ia32cap_P declarations
+ grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
+ push (@out,$comm)
+ }
+ push (@out,$initseg) if ($initseg);
+}
+
+sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } }
+
+sub ::external_label
+{ foreach(@_)
+ { push(@out,"${drdecor}extern\t".&::LABEL($_,$nmdecor.$_)."\n"); }
+}
+
+sub ::public_label
+{ push(@out,"${drdecor}global\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); }
+
+sub ::data_byte
+{ push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n"); }
+sub ::data_short
+{ push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n"); }
+sub ::data_word
+{ push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n"); }
+
+sub ::align
+{ push(@out,"${drdecor}align\t$_[0]\n"); }
+
+sub ::picmeup
+{ my($dst,$sym)=@_;
+ &::lea($dst,&::DWP($sym));
+}
+
+sub ::initseg
+{ my $f=$nmdecor.shift;
+ if ($::win32)
+ { $initseg=<<___;
+segment .CRT\$XCU data align=4
+extern $f
+dd $f
+___
+ }
+}
+
+sub ::dataseg
+{ if ($::mwerks) { push(@out,".section\t.data,4\n"); }
+ else { push(@out,"section\t.data align=4\n"); }
+}
+
+sub ::safeseh
+{ my $nm=shift;
+ push(@out,"%if __NASM_VERSION_ID__ >= 0x02030000\n");
+ push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n");
+ push(@out,"%endif\n");
+}
+
+1;
--- /dev/null
+#!/usr/bin/env perl
+
+package x86unix; # GAS actually...
+
+*out=\@::out;
+
+$label="L000";
+$const="";
+$constl=0;
+
+$align=($::aout)?"4":"16";
+$under=($::aout or $::coff)?"_":"";
+$dot=($::aout)?"":".";
+$com_start="#" if ($::aout or $::coff);
+$com_end="";
+
+sub opsize
+{ my $reg=shift;
+ if ($reg =~ m/^%e/o) { "l"; }
+ elsif ($reg =~ m/^%[a-d][hl]$/o) { "b"; }
+ elsif ($reg =~ m/^%[xm]/o) { undef; }
+ else { "w"; }
+}
+
+# swap arguments;
+# expand opcode with size suffix;
+# prefix numeric constants with $;
+sub ::generic
+{ my($opcode,$dst,$src)=@_;
+ my($tmp,$suffix,@arg);
+
+ if (defined($src))
+ { $src =~ s/^(e?[a-dsixphl]{2})$/%$1/o;
+ $src =~ s/^(x?mm[0-7])$/%$1/o;
+ $src =~ s/^(\-?[0-9]+)$/\$$1/o;
+ $src =~ s/^(\-?0x[0-9a-f]+)$/\$$1/o;
+ push(@arg,$src);
+ }
+ if (defined($dst))
+ { $dst =~ s/^(\*?)(e?[a-dsixphl]{2})$/$1%$2/o;
+ $dst =~ s/^(x?mm[0-7])$/%$1/o;
+ $dst =~ s/^(\-?[0-9]+)$/\$$1/o if(!defined($src));
+ $dst =~ s/^(\-?0x[0-9a-f]+)$/\$$1/o if(!defined($src));
+ push(@arg,$dst);
+ }
+
+ if ($dst =~ m/^%/o) { $suffix=&opsize($dst); }
+ elsif ($src =~ m/^%/o) { $suffix=&opsize($src); }
+ else { $suffix="l"; }
+ undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
+
+ if ($#_==0) { &::emit($opcode); }
+ elsif ($opcode =~ m/^j/o && $#_==1) { &::emit($opcode,@arg); }
+ elsif ($opcode eq "call" && $#_==1) { &::emit($opcode,@arg); }
+ else { &::emit($opcode.$suffix,@arg);}
+
+ 1;
+}
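+# e.g. &::mov("eax","ebx") comes out as "movl %ebx,%eax", and
+# &::mov("eax",-1) as "movl $-1,%eax"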
+#
+# opcodes not covered by ::generic above, mostly inconsistent namings...
+#
+sub ::movz { &::movzb(@_); }
+sub ::pushf { &::pushfl; }
+sub ::popf { &::popfl; }
+sub ::cpuid { &::emit(".byte\t0x0f,0xa2"); }
+sub ::rdtsc { &::emit(".byte\t0x0f,0x31"); }
+
+sub ::call { &::emit("call",(&islabel($_[0]) or "$under$_[0]")); }
+sub ::call_ptr { &::generic("call","*$_[0]"); }
+sub ::jmp_ptr { &::generic("jmp","*$_[0]"); }
+
+*::bswap = sub { &::emit("bswap","%$_[0]"); } if (!$::i386);
+
+# chosen SSE instructions
+sub ::movq
+{ my($p1,$p2,$optimize)=@_;
+ if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/)
+ # movq between mmx registers can sink Intel CPUs
+ { &::pshufw($p1,$p2,0xe4); }
+ else
+ { &::generic("movq",@_); }
+}
+sub ::pshufw
+{ my($dst,$src,$magic)=@_;
+ &::emit("pshufw","\$$magic","%$src","%$dst");
+}
+
+sub ::DWP
+{ my($addr,$reg1,$reg2,$idx)=@_;
+ my $ret="";
+
+ $addr =~ s/^\s+//;
+ # prepend global references with optional underscore
+ $addr =~ s/^([^\+\-0-9][^\+\-]*)/islabel($1) or "$under$1"/ige;
+
+ $reg1 = "%$reg1" if ($reg1);
+ $reg2 = "%$reg2" if ($reg2);
+
+ $ret .= $addr if (($addr ne "") && ($addr ne 0));
+
+ if ($reg2)
+ { $idx!= 0 or $idx=1;
+ $ret .= "($reg1,$reg2,$idx)";
+ }
+ elsif ($reg1)
+ { $ret .= "($reg1)"; }
+
+ $ret;
+}
+sub ::QWP { &::DWP(@_); }
+sub ::BP { &::DWP(@_); }
+sub ::BC { @_; }
+sub ::DWC { @_; }
+
+sub ::file
+{ push(@out,".file\t\"$_[0].s\"\n"); }
+
+sub ::function_begin_B
+{ my($func,$extra)=@_;
+ my $tmp;
+
+ &::external_label($func);
+ $func=$under.$func;
+
+ push(@out,".text\n.globl\t$func\n");
+ if ($::coff)
+ { push(@out,".def\t$func;\t.scl\t2;\t.type\t32;\t.endef\n"); }
+ elsif ($::aout and !$::pic)
+ { }
+ else
+ { push(@out,".type $func,\@function\n"); }
+ push(@out,".align\t$align\n");
+ push(@out,"$func:\n");
+ $::stack=4;
+}
+
+sub ::function_end_B
+{ my($func)=@_;
+
+ $func=$under.$func;
+ push(@out,"${dot}L_${func}_end:\n");
+ if ($::elf)
+ { push(@out,".size\t$func,${dot}L_${func}_end-$func\n"); }
+ $::stack=0;
+ %label=();
+}
+
+sub ::comment
+ {
+ if (!defined($com_start) or $::elf)
+ { # Regarding $::elf above...
+ # GNU and SVR4 as'es use different comment delimiters,
+ push(@out,"\n"); # so we just skip ELF comments...
+ return;
+ }
+ foreach (@_)
+ {
+ if (/^\s*$/)
+ { push(@out,"\n"); }
+ else
+ { push(@out,"\t$com_start $_ $com_end\n"); }
+ }
+ }
+
+sub islabel # see if argument is a known label
+{ my $i;
+ foreach $i (keys %label) { return $label{$i} if ($label{$i} eq $_[0]); }
+ undef;
+}
+
+sub ::external_label { push(@labels,@_); }
+
+sub ::public_label
+{ $label{$_[0]}="${under}${_[0]}" if (!defined($label{$_[0]}));
+ push(@out,".globl\t$label{$_[0]}\n");
+}
+
+sub ::label
+{ if (!defined($label{$_[0]}))
+ { $label{$_[0]}="${dot}${label}${_[0]}"; $label++; }
+ $label{$_[0]};
+}
+
+sub ::set_label
+{ my $label=&::label($_[0]);
+ &::align($_[1]) if ($_[1]>1);
+ push(@out,"$label:\n");
+}
+
+sub ::file_end
+{ # try to detect if SSE2 or MMX extensions were used on an ELF platform...
+ if ($::elf && grep {/%[x]?mm[0-7]/i} @out){
+ my $tmp;
+
+ push (@out,"\n.section\t.bss\n");
+ push (@out,".comm\t${under}OPENSSL_ia32cap_P,4,4\n");
+
+ push (@out,".section\t.init\n");
+ # One can argue that it's wasteful to craft every
+ # SSE/MMX module with this snippet... Well, it's 72
+ # bytes long and for the moment we have two modules.
+ # Let's argue when we have 7 modules or so...
+ #
+ # $1<<10 sets a reserved bit to signal that the
+ # variable was already initialized...
+ &::picmeup("edx","OPENSSL_ia32cap_P");
+ $tmp=<<___;
+ cmpl \$0,(%edx)
+ jne 1f
+ movl \$1<<10,(%edx)
+ pushf
+ popl %eax
+ movl %eax,%ecx
+ xorl \$1<<21,%eax
+ pushl %eax
+ popf
+ pushf
+ popl %eax
+ xorl %ecx,%eax
+ btl \$21,%eax
+ jnc 1f
+ pushl %edi
+ pushl %ebx
+ movl %edx,%edi
+ movl \$1,%eax
+ .byte 0x0f,0xa2
+ orl \$1<<10,%edx
+ movl %edx,0(%edi)
+ popl %ebx
+ popl %edi
+ jmp 1f
+ .align $align
+ 1:
+___
+ push (@out,$tmp);
+ }
+
+ if ($const ne "")
+ { push(@out,".section .rodata\n");
+ push(@out,$const);
+ $const="";
+ }
+}
+
+sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); }
+sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
+
+sub ::align
+{ my($val,$p2)=($_[0]);
+ if ($::aout)
+ { for ($p2=0;$val!=0;$val>>=1) { $p2++; }
+ $val=$p2-1;
+ $val.=",0x90";
+ }
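+ # e.g. 16 becomes ".align 4,0x90", since a.out interprets
+ # .align as a power-of-two exponent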
+ push(@out,".align\t$val\n");
+}
+
+sub ::picmeup
+{ my($dst,$sym,$base,$reflabel)=@_;
+
+ if ($::pic && ($::elf || $::aout))
+ { if (!defined($base))
+ { &::call(&::label("PIC_me_up"));
+ &::set_label("PIC_me_up");
+ &::blindpop($dst);
+ &::add($dst,"\$${under}_GLOBAL_OFFSET_TABLE_+[.-".
+ &::label("PIC_me_up") . "]");
+ }
+ else
+ { &::lea($dst,&::DWP("${under}_GLOBAL_OFFSET_TABLE_+[.-$reflabel]",
+ $base));
+ }
+ &::mov($dst,&::DWP($under.$sym."\@GOT",$dst));
+ }
+ else
+ { &::lea($dst,&::DWP($sym)); }
+}
+
+sub ::initseg
+{ my($f)=@_;
+ my($tmp,$ctor);
+
+ if ($::elf)
+ { $tmp=<<___;
+.section .init
+ call $under$f
+ jmp .Linitalign
+.align $align
+.Linitalign:
+___
+ }
+ elsif ($::coff)
+ { $tmp=<<___; # applies to both Cygwin and Mingw
+.section .ctors
+.long $under$f
+___
+ }
+ elsif ($::aout)
+ { $ctor="${under}_GLOBAL_\$I\$$f";
+ $tmp=".text\n";
+ $tmp.=".type $ctor,\@function\n" if ($::pic);
+ $tmp.=<<___; # OpenBSD way...
+.globl $ctor
+.align 2
+$ctor:
+ jmp $under$f
+___
+ }
+ push(@out,$tmp) if ($tmp);
+}
+
+1;
PKG_PROG_PKG_CONFIG
AC_LANG_C
+AM_PROG_AS
AC_CANONICAL_HOST
AM_MAINTAINER_MODE([enable])
AM_INIT_AUTOMAKE([foreign tar-ustar])
fi
AM_CONDITIONAL(OPENCONNECT_ICONV, [test "$am_cv_func_iconv" = "yes"])
+aesni=no
+if test "$host_cpu" = "x86_64"; then
+ AC_PATH_PROG(PERL, perl)
+ if test "$PERL" != ""; then
+ AC_DEFINE(AESNI_ASM, 1, [Have AES-NI assembler])
+ aesni=yes
+ fi
+fi
+AM_CONDITIONAL(OPENCONNECT_AESNI, [test "$aesni" = "yes"])
+
AC_ARG_ENABLE([nls],
AS_HELP_STRING([--disable-nls], [Do not use Native Language Support]),
[USE_NLS=$enableval], [USE_NLS=yes])
vpninfo->esp_out.seq = vpninfo->esp_out.seq_backlog = 0;
esp_in->seq = esp_in->seq_backlog = 0;
- ret = init_esp_ciphers(vpninfo, &vpninfo->esp_out, esp_in);
- if (ret)
- return ret;
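+ /* Try the AES-NI implementation first. It is only wired up for
+ * HMAC-SHA1; if it returns an error (e.g. unsupported transform
+ * or CPU), fall back to the generic crypto-library ciphers. */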
+#ifdef AESNI_ASM
+ if (vpninfo->esp_hmac != HMAC_SHA1 ||
+ aesni_init_esp_ciphers(vpninfo, &vpninfo->esp_out, esp_in))
+#endif
+ {
+ ret = init_esp_ciphers(vpninfo, &vpninfo->esp_out, esp_in);
+ if (ret)
+ return ret;
+ }
if (vpninfo->dtls_state == DTLS_NOSECRET)
vpninfo->dtls_state = DTLS_SECRET;
/* {gnutls,openssl}-esp.c */
int init_esp_ciphers(struct openconnect_info *vpninfo, struct esp *out, struct esp *in);
+int aesni_init_esp_ciphers(struct openconnect_info *vpninfo, struct esp *out, struct esp *in);
/* {gnutls,openssl}.c */
int ssl_nonblock_read(struct openconnect_info *vpninfo, void *buf, int maxlen);