summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFangming.Fang <fangming.fang@arm.com>2019-05-31 10:15:10 +0000
committerPauli <paul.dale@oracle.com>2019-12-19 12:36:07 +1000
commit31b59078c8245a4ee7f7fa4e6ea98bba7f9a29a5 (patch)
treee490a3b2bcf796c9c6e98fe86bcfb3d351a8a1b5
parent51a7c4b5f2a0b2d0f6bc0c87ec2ee44b9697dc78 (diff)
downloadopenssl-new-31b59078c8245a4ee7f7fa4e6ea98bba7f9a29a5.tar.gz
Optimize AES-GCM implementation on aarch64
Comparing to current implementation, this change can get more performance improved by tunning the loop-unrolling factor in interleave implementation as well as by enabling high level parallelism. Performance(A72) new type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes aes-128-gcm 113065.51k 375743.00k 848359.51k 1517865.98k 1964040.19k 1986663.77k aes-192-gcm 110679.32k 364470.63k 799322.88k 1428084.05k 1826917.03k 1848967.17k aes-256-gcm 104919.86k 352939.29k 759477.76k 1330683.56k 1663175.34k 1670430.72k old type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes aes-128-gcm 115595.32k 382348.65k 855891.29k 1236452.35k 1425670.14k 1429793.45k aes-192-gcm 112227.02k 369543.47k 810046.55k 1147948.37k 1286288.73k 1296941.06k aes-256-gcm 111543.90k 361902.36k 769543.59k 1070693.03k 1208576.68k 1207511.72k Change-Id: I28a2dca85c001a63a2a942e80c7c64f7a4fdfcf7 Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de> Reviewed-by: Paul Dale <paul.dale@oracle.com> (Merged from https://github.com/openssl/openssl/pull/9818)
-rw-r--r--crypto/build.info1
-rw-r--r--crypto/evp/e_aes.c2
-rwxr-xr-xcrypto/modes/asm/aes-gcm-armv8_64.pl5722
-rw-r--r--crypto/modes/build.info4
-rw-r--r--include/crypto/ciphermode_platform.h33
-rw-r--r--providers/implementations/ciphers/cipher_aes_gcm_hw.c2
-rw-r--r--providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc83
-rw-r--r--providers/implementations/ciphers/ciphercommon_gcm_hw.c19
8 files changed, 5854 insertions, 12 deletions
diff --git a/crypto/build.info b/crypto/build.info
index 758a75ec6b..ab10a1cfe6 100644
--- a/crypto/build.info
+++ b/crypto/build.info
@@ -85,6 +85,7 @@ DEFINE[../libcrypto]=$UTIL_DEFINE $UPLINKDEF
DEFINE[../providers/libfips.a]=$UTIL_DEFINE
DEFINE[../providers/fips]=$UTIL_DEFINE
DEFINE[../providers/libimplementations.a]=$UTIL_DEFINE
+DEFINE[../providers/libcommon.a]=$UTIL_DEFINE
DEPEND[info.o]=buildinf.h
DEPEND[cversion.o]=buildinf.h
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index 296e8db1ca..b95adf0bbf 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -130,8 +130,6 @@ static void ctr64_inc(unsigned char *counter)
#if defined(AESNI_CAPABLE)
# if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
-# define AES_gcm_encrypt aesni_gcm_encrypt
-# define AES_gcm_decrypt aesni_gcm_decrypt
# define AES_GCM_ASM2(gctx) (gctx->gcm.block==(block128_f)aesni_encrypt && \
gctx->gcm.ghash==gcm_ghash_avx)
# undef AES_GCM_ASM2 /* minor size optimization */
diff --git a/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/modes/asm/aes-gcm-armv8_64.pl
new file mode 100755
index 0000000000..ee88906d85
--- /dev/null
+++ b/crypto/modes/asm/aes-gcm-armv8_64.pl
@@ -0,0 +1,5722 @@
+#! /usr/bin/env perl
+# Copyright 2019 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+#========================================================================
+# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
+# derived from https://github.com/ARM-software/AArch64cryptolib, original
+# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
+# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
+# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
+#========================================================================
+#
+# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
+#
+# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
+#
+# ____________________________________________________
+# | |
+# | PRE |
+# |____________________________________________________|
+# | | | |
+# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
+# |________________|________________|__________________|
+# | | | |
+# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
+# |________________|________________|__________________|
+# | | | |
+# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
+# |________________|________________|__________________|
+# | | | |
+# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
+# |________________|____(mostly)____|__________________|
+# | |
+# | MODULO |
+# |____________________________________________________|
+#
+# PRE:
+# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
+# EXT low_acc, low_acc, low_acc, #8
+# EOR res_curr (4k+0), res_curr (4k+0), low_acc
+#
+# CTR block:
+# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
+# REV ctr32, rev_ctr32
+# ORR ctr64, constctr96_top32, ctr32, LSL #32
+# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
+# INS ctr_next.d[1], ctr64X
+# ADD rev_ctr32, #1
+#
+# AES block:
+# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take 256 bytes key below for example.
+# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
+# Given we are very constrained in our ASIMD registers this is quite important
+#
+# Encrypt:
+# LDR input_low, [ input_ptr ], #8
+# LDR input_high, [ input_ptr ], #8
+# EOR input_low, k14_low
+# EOR input_high, k14_high
+# INS res_curr.d[0], input_low
+# INS res_curr.d[1], input_high
+# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k13
+# EOR res_curr, res_curr, ctr_curr
+# ST1 { res_curr.16b }, [ output_ptr ], #16
+#
+# Decrypt:
+# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE ctr_curr, k13
+# LDR res_curr, [ input_ptr ], #16
+# EOR res_curr, res_curr, ctr_curr
+# MOV output_low, res_curr.d[0]
+# MOV output_high, res_curr.d[1]
+# EOR output_low, k14_low
+# EOR output_high, k14_high
+# STP output_low, output_high, [ output_ptr ], #16
+#
+# GHASH block X:
+# do 128b karatsuba polynomial multiplication on block
+# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
+#
+# multiplication:
+# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
+#
+# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
+# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
+#
+# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
+# multiplying with "twisted" powers of H
+#
+# Note: We can PMULL directly into the acc_x in first GHASH of the loop
+# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
+# path latency dominates the performance
+#
+# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
+# than indicated here
+# REV64 res_curr, res_curr
+# INS t_m.d[0], res_curr.d[1]
+# EOR t_m.8B, t_m.8B, res_curr.8B
+# PMULL2 t_h, res_curr, HX
+# PMULL t_l, res_curr, HX
+# PMULL t_m, t_m, HX_k
+# EOR acc_h, acc_h, t_h
+# EOR acc_l, acc_l, t_l
+# EOR acc_m, acc_m, t_m
+#
+# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
+# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
+# with a reversed constant
+# EOR acc_m, acc_m, acc_h
+# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
+# PMULL t_mod, acc_h, mod_constant
+# EXT acc_h, acc_h, acc_h, #8
+# EOR acc_m, acc_m, acc_h
+# EOR acc_m, acc_m, t_mod
+# PMULL acc_h, acc_m, mod_constant
+# EXT acc_m, acc_m, acc_m, #8
+# EOR acc_l, acc_l, acc_h
+# EOR acc_l, acc_l, acc_m
+
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$input_ptr="x0"; #argument block
+$bit_length="x1";
+$output_ptr="x2";
+$current_tag="x3";
+$counter="x16";
+$cc="x8";
+
+{
+my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
+my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
+my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
+my ($output_l0,$output_h0)=map("x$_",(6..7));
+
+my $ctr32w="w9";
+my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
+my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
+
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
+my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
+my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
+
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
+my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
+
+my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
+my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
+my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
+
+my $t0="v8";
+my $t0d="d8";
+
+my ($t1,$t2,$t3)=map("v$_",(28..30));
+my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
+
+my $t4="v8";
+my $t4d="d8";
+my $t5="v28";
+my $t5d="d28";
+my $t6="v31";
+my $t6d="d31";
+
+my $t7="v4";
+my $t7d="d4";
+my $t8="v29";
+my $t8d="d29";
+my $t9="v30";
+my $t9d="d30";
+
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
+my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
+
+my $mod_constantd="d8";
+my $mod_constant="v8";
+my $mod_t="v31";
+
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
+my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
+my $rk2q1="v20.1q";
+my $rk3q1="v21.1q";
+my $rk4v="v22";
+my $rk4d="d22";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+___
+$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
+$code.=<<___ if ($flavour !~ /64/);
+.fpu neon
+#ifdef __thumb2__
+.syntax unified
+.thumb
+# define INST(a,b,c,d) $_byte c,0xef,a,b
+#else
+.code 32
+# define INST(a,b,c,d) $_byte a,b,c,0xf2
+#endif
+
+.text
+___
+
+#########################################################################################
+# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
+# size_t len,
+# unsigned char *out,
+# const void *key,
+# unsigned char ivec[16],
+# u64 *Xi);
+#
+$code.=<<___;
+.global aes_gcm_enc_128_kernel
+.type aes_gcm_enc_128_kernel,%function
+.align 4
+aes_gcm_enc_128_kernel:
+ cbz x1, .L128_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #64]
+ stp d12, d13, [sp, #80]
+ stp d14, d15, [sp, #96]
+
+ ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+ ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
+
+ ld1 {$acc_lb}, [$current_tag]
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
+ mov $len, $main_end_input_ptr
+
+ ldr $rk9q, [$cc, #144] @ load rk9
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
+
+ lsr $rctr32x, $ctr96_t32x, #32
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+ ext $h4b, $h4b, $h4b, #8
+
+ fmov $ctr1d, $ctr96_b64x @ CTR block 1
+ rev $rctr32w, $rctr32w @ rev_ctr32
+
+ add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
+ orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
+ ldr $rk0q, [$cc, #0] @ load rk0
+
+ rev $ctr32w, $rctr32w @ CTR block 1
+ add $rctr32w, $rctr32w, #1 @ CTR block 1
+ fmov $ctr3d, $ctr96_b64x @ CTR block 3
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
+ ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 1
+ rev $ctr32w, $rctr32w @ CTR block 2
+
+ fmov $ctr2d, $ctr96_b64x @ CTR block 2
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
+ add $rctr32w, $rctr32w, #1 @ CTR block 2
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 2
+ rev $ctr32w, $rctr32w @ CTR block 3
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
+ ldr $rk1q, [$cc, #16] @ load rk1
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 3
+ fmov $ctr3.d[1], $ctr32x @ CTR block 3
+
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+ ext $h3b, $h3b, $h3b, #8
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
+ ldr $rk2q, [$cc, #32] @ load rk2
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+ ext $h1b, $h1b, $h1b, #8
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
+ ldr $rk8q, [$cc, #128] @ load rk8
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
+ ldr $rk3q, [$cc, #48] @ load rk3
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
+ trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
+ ldr $rk6q, [$cc, #96] @ load rk6
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
+ ldr $rk7q, [$cc, #112] @ load rk7
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
+ trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
+ ldr $rk5q, [$cc, #80] @ load rk5
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+ ext $h2b, $h2b, $h2b, #8
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
+ eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
+ ldr $rk4q, [$cc, #64] @ load rk4
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
+
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
+ trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
+
+ aese $ctr2b, $rk9 @ AES block 2 - round 9
+
+ aese $ctr0b, $rk9 @ AES block 0 - round 9
+
+ eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
+
+ aese $ctr1b, $rk9 @ AES block 1 - round 9
+
+ aese $ctr3b, $rk9 @ AES block 3 - round 9
+ b.ge .L128_enc_tail @ handle tail
+
+ ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
+
+ ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
+
+ ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
+
+ ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
+
+ eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
+ eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
+
+ eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
+ fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
+
+ eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
+ eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
+
+ fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
+ eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
+
+ eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
+ fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
+
+ fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
+ eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
+ rev $ctr32w, $rctr32w @ CTR block 4
+
+ fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
+
+ eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4
+ add $rctr32w, $rctr32w, #1 @ CTR block 4
+
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4
+ rev $ctr32w, $rctr32w @ CTR block 5
+
+ eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
+ fmov $ctr1d, $ctr96_b64x @ CTR block 5
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 5
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+ fmov $ctr1.d[1], $ctr32x @ CTR block 5
+
+ fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
+ rev $ctr32w, $rctr32w @ CTR block 6
+ st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
+
+ fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 6
+ eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
+ st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
+
+ fmov $ctr2d, $ctr96_b64x @ CTR block 6
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 6
+ rev $ctr32w, $rctr32w @ CTR block 7
+ st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
+
+ eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
+ st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
+ b.ge .L128_enc_prepretail @ do prepretail
+
+ .L128_enc_main_loop: @ main loop start
+ ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
+ rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
+ rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
+
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+ ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ rev $ctr32w, $rctr32w @ CTR block 4k+8
+
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+ eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
+
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+ eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+ movi $mod_constant.8b, #0xc2
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+ ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+ eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
+
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+ fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+ fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+ fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
+ eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
+ eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+ fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
+
+ aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
+ fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
+
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
+ rev $ctr32w, $rctr32w @ CTR block 4k+9
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+ eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
+ fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
+
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
+ rev $ctr32w, $rctr32w @ CTR block 4k+10
+
+ aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
+ st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
+ eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
+
+ aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
+
+ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
+ st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
+ st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
+ rev $ctr32w, $rctr32w @ CTR block 4k+11
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
+ eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
+ b.lt .L128_enc_main_loop
+
+ .L128_enc_prepretail: @ PREPRETAIL
+ rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
+ rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
+
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+
+ rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+ movi $mod_constant.8b, #0xc2
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+
+ pmull $t1.1q, $acc_h.1d, $mod_constant.1d
+ eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ ext $acc_hb, $acc_hb, $acc_hb, #8
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ eor $acc_mb, $acc_mb, $acc_lb
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+ eor $acc_mb, $acc_mb, $t1.16b
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ eor $acc_mb, $acc_mb, $acc_hb
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+
+ pmull $t1.1q, $acc_m.1d, $mod_constant.1d
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+ ext $acc_mb, $acc_mb, $acc_mb, #8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ eor $acc_lb, $acc_lb, $t1.16b
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+
+ aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+
+ aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
+
+ aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
+ eor $acc_lb, $acc_lb, $acc_mb
+
+ aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
+ .L128_enc_tail: @ TAIL
+
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
+
+ cmp $main_end_input_ptr, #48
+
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
+ eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
+ eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
+
+ fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
+
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
+
+ eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
+
+ b.gt .L128_enc_blocks_more_than_3
+
+ sub $rctr32w, $rctr32w, #1
+ movi $acc_l.8b, #0
+ mov $ctr3b, $ctr2b
+
+ cmp $main_end_input_ptr, #32
+ mov $ctr2b, $ctr1b
+ movi $acc_h.8b, #0
+
+ movi $acc_m.8b, #0
+ b.gt .L128_enc_blocks_more_than_2
+
+ mov $ctr3b, $ctr1b
+ cmp $main_end_input_ptr, #16
+
+ sub $rctr32w, $rctr32w, #1
+ b.gt .L128_enc_blocks_more_than_1
+
+ sub $rctr32w, $rctr32w, #1
+ b .L128_enc_blocks_less_than_1
+ .L128_enc_blocks_more_than_3: @ blocks left > 3
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
+
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
+
+ rev64 $res0b, $res1b @ GHASH final-3 block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+ eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
+ eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
+
+ fmov $res1d, $input_l0 @ AES final-2 block - mov low
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+ fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
+ mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
+
+ mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
+
+ eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
+
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
+ .L128_enc_blocks_more_than_2: @ blocks left > 2
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
+
+ rev64 $res0b, $res1b @ GHASH final-2 block
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
+
+ fmov $res1d, $input_l0 @ AES final-1 block - mov low
+ eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
+
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
+ fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
+
+ mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
+
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
+
+ eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
+
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
+ .L128_enc_blocks_more_than_1: @ blocks left > 1
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
+
+ rev64 $res0b, $res1b @ GHASH final-1 block
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
+ eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
+
+ fmov $res1d, $input_l0 @ AES final block - mov low
+
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
+ fmov $res1.d[1], $input_h0 @ AES final block - mov high
+
+ mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
+
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
+
+ eor $res1b, $res1b, $ctr3b @ AES final block - result
+
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
+
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+ .L128_enc_blocks_less_than_1: @ blocks left <= 1
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+ mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
+
+ mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
+
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
+ cmp $bit_length, #64
+
+ csel $input_l0, $rk10_l, $rk10_h, lt
+ csel $input_h0, $rk10_h, xzr, lt
+
+ fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
+
+ fmov $ctr0.d[1], $input_h0
+
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
+
+ rev64 $res0b, $res1b @ GHASH final block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ mov $t0d, $res0.d[1] @ GHASH final block - mid
+
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
+
+ rev $ctr32w, $rctr32w
+
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
+
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
+
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
+ movi $mod_constant.8b, #0xc2
+
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
+
+ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
+ st1 { $res1b}, [$output_ptr] @ store all 16B
+
+ str $ctr32w, [$counter, #12] @ store the updated counter
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+ mov x0, $len
+ st1 { $acc_l.16b }, [$current_tag]
+ ldp x21, x22, [sp, #16]
+ ldp x23, x24, [sp, #32]
+ ldp d8, d9, [sp, #48]
+ ldp d10, d11, [sp, #64]
+ ldp d12, d13, [sp, #80]
+ ldp d14, d15, [sp, #96]
+ ldp x19, x20, [sp], #112
+ ret
+
+.L128_enc_ret:
+ mov w0, #0x0
+ ret
+.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
+___
+
+#########################################################################################
+# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
+# size_t len,
+# unsigned char *out,
+# const void *key,
+# unsigned char ivec[16],
+# u64 *Xi);
+#
+$code.=<<___;
+.global aes_gcm_dec_128_kernel
+.type aes_gcm_dec_128_kernel,%function
+.align 4
+aes_gcm_dec_128_kernel:
+ cbz x1, .L128_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #64]
+ stp d12, d13, [sp, #80]
+ stp d14, d15, [sp, #96]
+
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
+ mov $len, $main_end_input_ptr
+ ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
+ ldr $rk0q, [$cc, #0] @ load rk0
+
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
+
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+ ext $h2b, $h2b, $h2b, #8
+
+ lsr $rctr32x, $ctr96_t32x, #32
+ fmov $ctr2d, $ctr96_b64x @ CTR block 2
+
+ ldr $rk1q, [$cc, #16] @ load rk1
+ orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
+ rev $rctr32w, $rctr32w @ rev_ctr32
+
+ fmov $ctr1d, $ctr96_b64x @ CTR block 1
+ add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
+ rev $ctr32w, $rctr32w @ CTR block 1
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
+ ldr $rk2q, [$cc, #32] @ load rk2
+ add $rctr32w, $rctr32w, #1 @ CTR block 1
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 1
+ rev $ctr32w, $rctr32w @ CTR block 2
+ add $rctr32w, $rctr32w, #1 @ CTR block 2
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 2
+ rev $ctr32w, $rctr32w @ CTR block 3
+
+ fmov $ctr3d, $ctr96_b64x @ CTR block 3
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
+ add $rctr32w, $rctr32w, #1 @ CTR block 3
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 3
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
+ ldr $rk3q, [$cc, #48] @ load rk3
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
+ ldr $rk6q, [$cc, #96] @ load rk6
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
+ ldr $rk7q, [$cc, #112] @ load rk7
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
+ ldr $rk4q, [$cc, #64] @ load rk4
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
+ ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
+ ld1 { $acc_lb}, [$current_tag]
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
+ ldr $rk5q, [$cc, #80] @ load rk5
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
+ ldr $rk9q, [$cc, #144] @ load rk9
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+ ext $h3b, $h3b, $h3b, #8
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
+ ldr $rk8q, [$cc, #128] @ load rk8
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+ ext $h1b, $h1b, $h1b, #8
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
+ trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
+
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+ ext $h4b, $h4b, $h4b, #8
+ trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
+ eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
+ trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
+ trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
+
+ aese $ctr2b, $rk9 @ AES block 2 - round 9
+
+ aese $ctr3b, $rk9 @ AES block 3 - round 9
+
+ aese $ctr0b, $rk9 @ AES block 0 - round 9
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
+
+ aese $ctr1b, $rk9 @ AES block 1 - round 9
+ eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
+ b.ge .L128_dec_tail @ handle tail
+
+ ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
+
+ ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
+
+ eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
+ ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
+
+ eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
+ rev64 $res0b, $res0b @ GHASH block 0
+ rev $ctr32w, $rctr32w @ CTR block 4
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
+ add $rctr32w, $rctr32w, #1 @ CTR block 4
+ ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
+
+ rev64 $res1b, $res1b @ GHASH block 1
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+ mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
+
+ mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
+
+ mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
+
+ mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
+
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4
+
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4
+ rev $ctr32w, $rctr32w @ CTR block 5
+ eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
+
+ fmov $ctr1d, $ctr96_b64x @ CTR block 5
+ add $rctr32w, $rctr32w, #1 @ CTR block 5
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 5
+ rev $ctr32w, $rctr32w @ CTR block 6
+ add $rctr32w, $rctr32w, #1 @ CTR block 6
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
+
+ eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
+ eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
+ eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
+
+ eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
+
+ stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
+ b.ge .L128_dec_prepretail @ do prepretail
+
+ .L128_dec_main_loop: @ main loop start
+ eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+ mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
+
+ rev64 $res2b, $res2b @ GHASH block 4k+2
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
+ rev $ctr32w, $rctr32w @ CTR block 4k+7
+
+ mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ rev64 $res3b, $res3b @ GHASH block 4k+3
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+ eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+ eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ movi $mod_constant.8b, #0xc2
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+ ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ rev $ctr32w, $rctr32w @ CTR block 4k+8
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+ ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+ eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ ldr $res3q, [$input_ptr, #48] @ AES block 4k+3 - load ciphertext
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+
+ rev64 $res1b, $res1b @ GHASH block 4k+5
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+ mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+ mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
+ rev $ctr32w, $rctr32w @ CTR block 4k+9
+
+ aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+ eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
+
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+ mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
+ eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
+
+ eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
+ mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
+
+ aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
+ fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
+
+ rev64 $res0b, $res0b @ GHASH block 4k+4
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
+
+ rev $ctr32w, $rctr32w @ CTR block 4k+10
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
+
+ eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
+
+ eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
+ stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
+ b.lt L128_dec_main_loop
+
+ .L128_dec_prepretail: @ PREPRETAIL
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
+
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
+ rev64 $res2b, $res2b @ GHASH block 4k+2
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
+
+ rev $ctr32w, $rctr32w @ CTR block 4k+7
+ mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+ mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ rev64 $res3b, $res3b @ GHASH block 4k+3
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+ movi $mod_constant.8b, #0xc2
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+
+ aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+ eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
+
+ aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
+ stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
+
+ aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
+ stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
+
+ aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ .L128_dec_tail: @ TAIL
+
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
+ ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
+
+ eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
+
+ mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
+
+ mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
+
+ cmp $main_end_input_ptr, #48
+
+ eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
+
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
+ eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
+ b.gt .L128_dec_blocks_more_than_3
+
+ mov $ctr3b, $ctr2b
+ sub $rctr32w, $rctr32w, #1
+ movi $acc_l.8b, #0
+
+ movi $acc_h.8b, #0
+ mov $ctr2b, $ctr1b
+
+ movi $acc_m.8b, #0
+ cmp $main_end_input_ptr, #32
+ b.gt .L128_dec_blocks_more_than_2
+
+ cmp $main_end_input_ptr, #16
+
+ mov $ctr3b, $ctr1b
+ sub $rctr32w, $rctr32w, #1
+ b.gt .L128_dec_blocks_more_than_1
+
+ sub $rctr32w, $rctr32w, #1
+ b .L128_dec_blocks_less_than_1
+ .L128_dec_blocks_more_than_3: @ blocks left > 3
+ rev64 $res0b, $res1b @ GHASH final-3 block
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
+ eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
+
+ mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
+ mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
+ mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+ eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
+
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
+ eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
+ .L128_dec_blocks_more_than_2: @ blocks left > 2
+
+ rev64 $res0b, $res1b @ GHASH final-2 block
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
+
+ mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
+
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
+
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
+ mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
+
+ mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
+
+ eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
+ eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
+ .L128_dec_blocks_more_than_1: @ blocks left > 1
+
+ rev64 $res0b, $res1b @ GHASH final-1 block
+
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
+
+ eor $ctr0b, $res1b, $ctr3b @ AES final block - result
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
+
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
+ mov $output_l0, $ctr0.d[0] @ AES final block - mov low
+
+ mov $output_h0, $ctr0.d[1] @ AES final block - mov high
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
+
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
+
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
+
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
+ eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
+
+ eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
+ .L128_dec_blocks_less_than_1: @ blocks left <= 1
+
+ mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
+
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
+ cmp $bit_length, #64
+
+ csel $ctr96_b64x, $rk10_h, xzr, lt
+ csel $ctr32x, $rk10_l, $rk10_h, lt
+
+ fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
+
+ mov $ctr0.d[1], $ctr96_b64x
+
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
+
+ rev64 $res0b, $res1b @ GHASH final block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
+
+ and $output_h0, $output_h0, $ctr96_b64x
+
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
+ mov $t0d, $res0.d[1] @ GHASH final block - mid
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
+
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
+
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
+ bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
+ and $output_l0, $output_l0, $ctr32x
+
+ rev $ctr32w, $rctr32w
+
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
+ movi $mod_constant.8b, #0xc2
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
+
+ bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ orr $output_l0, $output_l0, $end_input_ptr
+ str $ctr32w, [$counter, #12] @ store the updated counter
+
+ orr $output_h0, $output_h0, $main_end_input_ptr
+ stp $output_l0, $output_h0, [$output_ptr]
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+ mov x0, $len
+ st1 { $acc_l.16b }, [$current_tag]
+
+ ldp x21, x22, [sp, #16]
+ ldp x23, x24, [sp, #32]
+ ldp d8, d9, [sp, #48]
+ ldp d10, d11, [sp, #64]
+ ldp d12, d13, [sp, #80]
+ ldp d14, d15, [sp, #96]
+ ldp x19, x20, [sp], #112
+ ret
+
+ .L128_dec_ret:
+ mov w0, #0x0
+ ret
+.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
+___
+}
+
+{
+my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
+my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
+my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
+my ($output_l0,$output_h0)=map("x$_",(6..7));
+
+my $ctr32w="w9";
+my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
+my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
+
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
+my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
+my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
+
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
+my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
+
+my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
+my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
+my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
+
+my $t0="v8";
+my $t0d="d8";
+my $t3="v4";
+my $t3d="d4";
+
+my ($t1,$t2)=map("v$_",(30..31));
+my ($t1d,$t2d)=map("d$_",(30..31));
+
+my $t4="v30";
+my $t4d="d30";
+my $t5="v8";
+my $t5d="d8";
+my $t6="v31";
+my $t6d="d31";
+
+my $t7="v5";
+my $t7d="d5";
+my $t8="v6";
+my $t8d="d6";
+my $t9="v30";
+my $t9d="d30";
+
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
+my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
+
+my $mod_constantd="d8";
+my $mod_constant="v8";
+my $mod_t="v31";
+
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
+my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
+my $rk2q1="v20.1q";
+my $rk3q1="v21.1q";
+my $rk4v="v22";
+my $rk4d="d22";
+
+#########################################################################################
+# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
+# size_t len,
+# unsigned char *out,
+# const void *key,
+# unsigned char ivec[16],
+# u64 *Xi);
+#
+$code.=<<___;
+.global aes_gcm_enc_192_kernel
+.type aes_gcm_enc_192_kernel,%function
+.align 4
+aes_gcm_enc_192_kernel:
+ cbz x1, .L192_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #64]
+ stp d12, d13, [sp, #80]
+ stp d14, d15, [sp, #96]
+
+ ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+
+ ldr $rk5q, [$cc, #80] @ load rk5
+
+ ldr $rk4q, [$cc, #64] @ load rk4
+
+ ldr $rk8q, [$cc, #128] @ load rk8
+
+ lsr $rctr32x, $ctr96_t32x, #32
+ ldr $rk6q, [$cc, #96] @ load rk6
+ orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
+
+ ldr $rk7q, [$cc, #112] @ load rk7
+ rev $rctr32w, $rctr32w @ rev_ctr32
+
+ add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
+ fmov $ctr3d, $ctr96_b64x @ CTR block 3
+
+ rev $ctr32w, $rctr32w @ CTR block 1
+ add $rctr32w, $rctr32w, #1 @ CTR block 1
+ fmov $ctr1d, $ctr96_b64x @ CTR block 1
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
+ ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 1
+ rev $ctr32w, $rctr32w @ CTR block 2
+ add $rctr32w, $rctr32w, #1 @ CTR block 2
+
+ fmov $ctr2d, $ctr96_b64x @ CTR block 2
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 2
+ rev $ctr32w, $rctr32w @ CTR block 3
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
+ ldr $rk0q, [$cc, #0] @ load rk0
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 3
+
+ ldr $rk3q, [$cc, #48] @ load rk3
+
+ ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
+
+ ldr $rk1q, [$cc, #16] @ load rk1
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
+ ld1 { $acc_lb}, [$current_tag]
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
+ ldr $rk11q, [$cc, #176] @ load rk11
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+ ext $h4b, $h4b, $h4b, #8
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
+ ldr $rk2q, [$cc, #32] @ load rk2
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
+ ldr $rk10q, [$cc, #160] @ load rk10
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+ ext $h1b, $h1b, $h1b, #8
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
+ ldr $rk9q, [$cc, #144] @ load rk9
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+ ext $h3b, $h3b, $h3b, #8
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
+ trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
+ trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+ ext $h2b, $h2b, $h2b, #8
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
+ trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
+ trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
+ mov $len, $main_end_input_ptr
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
+
+ eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
+
+ aese $ctr2b, $rk11 @ AES block 2 - round 11
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+ aese $ctr1b, $rk11 @ AES block 1 - round 11
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
+
+ aese $ctr0b, $rk11 @ AES block 0 - round 11
+ add $rctr32w, $rctr32w, #1 @ CTR block 3
+
+ aese $ctr3b, $rk11 @ AES block 3 - round 11
+ b.ge .L192_enc_tail @ handle tail
+
+ rev $ctr32w, $rctr32w @ CTR block 4
+ ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
+ ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
+
+ ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
+
+ ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
+
+ eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
+
+ eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
+ eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
+ fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
+
+ eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
+
+ eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
+ eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
+
+ fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
+ eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
+
+ fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
+
+ eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
+ fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 4
+ eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4
+
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4
+ rev $ctr32w, $rctr32w @ CTR block 5
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
+ add $rctr32w, $rctr32w, #1 @ CTR block 5
+
+ fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
+ st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
+
+ fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
+
+ eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
+ fmov $ctr1d, $ctr96_b64x @ CTR block 5
+ st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
+
+ fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 5
+ rev $ctr32w, $rctr32w @ CTR block 6
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 6
+ eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
+ fmov $ctr2d, $ctr96_b64x @ CTR block 6
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 6
+ rev $ctr32w, $rctr32w @ CTR block 7
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
+ st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
+
+ eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
+ st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
+ b.ge .L192_enc_prepretail @ do prepretail
+
+ .L192_enc_main_loop: @ main loop start
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
+
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
+ rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+ rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+ eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+ eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ rev $ctr32w, $rctr32w @ CTR block 4k+8
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+ ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+ movi $mod_constant.8b, #0xc2
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+ eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+ eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+ fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
+ fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
+
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+ fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+
+ aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+
+ eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
+
+ aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
+ rev $ctr32w, $rctr32w @ CTR block 4k+9
+
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
+ st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
+
+ eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
+ fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
+
+ aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
+ fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
+ rev $ctr32w, $rctr32w @ CTR block 4k+10
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
+ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
+
+ aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
+ eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
+
+ st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
+ rev $ctr32w, $rctr32w @ CTR block 4k+11
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
+
+ eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
+ st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
+ b.lt .L192_enc_main_loop
+
+ .L192_enc_prepretail: @ PREPRETAIL
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
+
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+ movi $mod_constant.8b, #0xc2
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+ eor $acc_mb, $acc_mb, $acc_lb
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+
+ pmull $t1.1q, $acc_h.1d, $mod_constant.1d
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ ext $acc_hb, $acc_hb, $acc_hb, #8
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ eor $acc_mb, $acc_mb, $t1.16b
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+ eor $acc_mb, $acc_mb, $acc_hb
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+
+ pmull $t1.1q, $acc_m.1d, $mod_constant.1d
+
+ ext $acc_mb, $acc_mb, $acc_mb, #8
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+ eor $acc_lb, $acc_lb, $t1.16b
+
+ aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
+
+ aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
+
+ aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
+
+ aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
+ eor $acc_lb, $acc_lb, $acc_mb
+ .L192_enc_tail: @ TAIL
+
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
+
+ eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
+ eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
+
+ fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
+
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
+ cmp $main_end_input_ptr, #48
+
+ eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
+
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
+ b.gt .L192_enc_blocks_more_than_3
+
+ sub $rctr32w, $rctr32w, #1
+ movi $acc_m.8b, #0
+
+ mov $ctr3b, $ctr2b
+ movi $acc_h.8b, #0
+ cmp $main_end_input_ptr, #32
+
+ mov $ctr2b, $ctr1b
+ movi $acc_l.8b, #0
+ b.gt .L192_enc_blocks_more_than_2
+
+ sub $rctr32w, $rctr32w, #1
+
+ mov $ctr3b, $ctr1b
+ cmp $main_end_input_ptr, #16
+ b.gt .L192_enc_blocks_more_than_1
+
+ sub $rctr32w, $rctr32w, #1
+ b .L192_enc_blocks_less_than_1
+ .L192_enc_blocks_more_than_3: @ blocks left > 3
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
+
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
+
+ rev64 $res0b, $res1b @ GHASH final-3 block
+
+ eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
+ fmov $res1d, $input_l0 @ AES final-2 block - mov low
+
+ fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
+
+ mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
+
+ mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
+
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
+ eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
+ .L192_enc_blocks_more_than_2: @ blocks left > 2
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
+
+ rev64 $res0b, $res1b @ GHASH final-2 block
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
+
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
+ mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
+
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
+ eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
+
+ fmov $res1d, $input_l0 @ AES final-1 block - mov low
+
+ fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
+
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
+ .L192_enc_blocks_more_than_1: @ blocks left > 1
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
+
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
+
+ rev64 $res0b, $res1b @ GHASH final-1 block
+
+ eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
+ eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
+ fmov $res1d, $input_l0 @ AES final block - mov low
+
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
+ fmov $res1.d[1], $input_h0 @ AES final block - mov high
+
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
+
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
+
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
+
+ eor $res1b, $res1b, $ctr3b @ AES final block - result
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
+ .L192_enc_blocks_less_than_1: @ blocks left <= 1
+
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
+ rev $ctr32w, $rctr32w
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
+ mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
+
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
+ mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
+ cmp $bit_length, #64
+
+ csel $input_l0, $rk12_l, $rk12_h, lt
+ csel $input_h0, $rk12_h, xzr, lt
+
+ fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
+
+ fmov $ctr0.d[1], $input_h0
+
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
+
+ rev64 $res0b, $res1b @ GHASH final block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ mov $t0d, $res0.d[1] @ GHASH final block - mid
+
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
+
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
+
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
+
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
+ movi $mod_constant.8b, #0xc2
+
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
+
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
+ str $ctr32w, [$counter, #12] @ store the updated counter
+
+ st1 { $res1b}, [$output_ptr] @ store all 16B
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+ mov x0, $len
+ st1 { $acc_l.16b }, [$current_tag]
+
+ ldp x21, x22, [sp, #16]
+ ldp x23, x24, [sp, #32]
+ ldp d8, d9, [sp, #48]
+ ldp d10, d11, [sp, #64]
+ ldp d12, d13, [sp, #80]
+ ldp d14, d15, [sp, #96]
+ ldp x19, x20, [sp], #112
+ ret
+
+.L192_enc_ret:
+ mov w0, #0x0
+ ret
+.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
+___
+
+#########################################################################################
+# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
+# size_t len,
+# unsigned char *out,
+# const void *key,
+# unsigned char ivec[16],
+# u64 *Xi);
+#
+$code.=<<___;
+.global aes_gcm_dec_192_kernel
+.type aes_gcm_dec_192_kernel,%function
+.align 4
+aes_gcm_dec_192_kernel:
+ cbz x1, .L192_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #64]
+ stp d12, d13, [sp, #80]
+ stp d14, d15, [sp, #96]
+
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
+ ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+
+ ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
+
+ ldr $rk0q, [$cc, #0] @ load rk0
+
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
+ mov $len, $main_end_input_ptr
+ ldr $rk2q, [$cc, #32] @ load rk2
+
+ lsr $rctr32x, $ctr96_t32x, #32
+ orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
+ fmov $ctr3d, $ctr96_b64x @ CTR block 3
+
+ rev $rctr32w, $rctr32w @ rev_ctr32
+ fmov $ctr1d, $ctr96_b64x @ CTR block 1
+
+ add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
+ ldr $rk1q, [$cc, #16] @ load rk1
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
+ rev $ctr32w, $rctr32w @ CTR block 1
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 1
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
+ ldr $rk3q, [$cc, #48] @ load rk3
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 1
+ rev $ctr32w, $rctr32w @ CTR block 2
+ add $rctr32w, $rctr32w, #1 @ CTR block 2
+
+ fmov $ctr2d, $ctr96_b64x @ CTR block 2
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 2
+ rev $ctr32w, $rctr32w @ CTR block 3
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 3
+
+ ldr $rk8q, [$cc, #128] @ load rk8
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
+ ldr $rk11q, [$cc, #176] @ load rk11
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+ ext $h4b, $h4b, $h4b, #8
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+ ext $h2b, $h2b, $h2b, #8
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+ ext $h3b, $h3b, $h3b, #8
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
+ ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+ ext $h1b, $h1b, $h1b, #8
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
+ ldr $rk10q, [$cc, #160] @ load rk10
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
+ ldr $rk9q, [$cc, #144] @ load rk9
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
+ ldr $rk7q, [$cc, #112] @ load rk7
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
+ ldr $rk4q, [$cc, #64] @ load rk4
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
+ ld1 { $acc_lb}, [$current_tag]
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
+ add $rctr32w, $rctr32w, #1 @ CTR block 3
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
+ trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
+ ldr $rk5q, [$cc, #80] @ load rk5
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
+ trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
+ trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
+ ldr $rk6q, [$cc, #96] @ load rk6
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
+ trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
+
+ aese $ctr3b, $rk11 @ AES block 3 - round 11
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
+ eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
+
+ aese $ctr2b, $rk11 @ AES block 2 - round 11
+
+ aese $ctr1b, $rk11 @ AES block 1 - round 11
+ eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
+
+ aese $ctr0b, $rk11 @ AES block 0 - round 11
+ b.ge .L192_dec_tail @ handle tail
+
+ ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
+
+ ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
+
+ eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
+
+ eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
+ rev $ctr32w, $rctr32w @ CTR block 4
+ ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
+
+ ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
+
+ mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
+
+ mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
+
+ mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
+ add $rctr32w, $rctr32w, #1 @ CTR block 4
+
+ mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
+ rev64 $res0b, $res0b @ GHASH block 0
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4
+ rev64 $res1b, $res1b @ GHASH block 1
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
+
+ eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4
+ rev $ctr32w, $rctr32w @ CTR block 5
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
+ fmov $ctr1d, $ctr96_b64x @ CTR block 5
+ eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 5
+ fmov $ctr1.d[1], $ctr32x @ CTR block 5
+ eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
+
+ rev $ctr32w, $rctr32w @ CTR block 6
+ eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
+
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
+
+ stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 6
+ eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
+ b.ge .L192_dec_prepretail @ do prepretail
+
+ .L192_dec_main_loop: @ main loop start
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
+
+ mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
+ eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
+ rev64 $res3b, $res3b @ GHASH block 4k+3
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+ rev $ctr32w, $rctr32w @ CTR block 4k+7
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+ eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ rev64 $res2b, $res2b @ GHASH block 4k+2
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+ eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ movi $mod_constant.8b, #0xc2
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
+ eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+ ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
+
+ aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
+ ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
+ rev $ctr32w, $rctr32w @ CTR block 4k+8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+ stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
+
+ eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
+ eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
+ eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
+
+ mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
+ stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
+ rev64 $res1b, $res1b @ GHASH block 4k+5
+
+ aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
+ mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+ mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
+
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
+ rev $ctr32w, $rctr32w @ CTR block 4k+9
+
+ eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
+ eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
+ rev $ctr32w, $rctr32w @ CTR block 4k+10
+ eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
+
+ eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
+ rev64 $res0b, $res0b @ GHASH block 4k+4
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
+
+ aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
+ stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
+ b.lt .L192_dec_main_loop
+
+ .L192_dec_prepretail: @ PREPRETAIL
+ mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ rev64 $res2b, $res2b @ GHASH block 4k+2
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
+ rev $ctr32w, $rctr32w @ CTR block 4k+7
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+ eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+ eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
+ stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
+
+ rev64 $res3b, $res3b @ GHASH block 4k+3
+ stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+ movi $mod_constant.8b, #0xc2
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+
+ aese $ctr0b, $rk11
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ aese $ctr2b, $rk11
+
+ aese $ctr1b, $rk11
+
+ aese $ctr3b, $rk11
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ .L192_dec_tail: @ TAIL
+
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
+ ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
+
+ eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
+
+ mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
+
+ mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
+
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
+
+ cmp $main_end_input_ptr, #48
+
+ eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
+
+ eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
+ b.gt .L192_dec_blocks_more_than_3
+
+ movi $acc_l.8b, #0
+ movi $acc_h.8b, #0
+
+ mov $ctr3b, $ctr2b
+ mov $ctr2b, $ctr1b
+ sub $rctr32w, $rctr32w, #1
+
+ movi $acc_m.8b, #0
+ cmp $main_end_input_ptr, #32
+ b.gt .L192_dec_blocks_more_than_2
+
+ mov $ctr3b, $ctr1b
+ cmp $main_end_input_ptr, #16
+ sub $rctr32w, $rctr32w, #1
+
+ b.gt .L192_dec_blocks_more_than_1
+
+ sub $rctr32w, $rctr32w, #1
+ b .L192_dec_blocks_less_than_1
+ .L192_dec_blocks_more_than_3: @ blocks left > 3
+ rev64 $res0b, $res1b @ GHASH final-3 block
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
+
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
+ mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
+ mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
+
+ mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
+
+ mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
+
+ eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
+ eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
+ .L192_dec_blocks_more_than_2: @ blocks left > 2
+
+ rev64 $res0b, $res1b @ GHASH final-2 block
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
+
+ mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
+
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
+
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
+ mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
+ mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
+
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
+
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
+ eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
+
+ eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
+ .L192_dec_blocks_more_than_1: @ blocks left > 1
+
+ rev64 $res0b, $res1b @ GHASH final-1 block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
+
+ mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
+
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
+
+ eor $ctr0b, $res1b, $ctr3b @ AES final block - result
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
+
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
+ mov $output_h0, $ctr0.d[1] @ AES final block - mov high
+
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
+ mov $output_l0, $ctr0.d[0] @ AES final block - mov low
+
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
+ eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
+
+ eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
+ .L192_dec_blocks_less_than_1: @ blocks left <= 1
+
+ mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
+ ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
+
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+ mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
+
+ lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
+ cmp $bit_length, #64
+
+ csel $ctr32x, $rk12_l, $rk12_h, lt
+ csel $ctr96_b64x, $rk12_h, xzr, lt
+
+ fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
+ and $output_l0, $output_l0, $ctr32x
+ bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
+
+ orr $output_l0, $output_l0, $end_input_ptr
+ mov $ctr0.d[1], $ctr96_b64x
+
+ rev $ctr32w, $rctr32w
+
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
+ str $ctr32w, [$counter, #12] @ store the updated counter
+
+ rev64 $res0b, $res1b @ GHASH final block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+ bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
+
+ and $output_h0, $output_h0, $ctr96_b64x
+
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
+ mov $t0d, $res0.d[1] @ GHASH final block - mid
+
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
+
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
+
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
+ movi $mod_constant.8b, #0xc2
+
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ orr $output_h0, $output_h0, $main_end_input_ptr
+ stp $output_l0, $output_h0, [$output_ptr]
+
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+ mov x0, $len
+ st1 { $acc_l.16b }, [$current_tag]
+
+ ldp x21, x22, [sp, #16]
+ ldp x23, x24, [sp, #32]
+ ldp d8, d9, [sp, #48]
+ ldp d10, d11, [sp, #64]
+ ldp d12, d13, [sp, #80]
+ ldp d14, d15, [sp, #96]
+ ldp x19, x20, [sp], #112
+ ret
+
+.L192_dec_ret:
+ mov w0, #0x0
+ ret
+.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
+___
+}
+
+{
+my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
+my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
+my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
+my ($output_l0,$output_h0)=map("x$_",(6..7));
+
+my $ctr32w="w9";
+my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
+my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
+
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
+my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
+my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
+
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
+my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
+
+my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
+my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
+my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
+
+my $t0="v8";
+my $t0d="d8";
+my $t1="v4";
+my $t1d="d4";
+my $t2="v8";
+my $t2d="d8";
+my $t3="v4";
+my $t3d="d4";
+my $t4="v4";
+my $t4d="d4";
+my $t5="v5";
+my $t5d="d5";
+my $t6="v8";
+my $t6d="d8";
+my $t7="v5";
+my $t7d="d5";
+my $t8="v6";
+my $t8d="d6";
+my $t9="v4";
+my $t9d="d4";
+
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
+my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
+
+my $mod_constantd="d8";
+my $mod_constant="v8";
+my $mod_t="v7";
+
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
+my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
+my $rk2q1="v20.1q";
+my $rk3q1="v21.1q";
+my $rk4v="v22";
+my $rk4d="d22";
+
+#########################################################################################
+# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
+# size_t len,
+# unsigned char *out,
+# const void *key,
+# unsigned char ivec[16],
+# u64 *Xi);
+#
+$code.=<<___;
+.global aes_gcm_enc_256_kernel
+.type aes_gcm_enc_256_kernel,%function
+.align 4
+aes_gcm_enc_256_kernel:
+ cbz x1, .L256_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #64]
+ stp d12, d13, [sp, #80]
+ stp d14, d15, [sp, #96]
+
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
+ mov $len, $main_end_input_ptr
+ ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+
+ ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
+
+ ldr $rk0q, [$cc, #0] @ load rk0
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ ldr $rk7q, [$cc, #112] @ load rk7
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+ lsr $rctr32x, $ctr96_t32x, #32
+ fmov $ctr2d, $ctr96_b64x @ CTR block 2
+ orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
+
+ rev $rctr32w, $rctr32w @ rev_ctr32
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
+ fmov $ctr1d, $ctr96_b64x @ CTR block 1
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
+ add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
+
+ rev $ctr32w, $rctr32w @ CTR block 1
+ fmov $ctr3d, $ctr96_b64x @ CTR block 3
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
+ add $rctr32w, $rctr32w, #1 @ CTR block 1
+ ldr $rk1q, [$cc, #16] @ load rk1
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 1
+ rev $ctr32w, $rctr32w @ CTR block 2
+ add $rctr32w, $rctr32w, #1 @ CTR block 2
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
+ ldr $rk2q, [$cc, #32] @ load rk2
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 2
+ rev $ctr32w, $rctr32w @ CTR block 3
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 3
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
+ ldr $rk3q, [$cc, #48] @ load rk3
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
+ ldr $rk6q, [$cc, #96] @ load rk6
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
+ ldr $rk5q, [$cc, #80] @ load rk5
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+ ext $h3b, $h3b, $h3b, #8
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
+ ldr $rk13q, [$cc, #208] @ load rk13
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
+ ldr $rk4q, [$cc, #64] @ load rk4
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+ ext $h2b, $h2b, $h2b, #8
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
+ ldr $rk12q, [$cc, #192] @ load rk12
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+ ext $h4b, $h4b, $h4b, #8
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
+ ldr $rk11q, [$cc, #176] @ load rk11
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
+ ldr $rk8q, [$cc, #128] @ load rk8
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
+ add $rctr32w, $rctr32w, #1 @ CTR block 3
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
+ ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
+ ld1 { $acc_lb}, [$current_tag]
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
+ trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
+ ldr $rk9q, [$cc, #144] @ load rk9
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+ ext $h1b, $h1b, $h1b, #8
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
+ ldr $rk10q, [$cc, #160] @ load rk10
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
+ trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
+ trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
+
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
+
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
+
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
+
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
+
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
+ eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
+
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
+
+ aese $ctr2b, $rk13 @ AES block 2 - round 13
+ trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
+
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
+
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
+
+ aese $ctr1b, $rk13 @ AES block 1 - round 13
+
+ aese $ctr0b, $rk13 @ AES block 0 - round 13
+
+ aese $ctr3b, $rk13 @ AES block 3 - round 13
+ eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
+ b.ge .L256_enc_tail @ handle tail
+
+ ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
+
+ rev $ctr32w, $rctr32w @ CTR block 4
+ ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
+
+ ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
+
+ ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+
+ eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
+ eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
+
+ fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
+ eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
+
+ eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
+ eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
+ fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
+
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
+ eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
+
+ eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
+ fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
+
+ fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
+ add $rctr32w, $rctr32w, #1 @ CTR block 4
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
+ fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
+ eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
+
+ fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
+
+ eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4
+
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4
+ rev $ctr32w, $rctr32w @ CTR block 5
+ add $rctr32w, $rctr32w, #1 @ CTR block 5
+
+ eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
+ fmov $ctr1d, $ctr96_b64x @ CTR block 5
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 5
+ rev $ctr32w, $rctr32w @ CTR block 6
+ st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
+
+ fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
+ eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 6
+ fmov $ctr2d, $ctr96_b64x @ CTR block 6
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 6
+ st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
+ rev $ctr32w, $rctr32w @ CTR block 7
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
+
+ eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
+ st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
+ b.ge L256_enc_prepretail @ do prepretail
+
+ .L256_enc_main_loop: @ main loop start
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+ eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+ movi $mod_constant.8b, #0xc2
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+ fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+ ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
+
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ rev $ctr32w, $rctr32w @ CTR block 4k+8
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+ eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
+
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+ eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
+
+ fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
+ eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
+ eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
+
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
+ eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
+
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
+
+ aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
+ fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
+
+ aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
+ fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
+
+ fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
+
+ fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
+
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
+
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
+ rev $ctr32w, $rctr32w @ CTR block 4k+9
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
+
+ eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
+ fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
+
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
+ fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
+
+ aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
+ rev $ctr32w, $rctr32w @ CTR block 4k+10
+ st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
+ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
+ fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
+
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+ st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
+
+ aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
+ eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
+
+ st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
+ rev $ctr32w, $rctr32w @ CTR block 4k+11
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
+
+ eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
+ st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
+ b.lt L256_enc_main_loop
+
+ .L256_enc_prepretail: @ PREPRETAIL
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+ rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+ rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ movi $mod_constant.8b, #0xc2
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+
+ eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
+
+ pmull $t1.1q, $acc_h.1d, $mod_constant.1d
+ ext $acc_hb, $acc_hb, $acc_hb, #8
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ eor $acc_mb, $acc_mb, $acc_lb
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
+ eor $acc_mb, $acc_mb, $t1.16b
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
+
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
+ eor $acc_mb, $acc_mb, $acc_hb
+
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
+
+ pmull $t1.1q, $acc_m.1d, $mod_constant.1d
+
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
+ ext $acc_mb, $acc_mb, $acc_mb, #8
+
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
+
+ aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
+ eor $acc_lb, $acc_lb, $t1.16b
+
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
+
+ aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
+
+ aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
+
+ aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
+ eor $acc_lb, $acc_lb, $acc_mb
+ .L256_enc_tail: @ TAIL
+
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
+
+ eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
+ eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
+
+ cmp $main_end_input_ptr, #48
+ fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
+
+ fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
+
+ eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
+ b.gt .L256_enc_blocks_more_than_3
+
+ cmp $main_end_input_ptr, #32
+ mov $ctr3b, $ctr2b
+ movi $acc_l.8b, #0
+
+ movi $acc_h.8b, #0
+ sub $rctr32w, $rctr32w, #1
+
+ mov $ctr2b, $ctr1b
+ movi $acc_m.8b, #0
+ b.gt .L256_enc_blocks_more_than_2
+
+ mov $ctr3b, $ctr1b
+ sub $rctr32w, $rctr32w, #1
+ cmp $main_end_input_ptr, #16
+
+ b.gt .L256_enc_blocks_more_than_1
+
+ sub $rctr32w, $rctr32w, #1
+ b .L256_enc_blocks_less_than_1
+ .L256_enc_blocks_more_than_3: @ blocks left > 3
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
+
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
+
+ rev64 $res0b, $res1b @ GHASH final-3 block
+
+ eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
+
+ mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
+ fmov $res1d, $input_l0 @ AES final-2 block - mov low
+
+ fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
+
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
+ eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
+ .L256_enc_blocks_more_than_2: @ blocks left > 2
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
+
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
+
+ rev64 $res0b, $res1b @ GHASH final-2 block
+
+ eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ fmov $res1d, $input_l0 @ AES final-1 block - mov low
+ eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
+
+ fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
+ mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
+
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
+
+ eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
+
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
+ .L256_enc_blocks_more_than_1: @ blocks left > 1
+
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
+
+ rev64 $res0b, $res1b @ GHASH final-1 block
+
+ ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
+ mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
+
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
+ eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
+
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
+ fmov $res1d, $input_l0 @ AES final block - mov low
+
+ fmov $res1.d[1], $input_h0 @ AES final block - mov high
+
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
+
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
+
+ eor $res1b, $res1b, $ctr3b @ AES final block - result
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
+ .L256_enc_blocks_less_than_1: @ blocks left <= 1
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
+
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
+
+ mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
+ cmp $bit_length, #64
+
+ csel $input_l0, $rk14_l, $rk14_h, lt
+ csel $input_h0, $rk14_h, xzr, lt
+
+ fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
+
+ fmov $ctr0.d[1], $input_h0
+
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
+
+ rev64 $res0b, $res1b @ GHASH final block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
+
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
+ mov $t0d, $res0.d[1] @ GHASH final block - mid
+ rev $ctr32w, $rctr32w
+
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
+
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
+
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
+ movi $mod_constant.8b, #0xc2
+
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ str $ctr32w, [$counter, #12] @ store the updated counter
+
+ st1 { $res1b}, [$output_ptr] @ store all 16B
+ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+ mov x0, $len
+ st1 { $acc_l.16b }, [$current_tag]
+
+ ldp x21, x22, [sp, #16]
+ ldp x23, x24, [sp, #32]
+ ldp d8, d9, [sp, #48]
+ ldp d10, d11, [sp, #64]
+ ldp d12, d13, [sp, #80]
+ ldp d14, d15, [sp, #96]
+ ldp x19, x20, [sp], #112
+ ret
+
+.L256_enc_ret:
+ mov w0, #0x0
+ ret
+.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
+___
+
+{
+my $t8="v4";
+my $t8d="d4";
+my $t9="v6";
+my $t9d="d6";
+#########################################################################################
+# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
+# size_t len,
+# unsigned char *out,
+# const void *key,
+# unsigned char ivec[16],
+# u64 *Xi);
+#
+$code.=<<___;
+.global aes_gcm_dec_256_kernel
+.type aes_gcm_dec_256_kernel,%function
+.align 4
+aes_gcm_dec_256_kernel:
+ cbz x1, .L256_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #64]
+ stp d12, d13, [sp, #80]
+ stp d14, d15, [sp, #96]
+
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
+ mov $len, $main_end_input_ptr
+ ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+
+ ldr $rk8q, [$cc, #128] @ load rk8
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
+
+ ldr $rk7q, [$cc, #112] @ load rk7
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
+ ldr $rk6q, [$cc, #96] @ load rk6
+
+ lsr $rctr32x, $ctr96_t32x, #32
+ ldr $rk5q, [$cc, #80] @ load rk5
+ orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
+
+ ldr $rk3q, [$cc, #48] @ load rk3
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+ rev $rctr32w, $rctr32w @ rev_ctr32
+
+ add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
+ fmov $ctr3d, $ctr96_b64x @ CTR block 3
+
+ rev $ctr32w, $rctr32w @ CTR block 1
+ add $rctr32w, $rctr32w, #1 @ CTR block 1
+ fmov $ctr1d, $ctr96_b64x @ CTR block 1
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
+ ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 1
+ rev $ctr32w, $rctr32w @ CTR block 2
+ add $rctr32w, $rctr32w, #1 @ CTR block 2
+
+ fmov $ctr2d, $ctr96_b64x @ CTR block 2
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 2
+ rev $ctr32w, $rctr32w @ CTR block 3
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
+ ldr $rk0q, [$cc, #0] @ load rk0
+
+ fmov $ctr3.d[1], $ctr32x @ CTR block 3
+ add $rctr32w, $rctr32w, #1 @ CTR block 3
+
+ ldr $rk4q, [$cc, #64] @ load rk4
+
+ ldr $rk13q, [$cc, #208] @ load rk13
+
+ ldr $rk1q, [$cc, #16] @ load rk1
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+ ext $h3b, $h3b, $h3b, #8
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+ ext $h4b, $h4b, $h4b, #8
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+ ext $h2b, $h2b, $h2b, #8
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
+ ldr $rk2q, [$cc, #32] @ load rk2
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
+ ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
+ ld1 { $acc_lb}, [$current_tag]
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
+ ldr $rk9q, [$cc, #144] @ load rk9
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
+ ldr $rk12q, [$cc, #192] @ load rk12
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+ ext $h1b, $h1b, $h1b, #8
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
+ ldr $rk10q, [$cc, #160] @ load rk10
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
+ ldr $rk11q, [$cc, #176] @ load rk11
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
+
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
+
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
+
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
+
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
+
+ trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
+
+ trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
+
+ trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
+ trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
+
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
+
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
+
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
+
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
+ eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
+
+ aese $ctr1b, $rk13 @ AES block 1 - round 13
+
+ aese $ctr2b, $rk13 @ AES block 2 - round 13
+ eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
+
+ aese $ctr3b, $rk13 @ AES block 3 - round 13
+
+ aese $ctr0b, $rk13 @ AES block 0 - round 13
+ b.ge .L256_dec_tail @ handle tail
+
+ ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
+
+ ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
+
+ rev $ctr32w, $rctr32w @ CTR block 4
+
+ eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
+
+ eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
+ rev64 $res1b, $res1b @ GHASH block 1
+ ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
+
+ mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
+
+ mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
+ rev64 $res0b, $res0b @ GHASH block 0
+ add $rctr32w, $rctr32w, #1 @ CTR block 4
+
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
+
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4
+ rev $ctr32w, $rctr32w @ CTR block 5
+ add $rctr32w, $rctr32w, #1 @ CTR block 5
+
+ mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
+
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
+ mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
+ eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
+
+ eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
+ fmov $ctr1d, $ctr96_b64x @ CTR block 5
+
+ ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 5
+ rev $ctr32w, $rctr32w @ CTR block 6
+ add $rctr32w, $rctr32w, #1 @ CTR block 6
+
+ eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
+
+ eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
+ stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
+
+ eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
+ b.ge .L256_dec_prepretail @ do prepretail
+
+ .L256_dec_main_loop: @ main loop start
+ mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+ rev $ctr32w, $rctr32w @ CTR block 4k+7
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+ rev64 $res2b, $res2b @ GHASH block 4k+2
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+ eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+ stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+ rev64 $res3b, $res3b @ GHASH block 4k+3
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+ eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+ rev $ctr32w, $rctr32w @ CTR block 4k+8
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+ movi $mod_constant.8b, #0xc2
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+ ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
+
+ aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+ ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+ eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
+
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
+ stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+ ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
+
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
+ ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
+
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
+ mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
+ add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+ mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
+
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
+ fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
+
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
+ fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
+ rev $ctr32w, $rctr32w @ CTR block 4k+9
+
+ aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
+
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
+
+ eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
+ eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
+
+ mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
+ eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
+ mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
+
+ fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
+ rev $ctr32w, $rctr32w @ CTR block 4k+10
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
+
+ aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
+
+ rev64 $res1b, $res1b @ GHASH block 4k+5
+ eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
+
+ eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
+ stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
+
+ rev64 $res0b, $res0b @ GHASH block 4k+4
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ b.lt .L256_dec_main_loop
+
+
+ .L256_dec_prepretail: @ PREPRETAIL
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
+ mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
+ eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
+
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
+ mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
+
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
+ fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
+
+ fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
+ rev $ctr32w, $rctr32w @ CTR block 4k+7
+ eor $res0b, $res0b, $acc_lb @ PRE 1
+
+ rev64 $res2b, $res2b @ GHASH block 4k+2
+ orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
+ mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
+
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
+ mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
+ mov $t0d, $res0.d[1] @ GHASH block 4k - mid
+ fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
+ fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
+
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
+ mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
+
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
+
+ pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
+
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
+ rev64 $res3b, $res3b @ GHASH block 4k+3
+
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
+
+ pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
+ eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
+
+ pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
+
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
+ mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
+
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
+
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
+ eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
+
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
+
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
+ mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
+
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
+ eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
+
+ pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
+
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
+
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
+ eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
+
+ pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
+
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
+ eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
+
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
+
+ pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
+ eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
+
+ pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
+
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
+ ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
+
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
+ eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
+
+ pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
+
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
+ mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
+
+ pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
+ eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
+
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
+
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
+ eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
+
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
+
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
+ movi $mod_constant.8b, #0xc2
+
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
+ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
+
+ pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
+
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
+ eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
+
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
+
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
+ eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
+
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
+
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
+
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
+
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
+
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
+
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
+
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
+ eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
+
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
+ eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
+
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
+ add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
+
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
+ eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
+
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+ eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
+
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
+ stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
+
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
+ stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
+
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
+
+ aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
+
+ aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
+
+ aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ .L256_dec_tail: @ TAIL
+
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
+ ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
+
+ eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
+
+ mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
+
+ mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
+
+ cmp $main_end_input_ptr, #48
+
+ eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
+
+ eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
+ b.gt .L256_dec_blocks_more_than_3
+
+ sub $rctr32w, $rctr32w, #1
+ mov $ctr3b, $ctr2b
+ movi $acc_m.8b, #0
+
+ movi $acc_l.8b, #0
+ cmp $main_end_input_ptr, #32
+
+ movi $acc_h.8b, #0
+ mov $ctr2b, $ctr1b
+ b.gt .L256_dec_blocks_more_than_2
+
+ sub $rctr32w, $rctr32w, #1
+
+ mov $ctr3b, $ctr1b
+ cmp $main_end_input_ptr, #16
+ b.gt .L256_dec_blocks_more_than_1
+
+ sub $rctr32w, $rctr32w, #1
+ b .L256_dec_blocks_less_than_1
+ .L256_dec_blocks_more_than_3: @ blocks left > 3
+ rev64 $res0b, $res1b @ GHASH final-3 block
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
+
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
+
+ mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
+
+ mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
+
+ mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
+
+ mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
+
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
+
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
+ eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
+
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
+ eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
+ .L256_dec_blocks_more_than_2: @ blocks left > 2
+
+ rev64 $res0b, $res1b @ GHASH final-2 block
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
+
+ eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
+
+ mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
+
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
+
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
+ mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
+
+ mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
+ eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
+ eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
+ .L256_dec_blocks_more_than_1: @ blocks left > 1
+
+ stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
+ rev64 $res0b, $res1b @ GHASH final-1 block
+
+ ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+ movi $t0.8b, #0 @ suppress further partial tag feed in
+
+ mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
+
+ eor $ctr0b, $res1b, $ctr3b @ AES final block - result
+
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
+
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
+
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
+ mov $output_l0, $ctr0.d[0] @ AES final block - mov low
+
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
+
+ mov $output_h0, $ctr0.d[1] @ AES final block - mov high
+
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
+ eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
+
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
+ eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
+ .L256_dec_blocks_less_than_1: @ blocks left <= 1
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+ mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
+
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
+ mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
+
+ ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
+
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
+
+ lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
+ cmp $bit_length, #64
+
+ csel $ctr32x, $rk14_l, $rk14_h, lt
+ csel $ctr96_b64x, $rk14_h, xzr, lt
+
+ fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
+ and $output_l0, $output_l0, $ctr32x
+
+ mov $ctr0.d[1], $ctr96_b64x
+ bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
+
+ rev $ctr32w, $rctr32w
+
+ bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
+
+ orr $output_l0, $output_l0, $end_input_ptr
+
+ and $output_h0, $output_h0, $ctr96_b64x
+
+ orr $output_h0, $output_h0, $main_end_input_ptr
+
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
+
+ rev64 $res0b, $res1b @ GHASH final block
+
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
+
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
+
+ mov $t0d, $res0.d[1] @ GHASH final block - mid
+
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
+
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
+
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
+
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
+
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
+
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
+ movi $mod_constant.8b, #0xc2
+
+ eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
+
+ shl $mod_constantd, $mod_constantd, #56 @ mod_constant
+
+ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
+
+ pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
+
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
+
+ eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
+
+ eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
+
+ pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
+
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
+
+ eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
+
+ stp $output_l0, $output_h0, [$output_ptr]
+
+ str $ctr32w, [$counter, #12] @ store the updated counter
+
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
+ ext $acc_lb, $acc_lb, $acc_lb, #8
+ rev64 $acc_lb, $acc_lb
+ mov x0, $len
+ st1 { $acc_l.16b }, [$current_tag]
+
+ ldp x21, x22, [sp, #16]
+ ldp x23, x24, [sp, #32]
+ ldp d8, d9, [sp, #48]
+ ldp d10, d11, [sp, #64]
+ ldp d12, d13, [sp, #80]
+ ldp d14, d15, [sp, #96]
+ ldp x19, x20, [sp], #112
+ ret
+
+.L256_dec_ret:
+ mov w0, #0x0
+ ret
+.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
+___
+}
+}
+
+$code.=<<___;
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#endif
+___
+
+if ($flavour =~ /64/) { ######## 64-bit code
+ sub unvmov {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
+ $3<8?$3:$3+8,($4 eq "lo")?0:1;
+ }
+ foreach(split("\n",$code)) {
+ s/@\s/\/\//o; # old->new style commentary
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+ sub unvpmullp64 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+ my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ $word |= 0x00010001 if ($mnemonic =~ "2");
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older%%%%
+ # assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+
+ foreach(split("\n",$code)) {
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+ # fix up remaining new-style suffixes
+ s/\],#[0-9]+/]!/o;
+
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+ print " it $2\n";
+ }
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT; # enforce flush
diff --git a/crypto/modes/build.info b/crypto/modes/build.info
index b741be821b..fb54b46ea5 100644
--- a/crypto/modes/build.info
+++ b/crypto/modes/build.info
@@ -24,7 +24,7 @@ IF[{- !$disabled{asm} -}]
$MODESASM_armv4=ghash-armv4.S ghashv8-armx.S
$MODESDEF_armv4=GHASH_ASM
- $MODESASM_aarch64=ghashv8-armx.S
+ $MODESASM_aarch64=ghashv8-armx.S aes-gcm-armv8_64.S
$MODESDEF_aarch64=
$MODESASM_parisc11=ghash-parisc.s
@@ -76,6 +76,8 @@ GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl
INCLUDE[ghash-armv4.o]=..
GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
INCLUDE[ghashv8-armx.o]=..
+GENERATE[aes-gcm-armv8_64.S]=asm/aes-gcm-armv8_64.pl
+INCLUDE[aes-gcm-armv8_64.o]=..
GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl
INCLUDE[ghash-s390x.o]=..
GENERATE[ghash-c64xplus.S]=asm/ghash-c64xplus.pl
diff --git a/include/crypto/ciphermode_platform.h b/include/crypto/ciphermode_platform.h
index 03afd719af..e6a65bb99d 100644
--- a/include/crypto/ciphermode_platform.h
+++ b/include/crypto/ciphermode_platform.h
@@ -91,6 +91,32 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
# define HWAES_ecb_encrypt aes_v8_ecb_encrypt
# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
+# define AES_PMULL_CAPABLE ((OPENSSL_armcap_P & ARMV8_PMULL) && (OPENSSL_armcap_P & ARMV8_AES))
+# define AES_GCM_ENC_BYTES 512
+# define AES_GCM_DEC_BYTES 512
+# if __ARM_MAX_ARCH__>=8
+# define AES_gcm_encrypt armv8_aes_gcm_encrypt
+# define AES_gcm_decrypt armv8_aes_gcm_decrypt
+# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_v8_ctr32_encrypt_blocks && \
+ (gctx)->gcm.ghash==gcm_ghash_v8)
+size_t aes_gcm_enc_128_kernel(const uint8_t * plaintext, uint64_t plaintext_length, uint8_t * ciphertext,
+ uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t aes_gcm_enc_192_kernel(const uint8_t * plaintext, uint64_t plaintext_length, uint8_t * ciphertext,
+ uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t aes_gcm_enc_256_kernel(const uint8_t * plaintext, uint64_t plaintext_length, uint8_t * ciphertext,
+ uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t aes_gcm_dec_128_kernel(const uint8_t * ciphertext, uint64_t plaintext_length, uint8_t * plaintext,
+ uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t aes_gcm_dec_192_kernel(const uint8_t * ciphertext, uint64_t plaintext_length, uint8_t * plaintext,
+ uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t aes_gcm_dec_256_kernel(const uint8_t * ciphertext, uint64_t plaintext_length, uint8_t * plaintext,
+ uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t armv8_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key,
+ unsigned char ivec[16], u64 *Xi);
+size_t armv8_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key,
+ unsigned char ivec[16], u64 *Xi);
+void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
+# endif
# endif
# endif
# endif /* OPENSSL_CPUID_OBJ */
@@ -111,6 +137,9 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
# define BSAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
# endif
+# define AES_GCM_ENC_BYTES 32
+# define AES_GCM_DEC_BYTES 16
+
int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
AES_KEY *key);
int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
@@ -181,6 +210,8 @@ size_t aesni_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len
const void *key, unsigned char ivec[16], u64 *Xi);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *in, size_t len);
+# define AES_gcm_encrypt aesni_gcm_encrypt
+# define AES_gcm_decrypt aesni_gcm_decrypt
# define AES_GCM_ASM(ctx) (ctx->ctr == aesni_ctr32_encrypt_blocks && \
ctx->gcm.ghash == gcm_ghash_avx)
# endif
@@ -416,7 +447,7 @@ void HWAES_ecb_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
const int enc);
void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
- size_t len, const AES_KEY *key,
+ size_t len, const void *key,
const unsigned char ivec[16]);
void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out,
size_t len, const AES_KEY *key1,
diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw.c b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
index 0373917a18..08ee34ef1e 100644
--- a/providers/implementations/ciphers/cipher_aes_gcm_hw.c
+++ b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
@@ -68,6 +68,8 @@ static const PROV_GCM_HW aes_gcm = {
# include "cipher_aes_gcm_hw_aesni.inc"
#elif defined(SPARC_AES_CAPABLE)
# include "cipher_aes_gcm_hw_t4.inc"
+#elif defined(AES_PMULL_CAPABLE) && defined(AES_GCM_ASM)
+# include "cipher_aes_gcm_hw_armv8.inc"
#else
const PROV_GCM_HW *PROV_AES_HW_gcm(size_t keybits)
{
diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc
new file mode 100644
index 0000000000..4e8cc9c54e
--- /dev/null
+++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2019 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Crypto extention support for AES GCM.
+ * This file is included by cipher_aes_gcm_hw.c
+ */
+
+size_t armv8_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16], u64 *Xi)
+{
+ size_t align_bytes = 0;
+ align_bytes = len - len % 16;
+
+ AES_KEY *aes_key = (AES_KEY *)key;
+
+ switch(aes_key->rounds) {
+ case 10:
+ aes_gcm_enc_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+ break;
+ case 12:
+ aes_gcm_enc_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+ break;
+ case 14:
+ aes_gcm_enc_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+ break;
+ }
+ return align_bytes;
+}
+
+size_t armv8_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16], u64 *Xi)
+{
+ size_t align_bytes = 0;
+ align_bytes = len - len % 16;
+
+ AES_KEY *aes_key = (AES_KEY *)key;
+
+ switch(aes_key->rounds) {
+ case 10:
+ aes_gcm_dec_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+ break;
+ case 12:
+ aes_gcm_dec_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+ break;
+ case 14:
+ aes_gcm_dec_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+ break;
+ }
+ return align_bytes;
+}
+
+static int armv8_aes_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ size_t keylen)
+{
+ PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx;
+ AES_KEY *ks = &actx->ks.ks;
+
+ GCM_HW_SET_KEY_CTR_FN(ks, aes_v8_set_encrypt_key, aes_v8_encrypt,
+ aes_v8_ctr32_encrypt_blocks);
+ return 1;
+}
+
+
+static const PROV_GCM_HW armv8_aes_gcm = {
+ armv8_aes_gcm_initkey,
+ gcm_setiv,
+ gcm_aad_update,
+ gcm_cipher_update,
+ gcm_cipher_final,
+ gcm_one_shot
+};
+
+const PROV_GCM_HW *PROV_AES_HW_gcm(size_t keybits)
+{
+ return AES_PMULL_CAPABLE ? &armv8_aes_gcm : &aes_gcm;
+}
diff --git a/providers/implementations/ciphers/ciphercommon_gcm_hw.c b/providers/implementations/ciphers/ciphercommon_gcm_hw.c
index 60c7ac5d8f..1114c36b3f 100644
--- a/providers/implementations/ciphers/ciphercommon_gcm_hw.c
+++ b/providers/implementations/ciphers/ciphercommon_gcm_hw.c
@@ -30,14 +30,16 @@ int gcm_cipher_update(PROV_GCM_CTX *ctx, const unsigned char *in,
#if defined(AES_GCM_ASM)
size_t bulk = 0;
- if (len >= 32 && AES_GCM_ASM(ctx)) {
+ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM(ctx)) {
size_t res = (16 - ctx->gcm.mres) % 16;
if (CRYPTO_gcm128_encrypt(&ctx->gcm, in, out, res))
return 0;
- bulk = aesni_gcm_encrypt(in + res, out + res, len - res,
- ctx->gcm.key,
- ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+
+ bulk = AES_gcm_encrypt(in + res, out + res, len - res,
+ ctx->gcm.key,
+ ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+
ctx->gcm.len.u[1] += bulk;
bulk += res;
}
@@ -57,15 +59,16 @@ int gcm_cipher_update(PROV_GCM_CTX *ctx, const unsigned char *in,
#if defined(AES_GCM_ASM)
size_t bulk = 0;
- if (len >= 16 && AES_GCM_ASM(ctx)) {
+ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM(ctx)) {
size_t res = (16 - ctx->gcm.mres) % 16;
if (CRYPTO_gcm128_decrypt(&ctx->gcm, in, out, res))
return -1;
- bulk = aesni_gcm_decrypt(in + res, out + res, len - res,
- ctx->gcm.key,
- ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+ bulk = AES_gcm_decrypt(in + res, out + res, len - res,
+ ctx->gcm.key,
+ ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+
ctx->gcm.len.u[1] += bulk;
bulk += res;
}