From edce87be74a578eb3b2bc85483130ddd62a4f38f Mon Sep 17 00:00:00 2001 From: Nikos Mavrogiannopoulos Date: Mon, 26 Sep 2016 11:55:37 +0200 Subject: Imported Andy Polyakov's implementation of AES in aarch64 --- cfg.mk | 15 +- devel/perlasm/aes-aarch64.pl | 1 + devel/perlasm/aes-aarch64.pl.license | 1 + lib/accelerated/aarch64/Makefile.am | 5 +- lib/accelerated/aarch64/aarch64-common.c | 35 ++ lib/accelerated/aarch64/aes-aarch64.h | 30 ++ lib/accelerated/aarch64/aes-cbc-aarch64.c | 130 +++++ lib/accelerated/aarch64/aes-gcm-aarch64.c | 177 +++++++ lib/accelerated/aarch64/elf/aes-aarch64.s | 791 ++++++++++++++++++++++++++++++ 9 files changed, 1176 insertions(+), 9 deletions(-) create mode 120000 devel/perlasm/aes-aarch64.pl create mode 120000 devel/perlasm/aes-aarch64.pl.license create mode 100644 lib/accelerated/aarch64/aes-aarch64.h create mode 100644 lib/accelerated/aarch64/aes-cbc-aarch64.c create mode 100644 lib/accelerated/aarch64/aes-gcm-aarch64.c create mode 100644 lib/accelerated/aarch64/elf/aes-aarch64.s diff --git a/cfg.mk b/cfg.mk index dfb881fbee..0cf082a955 100644 --- a/cfg.mk +++ b/cfg.mk @@ -135,6 +135,7 @@ web: -cp -v doc/reference/html/*.html doc/reference/html/*.png doc/reference/html/*.devhelp* doc/reference/html/*.css $(htmldir)/reference/ ASM_SOURCES_XXX := \ + lib/accelerated/aarch64/XXX/aes-aarch64.s \ lib/accelerated/aarch64/XXX/sha1-armv8.s \ lib/accelerated/aarch64/XXX/sha256-armv8.s \ lib/accelerated/aarch64/XXX/sha512-armv8.s \ @@ -227,15 +228,15 @@ lib/accelerated/x86/macosx/%.s: devel/perlasm/%.pl .submodule.stamp sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ lib/accelerated/aarch64/elf/%.s: devel/perlasm/%.pl .submodule.stamp + rm -f $@tmp CC=aarch64-linux-gnu-gcc perl $< linux64 $@.tmp - cat $@.tmp | /usr/bin/perl -ne '/^#(line)?\s*[0-9]+/ or print' > $@.i - cat $<.license > $@.tmp.S - cat $@.i >> $@.tmp.S - rm -f $@.i $@.tmp + cat $@.tmp | /usr/bin/perl -ne '/^#(line)?\s*[0-9]+/ or print' > $@.tmp.S echo "" >> $@.tmp.S sed -i 's/OPENSSL_armcap_P/_gnutls_arm_cpuid_s/g' $@.tmp.S sed -i 's/arm_arch.h/aarch64-common.h/g' $@.tmp.S - echo ".section .note.GNU-stack,\"\",%progbits" >> $@.tmp.S - aarch64-linux-gnu-gcc -Ilib/accelerated/aarch64 -Wa,--noexecstack -E $@.tmp.S -o $@ - rm -f $@.tmp.S + aarch64-linux-gnu-gcc -D__ARM_MAX_ARCH__=8 -Ilib/accelerated/aarch64 -Wa,--noexecstack -E $@.tmp.S -o $@.tmp.s + cat $<.license $@.tmp.s > $@ + echo ".section .note.GNU-stack,\"\",%progbits" >> $@ + rm -f $@.tmp.S $@.tmp.s $@.tmp + diff --git a/devel/perlasm/aes-aarch64.pl b/devel/perlasm/aes-aarch64.pl new file mode 120000 index 0000000000..6582600fc0 --- /dev/null +++ b/devel/perlasm/aes-aarch64.pl @@ -0,0 +1 @@ +../openssl/crypto/aes/asm/aesv8-armx.pl \ No newline at end of file diff --git a/devel/perlasm/aes-aarch64.pl.license b/devel/perlasm/aes-aarch64.pl.license new file mode 120000 index 0000000000..cd301a44ab --- /dev/null +++ b/devel/perlasm/aes-aarch64.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/lib/accelerated/aarch64/Makefile.am b/lib/accelerated/aarch64/Makefile.am index 3fa8eed99f..f34507853b 100644 --- a/lib/accelerated/aarch64/Makefile.am +++ b/lib/accelerated/aarch64/Makefile.am @@ -38,9 +38,10 @@ EXTRA_DIST = README noinst_LTLIBRARIES = libaarch64.la libaarch64_la_SOURCES = aarch64-common.c aarch64-common.h sha-aarch64.h sha-aarch64.c \ - hmac-sha-aarch64.c + hmac-sha-aarch64.c aes-cbc-aarch64.c aes-gcm-aarch64.c aes-aarch64.h if ASM_AARCH64 -libaarch64_la_SOURCES += elf/sha1-armv8.s elf/sha512-armv8.s 
elf/sha256-armv8.s +libaarch64_la_SOURCES += elf/sha1-armv8.s elf/sha512-armv8.s elf/sha256-armv8.s \ + elf/aes-aarch64.s endif #ASM_AARCH64 diff --git a/lib/accelerated/aarch64/aarch64-common.c b/lib/accelerated/aarch64/aarch64-common.c index 84218e4643..9866132848 100644 --- a/lib/accelerated/aarch64/aarch64-common.c +++ b/lib/accelerated/aarch64/aarch64-common.c @@ -33,6 +33,7 @@ #ifdef HAVE_LIBNETTLE # include /* for key generation in 192 and 256 bits */ # include "sha-aarch64.h" +# include "aes-aarch64.h" #endif #include "aarch64-common.h" @@ -189,6 +190,40 @@ void _register_aarch64_crypto(unsigned capabilities) } } + if (_gnutls_arm_cpuid_s & ARMV8_AES) { + _gnutls_debug_log("Aarch64 AES was detected\n"); + + ret = + gnutls_crypto_single_cipher_register + (GNUTLS_CIPHER_AES_128_GCM, 90, + &_gnutls_aes_gcm_aarch64, 0); + if (ret < 0) { + gnutls_assert(); + } + + ret = + gnutls_crypto_single_cipher_register + (GNUTLS_CIPHER_AES_256_GCM, 90, + &_gnutls_aes_gcm_aarch64, 0); + if (ret < 0) { + gnutls_assert(); + } + + ret = + gnutls_crypto_single_cipher_register + (GNUTLS_CIPHER_AES_128_CBC, 90, &_gnutls_aes_cbc_aarch64, 0); + if (ret < 0) { + gnutls_assert(); + } + + ret = + gnutls_crypto_single_cipher_register + (GNUTLS_CIPHER_AES_256_CBC, 90, &_gnutls_aes_cbc_aarch64, 0); + if (ret < 0) { + gnutls_assert(); + } + } + return; } diff --git a/lib/accelerated/aarch64/aes-aarch64.h b/lib/accelerated/aarch64/aes-aarch64.h new file mode 100644 index 0000000000..35d5926cde --- /dev/null +++ b/lib/accelerated/aarch64/aes-aarch64.h @@ -0,0 +1,30 @@ +#ifndef AES_ARM_H +#define AES_ARM_H + +#include "gnutls_int.h" + +#define ALIGN16(x) \ + ((void *)(((ptrdiff_t)(x)+(ptrdiff_t)0x0f)&~((ptrdiff_t)0x0f))) + +#define AES_KEY_ALIGN_SIZE 4 +#define AES_MAXNR 14 +typedef struct { + /* We add few more integers to allow alignment + * on a 16-byte boundary. + */ + uint32_t rd_key[4 * (AES_MAXNR + 1) + AES_KEY_ALIGN_SIZE]; + uint32_t rounds; +} AES_KEY; + +int aes_v8_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); +int aes_v8_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); +void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const AES_KEY *key, unsigned char *ivec, int enc); +void aes_v8_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key); +void aes_v8_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key); + +extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_aarch64; + +extern const gnutls_crypto_cipher_st _gnutls_aes_cbc_aarch64; + +#endif diff --git a/lib/accelerated/aarch64/aes-cbc-aarch64.c b/lib/accelerated/aarch64/aes-cbc-aarch64.c new file mode 100644 index 0000000000..649145999f --- /dev/null +++ b/lib/accelerated/aarch64/aes-cbc-aarch64.c @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2011-2016 Free Software Foundation, Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * Author: Nikos Mavrogiannopoulos + * + * This file is part of GnuTLS. + * + * The GnuTLS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see + * + */ + +/* + * The following code is an implementation of the AES-128-CBC cipher + * using aarch64 instruction set. + */ + +#include "errors.h" +#include "gnutls_int.h" +#include +#include "errors.h" +#include +#include + +struct aes_ctx { + AES_KEY expanded_key; + uint8_t iv[16]; + int enc; +}; + +static int +aes_cipher_init(gnutls_cipher_algorithm_t algorithm, void **_ctx, int enc) +{ + /* we use key size to distinguish */ + if (algorithm != GNUTLS_CIPHER_AES_128_CBC + && algorithm != GNUTLS_CIPHER_AES_192_CBC + && algorithm != GNUTLS_CIPHER_AES_256_CBC) + return GNUTLS_E_INVALID_REQUEST; + + *_ctx = gnutls_calloc(1, sizeof(struct aes_ctx)); + if (*_ctx == NULL) { + gnutls_assert(); + return GNUTLS_E_MEMORY_ERROR; + } + + ((struct aes_ctx *) (*_ctx))->enc = enc; + + return 0; +} + +static int +aes_aarch64_cipher_setkey(void *_ctx, const void *userkey, size_t keysize) +{ + struct aes_ctx *ctx = _ctx; + int ret; + + if (ctx->enc) + ret = + aes_v8_set_encrypt_key(userkey, keysize * 8, + ALIGN16(&ctx->expanded_key)); + else + ret = + aes_v8_set_decrypt_key(userkey, keysize * 8, + ALIGN16(&ctx->expanded_key)); + + if (ret != 0) + return gnutls_assert_val(GNUTLS_E_ENCRYPTION_FAILED); + + return 0; +} + +static int +aes_aarch64_encrypt(void *_ctx, const void *src, size_t src_size, + void *dst, size_t dst_size) +{ + struct aes_ctx *ctx = _ctx; + + aes_v8_cbc_encrypt(src, dst, src_size, ALIGN16(&ctx->expanded_key), + ctx->iv, 1); + return 0; +} + +static int +aes_aarch64_decrypt(void *_ctx, const void *src, size_t src_size, + void *dst, size_t dst_size) +{ + struct aes_ctx *ctx = _ctx; + + aes_v8_cbc_encrypt(src, dst, src_size, ALIGN16(&ctx->expanded_key), + ctx->iv, 0); + + return 0; +} + +static int aes_setiv(void *_ctx, const void *iv, size_t iv_size) +{ + struct aes_ctx *ctx = _ctx; + + memcpy(ctx->iv, iv, 16); + return 0; +} + +static void aes_deinit(void *_ctx) +{ + struct aes_ctx *ctx = _ctx; + + zeroize_temp_key(ctx, sizeof(*ctx)); + gnutls_free(ctx); +} + +const gnutls_crypto_cipher_st _gnutls_aes_cbc_aarch64 = { + .init = aes_cipher_init, + .setkey = aes_aarch64_cipher_setkey, + .setiv = aes_setiv, + .encrypt = aes_aarch64_encrypt, + .decrypt = aes_aarch64_decrypt, + .deinit = aes_deinit, +}; + diff --git a/lib/accelerated/aarch64/aes-gcm-aarch64.c b/lib/accelerated/aarch64/aes-gcm-aarch64.c new file mode 100644 index 0000000000..37412363f3 --- /dev/null +++ b/lib/accelerated/aarch64/aes-gcm-aarch64.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2011-2016 Free Software Foundation, Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * Author: Nikos Mavrogiannopoulos + * + * This file is part of GnuTLS. + * + * The GnuTLS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see + * + */ + +/* + * The following code is an implementation of the AES-128-GCM cipher + * using the vpaes aarch64 code. 
+ */ + +#include "errors.h" +#include "gnutls_int.h" + +#ifdef HAVE_LIBNETTLE + +#include +#include "errors.h" +#include +#include +#include +#include + +/* GCM mode + * It is used when the CPU doesn't include the PCLMUL instructions. + */ +struct gcm_aarch64_aes_ctx GCM_CTX(AES_KEY); + +static void aarch64_aes_encrypt(const void *_ctx, + size_t length, uint8_t * dst, + const uint8_t * src) +{ + AES_KEY *ctx = (void*)_ctx; + + aes_v8_encrypt(src, dst, ctx); +} + +static void aarch64_aes_128_set_encrypt_key(void *_ctx, + const uint8_t * key) +{ + AES_KEY *ctx = _ctx; + + aes_v8_set_encrypt_key(key, 16*8, ctx); +} + +static void aarch64_aes_256_set_encrypt_key(void *_ctx, + const uint8_t * key) +{ + AES_KEY *ctx = _ctx; + + aes_v8_set_encrypt_key(key, 32*8, ctx); +} + +static int +aes_gcm_cipher_init(gnutls_cipher_algorithm_t algorithm, void **_ctx, + int enc) +{ + /* we use key size to distinguish */ + if (algorithm != GNUTLS_CIPHER_AES_128_GCM && + algorithm != GNUTLS_CIPHER_AES_256_GCM) + return GNUTLS_E_INVALID_REQUEST; + + *_ctx = gnutls_calloc(1, sizeof(struct gcm_aarch64_aes_ctx)); + if (*_ctx == NULL) { + gnutls_assert(); + return GNUTLS_E_MEMORY_ERROR; + } + + return 0; +} + +static int +aes_gcm_cipher_setkey(void *_ctx, const void *key, size_t keysize) +{ + struct gcm_aarch64_aes_ctx *ctx = _ctx; + + if (keysize == 16) { + GCM_SET_KEY(ctx, aarch64_aes_128_set_encrypt_key, aarch64_aes_encrypt, + key); + } else if (keysize == 32) { + GCM_SET_KEY(ctx, aarch64_aes_256_set_encrypt_key, aarch64_aes_encrypt, + key); + } else abort(); + + return 0; +} + +static int aes_gcm_setiv(void *_ctx, const void *iv, size_t iv_size) +{ + struct gcm_aarch64_aes_ctx *ctx = _ctx; + + if (iv_size != GCM_BLOCK_SIZE - 4) + return gnutls_assert_val(GNUTLS_E_INVALID_REQUEST); + + GCM_SET_IV(ctx, iv_size, iv); + + return 0; +} + +static int +aes_gcm_encrypt(void *_ctx, const void *src, size_t src_size, + void *dst, size_t length) +{ + struct gcm_aarch64_aes_ctx *ctx = _ctx; + + GCM_ENCRYPT(ctx, aarch64_aes_encrypt, src_size, dst, src); + + return 0; +} + +static int +aes_gcm_decrypt(void *_ctx, const void *src, size_t src_size, + void *dst, size_t dst_size) +{ + struct gcm_aarch64_aes_ctx *ctx = _ctx; + + GCM_DECRYPT(ctx, aarch64_aes_encrypt, src_size, dst, src); + return 0; +} + +static int aes_gcm_auth(void *_ctx, const void *src, size_t src_size) +{ + struct gcm_aarch64_aes_ctx *ctx = _ctx; + + GCM_UPDATE(ctx, src_size, src); + + return 0; +} + +static void aes_gcm_tag(void *_ctx, void *tag, size_t tagsize) +{ + struct gcm_aarch64_aes_ctx *ctx = _ctx; + + GCM_DIGEST(ctx, aarch64_aes_encrypt, tagsize, tag); +} + +static void aes_gcm_deinit(void *_ctx) +{ + struct gcm_aarch64_aes_ctx *ctx = _ctx; + + zeroize_temp_key(ctx, sizeof(*ctx)); + gnutls_free(ctx); +} + +#include "../x86/aes-gcm-aead.h" + +const gnutls_crypto_cipher_st _gnutls_aes_gcm_aarch64 = { + .init = aes_gcm_cipher_init, + .setkey = aes_gcm_cipher_setkey, + .setiv = aes_gcm_setiv, + .aead_encrypt = aes_gcm_aead_encrypt, + .aead_decrypt = aes_gcm_aead_decrypt, + .encrypt = aes_gcm_encrypt, + .decrypt = aes_gcm_decrypt, + .deinit = aes_gcm_deinit, + .tag = aes_gcm_tag, + .auth = aes_gcm_auth, +}; + +#endif diff --git a/lib/accelerated/aarch64/elf/aes-aarch64.s b/lib/accelerated/aarch64/elf/aes-aarch64.s new file mode 100644 index 0000000000..0f16715357 --- /dev/null +++ b/lib/accelerated/aarch64/elf/aes-aarch64.s @@ -0,0 +1,791 @@ +# Copyright (c) 2011-2016, Andy Polyakov +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the Andy Polyakov nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# *** This file is auto-generated *** +# +# 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" +# 1 "" +# 1 "" +# 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" +# 1 "lib/accelerated/aarch64/aarch64-common.h" 1 +# 2 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" 2 + + +.text +.arch armv8-a+crypto +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.long 0x1b,0x1b,0x1b,0x1b + +.globl aes_v8_set_encrypt_key +.type aes_v8_set_encrypt_key,%function +.align 5 +aes_v8_set_encrypt_key: +.Lenc_key: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + mov x3,#-1 + cmp x0,#0 + b.eq .Lenc_key_abort + cmp x2,#0 + b.eq .Lenc_key_abort + mov x3,#-2 + cmp w1,#128 + b.lt .Lenc_key_abort + cmp w1,#256 + b.gt .Lenc_key_abort + tst w1,#0x3f + b.ne .Lenc_key_abort + + adr x3,.Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne .Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b .Ldone + +.align 4 +.L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b + +.Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne .Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b .Ldone + +.align 4 +.L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +.Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq .Ldone + + dup v6.4s,v3.s[3] + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b .Loop256 + +.Ldone: + str w12,[x2] + mov x3,#0 + +.Lenc_key_abort: + mov x0,x3 + ldr x29,[sp],#16 + ret +.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl aes_v8_set_decrypt_key +.type aes_v8_set_decrypt_key,%function +.align 5 +aes_v8_set_decrypt_key: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + bl .Lenc_key + + cmp x0,#0 + b.ne .Ldec_key_abort + + sub x2,x2,#240 + mov x4,#-16 + add x0,x2,x12,lsl#4 + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +.Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi .Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 +.Ldec_key_abort: + ldp x29,x30,[sp],#16 + ret +.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key +.globl aes_v8_encrypt +.type aes_v8_encrypt,%function +.align 5 +aes_v8_encrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt .Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_encrypt,.-aes_v8_encrypt +.globl aes_v8_decrypt +.type aes_v8_decrypt,%function +.align 5 +aes_v8_decrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt .Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_cbc_encrypt +.type aes_v8_cbc_encrypt,%function +.align 5 +aes_v8_cbc_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo .Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] + sub w5,w5,#6 + add x7,x3,x5,lsl#4 + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq .Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq .Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b .Lenter_cbc_enc + +.align 4 +.Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq .Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +.Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs .Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b 
.Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs .Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b .Lcbc_done +.align 5 +.Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo .Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +.Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 + + + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs .Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq .Lcbc_done + nop + +.Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc 
v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq .Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b .Lcbc_done + +.Lcbc_dec_one: + eor v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + +.Lcbc_done: + st1 {v6.16b},[x4] +.Lcbc_abort: + ldr x29,[sp],#16 + ret +.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl aes_v8_ctr32_encrypt_blocks +.type aes_v8_ctr32_encrypt_blocks,%function +.align 5 +aes_v8_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + rev w8, w8 + + orr v1.16b,v0.16b,v0.16b + add w10, w8, #1 + orr v18.16b,v0.16b,v0.16b + add w8, w8, #2 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v1.s[3],w10 + b.ls .Lctr32_tail + rev w12, w8 + sub x2,x2,#3 + mov v18.s[3],w12 + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + orr v0.16b,v6.16b,v6.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + orr v18.16b,v6.16b,v6.16b + add w9,w8,#1 + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + eor v19.16b,v19.16b,v7.16b + rev w9,w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + mov v0.s[3], w9 + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + mov v1.s[3], w10 + rev w12,w8 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + mov v18.s[3], w12 + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 + st1 {v19.16b},[x1],#16 + b.hs .Loop3x_ctr32 + + adds x2,x2,#3 + b.eq .Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +.Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b 
+ aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq .Lctr32_done + st1 {v3.16b},[x1] + +.Lctr32_done: + ldr x29,[sp],#16 + ret +.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +.section .note.GNU-stack,"",%progbits -- cgit v1.2.1
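
As a standalone illustration of the alignment trick the patch relies on: aes-aarch64.h pads rd_key with AES_KEY_ALIGN_SIZE extra words so that ALIGN16() can round the (possibly unaligned) start of the embedded AES_KEY up to the next 16-byte boundary without running past the array, and aes-cbc-aarch64.c passes that aligned pointer to aes_v8_set_encrypt_key()/aes_v8_cbc_encrypt(). The sketch below is not part of the patch; the main()/printf scaffolding is illustrative only, while the macro and struct layout are copied from aes-aarch64.h and aes-cbc-aarch64.c.

    /* Minimal sketch of the ALIGN16 usage; types and macros mirror the patch. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define ALIGN16(x) \
            ((void *)(((ptrdiff_t)(x)+(ptrdiff_t)0x0f)&~((ptrdiff_t)0x0f)))

    #define AES_KEY_ALIGN_SIZE 4
    #define AES_MAXNR 14
    typedef struct {
            /* Over-allocated by AES_KEY_ALIGN_SIZE words so the aligned
             * key schedule always fits inside the array. */
            uint32_t rd_key[4 * (AES_MAXNR + 1) + AES_KEY_ALIGN_SIZE];
            uint32_t rounds;
    } AES_KEY;

    struct aes_ctx {
            AES_KEY expanded_key;
            uint8_t iv[16];
            int enc;
    };

    int main(void)
    {
            struct aes_ctx *ctx = calloc(1, sizeof(*ctx));
            if (ctx == NULL)
                    return 1;

            /* This is the pointer the CBC code hands to the assembly
             * routines; it lies at most 15 bytes past &ctx->expanded_key. */
            AES_KEY *aligned = ALIGN16(&ctx->expanded_key);

            printf("expanded_key=%p aligned=%p offset=%td\n",
                   (void *)&ctx->expanded_key, (void *)aligned,
                   (char *)aligned - (char *)&ctx->expanded_key);

            free(ctx);
            return 0;
    }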