summaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--arch/arm64/crypto/Kconfig24
-rw-r--r--arch/arm64/crypto/Makefile13
-rw-r--r--arch/arm64/crypto/aes-ce-ccm-glue.c1
-rw-r--r--arch/arm64/crypto/aes-cipher-core.S110
-rw-r--r--arch/arm64/crypto/aes-cipher-glue.c69
-rw-r--r--arch/arm64/crypto/aes-glue.c281
-rw-r--r--arch/arm64/crypto/aes-modes.S119
-rw-r--r--arch/arm64/crypto/aes-neon.S235
-rw-r--r--arch/arm64/crypto/aes-neonbs-core.S972
-rw-r--r--arch/arm64/crypto/aes-neonbs-glue.c439
-rw-r--r--arch/arm64/crypto/chacha20-neon-core.S450
-rw-r--r--arch/arm64/crypto/chacha20-neon-glue.c126
-rw-r--r--arch/arm64/crypto/crc32-arm64.c290
-rw-r--r--arch/arm64/crypto/crc32-ce-glue.c49
14 files changed, 2671 insertions, 507 deletions
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 450a85df041a..d92293747d63 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -37,10 +37,14 @@ config CRYPTO_CRCT10DIF_ARM64_CE
select CRYPTO_HASH
config CRYPTO_CRC32_ARM64_CE
- tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
- depends on KERNEL_MODE_NEON && CRC32
+ tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
+ depends on CRC32
select CRYPTO_HASH
+config CRYPTO_AES_ARM64
+ tristate "AES core cipher using scalar instructions"
+ select CRYPTO_AES
+
config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
@@ -67,9 +71,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_AES
select CRYPTO_SIMD
-config CRYPTO_CRC32_ARM64
- tristate "CRC32 and CRC32C using optional ARMv8 instructions"
- depends on ARM64
- select CRYPTO_HASH
+config CRYPTO_CHACHA20_NEON
+ tristate "NEON accelerated ChaCha20 symmetric cipher"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_CHACHA20
+
+config CRYPTO_AES_ARM64_BS
+ tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_AES_ARM64_NEON_BLK
+ select CRYPTO_SIMD
endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index aa8888d7b744..b5edc5918c28 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -41,15 +41,20 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
+aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
+aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
+
AFLAGS_aes-ce.o := -DINTERLEAVE=4
AFLAGS_aes-neon.o := -DINTERLEAVE=4
CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
-obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o
-
-CFLAGS_crc32-arm64.o := -mcpu=generic+crc
-
$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
$(call if_changed_rule,cc_o_c)
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index cc5515dac74a..6a7dbc7c83a6 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -258,7 +258,6 @@ static struct aead_alg ccm_aes_alg = {
.cra_priority = 300,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_alignmask = 7,
.cra_module = THIS_MODULE,
},
.ivsize = AES_BLOCK_SIZE,
diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
new file mode 100644
index 000000000000..f2f9cc519309
--- /dev/null
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -0,0 +1,110 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ rk .req x0
+ out .req x1
+ in .req x2
+ rounds .req x3
+ tt .req x4
+ lt .req x2
+
+ .macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift
+ ubfx \reg0, \in0, #\shift, #8
+ .if \enc
+ ubfx \reg1, \in1e, #\shift, #8
+ .else
+ ubfx \reg1, \in1d, #\shift, #8
+ .endif
+ ldr \reg0, [tt, \reg0, uxtw #2]
+ ldr \reg1, [tt, \reg1, uxtw #2]
+ .endm
+
+ .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
+ ldp \out0, \out1, [rk], #8
+
+ __pair \enc, w13, w14, \in0, \in1, \in3, 0
+ __pair \enc, w15, w16, \in1, \in2, \in0, 8
+ __pair \enc, w17, w18, \in2, \in3, \in1, 16
+ __pair \enc, \t0, \t1, \in3, \in0, \in2, 24
+
+ eor \out0, \out0, w13
+ eor \out1, \out1, w14
+ eor \out0, \out0, w15, ror #24
+ eor \out1, \out1, w16, ror #24
+ eor \out0, \out0, w17, ror #16
+ eor \out1, \out1, w18, ror #16
+ eor \out0, \out0, \t0, ror #8
+ eor \out1, \out1, \t1, ror #8
+ .endm
+
+ .macro fround, out0, out1, out2, out3, in0, in1, in2, in3
+ __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
+ __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+ .endm
+
+ .macro iround, out0, out1, out2, out3, in0, in1, in2, in3
+ __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
+ __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+ .endm
+
+ .macro do_crypt, round, ttab, ltab
+ ldp w5, w6, [in]
+ ldp w7, w8, [in, #8]
+ ldp w9, w10, [rk], #16
+ ldp w11, w12, [rk, #-8]
+
+CPU_BE( rev w5, w5 )
+CPU_BE( rev w6, w6 )
+CPU_BE( rev w7, w7 )
+CPU_BE( rev w8, w8 )
+
+ eor w5, w5, w9
+ eor w6, w6, w10
+ eor w7, w7, w11
+ eor w8, w8, w12
+
+ adr_l tt, \ttab
+ adr_l lt, \ltab
+
+ tbnz rounds, #1, 1f
+
+0: \round w9, w10, w11, w12, w5, w6, w7, w8
+ \round w5, w6, w7, w8, w9, w10, w11, w12
+
+1: subs rounds, rounds, #4
+ \round w9, w10, w11, w12, w5, w6, w7, w8
+ csel tt, tt, lt, hi
+ \round w5, w6, w7, w8, w9, w10, w11, w12
+ b.hi 0b
+
+CPU_BE( rev w5, w5 )
+CPU_BE( rev w6, w6 )
+CPU_BE( rev w7, w7 )
+CPU_BE( rev w8, w8 )
+
+ stp w5, w6, [out]
+ stp w7, w8, [out, #8]
+ ret
+ .endm
+
+ .align 5
+ENTRY(__aes_arm64_encrypt)
+ do_crypt fround, crypto_ft_tab, crypto_fl_tab
+ENDPROC(__aes_arm64_encrypt)
+
+ .align 5
+ENTRY(__aes_arm64_decrypt)
+ do_crypt iround, crypto_it_tab, crypto_il_tab
+ENDPROC(__aes_arm64_decrypt)
diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c
new file mode 100644
index 000000000000..7288e7cbebff
--- /dev/null
+++ b/arch/arm64/crypto/aes-cipher-glue.c
@@ -0,0 +1,69 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+EXPORT_SYMBOL(__aes_arm64_encrypt);
+
+asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+EXPORT_SYMBOL(__aes_arm64_decrypt);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+ int rounds = 6 + ctx->key_length / 4;
+
+ __aes_arm64_encrypt(ctx->key_enc, out, in, rounds);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+ int rounds = 6 + ctx->key_length / 4;
+
+ __aes_arm64_decrypt(ctx->key_dec, out, in, rounds);
+}
+
+static struct crypto_alg aes_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-arm64",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_module = THIS_MODULE,
+
+ .cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cra_cipher.cia_setkey = crypto_aes_set_key,
+ .cra_cipher.cia_encrypt = aes_encrypt,
+ .cra_cipher.cia_decrypt = aes_decrypt
+};
+
+static int __init aes_init(void)
+{
+ return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+ crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Scalar AES cipher for arm64");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 4e3f8adb1793..bcf596b0197e 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -1,7 +1,7 @@
/*
* linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
*
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,7 @@
#include <asm/neon.h>
#include <asm/hwcap.h>
#include <crypto/aes.h>
+#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>
@@ -31,6 +32,7 @@
#define aes_ctr_encrypt ce_aes_ctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
+#define aes_mac_update ce_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
@@ -44,11 +46,15 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#define aes_ctr_encrypt neon_aes_ctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
+#define aes_mac_update neon_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
+MODULE_ALIAS_CRYPTO("cmac(aes)");
+MODULE_ALIAS_CRYPTO("xcbc(aes)");
+MODULE_ALIAS_CRYPTO("cbcmac(aes)");
#endif
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
@@ -75,11 +81,25 @@ asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
int rounds, int blocks, u8 const rk2[], u8 iv[],
int first);
+asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+ int blocks, u8 dg[], int enc_before,
+ int enc_after);
+
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
};
+struct mac_tfm_ctx {
+ struct crypto_aes_ctx key;
+ u8 __aligned(8) consts[];
+};
+
+struct mac_desc_ctx {
+ unsigned int len;
+ u8 dg[AES_BLOCK_SIZE];
+};
+
static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
@@ -215,14 +235,15 @@ static int ctr_encrypt(struct skcipher_request *req)
u8 *tsrc = walk.src.virt.addr;
/*
- * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
- * to tell aes_ctr_encrypt() to only read half a block.
+ * Tell aes_ctr_encrypt() to process a tail block.
*/
- blocks = (nbytes <= 8) ? -1 : 1;
+ blocks = -1;
- aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
+ aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
blocks, walk.iv, first);
- memcpy(tdst, tail, nbytes);
+ if (tdst != tsrc)
+ memcpy(tdst, tsrc, nbytes);
+ crypto_xor(tdst, tail, nbytes);
err = skcipher_walk_done(&walk, 0);
}
kernel_neon_end();
@@ -282,7 +303,6 @@ static struct skcipher_alg aes_algs[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_alignmask = 7,
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
@@ -298,7 +318,6 @@ static struct skcipher_alg aes_algs[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_alignmask = 7,
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
@@ -315,7 +334,22 @@ static struct skcipher_alg aes_algs[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_alignmask = 7,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = skcipher_aes_setkey,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+}, {
+ .base = {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-" MODE,
+ .cra_priority = PRIO - 1,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
@@ -333,7 +367,6 @@ static struct skcipher_alg aes_algs[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
- .cra_alignmask = 7,
.cra_module = THIS_MODULE,
},
.min_keysize = 2 * AES_MIN_KEY_SIZE,
@@ -344,15 +377,228 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = xts_decrypt,
} };
+static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+ int err;
+
+ err = aes_expandkey(&ctx->key, in_key, key_len);
+ if (err)
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+
+ return err;
+}
+
+static void cmac_gf128_mul_by_x(be128 *y, const be128 *x)
+{
+ u64 a = be64_to_cpu(x->a);
+ u64 b = be64_to_cpu(x->b);
+
+ y->a = cpu_to_be64((a << 1) | (b >> 63));
+ y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0));
+}
+
+static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+ be128 *consts = (be128 *)ctx->consts;
+ u8 *rk = (u8 *)ctx->key.key_enc;
+ int rounds = 6 + key_len / 4;
+ int err;
+
+ err = cbcmac_setkey(tfm, in_key, key_len);
+ if (err)
+ return err;
+
+ /* encrypt the zero vector */
+ kernel_neon_begin();
+ aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1, 1);
+ kernel_neon_end();
+
+ cmac_gf128_mul_by_x(consts, consts);
+ cmac_gf128_mul_by_x(consts + 1, consts);
+
+ return 0;
+}
+
+static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ static u8 const ks[3][AES_BLOCK_SIZE] = {
+ { [0 ... AES_BLOCK_SIZE - 1] = 0x1 },
+ { [0 ... AES_BLOCK_SIZE - 1] = 0x2 },
+ { [0 ... AES_BLOCK_SIZE - 1] = 0x3 },
+ };
+
+ struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+ u8 *rk = (u8 *)ctx->key.key_enc;
+ int rounds = 6 + key_len / 4;
+ u8 key[AES_BLOCK_SIZE];
+ int err;
+
+ err = cbcmac_setkey(tfm, in_key, key_len);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+ aes_ecb_encrypt(key, ks[0], rk, rounds, 1, 1);
+ aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2, 0);
+ kernel_neon_end();
+
+ return cbcmac_setkey(tfm, key, sizeof(key));
+}
+
+static int mac_init(struct shash_desc *desc)
+{
+ struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ memset(ctx->dg, 0, AES_BLOCK_SIZE);
+ ctx->len = 0;
+
+ return 0;
+}
+
+static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
+{
+ struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+ int rounds = 6 + tctx->key.key_length / 4;
+
+ while (len > 0) {
+ unsigned int l;
+
+ if ((ctx->len % AES_BLOCK_SIZE) == 0 &&
+ (ctx->len + len) > AES_BLOCK_SIZE) {
+
+ int blocks = len / AES_BLOCK_SIZE;
+
+ len %= AES_BLOCK_SIZE;
+
+ kernel_neon_begin();
+ aes_mac_update(p, tctx->key.key_enc, rounds, blocks,
+ ctx->dg, (ctx->len != 0), (len != 0));
+ kernel_neon_end();
+
+ p += blocks * AES_BLOCK_SIZE;
+
+ if (!len) {
+ ctx->len = AES_BLOCK_SIZE;
+ break;
+ }
+ ctx->len = 0;
+ }
+
+ l = min(len, AES_BLOCK_SIZE - ctx->len);
+
+ if (l <= AES_BLOCK_SIZE) {
+ crypto_xor(ctx->dg + ctx->len, p, l);
+ ctx->len += l;
+ len -= l;
+ p += l;
+ }
+ }
+
+ return 0;
+}
+
+static int cbcmac_final(struct shash_desc *desc, u8 *out)
+{
+ struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+ int rounds = 6 + tctx->key.key_length / 4;
+
+ kernel_neon_begin();
+ aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0);
+ kernel_neon_end();
+
+ memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+ return 0;
+}
+
+static int cmac_final(struct shash_desc *desc, u8 *out)
+{
+ struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+ int rounds = 6 + tctx->key.key_length / 4;
+ u8 *consts = tctx->consts;
+
+ if (ctx->len != AES_BLOCK_SIZE) {
+ ctx->dg[ctx->len] ^= 0x80;
+ consts += AES_BLOCK_SIZE;
+ }
+
+ kernel_neon_begin();
+ aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1);
+ kernel_neon_end();
+
+ memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg mac_algs[] = { {
+ .base.cra_name = "cmac(aes)",
+ .base.cra_driver_name = "cmac-aes-" MODE,
+ .base.cra_priority = PRIO,
+ .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
+ 2 * AES_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = AES_BLOCK_SIZE,
+ .init = mac_init,
+ .update = mac_update,
+ .final = cmac_final,
+ .setkey = cmac_setkey,
+ .descsize = sizeof(struct mac_desc_ctx),
+}, {
+ .base.cra_name = "xcbc(aes)",
+ .base.cra_driver_name = "xcbc-aes-" MODE,
+ .base.cra_priority = PRIO,
+ .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
+ 2 * AES_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = AES_BLOCK_SIZE,
+ .init = mac_init,
+ .update = mac_update,
+ .final = cmac_final,
+ .setkey = xcbc_setkey,
+ .descsize = sizeof(struct mac_desc_ctx),
+}, {
+ .base.cra_name = "cbcmac(aes)",
+ .base.cra_driver_name = "cbcmac-aes-" MODE,
+ .base.cra_priority = PRIO,
+ .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct mac_tfm_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = AES_BLOCK_SIZE,
+ .init = mac_init,
+ .update = mac_update,
+ .final = cbcmac_final,
+ .setkey = cbcmac_setkey,
+ .descsize = sizeof(struct mac_desc_ctx),
+} };
+
static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
static void aes_exit(void)
{
int i;
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++)
- simd_skcipher_free(aes_simd_algs[i]);
+ for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+ if (aes_simd_algs[i])
+ simd_skcipher_free(aes_simd_algs[i]);
+ crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs));
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
@@ -369,7 +615,14 @@ static int __init aes_init(void)
if (err)
return err;
+ err = crypto_register_shashes(mac_algs, ARRAY_SIZE(mac_algs));
+ if (err)
+ goto unregister_ciphers;
+
for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+ if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+ continue;
+
algname = aes_algs[i].base.cra_name + 2;
drvname = aes_algs[i].base.cra_driver_name + 2;
basename = aes_algs[i].base.cra_driver_name;
@@ -385,6 +638,8 @@ static int __init aes_init(void)
unregister_simds:
aes_exit();
+unregister_ciphers:
+ crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
return err;
}
@@ -392,5 +647,7 @@ unregister_simds:
module_cpu_feature_match(AES, aes_init);
#else
module_init(aes_init);
+EXPORT_SYMBOL(neon_aes_ecb_encrypt);
+EXPORT_SYMBOL(neon_aes_cbc_encrypt);
#endif
module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index c53dbeae79f2..2674d43d1384 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -1,7 +1,7 @@
/*
* linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
*
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -193,15 +193,16 @@ AES_ENTRY(aes_cbc_encrypt)
cbz w6, .Lcbcencloop
ld1 {v0.16b}, [x5] /* get iv */
- enc_prepare w3, x2, x5
+ enc_prepare w3, x2, x6
.Lcbcencloop:
ld1 {v1.16b}, [x1], #16 /* get next pt block */
eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
- encrypt_block v0, w3, x2, x5, w6
+ encrypt_block v0, w3, x2, x6, w7
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcencloop
+ st1 {v0.16b}, [x5] /* return iv */
ret
AES_ENDPROC(aes_cbc_encrypt)
@@ -211,7 +212,7 @@ AES_ENTRY(aes_cbc_decrypt)
cbz w6, .LcbcdecloopNx
ld1 {v7.16b}, [x5] /* get iv */
- dec_prepare w3, x2, x5
+ dec_prepare w3, x2, x6
.LcbcdecloopNx:
#if INTERLEAVE >= 2
@@ -248,7 +249,7 @@ AES_ENTRY(aes_cbc_decrypt)
.Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
- decrypt_block v0, w3, x2, x5, w6
+ decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
mov v7.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x0], #16
@@ -256,6 +257,7 @@ AES_ENTRY(aes_cbc_decrypt)
bne .Lcbcdecloop
.Lcbcdecout:
FRAME_POP
+ st1 {v7.16b}, [x5] /* return iv */
ret
AES_ENDPROC(aes_cbc_decrypt)
@@ -267,24 +269,15 @@ AES_ENDPROC(aes_cbc_decrypt)
AES_ENTRY(aes_ctr_encrypt)
FRAME_PUSH
- cbnz w6, .Lctrfirst /* 1st time around? */
- umov x5, v4.d[1] /* keep swabbed ctr in reg */
- rev x5, x5
-#if INTERLEAVE >= 2
- cmn w5, w4 /* 32 bit overflow? */
- bcs .Lctrinc
- add x5, x5, #1 /* increment BE ctr */
- b .LctrincNx
-#else
- b .Lctrinc
-#endif
-.Lctrfirst:
+ cbz w6, .Lctrnotfirst /* 1st time around? */
enc_prepare w3, x2, x6
ld1 {v4.16b}, [x5]
- umov x5, v4.d[1] /* keep swabbed ctr in reg */
- rev x5, x5
+
+.Lctrnotfirst:
+ umov x8, v4.d[1] /* keep swabbed ctr in reg */
+ rev x8, x8
#if INTERLEAVE >= 2
- cmn w5, w4 /* 32 bit overflow? */
+ cmn w8, w4 /* 32 bit overflow? */
bcs .Lctrloop
.LctrloopNx:
subs w4, w4, #INTERLEAVE
@@ -292,11 +285,11 @@ AES_ENTRY(aes_ctr_encrypt)
#if INTERLEAVE == 2
mov v0.8b, v4.8b
mov v1.8b, v4.8b
- rev x7, x5
- add x5, x5, #1
+ rev x7, x8
+ add x8, x8, #1
ins v0.d[1], x7
- rev x7, x5
- add x5, x5, #1
+ rev x7, x8
+ add x8, x8, #1
ins v1.d[1], x7
ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
do_encrypt_block2x
@@ -305,7 +298,7 @@ AES_ENTRY(aes_ctr_encrypt)
st1 {v0.16b-v1.16b}, [x0], #32
#else
ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
- dup v7.4s, w5
+ dup v7.4s, w8
mov v0.16b, v4.16b
add v7.4s, v7.4s, v8.4s
mov v1.16b, v4.16b
@@ -323,18 +316,12 @@ AES_ENTRY(aes_ctr_encrypt)
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
st1 {v0.16b-v3.16b}, [x0], #64
- add x5, x5, #INTERLEAVE
+ add x8, x8, #INTERLEAVE
#endif
- cbz w4, .LctroutNx
-.LctrincNx:
- rev x7, x5
+ rev x7, x8
ins v4.d[1], x7
+ cbz w4, .Lctrout
b .LctrloopNx
-.LctroutNx:
- sub x5, x5, #1
- rev x7, x5
- ins v4.d[1], x7
- b .Lctrout
.Lctr1x:
adds w4, w4, #INTERLEAVE
beq .Lctrout
@@ -342,30 +329,37 @@ AES_ENTRY(aes_ctr_encrypt)
.Lctrloop:
mov v0.16b, v4.16b
encrypt_block v0, w3, x2, x6, w7
+
+ adds x8, x8, #1 /* increment BE ctr */
+ rev x7, x8
+ ins v4.d[1], x7
+ bcs .Lctrcarry /* overflow? */
+
+.Lctrcarrydone:
subs w4, w4, #1
- bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
+ bmi .Lctrtailblock /* blocks <0 means tail block */
ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b
st1 {v3.16b}, [x0], #16
- beq .Lctrout
-.Lctrinc:
- adds x5, x5, #1 /* increment BE ctr */
- rev x7, x5
- ins v4.d[1], x7
- bcc .Lctrloop /* no overflow? */
+ bne .Lctrloop
+
+.Lctrout:
+ st1 {v4.16b}, [x5] /* return next CTR value */
+ FRAME_POP
+ ret
+
+.Lctrtailblock:
+ st1 {v0.16b}, [x0]
+ FRAME_POP
+ ret
+
+.Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */
rev x7, x7 /* ... to handle the carry */
add x7, x7, #1
rev x7, x7
ins v4.d[0], x7
- b .Lctrloop
-.Lctrhalfblock:
- ld1 {v3.8b}, [x1]
- eor v3.8b, v0.8b, v3.8b
- st1 {v3.8b}, [x0]
-.Lctrout:
- FRAME_POP
- ret
+ b .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
.ltorg
@@ -531,3 +525,30 @@ AES_ENTRY(aes_xts_decrypt)
FRAME_POP
ret
AES_ENDPROC(aes_xts_decrypt)
+
+ /*
+ * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+ * int blocks, u8 dg[], int enc_before, int enc_after)
+ */
+AES_ENTRY(aes_mac_update)
+ ld1 {v0.16b}, [x4] /* get dg */
+ enc_prepare w2, x1, x7
+ cbnz w5, .Lmacenc
+
+.Lmacloop:
+ cbz w3, .Lmacout
+ ld1 {v1.16b}, [x0], #16 /* get next pt block */
+ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
+
+ subs w3, w3, #1
+ csinv x5, x6, xzr, eq
+ cbz w5, .Lmacout
+
+.Lmacenc:
+ encrypt_block v0, w2, x1, x7, w8
+ b .Lmacloop
+
+.Lmacout:
+ st1 {v0.16b}, [x4] /* return dg */
+ ret
+AES_ENDPROC(aes_mac_update)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 85f07ead7c5c..f1e3aa2732f9 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -1,7 +1,7 @@
/*
* linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
*
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -17,17 +17,25 @@
/* multiply by polynomial 'x' in GF(2^8) */
.macro mul_by_x, out, in, temp, const
sshr \temp, \in, #7
- add \out, \in, \in
+ shl \out, \in, #1
and \temp, \temp, \const
eor \out, \out, \temp
.endm
+ /* multiply by polynomial 'x^2' in GF(2^8) */
+ .macro mul_by_x2, out, in, temp, const
+ ushr \temp, \in, #6
+ shl \out, \in, #2
+ pmul \temp, \temp, \const
+ eor \out, \out, \temp
+ .endm
+
/* preload the entire Sbox */
.macro prepare, sbox, shiftrows, temp
adr \temp, \sbox
- movi v12.16b, #0x40
+ movi v12.16b, #0x1b
ldr q13, \shiftrows
- movi v14.16b, #0x1b
+ ldr q14, .Lror32by8
ld1 {v16.16b-v19.16b}, [\temp], #64
ld1 {v20.16b-v23.16b}, [\temp], #64
ld1 {v24.16b-v27.16b}, [\temp], #64
@@ -50,37 +58,31 @@
/* apply SubBytes transformation using the the preloaded Sbox */
.macro sub_bytes, in
- sub v9.16b, \in\().16b, v12.16b
+ sub v9.16b, \in\().16b, v15.16b
tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
- sub v10.16b, v9.16b, v12.16b
+ sub v10.16b, v9.16b, v15.16b
tbx \in\().16b, {v20.16b-v23.16b}, v9.16b
- sub v11.16b, v10.16b, v12.16b
+ sub v11.16b, v10.16b, v15.16b
tbx \in\().16b, {v24.16b-v27.16b}, v10.16b
tbx \in\().16b, {v28.16b-v31.16b}, v11.16b
.endm
/* apply MixColumns transformation */
- .macro mix_columns, in
- mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b
- rev32 v8.8h, \in\().8h
- eor \in\().16b, v10.16b, \in\().16b
- shl v9.4s, v8.4s, #24
- shl v11.4s, \in\().4s, #24
- sri v9.4s, v8.4s, #8
- sri v11.4s, \in\().4s, #8
- eor v9.16b, v9.16b, v8.16b
- eor v10.16b, v10.16b, v9.16b
- eor \in\().16b, v10.16b, v11.16b
- .endm
-
+ .macro mix_columns, in, enc
+ .if \enc == 0
/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
- .macro inv_mix_columns, in
- mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b
- mul_by_x v11.16b, v11.16b, v10.16b, v14.16b
- eor \in\().16b, \in\().16b, v11.16b
- rev32 v11.8h, v11.8h
- eor \in\().16b, \in\().16b, v11.16b
- mix_columns \in
+ mul_by_x2 v8.16b, \in\().16b, v9.16b, v12.16b
+ eor \in\().16b, \in\().16b, v8.16b
+ rev32 v8.8h, v8.8h
+ eor \in\().16b, \in\().16b, v8.16b
+ .endif
+
+ mul_by_x v9.16b, \in\().16b, v8.16b, v12.16b
+ rev32 v8.8h, \in\().8h
+ eor v8.16b, v8.16b, v9.16b
+ eor \in\().16b, \in\().16b, v8.16b
+ tbl \in\().16b, {\in\().16b}, v14.16b
+ eor \in\().16b, \in\().16b, v8.16b
.endm
.macro do_block, enc, in, rounds, rk, rkp, i
@@ -88,16 +90,13 @@
add \rkp, \rk, #16
mov \i, \rounds
1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
+ movi v15.16b, #0x40
tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
sub_bytes \in
- ld1 {v15.4s}, [\rkp], #16
subs \i, \i, #1
+ ld1 {v15.4s}, [\rkp], #16
beq 2222f
- .if \enc == 1
- mix_columns \in
- .else
- inv_mix_columns \in
- .endif
+ mix_columns \in, \enc
b 1111b
2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
.endm
@@ -116,139 +115,114 @@
*/
.macro sub_bytes_2x, in0, in1
- sub v8.16b, \in0\().16b, v12.16b
- sub v9.16b, \in1\().16b, v12.16b
+ sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+ sub v9.16b, \in1\().16b, v15.16b
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
- sub v10.16b, v8.16b, v12.16b
- sub v11.16b, v9.16b, v12.16b
+ sub v10.16b, v8.16b, v15.16b
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
+ sub v11.16b, v9.16b, v15.16b
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
- sub v8.16b, v10.16b, v12.16b
- sub v9.16b, v11.16b, v12.16b
+ sub v8.16b, v10.16b, v15.16b
tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
+ sub v9.16b, v11.16b, v15.16b
tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
.endm
.macro sub_bytes_4x, in0, in1, in2, in3
- sub v8.16b, \in0\().16b, v12.16b
+ sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
- sub v9.16b, \in1\().16b, v12.16b
+ sub v9.16b, \in1\().16b, v15.16b
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
- sub v10.16b, \in2\().16b, v12.16b
+ sub v10.16b, \in2\().16b, v15.16b
tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
- sub v11.16b, \in3\().16b, v12.16b
+ sub v11.16b, \in3\().16b, v15.16b
tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
- sub v8.16b, v8.16b, v12.16b
+ sub v8.16b, v8.16b, v15.16b
tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b
- sub v9.16b, v9.16b, v12.16b
+ sub v9.16b, v9.16b, v15.16b
tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b
- sub v10.16b, v10.16b, v12.16b
+ sub v10.16b, v10.16b, v15.16b
tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b
- sub v11.16b, v11.16b, v12.16b
+ sub v11.16b, v11.16b, v15.16b
tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b
- sub v8.16b, v8.16b, v12.16b
+ sub v8.16b, v8.16b, v15.16b
tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b
- sub v9.16b, v9.16b, v12.16b
+ sub v9.16b, v9.16b, v15.16b
tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b
- sub v10.16b, v10.16b, v12.16b
+ sub v10.16b, v10.16b, v15.16b
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
- sub v11.16b, v11.16b, v12.16b
+ sub v11.16b, v11.16b, v15.16b
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b
tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b
.endm
.macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
- sshr \tmp0\().16b, \in0\().16b, #7
- add \out0\().16b, \in0\().16b, \in0\().16b
- sshr \tmp1\().16b, \in1\().16b, #7
+ sshr \tmp0\().16b, \in0\().16b, #7
+ shl \out0\().16b, \in0\().16b, #1
+ sshr \tmp1\().16b, \in1\().16b, #7
and \tmp0\().16b, \tmp0\().16b, \const\().16b
- add \out1\().16b, \in1\().16b, \in1\().16b
+ shl \out1\().16b, \in1\().16b, #1
and \tmp1\().16b, \tmp1\().16b, \const\().16b
eor \out0\().16b, \out0\().16b, \tmp0\().16b
eor \out1\().16b, \out1\().16b, \tmp1\().16b
.endm
- .macro mix_columns_2x, in0, in1
- mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
- rev32 v10.8h, \in0\().8h
- rev32 v11.8h, \in1\().8h
- eor \in0\().16b, v8.16b, \in0\().16b
- eor \in1\().16b, v9.16b, \in1\().16b
- shl v12.4s, v10.4s, #24
- shl v13.4s, v11.4s, #24
- eor v8.16b, v8.16b, v10.16b
- sri v12.4s, v10.4s, #8
- shl v10.4s, \in0\().4s, #24
- eor v9.16b, v9.16b, v11.16b
- sri v13.4s, v11.4s, #8
- shl v11.4s, \in1\().4s, #24
- sri v10.4s, \in0\().4s, #8
- eor \in0\().16b, v8.16b, v12.16b
- sri v11.4s, \in1\().4s, #8
- eor \in1\().16b, v9.16b, v13.16b
- eor \in0\().16b, v10.16b, \in0\().16b
- eor \in1\().16b, v11.16b, \in1\().16b
+ .macro mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
+ ushr \tmp0\().16b, \in0\().16b, #6
+ shl \out0\().16b, \in0\().16b, #2
+ ushr \tmp1\().16b, \in1\().16b, #6
+ pmul \tmp0\().16b, \tmp0\().16b, \const\().16b
+ shl \out1\().16b, \in1\().16b, #2
+ pmul \tmp1\().16b, \tmp1\().16b, \const\().16b
+ eor \out0\().16b, \out0\().16b, \tmp0\().16b
+ eor \out1\().16b, \out1\().16b, \tmp1\().16b
.endm
- .macro inv_mix_cols_2x, in0, in1
- mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
- mul_by_x_2x v8, v9, v8, v9, v10, v11, v14
+ .macro mix_columns_2x, in0, in1, enc
+ .if \enc == 0
+ /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+ mul_by_x2_2x v8, v9, \in0, \in1, v10, v11, v12
eor \in0\().16b, \in0\().16b, v8.16b
- eor \in1\().16b, \in1\().16b, v9.16b
rev32 v8.8h, v8.8h
- rev32 v9.8h, v9.8h
- eor \in0\().16b, \in0\().16b, v8.16b
- eor \in1\().16b, \in1\().16b, v9.16b
- mix_columns_2x \in0, \in1
- .endm
-
- .macro inv_mix_cols_4x, in0, in1, in2, in3
- mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
- mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14
- mul_by_x_2x v8, v9, v8, v9, v12, v13, v14
- mul_by_x_2x v10, v11, v10, v11, v12, v13, v14
- eor \in0\().16b, \in0\().16b, v8.16b
eor \in1\().16b, \in1\().16b, v9.16b
- eor \in2\().16b, \in2\().16b, v10.16b
- eor \in3\().16b, \in3\().16b, v11.16b
- rev32 v8.8h, v8.8h
rev32 v9.8h, v9.8h
- rev32 v10.8h, v10.8h
- rev32 v11.8h, v11.8h
eor \in0\().16b, \in0\().16b, v8.16b
eor \in1\().16b, \in1\().16b, v9.16b
- eor \in2\().16b, \in2\().16b, v10.16b
- eor \in3\().16b, \in3\().16b, v11.16b
- mix_columns_2x \in0, \in1
- mix_columns_2x \in2, \in3
+ .endif
+
+ mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12
+ rev32 v10.8h, \in0\().8h
+ rev32 v11.8h, \in1\().8h
+ eor v10.16b, v10.16b, v8.16b
+ eor v11.16b, v11.16b, v9.16b
+ eor \in0\().16b, \in0\().16b, v10.16b
+ eor \in1\().16b, \in1\().16b, v11.16b
+ tbl \in0\().16b, {\in0\().16b}, v14.16b
+ tbl \in1\().16b, {\in1\().16b}, v14.16b
+ eor \in0\().16b, \in0\().16b, v10.16b
+ eor \in1\().16b, \in1\().16b, v11.16b
.endm
- .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+ .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
ld1 {v15.4s}, [\rk]
add \rkp, \rk, #16
mov \i, \rounds
1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
- sub_bytes_2x \in0, \in1
+ movi v15.16b, #0x40
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
- ld1 {v15.4s}, [\rkp], #16
+ sub_bytes_2x \in0, \in1
subs \i, \i, #1
+ ld1 {v15.4s}, [\rkp], #16
beq 2222f
- .if \enc == 1
- mix_columns_2x \in0, \in1
- ldr q13, .LForward_ShiftRows
- .else
- inv_mix_cols_2x \in0, \in1
- ldr q13, .LReverse_ShiftRows
- .endif
- movi v12.16b, #0x40
+ mix_columns_2x \in0, \in1, \enc
b 1111b
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
@@ -262,23 +236,17 @@
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
- sub_bytes_4x \in0, \in1, \in2, \in3
+ movi v15.16b, #0x40
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
- ld1 {v15.4s}, [\rkp], #16
+ sub_bytes_4x \in0, \in1, \in2, \in3
subs \i, \i, #1
+ ld1 {v15.4s}, [\rkp], #16
beq 2222f
- .if \enc == 1
- mix_columns_2x \in0, \in1
- mix_columns_2x \in2, \in3
- ldr q13, .LForward_ShiftRows
- .else
- inv_mix_cols_4x \in0, \in1, \in2, \in3
- ldr q13, .LReverse_ShiftRows
- .endif
- movi v12.16b, #0x40
+ mix_columns_2x \in0, \in1, \enc
+ mix_columns_2x \in2, \in3, \enc
b 1111b
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
@@ -305,19 +273,7 @@
#include "aes-modes.S"
.text
- .align 4
-.LForward_ShiftRows:
-CPU_LE( .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 )
-CPU_LE( .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb )
-CPU_BE( .byte 0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8 )
-CPU_BE( .byte 0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0 )
-
-.LReverse_ShiftRows:
-CPU_LE( .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb )
-CPU_LE( .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 )
-CPU_BE( .byte 0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8 )
-CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 )
-
+ .align 6
.LForward_Sbox:
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
@@ -385,3 +341,12 @@ CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 )
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+.LForward_ShiftRows:
+ .octa 0x0b06010c07020d08030e09040f0a0500
+
+.LReverse_ShiftRows:
+ .octa 0x0306090c0f0205080b0e0104070a0d00
+
+.Lror32by8:
+ .octa 0x0c0f0e0d080b0a090407060500030201
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
new file mode 100644
index 000000000000..ca0472500433
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -0,0 +1,972 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail by the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ rounds .req x11
+ bskey .req x12
+
+ .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+ eor \b2, \b2, \b1
+ eor \b5, \b5, \b6
+ eor \b3, \b3, \b0
+ eor \b6, \b6, \b2
+ eor \b5, \b5, \b0
+ eor \b6, \b6, \b3
+ eor \b3, \b3, \b7
+ eor \b7, \b7, \b5
+ eor \b3, \b3, \b4
+ eor \b4, \b4, \b5
+ eor \b2, \b2, \b7
+ eor \b3, \b3, \b1
+ eor \b1, \b1, \b5
+ .endm
+
+ .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+ eor \b0, \b0, \b6
+ eor \b1, \b1, \b4
+ eor \b4, \b4, \b6
+ eor \b2, \b2, \b0
+ eor \b6, \b6, \b1
+ eor \b1, \b1, \b5
+ eor \b5, \b5, \b3
+ eor \b3, \b3, \b7
+ eor \b7, \b7, \b5
+ eor \b2, \b2, \b5
+ eor \b4, \b4, \b7
+ .endm
+
+ .macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
+ eor \b1, \b1, \b7
+ eor \b4, \b4, \b7
+ eor \b7, \b7, \b5
+ eor \b1, \b1, \b3
+ eor \b2, \b2, \b5
+ eor \b3, \b3, \b7
+ eor \b6, \b6, \b1
+ eor \b2, \b2, \b0
+ eor \b5, \b5, \b3
+ eor \b4, \b4, \b6
+ eor \b0, \b0, \b6
+ eor \b1, \b1, \b4
+ .endm
+
+ .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
+ eor \b1, \b1, \b5
+ eor \b2, \b2, \b7
+ eor \b3, \b3, \b1
+ eor \b4, \b4, \b5
+ eor \b7, \b7, \b5
+ eor \b3, \b3, \b4
+ eor \b5, \b5, \b0
+ eor \b3, \b3, \b7
+ eor \b6, \b6, \b2
+ eor \b2, \b2, \b1
+ eor \b6, \b6, \b3
+ eor \b3, \b3, \b0
+ eor \b5, \b5, \b6
+ .endm
+
+ .macro mul_gf4, x0, x1, y0, y1, t0, t1
+ eor \t0, \y0, \y1
+ and \t0, \t0, \x0
+ eor \x0, \x0, \x1
+ and \t1, \x1, \y0
+ and \x0, \x0, \y1
+ eor \x1, \t1, \t0
+ eor \x0, \x0, \t1
+ .endm
+
+ .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
+ eor \t0, \y0, \y1
+ eor \t1, \y2, \y3
+ and \t0, \t0, \x0
+ and \t1, \t1, \x2
+ eor \x0, \x0, \x1
+ eor \x2, \x2, \x3
+ and \x1, \x1, \y0
+ and \x3, \x3, \y2
+ and \x0, \x0, \y1
+ and \x2, \x2, \y3
+ eor \x1, \x1, \x0
+ eor \x2, \x2, \x3
+ eor \x0, \x0, \t0
+ eor \x3, \x3, \t1
+ .endm
+
+ .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, t0, t1, t2, t3
+ eor \t0, \x0, \x2
+ eor \t1, \x1, \x3
+ mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
+ eor \y0, \y0, \y2
+ eor \y1, \y1, \y3
+ mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+ eor \x0, \x0, \t0
+ eor \x2, \x2, \t0
+ eor \x1, \x1, \t1
+ eor \x3, \x3, \t1
+ eor \t0, \x4, \x6
+ eor \t1, \x5, \x7
+ mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+ eor \y0, \y0, \y2
+ eor \y1, \y1, \y3
+ mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
+ eor \x4, \x4, \t0
+ eor \x6, \x6, \t0
+ eor \x5, \x5, \t1
+ eor \x7, \x7, \t1
+ .endm
+
+ .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+ t0, t1, t2, t3, s0, s1, s2, s3
+ eor \t3, \x4, \x6
+ eor \t0, \x5, \x7
+ eor \t1, \x1, \x3
+ eor \s1, \x7, \x6
+ eor \s0, \x0, \x2
+ eor \s3, \t3, \t0
+ orr \t2, \t0, \t1
+ and \s2, \t3, \s0
+ orr \t3, \t3, \s0
+ eor \s0, \s0, \t1
+ and \t0, \t0, \t1
+ eor \t1, \x3, \x2
+ and \s3, \s3, \s0
+ and \s1, \s1, \t1
+ eor \t1, \x4, \x5
+ eor \s0, \x1, \x0
+ eor \t3, \t3, \s1
+ eor \t2, \t2, \s1
+ and \s1, \t1, \s0
+ orr \t1, \t1, \s0
+ eor \t3, \t3, \s3
+ eor \t0, \t0, \s1
+ eor \t2, \t2, \s2
+ eor \t1, \t1, \s3
+ eor \t0, \t0, \s2
+ and \s0, \x7, \x3
+ eor \t1, \t1, \s2
+ and \s1, \x6, \x2
+ and \s2, \x5, \x1
+ orr \s3, \x4, \x0
+ eor \t3, \t3, \s0
+ eor \t1, \t1, \s2
+ eor \s0, \t0, \s3
+ eor \t2, \t2, \s1
+ and \s2, \t3, \t1
+ eor \s1, \t2, \s2
+ eor \s3, \s0, \s2
+ bsl \s1, \t1, \s0
+ not \t0, \s0
+ bsl \s0, \s1, \s3
+ bsl \t0, \s1, \s3
+ bsl \s3, \t3, \t2
+ eor \t3, \t3, \t2
+ and \s2, \s0, \s3
+ eor \t1, \t1, \t0
+ eor \s2, \s2, \t3
+ mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+ \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ .endm
+
+ .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+ t0, t1, t2, t3, s0, s1, s2, s3
+ in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+ \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+ inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
+ \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+ \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+ \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+ out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+ \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
+ .endm
+
+ .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+ t0, t1, t2, t3, s0, s1, s2, s3
+ inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+ \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+ inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
+ \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+ \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+ \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+ inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+ \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
+ .endm
+
+ .macro enc_next_rk
+ ldp q16, q17, [bskey], #128
+ ldp q18, q19, [bskey, #-96]
+ ldp q20, q21, [bskey, #-64]
+ ldp q22, q23, [bskey, #-32]
+ .endm
+
+ .macro dec_next_rk
+ ldp q16, q17, [bskey, #-128]!
+ ldp q18, q19, [bskey, #32]
+ ldp q20, q21, [bskey, #64]
+ ldp q22, q23, [bskey, #96]
+ .endm
+
+ .macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
+ eor \x0\().16b, \x0\().16b, v16.16b
+ eor \x1\().16b, \x1\().16b, v17.16b
+ eor \x2\().16b, \x2\().16b, v18.16b
+ eor \x3\().16b, \x3\().16b, v19.16b
+ eor \x4\().16b, \x4\().16b, v20.16b
+ eor \x5\().16b, \x5\().16b, v21.16b
+ eor \x6\().16b, \x6\().16b, v22.16b
+ eor \x7\().16b, \x7\().16b, v23.16b
+ .endm
+
+ .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
+ tbl \x0\().16b, {\x0\().16b}, \mask\().16b
+ tbl \x1\().16b, {\x1\().16b}, \mask\().16b
+ tbl \x2\().16b, {\x2\().16b}, \mask\().16b
+ tbl \x3\().16b, {\x3\().16b}, \mask\().16b
+ tbl \x4\().16b, {\x4\().16b}, \mask\().16b
+ tbl \x5\().16b, {\x5\().16b}, \mask\().16b
+ tbl \x6\().16b, {\x6\().16b}, \mask\().16b
+ tbl \x7\().16b, {\x7\().16b}, \mask\().16b
+ .endm
+
+ .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+ t0, t1, t2, t3, t4, t5, t6, t7, inv
+ ext \t0\().16b, \x0\().16b, \x0\().16b, #12
+ ext \t1\().16b, \x1\().16b, \x1\().16b, #12
+ eor \x0\().16b, \x0\().16b, \t0\().16b
+ ext \t2\().16b, \x2\().16b, \x2\().16b, #12
+ eor \x1\().16b, \x1\().16b, \t1\().16b
+ ext \t3\().16b, \x3\().16b, \x3\().16b, #12
+ eor \x2\().16b, \x2\().16b, \t2\().16b
+ ext \t4\().16b, \x4\().16b, \x4\().16b, #12
+ eor \x3\().16b, \x3\().16b, \t3\().16b
+ ext \t5\().16b, \x5\().16b, \x5\().16b, #12
+ eor \x4\().16b, \x4\().16b, \t4\().16b
+ ext \t6\().16b, \x6\().16b, \x6\().16b, #12
+ eor \x5\().16b, \x5\().16b, \t5\().16b
+ ext \t7\().16b, \x7\().16b, \x7\().16b, #12
+ eor \x6\().16b, \x6\().16b, \t6\().16b
+ eor \t1\().16b, \t1\().16b, \x0\().16b
+ eor \x7\().16b, \x7\().16b, \t7\().16b
+ ext \x0\().16b, \x0\().16b, \x0\().16b, #8
+ eor \t2\().16b, \t2\().16b, \x1\().16b
+ eor \t0\().16b, \t0\().16b, \x7\().16b
+ eor \t1\().16b, \t1\().16b, \x7\().16b
+ ext \x1\().16b, \x1\().16b, \x1\().16b, #8
+ eor \t5\().16b, \t5\().16b, \x4\().16b
+ eor \x0\().16b, \x0\().16b, \t0\().16b
+ eor \t6\().16b, \t6\().16b, \x5\().16b
+ eor \x1\().16b, \x1\().16b, \t1\().16b
+ ext \t0\().16b, \x4\().16b, \x4\().16b, #8
+ eor \t4\().16b, \t4\().16b, \x3\().16b
+ ext \t1\().16b, \x5\().16b, \x5\().16b, #8
+ eor \t7\().16b, \t7\().16b, \x6\().16b
+ ext \x4\().16b, \x3\().16b, \x3\().16b, #8
+ eor \t3\().16b, \t3\().16b, \x2\().16b
+ ext \x5\().16b, \x7\().16b, \x7\().16b, #8
+ eor \t4\().16b, \t4\().16b, \x7\().16b
+ ext \x3\().16b, \x6\().16b, \x6\().16b, #8
+ eor \t3\().16b, \t3\().16b, \x7\().16b
+ ext \x6\().16b, \x2\().16b, \x2\().16b, #8
+ eor \x7\().16b, \t1\().16b, \t5\().16b
+ .ifb \inv
+ eor \x2\().16b, \t0\().16b, \t4\().16b
+ eor \x4\().16b, \x4\().16b, \t3\().16b
+ eor \x5\().16b, \x5\().16b, \t7\().16b
+ eor \x3\().16b, \x3\().16b, \t6\().16b
+ eor \x6\().16b, \x6\().16b, \t2\().16b
+ .else
+ eor \t3\().16b, \t3\().16b, \x4\().16b
+ eor \x5\().16b, \x5\().16b, \t7\().16b
+ eor \x2\().16b, \x3\().16b, \t6\().16b
+ eor \x3\().16b, \t0\().16b, \t4\().16b
+ eor \x4\().16b, \x6\().16b, \t2\().16b
+ mov \x6\().16b, \t3\().16b
+ .endif
+ .endm
+
+ .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+ t0, t1, t2, t3, t4, t5, t6, t7
+ ext \t0\().16b, \x0\().16b, \x0\().16b, #8
+ ext \t6\().16b, \x6\().16b, \x6\().16b, #8
+ ext \t7\().16b, \x7\().16b, \x7\().16b, #8
+ eor \t0\().16b, \t0\().16b, \x0\().16b
+ ext \t1\().16b, \x1\().16b, \x1\().16b, #8
+ eor \t6\().16b, \t6\().16b, \x6\().16b
+ ext \t2\().16b, \x2\().16b, \x2\().16b, #8
+ eor \t7\().16b, \t7\().16b, \x7\().16b
+ ext \t3\().16b, \x3\().16b, \x3\().16b, #8
+ eor \t1\().16b, \t1\().16b, \x1\().16b
+ ext \t4\().16b, \x4\().16b, \x4\().16b, #8
+ eor \t2\().16b, \t2\().16b, \x2\().16b
+ ext \t5\().16b, \x5\().16b, \x5\().16b, #8
+ eor \t3\().16b, \t3\().16b, \x3\().16b
+ eor \t4\().16b, \t4\().16b, \x4\().16b
+ eor \t5\().16b, \t5\().16b, \x5\().16b
+ eor \x0\().16b, \x0\().16b, \t6\().16b
+ eor \x1\().16b, \x1\().16b, \t6\().16b
+ eor \x2\().16b, \x2\().16b, \t0\().16b
+ eor \x4\().16b, \x4\().16b, \t2\().16b
+ eor \x3\().16b, \x3\().16b, \t1\().16b
+ eor \x1\().16b, \x1\().16b, \t7\().16b
+ eor \x2\().16b, \x2\().16b, \t7\().16b
+ eor \x4\().16b, \x4\().16b, \t6\().16b
+ eor \x5\().16b, \x5\().16b, \t3\().16b
+ eor \x3\().16b, \x3\().16b, \t6\().16b
+ eor \x6\().16b, \x6\().16b, \t4\().16b
+ eor \x4\().16b, \x4\().16b, \t7\().16b
+ eor \x5\().16b, \x5\().16b, \t7\().16b
+ eor \x7\().16b, \x7\().16b, \t5\().16b
+ mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+ \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
+ .endm
+
+ .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
+ ushr \t0\().2d, \b0\().2d, #\n
+ ushr \t1\().2d, \b1\().2d, #\n
+ eor \t0\().16b, \t0\().16b, \a0\().16b
+ eor \t1\().16b, \t1\().16b, \a1\().16b
+ and \t0\().16b, \t0\().16b, \mask\().16b
+ and \t1\().16b, \t1\().16b, \mask\().16b
+ eor \a0\().16b, \a0\().16b, \t0\().16b
+ shl \t0\().2d, \t0\().2d, #\n
+ eor \a1\().16b, \a1\().16b, \t1\().16b
+ shl \t1\().2d, \t1\().2d, #\n
+ eor \b0\().16b, \b0\().16b, \t0\().16b
+ eor \b1\().16b, \b1\().16b, \t1\().16b
+ .endm
+
+ .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
+ movi \t0\().16b, #0x55
+ movi \t1\().16b, #0x33
+ swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
+ swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+ movi \t0\().16b, #0x0f
+ swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
+ swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+ swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
+ swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+ .endm
+
+
+ .align 6
+M0: .octa 0x0004080c0105090d02060a0e03070b0f
+
+M0SR: .octa 0x0004080c05090d010a0e02060f03070b
+SR: .octa 0x0f0e0d0c0a09080b0504070600030201
+SRM0: .octa 0x01060b0c0207080d0304090e00050a0f
+
+M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03
+ISR: .octa 0x0f0e0d0c080b0a090504070602010003
+ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f
+
+ /*
+ * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
+ */
+ENTRY(aesbs_convert_key)
+ ld1 {v7.4s}, [x1], #16 // load round 0 key
+ ld1 {v17.4s}, [x1], #16 // load round 1 key
+
+ movi v8.16b, #0x01 // bit masks
+ movi v9.16b, #0x02
+ movi v10.16b, #0x04
+ movi v11.16b, #0x08
+ movi v12.16b, #0x10
+ movi v13.16b, #0x20
+ movi v14.16b, #0x40
+ movi v15.16b, #0x80
+ ldr q16, M0
+
+ sub x2, x2, #1
+ str q7, [x0], #16 // save round 0 key
+
+.Lkey_loop:
+ tbl v7.16b ,{v17.16b}, v16.16b
+ ld1 {v17.4s}, [x1], #16 // load next round key
+
+ cmtst v0.16b, v7.16b, v8.16b
+ cmtst v1.16b, v7.16b, v9.16b
+ cmtst v2.16b, v7.16b, v10.16b
+ cmtst v3.16b, v7.16b, v11.16b
+ cmtst v4.16b, v7.16b, v12.16b
+ cmtst v5.16b, v7.16b, v13.16b
+ cmtst v6.16b, v7.16b, v14.16b
+ cmtst v7.16b, v7.16b, v15.16b
+ not v0.16b, v0.16b
+ not v1.16b, v1.16b
+ not v5.16b, v5.16b
+ not v6.16b, v6.16b
+
+ subs x2, x2, #1
+ stp q0, q1, [x0], #128
+ stp q2, q3, [x0, #-96]
+ stp q4, q5, [x0, #-64]
+ stp q6, q7, [x0, #-32]
+ b.ne .Lkey_loop
+
+ movi v7.16b, #0x63 // compose .L63
+ eor v17.16b, v17.16b, v7.16b
+ str q17, [x0]
+ ret
+ENDPROC(aesbs_convert_key)
+
+ .align 4
+aesbs_encrypt8:
+ ldr q9, [bskey], #16 // round 0 key
+ ldr q8, M0SR
+ ldr q24, SR
+
+ eor v10.16b, v0.16b, v9.16b // xor with round0 key
+ eor v11.16b, v1.16b, v9.16b
+ tbl v0.16b, {v10.16b}, v8.16b
+ eor v12.16b, v2.16b, v9.16b
+ tbl v1.16b, {v11.16b}, v8.16b
+ eor v13.16b, v3.16b, v9.16b
+ tbl v2.16b, {v12.16b}, v8.16b
+ eor v14.16b, v4.16b, v9.16b
+ tbl v3.16b, {v13.16b}, v8.16b
+ eor v15.16b, v5.16b, v9.16b
+ tbl v4.16b, {v14.16b}, v8.16b
+ eor v10.16b, v6.16b, v9.16b
+ tbl v5.16b, {v15.16b}, v8.16b
+ eor v11.16b, v7.16b, v9.16b
+ tbl v6.16b, {v10.16b}, v8.16b
+ tbl v7.16b, {v11.16b}, v8.16b
+
+ bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+ sub rounds, rounds, #1
+ b .Lenc_sbox
+
+.Lenc_loop:
+ shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Lenc_sbox:
+ sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+ subs rounds, rounds, #1
+ b.cc .Lenc_done
+
+ enc_next_rk
+
+ mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+
+ add_round_key v0, v1, v2, v3, v4, v5, v6, v7
+
+ b.ne .Lenc_loop
+ ldr q24, SRM0
+ b .Lenc_loop
+
+.Lenc_done:
+ ldr q12, [bskey] // last round key
+
+ bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
+
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v3.16b, v3.16b, v12.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v2.16b, v2.16b, v12.16b
+ eor v5.16b, v5.16b, v12.16b
+ ret
+ENDPROC(aesbs_encrypt8)
+
+ .align 4
+aesbs_decrypt8:
+ lsl x9, rounds, #7
+ add bskey, bskey, x9
+
+ ldr q9, [bskey, #-112]! // round 0 key
+ ldr q8, M0ISR
+ ldr q24, ISR
+
+ eor v10.16b, v0.16b, v9.16b // xor with round0 key
+ eor v11.16b, v1.16b, v9.16b
+ tbl v0.16b, {v10.16b}, v8.16b
+ eor v12.16b, v2.16b, v9.16b
+ tbl v1.16b, {v11.16b}, v8.16b
+ eor v13.16b, v3.16b, v9.16b
+ tbl v2.16b, {v12.16b}, v8.16b
+ eor v14.16b, v4.16b, v9.16b
+ tbl v3.16b, {v13.16b}, v8.16b
+ eor v15.16b, v5.16b, v9.16b
+ tbl v4.16b, {v14.16b}, v8.16b
+ eor v10.16b, v6.16b, v9.16b
+ tbl v5.16b, {v15.16b}, v8.16b
+ eor v11.16b, v7.16b, v9.16b
+ tbl v6.16b, {v10.16b}, v8.16b
+ tbl v7.16b, {v11.16b}, v8.16b
+
+ bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+ sub rounds, rounds, #1
+ b .Ldec_sbox
+
+.Ldec_loop:
+ shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Ldec_sbox:
+ inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+ subs rounds, rounds, #1
+ b.cc .Ldec_done
+
+ dec_next_rk
+
+ add_round_key v0, v1, v6, v4, v2, v7, v3, v5
+
+ inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+
+ b.ne .Ldec_loop
+ ldr q24, ISRM0
+ b .Ldec_loop
+.Ldec_done:
+ ldr q12, [bskey, #-16] // last round key
+
+ bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
+
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v2.16b, v2.16b, v12.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v3.16b, v3.16b, v12.16b
+ eor v5.16b, v5.16b, v12.16b
+ ret
+ENDPROC(aesbs_decrypt8)
+
+ /*
+ * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks)
+ * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks)
+ */
+ .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+99: mov x5, #1
+ lsl x5, x5, x4
+ subs w4, w4, #8
+ csel x4, x4, xzr, pl
+ csel x5, x5, xzr, mi
+
+ ld1 {v0.16b}, [x1], #16
+ tbnz x5, #1, 0f
+ ld1 {v1.16b}, [x1], #16
+ tbnz x5, #2, 0f
+ ld1 {v2.16b}, [x1], #16
+ tbnz x5, #3, 0f
+ ld1 {v3.16b}, [x1], #16
+ tbnz x5, #4, 0f
+ ld1 {v4.16b}, [x1], #16
+ tbnz x5, #5, 0f
+ ld1 {v5.16b}, [x1], #16
+ tbnz x5, #6, 0f
+ ld1 {v6.16b}, [x1], #16
+ tbnz x5, #7, 0f
+ ld1 {v7.16b}, [x1], #16
+
+0: mov bskey, x2
+ mov rounds, x3
+ bl \do8
+
+ st1 {\o0\().16b}, [x0], #16
+ tbnz x5, #1, 1f
+ st1 {\o1\().16b}, [x0], #16
+ tbnz x5, #2, 1f
+ st1 {\o2\().16b}, [x0], #16
+ tbnz x5, #3, 1f
+ st1 {\o3\().16b}, [x0], #16
+ tbnz x5, #4, 1f
+ st1 {\o4\().16b}, [x0], #16
+ tbnz x5, #5, 1f
+ st1 {\o5\().16b}, [x0], #16
+ tbnz x5, #6, 1f
+ st1 {\o6\().16b}, [x0], #16
+ tbnz x5, #7, 1f
+ st1 {\o7\().16b}, [x0], #16
+
+ cbnz x4, 99b
+
+1: ldp x29, x30, [sp], #16
+ ret
+ .endm
+
+ .align 4
+ENTRY(aesbs_ecb_encrypt)
+ __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_ecb_encrypt)
+
+ .align 4
+ENTRY(aesbs_ecb_decrypt)
+ __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_ecb_decrypt)
+
+ /*
+ * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 iv[])
+ */
+ .align 4
+ENTRY(aesbs_cbc_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+99: mov x6, #1
+ lsl x6, x6, x4
+ subs w4, w4, #8
+ csel x4, x4, xzr, pl
+ csel x6, x6, xzr, mi
+
+ ld1 {v0.16b}, [x1], #16
+ mov v25.16b, v0.16b
+ tbnz x6, #1, 0f
+ ld1 {v1.16b}, [x1], #16
+ mov v26.16b, v1.16b
+ tbnz x6, #2, 0f
+ ld1 {v2.16b}, [x1], #16
+ mov v27.16b, v2.16b
+ tbnz x6, #3, 0f
+ ld1 {v3.16b}, [x1], #16
+ mov v28.16b, v3.16b
+ tbnz x6, #4, 0f
+ ld1 {v4.16b}, [x1], #16
+ mov v29.16b, v4.16b
+ tbnz x6, #5, 0f
+ ld1 {v5.16b}, [x1], #16
+ mov v30.16b, v5.16b
+ tbnz x6, #6, 0f
+ ld1 {v6.16b}, [x1], #16
+ mov v31.16b, v6.16b
+ tbnz x6, #7, 0f
+ ld1 {v7.16b}, [x1]
+
+0: mov bskey, x2
+ mov rounds, x3
+ bl aesbs_decrypt8
+
+ ld1 {v24.16b}, [x5] // load IV
+
+ eor v1.16b, v1.16b, v25.16b
+ eor v6.16b, v6.16b, v26.16b
+ eor v4.16b, v4.16b, v27.16b
+ eor v2.16b, v2.16b, v28.16b
+ eor v7.16b, v7.16b, v29.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v3.16b, v3.16b, v30.16b
+ eor v5.16b, v5.16b, v31.16b
+
+ st1 {v0.16b}, [x0], #16
+ mov v24.16b, v25.16b
+ tbnz x6, #1, 1f
+ st1 {v1.16b}, [x0], #16
+ mov v24.16b, v26.16b
+ tbnz x6, #2, 1f
+ st1 {v6.16b}, [x0], #16
+ mov v24.16b, v27.16b
+ tbnz x6, #3, 1f
+ st1 {v4.16b}, [x0], #16
+ mov v24.16b, v28.16b
+ tbnz x6, #4, 1f
+ st1 {v2.16b}, [x0], #16
+ mov v24.16b, v29.16b
+ tbnz x6, #5, 1f
+ st1 {v7.16b}, [x0], #16
+ mov v24.16b, v30.16b
+ tbnz x6, #6, 1f
+ st1 {v3.16b}, [x0], #16
+ mov v24.16b, v31.16b
+ tbnz x6, #7, 1f
+ ld1 {v24.16b}, [x1], #16
+ st1 {v5.16b}, [x0], #16
+1: st1 {v24.16b}, [x5] // store IV
+
+ cbnz x4, 99b
+
+ ldp x29, x30, [sp], #16
+ ret
+ENDPROC(aesbs_cbc_decrypt)
+
+ .macro next_tweak, out, in, const, tmp
+ sshr \tmp\().2d, \in\().2d, #63
+ and \tmp\().16b, \tmp\().16b, \const\().16b
+ add \out\().2d, \in\().2d, \in\().2d
+ ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+ eor \out\().16b, \out\().16b, \tmp\().16b
+ .endm
+
+ .align 4
+.Lxts_mul_x:
+CPU_LE( .quad 1, 0x87 )
+CPU_BE( .quad 0x87, 1 )
+
+ /*
+ * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 iv[])
+ * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 iv[])
+ */
+__xts_crypt8:
+ mov x6, #1
+ lsl x6, x6, x4
+ subs w4, w4, #8
+ csel x4, x4, xzr, pl
+ csel x6, x6, xzr, mi
+
+ ld1 {v0.16b}, [x1], #16
+ next_tweak v26, v25, v30, v31
+ eor v0.16b, v0.16b, v25.16b
+ tbnz x6, #1, 0f
+
+ ld1 {v1.16b}, [x1], #16
+ next_tweak v27, v26, v30, v31
+ eor v1.16b, v1.16b, v26.16b
+ tbnz x6, #2, 0f
+
+ ld1 {v2.16b}, [x1], #16
+ next_tweak v28, v27, v30, v31
+ eor v2.16b, v2.16b, v27.16b
+ tbnz x6, #3, 0f
+
+ ld1 {v3.16b}, [x1], #16
+ next_tweak v29, v28, v30, v31
+ eor v3.16b, v3.16b, v28.16b
+ tbnz x6, #4, 0f
+
+ ld1 {v4.16b}, [x1], #16
+ str q29, [sp, #16]
+ eor v4.16b, v4.16b, v29.16b
+ next_tweak v29, v29, v30, v31
+ tbnz x6, #5, 0f
+
+ ld1 {v5.16b}, [x1], #16
+ str q29, [sp, #32]
+ eor v5.16b, v5.16b, v29.16b
+ next_tweak v29, v29, v30, v31
+ tbnz x6, #6, 0f
+
+ ld1 {v6.16b}, [x1], #16
+ str q29, [sp, #48]
+ eor v6.16b, v6.16b, v29.16b
+ next_tweak v29, v29, v30, v31
+ tbnz x6, #7, 0f
+
+ ld1 {v7.16b}, [x1], #16
+ str q29, [sp, #64]
+ eor v7.16b, v7.16b, v29.16b
+ next_tweak v29, v29, v30, v31
+
+0: mov bskey, x2
+ mov rounds, x3
+ br x7
+ENDPROC(__xts_crypt8)
+
+ .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+ stp x29, x30, [sp, #-80]!
+ mov x29, sp
+
+ ldr q30, .Lxts_mul_x
+ ld1 {v25.16b}, [x5]
+
+99: adr x7, \do8
+ bl __xts_crypt8
+
+ ldp q16, q17, [sp, #16]
+ ldp q18, q19, [sp, #48]
+
+ eor \o0\().16b, \o0\().16b, v25.16b
+ eor \o1\().16b, \o1\().16b, v26.16b
+ eor \o2\().16b, \o2\().16b, v27.16b
+ eor \o3\().16b, \o3\().16b, v28.16b
+
+ st1 {\o0\().16b}, [x0], #16
+ mov v25.16b, v26.16b
+ tbnz x6, #1, 1f
+ st1 {\o1\().16b}, [x0], #16
+ mov v25.16b, v27.16b
+ tbnz x6, #2, 1f
+ st1 {\o2\().16b}, [x0], #16
+ mov v25.16b, v28.16b
+ tbnz x6, #3, 1f
+ st1 {\o3\().16b}, [x0], #16
+ mov v25.16b, v29.16b
+ tbnz x6, #4, 1f
+
+ eor \o4\().16b, \o4\().16b, v16.16b
+ eor \o5\().16b, \o5\().16b, v17.16b
+ eor \o6\().16b, \o6\().16b, v18.16b
+ eor \o7\().16b, \o7\().16b, v19.16b
+
+ st1 {\o4\().16b}, [x0], #16
+ tbnz x6, #5, 1f
+ st1 {\o5\().16b}, [x0], #16
+ tbnz x6, #6, 1f
+ st1 {\o6\().16b}, [x0], #16
+ tbnz x6, #7, 1f
+ st1 {\o7\().16b}, [x0], #16
+
+ cbnz x4, 99b
+
+1: st1 {v25.16b}, [x5]
+ ldp x29, x30, [sp], #80
+ ret
+ .endm
+
+ENTRY(aesbs_xts_encrypt)
+ __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+ __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_xts_decrypt)
+
+ .macro next_ctr, v
+ mov \v\().d[1], x8
+ adds x8, x8, #1
+ mov \v\().d[0], x7
+ adc x7, x7, xzr
+ rev64 \v\().16b, \v\().16b
+ .endm
+
+ /*
+ * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ * int rounds, int blocks, u8 iv[], u8 final[])
+ */
+ENTRY(aesbs_ctr_encrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ cmp x6, #0
+ cset x10, ne
+ add x4, x4, x10 // do one extra block if final
+
+ ldp x7, x8, [x5]
+ ld1 {v0.16b}, [x5]
+CPU_LE( rev x7, x7 )
+CPU_LE( rev x8, x8 )
+ adds x8, x8, #1
+ adc x7, x7, xzr
+
+99: mov x9, #1
+ lsl x9, x9, x4
+ subs w4, w4, #8
+ csel x4, x4, xzr, pl
+ csel x9, x9, xzr, le
+
+ tbnz x9, #1, 0f
+ next_ctr v1
+ tbnz x9, #2, 0f
+ next_ctr v2
+ tbnz x9, #3, 0f
+ next_ctr v3
+ tbnz x9, #4, 0f
+ next_ctr v4
+ tbnz x9, #5, 0f
+ next_ctr v5
+ tbnz x9, #6, 0f
+ next_ctr v6
+ tbnz x9, #7, 0f
+ next_ctr v7
+
+0: mov bskey, x2
+ mov rounds, x3
+ bl aesbs_encrypt8
+
+ lsr x9, x9, x10 // disregard the extra block
+ tbnz x9, #0, 0f
+
+ ld1 {v8.16b}, [x1], #16
+ eor v0.16b, v0.16b, v8.16b
+ st1 {v0.16b}, [x0], #16
+ tbnz x9, #1, 1f
+
+ ld1 {v9.16b}, [x1], #16
+ eor v1.16b, v1.16b, v9.16b
+ st1 {v1.16b}, [x0], #16
+ tbnz x9, #2, 2f
+
+ ld1 {v10.16b}, [x1], #16
+ eor v4.16b, v4.16b, v10.16b
+ st1 {v4.16b}, [x0], #16
+ tbnz x9, #3, 3f
+
+ ld1 {v11.16b}, [x1], #16
+ eor v6.16b, v6.16b, v11.16b
+ st1 {v6.16b}, [x0], #16
+ tbnz x9, #4, 4f
+
+ ld1 {v12.16b}, [x1], #16
+ eor v3.16b, v3.16b, v12.16b
+ st1 {v3.16b}, [x0], #16
+ tbnz x9, #5, 5f
+
+ ld1 {v13.16b}, [x1], #16
+ eor v7.16b, v7.16b, v13.16b
+ st1 {v7.16b}, [x0], #16
+ tbnz x9, #6, 6f
+
+ ld1 {v14.16b}, [x1], #16
+ eor v2.16b, v2.16b, v14.16b
+ st1 {v2.16b}, [x0], #16
+ tbnz x9, #7, 7f
+
+ ld1 {v15.16b}, [x1], #16
+ eor v5.16b, v5.16b, v15.16b
+ st1 {v5.16b}, [x0], #16
+
+8: next_ctr v0
+ cbnz x4, 99b
+
+0: st1 {v0.16b}, [x5]
+ ldp x29, x30, [sp], #16
+ ret
+
+ /*
+ * If we are handling the tail of the input (x6 != NULL), return the
+ * final keystream block back to the caller.
+ */
+1: cbz x6, 8b
+ st1 {v1.16b}, [x6]
+ b 8b
+2: cbz x6, 8b
+ st1 {v4.16b}, [x6]
+ b 8b
+3: cbz x6, 8b
+ st1 {v6.16b}, [x6]
+ b 8b
+4: cbz x6, 8b
+ st1 {v3.16b}, [x6]
+ b 8b
+5: cbz x6, 8b
+ st1 {v7.16b}, [x6]
+ b 8b
+6: cbz x6, 8b
+ st1 {v2.16b}, [x6]
+ b 8b
+7: cbz x6, 8b
+ st1 {v5.16b}, [x6]
+ b 8b
+ENDPROC(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
new file mode 100644
index 000000000000..db2501d93550
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -0,0 +1,439 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks);
+
+asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[]);
+
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[], u8 final[]);
+
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[]);
+
+/* borrowed from aes-neon-blk.ko */
+asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int blocks, int first);
+asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int blocks, u8 iv[],
+ int first);
+
+struct aesbs_ctx {
+ u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32];
+ int rounds;
+} __aligned(AES_BLOCK_SIZE);
+
+struct aesbs_cbc_ctx {
+ struct aesbs_ctx key;
+ u32 enc[AES_MAX_KEYLENGTH_U32];
+};
+
+struct aesbs_xts_ctx {
+ struct aesbs_ctx key;
+ u32 twkey[AES_MAX_KEYLENGTH_U32];
+};
+
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_aes_ctx rk;
+ int err;
+
+ err = crypto_aes_expand_key(&rk, in_key, key_len);
+ if (err)
+ return err;
+
+ ctx->rounds = 6 + key_len / 4;
+
+ kernel_neon_begin();
+ aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
+ kernel_neon_end();
+
+ return 0;
+}
+
+static int __ecb_crypt(struct skcipher_request *req,
+ void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks))
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_neon_begin();
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+ if (walk.nbytes < walk.total)
+ blocks = round_down(blocks,
+ walk.stride / AES_BLOCK_SIZE);
+
+ fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
+ ctx->rounds, blocks);
+ err = skcipher_walk_done(&walk,
+ walk.nbytes - blocks * AES_BLOCK_SIZE);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_aes_ctx rk;
+ int err;
+
+ err = crypto_aes_expand_key(&rk, in_key, key_len);
+ if (err)
+ return err;
+
+ ctx->key.rounds = 6 + key_len / 4;
+
+ memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
+
+ kernel_neon_begin();
+ aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
+ kernel_neon_end();
+
+ return 0;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ int err, first = 1;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_neon_begin();
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+ /* fall back to the non-bitsliced NEON implementation */
+ neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->enc, ctx->key.rounds, blocks, walk.iv,
+ first);
+ err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+ first = 0;
+ }
+ kernel_neon_end();
+ return err;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_neon_begin();
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+ if (walk.nbytes < walk.total)
+ blocks = round_down(blocks,
+ walk.stride / AES_BLOCK_SIZE);
+
+ aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key.rk, ctx->key.rounds, blocks,
+ walk.iv);
+ err = skcipher_walk_done(&walk,
+ walk.nbytes - blocks * AES_BLOCK_SIZE);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static int ctr_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ u8 buf[AES_BLOCK_SIZE];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_neon_begin();
+ while (walk.nbytes > 0) {
+ unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
+
+ if (walk.nbytes < walk.total) {
+ blocks = round_down(blocks,
+ walk.stride / AES_BLOCK_SIZE);
+ final = NULL;
+ }
+
+ aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->rk, ctx->rounds, blocks, walk.iv, final);
+
+ if (final) {
+ u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+
+ if (dst != src)
+ memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
+ crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
+
+ err = skcipher_walk_done(&walk, 0);
+ break;
+ }
+ err = skcipher_walk_done(&walk,
+ walk.nbytes - blocks * AES_BLOCK_SIZE);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_aes_ctx rk;
+ int err;
+
+ err = xts_verify_key(tfm, in_key, key_len);
+ if (err)
+ return err;
+
+ key_len /= 2;
+ err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+ if (err)
+ return err;
+
+ memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
+
+ return aesbs_setkey(tfm, in_key, key_len);
+}
+
+static int __xts_crypt(struct skcipher_request *req,
+ void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[]))
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_neon_begin();
+
+ neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
+ ctx->key.rounds, 1, 1);
+
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+ if (walk.nbytes < walk.total)
+ blocks = round_down(blocks,
+ walk.stride / AES_BLOCK_SIZE);
+
+ fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
+ ctx->key.rounds, blocks, walk.iv);
+ err = skcipher_walk_done(&walk,
+ walk.nbytes - blocks * AES_BLOCK_SIZE);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+static struct skcipher_alg aes_algs[] = { {
+ .base.cra_name = "__ecb(aes)",
+ .base.cra_driver_name = "__ecb-aes-neonbs",
+ .base.cra_priority = 250,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .setkey = aesbs_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+}, {
+ .base.cra_name = "__cbc(aes)",
+ .base.cra_driver_name = "__cbc-aes-neonbs",
+ .base.cra_priority = 250,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_cbc_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+}, {
+ .base.cra_name = "__ctr(aes)",
+ .base.cra_driver_name = "__ctr-aes-neonbs",
+ .base.cra_priority = 250,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_setkey,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+}, {
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "ctr-aes-neonbs",
+ .base.cra_priority = 250 - 1,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_setkey,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+}, {
+ .base.cra_name = "__xts(aes)",
+ .base.cra_driver_name = "__xts-aes-neonbs",
+ .base.cra_priority = 250,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_xts_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+} };
+
+static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
+
+static void aes_exit(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+ if (aes_simd_algs[i])
+ simd_skcipher_free(aes_simd_algs[i]);
+
+ crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static int __init aes_init(void)
+{
+ struct simd_skcipher_alg *simd;
+ const char *basename;
+ const char *algname;
+ const char *drvname;
+ int err;
+ int i;
+
+ if (!(elf_hwcap & HWCAP_ASIMD))
+ return -ENODEV;
+
+ err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+ if (err)
+ return err;
+
+ for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+ if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+ continue;
+
+ algname = aes_algs[i].base.cra_name + 2;
+ drvname = aes_algs[i].base.cra_driver_name + 2;
+ basename = aes_algs[i].base.cra_driver_name;
+ simd = simd_skcipher_create_compat(algname, drvname, basename);
+ err = PTR_ERR(simd);
+ if (IS_ERR(simd))
+ goto unregister_simds;
+
+ aes_simd_algs[i] = simd;
+ }
+ return 0;
+
+unregister_simds:
+ aes_exit();
+ return err;
+}
+
+module_init(aes_init);
+module_exit(aes_exit);
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S
new file mode 100644
index 000000000000..13c85e272c2a
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-core.S
@@ -0,0 +1,450 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .align 6
+
+ENTRY(chacha20_block_xor_neon)
+ // x0: Input state matrix, s
+ // x1: 1 data block output, o
+ // x2: 1 data block input, i
+
+ //
+ // This function encrypts one ChaCha20 block by loading the state matrix
+ // in four NEON registers. It performs matrix operation on four words in
+ // parallel, but requires shuffling to rearrange the words after each
+ // round.
+ //
+
+ // x0..3 = s0..3
+ adr x3, ROT8
+ ld1 {v0.4s-v3.4s}, [x0]
+ ld1 {v8.4s-v11.4s}, [x0]
+ ld1 {v12.4s}, [x3]
+
+ mov x3, #10
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12
+ sri v1.4s, v4.4s, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ ext v1.16b, v1.16b, v1.16b, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ ext v3.16b, v3.16b, v3.16b, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12
+ sri v1.4s, v4.4s, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ ext v1.16b, v1.16b, v1.16b, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ ext v3.16b, v3.16b, v3.16b, #4
+
+ subs x3, x3, #1
+ b.ne .Ldoubleround
+
+ ld1 {v4.16b-v7.16b}, [x2]
+
+ // o0 = i0 ^ (x0 + s0)
+ add v0.4s, v0.4s, v8.4s
+ eor v0.16b, v0.16b, v4.16b
+
+ // o1 = i1 ^ (x1 + s1)
+ add v1.4s, v1.4s, v9.4s
+ eor v1.16b, v1.16b, v5.16b
+
+ // o2 = i2 ^ (x2 + s2)
+ add v2.4s, v2.4s, v10.4s
+ eor v2.16b, v2.16b, v6.16b
+
+ // o3 = i3 ^ (x3 + s3)
+ add v3.4s, v3.4s, v11.4s
+ eor v3.16b, v3.16b, v7.16b
+
+ st1 {v0.16b-v3.16b}, [x1]
+
+ ret
+ENDPROC(chacha20_block_xor_neon)
+
+ .align 6
+ENTRY(chacha20_4block_xor_neon)
+ // x0: Input state matrix, s
+ // x1: 4 data blocks output, o
+ // x2: 4 data blocks input, i
+
+ //
+ // This function encrypts four consecutive ChaCha20 blocks by loading
+ // the state matrix in NEON registers four times. The algorithm performs
+ // each operation on the corresponding word of each state matrix, hence
+ // requires no word shuffling. For final XORing step we transpose the
+ // matrix by interleaving 32- and then 64-bit words, which allows us to
+ // do XOR in NEON registers.
+ //
+ adr x3, CTRINC // ... and ROT8
+ ld1 {v30.4s-v31.4s}, [x3]
+
+ // x0..15[0-3] = s0..3[0..3]
+ mov x4, x0
+ ld4r { v0.4s- v3.4s}, [x4], #16
+ ld4r { v4.4s- v7.4s}, [x4], #16
+ ld4r { v8.4s-v11.4s}, [x4], #16
+ ld4r {v12.4s-v15.4s}, [x4]
+
+ // x12 += counter values 0-3
+ add v12.4s, v12.4s, v30.4s
+
+ mov x3, #10
+
+.Ldoubleround4:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ add v0.4s, v0.4s, v4.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+
+ eor v12.16b, v12.16b, v0.16b
+ eor v13.16b, v13.16b, v1.16b
+ eor v14.16b, v14.16b, v2.16b
+ eor v15.16b, v15.16b, v3.16b
+
+ rev32 v12.8h, v12.8h
+ rev32 v13.8h, v13.8h
+ rev32 v14.8h, v14.8h
+ rev32 v15.8h, v15.8h
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ add v8.4s, v8.4s, v12.4s
+ add v9.4s, v9.4s, v13.4s
+ add v10.4s, v10.4s, v14.4s
+ add v11.4s, v11.4s, v15.4s
+
+ eor v16.16b, v4.16b, v8.16b
+ eor v17.16b, v5.16b, v9.16b
+ eor v18.16b, v6.16b, v10.16b
+ eor v19.16b, v7.16b, v11.16b
+
+ shl v4.4s, v16.4s, #12
+ shl v5.4s, v17.4s, #12
+ shl v6.4s, v18.4s, #12
+ shl v7.4s, v19.4s, #12
+
+ sri v4.4s, v16.4s, #20
+ sri v5.4s, v17.4s, #20
+ sri v6.4s, v18.4s, #20
+ sri v7.4s, v19.4s, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ add v0.4s, v0.4s, v4.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+
+ eor v12.16b, v12.16b, v0.16b
+ eor v13.16b, v13.16b, v1.16b
+ eor v14.16b, v14.16b, v2.16b
+ eor v15.16b, v15.16b, v3.16b
+
+ tbl v12.16b, {v12.16b}, v31.16b
+ tbl v13.16b, {v13.16b}, v31.16b
+ tbl v14.16b, {v14.16b}, v31.16b
+ tbl v15.16b, {v15.16b}, v31.16b
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ add v8.4s, v8.4s, v12.4s
+ add v9.4s, v9.4s, v13.4s
+ add v10.4s, v10.4s, v14.4s
+ add v11.4s, v11.4s, v15.4s
+
+ eor v16.16b, v4.16b, v8.16b
+ eor v17.16b, v5.16b, v9.16b
+ eor v18.16b, v6.16b, v10.16b
+ eor v19.16b, v7.16b, v11.16b
+
+ shl v4.4s, v16.4s, #7
+ shl v5.4s, v17.4s, #7
+ shl v6.4s, v18.4s, #7
+ shl v7.4s, v19.4s, #7
+
+ sri v4.4s, v16.4s, #25
+ sri v5.4s, v17.4s, #25
+ sri v6.4s, v18.4s, #25
+ sri v7.4s, v19.4s, #25
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v4.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v12.16b, v12.16b, v1.16b
+ eor v13.16b, v13.16b, v2.16b
+ eor v14.16b, v14.16b, v3.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v12.8h, v12.8h
+ rev32 v13.8h, v13.8h
+ rev32 v14.8h, v14.8h
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v12.4s
+ add v8.4s, v8.4s, v13.4s
+ add v9.4s, v9.4s, v14.4s
+
+ eor v16.16b, v5.16b, v10.16b
+ eor v17.16b, v6.16b, v11.16b
+ eor v18.16b, v7.16b, v8.16b
+ eor v19.16b, v4.16b, v9.16b
+
+ shl v5.4s, v16.4s, #12
+ shl v6.4s, v17.4s, #12
+ shl v7.4s, v18.4s, #12
+ shl v4.4s, v19.4s, #12
+
+ sri v5.4s, v16.4s, #20
+ sri v6.4s, v17.4s, #20
+ sri v7.4s, v18.4s, #20
+ sri v4.4s, v19.4s, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v4.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v12.16b, v12.16b, v1.16b
+ eor v13.16b, v13.16b, v2.16b
+ eor v14.16b, v14.16b, v3.16b
+
+ tbl v15.16b, {v15.16b}, v31.16b
+ tbl v12.16b, {v12.16b}, v31.16b
+ tbl v13.16b, {v13.16b}, v31.16b
+ tbl v14.16b, {v14.16b}, v31.16b
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v12.4s
+ add v8.4s, v8.4s, v13.4s
+ add v9.4s, v9.4s, v14.4s
+
+ eor v16.16b, v5.16b, v10.16b
+ eor v17.16b, v6.16b, v11.16b
+ eor v18.16b, v7.16b, v8.16b
+ eor v19.16b, v4.16b, v9.16b
+
+ shl v5.4s, v16.4s, #7
+ shl v6.4s, v17.4s, #7
+ shl v7.4s, v18.4s, #7
+ shl v4.4s, v19.4s, #7
+
+ sri v5.4s, v16.4s, #25
+ sri v6.4s, v17.4s, #25
+ sri v7.4s, v18.4s, #25
+ sri v4.4s, v19.4s, #25
+
+ subs x3, x3, #1
+ b.ne .Ldoubleround4
+
+ ld4r {v16.4s-v19.4s}, [x0], #16
+ ld4r {v20.4s-v23.4s}, [x0], #16
+
+ // x12 += counter values 0-3
+ add v12.4s, v12.4s, v30.4s
+
+ // x0[0-3] += s0[0]
+ // x1[0-3] += s0[1]
+ // x2[0-3] += s0[2]
+ // x3[0-3] += s0[3]
+ add v0.4s, v0.4s, v16.4s
+ add v1.4s, v1.4s, v17.4s
+ add v2.4s, v2.4s, v18.4s
+ add v3.4s, v3.4s, v19.4s
+
+ ld4r {v24.4s-v27.4s}, [x0], #16
+ ld4r {v28.4s-v31.4s}, [x0]
+
+ // x4[0-3] += s1[0]
+ // x5[0-3] += s1[1]
+ // x6[0-3] += s1[2]
+ // x7[0-3] += s1[3]
+ add v4.4s, v4.4s, v20.4s
+ add v5.4s, v5.4s, v21.4s
+ add v6.4s, v6.4s, v22.4s
+ add v7.4s, v7.4s, v23.4s
+
+ // x8[0-3] += s2[0]
+ // x9[0-3] += s2[1]
+ // x10[0-3] += s2[2]
+ // x11[0-3] += s2[3]
+ add v8.4s, v8.4s, v24.4s
+ add v9.4s, v9.4s, v25.4s
+ add v10.4s, v10.4s, v26.4s
+ add v11.4s, v11.4s, v27.4s
+
+ // x12[0-3] += s3[0]
+ // x13[0-3] += s3[1]
+ // x14[0-3] += s3[2]
+ // x15[0-3] += s3[3]
+ add v12.4s, v12.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v14.4s, v14.4s, v30.4s
+ add v15.4s, v15.4s, v31.4s
+
+ // interleave 32-bit words in state n, n+1
+ zip1 v16.4s, v0.4s, v1.4s
+ zip2 v17.4s, v0.4s, v1.4s
+ zip1 v18.4s, v2.4s, v3.4s
+ zip2 v19.4s, v2.4s, v3.4s
+ zip1 v20.4s, v4.4s, v5.4s
+ zip2 v21.4s, v4.4s, v5.4s
+ zip1 v22.4s, v6.4s, v7.4s
+ zip2 v23.4s, v6.4s, v7.4s
+ zip1 v24.4s, v8.4s, v9.4s
+ zip2 v25.4s, v8.4s, v9.4s
+ zip1 v26.4s, v10.4s, v11.4s
+ zip2 v27.4s, v10.4s, v11.4s
+ zip1 v28.4s, v12.4s, v13.4s
+ zip2 v29.4s, v12.4s, v13.4s
+ zip1 v30.4s, v14.4s, v15.4s
+ zip2 v31.4s, v14.4s, v15.4s
+
+ // interleave 64-bit words in state n, n+2
+ zip1 v0.2d, v16.2d, v18.2d
+ zip2 v4.2d, v16.2d, v18.2d
+ zip1 v8.2d, v17.2d, v19.2d
+ zip2 v12.2d, v17.2d, v19.2d
+ ld1 {v16.16b-v19.16b}, [x2], #64
+
+ zip1 v1.2d, v20.2d, v22.2d
+ zip2 v5.2d, v20.2d, v22.2d
+ zip1 v9.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+ ld1 {v20.16b-v23.16b}, [x2], #64
+
+ zip1 v2.2d, v24.2d, v26.2d
+ zip2 v6.2d, v24.2d, v26.2d
+ zip1 v10.2d, v25.2d, v27.2d
+ zip2 v14.2d, v25.2d, v27.2d
+ ld1 {v24.16b-v27.16b}, [x2], #64
+
+ zip1 v3.2d, v28.2d, v30.2d
+ zip2 v7.2d, v28.2d, v30.2d
+ zip1 v11.2d, v29.2d, v31.2d
+ zip2 v15.2d, v29.2d, v31.2d
+ ld1 {v28.16b-v31.16b}, [x2]
+
+ // xor with corresponding input, write to output
+ eor v16.16b, v16.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ eor v18.16b, v18.16b, v2.16b
+ eor v19.16b, v19.16b, v3.16b
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v5.16b
+ st1 {v16.16b-v19.16b}, [x1], #64
+ eor v22.16b, v22.16b, v6.16b
+ eor v23.16b, v23.16b, v7.16b
+ eor v24.16b, v24.16b, v8.16b
+ eor v25.16b, v25.16b, v9.16b
+ st1 {v20.16b-v23.16b}, [x1], #64
+ eor v26.16b, v26.16b, v10.16b
+ eor v27.16b, v27.16b, v11.16b
+ eor v28.16b, v28.16b, v12.16b
+ st1 {v24.16b-v27.16b}, [x1], #64
+ eor v29.16b, v29.16b, v13.16b
+ eor v30.16b, v30.16b, v14.16b
+ eor v31.16b, v31.16b, v15.16b
+ st1 {v28.16b-v31.16b}, [x1]
+
+ ret
+ENDPROC(chacha20_4block_xor_neon)
+
+CTRINC: .word 0, 1, 2, 3
+ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
new file mode 100644
index 000000000000..a7cd575ea223
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-glue.c
@@ -0,0 +1,126 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes)
+{
+ u8 buf[CHACHA20_BLOCK_SIZE];
+
+ while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+ chacha20_4block_xor_neon(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE * 4;
+ src += CHACHA20_BLOCK_SIZE * 4;
+ dst += CHACHA20_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ while (bytes >= CHACHA20_BLOCK_SIZE) {
+ chacha20_block_xor_neon(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE;
+ src += CHACHA20_BLOCK_SIZE;
+ dst += CHACHA20_BLOCK_SIZE;
+ state[12]++;
+ }
+ if (bytes) {
+ memcpy(buf, src, bytes);
+ chacha20_block_xor_neon(state, buf, buf);
+ memcpy(dst, buf, bytes);
+ }
+}
+
+static int chacha20_neon(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ if (req->cryptlen <= CHACHA20_BLOCK_SIZE)
+ return crypto_chacha20_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ crypto_chacha20_init(state, ctx, walk.iv);
+
+ kernel_neon_begin();
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static struct skcipher_alg alg = {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha20_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA20_KEY_SIZE,
+ .max_keysize = CHACHA20_KEY_SIZE,
+ .ivsize = CHACHA20_IV_SIZE,
+ .chunksize = CHACHA20_BLOCK_SIZE,
+ .walksize = 4 * CHACHA20_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha20_neon,
+ .decrypt = chacha20_neon,
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+ if (!(elf_hwcap & HWCAP_ASIMD))
+ return -ENODEV;
+
+ return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+ crypto_unregister_skcipher(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/arm64/crypto/crc32-arm64.c b/arch/arm64/crypto/crc32-arm64.c
deleted file mode 100644
index 6a37c3c6b11d..000000000000
--- a/arch/arm64/crypto/crc32-arm64.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions
- *
- * Module based on crypto/crc32c_generic.c
- *
- * CRC32 loop taken from Ed Nevill's Hadoop CRC patch
- * http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E
- *
- * Using inline assembly instead of intrinsics in order to be backwards
- * compatible with older compilers.
- *
- * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/unaligned/access_ok.h>
-#include <linux/cpufeature.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-
-MODULE_AUTHOR("Yazen Ghannam <yazen.ghannam@linaro.org>");
-MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions");
-MODULE_LICENSE("GPL v2");
-
-#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-
-static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
-{
- s64 length = len;
-
- while ((length -= sizeof(u64)) >= 0) {
- CRC32X(crc, get_unaligned_le64(p));
- p += sizeof(u64);
- }
-
- /* The following is more efficient than the straight loop */
- if (length & sizeof(u32)) {
- CRC32W(crc, get_unaligned_le32(p));
- p += sizeof(u32);
- }
- if (length & sizeof(u16)) {
- CRC32H(crc, get_unaligned_le16(p));
- p += sizeof(u16);
- }
- if (length & sizeof(u8))
- CRC32B(crc, *p);
-
- return crc;
-}
-
-static u32 crc32c_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
-{
- s64 length = len;
-
- while ((length -= sizeof(u64)) >= 0) {
- CRC32CX(crc, get_unaligned_le64(p));
- p += sizeof(u64);
- }
-
- /* The following is more efficient than the straight loop */
- if (length & sizeof(u32)) {
- CRC32CW(crc, get_unaligned_le32(p));
- p += sizeof(u32);
- }
- if (length & sizeof(u16)) {
- CRC32CH(crc, get_unaligned_le16(p));
- p += sizeof(u16);
- }
- if (length & sizeof(u8))
- CRC32CB(crc, *p);
-
- return crc;
-}
-
-#define CHKSUM_BLOCK_SIZE 1
-#define CHKSUM_DIGEST_SIZE 4
-
-struct chksum_ctx {
- u32 key;
-};
-
-struct chksum_desc_ctx {
- u32 crc;
-};
-
-static int chksum_init(struct shash_desc *desc)
-{
- struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- ctx->crc = mctx->key;
-
- return 0;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and flexible XOR policy
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct chksum_ctx *mctx = crypto_shash_ctx(tfm);
-
- if (keylen != sizeof(mctx->key)) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- mctx->key = get_unaligned_le32(key);
- return 0;
-}
-
-static int chksum_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- ctx->crc = crc32_arm64_le_hw(ctx->crc, data, length);
- return 0;
-}
-
-static int chksumc_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- ctx->crc = crc32c_arm64_le_hw(ctx->crc, data, length);
- return 0;
-}
-
-static int chksum_final(struct shash_desc *desc, u8 *out)
-{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- put_unaligned_le32(ctx->crc, out);
- return 0;
-}
-
-static int chksumc_final(struct shash_desc *desc, u8 *out)
-{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- put_unaligned_le32(~ctx->crc, out);
- return 0;
-}
-
-static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
-{
- put_unaligned_le32(crc32_arm64_le_hw(crc, data, len), out);
- return 0;
-}
-
-static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
-{
- put_unaligned_le32(~crc32c_arm64_le_hw(crc, data, len), out);
- return 0;
-}
-
-static int chksum_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- return __chksum_finup(ctx->crc, data, len, out);
-}
-
-static int chksumc_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- return __chksumc_finup(ctx->crc, data, len, out);
-}
-
-static int chksum_digest(struct shash_desc *desc, const u8 *data,
- unsigned int length, u8 *out)
-{
- struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-
- return __chksum_finup(mctx->key, data, length, out);
-}
-
-static int chksumc_digest(struct shash_desc *desc, const u8 *data,
- unsigned int length, u8 *out)
-{
- struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-
- return __chksumc_finup(mctx->key, data, length, out);
-}
-
-static int crc32_cra_init(struct crypto_tfm *tfm)
-{
- struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
-
- mctx->key = 0;
- return 0;
-}
-
-static int crc32c_cra_init(struct crypto_tfm *tfm)
-{
- struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
-
- mctx->key = ~0;
- return 0;
-}
-
-static struct shash_alg crc32_alg = {
- .digestsize = CHKSUM_DIGEST_SIZE,
- .setkey = chksum_setkey,
- .init = chksum_init,
- .update = chksum_update,
- .final = chksum_final,
- .finup = chksum_finup,
- .digest = chksum_digest,
- .descsize = sizeof(struct chksum_desc_ctx),
- .base = {
- .cra_name = "crc32",
- .cra_driver_name = "crc32-arm64-hw",
- .cra_priority = 300,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_alignmask = 0,
- .cra_ctxsize = sizeof(struct chksum_ctx),
- .cra_module = THIS_MODULE,
- .cra_init = crc32_cra_init,
- }
-};
-
-static struct shash_alg crc32c_alg = {
- .digestsize = CHKSUM_DIGEST_SIZE,
- .setkey = chksum_setkey,
- .init = chksum_init,
- .update = chksumc_update,
- .final = chksumc_final,
- .finup = chksumc_finup,
- .digest = chksumc_digest,
- .descsize = sizeof(struct chksum_desc_ctx),
- .base = {
- .cra_name = "crc32c",
- .cra_driver_name = "crc32c-arm64-hw",
- .cra_priority = 300,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_alignmask = 0,
- .cra_ctxsize = sizeof(struct chksum_ctx),
- .cra_module = THIS_MODULE,
- .cra_init = crc32c_cra_init,
- }
-};
-
-static int __init crc32_mod_init(void)
-{
- int err;
-
- err = crypto_register_shash(&crc32_alg);
-
- if (err)
- return err;
-
- err = crypto_register_shash(&crc32c_alg);
-
- if (err) {
- crypto_unregister_shash(&crc32_alg);
- return err;
- }
-
- return 0;
-}
-
-static void __exit crc32_mod_exit(void)
-{
- crypto_unregister_shash(&crc32_alg);
- crypto_unregister_shash(&crc32c_alg);
-}
-
-module_cpu_feature_match(CRC32, crc32_mod_init);
-module_exit(crc32_mod_exit);
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
index 8594127d5e01..eccb1ae90064 100644
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -72,6 +72,24 @@ static int crc32_pmull_init(struct shash_desc *desc)
return 0;
}
+static int crc32_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u32 *crc = shash_desc_ctx(desc);
+
+ *crc = crc32_armv8_le(*crc, data, length);
+ return 0;
+}
+
+static int crc32c_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u32 *crc = shash_desc_ctx(desc);
+
+ *crc = crc32c_armv8_le(*crc, data, length);
+ return 0;
+}
+
static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
@@ -156,7 +174,7 @@ static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
static struct shash_alg crc32_pmull_algs[] = { {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
- .update = crc32_pmull_update,
+ .update = crc32_update,
.final = crc32_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
@@ -171,7 +189,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
}, {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
- .update = crc32c_pmull_update,
+ .update = crc32c_update,
.final = crc32c_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
@@ -187,14 +205,20 @@ static struct shash_alg crc32_pmull_algs[] = { {
static int __init crc32_pmull_mod_init(void)
{
- if (elf_hwcap & HWCAP_CRC32) {
- fallback_crc32 = crc32_armv8_le;
- fallback_crc32c = crc32c_armv8_le;
- } else {
- fallback_crc32 = crc32_le;
- fallback_crc32c = __crc32c_le;
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
+ crc32_pmull_algs[0].update = crc32_pmull_update;
+ crc32_pmull_algs[1].update = crc32c_pmull_update;
+
+ if (elf_hwcap & HWCAP_CRC32) {
+ fallback_crc32 = crc32_armv8_le;
+ fallback_crc32c = crc32c_armv8_le;
+ } else {
+ fallback_crc32 = crc32_le;
+ fallback_crc32c = __crc32c_le;
+ }
+ } else if (!(elf_hwcap & HWCAP_CRC32)) {
+ return -ENODEV;
}
-
return crypto_register_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
@@ -205,7 +229,12 @@ static void __exit crc32_pmull_mod_exit(void)
ARRAY_SIZE(crc32_pmull_algs));
}
-module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+static const struct cpu_feature crc32_cpu_feature[] = {
+ { cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
+};
+MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
+
+module_init(crc32_pmull_mod_init);
module_exit(crc32_pmull_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");