summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2018-09-10 16:41:15 +0200
committerHerbert Xu <herbert@gondor.apana.org.au>2018-09-21 13:24:50 +0800
commit2e5d2f33d1dbd551f14634483c7ea81e5119d689 (patch)
treeea7b3f03c3af0d1e818d588ac2a8e99c391b0826
parentdd597fb33ff0d2a158d7dd098be6dc849b7c2bcc (diff)
downloadlinux-0-day-2e5d2f33d1dbd551f14634483c7ea81e5119d689.tar.gz
linux-0-day-2e5d2f33d1dbd551f14634483c7ea81e5119d689.tar.xz
crypto: arm64/aes-blk - improve XTS mask handling
The Crypto Extension instantiation of the aes-modes.S collection of skciphers uses only 15 NEON registers for the round key array, whereas the pure NEON flavor uses 16 NEON registers for the AES S-box. This means we have a spare register available that we can use to hold the XTS mask vector, removing the need to reload it at every iteration of the inner loop. Since the pure NEON version does not permit this optimization, tweak the macros so we can factor out this functionality. Also, replace the literal load with a short sequence to compose the mask vector. On Cortex-A53, this results in a ~4% speedup. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--arch/arm64/crypto/aes-ce.S5
-rw-r--r--arch/arm64/crypto/aes-modes.S40
-rw-r--r--arch/arm64/crypto/aes-neon.S6
3 files changed, 32 insertions, 19 deletions
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 623e74ed1c67f..143070510809a 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -17,6 +17,11 @@
.arch armv8-a+crypto
+ xtsmask .req v16
+
+ .macro xts_reload_mask, tmp
+ .endm
+
/* preload all round keys */
.macro load_round_keys, rounds, rk
cmp \rounds, #12
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 9697eda3b4d1b..039738ae23f66 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -340,17 +340,19 @@ AES_ENDPROC(aes_ctr_encrypt)
* int blocks, u8 const rk2[], u8 iv[], int first)
*/
- .macro next_tweak, out, in, const, tmp
+ .macro next_tweak, out, in, tmp
sshr \tmp\().2d, \in\().2d, #63
- and \tmp\().16b, \tmp\().16b, \const\().16b
+ and \tmp\().16b, \tmp\().16b, xtsmask.16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
-.Lxts_mul_x:
-CPU_LE( .quad 1, 0x87 )
-CPU_BE( .quad 0x87, 1 )
+ .macro xts_load_mask, tmp
+ movi xtsmask.2s, #0x1
+ movi \tmp\().2s, #0x87
+ uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
+ .endm
AES_ENTRY(aes_xts_encrypt)
stp x29, x30, [sp, #-16]!
@@ -362,24 +364,24 @@ AES_ENTRY(aes_xts_encrypt)
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
enc_switch_key w3, x2, x8
- ldr q7, .Lxts_mul_x
+ xts_load_mask v8
b .LxtsencNx
.Lxtsencnotfirst:
enc_prepare w3, x2, x8
.LxtsencloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ xts_reload_mask v8
+ next_tweak v4, v4, v8
.LxtsencNx:
subs w4, w4, #4
bmi .Lxtsenc1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- next_tweak v5, v4, v7, v8
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
@@ -401,7 +403,7 @@ AES_ENTRY(aes_xts_encrypt)
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsencout
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
b .Lxtsencloop
.Lxtsencout:
st1 {v4.16b}, [x6]
@@ -420,24 +422,24 @@ AES_ENTRY(aes_xts_decrypt)
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
dec_prepare w3, x2, x8
- ldr q7, .Lxts_mul_x
+ xts_load_mask v8
b .LxtsdecNx
.Lxtsdecnotfirst:
dec_prepare w3, x2, x8
.LxtsdecloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ xts_reload_mask v8
+ next_tweak v4, v4, v8
.LxtsdecNx:
subs w4, w4, #4
bmi .Lxtsdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- next_tweak v5, v4, v7, v8
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
@@ -459,7 +461,7 @@ AES_ENTRY(aes_xts_decrypt)
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsdecout
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
b .Lxtsdecloop
.Lxtsdecout:
st1 {v4.16b}, [x6]
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 1c7b45b7268e4..29100f692e8a0 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -14,6 +14,12 @@
#define AES_ENTRY(func) ENTRY(neon_ ## func)
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
+ xtsmask .req v7
+
+ .macro xts_reload_mask, tmp
+ xts_load_mask \tmp
+ .endm
+
/* multiply by polynomial 'x' in GF(2^8) */
.macro mul_by_x, out, in, temp, const
sshr \temp, \in, #7