diff options
Diffstat (limited to 'arch/arm/crypto/ghash-ce-core.S')
| -rw-r--r-- | arch/arm/crypto/ghash-ce-core.S | 171 |
1 files changed, 14 insertions, 157 deletions
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S index 858c0d66798b..a449525d61f8 100644 --- a/arch/arm/crypto/ghash-ce-core.S +++ b/arch/arm/crypto/ghash-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions. + * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions. * * Copyright (C) 2015 - 2017 Linaro Ltd. * Copyright (C) 2023 Google LLC. <ardb@google.com> @@ -29,39 +29,10 @@ XM_H .req d7 XH_L .req d8 - t0l .req d10 - t0h .req d11 - t1l .req d12 - t1h .req d13 - t2l .req d14 - t2h .req d15 - t3l .req d16 - t3h .req d17 - t4l .req d18 - t4h .req d19 - - t0q .req q5 - t1q .req q6 - t2q .req q7 - t3q .req q8 - t4q .req q9 XH2 .req q9 - s1l .req d20 - s1h .req d21 - s2l .req d22 - s2h .req d23 - s3l .req d24 - s3h .req d25 - s4l .req d26 - s4h .req d27 - MASK .req d28 - SHASH2_p8 .req d28 - k16 .req d29 - k32 .req d30 - k48 .req d31 SHASH2_p64 .req d31 HH .req q10 @@ -93,72 +64,6 @@ .text - .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4 - vmull.p64 \rd, \rn, \rm - .endm - - /* - * This implementation of 64x64 -> 128 bit polynomial multiplication - * using vmull.p8 instructions (8x8 -> 16) is taken from the paper - * "Fast Software Polynomial Multiplication on ARM Processors Using - * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and - * Ricardo Dahab (https://hal.inria.fr/hal-01506572) - * - * It has been slightly tweaked for in-order performance, and to allow - * 'rq' to overlap with 'ad' or 'bd'. - */ - .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l - vext.8 t0l, \ad, \ad, #1 @ A1 - .ifc \b1, t4l - vext.8 t4l, \bd, \bd, #1 @ B1 - .endif - vmull.p8 t0q, t0l, \bd @ F = A1*B - vext.8 t1l, \ad, \ad, #2 @ A2 - vmull.p8 t4q, \ad, \b1 @ E = A*B1 - .ifc \b2, t3l - vext.8 t3l, \bd, \bd, #2 @ B2 - .endif - vmull.p8 t1q, t1l, \bd @ H = A2*B - vext.8 t2l, \ad, \ad, #3 @ A3 - vmull.p8 t3q, \ad, \b2 @ G = A*B2 - veor t0q, t0q, t4q @ L = E + F - .ifc \b3, t4l - vext.8 t4l, \bd, \bd, #3 @ B3 - .endif - vmull.p8 t2q, t2l, \bd @ J = A3*B - veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8 - veor t1q, t1q, t3q @ M = G + H - .ifc \b4, t3l - vext.8 t3l, \bd, \bd, #4 @ B4 - .endif - vmull.p8 t4q, \ad, \b3 @ I = A*B3 - veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16 - vmull.p8 t3q, \ad, \b4 @ K = A*B4 - vand t0h, t0h, k48 - vand t1h, t1h, k32 - veor t2q, t2q, t4q @ N = I + J - veor t0l, t0l, t0h - veor t1l, t1l, t1h - veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24 - vand t2h, t2h, k16 - veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32 - vmov.i64 t3h, #0 - vext.8 t0q, t0q, t0q, #15 - veor t2l, t2l, t2h - vext.8 t1q, t1q, t1q, #14 - vmull.p8 \rq, \ad, \bd @ D = A*B - vext.8 t2q, t2q, t2q, #13 - vext.8 t3q, t3q, t3q, #12 - veor t0q, t0q, t1q - veor t2q, t2q, t3q - veor \rq, \rq, t0q - veor \rq, \rq, t2q - .endm - - // - // PMULL (64x64->128) based reduction for CPUs that can do - // it in a single instruction. - // .macro __pmull_reduce_p64 vmull.p64 T1, XL_L, MASK @@ -170,30 +75,7 @@ vmull.p64 XL, T1_H, MASK .endm - // - // Alternative reduction for CPUs that lack support for the - // 64x64->128 PMULL instruction - // - .macro __pmull_reduce_p8 - veor XL_H, XL_H, XM_L - veor XH_L, XH_L, XM_H - - vshl.i64 T1, XL, #57 - vshl.i64 T2, XL, #62 - veor T1, T1, T2 - vshl.i64 T2, XL, #63 - veor T1, T1, T2 - veor XL_H, XL_H, T1_L - veor XH_L, XH_L, T1_H - - vshr.u64 T1, XL, #1 - veor XH, XH, XL - veor XL, XL, T1 - vshr.u64 T1, T1, #6 - vshr.u64 XL, XL, #1 - .endm - - .macro ghash_update, pn, enc, aggregate=1, head=1 + .macro ghash_update, enc, aggregate=1, head=1 vld1.64 {XL}, [r1] .if \head @@ -206,8 +88,7 @@ b 3f .endif -0: .ifc \pn, p64 - .if \aggregate +0: .if \aggregate tst r0, #3 // skip until #blocks is a bne 2f // round multiple of 4 @@ -288,7 +169,6 @@ b 1b .endif - .endif 2: vld1.8 {T1}, [r2]! @@ -308,15 +188,15 @@ veor T1_L, T1_L, XL_H veor XL, XL, IN1 - __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1 + vmull.p64 XH, XL_H, SHASH_H @ a1 * b1 veor T1, T1, XL - __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0 - __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0) + vmull.p64 XL, XL_L, SHASH_L @ a0 * b0 + vmull.p64 XM, T1_L, SHASH2_p64 @ (a1+a0)(b1+b0) 4: veor T1, XL, XH veor XM, XM, T1 - __pmull_reduce_\pn + __pmull_reduce_p64 veor T1, T1, XH veor XL, XL, T1 @@ -325,8 +205,8 @@ .endm /* - * void pmull_ghash_update(int blocks, u64 dg[], const char *src, - * struct ghash_key const *k, const char *head) + * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + * u64 const h[4][2], const char *head) */ ENTRY(pmull_ghash_update_p64) vld1.64 {SHASH}, [r3]! @@ -341,35 +221,12 @@ ENTRY(pmull_ghash_update_p64) vmov.i8 MASK, #0xe1 vshl.u64 MASK, MASK, #57 - ghash_update p64 + ghash_update vst1.64 {XL}, [r1] bx lr ENDPROC(pmull_ghash_update_p64) -ENTRY(pmull_ghash_update_p8) - vld1.64 {SHASH}, [r3] - veor SHASH2_p8, SHASH_L, SHASH_H - - vext.8 s1l, SHASH_L, SHASH_L, #1 - vext.8 s2l, SHASH_L, SHASH_L, #2 - vext.8 s3l, SHASH_L, SHASH_L, #3 - vext.8 s4l, SHASH_L, SHASH_L, #4 - vext.8 s1h, SHASH_H, SHASH_H, #1 - vext.8 s2h, SHASH_H, SHASH_H, #2 - vext.8 s3h, SHASH_H, SHASH_H, #3 - vext.8 s4h, SHASH_H, SHASH_H, #4 - - vmov.i64 k16, #0xffff - vmov.i64 k32, #0xffffffff - vmov.i64 k48, #0xffffffffffff - - ghash_update p8 - vst1.64 {XL}, [r1] - - bx lr -ENDPROC(pmull_ghash_update_p8) - e0 .req q9 e1 .req q10 e2 .req q11 @@ -536,7 +393,7 @@ ENTRY(pmull_gcm_encrypt) vld1.64 {SHASH}, [r3] - ghash_update p64, enc, head=0 + ghash_update enc, head=0 vst1.64 {XL}, [r1] pop {r4-r8, pc} @@ -554,7 +411,7 @@ ENTRY(pmull_gcm_decrypt) vld1.64 {SHASH}, [r3] - ghash_update p64, dec, head=0 + ghash_update dec, head=0 vst1.64 {XL}, [r1] pop {r4-r8, pc} @@ -603,7 +460,7 @@ ENTRY(pmull_gcm_enc_final) vshl.u64 MASK, MASK, #57 mov r0, #1 bne 3f // process head block first - ghash_update p64, aggregate=0, head=0 + ghash_update aggregate=0, head=0 vrev64.8 XL, XL vext.8 XL, XL, XL, #8 @@ -660,7 +517,7 @@ ENTRY(pmull_gcm_dec_final) vshl.u64 MASK, MASK, #57 mov r0, #1 bne 3f // process head block first - ghash_update p64, aggregate=0, head=0 + ghash_update aggregate=0, head=0 vrev64.8 XL, XL vext.8 XL, XL, XL, #8 |
