diff options
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r-- | arch/arm64/crypto/Kconfig | 265 | ||||
-rw-r--r-- | arch/arm64/crypto/Makefile | 11 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-glue.c | 102 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-modes.S | 345 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-neon.S | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-neonbs-core.S | 264 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-neonbs-glue.c | 97 | ||||
-rw-r--r-- | arch/arm64/crypto/ghash-ce-core.S | 5 | ||||
-rw-r--r-- | arch/arm64/crypto/poly1305-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/polyval-ce-core.S | 361 | ||||
-rw-r--r-- | arch/arm64/crypto/polyval-ce-glue.c | 191 | ||||
-rw-r--r-- | arch/arm64/crypto/sha3-ce-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-armv8.pl | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-ce-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sm3-ce-core.S | 3 | ||||
-rw-r--r-- | arch/arm64/crypto/sm3-ce-glue.c | 28 | ||||
-rw-r--r-- | arch/arm64/crypto/sm4-ce-cipher-core.S | 36 | ||||
-rw-r--r-- | arch/arm64/crypto/sm4-ce-cipher-glue.c | 82 | ||||
-rw-r--r-- | arch/arm64/crypto/sm4-ce-core.S | 688 | ||||
-rw-r--r-- | arch/arm64/crypto/sm4-ce-glue.c | 386 | ||||
-rw-r--r-- | arch/arm64/crypto/sm4-neon-core.S | 487 | ||||
-rw-r--r-- | arch/arm64/crypto/sm4-neon-glue.c | 442 |
22 files changed, 3290 insertions, 513 deletions
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index addfa413650b..8bd80508a710 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -1,123 +1,282 @@ # SPDX-License-Identifier: GPL-2.0 -menuconfig ARM64_CRYPTO - bool "ARM64 Accelerated Cryptographic Algorithms" - depends on ARM64 +menu "Accelerated Cryptographic Algorithms for CPU (arm64)" + +config CRYPTO_GHASH_ARM64_CE + tristate "Hash functions: GHASH (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_GF128MUL + select CRYPTO_LIB_AES + select CRYPTO_AEAD help - Say Y here to choose from a selection of cryptographic algorithms - implemented using ARM64 specific CPU features or instructions. + GCM GHASH function (NIST SP800-38D) -if ARM64_CRYPTO + Architecture: arm64 using: + - ARMv8 Crypto Extensions -config CRYPTO_SHA256_ARM64 - tristate "SHA-224/SHA-256 digest algorithm for arm64" - select CRYPTO_HASH +config CRYPTO_NHPOLY1305_NEON + tristate "Hash functions: NHPoly1305 (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_NHPOLY1305 + help + NHPoly1305 hash function (Adiantum) -config CRYPTO_SHA512_ARM64 - tristate "SHA-384/SHA-512 digest algorithm for arm64" + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_POLY1305_NEON + tristate "Hash functions: Poly1305 (NEON)" + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + help + Poly1305 authenticator algorithm (RFC7539) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions config CRYPTO_SHA1_ARM64_CE - tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" + tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_SHA1 + help + SHA-1 secure hash algorithm (FIPS 180) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_SHA256_ARM64 + tristate "Hash functions: SHA-224 and SHA-256" + select CRYPTO_HASH + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: arm64 config CRYPTO_SHA2_ARM64_CE - tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" + tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_SHA256_ARM64 + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_SHA512_ARM64 + tristate "Hash functions: SHA-384 and SHA-512" + select CRYPTO_HASH + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: arm64 config CRYPTO_SHA512_ARM64_CE - tristate "SHA-384/SHA-512 digest algorithm (ARMv8 Crypto Extensions)" + tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_SHA512_ARM64 + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions config CRYPTO_SHA3_ARM64 - tristate "SHA3 digest algorithm (ARMv8.2 Crypto Extensions)" + tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_SHA3 + help + SHA-3 secure hash algorithms (FIPS 202) + + Architecture: arm64 using: + - ARMv8.2 Crypto Extensions config CRYPTO_SM3_ARM64_CE - tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)" + tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_SM3 + help + SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012) -config CRYPTO_SM4_ARM64_CE - tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_ALGAPI - select CRYPTO_LIB_SM4 + Architecture: arm64 using: + - ARMv8.2 Crypto Extensions -config CRYPTO_GHASH_ARM64_CE - tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" +config CRYPTO_POLYVAL_ARM64_CE + tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_GF128MUL - select CRYPTO_LIB_AES + select CRYPTO_POLYVAL + help + POLYVAL hash function for HCTR2 -config CRYPTO_CRCT10DIF_ARM64_CE - tristate "CRCT10DIF digest algorithm using PMULL instructions" - depends on KERNEL_MODE_NEON && CRC_T10DIF - select CRYPTO_HASH + Architecture: arm64 using: + - ARMv8 Crypto Extensions config CRYPTO_AES_ARM64 - tristate "AES core cipher using scalar instructions" + tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS" select CRYPTO_AES + help + Block ciphers: AES cipher algorithms (FIPS-197) + Length-preserving ciphers: AES with ECB, CBC, CTR, CTS, + XCTR, and XTS modes + AEAD cipher: AES with CBC, ESSIV, and SHA-256 + for fscrypt and dm-crypt + + Architecture: arm64 config CRYPTO_AES_ARM64_CE - tristate "AES core cipher using ARMv8 Crypto Extensions" + tristate "Ciphers: AES (ARMv8 Crypto Extensions)" depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI select CRYPTO_LIB_AES + help + Block ciphers: AES cipher algorithms (FIPS-197) -config CRYPTO_AES_ARM64_CE_CCM - tristate "AES in CCM mode using ARMv8 Crypto Extensions" - depends on ARM64 && KERNEL_MODE_NEON - select CRYPTO_ALGAPI - select CRYPTO_AES_ARM64_CE - select CRYPTO_AEAD - select CRYPTO_LIB_AES + Architecture: arm64 using: + - ARMv8 Crypto Extensions config CRYPTO_AES_ARM64_CE_BLK - tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" + tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_AES_ARM64_CE + help + Length-preserving ciphers: AES cipher algorithms (FIPS-197) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E + and IEEE 1619) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions config CRYPTO_AES_ARM64_NEON_BLK - tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" + tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (NEON)" depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_LIB_AES + help + Length-preserving ciphers: AES cipher algorithms (FIPS-197) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E + and IEEE 1619) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions config CRYPTO_CHACHA20_NEON - tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions" + tristate "Ciphers: ChaCha (NEON)" depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 + stream cipher algorithms -config CRYPTO_POLY1305_NEON - tristate "Poly1305 hash function using scalar or NEON instructions" + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_AES_ARM64_BS + tristate "Ciphers: AES, modes: ECB/CBC/CTR/XCTR/XTS modes (bit-sliced NEON)" depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_ARCH_HAVE_LIB_POLY1305 + select CRYPTO_SKCIPHER + select CRYPTO_AES_ARM64_NEON_BLK + select CRYPTO_LIB_AES + help + Length-preserving ciphers: AES cipher algorithms (FIPS-197) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + - XCTR mode for HCTR2 + - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E + and IEEE 1619) -config CRYPTO_NHPOLY1305_NEON - tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)" + Architecture: arm64 using: + - bit-sliced algorithm + - NEON (Advanced SIMD) extensions + +config CRYPTO_SM4_ARM64_CE + tristate "Ciphers: SM4 (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON - select CRYPTO_NHPOLY1305 + select CRYPTO_ALGAPI + select CRYPTO_SM4 + help + Block ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016) -config CRYPTO_AES_ARM64_BS - tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm" + Architecture: arm64 using: + - ARMv8.2 Crypto Extensions + - NEON (Advanced SIMD) extensions + +config CRYPTO_SM4_ARM64_CE_BLK + tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER - select CRYPTO_AES_ARM64_NEON_BLK + select CRYPTO_SM4 + help + Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CFB (Cipher Feedback) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + - NEON (Advanced SIMD) extensions + +config CRYPTO_SM4_ARM64_NEON_BLK + tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_SKCIPHER + select CRYPTO_SM4 + help + Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CFB (Cipher Feedback) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_AES_ARM64_CE_CCM + tristate "AEAD cipher: AES in CCM mode (ARMv8 Crypto Extensions)" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AES_ARM64_CE + select CRYPTO_AEAD select CRYPTO_LIB_AES + help + AEAD cipher: AES cipher algorithms (FIPS-197) with + CCM (Counter with Cipher Block Chaining-Message Authentication Code) + authenticated encryption mode (NIST SP800-38C) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + - NEON (Advanced SIMD) extensions + +config CRYPTO_CRCT10DIF_ARM64_CE + tristate "CRCT10DIF (PMULL)" + depends on KERNEL_MODE_NEON && CRC_T10DIF + select CRYPTO_HASH + help + CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) + + Architecture: arm64 using + - PMULL (Polynomial Multiply Long) instructions + +endmenu -endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 09a805cc32d7..24bb0c4610de 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -20,12 +20,21 @@ sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o -obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o +obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o +sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o + +obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o +obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o +sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o + obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o +obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o +polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o + obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 30b7cc6a7079..162787c7aa86 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -24,7 +24,6 @@ #ifdef USE_V8_CRYPTO_EXTENSIONS #define MODE "ce" #define PRIO 300 -#define STRIDE 5 #define aes_expandkey ce_aes_expandkey #define aes_ecb_encrypt ce_aes_ecb_encrypt #define aes_ecb_decrypt ce_aes_ecb_decrypt @@ -35,14 +34,14 @@ #define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt #define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt #define aes_ctr_encrypt ce_aes_ctr_encrypt +#define aes_xctr_encrypt ce_aes_xctr_encrypt #define aes_xts_encrypt ce_aes_xts_encrypt #define aes_xts_decrypt ce_aes_xts_decrypt #define aes_mac_update ce_aes_mac_update -MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions"); #else #define MODE "neon" #define PRIO 200 -#define STRIDE 4 #define aes_ecb_encrypt neon_aes_ecb_encrypt #define aes_ecb_decrypt neon_aes_ecb_decrypt #define aes_cbc_encrypt neon_aes_cbc_encrypt @@ -52,16 +51,18 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt #define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt #define aes_ctr_encrypt neon_aes_ctr_encrypt +#define aes_xctr_encrypt neon_aes_xctr_encrypt #define aes_xts_encrypt neon_aes_xts_encrypt #define aes_xts_decrypt neon_aes_xts_decrypt #define aes_mac_update neon_aes_mac_update -MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON"); #endif #if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) MODULE_ALIAS_CRYPTO("ecb(aes)"); MODULE_ALIAS_CRYPTO("cbc(aes)"); MODULE_ALIAS_CRYPTO("ctr(aes)"); MODULE_ALIAS_CRYPTO("xts(aes)"); +MODULE_ALIAS_CRYPTO("xctr(aes)"); #endif MODULE_ALIAS_CRYPTO("cts(cbc(aes))"); MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)"); @@ -89,7 +90,10 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, int bytes, u8 const iv[]); asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int bytes, u8 ctr[], u8 finalbuf[]); + int rounds, int bytes, u8 ctr[]); + +asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[], int byte_ctr); asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int bytes, u32 const rk2[], u8 iv[], @@ -444,6 +448,52 @@ static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req) return err ?: cbc_decrypt_walk(req, &walk); } +static int __maybe_unused xctr_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + struct skcipher_walk walk; + unsigned int byte_ctr = 0; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes > 0) { + const u8 *src = walk.src.virt.addr; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + + /* + * If given less than 16 bytes, we must copy the partial block + * into a temporary buffer of 16 bytes to avoid out of bounds + * reads and writes. Furthermore, this code is somewhat unusual + * in that it expects the end of the data to be at the end of + * the temporary buffer, rather than the start of the data at + * the start of the temporary buffer. + */ + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + else if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + kernel_neon_begin(); + aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv, byte_ctr); + kernel_neon_end(); + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); + byte_ctr += nbytes; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + static int __maybe_unused ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -458,26 +508,29 @@ static int __maybe_unused ctr_encrypt(struct skcipher_request *req) unsigned int nbytes = walk.nbytes; u8 *dst = walk.dst.virt.addr; u8 buf[AES_BLOCK_SIZE]; - unsigned int tail; + /* + * If given less than 16 bytes, we must copy the partial block + * into a temporary buffer of 16 bytes to avoid out of bounds + * reads and writes. Furthermore, this code is somewhat unusual + * in that it expects the end of the data to be at the end of + * the temporary buffer, rather than the start of the data at + * the start of the temporary buffer. + */ if (unlikely(nbytes < AES_BLOCK_SIZE)) - src = memcpy(buf, src, nbytes); + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); else if (nbytes < walk.total) nbytes &= ~(AES_BLOCK_SIZE - 1); kernel_neon_begin(); aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, - walk.iv, buf); + walk.iv); kernel_neon_end(); - tail = nbytes % (STRIDE * AES_BLOCK_SIZE); - if (tail > 0 && tail < AES_BLOCK_SIZE) - /* - * The final partial block could not be returned using - * an overlapping store, so it was passed via buf[] - * instead. - */ - memcpy(dst + nbytes - tail, buf, tail); + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } @@ -678,6 +731,22 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = ctr_encrypt, }, { .base = { + .cra_name = "xctr(aes)", + .cra_driver_name = "xctr-aes-" MODE, + .cra_priority = PRIO, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = xctr_encrypt, + .decrypt = xctr_encrypt, +}, { + .base = { .cra_name = "xts(aes)", .cra_driver_name = "xts-aes-" MODE, .cra_priority = PRIO, @@ -983,6 +1052,7 @@ module_cpu_feature_match(AES, aes_init); module_init(aes_init); EXPORT_SYMBOL(neon_aes_ecb_encrypt); EXPORT_SYMBOL(neon_aes_cbc_encrypt); +EXPORT_SYMBOL(neon_aes_ctr_encrypt); EXPORT_SYMBOL(neon_aes_xts_encrypt); EXPORT_SYMBOL(neon_aes_xts_decrypt); #endif diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index ff01f0167ba2..5abc834271f4 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -318,127 +318,211 @@ AES_FUNC_END(aes_cbc_cts_decrypt) .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .previous - /* - * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int bytes, u8 ctr[], u8 finalbuf[]) + * This macro generates the code for CTR and XCTR mode. */ +.macro ctr_encrypt xctr + // Arguments + OUT .req x0 + IN .req x1 + KEY .req x2 + ROUNDS_W .req w3 + BYTES_W .req w4 + IV .req x5 + BYTE_CTR_W .req w6 // XCTR only + // Intermediate values + CTR_W .req w11 // XCTR only + CTR .req x11 // XCTR only + IV_PART .req x12 + BLOCKS .req x13 + BLOCKS_W .req w13 -AES_FUNC_START(aes_ctr_encrypt) stp x29, x30, [sp, #-16]! mov x29, sp - enc_prepare w3, x2, x12 - ld1 {vctr.16b}, [x5] - - umov x12, vctr.d[1] /* keep swabbed ctr in reg */ - rev x12, x12 + enc_prepare ROUNDS_W, KEY, IV_PART + ld1 {vctr.16b}, [IV] -.LctrloopNx: - add w7, w4, #15 - sub w4, w4, #MAX_STRIDE << 4 - lsr w7, w7, #4 + /* + * Keep 64 bits of the IV in a register. For CTR mode this lets us + * easily increment the IV. For XCTR mode this lets us efficiently XOR + * the 64-bit counter with the IV. + */ + .if \xctr + umov IV_PART, vctr.d[0] + lsr CTR_W, BYTE_CTR_W, #4 + .else + umov IV_PART, vctr.d[1] + rev IV_PART, IV_PART + .endif + +.LctrloopNx\xctr: + add BLOCKS_W, BYTES_W, #15 + sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 + lsr BLOCKS_W, BLOCKS_W, #4 mov w8, #MAX_STRIDE - cmp w7, w8 - csel w7, w7, w8, lt - adds x12, x12, x7 + cmp BLOCKS_W, w8 + csel BLOCKS_W, BLOCKS_W, w8, lt + /* + * Set up the counter values in v0-v{MAX_STRIDE-1}. + * + * If we are encrypting less than MAX_STRIDE blocks, the tail block + * handling code expects the last keystream block to be in + * v{MAX_STRIDE-1}. For example: if encrypting two blocks with + * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks. + */ + .if \xctr + add CTR, CTR, BLOCKS + .else + adds IV_PART, IV_PART, BLOCKS + .endif mov v0.16b, vctr.16b mov v1.16b, vctr.16b mov v2.16b, vctr.16b mov v3.16b, vctr.16b ST5( mov v4.16b, vctr.16b ) - bcs 0f - - .subsection 1 - /* apply carry to outgoing counter */ -0: umov x8, vctr.d[0] - rev x8, x8 - add x8, x8, #1 - rev x8, x8 - ins vctr.d[0], x8 - - /* apply carry to N counter blocks for N := x12 */ - cbz x12, 2f - adr x16, 1f - sub x16, x16, x12, lsl #3 - br x16 - bti c - mov v0.d[0], vctr.d[0] - bti c - mov v1.d[0], vctr.d[0] - bti c - mov v2.d[0], vctr.d[0] - bti c - mov v3.d[0], vctr.d[0] -ST5( bti c ) -ST5( mov v4.d[0], vctr.d[0] ) -1: b 2f - .previous + .if \xctr + sub x6, CTR, #MAX_STRIDE - 1 + sub x7, CTR, #MAX_STRIDE - 2 + sub x8, CTR, #MAX_STRIDE - 3 + sub x9, CTR, #MAX_STRIDE - 4 +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) + eor x6, x6, IV_PART + eor x7, x7, IV_PART + eor x8, x8, IV_PART + eor x9, x9, IV_PART +ST5( eor x10, x10, IV_PART ) + mov v0.d[0], x6 + mov v1.d[0], x7 + mov v2.d[0], x8 + mov v3.d[0], x9 +ST5( mov v4.d[0], x10 ) + .else + bcs 0f + .subsection 1 + /* + * This subsection handles carries. + * + * Conditional branching here is allowed with respect to time + * invariance since the branches are dependent on the IV instead + * of the plaintext or key. This code is rarely executed in + * practice anyway. + */ + + /* Apply carry to outgoing counter. */ +0: umov x8, vctr.d[0] + rev x8, x8 + add x8, x8, #1 + rev x8, x8 + ins vctr.d[0], x8 + + /* + * Apply carry to counter blocks if needed. + * + * Since the carry flag was set, we know 0 <= IV_PART < + * MAX_STRIDE. Using the value of IV_PART we can determine how + * many counter blocks need to be updated. + */ + cbz IV_PART, 2f + adr x16, 1f + sub x16, x16, IV_PART, lsl #3 + br x16 + bti c + mov v0.d[0], vctr.d[0] + bti c + mov v1.d[0], vctr.d[0] + bti c + mov v2.d[0], vctr.d[0] + bti c + mov v3.d[0], vctr.d[0] +ST5( bti c ) +ST5( mov v4.d[0], vctr.d[0] ) +1: b 2f + .previous + +2: rev x7, IV_PART + ins vctr.d[1], x7 + sub x7, IV_PART, #MAX_STRIDE - 1 + sub x8, IV_PART, #MAX_STRIDE - 2 + sub x9, IV_PART, #MAX_STRIDE - 3 + rev x7, x7 + rev x8, x8 + mov v1.d[1], x7 + rev x9, x9 +ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) + mov v2.d[1], x8 +ST5( rev x10, x10 ) + mov v3.d[1], x9 +ST5( mov v4.d[1], x10 ) + .endif -2: rev x7, x12 - ins vctr.d[1], x7 - sub x7, x12, #MAX_STRIDE - 1 - sub x8, x12, #MAX_STRIDE - 2 - sub x9, x12, #MAX_STRIDE - 3 - rev x7, x7 - rev x8, x8 - mov v1.d[1], x7 - rev x9, x9 -ST5( sub x10, x12, #MAX_STRIDE - 4 ) - mov v2.d[1], x8 -ST5( rev x10, x10 ) - mov v3.d[1], x9 -ST5( mov v4.d[1], x10 ) - tbnz w4, #31, .Lctrtail - ld1 {v5.16b-v7.16b}, [x1], #48 + /* + * If there are at least MAX_STRIDE blocks left, XOR the data with + * keystream and store. Otherwise jump to tail handling. + */ + tbnz BYTES_W, #31, .Lctrtail\xctr + ld1 {v5.16b-v7.16b}, [IN], #48 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) eor v0.16b, v5.16b, v0.16b -ST4( ld1 {v5.16b}, [x1], #16 ) +ST4( ld1 {v5.16b}, [IN], #16 ) eor v1.16b, v6.16b, v1.16b -ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) +ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) eor v2.16b, v7.16b, v2.16b eor v3.16b, v5.16b, v3.16b ST5( eor v4.16b, v6.16b, v4.16b ) - st1 {v0.16b-v3.16b}, [x0], #64 -ST5( st1 {v4.16b}, [x0], #16 ) - cbz w4, .Lctrout - b .LctrloopNx - -.Lctrout: - st1 {vctr.16b}, [x5] /* return next CTR value */ + st1 {v0.16b-v3.16b}, [OUT], #64 +ST5( st1 {v4.16b}, [OUT], #16 ) + cbz BYTES_W, .Lctrout\xctr + b .LctrloopNx\xctr + +.Lctrout\xctr: + .if !\xctr + st1 {vctr.16b}, [IV] /* return next CTR value */ + .endif ldp x29, x30, [sp], #16 ret -.Lctrtail: - /* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */ +.Lctrtail\xctr: + /* + * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext + * + * This code expects the last keystream block to be in v{MAX_STRIDE-1}. + * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and + * v4 should have the next two counter blocks. + * + * This allows us to store the ciphertext by writing to overlapping + * regions of memory. Any invalid ciphertext blocks get overwritten by + * correctly computed blocks. This approach greatly simplifies the + * logic for storing the ciphertext. + */ mov x16, #16 - ands x13, x4, #0xf - csel x13, x13, x16, ne + ands w7, BYTES_W, #0xf + csel x13, x7, x16, ne -ST5( cmp w4, #64 - (MAX_STRIDE << 4) ) +ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) ST5( csel x14, x16, xzr, gt ) - cmp w4, #48 - (MAX_STRIDE << 4) + cmp BYTES_W, #48 - (MAX_STRIDE << 4) csel x15, x16, xzr, gt - cmp w4, #32 - (MAX_STRIDE << 4) + cmp BYTES_W, #32 - (MAX_STRIDE << 4) csel x16, x16, xzr, gt - cmp w4, #16 - (MAX_STRIDE << 4) - ble .Lctrtail1x + cmp BYTES_W, #16 - (MAX_STRIDE << 4) - adr_l x12, .Lcts_permute_table - add x12, x12, x13 + adr_l x9, .Lcts_permute_table + add x9, x9, x13 + ble .Lctrtail1x\xctr -ST5( ld1 {v5.16b}, [x1], x14 ) - ld1 {v6.16b}, [x1], x15 - ld1 {v7.16b}, [x1], x16 +ST5( ld1 {v5.16b}, [IN], x14 ) + ld1 {v6.16b}, [IN], x15 + ld1 {v7.16b}, [IN], x16 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) - ld1 {v8.16b}, [x1], x13 - ld1 {v9.16b}, [x1] - ld1 {v10.16b}, [x12] + ld1 {v8.16b}, [IN], x13 + ld1 {v9.16b}, [IN] + ld1 {v10.16b}, [x9] ST4( eor v6.16b, v6.16b, v0.16b ) ST4( eor v7.16b, v7.16b, v1.16b ) @@ -453,24 +537,91 @@ ST5( eor v7.16b, v7.16b, v2.16b ) ST5( eor v8.16b, v8.16b, v3.16b ) ST5( eor v9.16b, v9.16b, v4.16b ) -ST5( st1 {v5.16b}, [x0], x14 ) - st1 {v6.16b}, [x0], x15 - st1 {v7.16b}, [x0], x16 - add x13, x13, x0 +ST5( st1 {v5.16b}, [OUT], x14 ) + st1 {v6.16b}, [OUT], x15 + st1 {v7.16b}, [OUT], x16 + add x13, x13, OUT st1 {v9.16b}, [x13] // overlapping stores - st1 {v8.16b}, [x0] - b .Lctrout + st1 {v8.16b}, [OUT] + b .Lctrout\xctr -.Lctrtail1x: - csel x0, x0, x6, eq // use finalbuf if less than a full block - ld1 {v5.16b}, [x1] +.Lctrtail1x\xctr: + /* + * Handle <= 16 bytes of plaintext + * + * This code always reads and writes 16 bytes. To avoid out of bounds + * accesses, XCTR and CTR modes must use a temporary buffer when + * encrypting/decrypting less than 16 bytes. + * + * This code is unusual in that it loads the input and stores the output + * relative to the end of the buffers rather than relative to the start. + * This causes unusual behaviour when encrypting/decrypting less than 16 + * bytes; the end of the data is expected to be at the end of the + * temporary buffer rather than the start of the data being at the start + * of the temporary buffer. + */ + sub x8, x7, #16 + csel x7, x7, x8, eq + add IN, IN, x7 + add OUT, OUT, x7 + ld1 {v5.16b}, [IN] + ld1 {v6.16b}, [OUT] ST5( mov v3.16b, v4.16b ) - encrypt_block v3, w3, x2, x8, w7 + encrypt_block v3, ROUNDS_W, KEY, x8, w7 + ld1 {v10.16b-v11.16b}, [x9] + tbl v3.16b, {v3.16b}, v10.16b + sshr v11.16b, v11.16b, #7 eor v5.16b, v5.16b, v3.16b - st1 {v5.16b}, [x0] - b .Lctrout + bif v5.16b, v6.16b, v11.16b + st1 {v5.16b}, [OUT] + b .Lctrout\xctr + + // Arguments + .unreq OUT + .unreq IN + .unreq KEY + .unreq ROUNDS_W + .unreq BYTES_W + .unreq IV + .unreq BYTE_CTR_W // XCTR only + // Intermediate values + .unreq CTR_W // XCTR only + .unreq CTR // XCTR only + .unreq IV_PART + .unreq BLOCKS + .unreq BLOCKS_W +.endm + + /* + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 ctr[]) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ + +AES_FUNC_START(aes_ctr_encrypt) + ctr_encrypt 0 AES_FUNC_END(aes_ctr_encrypt) + /* + * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 const iv[], int byte_ctr) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ + +AES_FUNC_START(aes_xctr_encrypt) + ctr_encrypt 1 +AES_FUNC_END(aes_xctr_encrypt) + /* * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index e47d3ec2cfb4..9de7fbc797af 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -66,7 +66,7 @@ prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp .endm - /* apply SubBytes transformation using the the preloaded Sbox */ + /* apply SubBytes transformation using the preloaded Sbox */ .macro sub_bytes, in sub v9.16b, \in\().16b, v15.16b tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index a3405b8c344b..d427f4556b6e 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -735,119 +735,67 @@ SYM_FUNC_END(aesbs_cbc_decrypt) * int blocks, u8 iv[]) */ SYM_FUNC_START_LOCAL(__xts_crypt8) - mov x6, #1 - lsl x6, x6, x23 - subs w23, w23, #8 - csel x23, x23, xzr, pl - csel x6, x6, xzr, mi + movi v18.2s, #0x1 + movi v19.2s, #0x87 + uzp1 v18.4s, v18.4s, v19.4s + + ld1 {v0.16b-v3.16b}, [x1], #64 + ld1 {v4.16b-v7.16b}, [x1], #64 + + next_tweak v26, v25, v18, v19 + next_tweak v27, v26, v18, v19 + next_tweak v28, v27, v18, v19 + next_tweak v29, v28, v18, v19 + next_tweak v30, v29, v18, v19 + next_tweak v31, v30, v18, v19 + next_tweak v16, v31, v18, v19 + next_tweak v17, v16, v18, v19 - ld1 {v0.16b}, [x20], #16 - next_tweak v26, v25, v30, v31 eor v0.16b, v0.16b, v25.16b - tbnz x6, #1, 0f - - ld1 {v1.16b}, [x20], #16 - next_tweak v27, v26, v30, v31 eor v1.16b, v1.16b, v26.16b - tbnz x6, #2, 0f - - ld1 {v2.16b}, [x20], #16 - next_tweak v28, v27, v30, v31 eor v2.16b, v2.16b, v27.16b - tbnz x6, #3, 0f - - ld1 {v3.16b}, [x20], #16 - next_tweak v29, v28, v30, v31 eor v3.16b, v3.16b, v28.16b - tbnz x6, #4, 0f - - ld1 {v4.16b}, [x20], #16 - str q29, [sp, #.Lframe_local_offset] eor v4.16b, v4.16b, v29.16b - next_tweak v29, v29, v30, v31 - tbnz x6, #5, 0f - - ld1 {v5.16b}, [x20], #16 - str q29, [sp, #.Lframe_local_offset + 16] - eor v5.16b, v5.16b, v29.16b - next_tweak v29, v29, v30, v31 - tbnz x6, #6, 0f - - ld1 {v6.16b}, [x20], #16 - str q29, [sp, #.Lframe_local_offset + 32] - eor v6.16b, v6.16b, v29.16b - next_tweak v29, v29, v30, v31 - tbnz x6, #7, 0f + eor v5.16b, v5.16b, v30.16b + eor v6.16b, v6.16b, v31.16b + eor v7.16b, v7.16b, v16.16b - ld1 {v7.16b}, [x20], #16 - str q29, [sp, #.Lframe_local_offset + 48] - eor v7.16b, v7.16b, v29.16b - next_tweak v29, v29, v30, v31 + stp q16, q17, [sp, #16] -0: mov bskey, x21 - mov rounds, x22 + mov bskey, x2 + mov rounds, x3 br x16 SYM_FUNC_END(__xts_crypt8) .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 - frame_push 6, 64 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 + stp x29, x30, [sp, #-48]! + mov x29, sp - movi v30.2s, #0x1 - movi v25.2s, #0x87 - uzp1 v30.4s, v30.4s, v25.4s - ld1 {v25.16b}, [x24] + ld1 {v25.16b}, [x5] -99: adr x16, \do8 +0: adr x16, \do8 bl __xts_crypt8 - ldp q16, q17, [sp, #.Lframe_local_offset] - ldp q18, q19, [sp, #.Lframe_local_offset + 32] + eor v16.16b, \o0\().16b, v25.16b + eor v17.16b, \o1\().16b, v26.16b + eor v18.16b, \o2\().16b, v27.16b + eor v19.16b, \o3\().16b, v28.16b - eor \o0\().16b, \o0\().16b, v25.16b - eor \o1\().16b, \o1\().16b, v26.16b - eor \o2\().16b, \o2\().16b, v27.16b - eor \o3\().16b, \o3\().16b, v28.16b + ldp q24, q25, [sp, #16] - st1 {\o0\().16b}, [x19], #16 - mov v25.16b, v26.16b - tbnz x6, #1, 1f - st1 {\o1\().16b}, [x19], #16 - mov v25.16b, v27.16b - tbnz x6, #2, 1f - st1 {\o2\().16b}, [x19], #16 - mov v25.16b, v28.16b - tbnz x6, #3, 1f - st1 {\o3\().16b}, [x19], #16 - mov v25.16b, v29.16b - tbnz x6, #4, 1f + eor v20.16b, \o4\().16b, v29.16b + eor v21.16b, \o5\().16b, v30.16b + eor v22.16b, \o6\().16b, v31.16b + eor v23.16b, \o7\().16b, v24.16b - eor \o4\().16b, \o4\().16b, v16.16b - eor \o5\().16b, \o5\().16b, v17.16b - eor \o6\().16b, \o6\().16b, v18.16b - eor \o7\().16b, \o7\().16b, v19.16b + st1 {v16.16b-v19.16b}, [x0], #64 + st1 {v20.16b-v23.16b}, [x0], #64 - st1 {\o4\().16b}, [x19], #16 - tbnz x6, #5, 1f - st1 {\o5\().16b}, [x19], #16 - tbnz x6, #6, 1f - st1 {\o6\().16b}, [x19], #16 - tbnz x6, #7, 1f - st1 {\o7\().16b}, [x19], #16 + subs x4, x4, #8 + b.gt 0b - cbz x23, 1f - st1 {v25.16b}, [x24] - - b 99b - -1: st1 {v25.16b}, [x24] - frame_pop + st1 {v25.16b}, [x5] + ldp x29, x30, [sp], #48 ret .endm @@ -869,133 +817,51 @@ SYM_FUNC_END(aesbs_xts_decrypt) /* * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], - * int rounds, int blocks, u8 iv[], u8 final[]) + * int rounds, int blocks, u8 iv[]) */ SYM_FUNC_START(aesbs_ctr_encrypt) - frame_push 8 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 + stp x29, x30, [sp, #-16]! + mov x29, sp - cmp x25, #0 - cset x26, ne - add x23, x23, x26 // do one extra block if final - - ldp x7, x8, [x24] - ld1 {v0.16b}, [x24] + ldp x7, x8, [x5] + ld1 {v0.16b}, [x5] CPU_LE( rev x7, x7 ) CPU_LE( rev x8, x8 ) adds x8, x8, #1 adc x7, x7, xzr -99: mov x9, #1 - lsl x9, x9, x23 - subs w23, w23, #8 - csel x23, x23, xzr, pl - csel x9, x9, xzr, le - - tbnz x9, #1, 0f - next_ctr v1 - tbnz x9, #2, 0f +0: next_ctr v1 next_ctr v2 - tbnz x9, #3, 0f next_ctr v3 - tbnz x9, #4, 0f next_ctr v4 - tbnz x9, #5, 0f next_ctr v5 - tbnz x9, #6, 0f next_ctr v6 - tbnz x9, #7, 0f next_ctr v7 -0: mov bskey, x21 - mov rounds, x22 + mov bskey, x2 + mov rounds, x3 bl aesbs_encrypt8 - lsr x9, x9, x26 // disregard the extra block - tbnz x9, #0, 0f - - ld1 {v8.16b}, [x20], #16 - eor v0.16b, v0.16b, v8.16b - st1 {v0.16b}, [x19], #16 - tbnz x9, #1, 1f + ld1 { v8.16b-v11.16b}, [x1], #64 + ld1 {v12.16b-v15.16b}, [x1], #64 - ld1 {v9.16b}, [x20], #16 - eor v1.16b, v1.16b, v9.16b - st1 {v1.16b}, [x19], #16 - tbnz x9, #2, 2f + eor v8.16b, v0.16b, v8.16b + eor v9.16b, v1.16b, v9.16b + eor v10.16b, v4.16b, v10.16b + eor v11.16b, v6.16b, v11.16b + eor v12.16b, v3.16b, v12.16b + eor v13.16b, v7.16b, v13.16b + eor v14.16b, v2.16b, v14.16b + eor v15.16b, v5.16b, v15.16b - ld1 {v10.16b}, [x20], #16 - eor v4.16b, v4.16b, v10.16b - st1 {v4.16b}, [x19], #16 - tbnz x9, #3, 3f + st1 { v8.16b-v11.16b}, [x0], #64 + st1 {v12.16b-v15.16b}, [x0], #64 - ld1 {v11.16b}, [x20], #16 - eor v6.16b, v6.16b, v11.16b - st1 {v6.16b}, [x19], #16 - tbnz x9, #4, 4f - - ld1 {v12.16b}, [x20], #16 - eor v3.16b, v3.16b, v12.16b - st1 {v3.16b}, [x19], #16 - tbnz x9, #5, 5f - - ld1 {v13.16b}, [x20], #16 - eor v7.16b, v7.16b, v13.16b - st1 {v7.16b}, [x19], #16 - tbnz x9, #6, 6f + next_ctr v0 + subs x4, x4, #8 + b.gt 0b - ld1 {v14.16b}, [x20], #16 - eor v2.16b, v2.16b, v14.16b - st1 {v2.16b}, [x19], #16 - tbnz x9, #7, 7f - - ld1 {v15.16b}, [x20], #16 - eor v5.16b, v5.16b, v15.16b - st1 {v5.16b}, [x19], #16 - -8: next_ctr v0 - st1 {v0.16b}, [x24] - cbz x23, .Lctr_done - - b 99b - -.Lctr_done: - frame_pop + st1 {v0.16b}, [x5] + ldp x29, x30, [sp], #16 ret - - /* - * If we are handling the tail of the input (x6 != NULL), return the - * final keystream block back to the caller. - */ -0: cbz x25, 8b - st1 {v0.16b}, [x25] - b 8b -1: cbz x25, 8b - st1 {v1.16b}, [x25] - b 8b -2: cbz x25, 8b - st1 {v4.16b}, [x25] - b 8b -3: cbz x25, 8b - st1 {v6.16b}, [x25] - b 8b -4: cbz x25, 8b - st1 {v3.16b}, [x25] - b 8b -5: cbz x25, 8b - st1 {v7.16b}, [x25] - b 8b -6: cbz x25, 8b - st1 {v2.16b}, [x25] - b 8b -7: cbz x25, 8b - st1 {v5.16b}, [x25] - b 8b SYM_FUNC_END(aesbs_ctr_encrypt) diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 8df6ad8cb09d..bac4cabef607 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -34,7 +34,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[]); asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], - int rounds, int blocks, u8 iv[], u8 final[]); + int rounds, int blocks, u8 iv[]); asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[]); @@ -46,6 +46,8 @@ asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, int blocks); asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, int blocks, u8 iv[]); +asmlinkage void neon_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[]); asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int bytes, u32 const rk2[], u8 iv[], int first); @@ -58,7 +60,7 @@ struct aesbs_ctx { int rounds; } __aligned(AES_BLOCK_SIZE); -struct aesbs_cbc_ctx { +struct aesbs_cbc_ctr_ctx { struct aesbs_ctx key; u32 enc[AES_MAX_KEYLENGTH_U32]; }; @@ -128,10 +130,10 @@ static int ecb_decrypt(struct skcipher_request *req) return __ecb_crypt(req, aesbs_ecb_decrypt); } -static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key, +static int aesbs_cbc_ctr_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_aes_ctx rk; int err; @@ -154,7 +156,7 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key, static int cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; int err; @@ -177,7 +179,7 @@ static int cbc_encrypt(struct skcipher_request *req) static int cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; int err; @@ -205,40 +207,32 @@ static int cbc_decrypt(struct skcipher_request *req) static int ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; - u8 buf[AES_BLOCK_SIZE]; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes > 0) { - unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; - u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL; - - if (walk.nbytes < walk.total) { - blocks = round_down(blocks, - walk.stride / AES_BLOCK_SIZE); - final = NULL; - } + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; + int nbytes = walk.nbytes % (8 * AES_BLOCK_SIZE); + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; kernel_neon_begin(); - aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->rk, ctx->rounds, blocks, walk.iv, final); - kernel_neon_end(); - - if (final) { - u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; - u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - - crypto_xor_cpy(dst, src, final, - walk.total % AES_BLOCK_SIZE); - - err = skcipher_walk_done(&walk, 0); - break; + if (blocks >= 8) { + aesbs_ctr_encrypt(dst, src, ctx->key.rk, ctx->key.rounds, + blocks, walk.iv); + dst += blocks * AES_BLOCK_SIZE; + src += blocks * AES_BLOCK_SIZE; } - err = skcipher_walk_done(&walk, - walk.nbytes - blocks * AES_BLOCK_SIZE); + if (nbytes && walk.nbytes == walk.total) { + neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds, + nbytes, walk.iv); + nbytes = 0; + } + kernel_neon_end(); + err = skcipher_walk_done(&walk, nbytes); } return err; } @@ -308,23 +302,18 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, return err; while (walk.nbytes >= AES_BLOCK_SIZE) { - unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; - - if (walk.nbytes < walk.total || walk.nbytes % AES_BLOCK_SIZE) - blocks = round_down(blocks, - walk.stride / AES_BLOCK_SIZE); - + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; out = walk.dst.virt.addr; in = walk.src.virt.addr; nbytes = walk.nbytes; kernel_neon_begin(); - if (likely(blocks > 6)) { /* plain NEON is faster otherwise */ - if (first) + if (blocks >= 8) { + if (first == 1) neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey, ctx->key.rounds, 1); - first = 0; + first = 2; fn(out, in, ctx->key.rk, ctx->key.rounds, blocks, walk.iv); @@ -333,10 +322,17 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, in += blocks * AES_BLOCK_SIZE; nbytes -= blocks * AES_BLOCK_SIZE; } - - if (walk.nbytes == walk.total && nbytes > 0) - goto xts_tail; - + if (walk.nbytes == walk.total && nbytes > 0) { + if (encrypt) + neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, + ctx->key.rounds, nbytes, + ctx->twkey, walk.iv, first); + else + neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, + ctx->key.rounds, nbytes, + ctx->twkey, walk.iv, first); + nbytes = first = 0; + } kernel_neon_end(); err = skcipher_walk_done(&walk, nbytes); } @@ -361,13 +357,12 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, nbytes = walk.nbytes; kernel_neon_begin(); -xts_tail: if (encrypt) neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds, - nbytes, ctx->twkey, walk.iv, first ?: 2); + nbytes, ctx->twkey, walk.iv, first); else neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds, - nbytes, ctx->twkey, walk.iv, first ?: 2); + nbytes, ctx->twkey, walk.iv, first); kernel_neon_end(); return skcipher_walk_done(&walk, 0); @@ -402,14 +397,14 @@ static struct skcipher_alg aes_algs[] = { { .base.cra_driver_name = "cbc-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx), + .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx), .base.cra_module = THIS_MODULE, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .walksize = 8 * AES_BLOCK_SIZE, .ivsize = AES_BLOCK_SIZE, - .setkey = aesbs_cbc_setkey, + .setkey = aesbs_cbc_ctr_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, { @@ -417,7 +412,7 @@ static struct skcipher_alg aes_algs[] = { { .base.cra_driver_name = "ctr-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx), .base.cra_module = THIS_MODULE, .min_keysize = AES_MIN_KEY_SIZE, @@ -425,7 +420,7 @@ static struct skcipher_alg aes_algs[] = { { .chunksize = AES_BLOCK_SIZE, .walksize = 8 * AES_BLOCK_SIZE, .ivsize = AES_BLOCK_SIZE, - .setkey = aesbs_setkey, + .setkey = aesbs_cbc_ctr_setkey, .encrypt = ctr_encrypt, .decrypt = ctr_encrypt, }, { diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index 7868330dd54e..ebe5558929b7 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/assembler.h> SHASH .req v0 @@ -350,11 +351,11 @@ CPU_LE( rev64 T1.16b, T1.16b ) * void pmull_ghash_update(int blocks, u64 dg[], const char *src, * struct ghash_key const *k, const char *head) */ -SYM_FUNC_START(pmull_ghash_update_p64) +SYM_TYPED_FUNC_START(pmull_ghash_update_p64) __pmull_ghash p64 SYM_FUNC_END(pmull_ghash_update_p64) -SYM_FUNC_START(pmull_ghash_update_p8) +SYM_TYPED_FUNC_START(pmull_ghash_update_p8) __pmull_ghash p8 SYM_FUNC_END(pmull_ghash_update_p8) diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 9c3d86e397bf..1fae18ba11ed 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -52,7 +52,7 @@ static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, { if (unlikely(!dctx->sset)) { if (!dctx->rset) { - poly1305_init_arch(dctx, src); + poly1305_init_arm64(&dctx->h, src); src += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; dctx->rset = 1; diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S new file mode 100644 index 000000000000..b5326540d2e3 --- /dev/null +++ b/arch/arm64/crypto/polyval-ce-core.S @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Implementation of POLYVAL using ARMv8 Crypto Extensions. + * + * Copyright 2021 Google LLC + */ +/* + * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions + * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8, + * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split + * finite field multiplication into two steps. + * + * In the first step, we consider h^i, m_i as normal polynomials of degree less + * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication + * is simply polynomial multiplication. + * + * In the second step, we compute the reduction of p(x) modulo the finite field + * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. + * + * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where + * multiplication is finite field multiplication. The advantage is that the + * two-step process only requires 1 finite field reduction for every 8 + * polynomial multiplications. Further parallelism is gained by interleaving the + * multiplications and polynomial reductions. + */ + +#include <linux/linkage.h> +#define STRIDE_BLOCKS 8 + +KEY_POWERS .req x0 +MSG .req x1 +BLOCKS_LEFT .req x2 +ACCUMULATOR .req x3 +KEY_START .req x10 +EXTRA_BYTES .req x11 +TMP .req x13 + +M0 .req v0 +M1 .req v1 +M2 .req v2 +M3 .req v3 +M4 .req v4 +M5 .req v5 +M6 .req v6 +M7 .req v7 +KEY8 .req v8 +KEY7 .req v9 +KEY6 .req v10 +KEY5 .req v11 +KEY4 .req v12 +KEY3 .req v13 +KEY2 .req v14 +KEY1 .req v15 +PL .req v16 +PH .req v17 +TMP_V .req v18 +LO .req v20 +MI .req v21 +HI .req v22 +SUM .req v23 +GSTAR .req v24 + + .text + + .arch armv8-a+crypto + .align 4 + +.Lgstar: + .quad 0xc200000000000000, 0xc200000000000000 + +/* + * Computes the product of two 128-bit polynomials in X and Y and XORs the + * components of the 256-bit product into LO, MI, HI. + * + * Given: + * X = [X_1 : X_0] + * Y = [Y_1 : Y_0] + * + * We compute: + * LO += X_0 * Y_0 + * MI += (X_0 + X_1) * (Y_0 + Y_1) + * HI += X_1 * Y_1 + * + * Later, the 256-bit result can be extracted as: + * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0] + * This step is done when computing the polynomial reduction for efficiency + * reasons. + * + * Karatsuba multiplication is used instead of Schoolbook multiplication because + * it was found to be slightly faster on ARM64 CPUs. + * + */ +.macro karatsuba1 X Y + X .req \X + Y .req \Y + ext v25.16b, X.16b, X.16b, #8 + ext v26.16b, Y.16b, Y.16b, #8 + eor v25.16b, v25.16b, X.16b + eor v26.16b, v26.16b, Y.16b + pmull2 v28.1q, X.2d, Y.2d + pmull v29.1q, X.1d, Y.1d + pmull v27.1q, v25.1d, v26.1d + eor HI.16b, HI.16b, v28.16b + eor LO.16b, LO.16b, v29.16b + eor MI.16b, MI.16b, v27.16b + .unreq X + .unreq Y +.endm + +/* + * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into + * them. + */ +.macro karatsuba1_store X Y + X .req \X + Y .req \Y + ext v25.16b, X.16b, X.16b, #8 + ext v26.16b, Y.16b, Y.16b, #8 + eor v25.16b, v25.16b, X.16b + eor v26.16b, v26.16b, Y.16b + pmull2 HI.1q, X.2d, Y.2d + pmull LO.1q, X.1d, Y.1d + pmull MI.1q, v25.1d, v26.1d + .unreq X + .unreq Y +.endm + +/* + * Computes the 256-bit polynomial represented by LO, HI, MI. Stores + * the result in PL, PH. + * [PH : PL] = + * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0] + */ +.macro karatsuba2 + // v4 = [HI_1 + MI_1 : HI_0 + MI_0] + eor v4.16b, HI.16b, MI.16b + // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0] + eor v4.16b, v4.16b, LO.16b + // v5 = [HI_0 : LO_1] + ext v5.16b, LO.16b, HI.16b, #8 + // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0] + eor v4.16b, v4.16b, v5.16b + // HI = [HI_0 : HI_1] + ext HI.16b, HI.16b, HI.16b, #8 + // LO = [LO_0 : LO_1] + ext LO.16b, LO.16b, LO.16b, #8 + // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1] + ext PH.16b, v4.16b, HI.16b, #8 + // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0] + ext PL.16b, LO.16b, v4.16b, #8 +.endm + +/* + * Computes the 128-bit reduction of PH : PL. Stores the result in dest. + * + * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = + * x^128 + x^127 + x^126 + x^121 + 1. + * + * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the + * product of two 128-bit polynomials in Montgomery form. We need to reduce it + * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor + * of x^128, this product has two extra factors of x^128. To get it back into + * Montgomery form, we need to remove one of these factors by dividing by x^128. + * + * To accomplish both of these goals, we add multiples of g(x) that cancel out + * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low + * bits are zero, the polynomial division by x^128 can be done by right + * shifting. + * + * Since the only nonzero term in the low 64 bits of g(x) is the constant term, + * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can + * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + + * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to + * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T + * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. + * + * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits + * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 + * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * + * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : + * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). + * + * So our final computation is: + * T = T_1 : T_0 = g*(x) * P_0 + * V = V_1 : V_0 = g*(x) * (P_1 + T_0) + * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 + * + * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 + * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : + * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. + */ +.macro montgomery_reduction dest + DEST .req \dest + // TMP_V = T_1 : T_0 = P_0 * g*(x) + pmull TMP_V.1q, PL.1d, GSTAR.1d + // TMP_V = T_0 : T_1 + ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 + // TMP_V = P_1 + T_0 : P_0 + T_1 + eor TMP_V.16b, PL.16b, TMP_V.16b + // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 + eor PH.16b, PH.16b, TMP_V.16b + // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x) + pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d + eor DEST.16b, PH.16b, TMP_V.16b + .unreq DEST +.endm + +/* + * Compute Polyval on 8 blocks. + * + * If reduce is set, also computes the montgomery reduction of the + * previous full_stride call and XORs with the first message block. + * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. + * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. + * + * Sets PL, PH. + */ +.macro full_stride reduce + eor LO.16b, LO.16b, LO.16b + eor MI.16b, MI.16b, MI.16b + eor HI.16b, HI.16b, HI.16b + + ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 + ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64 + + karatsuba1 M7 KEY1 + .if \reduce + pmull TMP_V.1q, PL.1d, GSTAR.1d + .endif + + karatsuba1 M6 KEY2 + .if \reduce + ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 + .endif + + karatsuba1 M5 KEY3 + .if \reduce + eor TMP_V.16b, PL.16b, TMP_V.16b + .endif + + karatsuba1 M4 KEY4 + .if \reduce + eor PH.16b, PH.16b, TMP_V.16b + .endif + + karatsuba1 M3 KEY5 + .if \reduce + pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d + .endif + + karatsuba1 M2 KEY6 + .if \reduce + eor SUM.16b, PH.16b, TMP_V.16b + .endif + + karatsuba1 M1 KEY7 + eor M0.16b, M0.16b, SUM.16b + + karatsuba1 M0 KEY8 + karatsuba2 +.endm + +/* + * Handle any extra blocks after full_stride loop. + */ +.macro partial_stride + add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4) + sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4 + ld1 {KEY1.16b}, [KEY_POWERS], #16 + + ld1 {TMP_V.16b}, [MSG], #16 + eor SUM.16b, SUM.16b, TMP_V.16b + karatsuba1_store KEY1 SUM + sub BLOCKS_LEFT, BLOCKS_LEFT, #1 + + tst BLOCKS_LEFT, #4 + beq .Lpartial4BlocksDone + ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 + ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 + karatsuba1 M0 KEY8 + karatsuba1 M1 KEY7 + karatsuba1 M2 KEY6 + karatsuba1 M3 KEY5 +.Lpartial4BlocksDone: + tst BLOCKS_LEFT, #2 + beq .Lpartial2BlocksDone + ld1 {M0.16b, M1.16b}, [MSG], #32 + ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32 + karatsuba1 M0 KEY8 + karatsuba1 M1 KEY7 +.Lpartial2BlocksDone: + tst BLOCKS_LEFT, #1 + beq .LpartialDone + ld1 {M0.16b}, [MSG], #16 + ld1 {KEY8.16b}, [KEY_POWERS], #16 + karatsuba1 M0 KEY8 +.LpartialDone: + karatsuba2 + montgomery_reduction SUM +.endm + +/* + * Perform montgomery multiplication in GF(2^128) and store result in op1. + * + * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 + * If op1, op2 are in montgomery form, this computes the montgomery + * form of op1*op2. + * + * void pmull_polyval_mul(u8 *op1, const u8 *op2); + */ +SYM_FUNC_START(pmull_polyval_mul) + adr TMP, .Lgstar + ld1 {GSTAR.2d}, [TMP] + ld1 {v0.16b}, [x0] + ld1 {v1.16b}, [x1] + karatsuba1_store v0 v1 + karatsuba2 + montgomery_reduction SUM + st1 {SUM.16b}, [x0] + ret +SYM_FUNC_END(pmull_polyval_mul) + +/* + * Perform polynomial evaluation as specified by POLYVAL. This computes: + * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} + * where n=nblocks, h is the hash key, and m_i are the message blocks. + * + * x0 - pointer to precomputed key powers h^8 ... h^1 + * x1 - pointer to message blocks + * x2 - number of blocks to hash + * x3 - pointer to accumulator + * + * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, + * size_t nblocks, u8 *accumulator); + */ +SYM_FUNC_START(pmull_polyval_update) + adr TMP, .Lgstar + mov KEY_START, KEY_POWERS + ld1 {GSTAR.2d}, [TMP] + ld1 {SUM.16b}, [ACCUMULATOR] + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + blt .LstrideLoopExit + ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 + ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64 + full_stride 0 + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + blt .LstrideLoopExitReduce +.LstrideLoop: + full_stride 1 + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + bge .LstrideLoop +.LstrideLoopExitReduce: + montgomery_reduction SUM +.LstrideLoopExit: + adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + beq .LskipPartial + partial_stride +.LskipPartial: + st1 {SUM.16b}, [ACCUMULATOR] + ret +SYM_FUNC_END(pmull_polyval_update) diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c new file mode 100644 index 000000000000..0a3b5718df85 --- /dev/null +++ b/arch/arm64/crypto/polyval-ce-glue.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Glue code for POLYVAL using ARMv8 Crypto Extensions + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Copyright 2021 Google LLC + */ + +/* + * Glue code based on ghash-clmulni-intel_glue.c. + * + * This implementation of POLYVAL uses montgomery multiplication accelerated by + * ARMv8 Crypto Extensions instructions to implement the finite field operations. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/polyval.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> + +#define NUM_KEY_POWERS 8 + +struct polyval_tfm_ctx { + /* + * These powers must be in the order h^8, ..., h^1. + */ + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; +}; + +struct polyval_desc_ctx { + u8 buffer[POLYVAL_BLOCK_SIZE]; + u32 bytes; +}; + +asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator); +asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); + +static void internal_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + pmull_polyval_update(keys, in, nblocks, accumulator); + kernel_neon_end(); + } else { + polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, + nblocks, accumulator); + } +} + +static void internal_polyval_mul(u8 *op1, const u8 *op2) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + pmull_polyval_mul(op1, op2); + kernel_neon_end(); + } else { + polyval_mul_non4k(op1, op2); + } +} + +static int polyval_arm64_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); + int i; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); + + for (i = NUM_KEY_POWERS-2; i >= 0; i--) { + memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); + internal_polyval_mul(tctx->key_powers[i], + tctx->key_powers[i+1]); + } + + return 0; +} + +static int polyval_arm64_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_arm64_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + u8 *pos; + unsigned int nblocks; + unsigned int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + /* allow rescheduling every 4K bytes */ + nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + internal_polyval_update(tctx, src, nblocks, dctx->buffer); + srclen -= nblocks * POLYVAL_BLOCK_SIZE; + src += nblocks * POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer; + while (srclen--) + *pos++ ^= *src++; + } + + return 0; +} + +static int polyval_arm64_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + + if (dctx->bytes) { + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_arm64_init, + .update = polyval_arm64_update, + .final = polyval_arm64_final, + .setkey = polyval_arm64_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-ce", + .cra_priority = 200, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_module = THIS_MODULE, + }, +}; + +static int __init polyval_ce_mod_init(void) +{ + return crypto_register_shash(&polyval_alg); +} + +static void __exit polyval_ce_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +module_cpu_feature_match(PMULL, polyval_ce_mod_init) +module_exit(polyval_ce_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-ce"); diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index 8c65cecf560a..250e1377c481 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions * diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl index 2d8655d5b1af..35ec9ae99fe1 100644 --- a/arch/arm64/crypto/sha512-armv8.pl +++ b/arch/arm64/crypto/sha512-armv8.pl @@ -43,7 +43,7 @@ # on Cortex-A53 (or by 4 cycles per round). # (***) Super-impressive coefficients over gcc-generated code are # indication of some compiler "pathology", most notably code -# generated with -mgeneral-regs-only is significanty faster +# generated with -mgeneral-regs-only is significantly faster # and the gap is only 40-90%. # # October 2016. diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index e62a094a9d52..94cb7580deb7 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions * diff --git a/arch/arm64/crypto/sm3-ce-core.S b/arch/arm64/crypto/sm3-ce-core.S index ef97d3187cb7..ca70cfacd0d0 100644 --- a/arch/arm64/crypto/sm3-ce-core.S +++ b/arch/arm64/crypto/sm3-ce-core.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/assembler.h> .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 @@ -73,7 +74,7 @@ * int blocks) */ .text -SYM_FUNC_START(sm3_ce_transform) +SYM_TYPED_FUNC_START(sm3_ce_transform) /* load state */ ld1 {v8.4s-v9.4s}, [x0] rev64 v8.4s, v8.4s diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index d71faca322f2..ee98954ae8ca 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -26,8 +26,10 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, static int sm3_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) - return crypto_sm3_update(desc, data, len); + if (!crypto_simd_usable()) { + sm3_update(shash_desc_ctx(desc), data, len); + return 0; + } kernel_neon_begin(); sm3_base_do_update(desc, data, len, sm3_ce_transform); @@ -38,8 +40,10 @@ static int sm3_ce_update(struct shash_desc *desc, const u8 *data, static int sm3_ce_final(struct shash_desc *desc, u8 *out) { - if (!crypto_simd_usable()) - return crypto_sm3_finup(desc, NULL, 0, out); + if (!crypto_simd_usable()) { + sm3_final(shash_desc_ctx(desc), out); + return 0; + } kernel_neon_begin(); sm3_base_do_finalize(desc, sm3_ce_transform); @@ -51,14 +55,22 @@ static int sm3_ce_final(struct shash_desc *desc, u8 *out) static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) - return crypto_sm3_finup(desc, data, len, out); + if (!crypto_simd_usable()) { + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + sm3_final(sctx, out); + return 0; + } kernel_neon_begin(); - sm3_base_do_update(desc, data, len, sm3_ce_transform); + if (len) + sm3_base_do_update(desc, data, len, sm3_ce_transform); + sm3_base_do_finalize(desc, sm3_ce_transform); kernel_neon_end(); - return sm3_ce_final(desc, out); + return sm3_base_finish(desc, out); } static struct shash_alg sm3_alg = { diff --git a/arch/arm64/crypto/sm4-ce-cipher-core.S b/arch/arm64/crypto/sm4-ce-cipher-core.S new file mode 100644 index 000000000000..4ac6cfbc5797 --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-cipher-core.S @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8 + .set .Lv\b\().4s, \b + .endr + + .macro sm4e, rd, rn + .inst 0xcec08400 | .L\rd | (.L\rn << 5) + .endm + + /* + * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in); + */ + .text +SYM_FUNC_START(sm4_ce_do_crypt) + ld1 {v8.4s}, [x2] + ld1 {v0.4s-v3.4s}, [x0], #64 +CPU_LE( rev32 v8.16b, v8.16b ) + ld1 {v4.4s-v7.4s}, [x0] + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 +CPU_LE( rev32 v8.16b, v8.16b ) + st1 {v8.4s}, [x1] + ret +SYM_FUNC_END(sm4_ce_do_crypt) diff --git a/arch/arm64/crypto/sm4-ce-cipher-glue.c b/arch/arm64/crypto/sm4-ce-cipher-glue.c new file mode 100644 index 000000000000..76a34ef4abbb --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-cipher-glue.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/sm4.h> +#include <crypto/internal/simd.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/types.h> + +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-ce"); +MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in); + +static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + if (!crypto_simd_usable()) { + sm4_crypt_block(ctx->rkey_enc, out, in); + } else { + kernel_neon_begin(); + sm4_ce_do_crypt(ctx->rkey_enc, out, in); + kernel_neon_end(); + } +} + +static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + if (!crypto_simd_usable()) { + sm4_crypt_block(ctx->rkey_dec, out, in); + } else { + kernel_neon_begin(); + sm4_ce_do_crypt(ctx->rkey_dec, out, in); + kernel_neon_end(); + } +} + +static struct crypto_alg sm4_ce_alg = { + .cra_name = "sm4", + .cra_driver_name = "sm4-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + .cra_u.cipher = { + .cia_min_keysize = SM4_KEY_SIZE, + .cia_max_keysize = SM4_KEY_SIZE, + .cia_setkey = sm4_ce_setkey, + .cia_encrypt = sm4_ce_encrypt, + .cia_decrypt = sm4_ce_decrypt + } +}; + +static int __init sm4_ce_mod_init(void) +{ + return crypto_register_alg(&sm4_ce_alg); +} + +static void __exit sm4_ce_mod_fini(void) +{ + crypto_unregister_alg(&sm4_ce_alg); +} + +module_cpu_feature_match(SM4, sm4_ce_mod_init); +module_exit(sm4_ce_mod_fini); diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S index 4ac6cfbc5797..934e0f093279 100644 --- a/arch/arm64/crypto/sm4-ce-core.S +++ b/arch/arm64/crypto/sm4-ce-core.S @@ -1,36 +1,660 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ #include <linux/linkage.h> #include <asm/assembler.h> - .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8 - .set .Lv\b\().4s, \b - .endr - - .macro sm4e, rd, rn - .inst 0xcec08400 | .L\rd | (.L\rn << 5) - .endm - - /* - * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in); - */ - .text -SYM_FUNC_START(sm4_ce_do_crypt) - ld1 {v8.4s}, [x2] - ld1 {v0.4s-v3.4s}, [x0], #64 -CPU_LE( rev32 v8.16b, v8.16b ) - ld1 {v4.4s-v7.4s}, [x0] - sm4e v8.4s, v0.4s - sm4e v8.4s, v1.4s - sm4e v8.4s, v2.4s - sm4e v8.4s, v3.4s - sm4e v8.4s, v4.4s - sm4e v8.4s, v5.4s - sm4e v8.4s, v6.4s - sm4e v8.4s, v7.4s - rev64 v8.4s, v8.4s - ext v8.16b, v8.16b, v8.16b, #8 -CPU_LE( rev32 v8.16b, v8.16b ) - st1 {v8.4s}, [x1] - ret -SYM_FUNC_END(sm4_ce_do_crypt) +.arch armv8-a+crypto + +.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31 + .set .Lv\b\().4s, \b +.endr + +.macro sm4e, vd, vn + .inst 0xcec08400 | (.L\vn << 5) | .L\vd +.endm + +.macro sm4ekey, vd, vn, vm + .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd +.endm + +/* Register macros */ + +#define RTMP0 v16 +#define RTMP1 v17 +#define RTMP2 v18 +#define RTMP3 v19 + +#define RIV v20 + +/* Helper macros. */ + +#define PREPARE \ + ld1 {v24.16b-v27.16b}, [x0], #64; \ + ld1 {v28.16b-v31.16b}, [x0]; + +#define SM4_CRYPT_BLK(b0) \ + rev32 b0.16b, b0.16b; \ + sm4e b0.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + rev32 b0.16b, b0.16b; + +#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b2.4s, v24.4s; \ + sm4e b3.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b2.4s, v25.4s; \ + sm4e b3.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b2.4s, v26.4s; \ + sm4e b3.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b2.4s, v27.4s; \ + sm4e b3.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b2.4s, v28.4s; \ + sm4e b3.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b2.4s, v29.4s; \ + sm4e b3.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b2.4s, v30.4s; \ + sm4e b3.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + sm4e b2.4s, v31.4s; \ + sm4e b3.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + rev64 b2.4s, b2.4s; \ + rev64 b3.4s, b3.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + ext b2.16b, b2.16b, b2.16b, #8; \ + ext b3.16b, b3.16b, b3.16b, #8; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; + +#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b2.4s, v24.4s; \ + sm4e b3.4s, v24.4s; \ + sm4e b4.4s, v24.4s; \ + sm4e b5.4s, v24.4s; \ + sm4e b6.4s, v24.4s; \ + sm4e b7.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b2.4s, v25.4s; \ + sm4e b3.4s, v25.4s; \ + sm4e b4.4s, v25.4s; \ + sm4e b5.4s, v25.4s; \ + sm4e b6.4s, v25.4s; \ + sm4e b7.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b2.4s, v26.4s; \ + sm4e b3.4s, v26.4s; \ + sm4e b4.4s, v26.4s; \ + sm4e b5.4s, v26.4s; \ + sm4e b6.4s, v26.4s; \ + sm4e b7.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b2.4s, v27.4s; \ + sm4e b3.4s, v27.4s; \ + sm4e b4.4s, v27.4s; \ + sm4e b5.4s, v27.4s; \ + sm4e b6.4s, v27.4s; \ + sm4e b7.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b2.4s, v28.4s; \ + sm4e b3.4s, v28.4s; \ + sm4e b4.4s, v28.4s; \ + sm4e b5.4s, v28.4s; \ + sm4e b6.4s, v28.4s; \ + sm4e b7.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b2.4s, v29.4s; \ + sm4e b3.4s, v29.4s; \ + sm4e b4.4s, v29.4s; \ + sm4e b5.4s, v29.4s; \ + sm4e b6.4s, v29.4s; \ + sm4e b7.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b2.4s, v30.4s; \ + sm4e b3.4s, v30.4s; \ + sm4e b4.4s, v30.4s; \ + sm4e b5.4s, v30.4s; \ + sm4e b6.4s, v30.4s; \ + sm4e b7.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + sm4e b2.4s, v31.4s; \ + sm4e b3.4s, v31.4s; \ + sm4e b4.4s, v31.4s; \ + sm4e b5.4s, v31.4s; \ + sm4e b6.4s, v31.4s; \ + sm4e b7.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + rev64 b2.4s, b2.4s; \ + rev64 b3.4s, b3.4s; \ + rev64 b4.4s, b4.4s; \ + rev64 b5.4s, b5.4s; \ + rev64 b6.4s, b6.4s; \ + rev64 b7.4s, b7.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + ext b2.16b, b2.16b, b2.16b, #8; \ + ext b3.16b, b3.16b, b3.16b, #8; \ + ext b4.16b, b4.16b, b4.16b, #8; \ + ext b5.16b, b5.16b, b5.16b, #8; \ + ext b6.16b, b6.16b, b6.16b, #8; \ + ext b7.16b, b7.16b, b7.16b, #8; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; + + +.align 3 +SYM_FUNC_START(sm4_ce_expand_key) + /* input: + * x0: 128-bit key + * x1: rkey_enc + * x2: rkey_dec + * x3: fk array + * x4: ck array + */ + ld1 {v0.16b}, [x0]; + rev32 v0.16b, v0.16b; + ld1 {v1.16b}, [x3]; + /* load ck */ + ld1 {v24.16b-v27.16b}, [x4], #64; + ld1 {v28.16b-v31.16b}, [x4]; + + /* input ^ fk */ + eor v0.16b, v0.16b, v1.16b; + + sm4ekey v0.4s, v0.4s, v24.4s; + sm4ekey v1.4s, v0.4s, v25.4s; + sm4ekey v2.4s, v1.4s, v26.4s; + sm4ekey v3.4s, v2.4s, v27.4s; + sm4ekey v4.4s, v3.4s, v28.4s; + sm4ekey v5.4s, v4.4s, v29.4s; + sm4ekey v6.4s, v5.4s, v30.4s; + sm4ekey v7.4s, v6.4s, v31.4s; + + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b-v7.16b}, [x1]; + rev64 v7.4s, v7.4s; + rev64 v6.4s, v6.4s; + rev64 v5.4s, v5.4s; + rev64 v4.4s, v4.4s; + rev64 v3.4s, v3.4s; + rev64 v2.4s, v2.4s; + rev64 v1.4s, v1.4s; + rev64 v0.4s, v0.4s; + ext v7.16b, v7.16b, v7.16b, #8; + ext v6.16b, v6.16b, v6.16b, #8; + ext v5.16b, v5.16b, v5.16b, #8; + ext v4.16b, v4.16b, v4.16b, #8; + ext v3.16b, v3.16b, v3.16b, #8; + ext v2.16b, v2.16b, v2.16b, #8; + ext v1.16b, v1.16b, v1.16b, #8; + ext v0.16b, v0.16b, v0.16b, #8; + st1 {v7.16b}, [x2], #16; + st1 {v6.16b}, [x2], #16; + st1 {v5.16b}, [x2], #16; + st1 {v4.16b}, [x2], #16; + st1 {v3.16b}, [x2], #16; + st1 {v2.16b}, [x2], #16; + st1 {v1.16b}, [x2], #16; + st1 {v0.16b}, [x2]; + + ret; +SYM_FUNC_END(sm4_ce_expand_key) + +.align 3 +SYM_FUNC_START(sm4_ce_crypt_block) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + */ + PREPARE; + + ld1 {v0.16b}, [x2]; + SM4_CRYPT_BLK(v0); + st1 {v0.16b}, [x1]; + + ret; +SYM_FUNC_END(sm4_ce_crypt_block) + +.align 3 +SYM_FUNC_START(sm4_ce_crypt) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: nblocks + */ + PREPARE; + +.Lcrypt_loop_blk: + sub w3, w3, #8; + tbnz w3, #31, .Lcrypt_tail8; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2], #64; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b-v7.16b}, [x1], #64; + + cbz w3, .Lcrypt_end; + b .Lcrypt_loop_blk; + +.Lcrypt_tail8: + add w3, w3, #8; + cmp w3, #4; + blt .Lcrypt_tail4; + + sub w3, w3, #4; + + ld1 {v0.16b-v3.16b}, [x2], #64; + SM4_CRYPT_BLK4(v0, v1, v2, v3); + st1 {v0.16b-v3.16b}, [x1], #64; + + cbz w3, .Lcrypt_end; + +.Lcrypt_tail4: + sub w3, w3, #1; + + ld1 {v0.16b}, [x2], #16; + SM4_CRYPT_BLK(v0); + st1 {v0.16b}, [x1], #16; + + cbnz w3, .Lcrypt_tail4; + +.Lcrypt_end: + ret; +SYM_FUNC_END(sm4_ce_crypt) + +.align 3 +SYM_FUNC_START(sm4_ce_cbc_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcbc_enc_loop: + sub w4, w4, #1; + + ld1 {RTMP0.16b}, [x2], #16; + eor RIV.16b, RIV.16b, RTMP0.16b; + + SM4_CRYPT_BLK(RIV); + + st1 {RIV.16b}, [x1], #16; + + cbnz w4, .Lcbc_enc_loop; + + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cbc_enc) + +.align 3 +SYM_FUNC_START(sm4_ce_cbc_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcbc_loop_blk: + sub w4, w4, #8; + tbnz w4, #31, .Lcbc_tail8; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #64; + eor v0.16b, v0.16b, RIV.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v1.16b, v1.16b, RTMP0.16b; + eor v2.16b, v2.16b, RTMP1.16b; + eor v3.16b, v3.16b, RTMP2.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + eor v4.16b, v4.16b, RTMP3.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v5.16b, v5.16b, RTMP0.16b; + eor v6.16b, v6.16b, RTMP1.16b; + eor v7.16b, v7.16b, RTMP2.16b; + + mov RIV.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + cbz w4, .Lcbc_end; + b .Lcbc_loop_blk; + +.Lcbc_tail8: + add w4, w4, #8; + cmp w4, #4; + blt .Lcbc_tail4; + + sub w4, w4, #4; + + ld1 {v0.16b-v3.16b}, [x2]; + + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + eor v0.16b, v0.16b, RIV.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v1.16b, v1.16b, RTMP0.16b; + eor v2.16b, v2.16b, RTMP1.16b; + eor v3.16b, v3.16b, RTMP2.16b; + + mov RIV.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + cbz w4, .Lcbc_end; + +.Lcbc_tail4: + sub w4, w4, #1; + + ld1 {v0.16b}, [x2]; + + SM4_CRYPT_BLK(v0); + + eor v0.16b, v0.16b, RIV.16b; + ld1 {RIV.16b}, [x2], #16; + st1 {v0.16b}, [x1], #16; + + cbnz w4, .Lcbc_tail4; + +.Lcbc_end: + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cbc_dec) + +.align 3 +SYM_FUNC_START(sm4_ce_cfb_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcfb_enc_loop: + sub w4, w4, #1; + + SM4_CRYPT_BLK(RIV); + + ld1 {RTMP0.16b}, [x2], #16; + eor RIV.16b, RIV.16b, RTMP0.16b; + st1 {RIV.16b}, [x1], #16; + + cbnz w4, .Lcfb_enc_loop; + + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cfb_enc) + +.align 3 +SYM_FUNC_START(sm4_ce_cfb_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {v0.16b}, [x3]; + +.Lcfb_loop_blk: + sub w4, w4, #8; + tbnz w4, #31, .Lcfb_tail8; + + ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #48; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + mov v0.16b, RTMP3.16b; + + cbz w4, .Lcfb_end; + b .Lcfb_loop_blk; + +.Lcfb_tail8: + add w4, w4, #8; + cmp w4, #4; + blt .Lcfb_tail4; + + sub w4, w4, #4; + + ld1 {v1.16b, v2.16b, v3.16b}, [x2]; + + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + mov v0.16b, RTMP3.16b; + + cbz w4, .Lcfb_end; + +.Lcfb_tail4: + sub w4, w4, #1; + + SM4_CRYPT_BLK(v0); + + ld1 {RTMP0.16b}, [x2], #16; + eor v0.16b, v0.16b, RTMP0.16b; + st1 {v0.16b}, [x1], #16; + + mov v0.16b, RTMP0.16b; + + cbnz w4, .Lcfb_tail4; + +.Lcfb_end: + /* store new IV */ + st1 {v0.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cfb_dec) + +.align 3 +SYM_FUNC_START(sm4_ce_ctr_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ldp x7, x8, [x3]; + rev x7, x7; + rev x8, x8; + +.Lctr_loop_blk: + sub w4, w4, #8; + tbnz w4, #31, .Lctr_tail8; + +#define inc_le128(vctr) \ + mov vctr.d[1], x8; \ + mov vctr.d[0], x7; \ + adds x8, x8, #1; \ + adc x7, x7, xzr; \ + rev64 vctr.16b, vctr.16b; + + /* construct CTRs */ + inc_le128(v0); /* +0 */ + inc_le128(v1); /* +1 */ + inc_le128(v2); /* +2 */ + inc_le128(v3); /* +3 */ + inc_le128(v4); /* +4 */ + inc_le128(v5); /* +5 */ + inc_le128(v6); /* +6 */ + inc_le128(v7); /* +7 */ + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + cbz w4, .Lctr_end; + b .Lctr_loop_blk; + +.Lctr_tail8: + add w4, w4, #8; + cmp w4, #4; + blt .Lctr_tail4; + + sub w4, w4, #4; + + /* construct CTRs */ + inc_le128(v0); /* +0 */ + inc_le128(v1); /* +1 */ + inc_le128(v2); /* +2 */ + inc_le128(v3); /* +3 */ + + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + cbz w4, .Lctr_end; + +.Lctr_tail4: + sub w4, w4, #1; + + /* construct CTRs */ + inc_le128(v0); + + SM4_CRYPT_BLK(v0); + + ld1 {RTMP0.16b}, [x2], #16; + eor v0.16b, v0.16b, RTMP0.16b; + st1 {v0.16b}, [x1], #16; + + cbnz w4, .Lctr_tail4; + +.Lctr_end: + /* store new CTR */ + rev x7, x7; + rev x8, x8; + stp x7, x8, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_ctr_enc) diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 9c93cfc4841b..496d55c0d01a 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -1,82 +1,372 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, using ARMv8 Crypto Extensions + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/cpufeature.h> #include <asm/neon.h> #include <asm/simd.h> -#include <crypto/sm4.h> #include <crypto/internal/simd.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/types.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> -MODULE_ALIAS_CRYPTO("sm4"); -MODULE_ALIAS_CRYPTO("sm4-ce"); -MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); +#define BYTES2BLKS(nbytes) ((nbytes) >> 4) + +asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec, + const u32 *fk, const u32 *ck); +asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src); +asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src, + unsigned int nblks); +asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); + +static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + return 0; +} + +static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) +{ + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_crypt(rkey, dst, src, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_enc); +} + +static int sm4_ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_dec); +} + +static int sm4_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); -asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in); + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} -static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int key_len) +static int sm4_cbc_decrypt(struct skcipher_request *req) { - struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } - return sm4_expandkey(ctx, key, key_len); + return err; } -static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +static int sm4_cfb_encrypt(struct skcipher_request *req) { - const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; - if (!crypto_simd_usable()) { - sm4_crypt_block(ctx->rkey_enc, out, in); - } else { kernel_neon_begin(); - sm4_ce_do_crypt(ctx->rkey_enc, out, in); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cfb_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); } + + return err; } -static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +static int sm4_cfb_decrypt(struct skcipher_request *req) { - const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; - if (!crypto_simd_usable()) { - sm4_crypt_block(ctx->rkey_dec, out, in); - } else { kernel_neon_begin(); - sm4_ce_do_crypt(ctx->rkey_dec, out, in); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cfb_dec(ctx->rkey_enc, dst, src, walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); } + + return err; } -static struct crypto_alg sm4_ce_alg = { - .cra_name = "sm4", - .cra_driver_name = "sm4-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = SM4_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct sm4_ctx), - .cra_module = THIS_MODULE, - .cra_u.cipher = { - .cia_min_keysize = SM4_KEY_SIZE, - .cia_max_keysize = SM4_KEY_SIZE, - .cia_setkey = sm4_ce_setkey, - .cia_encrypt = sm4_ce_encrypt, - .cia_decrypt = sm4_ce_decrypt +static int sm4_ctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static struct skcipher_alg sm4_algs[] = { + { + .base = { + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ecb_encrypt, + .decrypt = sm4_ecb_decrypt, + }, { + .base = { + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = sm4_cbc_decrypt, + }, { + .base = { + .cra_name = "cfb(sm4)", + .cra_driver_name = "cfb-sm4-ce", + .cra_priority = 400, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = sm4_cfb_decrypt, + }, { + .base = { + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-ce", + .cra_priority = 400, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ctr_crypt, + .decrypt = sm4_ctr_crypt, } }; -static int __init sm4_ce_mod_init(void) +static int __init sm4_init(void) { - return crypto_register_alg(&sm4_ce_alg); + return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); } -static void __exit sm4_ce_mod_fini(void) +static void __exit sm4_exit(void) { - crypto_unregister_alg(&sm4_ce_alg); + crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); } -module_cpu_feature_match(SM4, sm4_ce_mod_init); -module_exit(sm4_ce_mod_fini); +module_cpu_feature_match(SM4, sm4_init); +module_exit(sm4_exit); + +MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 Crypto Extensions"); +MODULE_ALIAS_CRYPTO("sm4-ce"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("ecb(sm4)"); +MODULE_ALIAS_CRYPTO("cbc(sm4)"); +MODULE_ALIAS_CRYPTO("cfb(sm4)"); +MODULE_ALIAS_CRYPTO("ctr(sm4)"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/crypto/sm4-neon-core.S b/arch/arm64/crypto/sm4-neon-core.S new file mode 100644 index 000000000000..3d5256b354d2 --- /dev/null +++ b/arch/arm64/crypto/sm4-neon-core.S @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm for ARMv8 NEON + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* Register macros */ + +#define RTMP0 v8 +#define RTMP1 v9 +#define RTMP2 v10 +#define RTMP3 v11 + +#define RX0 v12 +#define RX1 v13 +#define RKEY v14 +#define RIV v15 + +/* Helper macros. */ + +#define PREPARE \ + adr_l x5, crypto_sm4_sbox; \ + ld1 {v16.16b-v19.16b}, [x5], #64; \ + ld1 {v20.16b-v23.16b}, [x5], #64; \ + ld1 {v24.16b-v27.16b}, [x5], #64; \ + ld1 {v28.16b-v31.16b}, [x5]; + +#define transpose_4x4(s0, s1, s2, s3) \ + zip1 RTMP0.4s, s0.4s, s1.4s; \ + zip1 RTMP1.4s, s2.4s, s3.4s; \ + zip2 RTMP2.4s, s0.4s, s1.4s; \ + zip2 RTMP3.4s, s2.4s, s3.4s; \ + zip1 s0.2d, RTMP0.2d, RTMP1.2d; \ + zip2 s1.2d, RTMP0.2d, RTMP1.2d; \ + zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ + zip2 s3.2d, RTMP2.2d, RTMP3.2d; + +#define rotate_clockwise_90(s0, s1, s2, s3) \ + zip1 RTMP0.4s, s1.4s, s0.4s; \ + zip2 RTMP1.4s, s1.4s, s0.4s; \ + zip1 RTMP2.4s, s3.4s, s2.4s; \ + zip2 RTMP3.4s, s3.4s, s2.4s; \ + zip1 s0.2d, RTMP2.2d, RTMP0.2d; \ + zip2 s1.2d, RTMP2.2d, RTMP0.2d; \ + zip1 s2.2d, RTMP3.2d, RTMP1.2d; \ + zip2 s3.2d, RTMP3.2d, RTMP1.2d; + +#define ROUND4(round, s0, s1, s2, s3) \ + dup RX0.4s, RKEY.s[round]; \ + /* rk ^ s1 ^ s2 ^ s3 */ \ + eor RTMP1.16b, s2.16b, s3.16b; \ + eor RX0.16b, RX0.16b, s1.16b; \ + eor RX0.16b, RX0.16b, RTMP1.16b; \ + \ + /* sbox, non-linear part */ \ + movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \ + tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \ + \ + /* linear part */ \ + shl RTMP1.4s, RTMP0.4s, #8; \ + shl RTMP2.4s, RTMP0.4s, #16; \ + shl RTMP3.4s, RTMP0.4s, #24; \ + sri RTMP1.4s, RTMP0.4s, #(32-8); \ + sri RTMP2.4s, RTMP0.4s, #(32-16); \ + sri RTMP3.4s, RTMP0.4s, #(32-24); \ + /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \ + eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \ + eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \ + /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \ + eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \ + shl RTMP2.4s, RTMP1.4s, 2; \ + sri RTMP2.4s, RTMP1.4s, #(32-2); \ + eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \ + /* s0 ^= RTMP3 */ \ + eor s0.16b, s0.16b, RTMP3.16b; + +#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + \ + transpose_4x4(b0, b1, b2, b3); \ + \ + mov x6, 8; \ +4: \ + ld1 {RKEY.4s}, [x0], #16; \ + subs x6, x6, #1; \ + \ + ROUND4(0, b0, b1, b2, b3); \ + ROUND4(1, b1, b2, b3, b0); \ + ROUND4(2, b2, b3, b0, b1); \ + ROUND4(3, b3, b0, b1, b2); \ + \ + bne 4b; \ + \ + rotate_clockwise_90(b0, b1, b2, b3); \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + \ + /* repoint to rkey */ \ + sub x0, x0, #128; + +#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \ + /* rk ^ s1 ^ s2 ^ s3 */ \ + dup RX0.4s, RKEY.s[round]; \ + eor RTMP0.16b, s2.16b, s3.16b; \ + mov RX1.16b, RX0.16b; \ + eor RTMP1.16b, t2.16b, t3.16b; \ + eor RX0.16b, RX0.16b, s1.16b; \ + eor RX1.16b, RX1.16b, t1.16b; \ + eor RX0.16b, RX0.16b, RTMP0.16b; \ + eor RX1.16b, RX1.16b, RTMP1.16b; \ + \ + /* sbox, non-linear part */ \ + movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \ + tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \ + tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + sub RX1.16b, RX1.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \ + tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + sub RX1.16b, RX1.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \ + tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + sub RX1.16b, RX1.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \ + tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \ + \ + /* linear part */ \ + shl RX0.4s, RTMP0.4s, #8; \ + shl RX1.4s, RTMP1.4s, #8; \ + shl RTMP2.4s, RTMP0.4s, #16; \ + shl RTMP3.4s, RTMP1.4s, #16; \ + sri RX0.4s, RTMP0.4s, #(32 - 8); \ + sri RX1.4s, RTMP1.4s, #(32 - 8); \ + sri RTMP2.4s, RTMP0.4s, #(32 - 16); \ + sri RTMP3.4s, RTMP1.4s, #(32 - 16); \ + /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \ + eor RX0.16b, RX0.16b, RTMP0.16b; \ + eor RX1.16b, RX1.16b, RTMP1.16b; \ + eor RX0.16b, RX0.16b, RTMP2.16b; \ + eor RX1.16b, RX1.16b, RTMP3.16b; \ + /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \ + shl RTMP2.4s, RTMP0.4s, #24; \ + shl RTMP3.4s, RTMP1.4s, #24; \ + sri RTMP2.4s, RTMP0.4s, #(32 - 24); \ + sri RTMP3.4s, RTMP1.4s, #(32 - 24); \ + eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \ + eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \ + shl RTMP2.4s, RX0.4s, #2; \ + shl RTMP3.4s, RX1.4s, #2; \ + sri RTMP2.4s, RX0.4s, #(32 - 2); \ + sri RTMP3.4s, RX1.4s, #(32 - 2); \ + eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \ + eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \ + /* s0/t0 ^= RTMP0/1 */ \ + eor s0.16b, s0.16b, RTMP0.16b; \ + eor t0.16b, t0.16b, RTMP1.16b; + +#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; \ + \ + transpose_4x4(b0, b1, b2, b3); \ + transpose_4x4(b4, b5, b6, b7); \ + \ + mov x6, 8; \ +8: \ + ld1 {RKEY.4s}, [x0], #16; \ + subs x6, x6, #1; \ + \ + ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \ + ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \ + ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \ + ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \ + \ + bne 8b; \ + \ + rotate_clockwise_90(b0, b1, b2, b3); \ + rotate_clockwise_90(b4, b5, b6, b7); \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; \ + \ + /* repoint to rkey */ \ + sub x0, x0, #128; + + +.align 3 +SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: num blocks (1..4) + */ + PREPARE; + + ld1 {v0.16b}, [x2], #16; + mov v1.16b, v0.16b; + mov v2.16b, v0.16b; + mov v3.16b, v0.16b; + cmp w3, #2; + blt .Lblk4_load_input_done; + ld1 {v1.16b}, [x2], #16; + beq .Lblk4_load_input_done; + ld1 {v2.16b}, [x2], #16; + cmp w3, #3; + beq .Lblk4_load_input_done; + ld1 {v3.16b}, [x2]; + +.Lblk4_load_input_done: + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + st1 {v0.16b}, [x1], #16; + cmp w3, #2; + blt .Lblk4_store_output_done; + st1 {v1.16b}, [x1], #16; + beq .Lblk4_store_output_done; + st1 {v2.16b}, [x1], #16; + cmp w3, #3; + beq .Lblk4_store_output_done; + st1 {v3.16b}, [x1]; + +.Lblk4_store_output_done: + ret; +SYM_FUNC_END(__sm4_neon_crypt_blk1_4) + +.align 3 +SYM_FUNC_START(sm4_neon_crypt_blk1_8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: num blocks (1..8) + */ + cmp w3, #5; + blt __sm4_neon_crypt_blk1_4; + + PREPARE; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b}, [x2], #16; + mov v5.16b, v4.16b; + mov v6.16b, v4.16b; + mov v7.16b, v4.16b; + beq .Lblk8_load_input_done; + ld1 {v5.16b}, [x2], #16; + cmp w3, #7; + blt .Lblk8_load_input_done; + ld1 {v6.16b}, [x2], #16; + beq .Lblk8_load_input_done; + ld1 {v7.16b}, [x2]; + +.Lblk8_load_input_done: + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + cmp w3, #6; + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b}, [x1], #16; + blt .Lblk8_store_output_done; + st1 {v5.16b}, [x1], #16; + beq .Lblk8_store_output_done; + st1 {v6.16b}, [x1], #16; + cmp w3, #7; + beq .Lblk8_store_output_done; + st1 {v7.16b}, [x1]; + +.Lblk8_store_output_done: + ret; +SYM_FUNC_END(sm4_neon_crypt_blk1_8) + +.align 3 +SYM_FUNC_START(sm4_neon_crypt_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: nblocks (multiples of 8) + */ + PREPARE; + +.Lcrypt_loop_blk: + subs w3, w3, #8; + bmi .Lcrypt_end; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2], #64; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b-v7.16b}, [x1], #64; + + b .Lcrypt_loop_blk; + +.Lcrypt_end: + ret; +SYM_FUNC_END(sm4_neon_crypt_blk8) + +.align 3 +SYM_FUNC_START(sm4_neon_cbc_dec_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks (multiples of 8) + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcbc_loop_blk: + subs w4, w4, #8; + bmi .Lcbc_end; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #64; + eor v0.16b, v0.16b, RIV.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v1.16b, v1.16b, RTMP0.16b; + eor v2.16b, v2.16b, RTMP1.16b; + eor v3.16b, v3.16b, RTMP2.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + eor v4.16b, v4.16b, RTMP3.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v5.16b, v5.16b, RTMP0.16b; + eor v6.16b, v6.16b, RTMP1.16b; + eor v7.16b, v7.16b, RTMP2.16b; + + mov RIV.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + b .Lcbc_loop_blk; + +.Lcbc_end: + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_neon_cbc_dec_blk8) + +.align 3 +SYM_FUNC_START(sm4_neon_cfb_dec_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks (multiples of 8) + */ + PREPARE; + + ld1 {v0.16b}, [x3]; + +.Lcfb_loop_blk: + subs w4, w4, #8; + bmi .Lcfb_end; + + ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #48; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + mov v0.16b, RTMP3.16b; + + b .Lcfb_loop_blk; + +.Lcfb_end: + /* store new IV */ + st1 {v0.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_neon_cfb_dec_blk8) + +.align 3 +SYM_FUNC_START(sm4_neon_ctr_enc_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nblocks (multiples of 8) + */ + PREPARE; + + ldp x7, x8, [x3]; + rev x7, x7; + rev x8, x8; + +.Lctr_loop_blk: + subs w4, w4, #8; + bmi .Lctr_end; + +#define inc_le128(vctr) \ + mov vctr.d[1], x8; \ + mov vctr.d[0], x7; \ + adds x8, x8, #1; \ + adc x7, x7, xzr; \ + rev64 vctr.16b, vctr.16b; + + /* construct CTRs */ + inc_le128(v0); /* +0 */ + inc_le128(v1); /* +1 */ + inc_le128(v2); /* +2 */ + inc_le128(v3); /* +3 */ + inc_le128(v4); /* +4 */ + inc_le128(v5); /* +5 */ + inc_le128(v6); /* +6 */ + inc_le128(v7); /* +7 */ + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + b .Lctr_loop_blk; + +.Lctr_end: + /* store new CTR */ + rev x7, x7; + rev x8, x8; + stp x7, x8, [x3]; + + ret; +SYM_FUNC_END(sm4_neon_ctr_enc_blk8) diff --git a/arch/arm64/crypto/sm4-neon-glue.c b/arch/arm64/crypto/sm4-neon-glue.c new file mode 100644 index 000000000000..03a6a6866a31 --- /dev/null +++ b/arch/arm64/crypto/sm4-neon-glue.c @@ -0,0 +1,442 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, using ARMv8 NEON + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> + +#define BYTES2BLKS(nbytes) ((nbytes) >> 4) +#define BYTES2BLK8(nbytes) (((nbytes) >> 4) & ~(8 - 1)) + +asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst, const u8 *src, + unsigned int nblks); +asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst, const u8 *src, + unsigned int nblks); +asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); + +static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) +{ + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_crypt_blk8(rkey, dst, src, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_neon_crypt_blk1_8(rkey, dst, src, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_enc); +} + +static int sm4_ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_dec); +} + +static int sm4_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); + sm4_crypt_block(ctx->rkey_enc, dst, dst); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_cbc_dec_blk8(ctx->rkey_dec, dst, src, + walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + u8 iv[SM4_BLOCK_SIZE]; + int i; + + sm4_neon_crypt_blk1_8(ctx->rkey_dec, keystream, + src, nblks); + + src += ((int)nblks - 2) * SM4_BLOCK_SIZE; + dst += (nblks - 1) * SM4_BLOCK_SIZE; + memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); + + for (i = nblks - 1; i > 0; i--) { + crypto_xor_cpy(dst, src, + &keystream[i * SM4_BLOCK_SIZE], + SM4_BLOCK_SIZE); + src -= SM4_BLOCK_SIZE; + dst -= SM4_BLOCK_SIZE; + } + crypto_xor_cpy(dst, walk.iv, + keystream, SM4_BLOCK_SIZE); + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cfb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + sm4_crypt_block(ctx->rkey_enc, keystream, iv); + crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cfb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_cfb_dec_blk8(ctx->rkey_enc, dst, src, + walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + if (nblks > 1) + memcpy(&keystream[SM4_BLOCK_SIZE], src, + (nblks - 1) * SM4_BLOCK_SIZE); + memcpy(walk.iv, src + (nblks - 1) * SM4_BLOCK_SIZE, + SM4_BLOCK_SIZE); + + sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, + keystream, nblks); + + crypto_xor_cpy(dst, src, keystream, + nblks * SM4_BLOCK_SIZE); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_ctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_ctr_enc_blk8(ctx->rkey_enc, dst, src, + walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + int i; + + for (i = 0; i < nblks; i++) { + memcpy(&keystream[i * SM4_BLOCK_SIZE], + walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + } + sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, + keystream, nblks); + + crypto_xor_cpy(dst, src, keystream, + nblks * SM4_BLOCK_SIZE); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static struct skcipher_alg sm4_algs[] = { + { + .base = { + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-neon", + .cra_priority = 200, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ecb_encrypt, + .decrypt = sm4_ecb_decrypt, + }, { + .base = { + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-neon", + .cra_priority = 200, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = sm4_cbc_decrypt, + }, { + .base = { + .cra_name = "cfb(sm4)", + .cra_driver_name = "cfb-sm4-neon", + .cra_priority = 200, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = sm4_cfb_decrypt, + }, { + .base = { + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-neon", + .cra_priority = 200, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ctr_crypt, + .decrypt = sm4_ctr_crypt, + } +}; + +static int __init sm4_init(void) +{ + return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); +} + +static void __exit sm4_exit(void) +{ + crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 NEON"); +MODULE_ALIAS_CRYPTO("sm4-neon"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("ecb(sm4)"); +MODULE_ALIAS_CRYPTO("cbc(sm4)"); +MODULE_ALIAS_CRYPTO("cfb(sm4)"); +MODULE_ALIAS_CRYPTO("ctr(sm4)"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_LICENSE("GPL v2"); |