diff options
author | Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 2011-12-20 12:58:06 +0200 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2012-01-13 16:38:40 +1100 |
commit | 847cb7ef565d31484f426677e0bea081bfd2acd9 (patch) | |
tree | 7325f4ce5961e0d51ea4707119aeba80622991c3 /arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | |
parent | 4c58464b8034cef4317593bf4ccbfc19d5bb3a77 (diff) | |
download | lwn-847cb7ef565d31484f426677e0bea081bfd2acd9.tar.gz lwn-847cb7ef565d31484f426677e0bea081bfd2acd9.zip |
crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions
Matrix transpose macro in serpent-sse2 uses mix of SSE2 integer and SSE floating
point instructions, which might cause performance penality on some CPUs.
This patch replaces transpose_4x4 macro with version that uses only SSE2
integer instructions.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/serpent-sse2-x86_64-asm_64.S')
-rw-r--r-- | arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 29 |
1 files changed, 13 insertions, 16 deletions
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index 7f24a1540821..3ee1ff04d3e9 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -585,23 +585,20 @@ get_key(i, 1, RK1); \ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ -#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ - movdqa x2, t3; \ - movdqa x0, t1; \ - unpcklps x3, t3; \ +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ movdqa x0, t2; \ - unpcklps x1, t1; \ - unpckhps x1, t2; \ - movdqa t3, x1; \ - unpckhps x3, x2; \ - movdqa t1, x0; \ - movhlps t1, x1; \ - movdqa t2, t1; \ - movlhps t3, x0; \ - movlhps x2, t1; \ - movhlps t2, x2; \ - movdqa x2, x3; \ - movdqa t1, x2; + punpckldq x1, x0; \ + punpckhdq x1, t2; \ + movdqa x2, t1; \ + punpckhdq x3, x2; \ + punpckldq x3, t1; \ + movdqa x0, x1; \ + punpcklqdq t1, x0; \ + punpckhqdq t1, x1; \ + movdqa t2, x3; \ + punpcklqdq x2, t2; \ + punpckhqdq x2, x3; \ + movdqa t2, x2; #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ movdqu (0*4*4)(in), x0; \ |