author     Niels Möller <nisse@lysator.liu.se>  2011-06-20 13:01:41 +0200
committer  Niels Möller <nisse@lysator.liu.se>  2011-06-20 13:01:41 +0200
commit     fb49f89ffc9d3f5ae66e7c4bb9853faef2fcd2e5 (patch)
tree       8b6a4ae741a3ce684d940f2a347d395c013945ad /x86_64/serpent-encrypt.asm
parent     0134b52fb65700dde6245d806297a43b33b05198 (diff)
download   nettle-fb49f89ffc9d3f5ae66e7c4bb9853faef2fcd2e5.tar.gz

Added an SSE2 loop, doing four blocks at a time in parallel.

Rev: nettle/x86_64/serpent-encrypt.asm:1.3
Diffstat (limited to 'x86_64/serpent-encrypt.asm')
 x86_64/serpent-encrypt.asm | 429 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 421 insertions(+), 8 deletions(-)
diff --git a/x86_64/serpent-encrypt.asm b/x86_64/serpent-encrypt.asm
index 9b7a25b5..f342685f 100644
--- a/x86_64/serpent-encrypt.asm
+++ b/x86_64/serpent-encrypt.asm
@@ -41,6 +41,12 @@ define(<Y1>, <%xmm5>)
define(<Y2>, <%xmm6>)
define(<Y3>, <%xmm7>)
+define(<MINUS1>, <%xmm8>)
+define(<T0>, <%xmm9>)
+define(<T1>, <%xmm10>)
+define(<T2>, <%xmm11>)
+define(<T3>, <%xmm12>)
+
C Arguments
define(<CTX>, <%rdi>)
define(<N>, <%rsi>)
@@ -48,9 +54,9 @@ define(<DST>, <%rdx>)
define(<SRC>, <%rcx>)
define(<CNT>, <%r13>)
-define(<TMP>, <%r14d>) C 32-bit temporary
+define(<TMP32>, <%r14d>)
-C Sbox macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
+C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
define(<SBOX0>, <
mov $2, $8 C y3 = x1 ^ x2
@@ -160,6 +166,7 @@ define(<SBOX3>, <
mov $1, $5
xor $2, $5
>)
+
define(<SBOX4>, <
mov $1, $8
or $2, $8
@@ -272,22 +279,351 @@ define(<LT>, <
rol <$>3, $3
xor $1, $2
xor $3, $2
- mov $1, TMP
- shl <$>3, TMP
+ mov $1, TMP32
+ shl <$>3, TMP32
xor $3, $4
- xor TMP, $4
+ xor TMP32, $4
rol $2
rol <$>7, $4
xor $2, $1
xor $4, $1
- mov $2, TMP
- shl <$>7, TMP
+ mov $2, TMP32
+ shl <$>7, TMP32
xor $4, $3
- xor TMP, $3
+ xor TMP32, $3
rol <$>5, $1
rol <$>22, $3
>)
+C Parallel operation on four blocks at a time.
+
+C There is no pnot instruction, so we complement by XORing with
+C MINUS1, a register holding all ones (set up with pcmpeqd in the
+C function prologue).
+
+define(<PNOT>, <
+ pxor MINUS1, $1
+>)
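
In intrinsics terms, the trick is a minimal sketch like this (the pnot
helper name is hypothetical; only SSE2's _mm_cmpeq_epi32 and
_mm_xor_si128 are used):

#include <emmintrin.h>

/* Comparing a register with itself yields all ones in every 32-bit
   lane, like "pcmpeqd MINUS1, MINUS1", and XOR with all ones is a
   bitwise complement. */
static inline __m128i
pnot(__m128i x)
{
  __m128i minus1 = _mm_cmpeq_epi32(x, x);   /* 0xffffffff per lane */
  return _mm_xor_si128(x, minus1);          /* ~x */
}
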
+
+define(<WSBOX0>, <
+ movdqa $2, $8 C y3 = x1 ^ x2
+ pxor $3, $8
+ movdqa $1, $5 C y0 = x0 | x3
+ por $4, $5
+ movdqa $1, $6 C y1 = x0 ^ x1
+ pxor $2, $6
+ pxor $5, $8 C y3 ^= y0
+ movdqa $3, $7 C y2 = x2 | y3
+ por $8, $7
+ pxor $4, $1 C x0 ^= x3
+ pand $4, $7 C y2 &= x3
+ pxor $3, $4 C x3 ^= x2
+ por $2, $3 C x2 |= x1
+ movdqa $6, $5 C y0 = y1 & x2
+ pand $3, $5
+ pxor $5, $7 C y2 ^= y0
+ pand $7, $5 C y0 &= y2
+ pxor $3, $5 C y0 ^= x2
+ pand $1, $2 C x1 &= x0
+ pxor $1, $5 C y0 ^= x0
+ PNOT($5) C y0 = ~y0
+ movdqa $5, $6 C y1 = y0 ^ x1
+ pxor $2, $6
+ pxor $4, $6 C y1 ^= x3
+>)
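
As an aid to reading these register-transfer macros, a C
transliteration of WSBOX0 follows (a sketch, not nettle code; the
wsbox0 name and pointer-based signature are invented). AT&T order puts
the destination last, so e.g. "pxor $3, $8" reads y3 ^= x2:

#include <emmintrin.h>

/* One-to-one mirror of the WSBOX0 instruction sequence. Inputs x0..x3
   are destroyed, outputs land in y0..y3, as in the macro. */
static void
wsbox0(__m128i x0, __m128i x1, __m128i x2, __m128i x3,
       __m128i *y0, __m128i *y1, __m128i *y2, __m128i *y3)
{
  __m128i ones = _mm_cmpeq_epi32(x0, x0);  /* MINUS1 */

  *y3 = _mm_xor_si128(x1, x2);             /* y3 = x1 ^ x2 */
  *y0 = _mm_or_si128(x0, x3);              /* y0 = x0 | x3 */
  *y1 = _mm_xor_si128(x0, x1);             /* y1 = x0 ^ x1 */
  *y3 = _mm_xor_si128(*y3, *y0);           /* y3 ^= y0 */
  *y2 = _mm_or_si128(x2, *y3);             /* y2 = x2 | y3 */
  x0 = _mm_xor_si128(x0, x3);              /* x0 ^= x3 */
  *y2 = _mm_and_si128(*y2, x3);            /* y2 &= x3 */
  x3 = _mm_xor_si128(x3, x2);              /* x3 ^= x2 */
  x2 = _mm_or_si128(x2, x1);               /* x2 |= x1 */
  *y0 = _mm_and_si128(*y1, x2);            /* y0 = y1 & x2 */
  *y2 = _mm_xor_si128(*y2, *y0);           /* y2 ^= y0 */
  *y0 = _mm_and_si128(*y0, *y2);           /* y0 &= y2 */
  *y0 = _mm_xor_si128(*y0, x2);            /* y0 ^= x2 */
  x1 = _mm_and_si128(x1, x0);              /* x1 &= x0 */
  *y0 = _mm_xor_si128(*y0, x0);            /* y0 ^= x0 */
  *y0 = _mm_xor_si128(*y0, ones);          /* PNOT: y0 = ~y0 */
  *y1 = _mm_xor_si128(*y0, x1);            /* y1 = y0 ^ x1 */
  *y1 = _mm_xor_si128(*y1, x3);            /* y1 ^= x3 */
}
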
+
+define(<WSBOX1>, <
+ movdqa $1, $6 C y1 = x0 | x3
+ por $4, $6
+ movdqa $3, $7 C y2 = x2 ^ x3
+ pxor $4, $7
+ movdqa $2, $5 C y0 = ~x1
+ PNOT($5)
+ movdqa $1, $8 C y3 = x0 ^ x2
+ pxor $3, $8
+ por $1, $5 C y0 |= x0
+ pand $4, $8 C y3 &= x3
+ movdqa $6, $1 C x0 = y1 & y2
+ pand $7, $1
+ por $2, $8 C y3 |= x1
+ pxor $5, $7 C y2 ^= y0
+ pxor $1, $8 C y3 ^= x0
+ movdqa $6, $1 C x0 = y1 ^ y3
+ pxor $8, $1
+ pxor $7, $1 C x0 ^= y2
+ movdqa $2, $6 C y1 = x1 & x3
+ pand $4, $6
+ pxor $1, $6 C y1 ^= x0
+ movdqa $6, $4 C x3 = y1 | y3
+ por $8, $4
+ PNOT($8) C y3 = ~y3
+ pand $4, $5 C y0 &= x3
+ pxor $3, $5 C y0 ^= x2
+>)
+
+define(<WSBOX2>, <
+ movdqa $1, $7 C y2 = x1 | x2
+ por $3, $7
+ movdqa $1, $6
+ pxor $2, $6
+ movdqa $4, $8
+ pxor $7, $8
+ movdqa $6, $5
+ pxor $8, $5
+ por $1, $4
+ pxor $5, $3
+ movdqa $2, $1
+ pxor $3, $1
+ por $2, $3
+ pand $7, $1
+ pxor $3, $8
+ por $8, $6
+ pxor $1, $6
+ movdqa $8, $7
+ pxor $6, $7
+ pxor $2, $7
+ PNOT($8)
+ pxor $4, $7
+>)
+
+define(<WSBOX3>, <
+ movdqa $1, $6
+ pxor $3, $6
+ movdqa $1, $5
+ por $4, $5
+ movdqa $1, $8
+ pand $4, $8
+ pand $5, $6
+ por $2, $8
+ movdqa $1, $7
+ pand $2, $7
+ por $3, $7
+ movdqa $4, $3
+ pxor $6, $3
+ pxor $8, $6
+ por $3, $1
+ pxor $2, $3
+ pand $4, $8
+ pxor $8, $5
+ movdqa $7, $8
+ pxor $3, $8
+ pxor $5, $7
+ por $8, $4
+ pand $4, $2
+ movdqa $1, $5
+ pxor $2, $5
+>)
+
+define(<WSBOX4>, <
+ movdqa $1, $8
+ por $2, $8
+ movdqa $2, $7
+ por $3, $7
+ pxor $1, $7
+ pand $4, $8
+ movdqa $2, $5
+ pxor $4, $5
+ por $7, $4
+ pand $4, $1
+ pand $3, $2
+ pxor $8, $3
+ pxor $7, $8
+ por $2, $7
+ movdqa $8, $6
+ pand $5, $6
+ pxor $6, $7
+ pxor $5, $6
+ por $2, $6
+ pxor $1, $6
+ pand $4, $5
+ pxor $3, $5
+ PNOT($5)
+>)
+
+define(<WSBOX5>, <
+ movdqa $2, $5
+ por $4, $5
+ pxor $3, $5
+ movdqa $2, $3
+ pxor $4, $3
+ movdqa $1, $7
+ pxor $3, $7
+ pand $3, $1
+ pxor $1, $5
+ movdqa $2, $8
+ por $7, $8
+ por $5, $2
+ PNOT($5)
+ por $5, $1
+ pxor $3, $8
+ pxor $1, $8
+ movdqa $4, $6
+ por $5, $6
+ pxor $6, $4
+ pxor $7, $6
+ por $4, $7
+ pxor $2, $7
+>)
+
+define(<WSBOX6>, <
+ movdqa $1, $5
+ pxor $4, $5
+ movdqa $1, $6
+ pand $4, $6
+ movdqa $1, $7
+ por $3, $7
+ por $2, $4
+ pxor $3, $4
+ pxor $2, $1
+ movdqa $2, $8
+ por $3, $8
+ pxor $2, $3
+ pand $5, $8
+ pxor $3, $6
+ PNOT($6)
+ pand $6, $5
+ pand $6, $2
+ pxor $8, $2
+ pxor $4, $8
+ pxor $2, $7
+ PNOT($7)
+ pxor $7, $5
+ pxor $1, $5
+>)
+
+define(<WSBOX7>, <
+ movdqa $1, $5
+ pand $3, $5
+ movdqa $2, $8
+ por $5, $8 C t04
+ pxor $3, $8
+ movdqa $4, $6
+ pandn $1, $6 C t02 implicit
+ pxor $6, $8
+ movdqa $3, $6
+ por $8, $6
+ pxor $1, $6
+ movdqa $1, $7
+ pand $2, $7
+ pxor $7, $3
+ por $4, $7
+ pxor $7, $6
+ movdqa $2, $7
+ por $5, $7 C t04
+ pand $8, $7
+ pxor $6, $2
+ por $2, $7
+ pxor $1, $7
+ pxor $6, $5
+ PNOT($4) C t02
+ por $4, $5
+ pxor $3, $5
+>)
+
+C WROL(count, w): rotate each 32-bit word of w left by count bits.
+C SSE2 has no packed rotate, so combine left and right shifts with por.
+define(<WROL>, <
+ movdqa $2, T0
+ pslld <$>$1, $2
+ psrld <$>eval(32 - $1), T0
+ por T0, $2
+>)
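
Expressed with intrinsics, WROL amounts to this sketch (the wrol
helper name is ours; the count must be a compile-time constant in
1..31):

#include <emmintrin.h>

/* Rotate every 32-bit lane of w left by count, built from two shifts
   and an OR. */
static inline __m128i
wrol(__m128i w, const int count)
{
  return _mm_or_si128(_mm_slli_epi32(w, count),
                      _mm_srli_epi32(w, 32 - count));
}
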
+
+C WLT(x0, x1, x2, x3): the Serpent linear transformation, applied to
+C all four blocks in parallel.
+define(<WLT>, <
+ WROL(13, $1)
+ WROL(3, $3)
+ pxor $1, $2
+ pxor $3, $2
+ movdqa $1, T0
+ pslld <$>3, T0
+ pxor $3, $4
+ pxor T0, $4
+ WROL(1, $2)
+ WROL(7, $4)
+ pxor $2, $1
+ pxor $4, $1
+ movdqa $2, T0
+ pslld <$>7, T0
+ pxor $4, $3
+ pxor T0, $3
+ WROL(5, $1)
+ WROL(22, $3)
+>)
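
For reference, the same transformation in plain scalar C, one block at
a time (the helper names are ours; the rotation and shift counts are
copied from the macro):

#include <stdint.h>

static inline uint32_t
rol32(uint32_t x, unsigned n)
{
  return (x << n) | (x >> (32 - n));
}

/* Scalar sketch of the Serpent linear transformation above. */
static void
serpent_lt(uint32_t *x0, uint32_t *x1, uint32_t *x2, uint32_t *x3)
{
  *x0 = rol32(*x0, 13);
  *x2 = rol32(*x2, 3);
  *x1 ^= *x0 ^ *x2;
  *x3 ^= *x2 ^ (*x0 << 3);
  *x1 = rol32(*x1, 1);
  *x3 = rol32(*x3, 7);
  *x0 ^= *x1 ^ *x3;
  *x2 ^= *x3 ^ (*x1 << 7);
  *x0 = rol32(*x0, 5);
  *x2 = rol32(*x2, 22);
}
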
+
+C Note: Diagrams use little-endian representation, with least
+C significant word to the left.
+
+C Transpose values from:
+C +----+----+----+----+
+C x0: | a0 | a1 | a2 | a3 |
+C x1: | b0 | b1 | b2 | b3 |
+C x2: | c0 | c1 | c2 | c3 |
+C x3: | d0 | d1 | d2 | d3 |
+C +----+----+----+----+
+C To:
+C +----+----+----+----+
+C x0: | a0 | b0 | c0 | d0 |
+C x1: | a1 | b1 | c1 | d1 |
+C x2: | a2 | b2 | c2 | d2 |
+C x3: | a3 | b3 | c3 | d3 |
+C +----+----+----+----+
+
+define(<WTRANSPOSE>, <
+ movdqa $1, T0
+ punpcklqdq $3, T0 C |a0 a1 c0 c1|
+ punpckhqdq $3, $1 C |a2 a3 c2 c3|
+ pshufd <$>0xd8, T0, T0 C |a0 c0 a1 c1|
+ pshufd <$>0xd8, $1, T1 C |a2 c2 a3 c3|
+
+ movdqa $2, T2
+	punpcklqdq $4, T2	C |b0 b1 d0 d1|
+ punpckhqdq $4, $2 C |b2 b3 d2 d3|
+ pshufd <$>0xd8, T2, T2 C |b0 d0 b1 d1|
+ pshufd <$>0xd8, $2, T3 C |b2 d2 b3 d3|
+
+ movdqa T0, $1
+ punpckldq T2, $1 C |a0 b0 c0 d0|
+ movdqa T0, $2
+ punpckhdq T2, $2 C |a1 b1 c1 d1|
+
+ movdqa T1, $3
+ punpckldq T3, $3 C |a2 b2 c2 d2|
+ movdqa T1, $4
+ punpckhdq T3, $4 C |a3 b3 c3 d3|
+>)
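
The equivalent transpose with SSE2 unpack intrinsics, for comparison
(a sketch with a different instruction selection than the macro, but
producing the same layout):

#include <emmintrin.h>

/* 4x4 transpose of 32-bit words across four registers. */
static void
transpose4(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3)
{
  __m128i t0 = _mm_unpacklo_epi32(*x0, *x1);  /* |a0 b0 a1 b1| */
  __m128i t1 = _mm_unpackhi_epi32(*x0, *x1);  /* |a2 b2 a3 b3| */
  __m128i t2 = _mm_unpacklo_epi32(*x2, *x3);  /* |c0 d0 c1 d1| */
  __m128i t3 = _mm_unpackhi_epi32(*x2, *x3);  /* |c2 d2 c3 d3| */

  *x0 = _mm_unpacklo_epi64(t0, t2);           /* |a0 b0 c0 d0| */
  *x1 = _mm_unpackhi_epi64(t0, t2);           /* |a1 b1 c1 d1| */
  *x2 = _mm_unpacklo_epi64(t1, t3);           /* |a2 b2 c2 d2| */
  *x3 = _mm_unpackhi_epi64(t1, t3);           /* |a3 b3 c3 d3| */
}
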
+
+C Copy subkeys, from:
+C
+C +----+----+----+----+
+C k0: | s3 | s2 | s1 | s0 |
+C +----+----+----+----+
+C To:
+C +----+----+----+----+
+C k0: | s0 | s0 | s0 | s0 |
+C k1: | s1 | s1 | s1 | s1 |
+C k2: | s2 | s2 | s2 | s2 |
+C k3: | s3 | s3 | s3 | s3 |
+C +----+----+----+----+
+
+dnl define(<WCOPY>, <
+dnl pshufd $55, $1, $2
+dnl pshufd $aa, $1, $3
+dnl pshufd $ff, $1, $4
+dnl pshufd $00, $1, $1
+dnl >)
+
+C FIXME: Arrange 16-byte alignment, so we can use movaps?
+define(<WKEYXOR>, <
+ movups $1(CTX, CNT), T0
+ pshufd <$>0x55, T0, T1
+ pshufd <$>0xaa, T0, T2
+ pxor T1, $3
+ pxor T2, $4
+ pshufd <$>0xff, T0, T1
+ pshufd <$>0x00, T0, T0
+ pxor T1, $5
+ pxor T0, $2
+>)
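
In C, the pattern is roughly this sketch (the wkeyxor name and flat
uint32_t pointer are assumptions; the real subkey location is given by
the (CTX, CNT) addressing in the macro):

#include <emmintrin.h>
#include <stdint.h>

/* Load one 128-bit subkey holding the words s0..s3, broadcast each
   word to all four lanes with pshufd, and XOR into the state. */
static void
wkeyxor(const uint32_t *subkey,
        __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3)
{
  __m128i k = _mm_loadu_si128((const __m128i *) subkey);  /* movups */
  *x0 = _mm_xor_si128(*x0, _mm_shuffle_epi32(k, 0x00));   /* |s0 s0 s0 s0| */
  *x1 = _mm_xor_si128(*x1, _mm_shuffle_epi32(k, 0x55));   /* |s1 s1 s1 s1| */
  *x2 = _mm_xor_si128(*x2, _mm_shuffle_epi32(k, 0xaa));   /* |s2 s2 s2 s2| */
  *x3 = _mm_xor_si128(*x3, _mm_shuffle_epi32(k, 0xff));   /* |s3 s3 s3 s3| */
}
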
+
.file "aes-serpent-encrypt.asm"
C serpent_encrypt(struct serpent_context *ctx,
@@ -311,9 +647,86 @@ PROLOGUE(nettle_serpent_encrypt)
C Point at the final subkey.
lea 512(CTX), CTX
+ cmp $-64, N
+ ja .Lwide_end
+
+ pcmpeqd MINUS1, MINUS1
+
+.Lwblock_loop:
+ movups (SRC, N), X0
+ movups 16(SRC, N), X1
+ movups 32(SRC, N), X2
+ movups 48(SRC, N), X3
+
+ WTRANSPOSE(X0, X1, X2, X3)
+
+ mov $-512, CNT
+ jmp .Lwround_start
+
+ ALIGN(4)
+.Lwround_loop:
+ WLT(X0,X1,X2,X3)
+.Lwround_start:
+ WKEYXOR(, X0,X1,X2,X3)
+ WSBOX0(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+ WLT(Y0,Y1,Y2,Y3)
+
+ WKEYXOR(16, Y0,Y1,Y2,Y3)
+ WSBOX1(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+ WLT(X0,X1,X2,X3)
+
+ WKEYXOR(32, X0,X1,X2,X3)
+ WSBOX2(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+ WLT(Y0,Y1,Y2,Y3)
+
+ WKEYXOR(48, Y0,Y1,Y2,Y3)
+ WSBOX3(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+ WLT(X0,X1,X2,X3)
+
+ WKEYXOR(64, X0,X1,X2,X3)
+ WSBOX4(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+ WLT(Y0,Y1,Y2,Y3)
+
+ WKEYXOR(80, Y0,Y1,Y2,Y3)
+ WSBOX5(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+ WLT(X0,X1,X2,X3)
+
+ WKEYXOR(96, X0,X1,X2,X3)
+ WSBOX6(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
+ WLT(Y0,Y1,Y2,Y3)
+
+ WKEYXOR(112, Y0,Y1,Y2,Y3)
+ WSBOX7(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
+ add $128, CNT
+ jnz .Lwround_loop
+
+	C FIXME: CNT is known to be zero here, so no index register is needed.
+ WKEYXOR(, X0,X1,X2,X3)
+
+ WTRANSPOSE(X0,X1,X2,X3)
+
+ movups X0, (DST, N)
+ movups X1, 16(DST, N)
+ movups X2, 32(DST, N)
+ movups X3, 48(DST, N)
+
+ C FIXME: Adjust N, so we can use just jnc without an extra cmp.
+ add $64, N
+ jz .Lend
+
+ cmp $-64, N
+ jbe .Lwblock_loop
+
+.Lwide_end:
+
+
C The single-block loop here is slightly slower than the double-block
C loop in serpent-encrypt.c.
C FIXME: Should use non-sse2 code only if we have a single block left.
C With two or three blocks, it should be better to do them in
C parallel.
+
.Lblock_loop:
movl (SRC, N), x0
movl 4(SRC, N), x1