author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2023-04-16 21:45:13 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2023-04-23 21:19:09 +0300
commit    ad4ee8d52f7199ba8bdee767044337060529069f (patch)
tree      80321ad5be8f2fb47452c4e748f02829235ec6cd
parent    3e17e819a6a4d505828cf93fc2c258a753f1d38c (diff)
download  libgcrypt-ad4ee8d52f7199ba8bdee767044337060529069f.tar.gz
mpi/amd64: optimize add_n and sub_n
* mpi/amd64/mpih-add1.S (_gcry_mpih_add_n): New implementation with
4x unrolled fast-path loop.
* mpi/amd64/mpih-sub1.S (_gcry_mpih_sub_n): Likewise.
--

Benchmark on AMD Ryzen 9 7900X:

Before:
     |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 add |     0.035 ns/B     27559 MiB/s     0.163 c/B      4700
 sub |     0.034 ns/B     28332 MiB/s     0.158 c/B      4700

After (~26% faster):
     |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 add |     0.027 ns/B     35271 MiB/s     0.127 c/B      4700
 sub |     0.027 ns/B     35206 MiB/s     0.127 c/B      4700

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r--  mpi/amd64/mpih-add1.S | 81
-rw-r--r--  mpi/amd64/mpih-sub1.S | 80
2 files changed, 136 insertions(+), 25 deletions(-)
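
For orientation before the assembly, here is a minimal portable C sketch of the technique both patches implement: the size % 4 leftover limbs are handled first, then the remaining multiple-of-four limbs go through a 4x unrolled add-with-carry loop. The helper names (add_limb, add_n_sketch) and the uint64_t limb type are illustrative assumptions, not libgcrypt API; the real code keeps the carry in the CPU flags rather than in a variable.

    #include <stdint.h>
    #include <stddef.h>

    /* Add one limb pair plus an incoming carry; return the outgoing carry. */
    static uint64_t add_limb(uint64_t *rp, uint64_t a, uint64_t b, uint64_t cy)
    {
        uint64_t s = a + b;
        uint64_t c1 = (s < a);      /* carry out of a + b */
        s += cy;
        uint64_t c2 = (s < cy);     /* carry out of + cy (at most one fires) */
        *rp = s;
        return c1 | c2;
    }

    static uint64_t add_n_sketch(uint64_t *rp, const uint64_t *s1,
                                 const uint64_t *s2, size_t n)
    {
        uint64_t cy = 0;
        size_t rem = n % 4;         /* the .Lprehandle{1,2,3} cases below */

        for (size_t i = 0; i < rem; i++)
            cy = add_limb(&rp[i], s1[i], s2[i], cy);
        rp += rem; s1 += rem; s2 += rem; n -= rem;

        while (n) {                 /* the 4x unrolled .Loop fast path */
            cy = add_limb(&rp[0], s1[0], s2[0], cy);
            cy = add_limb(&rp[1], s1[1], s2[1], cy);
            cy = add_limb(&rp[2], s1[2], s2[2], cy);
            cy = add_limb(&rp[3], s1[3], s2[3], cy);
            rp += 4; s1 += 4; s2 += 4; n -= 4;
        }
        return cy;                  /* final carry, cf. adcl %eax, %eax */
    }

The sub_n variant is the mirror image: subq/sbbq in place of addq/adcq, with the return value being the final borrow.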
diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S
index 833a43cb..f2e86237 100644
--- a/mpi/amd64/mpih-add1.S
+++ b/mpi/amd64/mpih-add1.S
@@ -3,6 +3,7 @@
*
* Copyright (C) 1992, 1994, 1995, 1998,
* 2001, 2002, 2006 Free Software Foundation, Inc.
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -39,26 +40,80 @@
* mpi_ptr_t s2_ptr, rdx
* mpi_size_t size) rcx
*/
-
TEXT
ALIGN(4)
.globl C_SYMBOL_NAME(_gcry_mpih_add_n)
C_SYMBOL_NAME(_gcry_mpih_add_n:)
FUNC_ENTRY()
- leaq (%rsi,%rcx,8), %rsi
- leaq (%rdi,%rcx,8), %rdi
- leaq (%rdx,%rcx,8), %rdx
- negq %rcx
- xorl %eax, %eax /* clear cy */
+ movl %ecx, %r9d
+ andl $3, %r9d
+ je .Lprehandle0
+ cmpl $2, %r9d
+ jb .Lprehandle1
+ je .Lprehandle2
+
+#define FIRST_ADD() \
+ movq (%rsi), %rax; \
+ addq (%rdx), %rax; \
+ movq %rax, (%rdi)
+
+#define NEXT_ADD(offset) \
+ movq offset(%rsi), %rax; \
+ adcq offset(%rdx), %rax; \
+ movq %rax, offset(%rdi)
+
+.Lprehandle3:
+ leaq -2(%rcx), %rcx
+ FIRST_ADD();
+ NEXT_ADD(8);
+ NEXT_ADD(16);
+ decq %rcx
+ je .Lend
+ leaq 24(%rsi), %rsi
+ leaq 24(%rdx), %rdx
+ leaq 24(%rdi), %rdi
+ jmp .Loop
+
+ ALIGN(3)
+.Lprehandle2:
+ leaq -1(%rcx), %rcx
+ FIRST_ADD();
+ NEXT_ADD(8);
+ decq %rcx
+ je .Lend
+ leaq 16(%rsi), %rsi
+ leaq 16(%rdx), %rdx
+ leaq 16(%rdi), %rdi
+ jmp .Loop
+
+ ALIGN(3)
+.Lprehandle1:
+ FIRST_ADD();
+ decq %rcx
+ je .Lend
+ leaq 8(%rsi), %rsi
+ leaq 8(%rdx), %rdx
+ leaq 8(%rdi), %rdi
+ jmp .Loop
+
+ ALIGN(3)
+.Lprehandle0:
+ clc /* clear cy */
ALIGN(4) /* minimal alignment for claimed speed */
-.Loop: movq (%rsi,%rcx,8), %rax
- movq (%rdx,%rcx,8), %r10
- adcq %r10, %rax
- movq %rax, (%rdi,%rcx,8)
- incq %rcx
+.Loop: leaq -3(%rcx), %rcx
+ NEXT_ADD(0);
+ NEXT_ADD(8);
+ NEXT_ADD(16);
+ NEXT_ADD(24);
+ leaq 32(%rsi), %rsi
+ leaq 32(%rdx), %rdx
+ leaq 32(%rdi), %rdi
+ decq %rcx
jne .Loop
- movq %rcx, %rax /* zero %rax */
- adcq %rax, %rax
+ ALIGN(2)
+.Lend:
+ movl $0, %eax /* zero %rax */
+ adcl %eax, %eax
FUNC_EXIT()
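
Two details above are easy to miss. First, every pointer and counter update inside the carry chain uses leaq (sets no flags) or decq (modifies ZF but leaves CF untouched), so the carry from one NEXT_ADD block survives into the next loop iteration; jne consumes the ZF set by decq. Second, the branch chain on %r9d is simply a dispatch on size % 4, as in this hedged C rendering (label names taken from the code above, the helper itself is purely illustrative):

    /* Hypothetical helper mapping size % 4 to the entry label chosen
     * by the je/jb/je chain at the top of _gcry_mpih_add_n. */
    static const char *entry_for(unsigned long size)
    {
        switch (size & 3) {
        case 0:  return ".Lprehandle0"; /* je  .Lprehandle0            */
        case 1:  return ".Lprehandle1"; /* jb  .Lprehandle1 (r9d < 2)  */
        case 2:  return ".Lprehandle2"; /* je  .Lprehandle2 (r9d == 2) */
        default: return ".Lprehandle3"; /* falls through in the asm    */
        }
    }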
diff --git a/mpi/amd64/mpih-sub1.S b/mpi/amd64/mpih-sub1.S
index 8c61cb20..32799c86 100644
--- a/mpi/amd64/mpih-sub1.S
+++ b/mpi/amd64/mpih-sub1.S
@@ -3,6 +3,7 @@
*
* Copyright (C) 1992, 1994, 1995, 1998,
* 2001, 2002, 2006 Free Software Foundation, Inc.
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -44,20 +45,75 @@
.globl C_SYMBOL_NAME(_gcry_mpih_sub_n)
C_SYMBOL_NAME(_gcry_mpih_sub_n:)
FUNC_ENTRY()
- leaq (%rsi,%rcx,8), %rsi
- leaq (%rdi,%rcx,8), %rdi
- leaq (%rdx,%rcx,8), %rdx
- negq %rcx
- xorl %eax, %eax /* clear cy */
+ movl %ecx, %r9d
+ andl $3, %r9d
+ je .Lprehandle0
+ cmpl $2, %r9d
+ jb .Lprehandle1
+ je .Lprehandle2
+
+#define FIRST_SUB() \
+ movq (%rsi), %rax; \
+ subq (%rdx), %rax; \
+ movq %rax, (%rdi)
+
+#define NEXT_SUB(offset) \
+ movq offset(%rsi), %rax; \
+ sbbq offset(%rdx), %rax; \
+ movq %rax, offset(%rdi)
+
+.Lprehandle3:
+ leaq -2(%rcx), %rcx
+ FIRST_SUB();
+ NEXT_SUB(8);
+ NEXT_SUB(16);
+ decq %rcx
+ je .Lend
+ leaq 24(%rsi), %rsi
+ leaq 24(%rdx), %rdx
+ leaq 24(%rdi), %rdi
+ jmp .Loop
+
+ ALIGN(3)
+.Lprehandle2:
+ leaq -1(%rcx), %rcx
+ FIRST_SUB();
+ NEXT_SUB(8);
+ decq %rcx
+ je .Lend
+ leaq 16(%rsi), %rsi
+ leaq 16(%rdx), %rdx
+ leaq 16(%rdi), %rdi
+ jmp .Loop
+
+ ALIGN(3)
+.Lprehandle1:
+ FIRST_SUB();
+ decq %rcx
+ je .Lend
+ leaq 8(%rsi), %rsi
+ leaq 8(%rdx), %rdx
+ leaq 8(%rdi), %rdi
+ jmp .Loop
+
+ ALIGN(3)
+.Lprehandle0:
+ clc /* clear cy */
ALIGN(4) /* minimal alignment for claimed speed */
-.Loop: movq (%rsi,%rcx,8), %rax
- movq (%rdx,%rcx,8), %r10
- sbbq %r10, %rax
- movq %rax, (%rdi,%rcx,8)
- incq %rcx
+.Loop: leaq -3(%rcx), %rcx
+ NEXT_SUB(0);
+ NEXT_SUB(8);
+ NEXT_SUB(16);
+ NEXT_SUB(24);
+ leaq 32(%rsi), %rsi
+ leaq 32(%rdx), %rdx
+ leaq 32(%rdi), %rdi
+ decq %rcx
jne .Loop
- movq %rcx, %rax /* zero %rax */
- adcq %rax, %rax
+ ALIGN(2)
+.Lend:
+ movl $0, %eax /* zero %rax */
+ adcl %eax, %eax
FUNC_EXIT()
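
Finally, a hedged usage sketch: the return value of both routines is the final carry (or borrow) out of the top limb. The prototype below is inferred from the register comments in the files above (res_ptr in rdi, s1_ptr in rsi, s2_ptr in rdx, size in rcx) and simplified to plain uint64_t; the actual declaration lives in mpi-internal.h.

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed prototype, simplified from libgcrypt's internal types. */
    extern uint64_t _gcry_mpih_add_n(uint64_t *rp, uint64_t *s1,
                                     uint64_t *s2, long n);

    int main(void)
    {
        uint64_t a[4] = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }; /* 2^256 - 1 */
        uint64_t b[4] = { 1, 0, 0, 0 };
        uint64_t r[4];
        uint64_t cy = _gcry_mpih_add_n(r, a, b, 4);
        /* The +1 ripples through every limb: r is all zero, carry out is 1. */
        printf("carry out: %llu\n", (unsigned long long)cy);
        return 0;
    }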