summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2015-04-14 17:05:03 +0300
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2015-04-14 17:05:03 +0300
commit3dc2d62d0486325bf263655c2d9a96aee0b02abe (patch)
tree47336185d9126f14d8a3943503706023d05fe4b7 /src
parent4f700bcd20c087f60346cb8aefd0e269be8e2157 (diff)
downloadpostgresql-3dc2d62d0486325bf263655c2d9a96aee0b02abe.tar.gz
Use Intel SSE 4.2 CRC instructions where available.
Modern x86 and x86-64 processors with SSE 4.2 support have special instructions, crc32b and crc32q, for calculating CRC-32C. They greatly speed up CRC calculation. Whether the instructions can be used or not depends on the compiler and the target architecture. If generation of SSE 4.2 instructions is allowed for the target (-msse4.2 flag on gcc and clang), use them. If they are not allowed by default, but the compiler supports the -msse4.2 flag to enable them, compile just the CRC-32C function with -msse4.2 flag, and check at runtime whether the processor we're running on supports it. If it doesn't, fall back to the slicing-by-8 algorithm. (With the common defaults on current operating systems, the runtime-check variant is what you get in practice.) Abhijit Menon-Sen, heavily modified by me, reviewed by Andres Freund.
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.global.in4
-rw-r--r--src/include/pg_config.h.in15
-rw-r--r--src/include/pg_config.h.win3223
-rw-r--r--src/include/port/pg_crc32c.h44
-rw-r--r--src/port/Makefile8
-rw-r--r--src/port/pg_crc32c_choose.c63
-rw-r--r--src/port/pg_crc32c_sse42.c52
-rw-r--r--src/tools/msvc/Mkvcbuild.pm13
8 files changed, 217 insertions, 5 deletions
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 7c39d82724..4b06fc2d96 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -225,6 +225,7 @@ GCC = @GCC@
SUN_STUDIO_CC = @SUN_STUDIO_CC@
CFLAGS = @CFLAGS@
CFLAGS_VECTOR = @CFLAGS_VECTOR@
+CFLAGS_SSE42 = @CFLAGS_SSE42@
# Kind-of compilers
@@ -548,6 +549,9 @@ endif
LIBOBJS = @LIBOBJS@
+# files needed for the chosen CRC-32C implementation
+PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
+
LIBS := -lpgcommon -lpgport $(LIBS)
# to make ws2_32.lib the last library
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 202c51a34a..5688f750af 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -675,6 +675,12 @@
/* Define to 1 if your compiler understands __builtin_unreachable. */
#undef HAVE__BUILTIN_UNREACHABLE
+/* Define to 1 if you have __cpuid. */
+#undef HAVE__CPUID
+
+/* Define to 1 if you have __get_cpuid. */
+#undef HAVE__GET_CPUID
+
/* Define to 1 if your compiler understands _Static_assert. */
#undef HAVE__STATIC_ASSERT
@@ -818,6 +824,15 @@
/* Use replacement snprintf() functions. */
#undef USE_REPL_SNPRINTF
+/* Define to 1 to use the software CRC-32C implementation (slicing-by-8). */
+#undef USE_SLICING_BY_8_CRC32C
+
+/* Define to 1 to use Intel SSE 4.2 CRC instructions. */
+#undef USE_SSE42_CRC32C
+
+/* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
+#undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
+
/* Define to select SysV-style semaphores. */
#undef USE_SYSV_SEMAPHORES
diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32
index 1baf64f005..d9fa711ab5 100644
--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@@ -6,8 +6,8 @@
*
* HAVE_CBRT, HAVE_FUNCNAME_FUNC, HAVE_GETOPT, HAVE_GETOPT_H, HAVE_INTTYPES_H,
* HAVE_GETOPT_LONG, HAVE_LOCALE_T, HAVE_RINT, HAVE_STRINGS_H, HAVE_STRTOLL,
- * HAVE_STRTOULL, HAVE_STRUCT_OPTION, ENABLE_THREAD_SAFETY,
- * PG_USE_INLINE, inline
+ * HAVE_STRTOULL, HAVE_STRUCT_OPTION, ENABLE_THREAD_SAFETY, PG_USE_INLINE,
+ * inline, USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
*/
/* Define to the type of arg 1 of 'accept' */
@@ -529,6 +529,12 @@
/* Define to 1 if your compiler understands __builtin_unreachable. */
/* #undef HAVE__BUILTIN_UNREACHABLE */
+/* Define to 1 if you have __cpuid. */
+#define HAVE__CPUID 1
+
+/* Define to 1 if you have __get_cpuid. */
+#undef HAVE__GET_CPUID
+
/* Define to 1 if your compiler understands _Static_assert. */
/* #undef HAVE__STATIC_ASSERT */
@@ -639,6 +645,19 @@
/* Use replacement snprintf() functions. */
#define USE_REPL_SNPRINTF 1
+/*
+ * Define to 1 to use the software CRC-32C implementation (slicing-by-8).
+ * Selected when _MSC_VER is below 1500.
+ */
+#if (_MSC_VER < 1500)
+#define USE_SLICING_BY_8_CRC32C 1
+#endif
+
+/* Define to 1 to use Intel SSE 4.2 CRC instructions. */
+/* #undef USE_SSE42_CRC32C */
+
+/*
+ * Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.
+ * Selected when _MSC_VER is 1500 or newer.
+ */
+#if (_MSC_VER >= 1500)
+#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1
+#endif
+
/* Define to select SysV-style semaphores. */
/* #undef USE_SYSV_SEMAPHORES */
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index d07c0cb623..b14d194fb3 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -3,6 +3,25 @@
* pg_crc32c.h
* Routines for computing CRC-32C checksums.
*
+ * The speed of CRC-32C calculation has a big impact on performance, so we
+ * jump through some hoops to get the best implementation for each
+ * platform. Some CPU architectures have special instructions for speeding
+ * up CRC calculations (e.g. Intel SSE 4.2); on other platforms we use the
+ * Slicing-by-8 algorithm, which uses lookup tables.
+ *
+ * The public interface consists of four macros:
+ *
+ * INIT_CRC32C(crc)
+ * Initialize a CRC accumulator
+ *
+ * COMP_CRC32C(crc, data, len)
+ * Accumulate some (more) bytes into a CRC
+ *
+ * FIN_CRC32C(crc)
+ * Finish a CRC calculation
+ *
+ * EQ_CRC32C(c1, c2)
+ * Check for equality of two CRCs.
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -16,9 +35,32 @@
typedef uint32 pg_crc32c;
+/* The INIT and EQ macros are the same for all implementations. */
#define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
#define EQ_CRC32C(c1, c2) ((c1) == (c2))
+#if defined(USE_SSE42_CRC32C)
+/* Use SSE4.2 instructions. */
+#define COMP_CRC32C(crc, data, len) \
+ ((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+/*
+ * Use SSE4.2 instructions, but perform a runtime check first to check that
+ * they are available.
+ */
+#define COMP_CRC32C(crc, data, len) \
+ ((crc) = pg_comp_crc32c((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
+
+#else
/*
* Use slicing-by-8 algorithm.
*
@@ -46,4 +88,6 @@ typedef uint32 pg_crc32c;
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+#endif
+
#endif /* PG_CRC32C_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index d1c9c8a987..bc9b63add0 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -30,10 +30,10 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -I$(top_builddir)/src/port -DFRONTEND $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)
-OBJS = $(LIBOBJS) chklocale.o erand48.o inet_net_ntop.o \
+OBJS = $(LIBOBJS) $(PG_CRC32C_OBJS) chklocale.o erand48.o inet_net_ntop.o \
noblock.o path.o pgcheckdir.o pgmkdirp.o pgsleep.o \
pgstrcasecmp.o pqsignal.o \
- qsort.o qsort_arg.o quotes.o sprompt.o tar.o thread.o pg_crc32c_sb8.o
+ qsort.o qsort_arg.o quotes.o sprompt.o tar.o thread.o
# foo_srv.o and foo.o are both built from foo.c, but only foo.o has -DFRONTEND
OBJS_SRV = $(OBJS:%.o=%_srv.o)
@@ -57,6 +57,10 @@ libpgport.a: $(OBJS)
# thread.o needs PTHREAD_CFLAGS (but thread_srv.o does not)
thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
+# pg_crc32c_sse42.o and its _srv.o version need CFLAGS_SSE42
+pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+
#
# Server versions of object files
#
diff --git a/src/port/pg_crc32c_choose.c b/src/port/pg_crc32c_choose.c
new file mode 100644
index 0000000000..ba0d1670f8
--- /dev/null
+++ b/src/port/pg_crc32c_choose.c
@@ -0,0 +1,63 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_choose.c
+ * Choose which CRC-32C implementation to use, at runtime.
+ *
+ * Try to use the special CRC instructions introduced in Intel SSE 4.2,
+ * if available on the platform we're running on, but fall back to the
+ * slicing-by-8 implementation otherwise.
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_crc32c_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_crc32c.h"
+
+/*
+ * Report whether the processor we are running on advertises SSE 4.2.
+ *
+ * Issues CPUID function 1 through whichever intrinsic the build provides
+ * (__get_cpuid or __cpuid) and tests bit 20 of the third result word.
+ * The result array starts out zeroed, so if the query stores nothing the
+ * answer is "not available".
+ */
+static bool
+pg_crc32c_sse42_available(void)
+{
+	unsigned int regs[4] = {0, 0, 0, 0};
+	const unsigned int sse42_bit = 1 << 20;		/* SSE 4.2 */
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(regs, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (regs[2] & sse42_bit) != 0;
+}
+
+/*
+ * Initial target of the pg_comp_crc32c function pointer.
+ *
+ * Probes the CPU, repoints pg_comp_crc32c at either the SSE 4.2 or the
+ * slicing-by-8 implementation, and then computes the current request
+ * through the updated pointer.  Later calls made through pg_comp_crc32c
+ * therefore reach the selected implementation directly.
+ */
+static pg_crc32c
+pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
+{
+	pg_comp_crc32c = pg_crc32c_sse42_available() ?
+		pg_comp_crc32c_sse42 : pg_comp_crc32c_sb8;
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+/* Dispatch pointer; starts out aimed at the chooser defined above. */
+pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
new file mode 100644
index 0000000000..b6107103be
--- /dev/null
+++ b/src/port/pg_crc32c_sse42.c
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_sse42.c
+ * Compute CRC-32C checksum using Intel SSE 4.2 instructions.
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_crc32c_sse42.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+#include <nmmintrin.h>
+
+/*
+ * Fold 'len' bytes starting at 'data' into the running CRC-32C value 'crc'
+ * using the SSE 4.2 crc32 instructions, and return the updated value.
+ *
+ * No initial or final inversion is applied here; that is the job of the
+ * INIT_CRC32C and FIN_CRC32C macros.
+ */
+pg_crc32c
+pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+
+	/*
+	 * Process eight bytes of data at a time where the instruction exists.
+	 * The 8-byte form of crc32 (_mm_crc32_u64) is only available when
+	 * compiling for x86-64, so it must be left out of 32-bit x86 builds.
+	 *
+	 * NB: We do unaligned accesses here. The Intel architecture allows
+	 * that, and performance testing didn't show any performance gain from
+	 * aligning the beginning address.
+	 */
+#if defined(__x86_64__) || defined(_M_X64)
+	while (len >= 8)
+	{
+		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
+		p += 8;
+		len -= 8;
+	}
+#endif
+
+	/*
+	 * Process four bytes at a time.  On x86-64 this runs at most once, for
+	 * the tail left by the loop above; on 32-bit x86 it is the main loop.
+	 */
+	while (len >= 4)
+	{
+		crc = _mm_crc32_u32(crc, *((const uint32 *) p));
+		p += 4;
+		len -= 4;
+	}
+
+	/*
+	 * Handle any remaining bytes one at a time.
+	 */
+	while (len > 0)
+	{
+		crc = _mm_crc32_u8(crc, *p++);
+		len--;
+	}
+
+	return crc;
+}
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index b2c0dfbd7b..39281db901 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -92,10 +92,21 @@ sub mkvcbuild
pgcheckdir.c pgmkdirp.c pgsleep.c pgstrcasecmp.c pqsignal.c
mkdtemp.c qsort.c qsort_arg.c quotes.c system.c
sprompt.c tar.c thread.c getopt.c getopt_long.c dirent.c
- win32env.c win32error.c win32setlocale.c pg_crc32c_sb8.c);
+ win32env.c win32error.c win32setlocale.c);
push(@pgportfiles, 'rint.c') if ($vsVersion < '12.00');
+	# The runtime chooser and the SSE 4.2 implementation are added only when
+	# $vsVersion is at least 9.00; pg_crc32c_sb8.c is added in every case.
+	if ($vsVersion >= '9.00')
+	{
+		push(@pgportfiles, 'pg_crc32c_choose.c', 'pg_crc32c_sse42.c');
+	}
+	push(@pgportfiles, 'pg_crc32c_sb8.c');
+
our @pgcommonallfiles = qw(
exec.c pg_lzcompress.c pgfnames.c psprintf.c relpath.c rmtree.c
string.c username.c wait_error.c);