Diffstat (limited to 'include')
-rw-r--r--   include/atomic/gcc_builtins.h   12
-rw-r--r--   include/atomic/generic-msvc.h   26
-rw-r--r--   include/lf.h                     1
-rw-r--r--   include/my_atomic.h             10
-rw-r--r--   include/my_cpu.h                57
5 files changed, 64 insertions, 42 deletions
diff --git a/include/atomic/gcc_builtins.h b/include/atomic/gcc_builtins.h
index 99e6f628b50..e2c3b10c267 100644
--- a/include/atomic/gcc_builtins.h
+++ b/include/atomic/gcc_builtins.h
@@ -40,18 +40,18 @@
 #define my_atomic_add64_explicit(P, A, O) __atomic_fetch_add((P), (A), (O))

 #define my_atomic_cas32_weak_explicit(P, E, D, S, F) \
-  __atomic_compare_exchange_n((P), (E), (D), true, (S), (F))
+  __atomic_compare_exchange_n((P), (E), (D), 1, (S), (F))
 #define my_atomic_cas64_weak_explicit(P, E, D, S, F) \
-  __atomic_compare_exchange_n((P), (E), (D), true, (S), (F))
+  __atomic_compare_exchange_n((P), (E), (D), 1, (S), (F))
 #define my_atomic_casptr_weak_explicit(P, E, D, S, F) \
-  __atomic_compare_exchange_n((P), (E), (D), true, (S), (F))
+  __atomic_compare_exchange_n((P), (E), (D), 1, (S), (F))

 #define my_atomic_cas32_strong_explicit(P, E, D, S, F) \
-  __atomic_compare_exchange_n((P), (E), (D), false, (S), (F))
+  __atomic_compare_exchange_n((P), (E), (D), 0, (S), (F))
 #define my_atomic_cas64_strong_explicit(P, E, D, S, F) \
-  __atomic_compare_exchange_n((P), (E), (D), false, (S), (F))
+  __atomic_compare_exchange_n((P), (E), (D), 0, (S), (F))
 #define my_atomic_casptr_strong_explicit(P, E, D, S, F) \
-  __atomic_compare_exchange_n((P), (E), (D), false, (S), (F))
+  __atomic_compare_exchange_n((P), (E), (D), 0, (S), (F))

 #define my_atomic_store32(P, D) __atomic_store_n((P), (D), __ATOMIC_SEQ_CST)
 #define my_atomic_store64(P, D) __atomic_store_n((P), (D), __ATOMIC_SEQ_CST)
diff --git a/include/atomic/generic-msvc.h b/include/atomic/generic-msvc.h
index 56fa4f66fcd..d5eaa4738c7 100644
--- a/include/atomic/generic-msvc.h
+++ b/include/atomic/generic-msvc.h
@@ -137,30 +137,4 @@ static inline void my_atomic_storeptr(void * volatile *a, void *v)
 {
   *a= v;
 }
-
-/*
-  my_yield_processor (equivalent of x86 PAUSE instruction) should be used
-  to improve performance on hyperthreaded CPUs. Intel recommends to use it in
-  spin loops also on non-HT machines to reduce power consumption (see e.g
-  http://softwarecommunity.intel.com/articles/eng/2004.htm)
-
-  Running benchmarks for spinlocks implemented with InterlockedCompareExchange
-  and YieldProcessor shows that much better performance is achieved by calling
-  YieldProcessor in a loop - that is, yielding longer. On Intel boxes setting
-  loop count in the range 200-300 brought best results.
- */
-#define YIELD_LOOPS 200
-
-static inline int my_yield_processor()
-{
-  int i;
-  for (i=0; i<YIELD_LOOPS; i++)
-  {
-    YieldProcessor();
-  }
-  return 1;
-}
-
-#define LF_BACKOFF my_yield_processor()
-
 #endif /* ATOMIC_MSC_INCLUDED */
diff --git a/include/lf.h b/include/lf.h
index 1825de62b43..a9d7e9ee688 100644
--- a/include/lf.h
+++ b/include/lf.h
@@ -17,6 +17,7 @@
 #define INCLUDE_LF_INCLUDED

 #include <my_atomic.h>
+#include <my_cpu.h>

 C_MODE_START

diff --git a/include/my_atomic.h b/include/my_atomic.h
index 32c9d6b4736..896dc2b5c33 100644
--- a/include/my_atomic.h
+++ b/include/my_atomic.h
@@ -116,16 +116,6 @@
 #include "atomic/gcc_sync.h"
 #endif

-
-/*
-  the macro below defines (as an expression) the code that
-  will be run in spin-loops. Intel manuals recummend to have PAUSE there.
-  It is expected to be defined in include/atomic/ *.h files
-*/
-#ifndef LF_BACKOFF
-#define LF_BACKOFF (1)
-#endif
-
 #if SIZEOF_LONG == 4
 #define my_atomic_addlong(A,B) my_atomic_add32((int32*) (A), (B))
 #define my_atomic_loadlong(A) my_atomic_load32((int32*) (A))
diff --git a/include/my_cpu.h b/include/my_cpu.h
index e255de85960..f2e26fca70c 100644
--- a/include/my_cpu.h
+++ b/include/my_cpu.h
@@ -1,3 +1,5 @@
+#ifndef MY_CPU_INCLUDED
+#define MY_CPU_INCLUDED
 /* Copyright (c) 2013, MariaDB foundation Ab and SkySQL

    This program is free software; you can redistribute it and/or modify
@@ -43,3 +45,58 @@
 #define HMT_medium_high()
 #define HMT_high()
 #endif
+
+
+static inline void MY_RELAX_CPU(void)
+{
+#ifdef HAVE_PAUSE_INSTRUCTION
+  /*
+    According to the gcc info page, asm volatile means that the
+    instruction has important side-effects and must not be removed.
+    Also asm volatile may trigger a memory barrier (spilling all registers
+    to memory).
+  */
+#ifdef __SUNPRO_CC
+  asm ("pause" );
+#else
+  __asm__ __volatile__ ("pause");
+#endif
+
+#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
+  __asm__ __volatile__ ("rep; nop");
+#elif defined _WIN32
+  /*
+    In the Win32 API, the x86 PAUSE instruction is executed by calling
+    the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
+    independent way by using YieldProcessor.
+  */
+  YieldProcessor();
+#elif defined(_ARCH_PWR8)
+  __ppc_get_timebase();
+#else
+  int32 var, oldval = 0;
+  my_atomic_cas32_strong_explicit(&var, &oldval, 1, MY_MEMORY_ORDER_RELAXED,
+                                  MY_MEMORY_ORDER_RELAXED);
+#endif
+}
+
+
+/*
+  LF_BACKOFF should be used to improve performance on hyperthreaded CPUs. Intel
+  recommends to use it in spin loops also on non-HT machines to reduce power
+  consumption (see e.g http://softwarecommunity.intel.com/articles/eng/2004.htm)
+
+  Running benchmarks for spinlocks implemented with InterlockedCompareExchange
+  and YieldProcessor shows that much better performance is achieved by calling
+  YieldProcessor in a loop - that is, yielding longer. On Intel boxes setting
+  loop count in the range 200-300 brought best results.
+*/
+
+static inline int LF_BACKOFF(void)
+{
+  int i;
+  for (i= 0; i < 200; i++)
+    MY_RELAX_CPU();
+  return 1;
+}
+#endif
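
Two notes on the patch above, with illustrative C sketches that are not part of the patch itself.

The gcc_builtins.h hunk replaces the true/false weak-flag argument of __atomic_compare_exchange_n with the integer literals 1/0, so the macros no longer depend on true/false being defined (e.g. via <stdbool.h>) in every C translation unit that includes the header. A minimal caller sketch, assuming the my_atomic_load32_explicit macro from the same header family; counter and counter_bump are invented for illustration:

    #include <my_atomic.h>

    static int32 counter;

    /* Illustrative only: lock-free increment via the weak CAS macro.
       The weak variant may fail spuriously; on failure the macro
       refreshes `old` with the current value, so the loop just retries. */
    static void counter_bump(void)
    {
      int32 old= my_atomic_load32_explicit(&counter, MY_MEMORY_ORDER_RELAXED);
      while (!my_atomic_cas32_weak_explicit(&counter, &old, old + 1,
                                            MY_MEMORY_ORDER_RELAXED,
                                            MY_MEMORY_ORDER_RELAXED))
        ;
    }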
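
The my_cpu.h hunk turns LF_BACKOFF from a per-backend macro into a single portable inline function, which is why the MSVC-only my_yield_processor() and the (1) fallback in my_atomic.h both disappear. The intended call pattern is unchanged: invoke it inside a spin loop between failed attempts. A hedged sketch of such a loop; lock_word, spin_lock and spin_unlock are invented for illustration, assuming the my_atomic_store32_explicit macro and MY_MEMORY_ORDER_ACQUIRE/RELEASE constants from my_atomic.h:

    #include <my_atomic.h>
    #include <my_cpu.h>

    static int32 lock_word;      /* 0 = free, 1 = taken (illustrative) */

    static void spin_lock(void)
    {
      for (;;)
      {
        int32 expected= 0;
        if (my_atomic_cas32_strong_explicit(&lock_word, &expected, 1,
                                            MY_MEMORY_ORDER_ACQUIRE,
                                            MY_MEMORY_ORDER_RELAXED))
          return;                /* got the lock */
        (void) LF_BACKOFF();     /* ~200 PAUSE-equivalents before retrying */
      }
    }

    static void spin_unlock(void)
    {
      my_atomic_store32_explicit(&lock_word, 0, MY_MEMORY_ORDER_RELEASE);
    }

Note that LF_BACKOFF() keeps the return value 1 of the old macro, so it can still appear inside boolean expressions, as the removed #define LF_BACKOFF my_yield_processor() could.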