Diffstat (limited to 'include/private/gc_locks.h')
 include/private/gc_locks.h | 362 ++++++++++++++++++++++++++++---------------
 1 file changed, 199 insertions(+), 163 deletions(-)
diff --git a/include/private/gc_locks.h b/include/private/gc_locks.h
index e5f692a0..eed9f105 100644
--- a/include/private/gc_locks.h
+++ b/include/private/gc_locks.h
@@ -43,6 +43,7 @@
  *
  */
 # ifdef THREADS
+    void GC_noop1 GC_PROTO((word));
 # ifdef PCR_OBSOLETE   /* Faster, but broken with multiple lwp's */
 #   include "th/PCR_Th.h"
 #   include "th/PCR_ThCrSec.h"
@@ -82,126 +83,49 @@
 #    define LOCK() mutex_lock(&GC_allocate_ml);
 #    define UNLOCK() mutex_unlock(&GC_allocate_ml);
 #  endif
-#  if defined(LINUX_THREADS)
-#   define NO_THREAD (pthread_t)(-1)
-#   if defined(I386)|| defined(POWERPC) || defined(ALPHA) || defined(IA64) \
-       || defined(M68K) || defined(SPARC)
-#    include <pthread.h>
-#    if defined(PARALLEL_MARK)
-      /* We need compare-and-swap to update mark bits, where it's      */
-      /* performance critical.  If USE_MARK_BYTES is defined, it is    */
-      /* no longer needed for this purpose.  However we use it in      */
-      /* either case to implement atomic fetch-and-add, though that's  */
-      /* less performance critical, and could perhaps be done with     */
-      /* a lock.                                                       */
-#     if defined(GENERIC_COMPARE_AND_SWAP)
-       /* Probably not useful, except for debugging.  */
-       extern pthread_mutex_t GC_compare_and_swap_lock;
-       static GC_bool GC_compare_and_exchange(volatile GC_word *addr,
-                                              GC_word old, GC_word new_val)
-       {
-         GC_bool result;
-         pthread_mutex_lock(&GC_compare_and_swap_lock);
-         if (*addr == old) {
-           *addr = new_val;
-           result = TRUE;
-         } else {
-           result = FALSE;
-         }
-         pthread_mutex_unlock(&GC_compare_and_swap_lock);
-         return result;
-       }
-#     endif /* GENERIC_COMPARE_AND_SWAP */
-#     if defined(I386)
-#      if !defined(GENERIC_COMPARE_AND_SWAP)
-         /* Returns TRUE if the comparison succeeded. */
-         inline static GC_bool GC_compare_and_exchange(volatile GC_word *addr,
-                                                       GC_word old,
-                                                       GC_word new_val)
-         {
-           char result;
-           __asm__ __volatile__("lock; cmpxchgl %2, %0; setz %1"
-                : "=m"(*(addr)), "=r"(result)
-                : "r" (new_val), "0"(*(addr)), "a"(old));
-           return (GC_bool) result;
-         }
-#      endif /* !GENERIC_COMPARE_AND_SWAP */
-       inline static void GC_memory_barrier()
-       {
-         /* We believe the processor ensures at least processor */
-         /* consistent ordering.  Thus a compiler barrier       */
-         /* should suffice.                                     */
-         __asm__ __volatile__("" : : : "memory");
-       }
-#     endif
-#     if defined(IA64)
-#      if !defined(GENERIC_COMPARE_AND_SWAP)
-         inline static GC_bool GC_compare_and_exchange(volatile GC_word *addr,
-                                                       GC_word old, GC_word new_val)
-         {
-           unsigned long oldval;
-           __asm__ __volatile__("mov ar.ccv=%4 ;; cmpxchg8.rel %0=%1,%2,ar.ccv"
-                : "=r"(oldval), "=m"(*addr)
-                : "r"(new_val), "1"(*addr), "r"(old));
-           return (oldval == old);
-         }
-#      endif /* !GENERIC_COMPARE_AND_SWAP */
-       inline static void GC_memory_barrier()
-       {
-         __asm__ __volatile__("mf" : : : "memory");
-       }
-#     endif /* IA64 */
-      /* Returns the original value of *addr. */
-      inline static GC_word GC_atomic_add(volatile GC_word *addr, GC_word how_much)
-      {
-        GC_word old;
-        do {
-          old = *addr;
-        } while (!GC_compare_and_exchange(addr, old, old+how_much));
-        return old;
-      }
-#    endif /* PARALLEL_MARK */
-#    ifndef THREAD_LOCAL_ALLOC
-     /* In the THREAD_LOCAL_ALLOC case, the allocation lock tends to   */
-     /* be held for long periods, if it is held at all.  Thus spinning */
-     /* and sleeping for fixed periods are likely to result in         */
-     /* significant wasted time.  We thus rely mostly on queued locks. */
-#     define USE_SPIN_LOCK
-#     if defined(I386)
+/* Try to define GC_TEST_AND_SET and a matching GC_CLEAR for spin lock */
+/* acquisition and release.  We need this for correct operation of the */
+/* incremental GC.                                                     */
+#  ifdef __GNUC__
+#    if defined(I386)
       inline static int GC_test_and_set(volatile unsigned int *addr) {
          int oldval;
          /* Note: the "xchg" instruction does not need a "lock" prefix */
          __asm__ __volatile__("xchgl %0, %1"
                 : "=r"(oldval), "=m"(*(addr))
-                : "0"(1), "m"(*(addr)));
+                : "0"(1), "m"(*(addr)) : "memory");
          return oldval;
       }
-#     endif
-#     if defined(IA64)
+#      define GC_TEST_AND_SET_DEFINED
+#    endif
+#    if defined(IA64)
       inline static int GC_test_and_set(volatile unsigned int *addr) {
          long oldval, n = 1;
          __asm__ __volatile__("xchg4 %0=%1,%2"
                 : "=r"(oldval), "=m"(*addr)
-                : "r"(n), "1"(*addr));
+                : "r"(n), "1"(*addr) : "memory");
          return oldval;
       }
+#      define GC_TEST_AND_SET_DEFINED
+      /* Should this handle post-increment addressing?? */
       inline static void GC_clear(volatile unsigned int *addr) {
-        __asm__ __volatile__("st4.rel %0=r0" : "=m" (*addr));
+        __asm__ __volatile__("st4.rel %0=r0" : "=m" (*addr) : : "memory");
       }
 #      define GC_CLEAR_DEFINED
-#     endif
-#     ifdef SPARC
+#    endif
+#    ifdef SPARC
       inline static int GC_test_and_set(volatile unsigned int *addr) {
          int oldval;
          __asm__ __volatile__("ldstub %1,%0"
          : "=r"(oldval), "=m"(*addr)
-         : "m"(*addr));
+         : "m"(*addr) : "memory");
          return oldval;
       }
-#     endif
-#     ifdef M68K
+#      define GC_TEST_AND_SET_DEFINED
+#    endif
+#    ifdef M68K
       /* Contributed by Tony Mantler.  I'm not sure how well it was    */
       /* tested.                                                       */
       inline static int GC_test_and_set(volatile unsigned int *addr) {
@@ -213,11 +137,12 @@
          __asm__ __volatile__(
                  "tas %1@; sne %0; negb %0"
                  : "=d" (oldval)
-                 : "a" (addr));
+                 : "a" (addr) : "memory");
          return oldval;
       }
-#     endif
-#     if defined(POWERPC)
+#      define GC_TEST_AND_SET_DEFINED
+#    endif
+#    if defined(POWERPC)
       inline static int GC_test_and_set(volatile unsigned int *addr) {
         int oldval;
         int temp = 1; // locked value
@@ -234,13 +159,14 @@
                        : "memory");
         return (int)oldval;
       }
+#      define GC_TEST_AND_SET_DEFINED
       inline static void GC_clear(volatile unsigned int *addr) {
-        __asm__ __volatile__("eieio");
+        __asm__ __volatile__("eieio" : : : "memory");
         *(addr) = 0;
       }
 #      define GC_CLEAR_DEFINED
-#     endif
-#     ifdef ALPHA
+#    endif
+#    if defined(ALPHA)
       inline static int GC_test_and_set(volatile unsigned int * addr)
       {
         unsigned long oldvalue;
@@ -259,14 +185,16 @@
                 "3:     br 1b\n"
                 ".previous"
                 :"=&r" (temp), "=m" (*addr), "=&r" (oldvalue)
-               :"Ir" (1), "m" (*addr));
+               :"Ir" (1), "m" (*addr)
+               :"memory");

         return oldvalue;
       }
+#      define GC_TEST_AND_SET_DEFINED
       /* Should probably also define GC_clear, since it needs  */
       /* a memory barrier ??                                   */
-#     endif /* ALPHA */
-#     ifdef ARM32
+#    endif /* ALPHA */
+#    ifdef ARM32
       inline static int GC_test_and_set(volatile unsigned int *addr) {
         int oldval;
        /* SWP on ARM is very similar to XCHG on x86.  Doesn't lock the
@@ -275,18 +203,154 @@
        /* See linuxthreads/sysdeps/arm/pt-machine.h in glibc-2.1      */
         __asm__ __volatile__("swp %0, %1, [%2]"
                              : "=r"(oldval)
-                             : "r"(1), "r"(addr));
+                             : "r"(1), "r"(addr)
+                             : "memory");
         return oldval;
       }

-#     endif /* ARM32 */
-#     ifndef GC_CLEAR_DEFINED
-      inline static void GC_clear(volatile unsigned int *addr) {
-         /* Try to discourage gcc from moving anything past this. */
-         __asm__ __volatile__(" ");
-         *(addr) = 0;
+#      define GC_TEST_AND_SET_DEFINED
+#    endif /* ARM32 */
+#  endif /* __GNUC__ */
+#  if (defined(ALPHA) && !defined(__GNUC__))
+#    define GC_test_and_set(addr) __cxx_test_and_set_atomic(addr, 1)
+#    define GC_TEST_AND_SET_DEFINED
+#  endif
+#  if defined(MSWIN32)
+#    define GC_test_and_set(addr) InterlockedExchange((LPLONG)addr,1)
+#    define GC_TEST_AND_SET_DEFINED
+#  endif
+#  ifdef MIPS
+#    if __mips < 3 || !(defined (_ABIN32) || defined(_ABI64)) \
+        || !defined(_COMPILER_VERSION) || _COMPILER_VERSION < 700
+#      define GC_test_and_set(addr, v) test_and_set(addr,v)
+#    else
+#      define GC_test_and_set(addr, v) __test_and_set(addr,v)
+#      define GC_clear(addr) __lock_release(addr);
+#      define GC_CLEAR_DEFINED
+#    endif
+#    define GC_TEST_AND_SET_DEFINED
+#  endif /* MIPS */
+#  if 0 /* defined(HP_PA) */
+     /* The official recommendation seems to be to not use ldcw from   */
+     /* user mode.  Since multithreaded incremental collection doesn't */
+     /* work anyway on HP_PA, this shouldn't be a major loss.          */
+
+     /* "set" means 0 and "clear" means 1 here.         */
+#    define GC_test_and_set(addr) !GC_test_and_clear(addr);
+#    define GC_TEST_AND_SET_DEFINED
+#    define GC_clear(addr) GC_noop1((word)(addr)); *(volatile unsigned int *)addr = 1;
+       /* The above needs a memory barrier! */
+#    define GC_CLEAR_DEFINED
+#  endif
+#  if defined(GC_TEST_AND_SET_DEFINED) && !defined(GC_CLEAR_DEFINED)
+#    ifdef __GNUC__
+       inline static void GC_clear(volatile unsigned int *addr) {
+         /* Try to discourage gcc from moving anything past this. */
+         __asm__ __volatile__(" " : : : "memory");
+         *(addr) = 0;
+       }
+#    else
+       /* The function call in the following should prevent the  */
+       /* compiler from moving assignments to below the UNLOCK.  */
+#      define GC_clear(addr) GC_noop1((word)(addr)); \
+                             *((volatile unsigned int *)(addr)) = 0;
+#    endif
+#    define GC_CLEAR_DEFINED
+#  endif /* !GC_CLEAR_DEFINED */
+
+#  if !defined(GC_TEST_AND_SET_DEFINED)
+#    define USE_PTHREAD_LOCKS
+#  endif
+
+#  if defined(LINUX_THREADS) || defined(OSF1_THREADS) \
+      || defined(HPUX_THREADS)
+#    define NO_THREAD (pthread_t)(-1)
+#    include <pthread.h>
+#    if defined(PARALLEL_MARK)
+      /* We need compare-and-swap to update mark bits, where it's      */
+      /* performance critical.  If USE_MARK_BYTES is defined, it is    */
+      /* no longer needed for this purpose.  However we use it in      */
+      /* either case to implement atomic fetch-and-add, though that's  */
+      /* less performance critical, and could perhaps be done with     */
+      /* a lock.                                                       */
+#     if defined(GENERIC_COMPARE_AND_SWAP)
+       /* Probably not useful, except for debugging.             */
+       /* We do use GENERIC_COMPARE_AND_SWAP on PA_RISC, but we  */
+       /* minimize its use.                                      */
+       extern pthread_mutex_t GC_compare_and_swap_lock;
+
+       /* Note that if GC_word updates are not atomic, a concurrent */
+       /* reader should acquire GC_compare_and_swap_lock.  On       */
+       /* currently supported platforms, such updates are atomic.   */
+       extern GC_bool GC_compare_and_exchange(volatile GC_word *addr,
+                                              GC_word old, GC_word new_val);
+#     endif /* GENERIC_COMPARE_AND_SWAP */
+#     if defined(I386)
+#      if !defined(GENERIC_COMPARE_AND_SWAP)
+         /* Returns TRUE if the comparison succeeded. */
+         inline static GC_bool GC_compare_and_exchange(volatile GC_word *addr,
+                                                       GC_word old,
+                                                       GC_word new_val)
+         {
+           char result;
+           __asm__ __volatile__("lock; cmpxchgl %2, %0; setz %1"
+                : "=m"(*(addr)), "=r"(result)
+                : "r" (new_val), "0"(*(addr)), "a"(old) : "memory");
+           return (GC_bool) result;
+         }
+#      endif /* !GENERIC_COMPARE_AND_SWAP */
+       inline static void GC_memory_write_barrier()
+       {
+         /* We believe the processor ensures at least processor */
+         /* consistent ordering.  Thus a compiler barrier       */
+         /* should suffice.                                     */
+         __asm__ __volatile__("" : : : "memory");
+       }
+#     endif /* I386 */
+#     if defined(IA64)
+#      if !defined(GENERIC_COMPARE_AND_SWAP)
+         inline static GC_bool GC_compare_and_exchange(volatile GC_word *addr,
+                                                       GC_word old, GC_word new_val)
+         {
+           unsigned long oldval;
+           __asm__ __volatile__("mov ar.ccv=%4 ;; cmpxchg8.rel %0=%1,%2,ar.ccv"
+                : "=r"(oldval), "=m"(*addr)
+                : "r"(new_val), "1"(*addr), "r"(old) : "memory");
+           return (oldval == old);
       }
-#     endif /* !GC_CLEAR_DEFINED */
+#      endif /* !GENERIC_COMPARE_AND_SWAP */
+#      if 0
+        /* Shouldn't be needed; we use volatile stores instead. */
+        inline static void GC_memory_write_barrier()
+        {
+          __asm__ __volatile__("mf" : : : "memory");
+        }
+#      endif /* 0 */
+#     endif /* IA64 */
+#     if !defined(GENERIC_COMPARE_AND_SWAP)
+       /* Returns the original value of *addr. */
+       inline static GC_word GC_atomic_add(volatile GC_word *addr,
+                                           GC_word how_much)
+       {
+         GC_word old;
+         do {
+           old = *addr;
+         } while (!GC_compare_and_exchange(addr, old, old+how_much));
+         return old;
+       }
+#     else /* GENERIC_COMPARE_AND_SWAP */
+       /* So long as a GC_word can be atomically updated, it should */
+       /* be OK to read *addr without a lock.                       */
+       extern GC_word GC_atomic_add(volatile GC_word *addr, GC_word how_much);
+#     endif /* GENERIC_COMPARE_AND_SWAP */
+#    endif /* PARALLEL_MARK */
+
+#    if !defined(THREAD_LOCAL_ALLOC) && !defined(USE_PTHREAD_LOCKS)
+      /* In the THREAD_LOCAL_ALLOC case, the allocation lock tends to   */
+      /* be held for long periods, if it is held at all.  Thus spinning */
+      /* and sleeping for fixed periods are likely to result in         */
+      /* significant wasted time.  We thus rely mostly on queued locks. */
+#     define USE_SPIN_LOCK
       extern volatile unsigned int GC_allocate_lock;
       extern void GC_lock(void);
        /* Allocation lock holder.  Only set if acquired by client through */
@@ -304,13 +368,23 @@
 #       define UNLOCK() \
                GC_clear(&GC_allocate_lock)
 #     endif /* !GC_ASSERTIONS */
-#    else /* THREAD_LOCAL_ALLOC */
-#     define USE_PTHREAD_LOCKS
+#     if 0
+       /* Another alternative for OSF1 might be:  */
+#      include <sys/mman.h>
+       extern msemaphore GC_allocate_semaphore;
+#      define LOCK() { if (msem_lock(&GC_allocate_semaphore, MSEM_IF_NOWAIT) \
+                           != 0) GC_lock(); else GC_allocate_lock = 1; }
+       /* The following is INCORRECT, since the memory model is too weak. */
+       /* Is this true?  Presumably msem_unlock has the right semantics?  */
+       /*              - HB                                               */
+#      define UNLOCK() { GC_allocate_lock = 0; \
+                         msem_unlock(&GC_allocate_semaphore, 0); }
+#     endif /* 0 */
+#    else /* THREAD_LOCAL_ALLOC || USE_PTHREAD_LOCKS */
+#     ifndef USE_PTHREAD_LOCKS
+#       define USE_PTHREAD_LOCKS
+#     endif
 #    endif /* THREAD_LOCAL_ALLOC */
-#   else  /* LINUX_THREADS on hardware for which we don't know how */
-         /* to do test and set.                                    */
-#    define USE_PTHREAD_LOCKS
-#   endif /* ! known hardware */
 #   ifdef USE_PTHREAD_LOCKS
 #     include <pthread.h>
       extern pthread_mutex_t GC_allocate_ml;
@@ -338,33 +412,13 @@
 #     ifdef GC_ASSERTIONS
         extern pthread_t GC_mark_lock_holder;
 #     endif
-#  endif /* LINUX_THREADS */
-#  if defined(HPUX_THREADS)
-#   include <pthread.h>
-    extern pthread_mutex_t GC_allocate_ml;
-#   define NO_THREAD (pthread_t)(-1)
-#   define LOCK() pthread_mutex_lock(&GC_allocate_ml)
-#   define UNLOCK() pthread_mutex_unlock(&GC_allocate_ml)
-#  endif
+#  endif /* LINUX_THREADS || OSF1_THREADS || HPUX_THREADS */
 #  if defined(IRIX_THREADS)
-      /* This may also eventually be appropriate for HPUX_THREADS */
 #   include <pthread.h>
-#   ifndef HPUX_THREADS
-      /* This probably should never be included, but I can't test */
-      /* on Irix anymore.                                         */
-#     include <mutex.h>
-#   endif
+    /* This probably should never be included, but I can't test */
+    /* on Irix anymore.                                         */
+#   include <mutex.h>

-#   ifndef HPUX_THREADS
-#    if __mips < 3 || !(defined (_ABIN32) || defined(_ABI64)) \
-        || !defined(_COMPILER_VERSION) || _COMPILER_VERSION < 700
-#      define GC_test_and_set(addr, v) test_and_set(addr,v)
-#    else
-#      define GC_test_and_set(addr, v) __test_and_set(addr,v)
-#    endif
-#   else
-      /* I couldn't find a way to do this inline on HP/UX */
-#   endif
    extern unsigned long GC_allocate_lock;
        /* This is not a mutex because mutexes that obey the (optional)      */
        /* POSIX scheduling rules are subject to convoys in high contention  */
@@ -377,26 +431,8 @@
 #   define NO_THREAD (pthread_t)(-1)
 #   define UNSET_LOCK_HOLDER() GC_lock_holder = NO_THREAD
 #   define I_HOLD_LOCK() (pthread_equal(GC_lock_holder, pthread_self()))
-#   ifdef HPUX_THREADS
-#    define LOCK() { if (!GC_test_and_clear(&GC_allocate_lock)) GC_lock(); }
-     /* The following is INCORRECT, since the memory model is too weak. */
-#    define UNLOCK() { GC_noop1(&GC_allocate_lock); \
-                       *(volatile unsigned long *)(&GC_allocate_lock) = 1; }
-#   else
-#    define LOCK() { if (GC_test_and_set(&GC_allocate_lock, 1)) GC_lock(); }
-#    if __mips >= 3 && (defined (_ABIN32) || defined(_ABI64)) \
-        && defined(_COMPILER_VERSION) && _COMPILER_VERSION >= 700
-#      define UNLOCK() __lock_release(&GC_allocate_lock)
-#    else
-       /* The function call in the following should prevent the */
-       /* compiler from moving assignments to below the UNLOCK. */
-       /* This is probably not necessary for ucode or gcc 2.8.  */
-       /* It may be necessary for Ragnarok and future gcc       */
-       /* versions.                                             */
-#      define UNLOCK() { GC_noop1(&GC_allocate_lock); \
-                         *(volatile unsigned long *)(&GC_allocate_lock) = 0; }
-#    endif
-#   endif
+#   define LOCK() { if (GC_test_and_set(&GC_allocate_lock, 1)) GC_lock(); }
+#   define UNLOCK() GC_clear(&GC_allocate_lock);
    extern VOLATILE GC_bool GC_collecting;
 #   define ENTER_GC() \
                { \
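
The recurring change in this patch is the addition of a "memory" clobber to each GC_test_and_set variant, so that GCC treats the atomic exchange as a compiler barrier and does not cache shared globals across lock acquisition. The following is a minimal standalone sketch of the resulting spin-lock protocol, assuming x86 and GCC: the names GC_test_and_set, GC_clear, and GC_allocate_lock mirror the header, but the real LOCK() falls back to the queued GC_lock() on contention where this demo simply spins, and the pthread test harness is our own addition, not part of the header.

/* Sketch of the header's spin-lock protocol; x86 + GCC assumed. */
#include <pthread.h>
#include <stdio.h>

static volatile unsigned int GC_allocate_lock = 0;

/* Atomically exchange 1 into *addr; returns the old value.        */
/* The "memory" clobber added by this patch makes the asm a        */
/* compiler barrier as well as an atomic operation, so GCC cannot  */
/* move or cache accesses to shared data across the acquire.       */
static inline int GC_test_and_set(volatile unsigned int *addr)
{
    int oldval;
    __asm__ __volatile__("xchgl %0, %1"
                         : "=r"(oldval), "=m"(*addr)
                         : "0"(1), "m"(*addr)
                         : "memory");
    return oldval;
}

/* Release: a compiler barrier followed by an ordinary store.  On  */
/* x86 a store is not reordered with earlier stores by hardware,   */
/* so no fence instruction is needed here.                         */
static inline void GC_clear(volatile unsigned int *addr)
{
    __asm__ __volatile__("" : : : "memory");
    *addr = 0;
}

#define LOCK()   { while (GC_test_and_set(&GC_allocate_lock)) {} }
#define UNLOCK() GC_clear(&GC_allocate_lock)

static long counter = 0;

static void *worker(void *arg)
{
    for (int i = 0; i < 100000; ++i) {
        LOCK();
        ++counter;              /* protected critical section */
        UNLOCK();
    }
    return arg;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, worker, NULL);
    pthread_create(&t2, NULL, worker, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    printf("counter = %ld (expect 200000)\n", counter);
    return 0;
}

Compiled with cc -std=c99 -pthread, the final count comes out exact; dropping the "memory" clobbers reintroduces exactly the kind of compiler reordering this patch guards against.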
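In the PARALLEL_MARK section, atomic fetch-and-add is built on top of compare-and-swap: GC_atomic_add rereads *addr and retries until GC_compare_and_exchange installs the incremented value, so a concurrent add by another thread is never lost. Below is a sketch of that retry loop, with GCC's __atomic_compare_exchange_n builtin standing in for the architecture-specific cmpxchg assembly (an assumption for portability; the header itself uses inline asm or the GENERIC_COMPARE_AND_SWAP fallback).

/* Sketch of GC_atomic_add's CAS retry loop; builtin CAS assumed. */
#include <stdio.h>

typedef unsigned long GC_word;
typedef int GC_bool;

/* Returns TRUE iff *addr still held old and was set to new_val. */
static GC_bool GC_compare_and_exchange(volatile GC_word *addr,
                                       GC_word old, GC_word new_val)
{
    return __atomic_compare_exchange_n((GC_word *)addr, &old, new_val,
                                       0 /* strong */,
                                       __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
}

/* Returns the original value of *addr.  On CAS failure the loop  */
/* simply rereads and retries with the fresh value.               */
static GC_word GC_atomic_add(volatile GC_word *addr, GC_word how_much)
{
    GC_word old;
    do {
        old = *addr;
    } while (!GC_compare_and_exchange(addr, old, old + how_much));
    return old;
}

int main(void)
{
    GC_word x = 40;
    GC_word prev = GC_atomic_add(&x, 2);
    printf("prev = %lu, now = %lu\n", prev, (unsigned long)x);  /* 40, 42 */
    return 0;
}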
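The patch also turns the GENERIC_COMPARE_AND_SWAP variant of GC_compare_and_exchange from a static inline into an extern declaration; the deleted lines above show the intended mutex-protected body. For reference, here is a self-contained version of that removed implementation (with TRUE/FALSE spelled as 1/0, since gc_priv.h is not included here):

#include <pthread.h>

typedef unsigned long GC_word;
typedef int GC_bool;

pthread_mutex_t GC_compare_and_swap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Slow but portable: serialize every CAS through one mutex.  As  */
/* the header's comment says, probably only useful for debugging, */
/* though it is also the path minimally used on PA_RISC.          */
GC_bool GC_compare_and_exchange(volatile GC_word *addr,
                                GC_word old, GC_word new_val)
{
    GC_bool result;

    pthread_mutex_lock(&GC_compare_and_swap_lock);
    if (*addr == old) {
        *addr = new_val;
        result = 1;
    } else {
        result = 0;
    }
    pthread_mutex_unlock(&GC_compare_and_swap_lock);
    return result;
}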