author      ian <ian@138bc75d-0d04-0410-961f-82ee72b054a4>    2013-11-06 19:49:01 +0000
committer   ian <ian@138bc75d-0d04-0410-961f-82ee72b054a4>    2013-11-06 19:49:01 +0000
commit      0ce10ea1348e9afd5d0eec6bca986bfe58bac5ac (patch)
tree        39530b071991b2326f881b2a30a2d82d6c133fd6 /libgo/runtime
parent      57a8bf1b0c6057ccbacb0cf79eb84d1985c2c1fe (diff)
download    gcc-0ce10ea1348e9afd5d0eec6bca986bfe58bac5ac.tar.gz
libgo: Update to October 24 version of master library.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@204466 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgo/runtime')
46 files changed, 2062 insertions, 1318 deletions
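The chan.c hunks that follow add a `pad` byte to Hchan and round the header size up to the element alignment (`n = ROUND(n, elem->__align)`) before allocating, so that the element buffer placed immediately after the header in the same allocation stays properly aligned. Below is a minimal standalone sketch of that layout technique in plain C, with illustrative names only — it is not libgo code, and `ROUND_UP` is merely an assumed equivalent of the runtime's `ROUND` macro.

```c
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

/* Round x up to a multiple of align (align must be a power of two),
   mirroring the ROUND() idiom used in the makechan hunk below. */
#define ROUND_UP(x, align) (((x) + (align) - 1) & ~((uintptr_t)(align) - 1))

/* Illustrative header followed by an inline element buffer, in the spirit
   of Hchan followed by its circular queue. Field names are made up. */
typedef struct {
    size_t   qcount;    /* elements currently queued */
    size_t   dataqsiz;  /* capacity of the inline buffer, in elements */
    uint16_t elemsize;
    uint8_t  elemalign;
    uint8_t  pad;       /* keeps the buffer that follows the header aligned */
} hdr_t;

static void *make_queue(size_t cap, size_t elemsize, size_t elemalign, char **buf)
{
    /* Round only the header size up to the element alignment, then allocate
       header and buffer in a single call. */
    size_t n = ROUND_UP(sizeof(hdr_t), elemalign);
    hdr_t *h = calloc(1, n + cap * elemsize);
    if (h == NULL)
        return NULL;
    h->dataqsiz = cap;
    h->elemsize = (uint16_t)elemsize;
    h->elemalign = (uint8_t)elemalign;
    *buf = (char *)h + n;           /* first element slot, properly aligned */
    return h;
}

int main(void)
{
    char *buf;
    hdr_t *h = make_queue(8, sizeof(double), _Alignof(double), &buf);
    printf("buffer offset %zu, aligned: %d\n",
           (size_t)(buf - (char *)h),
           ((uintptr_t)buf % _Alignof(double)) == 0);
    free(h);
    return 0;
}
```

Rounding only the header, rather than padding every element, keeps the allocation size at header-plus-`hint*elemsize`, which matches the `runtime_mallocgc(n + hint*elem->__size, ...)` call in the hunk below.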
diff --git a/libgo/runtime/chan.c b/libgo/runtime/chan.c index 6f52a1d5e31..1d9e6681d35 100644 --- a/libgo/runtime/chan.c +++ b/libgo/runtime/chan.c @@ -10,8 +10,6 @@ #define NOSELGEN 1 -static int32 debug = 0; - typedef struct WaitQ WaitQ; typedef struct SudoG SudoG; typedef struct Select Select; @@ -42,8 +40,9 @@ struct Hchan uintgo qcount; // total data in the q uintgo dataqsiz; // size of the circular q uint16 elemsize; - bool closed; uint8 elemalign; + uint8 pad; // ensures proper alignment of the buffer that follows Hchan in memory + bool closed; uintgo sendx; // send index uintgo recvx; // receive index WaitQ recvq; // list of recv waiters @@ -59,6 +58,8 @@ uint32 runtime_Hchansize = sizeof(Hchan); enum { + debug = 0, + // Scase.kind CaseRecv, CaseSend, @@ -105,17 +106,17 @@ runtime_makechan_c(ChanType *t, int64 hint) runtime_panicstring("makechan: size out of range"); n = sizeof(*c); + n = ROUND(n, elem->__align); // allocate memory in one call - c = (Hchan*)runtime_mal(n + hint*elem->__size); + c = (Hchan*)runtime_mallocgc(n + hint*elem->__size, (uintptr)t | TypeInfo_Chan, 0); c->elemsize = elem->__size; c->elemalign = elem->__align; c->dataqsiz = hint; - runtime_settype(c, (uintptr)t | TypeInfo_Chan); if(debug) - runtime_printf("makechan: chan=%p; elemsize=%D; elemalign=%d; dataqsiz=%D\n", - c, (int64)elem->__size, elem->__align, (int64)c->dataqsiz); + runtime_printf("makechan: chan=%p; elemsize=%D; dataqsiz=%D\n", + c, (int64)elem->__size, (int64)c->dataqsiz); return c; } @@ -185,7 +186,7 @@ runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) return; // not reached } - if(runtime_gcwaiting) + if(runtime_gcwaiting()) runtime_gosched(); if(debug) { @@ -200,7 +201,6 @@ runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) } runtime_lock(c); - // TODO(dvyukov): add similar instrumentation to select. if(raceenabled) runtime_racereadpc(c, pc, runtime_chansend); if(c->closed) @@ -311,7 +311,7 @@ runtime_chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *received int64 t0; G *g; - if(runtime_gcwaiting) + if(runtime_gcwaiting()) runtime_gosched(); if(debug) @@ -927,6 +927,7 @@ selectgo(Select **selp) { Select *sel; uint32 o, i, j, k; + int64 t0; Scase *cas, *dfl; Hchan *c; SudoG *sg; @@ -935,7 +936,7 @@ selectgo(Select **selp) G *g; sel = *selp; - if(runtime_gcwaiting) + if(runtime_gcwaiting()) runtime_gosched(); if(debug) @@ -943,6 +944,13 @@ selectgo(Select **selp) g = runtime_g(); + t0 = 0; + if(runtime_blockprofilerate > 0) { + t0 = runtime_cputicks(); + for(i=0; i<sel->ncase; i++) + sel->scase[i].sg.releasetime = -1; + } + // The compiler rewrites selects that statically have // only 0 or 1 cases plus default into simpler constructs. 
// The only way we can end up with such small sel->ncase @@ -1023,6 +1031,8 @@ loop: break; case CaseSend: + if(raceenabled) + runtime_racereadpc(c, runtime_selectgo, runtime_chansend); if(c->closed) goto sclose; if(c->dataqsiz > 0) { @@ -1124,6 +1134,8 @@ asyncrecv: if(sg != nil) { gp = sg->g; selunlock(sel); + if(sg->releasetime) + sg->releasetime = runtime_cputicks(); runtime_ready(gp); } else { selunlock(sel); @@ -1142,6 +1154,8 @@ asyncsend: if(sg != nil) { gp = sg->g; selunlock(sel); + if(sg->releasetime) + sg->releasetime = runtime_cputicks(); runtime_ready(gp); } else { selunlock(sel); @@ -1161,6 +1175,8 @@ syncrecv: runtime_memmove(cas->sg.elem, sg->elem, c->elemsize); gp = sg->g; gp->param = sg; + if(sg->releasetime) + sg->releasetime = runtime_cputicks(); runtime_ready(gp); goto retc; @@ -1186,11 +1202,15 @@ syncsend: runtime_memmove(sg->elem, cas->sg.elem, c->elemsize); gp = sg->g; gp->param = sg; + if(sg->releasetime) + sg->releasetime = runtime_cputicks(); runtime_ready(gp); retc: // return index corresponding to chosen case index = cas->index; + if(cas->sg.releasetime > 0) + runtime_blockevent(cas->sg.releasetime - t0, 2); runtime_free(sel); return index; @@ -1297,17 +1317,36 @@ reflect_rselect(Slice cases) return ret; } +static void closechan(Hchan *c, void *pc); + // closechan(sel *byte); void runtime_closechan(Hchan *c) { + closechan(c, runtime_getcallerpc(&c)); +} + +// For reflect +// func chanclose(c chan) + +void reflect_chanclose(uintptr) __asm__ (GOSYM_PREFIX "reflect.chanclose"); + +void +reflect_chanclose(uintptr c) +{ + closechan((Hchan*)c, runtime_getcallerpc(&c)); +} + +static void +closechan(Hchan *c, void *pc) +{ SudoG *sg; G* gp; if(c == nil) runtime_panicstring("close of nil channel"); - if(runtime_gcwaiting) + if(runtime_gcwaiting()) runtime_gosched(); runtime_lock(c); @@ -1317,7 +1356,7 @@ runtime_closechan(Hchan *c) } if(raceenabled) { - runtime_racewritepc(c, runtime_getcallerpc(&c), runtime_closechan); + runtime_racewritepc(c, pc, runtime_closechan); runtime_racerelease(c); } @@ -1330,6 +1369,8 @@ runtime_closechan(Hchan *c) break; gp = sg->g; gp->param = nil; + if(sg->releasetime) + sg->releasetime = runtime_cputicks(); runtime_ready(gp); } @@ -1340,6 +1381,8 @@ runtime_closechan(Hchan *c) break; gp = sg->g; gp->param = nil; + if(sg->releasetime) + sg->releasetime = runtime_cputicks(); runtime_ready(gp); } @@ -1353,17 +1396,6 @@ __go_builtin_close(Hchan *c) } // For reflect -// func chanclose(c chan) - -void reflect_chanclose(uintptr) __asm__ (GOSYM_PREFIX "reflect.chanclose"); - -void -reflect_chanclose(uintptr c) -{ - runtime_closechan((Hchan*)c); -} - -// For reflect // func chanlen(c chan) (len int) intgo reflect_chanlen(uintptr) __asm__ (GOSYM_PREFIX "reflect.chanlen"); diff --git a/libgo/runtime/cpuprof.c b/libgo/runtime/cpuprof.c index 516387396ea..a2a1a05ce3d 100644 --- a/libgo/runtime/cpuprof.c +++ b/libgo/runtime/cpuprof.c @@ -146,7 +146,7 @@ runtime_SetCPUProfileRate(intgo hz) runtime_lock(&lk); if(hz > 0) { if(prof == nil) { - prof = runtime_SysAlloc(sizeof *prof); + prof = runtime_SysAlloc(sizeof *prof, &mstats.other_sys); if(prof == nil) { runtime_printf("runtime: cpu profiling cannot allocate memory\n"); runtime_unlock(&lk); @@ -340,7 +340,7 @@ getprofile(Profile *p) if(p->wholding) { // Release previous log to signal handling side. - // Loop because we are racing against setprofile(off). + // Loop because we are racing against SetCPUProfileRate(0). 
for(;;) { n = p->handoff; if(n == 0) { @@ -367,9 +367,7 @@ getprofile(Profile *p) return ret; // Wait for new log. - runtime_entersyscallblock(); - runtime_notesleep(&p->wait); - runtime_exitsyscall(); + runtime_notetsleepg(&p->wait, -1); runtime_noteclear(&p->wait); n = p->handoff; diff --git a/libgo/runtime/env_posix.c b/libgo/runtime/env_posix.c index 7f3fa0d8e0f..3219550af99 100644 --- a/libgo/runtime/env_posix.c +++ b/libgo/runtime/env_posix.c @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build darwin freebsd linux netbsd openbsd windows +// +build darwin dragonfly freebsd linux netbsd openbsd windows #include "runtime.h" #include "array.h" @@ -12,7 +12,8 @@ extern Slice syscall_Envs __asm__ (GOSYM_PREFIX "syscall.Envs"); const byte* runtime_getenv(const char *s) { - int32 i, j, len; + int32 i, j; + intgo len; const byte *v, *bs; String* envv; int32 envc; diff --git a/libgo/runtime/go-byte-array-to-string.c b/libgo/runtime/go-byte-array-to-string.c index 0cd63c76d8d..088b78690fe 100644 --- a/libgo/runtime/go-byte-array-to-string.c +++ b/libgo/runtime/go-byte-array-to-string.c @@ -16,7 +16,7 @@ __go_byte_array_to_string (const void* p, intgo len) String ret; bytes = (const unsigned char *) p; - retdata = runtime_mallocgc ((uintptr) len, FlagNoPointers, 1, 0); + retdata = runtime_mallocgc ((uintptr) len, 0, FlagNoScan); __builtin_memcpy (retdata, bytes, len); ret.str = retdata; ret.len = len; diff --git a/libgo/runtime/go-caller.c b/libgo/runtime/go-caller.c index d84580fa594..8ca3c7efcd7 100644 --- a/libgo/runtime/go-caller.c +++ b/libgo/runtime/go-caller.c @@ -228,3 +228,23 @@ runtime_funcline_go (Func *f __attribute__((unused)), uintptr targetpc) runtime_memclr (&ret, sizeof ret); return ret; } + +/* Return the name of a function. */ +String runtime_funcname_go (Func *f) + __asm__ (GOSYM_PREFIX "runtime.funcname_go"); + +String +runtime_funcname_go (Func *f) +{ + return f->name; +} + +/* Return the entry point of a function. 
*/ +uintptr runtime_funcentry_go(Func *f) + __asm__ (GOSYM_PREFIX "runtime.funcentry_go"); + +uintptr +runtime_funcentry_go (Func *f) +{ + return f->entry; +} diff --git a/libgo/runtime/go-int-array-to-string.c b/libgo/runtime/go-int-array-to-string.c index 6cae2fd8ccb..d93fe651d95 100644 --- a/libgo/runtime/go-int-array-to-string.c +++ b/libgo/runtime/go-int-array-to-string.c @@ -41,7 +41,7 @@ __go_int_array_to_string (const void* p, intgo len) slen += 4; } - retdata = runtime_mallocgc ((uintptr) slen, FlagNoPointers, 1, 0); + retdata = runtime_mallocgc ((uintptr) slen, 0, FlagNoScan); ret.str = retdata; ret.len = slen; diff --git a/libgo/runtime/go-int-to-string.c b/libgo/runtime/go-int-to-string.c index eb441674b6c..d90b1ddfed1 100644 --- a/libgo/runtime/go-int-to-string.c +++ b/libgo/runtime/go-int-to-string.c @@ -60,7 +60,7 @@ __go_int_to_string (intgo v) } } - retdata = runtime_mallocgc (len, FlagNoPointers, 1, 0); + retdata = runtime_mallocgc (len, 0, FlagNoScan); __builtin_memcpy (retdata, buf, len); ret.str = retdata; ret.len = len; diff --git a/libgo/runtime/go-make-slice.c b/libgo/runtime/go-make-slice.c index f08cb012dc8..855bb17ce59 100644 --- a/libgo/runtime/go-make-slice.c +++ b/libgo/runtime/go-make-slice.c @@ -55,15 +55,15 @@ __go_make_slice2 (const struct __go_type_descriptor *td, uintptr_t len, if (size == 0) ret.__values = &runtime_zerobase; else if ((std->__element_type->__code & GO_NO_POINTERS) != 0) - ret.__values = runtime_mallocgc (size, FlagNoPointers, 1, 1); + ret.__values = + runtime_mallocgc (size, + (uintptr) std->__element_type | TypeInfo_Array, + FlagNoScan); else - { - ret.__values = runtime_mallocgc (size, 0, 1, 1); - - if (UseSpanType) - runtime_settype (ret.__values, - (uintptr) std->__element_type | TypeInfo_Array); - } + ret.__values = + runtime_mallocgc (size, + (uintptr) std->__element_type | TypeInfo_Array, + 0); return ret; } diff --git a/libgo/runtime/go-new.c b/libgo/runtime/go-new.c index b1af5f22473..9d46706eaa4 100644 --- a/libgo/runtime/go-new.c +++ b/libgo/runtime/go-new.c @@ -12,11 +12,11 @@ void * __go_new (uintptr_t size) { - return runtime_mallocgc (size, 0, 1, 1); + return runtime_mallocgc (size, 0, 0); } void * __go_new_nopointers (uintptr_t size) { - return runtime_mallocgc (size, FlagNoPointers, 1, 1); + return runtime_mallocgc (size, 0, FlagNoScan); } diff --git a/libgo/runtime/go-reflect-call.c b/libgo/runtime/go-reflect-call.c index 5cf370798bf..0fed68a50e7 100644 --- a/libgo/runtime/go-reflect-call.c +++ b/libgo/runtime/go-reflect-call.c @@ -271,7 +271,21 @@ go_func_return_ffi (const struct __go_func_type *func) types = (const struct __go_type_descriptor **) func->__out.__values; if (count == 1) - return go_type_to_ffi (types[0]); + { + +#if defined (__i386__) && !defined (__x86_64__) + /* FFI does not support complex types. On 32-bit x86, a + complex64 will be returned in %eax/%edx. We normally tell + FFI that a complex64 is a struct of two floats. On 32-bit + x86 a struct of two floats is returned via a hidden first + pointer parameter. Fortunately we can make everything work + by pretending that complex64 is int64. 
*/ + if ((types[0]->__code & GO_CODE_MASK) == GO_COMPLEX64) + return &ffi_type_sint64; +#endif + + return go_type_to_ffi (types[0]); + } ret = (ffi_type *) __go_alloc (sizeof (ffi_type)); ret->type = FFI_TYPE_STRUCT; diff --git a/libgo/runtime/go-signal.c b/libgo/runtime/go-signal.c index 23a94db4157..4f0dcc78c17 100644 --- a/libgo/runtime/go-signal.c +++ b/libgo/runtime/go-signal.c @@ -139,22 +139,6 @@ SigTab runtime_sigtab[] = { #undef P #undef D - -static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; - -static void -runtime_badsignal(int32 sig) -{ - // Avoid -D_FORTIFY_SOURCE problems. - int rv __attribute__((unused)); - - if (sig == SIGPROF) { - return; // Ignore SIGPROFs intended for a non-Go thread. - } - rv = runtime_write(2, badsignal, sizeof badsignal - 1); - runtime_exit(1); -} - /* Handle a signal, for cases where we don't panic. We can split the stack here. */ diff --git a/libgo/runtime/go-string-to-byte-array.c b/libgo/runtime/go-string-to-byte-array.c index 75fac1dbfe6..5e030330f29 100644 --- a/libgo/runtime/go-string-to-byte-array.c +++ b/libgo/runtime/go-string-to-byte-array.c @@ -15,7 +15,8 @@ __go_string_to_byte_array (String str) unsigned char *data; struct __go_open_array ret; - data = (unsigned char *) runtime_mallocgc (str.len, FlagNoPointers, 1, 0); + data = (unsigned char *) runtime_mallocgc (str.len, 0, + FlagNoScan | FlagNoZero); __builtin_memcpy (data, str.str, str.len); ret.__values = (void *) data; ret.__count = str.len; diff --git a/libgo/runtime/go-string-to-int-array.c b/libgo/runtime/go-string-to-int-array.c index 16970bdd042..d91c9e2df82 100644 --- a/libgo/runtime/go-string-to-int-array.c +++ b/libgo/runtime/go-string-to-int-array.c @@ -32,8 +32,8 @@ __go_string_to_int_array (String str) p += __go_get_rune (p, pend - p, &rune); } - data = (uint32_t *) runtime_mallocgc (c * sizeof (uint32_t), FlagNoPointers, - 1, 0); + data = (uint32_t *) runtime_mallocgc (c * sizeof (uint32_t), 0, + FlagNoScan | FlagNoZero); p = str.str; pd = data; while (p < pend) diff --git a/libgo/runtime/go-strplus.c b/libgo/runtime/go-strplus.c index d6e6df67fce..13915e3e673 100644 --- a/libgo/runtime/go-strplus.c +++ b/libgo/runtime/go-strplus.c @@ -21,7 +21,7 @@ __go_string_plus (String s1, String s2) return s1; len = s1.len + s2.len; - retdata = runtime_mallocgc (len, FlagNoPointers, 1, 0); + retdata = runtime_mallocgc (len, 0, FlagNoScan | FlagNoZero); __builtin_memcpy (retdata, s1.str, s1.len); __builtin_memcpy (retdata + s1.len, s2.str, s2.len); ret.str = retdata; diff --git a/libgo/runtime/lfstack.c b/libgo/runtime/lfstack.c index 230ed87c43f..132783c3644 100644 --- a/libgo/runtime/lfstack.c +++ b/libgo/runtime/lfstack.c @@ -41,10 +41,10 @@ runtime_lfstackpush(uint64 *head, LFNode *node) node->pushcnt++; new = (uint64)(uintptr)node|(((uint64)node->pushcnt&CNT_MASK)<<PTR_BITS); - old = runtime_atomicload64(head); for(;;) { + old = runtime_atomicload64(head); node->next = (LFNode*)(uintptr)(old&PTR_MASK); - if(runtime_cas64(head, &old, new)) + if(runtime_cas64(head, old, new)) break; } } @@ -55,8 +55,8 @@ runtime_lfstackpop(uint64 *head) LFNode *node, *node2; uint64 old, new; - old = runtime_atomicload64(head); for(;;) { + old = runtime_atomicload64(head); if(old == 0) return nil; node = (LFNode*)(uintptr)(old&PTR_MASK); @@ -64,7 +64,7 @@ runtime_lfstackpop(uint64 *head) new = 0; if(node2 != nil) new = (uint64)(uintptr)node2|(((uint64)node2->pushcnt&CNT_MASK)<<PTR_BITS); - if(runtime_cas64(head, &old, new)) + if(runtime_cas64(head, old, new)) 
return node; } } diff --git a/libgo/runtime/lock_futex.c b/libgo/runtime/lock_futex.c index 4b9651a75de..fa270132895 100644 --- a/libgo/runtime/lock_futex.c +++ b/libgo/runtime/lock_futex.c @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build freebsd linux +// +build dragonfly freebsd linux #include "runtime.h" @@ -91,14 +91,14 @@ runtime_unlock(Lock *l) { uint32 v; - if(--runtime_m()->locks < 0) - runtime_throw("runtime_unlock: lock count"); - v = runtime_xchg((uint32*)&l->key, MUTEX_UNLOCKED); if(v == MUTEX_UNLOCKED) runtime_throw("unlock of unlocked lock"); if(v == MUTEX_SLEEPING) runtime_futexwakeup((uint32*)&l->key, 1); + + if(--runtime_m()->locks < 0) + runtime_throw("runtime_unlock: lock count"); } // One-time notifications. @@ -111,37 +111,45 @@ runtime_noteclear(Note *n) void runtime_notewakeup(Note *n) { - if(runtime_xchg((uint32*)&n->key, 1)) + uint32 old; + + old = runtime_xchg((uint32*)&n->key, 1); + if(old != 0) { + runtime_printf("notewakeup - double wakeup (%d)\n", old); runtime_throw("notewakeup - double wakeup"); + } runtime_futexwakeup((uint32*)&n->key, 1); } void runtime_notesleep(Note *n) { - if(runtime_m()->profilehz > 0) - runtime_setprof(false); + /* For gccgo it's OK to sleep in non-g0, and it happens in + stoptheworld because we have not implemented preemption. + + if(runtime_g() != runtime_m()->g0) + runtime_throw("notesleep not on g0"); + */ while(runtime_atomicload((uint32*)&n->key) == 0) runtime_futexsleep((uint32*)&n->key, 0, -1); - if(runtime_m()->profilehz > 0) - runtime_setprof(true); } -void -runtime_notetsleep(Note *n, int64 ns) +static bool +notetsleep(Note *n, int64 ns, int64 deadline, int64 now) { - int64 deadline, now; + // Conceptually, deadline and now are local variables. + // They are passed as arguments so that the space for them + // does not count against our nosplit stack sequence. 
if(ns < 0) { - runtime_notesleep(n); - return; + while(runtime_atomicload((uint32*)&n->key) == 0) + runtime_futexsleep((uint32*)&n->key, 0, -1); + return true; } if(runtime_atomicload((uint32*)&n->key) != 0) - return; + return true; - if(runtime_m()->profilehz > 0) - runtime_setprof(false); deadline = runtime_nanotime() + ns; for(;;) { runtime_futexsleep((uint32*)&n->key, 0, ns); @@ -152,6 +160,33 @@ runtime_notetsleep(Note *n, int64 ns) break; ns = deadline - now; } - if(runtime_m()->profilehz > 0) - runtime_setprof(true); + return runtime_atomicload((uint32*)&n->key) != 0; +} + +bool +runtime_notetsleep(Note *n, int64 ns) +{ + bool res; + + if(runtime_g() != runtime_m()->g0 && !runtime_m()->gcing) + runtime_throw("notetsleep not on g0"); + + res = notetsleep(n, ns, 0, 0); + return res; +} + +// same as runtime_notetsleep, but called on user g (not g0) +// calls only nosplit functions between entersyscallblock/exitsyscall +bool +runtime_notetsleepg(Note *n, int64 ns) +{ + bool res; + + if(runtime_g() == runtime_m()->g0) + runtime_throw("notetsleepg on g0"); + + runtime_entersyscallblock(); + res = notetsleep(n, ns, 0, 0); + runtime_exitsyscall(); + return res; } diff --git a/libgo/runtime/lock_sema.c b/libgo/runtime/lock_sema.c index 2663c5463de..ce435119323 100644 --- a/libgo/runtime/lock_sema.c +++ b/libgo/runtime/lock_sema.c @@ -95,9 +95,6 @@ runtime_unlock(Lock *l) uintptr v; M *mp; - if(--runtime_m()->locks < 0) - runtime_throw("runtime_unlock: lock count"); - for(;;) { v = (uintptr)runtime_atomicloadp((void**)&l->key); if(v == LOCKED) { @@ -114,6 +111,9 @@ runtime_unlock(Lock *l) } } } + + if(--runtime_m()->locks < 0) + runtime_throw("runtime_unlock: lock count"); } // One-time notifications. @@ -151,6 +151,10 @@ runtime_notesleep(Note *n) M *m; m = runtime_m(); + + if(runtime_g() != m->g0) + runtime_throw("notesleep not on g0"); + if(m->waitsema == 0) m->waitsema = runtime_semacreate(); if(!runtime_casp((void**)&n->key, nil, m)) { // must be LOCKED (got wakeup) @@ -159,61 +163,49 @@ runtime_notesleep(Note *n) return; } // Queued. Sleep. - if(m->profilehz > 0) - runtime_setprof(false); runtime_semasleep(-1); - if(m->profilehz > 0) - runtime_setprof(true); } -void -runtime_notetsleep(Note *n, int64 ns) +static bool +notetsleep(Note *n, int64 ns, int64 deadline, M *mp) { M *m; - M *mp; - int64 deadline, now; - - if(ns < 0) { - runtime_notesleep(n); - return; - } m = runtime_m(); - if(m->waitsema == 0) - m->waitsema = runtime_semacreate(); + + // Conceptually, deadline and mp are local variables. + // They are passed as arguments so that the space for them + // does not count against our nosplit stack sequence. // Register for wakeup on n->waitm. if(!runtime_casp((void**)&n->key, nil, m)) { // must be LOCKED (got wakeup already) if(n->key != LOCKED) runtime_throw("notetsleep - waitm out of sync"); - return; + return true; + } + + if(ns < 0) { + // Queued. Sleep. + runtime_semasleep(-1); + return true; } - if(m->profilehz > 0) - runtime_setprof(false); deadline = runtime_nanotime() + ns; for(;;) { // Registered. Sleep. if(runtime_semasleep(ns) >= 0) { // Acquired semaphore, semawakeup unregistered us. // Done. - if(m->profilehz > 0) - runtime_setprof(true); - return; + return true; } // Interrupted or timed out. Still registered. Semaphore not acquired. - now = runtime_nanotime(); - if(now >= deadline) + ns = deadline - runtime_nanotime(); + if(ns <= 0) break; - // Deadline hasn't arrived. Keep sleeping. 
- ns = deadline - now; } - if(m->profilehz > 0) - runtime_setprof(true); - // Deadline arrived. Still registered. Semaphore not acquired. // Want to give up and return, but have to unregister first, // so that any notewakeup racing with the return does not @@ -223,15 +215,54 @@ runtime_notetsleep(Note *n, int64 ns) if(mp == m) { // No wakeup yet; unregister if possible. if(runtime_casp((void**)&n->key, mp, nil)) - return; + return false; } else if(mp == (M*)LOCKED) { // Wakeup happened so semaphore is available. // Grab it to avoid getting out of sync. if(runtime_semasleep(-1) < 0) runtime_throw("runtime: unable to acquire - semaphore out of sync"); - return; - } else { + return true; + } else runtime_throw("runtime: unexpected waitm - semaphore out of sync"); - } } } + +bool +runtime_notetsleep(Note *n, int64 ns) +{ + M *m; + bool res; + + m = runtime_m(); + + if(runtime_g() != m->g0 && !m->gcing) + runtime_throw("notetsleep not on g0"); + + if(m->waitsema == 0) + m->waitsema = runtime_semacreate(); + + res = notetsleep(n, ns, 0, nil); + return res; +} + +// same as runtime_notetsleep, but called on user g (not g0) +// calls only nosplit functions between entersyscallblock/exitsyscall +bool +runtime_notetsleepg(Note *n, int64 ns) +{ + M *m; + bool res; + + m = runtime_m(); + + if(runtime_g() == m->g0) + runtime_throw("notetsleepg on g0"); + + if(m->waitsema == 0) + m->waitsema = runtime_semacreate(); + + runtime_entersyscallblock(); + res = notetsleep(n, ns, 0, nil); + runtime_exitsyscall(); + return res; +} diff --git a/libgo/runtime/malloc.goc b/libgo/runtime/malloc.goc index 8ccaa6b888c..d349f4749fa 100644 --- a/libgo/runtime/malloc.goc +++ b/libgo/runtime/malloc.goc @@ -18,7 +18,17 @@ package runtime #include "go-type.h" #include "race.h" -MHeap *runtime_mheap; +// Map gccgo field names to gc field names. +// Eface aka __go_empty_interface. +#define type __type_descriptor +// Type aka __go_type_descriptor +#define kind __code +#define string __reflection +#define KindPtr GO_PTR +#define KindNoPointers GO_NO_POINTERS + +// Mark mheap as 'no pointers', it does not contain interesting pointers but occupies ~45K. +MHeap runtime_mheap; int32 runtime_checking; @@ -30,19 +40,28 @@ extern volatile intgo runtime_MemProfileRate // Allocate an object of at least size bytes. // Small objects are allocated from the per-thread cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. +// If the block will be freed with runtime_free(), typ must be 0. void* -runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) +runtime_mallocgc(uintptr size, uintptr typ, uint32 flag) { M *m; G *g; int32 sizeclass; intgo rate; MCache *c; + MCacheList *l; uintptr npages; MSpan *s; - void *v; + MLink *v; bool incallback; + if(size == 0) { + // All 0-length allocations use this pointer. + // The language does not require the allocations to + // have distinct values. + return &runtime_zerobase; + } + m = runtime_m(); g = runtime_g(); @@ -56,34 +75,45 @@ runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) runtime_exitsyscall(); m = runtime_m(); incallback = true; - dogc = false; + flag |= FlagNoGC; } - if(runtime_gcwaiting && g != m->g0 && m->locks == 0 && dogc) { + if(runtime_gcwaiting() && g != m->g0 && m->locks == 0 && !(flag & FlagNoGC)) { runtime_gosched(); m = runtime_m(); } if(m->mallocing) runtime_throw("malloc/free - deadlock"); + // Disable preemption during settype_flush. + // We can not use m->mallocing for this, because settype_flush calls mallocgc. 
+ m->locks++; m->mallocing = 1; - if(size == 0) - size = 1; if(DebugTypeAtBlockEnd) size += sizeof(uintptr); c = m->mcache; - c->local_nmalloc++; if(size <= MaxSmallSize) { // Allocate from mcache free lists. - sizeclass = runtime_SizeToClass(size); + // Inlined version of SizeToClass(). + if(size <= 1024-8) + sizeclass = runtime_size_to_class8[(size+7)>>3]; + else + sizeclass = runtime_size_to_class128[(size-1024+127) >> 7]; size = runtime_class_to_size[sizeclass]; - v = runtime_MCache_Alloc(c, sizeclass, size, zeroed); - if(v == nil) - runtime_throw("out of memory"); - c->local_alloc += size; - c->local_total_alloc += size; - c->local_by_size[sizeclass].nmalloc++; + l = &c->list[sizeclass]; + if(l->list == nil) + runtime_MCache_Refill(c, sizeclass); + v = l->list; + l->list = v->next; + l->nlist--; + if(!(flag & FlagNoZero)) { + v->next = nil; + // block is zeroed iff second word is zero ... + if(size > sizeof(uintptr) && ((uintptr*)v)[1] != 0) + runtime_memclr((byte*)v, size); + } + c->local_cachealloc += size; } else { // TODO(rsc): Report tracebacks for very large allocations. @@ -91,32 +121,39 @@ runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) npages = size >> PageShift; if((size & PageMask) != 0) npages++; - s = runtime_MHeap_Alloc(runtime_mheap, npages, 0, 1, zeroed); + s = runtime_MHeap_Alloc(&runtime_mheap, npages, 0, 1, !(flag & FlagNoZero)); if(s == nil) runtime_throw("out of memory"); + s->limit = (byte*)(s->start<<PageShift) + size; size = npages<<PageShift; - c->local_alloc += size; - c->local_total_alloc += size; v = (void*)(s->start << PageShift); // setup for mark sweep runtime_markspan(v, 0, 0, true); } - if (sizeof(void*) == 4 && c->local_total_alloc >= (1<<30)) { - // purge cache stats to prevent overflow - runtime_lock(runtime_mheap); - runtime_purgecachedstats(c); - runtime_unlock(runtime_mheap); - } - if(!(flag & FlagNoGC)) - runtime_markallocated(v, size, (flag&FlagNoPointers) != 0); + runtime_markallocated(v, size, (flag&FlagNoScan) != 0); if(DebugTypeAtBlockEnd) - *(uintptr*)((uintptr)v+size-sizeof(uintptr)) = 0; + *(uintptr*)((uintptr)v+size-sizeof(uintptr)) = typ; + + // TODO: save type even if FlagNoScan? Potentially expensive but might help + // heap profiling/tracing. + if(UseSpanType && !(flag & FlagNoScan) && typ != 0) { + uintptr *buf, i; + + buf = m->settype_buf; + i = m->settype_bufsize; + buf[i++] = (uintptr)v; + buf[i++] = typ; + m->settype_bufsize = i; + } m->mallocing = 0; + if(UseSpanType && !(flag & FlagNoScan) && typ != 0 && m->settype_bufsize == nelem(m->settype_buf)) + runtime_settype_flush(m); + m->locks--; if(!(flag & FlagNoProfiling) && (rate = runtime_MemProfileRate) > 0) { if(size >= (uint32) rate) @@ -135,13 +172,11 @@ runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) } } - if(dogc && mstats.heap_alloc >= mstats.next_gc) + if(!(flag & FlagNoInvokeGC) && mstats.heap_alloc >= mstats.next_gc) runtime_gc(0); - if(raceenabled) { - runtime_racemalloc(v, size, m->racepc); - m->racepc = nil; - } + if(raceenabled) + runtime_racemalloc(v, size); if(incallback) runtime_entersyscall(); @@ -152,7 +187,7 @@ runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) void* __go_alloc(uintptr size) { - return runtime_mallocgc(size, 0, 0, 1); + return runtime_mallocgc(size, 0, FlagNoInvokeGC); } // Free the object whose base pointer is v. @@ -197,7 +232,9 @@ __go_free(void *v) // they might coalesce v into other spans and change the bitmap further. 
runtime_markfreed(v, size); runtime_unmarkspan(v, 1<<PageShift); - runtime_MHeap_Free(runtime_mheap, s, 1); + runtime_MHeap_Free(&runtime_mheap, s, 1); + c->local_nlargefree++; + c->local_largefree += size; } else { // Small object. size = runtime_class_to_size[sizeclass]; @@ -207,11 +244,9 @@ __go_free(void *v) // it might coalesce v and other blocks into a bigger span // and change the bitmap further. runtime_markfreed(v, size); - c->local_by_size[sizeclass].nfree++; + c->local_nsmallfree[sizeclass]++; runtime_MCache_Free(c, v, sizeclass, size); } - c->local_nfree++; - c->local_alloc -= size; if(prof) runtime_MProf_Free(v, size); m->mallocing = 0; @@ -230,12 +265,12 @@ runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **sp) m->mcache->local_nlookup++; if (sizeof(void*) == 4 && m->mcache->local_nlookup >= (1<<30)) { // purge cache stats to prevent overflow - runtime_lock(runtime_mheap); + runtime_lock(&runtime_mheap); runtime_purgecachedstats(m->mcache); - runtime_unlock(runtime_mheap); + runtime_unlock(&runtime_mheap); } - s = runtime_MHeap_LookupMaybe(runtime_mheap, v); + s = runtime_MHeap_LookupMaybe(&runtime_mheap, v); if(sp) *sp = s; if(s == nil) { @@ -257,11 +292,6 @@ runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **sp) return 1; } - if((byte*)v >= (byte*)s->limit) { - // pointers past the last block do not count as pointers. - return 0; - } - n = s->elemsize; if(base) { i = ((byte*)v - p)/n; @@ -279,11 +309,9 @@ runtime_allocmcache(void) intgo rate; MCache *c; - runtime_lock(runtime_mheap); - c = runtime_FixAlloc_Alloc(&runtime_mheap->cachealloc); - mstats.mcache_inuse = runtime_mheap->cachealloc.inuse; - mstats.mcache_sys = runtime_mheap->cachealloc.sys; - runtime_unlock(runtime_mheap); + runtime_lock(&runtime_mheap); + c = runtime_FixAlloc_Alloc(&runtime_mheap.cachealloc); + runtime_unlock(&runtime_mheap); runtime_memclr((byte*)c, sizeof(*c)); // Set first allocation sample size. @@ -300,30 +328,32 @@ void runtime_freemcache(MCache *c) { runtime_MCache_ReleaseAll(c); - runtime_lock(runtime_mheap); + runtime_lock(&runtime_mheap); runtime_purgecachedstats(c); - runtime_FixAlloc_Free(&runtime_mheap->cachealloc, c); - runtime_unlock(runtime_mheap); + runtime_FixAlloc_Free(&runtime_mheap.cachealloc, c); + runtime_unlock(&runtime_mheap); } void runtime_purgecachedstats(MCache *c) { + MHeap *h; + int32 i; + // Protected by either heap or GC lock. 
+ h = &runtime_mheap; mstats.heap_alloc += c->local_cachealloc; c->local_cachealloc = 0; - mstats.heap_objects += c->local_objects; - c->local_objects = 0; - mstats.nmalloc += c->local_nmalloc; - c->local_nmalloc = 0; - mstats.nfree += c->local_nfree; - c->local_nfree = 0; mstats.nlookup += c->local_nlookup; c->local_nlookup = 0; - mstats.alloc += c->local_alloc; - c->local_alloc= 0; - mstats.total_alloc += c->local_total_alloc; - c->local_total_alloc= 0; + h->largefree += c->local_largefree; + c->local_largefree = 0; + h->nlargefree += c->local_nlargefree; + c->local_nlargefree = 0; + for(i=0; i<(int32)nelem(c->local_nsmallfree); i++) { + h->nsmallfree[i] += c->local_nsmallfree[i]; + c->local_nsmallfree[i] = 0; + } } extern uintptr runtime_sizeof_C_MStats @@ -335,24 +365,24 @@ void runtime_mallocinit(void) { byte *p; - uintptr arena_size, bitmap_size; + uintptr arena_size, bitmap_size, spans_size; extern byte _end[]; byte *want; uintptr limit; + uint64 i; runtime_sizeof_C_MStats = sizeof(MStats); p = nil; arena_size = 0; bitmap_size = 0; - + spans_size = 0; + // for 64-bit build USED(p); USED(arena_size); USED(bitmap_size); - - if((runtime_mheap = runtime_SysAlloc(sizeof(*runtime_mheap))) == nil) - runtime_throw("runtime: cannot allocate heap metadata"); + USED(spans_size); runtime_InitSizes(); @@ -369,15 +399,17 @@ runtime_mallocinit(void) // 128 GB (MaxMem) should be big enough for now. // // The code will work with the reservation at any address, but ask - // SysReserve to use 0x000000c000000000 if possible. + // SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f). // Allocating a 128 GB region takes away 37 bits, and the amd64 // doesn't let us choose the top 17 bits, so that leaves the 11 bits // in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means - // that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x0x00df. + // that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df. // In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid // UTF-8 sequences, and they are otherwise as far away from - // ff (likely a common byte) as possible. An earlier attempt to use 0x11f8 - // caused out of memory errors on OS X during thread allocations. + // ff (likely a common byte) as possible. If that fails, we try other 0xXXc0 + // addresses. An earlier attempt to use 0x11f8 caused out of memory errors + // on OS X during thread allocations. 0x00c0 causes conflicts with + // AddressSanitizer which reserves all memory up to 0x0100. // These choices are both for debuggability and to reduce the // odds of the conservative garbage collector not collecting memory // because some non-pointer block of memory had a bit pattern @@ -389,7 +421,14 @@ runtime_mallocinit(void) // If this fails we fall back to the 32 bit memory mechanism arena_size = MaxMem; bitmap_size = arena_size / (sizeof(void*)*8/4); - p = runtime_SysReserve((void*)(0x00c0ULL<<32), bitmap_size + arena_size); + spans_size = arena_size / PageSize * sizeof(runtime_mheap.spans[0]); + spans_size = ROUND(spans_size, PageSize); + for(i = 0; i <= 0x7f; i++) { + p = (void*)(uintptr)(i<<40 | 0x00c0ULL<<32); + p = runtime_SysReserve(p, bitmap_size + spans_size + arena_size); + if(p != nil) + break; + } } if (p == nil) { // On a 32-bit machine, we can't typically get away @@ -411,11 +450,14 @@ runtime_mallocinit(void) // of address space, which is probably too much in a 32-bit world. 
bitmap_size = MaxArena32 / (sizeof(void*)*8/4); arena_size = 512<<20; - if(limit > 0 && arena_size+bitmap_size > limit) { + spans_size = MaxArena32 / PageSize * sizeof(runtime_mheap.spans[0]); + if(limit > 0 && arena_size+bitmap_size+spans_size > limit) { bitmap_size = (limit / 9) & ~((1<<PageShift) - 1); arena_size = bitmap_size * 8; + spans_size = arena_size / PageSize * sizeof(runtime_mheap.spans[0]); } - + spans_size = ROUND(spans_size, PageSize); + // SysReserve treats the address we ask for, end, as a hint, // not as an absolute requirement. If we ask for the end // of the data segment but the operating system requires @@ -425,25 +467,27 @@ runtime_mallocinit(void) // So adjust it upward a little bit ourselves: 1/4 MB to get // away from the running binary image and then round up // to a MB boundary. - want = (byte*)(((uintptr)_end + (1<<18) + (1<<20) - 1)&~((1<<20)-1)); - if(0xffffffff - (uintptr)want <= bitmap_size + arena_size) + want = (byte*)ROUND((uintptr)_end + (1<<18), 1<<20); + if(0xffffffff - (uintptr)want <= bitmap_size + spans_size + arena_size) want = 0; - p = runtime_SysReserve(want, bitmap_size + arena_size); + p = runtime_SysReserve(want, bitmap_size + spans_size + arena_size); if(p == nil) runtime_throw("runtime: cannot reserve arena virtual address space"); if((uintptr)p & (((uintptr)1<<PageShift)-1)) - runtime_printf("runtime: SysReserve returned unaligned address %p; asked for %p", p, bitmap_size+arena_size); + runtime_printf("runtime: SysReserve returned unaligned address %p; asked for %p", p, + bitmap_size+spans_size+arena_size); } if((uintptr)p & (((uintptr)1<<PageShift)-1)) runtime_throw("runtime: SysReserve returned unaligned address"); - runtime_mheap->bitmap = p; - runtime_mheap->arena_start = p + bitmap_size; - runtime_mheap->arena_used = runtime_mheap->arena_start; - runtime_mheap->arena_end = runtime_mheap->arena_start + arena_size; + runtime_mheap.spans = (MSpan**)p; + runtime_mheap.bitmap = p + spans_size; + runtime_mheap.arena_start = p + spans_size + bitmap_size; + runtime_mheap.arena_used = runtime_mheap.arena_start; + runtime_mheap.arena_end = runtime_mheap.arena_start + arena_size; // Initialize the rest of the allocator. - runtime_MHeap_Init(runtime_mheap, runtime_SysAlloc); + runtime_MHeap_Init(&runtime_mheap); runtime_m()->mcache = runtime_allocmcache(); // See if it works. @@ -463,8 +507,7 @@ runtime_MHeap_SysAlloc(MHeap *h, uintptr n) uintptr needed; needed = (uintptr)h->arena_used + n - (uintptr)h->arena_end; - // Round wanted arena size to a multiple of 256MB. - needed = (needed + (256<<20) - 1) & ~((256<<20)-1); + needed = ROUND(needed, 256<<20); new_end = h->arena_end + needed; if(new_end <= h->arena_start + MaxArena32) { p = runtime_SysReserve(h->arena_end, new_end - h->arena_end); @@ -475,9 +518,10 @@ runtime_MHeap_SysAlloc(MHeap *h, uintptr n) if(n <= (uintptr)(h->arena_end - h->arena_used)) { // Keep taking from our reservation. p = h->arena_used; - runtime_SysMap(p, n); + runtime_SysMap(p, n, &mstats.heap_sys); h->arena_used += n; runtime_MHeap_MapBits(h); + runtime_MHeap_MapSpans(h); if(raceenabled) runtime_racemapshadow(p, n); return p; @@ -490,14 +534,14 @@ runtime_MHeap_SysAlloc(MHeap *h, uintptr n) // On 32-bit, once the reservation is gone we can // try to get memory at a location chosen by the OS // and hope that it is in the range we allocated bitmap for. 
- p = runtime_SysAlloc(n); + p = runtime_SysAlloc(n, &mstats.heap_sys); if(p == nil) return nil; if(p < h->arena_start || (uintptr)(p+n - h->arena_start) >= MaxArena32) { runtime_printf("runtime: memory allocated by OS (%p) not in usable range [%p,%p)\n", p, h->arena_start, h->arena_start+MaxArena32); - runtime_SysFree(p, n); + runtime_SysFree(p, n, &mstats.heap_sys); return nil; } @@ -506,6 +550,7 @@ runtime_MHeap_SysAlloc(MHeap *h, uintptr n) if(h->arena_used > h->arena_end) h->arena_end = h->arena_used; runtime_MHeap_MapBits(h); + runtime_MHeap_MapSpans(h); if(raceenabled) runtime_racemapshadow(p, n); } @@ -513,17 +558,68 @@ runtime_MHeap_SysAlloc(MHeap *h, uintptr n) return p; } +static struct +{ + Lock; + byte* pos; + byte* end; +} persistent; + +enum +{ + PersistentAllocChunk = 256<<10, + PersistentAllocMaxBlock = 64<<10, // VM reservation granularity is 64K on windows +}; + +// Wrapper around SysAlloc that can allocate small chunks. +// There is no associated free operation. +// Intended for things like function/type/debug-related persistent data. +// If align is 0, uses default align (currently 8). +void* +runtime_persistentalloc(uintptr size, uintptr align, uint64 *stat) +{ + byte *p; + + if(align != 0) { + if(align&(align-1)) + runtime_throw("persistentalloc: align is now a power of 2"); + if(align > PageSize) + runtime_throw("persistentalloc: align is too large"); + } else + align = 8; + if(size >= PersistentAllocMaxBlock) + return runtime_SysAlloc(size, stat); + runtime_lock(&persistent); + persistent.pos = (byte*)ROUND((uintptr)persistent.pos, align); + if(persistent.pos + size > persistent.end) { + persistent.pos = runtime_SysAlloc(PersistentAllocChunk, &mstats.other_sys); + if(persistent.pos == nil) { + runtime_unlock(&persistent); + runtime_throw("runtime: cannot allocate memory"); + } + persistent.end = persistent.pos + PersistentAllocChunk; + } + p = persistent.pos; + persistent.pos += size; + runtime_unlock(&persistent); + if(stat != &mstats.other_sys) { + // reaccount the allocation against provided stat + runtime_xadd64(stat, size); + runtime_xadd64(&mstats.other_sys, -(uint64)size); + } + return p; +} + static Lock settype_lock; void -runtime_settype_flush(M *mp, bool sysalloc) +runtime_settype_flush(M *mp) { uintptr *buf, *endbuf; uintptr size, ofs, j, t; uintptr ntypes, nbytes2, nbytes3; uintptr *data2; byte *data3; - bool sysalloc3; void *v; uintptr typ, p; MSpan *s; @@ -542,8 +638,8 @@ runtime_settype_flush(M *mp, bool sysalloc) // (Manually inlined copy of runtime_MHeap_Lookup) p = (uintptr)v>>PageShift; if(sizeof(void*) == 8) - p -= (uintptr)runtime_mheap->arena_start >> PageShift; - s = runtime_mheap->map[p]; + p -= (uintptr)runtime_mheap.arena_start >> PageShift; + s = runtime_mheap.spans[p]; if(s->sizeclass == 0) { s->types.compression = MTypes_Single; @@ -558,20 +654,9 @@ runtime_settype_flush(M *mp, bool sysalloc) case MTypes_Empty: ntypes = (s->npages << PageShift) / size; nbytes3 = 8*sizeof(uintptr) + 1*ntypes; - - if(!sysalloc) { - data3 = runtime_mallocgc(nbytes3, FlagNoProfiling|FlagNoPointers, 0, 1); - } else { - data3 = runtime_SysAlloc(nbytes3); - if(data3 == nil) - runtime_throw("runtime: cannot allocate memory"); - if(0) runtime_printf("settype(0->3): SysAlloc(%x) --> %p\n", (uint32)nbytes3, data3); - } - + data3 = runtime_mallocgc(nbytes3, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC); s->types.compression = MTypes_Bytes; - s->types.sysalloc = sysalloc; s->types.data = (uintptr)data3; - ((uintptr*)data3)[1] = typ; data3[8*sizeof(uintptr) + ofs] = 
1; break; @@ -596,20 +681,8 @@ runtime_settype_flush(M *mp, bool sysalloc) } else { ntypes = (s->npages << PageShift) / size; nbytes2 = ntypes * sizeof(uintptr); - - if(!sysalloc) { - data2 = runtime_mallocgc(nbytes2, FlagNoProfiling|FlagNoPointers, 0, 1); - } else { - data2 = runtime_SysAlloc(nbytes2); - if(data2 == nil) - runtime_throw("runtime: cannot allocate memory"); - if(0) runtime_printf("settype.(3->2): SysAlloc(%x) --> %p\n", (uint32)nbytes2, data2); - } - - sysalloc3 = s->types.sysalloc; - + data2 = runtime_mallocgc(nbytes2, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC); s->types.compression = MTypes_Words; - s->types.sysalloc = sysalloc; s->types.data = (uintptr)data2; // Move the contents of data3 to data2. Then deallocate data3. @@ -618,12 +691,6 @@ runtime_settype_flush(M *mp, bool sysalloc) t = ((uintptr*)data3)[t]; data2[j] = t; } - if(sysalloc3) { - nbytes3 = 8*sizeof(uintptr) + 1*ntypes; - if(0) runtime_printf("settype.(3->2): SysFree(%p,%x)\n", data3, (uint32)nbytes3); - runtime_SysFree(data3, nbytes3); - } - data2[ofs] = typ; } break; @@ -634,64 +701,6 @@ runtime_settype_flush(M *mp, bool sysalloc) mp->settype_bufsize = 0; } -// It is forbidden to use this function if it is possible that -// explicit deallocation via calling runtime_free(v) may happen. -void -runtime_settype(void *v, uintptr t) -{ - M *mp; - uintptr *buf; - uintptr i; - MSpan *s; - - if(t == 0) - runtime_throw("settype: zero type"); - - mp = runtime_m(); - buf = mp->settype_buf; - i = mp->settype_bufsize; - buf[i+0] = (uintptr)v; - buf[i+1] = t; - i += 2; - mp->settype_bufsize = i; - - if(i == nelem(mp->settype_buf)) { - runtime_settype_flush(mp, false); - } - - if(DebugTypeAtBlockEnd) { - s = runtime_MHeap_Lookup(runtime_mheap, v); - *(uintptr*)((uintptr)v+s->elemsize-sizeof(uintptr)) = t; - } -} - -void -runtime_settype_sysfree(MSpan *s) -{ - uintptr ntypes, nbytes; - - if(!s->types.sysalloc) - return; - - nbytes = (uintptr)-1; - - switch (s->types.compression) { - case MTypes_Words: - ntypes = (s->npages << PageShift) / s->elemsize; - nbytes = ntypes * sizeof(uintptr); - break; - case MTypes_Bytes: - ntypes = (s->npages << PageShift) / s->elemsize; - nbytes = 8*sizeof(uintptr) + 1*ntypes; - break; - } - - if(nbytes != (uintptr)-1) { - if(0) runtime_printf("settype: SysFree(%p,%x)\n", (void*)s->types.data, (uint32)nbytes); - runtime_SysFree((void*)s->types.data, nbytes); - } -} - uintptr runtime_gettype(void *v) { @@ -699,7 +708,7 @@ runtime_gettype(void *v) uintptr t, ofs; byte *data; - s = runtime_MHeap_LookupMaybe(runtime_mheap, v); + s = runtime_MHeap_LookupMaybe(&runtime_mheap, v); if(s != nil) { t = 0; switch(s->types.compression) { @@ -736,61 +745,23 @@ runtime_gettype(void *v) void* runtime_mal(uintptr n) { - return runtime_mallocgc(n, 0, 1, 1); + return runtime_mallocgc(n, 0, 0); } void * runtime_new(const Type *typ) { - void *ret; - uint32 flag; - - if(raceenabled) - runtime_m()->racepc = runtime_getcallerpc(&typ); - - if(typ->__size == 0) { - // All 0-length allocations use this pointer. - // The language does not require the allocations to - // have distinct values. - ret = (uint8*)&runtime_zerobase; - } else { - flag = typ->__code&GO_NO_POINTERS ? 
FlagNoPointers : 0; - ret = runtime_mallocgc(typ->__size, flag, 1, 1); - - if(UseSpanType && !flag) { - if(false) - runtime_printf("new %S: %p\n", *typ->__reflection, ret); - runtime_settype(ret, (uintptr)typ | TypeInfo_SingleObject); - } - } - - return ret; + return runtime_mallocgc(typ->__size, (uintptr)typ | TypeInfo_SingleObject, typ->kind&KindNoPointers ? FlagNoScan : 0); } static void* cnew(const Type *typ, intgo n, int32 objtyp) { - uint32 flag; - void *ret; - if((objtyp&(PtrSize-1)) != objtyp) runtime_throw("runtime: invalid objtyp"); if(n < 0 || (typ->__size > 0 && (uintptr)n > (MaxMem/typ->__size))) runtime_panicstring("runtime: allocation size out of range"); - if(typ->__size == 0 || n == 0) { - // All 0-length allocations use this pointer. - // The language does not require the allocations to - // have distinct values. - return &runtime_zerobase; - } - flag = typ->__code&GO_NO_POINTERS ? FlagNoPointers : 0; - ret = runtime_mallocgc(typ->__size*n, flag, 1, 1); - if(UseSpanType && !flag) { - if(false) - runtime_printf("cnew [%D]%S: %p\n", (int64)n, *typ->__reflection, ret); - runtime_settype(ret, (uintptr)typ | TypeInfo_SingleObject); - } - return ret; + return runtime_mallocgc(typ->__size*n, (uintptr)typ | objtyp, typ->kind&KindNoPointers ? FlagNoScan : 0); } // same as runtime_new, but callable from C @@ -814,6 +785,8 @@ func SetFinalizer(obj Eface, finalizer Eface) { byte *base; uintptr size; const FuncType *ft; + const Type *fint; + const PtrType *ot; if(obj.__type_descriptor == nil) { runtime_printf("runtime.SetFinalizer: first argument is nil interface\n"); @@ -828,22 +801,36 @@ func SetFinalizer(obj Eface, finalizer Eface) { goto throw; } ft = nil; + ot = (const PtrType*)obj.__type_descriptor; + fint = nil; if(finalizer.__type_descriptor != nil) { if(finalizer.__type_descriptor->__code != GO_FUNC) goto badfunc; ft = (const FuncType*)finalizer.__type_descriptor; - if(ft->__dotdotdot || ft->__in.__count != 1 || !__go_type_descriptors_equal(*(Type**)ft->__in.__values, obj.__type_descriptor)) + if(ft->__dotdotdot || ft->__in.__count != 1) + goto badfunc; + fint = *(Type**)ft->__in.__values; + if(__go_type_descriptors_equal(fint, obj.__type_descriptor)) { + // ok - same type + } else if(fint->__code == GO_PTR && (fint->__uncommon == nil || fint->__uncommon->__name == nil || obj.type->__uncommon == nil || obj.type->__uncommon->__name == nil) && __go_type_descriptors_equal(((const PtrType*)fint)->__element_type, ((const PtrType*)obj.type)->__element_type)) { + // ok - not same type, but both pointers, + // one or the other is unnamed, and same element type, so assignable. + } else if(fint->kind == GO_INTERFACE && ((const InterfaceType*)fint)->__methods.__count == 0) { + // ok - satisfies empty interface + } else if(fint->kind == GO_INTERFACE && __go_convert_interface_2(fint, obj.__type_descriptor, 1) != nil) { + // ok - satisfies non-empty interface + } else goto badfunc; } - if(!runtime_addfinalizer(obj.__object, finalizer.__type_descriptor != nil ? *(void**)finalizer.__object : nil, ft)) { + if(!runtime_addfinalizer(obj.__object, finalizer.__type_descriptor != nil ? 
*(void**)finalizer.__object : nil, ft, ot)) { runtime_printf("runtime.SetFinalizer: finalizer already set\n"); goto throw; } return; badfunc: - runtime_printf("runtime.SetFinalizer: second argument is %S, not func(%S)\n", *finalizer.__type_descriptor->__reflection, *obj.__type_descriptor->__reflection); + runtime_printf("runtime.SetFinalizer: cannot pass %S to finalizer %S\n", *obj.__type_descriptor->__reflection, *finalizer.__type_descriptor->__reflection); throw: runtime_throw("runtime.SetFinalizer"); } diff --git a/libgo/runtime/malloc.h b/libgo/runtime/malloc.h index ebea34eb32c..45c4c09c147 100644 --- a/libgo/runtime/malloc.h +++ b/libgo/runtime/malloc.h @@ -108,9 +108,7 @@ enum // Tunable constants. MaxSmallSize = 32<<10, - FixAllocChunk = 128<<10, // Chunk size for FixAlloc - MaxMCacheListLen = 256, // Maximum objects on MCacheList - MaxMCacheSize = 2<<20, // Maximum bytes in one MCache + FixAllocChunk = 16<<10, // Chunk size for FixAlloc MaxMHeapList = 1<<(20 - PageShift), // Maximum page length for fixed-size list in MHeap. HeapAllocChunk = 1<<20, // Chunk size for heap growth @@ -155,13 +153,13 @@ struct MLink // SysAlloc obtains a large chunk of zeroed memory from the // operating system, typically on the order of a hundred kilobytes -// or a megabyte. If the pointer argument is non-nil, the caller -// wants a mapping there or nowhere. +// or a megabyte. // // SysUnused notifies the operating system that the contents // of the memory region are no longer needed and can be reused -// for other purposes. The program reserves the right to start -// accessing those pages in the future. +// for other purposes. +// SysUsed notifies the operating system that the contents +// of the memory region are needed again. // // SysFree returns it unconditionally; this is only used if // an out-of-memory error has been detected midway through @@ -174,10 +172,11 @@ struct MLink // // SysMap maps previously reserved address space for use. -void* runtime_SysAlloc(uintptr nbytes); -void runtime_SysFree(void *v, uintptr nbytes); +void* runtime_SysAlloc(uintptr nbytes, uint64 *stat); +void runtime_SysFree(void *v, uintptr nbytes, uint64 *stat); void runtime_SysUnused(void *v, uintptr nbytes); -void runtime_SysMap(void *v, uintptr nbytes); +void runtime_SysUsed(void *v, uintptr nbytes); +void runtime_SysMap(void *v, uintptr nbytes, uint64 *stat); void* runtime_SysReserve(void *v, uintptr nbytes); // FixAlloc is a simple free-list allocator for fixed size objects. @@ -190,18 +189,17 @@ void* runtime_SysReserve(void *v, uintptr nbytes); // smashed by freeing and reallocating. 
struct FixAlloc { - uintptr size; - void *(*alloc)(uintptr); - void (*first)(void *arg, byte *p); // called first time p is returned - void *arg; - MLink *list; - byte *chunk; - uint32 nchunk; - uintptr inuse; // in-use bytes now - uintptr sys; // bytes obtained from system + uintptr size; + void (*first)(void *arg, byte *p); // called first time p is returned + void* arg; + MLink* list; + byte* chunk; + uint32 nchunk; + uintptr inuse; // in-use bytes now + uint64* stat; }; -void runtime_FixAlloc_Init(FixAlloc *f, uintptr size, void *(*alloc)(uintptr), void (*first)(void*, byte*), void *arg); +void runtime_FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat); void* runtime_FixAlloc_Alloc(FixAlloc *f); void runtime_FixAlloc_Free(FixAlloc *f, void *p); @@ -236,6 +234,8 @@ struct MStats uint64 mcache_inuse; // MCache structures uint64 mcache_sys; uint64 buckhash_sys; // profiling bucket hash table + uint64 gc_sys; + uint64 other_sys; // Statistics about garbage collector. // Protected by mheap or stopping the world during GC. @@ -267,14 +267,12 @@ extern MStats mstats // class_to_size[i] = largest size in class i // class_to_allocnpages[i] = number of pages to allocate when // making new objects in class i -// class_to_transfercount[i] = number of objects to move when -// taking a bunch of objects out of the central lists -// and putting them in the thread free list. int32 runtime_SizeToClass(int32); extern int32 runtime_class_to_size[NumSizeClasses]; extern int32 runtime_class_to_allocnpages[NumSizeClasses]; -extern int32 runtime_class_to_transfercount[NumSizeClasses]; +extern int8 runtime_size_to_class8[1024/8 + 1]; +extern int8 runtime_size_to_class128[(MaxSmallSize-1024)/128 + 1]; extern void runtime_InitSizes(void); @@ -285,30 +283,24 @@ struct MCacheList { MLink *list; uint32 nlist; - uint32 nlistmin; }; struct MCache { - MCacheList list[NumSizeClasses]; - uintptr size; + // The following members are accessed on every malloc, + // so they are grouped here for better caching. + int32 next_sample; // trigger heap sample after allocating this many bytes intptr local_cachealloc; // bytes allocated (or freed) from cache since last lock of heap - intptr local_objects; // objects allocated (or freed) from cache since last lock of heap - intptr local_alloc; // bytes allocated (or freed) since last lock of heap - uintptr local_total_alloc; // bytes allocated (even if freed) since last lock of heap - uintptr local_nmalloc; // number of mallocs since last lock of heap - uintptr local_nfree; // number of frees since last lock of heap - uintptr local_nlookup; // number of pointer lookups since last lock of heap - int32 next_sample; // trigger heap sample after allocating this many bytes - // Statistics about allocation size classes since last lock of heap - struct { - uintptr nmalloc; - uintptr nfree; - } local_by_size[NumSizeClasses]; - + // The rest is not accessed on every malloc. + MCacheList list[NumSizeClasses]; + // Local allocator stats, flushed during GC. 
+ uintptr local_nlookup; // number of pointer lookups + uintptr local_largefree; // bytes freed for large objects (>MaxSmallSize) + uintptr local_nlargefree; // number of frees for large objects (>MaxSmallSize) + uintptr local_nsmallfree[NumSizeClasses]; // number of frees for small objects (<=MaxSmallSize) }; -void* runtime_MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed); +void runtime_MCache_Refill(MCache *c, int32 sizeclass); void runtime_MCache_Free(MCache *c, void *p, int32 sizeclass, uintptr size); void runtime_MCache_ReleaseAll(MCache *c); @@ -346,7 +338,6 @@ enum struct MTypes { byte compression; // one of MTypes_* - bool sysalloc; // whether (void*)data is from runtime_SysAlloc uintptr data; }; @@ -397,8 +388,8 @@ struct MCentral }; void runtime_MCentral_Init(MCentral *c, int32 sizeclass); -int32 runtime_MCentral_AllocList(MCentral *c, int32 n, MLink **first); -void runtime_MCentral_FreeList(MCentral *c, int32 n, MLink *first); +int32 runtime_MCentral_AllocList(MCentral *c, MLink **first); +void runtime_MCentral_FreeList(MCentral *c, MLink *first); void runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end); // Main malloc heap. @@ -414,7 +405,8 @@ struct MHeap uint32 nspancap; // span lookup - MSpan *map[1<<MHeapMap_Bits]; + MSpan** spans; + uintptr spans_mapped; // range of addresses we might see in the heap byte *bitmap; @@ -434,10 +426,15 @@ struct MHeap FixAlloc spanalloc; // allocator for Span* FixAlloc cachealloc; // allocator for MCache* + + // Malloc stats. + uint64 largefree; // bytes freed for large objects (>MaxSmallSize) + uint64 nlargefree; // number of frees for large objects (>MaxSmallSize) + uint64 nsmallfree[NumSizeClasses]; // number of frees for small objects (<=MaxSmallSize) }; -extern MHeap *runtime_mheap; +extern MHeap runtime_mheap; -void runtime_MHeap_Init(MHeap *h, void *(*allocator)(uintptr)); +void runtime_MHeap_Init(MHeap *h); MSpan* runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed); void runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct); MSpan* runtime_MHeap_Lookup(MHeap *h, void *v); @@ -445,9 +442,11 @@ MSpan* runtime_MHeap_LookupMaybe(MHeap *h, void *v); void runtime_MGetSizeClassInfo(int32 sizeclass, uintptr *size, int32 *npages, int32 *nobj); void* runtime_MHeap_SysAlloc(MHeap *h, uintptr n); void runtime_MHeap_MapBits(MHeap *h); +void runtime_MHeap_MapSpans(MHeap *h); void runtime_MHeap_Scavenger(void*); -void* runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed); +void* runtime_mallocgc(uintptr size, uintptr typ, uint32 flag); +void* runtime_persistentalloc(uintptr size, uintptr align, uint64 *stat); int32 runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **s); void runtime_gc(int32 force); void runtime_markallocated(void *v, uintptr n, bool noptr); @@ -463,17 +462,18 @@ void runtime_purgecachedstats(MCache*); void* runtime_cnew(const Type*); void* runtime_cnewarray(const Type*, intgo); -void runtime_settype(void*, uintptr); -void runtime_settype_flush(M*, bool); +void runtime_settype_flush(M*); void runtime_settype_sysfree(MSpan*); uintptr runtime_gettype(void*); enum { // flags to malloc - FlagNoPointers = 1<<0, // no pointers here - FlagNoProfiling = 1<<1, // must not profile - FlagNoGC = 1<<2, // must not free or scan for pointers + FlagNoScan = 1<<0, // GC doesn't have to scan object + FlagNoProfiling = 1<<1, // must not profile + FlagNoGC = 1<<2, // must not free or scan for pointers + FlagNoZero = 1<<3, // don't zero memory 
+ FlagNoInvokeGC = 1<<4, // don't invoke GC }; typedef struct Obj Obj; @@ -493,15 +493,15 @@ void runtime_helpgc(int32 nproc); void runtime_gchelper(void); struct __go_func_type; -bool runtime_getfinalizer(void *p, bool del, FuncVal **fn, const struct __go_func_type **ft); +struct __go_ptr_type; +bool runtime_getfinalizer(void *p, bool del, FuncVal **fn, const struct __go_func_type **ft, const struct __go_ptr_type **ot); void runtime_walkfintab(void (*fn)(void*), void (*scan)(Obj)); enum { TypeInfo_SingleObject = 0, TypeInfo_Array = 1, - TypeInfo_Map = 2, - TypeInfo_Chan = 3, + TypeInfo_Chan = 2, // Enables type information at the end of blocks allocated from heap DebugTypeAtBlockEnd = 0, diff --git a/libgo/runtime/mcache.c b/libgo/runtime/mcache.c index 45bac4ffbce..38f824a139b 100644 --- a/libgo/runtime/mcache.c +++ b/libgo/runtime/mcache.c @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Per-thread (in Go, per-M) malloc cache for small objects. +// Per-P malloc cache for small objects. // // See malloc.h for an overview. @@ -10,48 +10,23 @@ #include "arch.h" #include "malloc.h" -void* -runtime_MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed) +void +runtime_MCache_Refill(MCache *c, int32 sizeclass) { MCacheList *l; - MLink *first, *v; - int32 n; - // Allocate from list. + // Replenish using central lists. l = &c->list[sizeclass]; - if(l->list == nil) { - // Replenish using central lists. - n = runtime_MCentral_AllocList(&runtime_mheap->central[sizeclass], - runtime_class_to_transfercount[sizeclass], &first); - if(n == 0) - runtime_throw("out of memory"); - l->list = first; - l->nlist = n; - c->size += n*size; - } - v = l->list; - l->list = v->next; - l->nlist--; - if(l->nlist < l->nlistmin) - l->nlistmin = l->nlist; - c->size -= size; - - // v is zeroed except for the link pointer - // that we used above; zero that. - v->next = nil; - if(zeroed) { - // block is zeroed iff second word is zero ... - if(size > sizeof(uintptr) && ((uintptr*)v)[1] != 0) - runtime_memclr((byte*)v, size); - } - c->local_cachealloc += size; - c->local_objects++; - return v; + if(l->list) + runtime_throw("MCache_Refill: the list is not empty"); + l->nlist = runtime_MCentral_AllocList(&runtime_mheap.central[sizeclass], &l->list); + if(l->list == nil) + runtime_throw("out of memory"); } // Take n elements off l and return them to the central free list. static void -ReleaseN(MCache *c, MCacheList *l, int32 n, int32 sizeclass) +ReleaseN(MCacheList *l, int32 n, int32 sizeclass) { MLink *first, **lp; int32 i; @@ -64,18 +39,14 @@ ReleaseN(MCache *c, MCacheList *l, int32 n, int32 sizeclass) l->list = *lp; *lp = nil; l->nlist -= n; - if(l->nlist < l->nlistmin) - l->nlistmin = l->nlist; - c->size -= n*runtime_class_to_size[sizeclass]; // Return them to central free list. - runtime_MCentral_FreeList(&runtime_mheap->central[sizeclass], n, first); + runtime_MCentral_FreeList(&runtime_mheap.central[sizeclass], first); } void runtime_MCache_Free(MCache *c, void *v, int32 sizeclass, uintptr size) { - int32 i, n; MCacheList *l; MLink *p; @@ -85,34 +56,12 @@ runtime_MCache_Free(MCache *c, void *v, int32 sizeclass, uintptr size) p->next = l->list; l->list = p; l->nlist++; - c->size += size; c->local_cachealloc -= size; - c->local_objects--; - - if(l->nlist >= MaxMCacheListLen) { - // Release a chunk back. - ReleaseN(c, l, runtime_class_to_transfercount[sizeclass], sizeclass); - } - - if(c->size >= MaxMCacheSize) { - // Scavenge. 
- for(i=0; i<NumSizeClasses; i++) { - l = &c->list[i]; - n = l->nlistmin; - // n is the minimum number of elements we've seen on - // the list since the last scavenge. If n > 0, it means that - // we could have gotten by with n fewer elements - // without needing to consult the central free list. - // Move toward that situation by releasing n/2 of them. - if(n > 0) { - if(n > 1) - n /= 2; - ReleaseN(c, l, n, i); - } - l->nlistmin = l->nlist; - } - } + // We transfer span at a time from MCentral to MCache, + // if we have 2 times more than that, release a half back. + if(l->nlist >= 2*(runtime_class_to_allocnpages[sizeclass]<<PageShift)/size) + ReleaseN(l, l->nlist/2, sizeclass); } void @@ -123,7 +72,10 @@ runtime_MCache_ReleaseAll(MCache *c) for(i=0; i<NumSizeClasses; i++) { l = &c->list[i]; - ReleaseN(c, l, l->nlist, i); - l->nlistmin = 0; + if(l->list) { + runtime_MCentral_FreeList(&runtime_mheap.central[i], l->list); + l->list = nil; + l->nlist = 0; + } } } diff --git a/libgo/runtime/mcentral.c b/libgo/runtime/mcentral.c index b3108a1c061..81916101e46 100644 --- a/libgo/runtime/mcentral.c +++ b/libgo/runtime/mcentral.c @@ -30,16 +30,15 @@ runtime_MCentral_Init(MCentral *c, int32 sizeclass) runtime_MSpanList_Init(&c->empty); } -// Allocate up to n objects from the central free list. +// Allocate a list of objects from the central free list. // Return the number of objects allocated. // The objects are linked together by their first words. -// On return, *pstart points at the first object. +// On return, *pfirst points at the first object. int32 -runtime_MCentral_AllocList(MCentral *c, int32 n, MLink **pfirst) +runtime_MCentral_AllocList(MCentral *c, MLink **pfirst) { MSpan *s; - MLink *first, *last; - int32 cap, avail, i; + int32 cap, n; runtime_lock(c); // Replenish central list if empty. @@ -52,49 +51,27 @@ runtime_MCentral_AllocList(MCentral *c, int32 n, MLink **pfirst) } s = c->nonempty.next; cap = (s->npages << PageShift) / s->elemsize; - avail = cap - s->ref; - if(avail < n) - n = avail; - - // First one is guaranteed to work, because we just grew the list. - first = s->freelist; - last = first; - for(i=1; i<n; i++) { - last = last->next; - } - s->freelist = last->next; - last->next = nil; + n = cap - s->ref; + *pfirst = s->freelist; + s->freelist = nil; s->ref += n; c->nfree -= n; - - if(n == avail) { - if(s->freelist != nil || s->ref != (uint32)cap) { - runtime_throw("invalid freelist"); - } - runtime_MSpanList_Remove(s); - runtime_MSpanList_Insert(&c->empty, s); - } - + runtime_MSpanList_Remove(s); + runtime_MSpanList_Insert(&c->empty, s); runtime_unlock(c); - *pfirst = first; return n; } -// Free n objects back into the central free list. +// Free the list of objects back into the central free list. void -runtime_MCentral_FreeList(MCentral *c, int32 n, MLink *start) +runtime_MCentral_FreeList(MCentral *c, MLink *start) { - MLink *v, *next; - - // Assume next == nil marks end of list. - // n and end would be useful if we implemented - // the transfer cache optimization in the TODO above. - USED(n); + MLink *next; runtime_lock(c); - for(v=start; v; v=next) { - next = v->next; - MCentral_Free(c, v); + for(; start != nil; start = next) { + next = start->next; + MCentral_Free(c, start); } runtime_unlock(c); } @@ -108,7 +85,7 @@ MCentral_Free(MCentral *c, void *v) int32 size; // Find span for v. 
- s = runtime_MHeap_Lookup(runtime_mheap, v); + s = runtime_MHeap_Lookup(&runtime_mheap, v); if(s == nil || s->ref == 0) runtime_throw("invalid free"); @@ -133,7 +110,7 @@ MCentral_Free(MCentral *c, void *v) s->freelist = nil; c->nfree -= (s->npages << PageShift) / size; runtime_unlock(c); - runtime_MHeap_Free(runtime_mheap, s, 0); + runtime_MHeap_Free(&runtime_mheap, s, 0); runtime_lock(c); } } @@ -168,7 +145,7 @@ runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *e c->nfree -= (s->npages << PageShift) / size; runtime_unlock(c); runtime_unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift); - runtime_MHeap_Free(runtime_mheap, s, 0); + runtime_MHeap_Free(&runtime_mheap, s, 0); } else { runtime_unlock(c); } @@ -200,7 +177,7 @@ MCentral_Grow(MCentral *c) runtime_unlock(c); runtime_MGetSizeClassInfo(c->sizeclass, &size, &npages, &n); - s = runtime_MHeap_Alloc(runtime_mheap, npages, c->sizeclass, 0, 1); + s = runtime_MHeap_Alloc(&runtime_mheap, npages, c->sizeclass, 0, 1); if(s == nil) { // TODO(rsc): Log out of memory runtime_lock(c); diff --git a/libgo/runtime/mem.c b/libgo/runtime/mem.c index 8481e950750..78f7c51faf2 100644 --- a/libgo/runtime/mem.c +++ b/libgo/runtime/mem.c @@ -60,13 +60,11 @@ mmap_fixed(byte *v, uintptr n, int32 prot, int32 flags, int32 fd, uint32 offset) } void* -runtime_SysAlloc(uintptr n) +runtime_SysAlloc(uintptr n, uint64 *stat) { void *p; int fd = -1; - mstats.sys += n; - #ifdef USE_DEV_ZERO if (dev_zero == -1) { dev_zero = open("/dev/zero", O_RDONLY); @@ -91,6 +89,7 @@ runtime_SysAlloc(uintptr n) } return nil; } + runtime_xadd64(stat, n); return p; } @@ -103,9 +102,16 @@ runtime_SysUnused(void *v __attribute__ ((unused)), uintptr n __attribute__ ((un } void -runtime_SysFree(void *v, uintptr n) +runtime_SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime_SysFree(void *v, uintptr n, uint64 *stat) { - mstats.sys -= n; + runtime_xadd64(stat, -(uint64)n); runtime_munmap(v, n); } @@ -132,8 +138,10 @@ runtime_SysReserve(void *v, uintptr n) // Only user-mode Linux (UML) rejects these requests. if(sizeof(void*) == 8 && (uintptr)v >= 0xffffffffU) { p = mmap_fixed(v, 64<<10, PROT_NONE, MAP_ANON|MAP_PRIVATE, fd, 0); - if (p != v) + if (p != v) { + runtime_munmap(p, 64<<10); return nil; + } runtime_munmap(p, 64<<10); return v; } @@ -149,12 +157,12 @@ runtime_SysReserve(void *v, uintptr n) } void -runtime_SysMap(void *v, uintptr n) +runtime_SysMap(void *v, uintptr n, uint64 *stat) { void *p; int fd = -1; - mstats.sys += n; + runtime_xadd64(stat, n); #ifdef USE_DEV_ZERO if (dev_zero == -1) { diff --git a/libgo/runtime/mfinal.c b/libgo/runtime/mfinal.c index 407092bf392..625af528e1e 100644 --- a/libgo/runtime/mfinal.c +++ b/libgo/runtime/mfinal.c @@ -5,6 +5,7 @@ #include "runtime.h" #include "arch.h" #include "malloc.h" +#include "go-type.h" enum { debug = 0 }; @@ -13,6 +14,7 @@ struct Fin { FuncVal *fn; const struct __go_func_type *ft; + const struct __go_ptr_type *ot; }; // Finalizer hash table. Direct hash, linear scan, at most 3/4 full. 
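[Editor's note] One theme running through the mem.c hunks above is that the single mstats.sys counter is gone: every SysAlloc/SysFree/SysMap call now names the per-category counter it should charge and updates it with an atomic add. The following is a minimal, standalone C sketch of that calling convention, not the runtime's code: sys_alloc, sys_free, xadd64 and other_sys are illustrative names, and GCC's __sync builtin stands in for runtime_xadd64.

#include <stdint.h>
#include <sys/mman.h>

/* Stand-in for one of the per-category counters in MStats
   (heap_sys, gc_sys, buckhash_sys, other_sys, ...). */
static uint64_t other_sys;

/* Atomic 64-bit add, analogous to runtime_xadd64. */
static void
xadd64(uint64_t *stat, int64_t delta)
{
	__sync_fetch_and_add(stat, (uint64_t)delta);
}

/* Ask the OS for n bytes and charge them to *stat, mirroring the new
   runtime_SysAlloc(n, stat) signature. */
static void*
sys_alloc(uintptr_t n, uint64_t *stat)
{
	void *p;

	p = mmap(NULL, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
	if(p == MAP_FAILED)
		return NULL;
	xadd64(stat, (int64_t)n);
	return p;
}

/* Give the memory back and credit the same counter, as SysFree now does. */
static void
sys_free(void *v, uintptr_t n, uint64_t *stat)
{
	xadd64(stat, -(int64_t)n);
	munmap(v, n);
}

A caller picks the counter for its subsystem, e.g. p = sys_alloc(64*1024, &other_sys). Keeping one counter per subsystem is what lets updatememstats, later in this patch, rebuild mstats.sys as the sum of heap_sys, stacks_sys, mspan_sys, mcache_sys, buckhash_sys, gc_sys and other_sys.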
@@ -42,7 +44,7 @@ static struct { } fintab[TABSZ]; static void -addfintab(Fintab *t, void *k, FuncVal *fn, const struct __go_func_type *ft) +addfintab(Fintab *t, void *k, FuncVal *fn, const struct __go_func_type *ft, const struct __go_ptr_type *ot) { int32 i, j; @@ -67,6 +69,7 @@ ret: t->fkey[i] = k; t->val[i].fn = fn; t->val[i].ft = ft; + t->val[i].ot = ot; } static bool @@ -87,6 +90,7 @@ lookfintab(Fintab *t, void *k, bool del, Fin *f) t->fkey[i] = (void*)-1; t->val[i].fn = nil; t->val[i].ft = nil; + t->val[i].ot = nil; t->ndead++; } return true; @@ -117,13 +121,13 @@ resizefintab(Fintab *tab) newtab.max *= 3; } - newtab.fkey = runtime_mallocgc(newtab.max*sizeof newtab.fkey[0], FlagNoPointers, 0, 1); - newtab.val = runtime_mallocgc(newtab.max*sizeof newtab.val[0], 0, 0, 1); + newtab.fkey = runtime_mallocgc(newtab.max*sizeof newtab.fkey[0], 0, FlagNoInvokeGC|FlagNoScan); + newtab.val = runtime_mallocgc(newtab.max*sizeof newtab.val[0], 0, FlagNoInvokeGC); for(i=0; i<tab->max; i++) { k = tab->fkey[i]; if(k != nil && k != (void*)-1) - addfintab(&newtab, k, tab->val[i].fn, tab->val[i].ft); + addfintab(&newtab, k, tab->val[i].fn, tab->val[i].ft, tab->val[i].ot); } runtime_free(tab->fkey); @@ -137,7 +141,7 @@ resizefintab(Fintab *tab) } bool -runtime_addfinalizer(void *p, FuncVal *f, const struct __go_func_type *ft) +runtime_addfinalizer(void *p, FuncVal *f, const struct __go_func_type *ft, const struct __go_ptr_type *ot) { Fintab *tab; byte *base; @@ -166,7 +170,7 @@ runtime_addfinalizer(void *p, FuncVal *f, const struct __go_func_type *ft) resizefintab(tab); } - addfintab(tab, p, f, ft); + addfintab(tab, p, f, ft, ot); runtime_setblockspecial(p, true); runtime_unlock(tab); return true; @@ -175,7 +179,7 @@ runtime_addfinalizer(void *p, FuncVal *f, const struct __go_func_type *ft) // get finalizer; if del, delete finalizer. // caller is responsible for updating RefHasFinalizer (special) bit. bool -runtime_getfinalizer(void *p, bool del, FuncVal **fn, const struct __go_func_type **ft) +runtime_getfinalizer(void *p, bool del, FuncVal **fn, const struct __go_func_type **ft, const struct __go_ptr_type **ot) { Fintab *tab; bool res; @@ -189,6 +193,7 @@ runtime_getfinalizer(void *p, bool del, FuncVal **fn, const struct __go_func_typ return false; *fn = f.fn; *ft = f.ft; + *ot = f.ot; return true; } diff --git a/libgo/runtime/mfixalloc.c b/libgo/runtime/mfixalloc.c index 6e4f0c6e607..9d0b3bbda7e 100644 --- a/libgo/runtime/mfixalloc.c +++ b/libgo/runtime/mfixalloc.c @@ -13,17 +13,16 @@ // Initialize f to allocate objects of the given size, // using the allocator to obtain chunks of memory. 
void -runtime_FixAlloc_Init(FixAlloc *f, uintptr size, void *(*alloc)(uintptr), void (*first)(void*, byte*), void *arg) +runtime_FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat) { f->size = size; - f->alloc = alloc; f->first = first; f->arg = arg; f->list = nil; f->chunk = nil; f->nchunk = 0; f->inuse = 0; - f->sys = 0; + f->stat = stat; } void* @@ -43,10 +42,7 @@ runtime_FixAlloc_Alloc(FixAlloc *f) return v; } if(f->nchunk < f->size) { - f->sys += FixAllocChunk; - f->chunk = f->alloc(FixAllocChunk); - if(f->chunk == nil) - runtime_throw("out of memory (FixAlloc)"); + f->chunk = runtime_persistentalloc(FixAllocChunk, 0, f->stat); f->nchunk = FixAllocChunk; } v = f->chunk; diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c index c3b32111ca0..3edcee9c397 100644 --- a/libgo/runtime/mgc0.c +++ b/libgo/runtime/mgc0.c @@ -59,6 +59,13 @@ enum { PRECISE = 1, LOOP = 2, PC_BITS = PRECISE | LOOP, + + // Pointer map + BitsPerPointer = 2, + BitsNoPointer = 0, + BitsPointer = 1, + BitsIface = 2, + BitsEface = 3, }; // Bits in per-word bitmap. @@ -70,7 +77,7 @@ enum { // The bits in the word are packed together by type first, then by // heap location, so each 64-bit bitmap word consists of, from top to bottom, // the 16 bitSpecial bits for the corresponding heap words, then the 16 bitMarked bits, -// then the 16 bitNoPointers/bitBlockBoundary bits, then the 16 bitAllocated bits. +// then the 16 bitNoScan/bitBlockBoundary bits, then the 16 bitAllocated bits. // This layout makes it easier to iterate over the bits of a given type. // // The bitmap starts at mheap.arena_start and extends *backward* from @@ -87,7 +94,7 @@ enum { // /* then test bits & bitAllocated, bits & bitMarked, etc. */ // #define bitAllocated ((uintptr)1<<(bitShift*0)) -#define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */ +#define bitNoScan ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */ #define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */ #define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */ #define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */ @@ -109,8 +116,6 @@ enum { // uint32 runtime_worldsema = 1; -static int32 gctrace; - // The size of Workbuf is N*PageSize. typedef struct Workbuf Workbuf; struct Workbuf @@ -129,6 +134,7 @@ struct Finalizer FuncVal *fn; void *arg; const struct __go_func_type *ft; + const struct __go_ptr_type *ot; }; typedef struct FinBlock FinBlock; @@ -178,7 +184,6 @@ static struct { enum { GC_DEFAULT_PTR = GC_NUM_INSTR, - GC_MAP_NEXT, GC_CHAN, GC_NUM_INSTR2 @@ -201,6 +206,16 @@ static struct { uint64 instr[GC_NUM_INSTR2]; uint64 putempty; uint64 getfull; + struct { + uint64 foundbit; + uint64 foundword; + uint64 foundspan; + } flushptrbuf; + struct { + uint64 foundbit; + uint64 foundword; + uint64 foundspan; + } markonly; } gcstats; // markonly marks an object. It returns true if the object @@ -210,12 +225,12 @@ static bool markonly(void *obj) { byte *p; - uintptr *bitp, bits, shift, x, xbits, off; + uintptr *bitp, bits, shift, x, xbits, off, j; MSpan *s; PageID k; // Words outside the arena cannot be pointers. - if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used) + if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used) return false; // obj may be a pointer to a live object. 
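[Editor's note] The bitmap layout comment and the markonly/flushptrbuf/markallocated hunks in mgc0.c keep repeating the same three-line address computation. As a reading aid, here is a self-contained C sketch of that lookup; heap_bits is a hypothetical helper name, and the constants are taken from the layout described in the comment above (4 GC bits per heap word, so 16 heap words per 64-bit bitmap word).

#include <stdint.h>

enum {
	PtrSize = sizeof(void*),
	bitShift = PtrSize*8/4,            /* 4 GC bits per heap word */
	wordsPerBitmapWord = PtrSize*8/4,  /* 16 heap words per bitmap word on 64-bit */
};

#define bitAllocated ((uintptr_t)1<<(bitShift*0))
#define bitNoScan    ((uintptr_t)1<<(bitShift*1))  /* when bitAllocated is set */
#define bitMarked    ((uintptr_t)1<<(bitShift*2))
#define bitSpecial   ((uintptr_t)1<<(bitShift*3))

/* Locate the bitmap word and in-word shift holding a heap pointer's GC bits.
   The bitmap hangs *backward* off arena_start, so the word index is
   subtracted.  The caller then tests bits & bitAllocated, bits & bitMarked,
   and so on, exactly as markonly and flushptrbuf do. */
static uintptr_t
heap_bits(void *obj, void *arena_start, uintptr_t **bitp, uintptr_t *shift)
{
	uintptr_t off;

	off = (uintptr_t*)obj - (uintptr_t*)arena_start;  /* word offset into the arena */
	*bitp = (uintptr_t*)arena_start - off/wordsPerBitmapWord - 1;
	*shift = off % wordsPerBitmapWord;
	return **bitp >> *shift;
}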
@@ -225,42 +240,57 @@ markonly(void *obj) obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); // Find bits for this word. - off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start; - bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)obj - (uintptr*)runtime_mheap.arena_start; + bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; xbits = *bitp; bits = xbits >> shift; // Pointing at the beginning of a block? - if((bits & (bitAllocated|bitBlockBoundary)) != 0) + if((bits & (bitAllocated|bitBlockBoundary)) != 0) { + if(CollectStats) + runtime_xadd64(&gcstats.markonly.foundbit, 1); goto found; + } + + // Pointing just past the beginning? + // Scan backward a little to find a block boundary. + for(j=shift; j-->0; ) { + if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) { + shift = j; + bits = xbits>>shift; + if(CollectStats) + runtime_xadd64(&gcstats.markonly.foundword, 1); + goto found; + } + } // Otherwise consult span table to find beginning. // (Manually inlined copy of MHeap_LookupMaybe.) k = (uintptr)obj>>PageShift; x = k; if(sizeof(void*) == 8) - x -= (uintptr)runtime_mheap->arena_start>>PageShift; - s = runtime_mheap->map[x]; - if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) + x -= (uintptr)runtime_mheap.arena_start>>PageShift; + s = runtime_mheap.spans[x]; + if(s == nil || k < s->start || (byte*)obj >= s->limit || s->state != MSpanInUse) return false; p = (byte*)((uintptr)s->start<<PageShift); if(s->sizeclass == 0) { obj = p; } else { - if((byte*)obj >= (byte*)s->limit) - return false; uintptr size = s->elemsize; int32 i = ((byte*)obj - p)/size; obj = p+i*size; } // Now that we know the object header, reload bits. - off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start; - bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)obj - (uintptr*)runtime_mheap.arena_start; + bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; xbits = *bitp; bits = xbits >> shift; + if(CollectStats) + runtime_xadd64(&gcstats.markonly.foundspan, 1); found: // Now we have bits, bitp, and shift correct for @@ -338,7 +368,7 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf Workbuf *wbuf; PtrTarget *ptrbuf_end; - arena_start = runtime_mheap->arena_start; + arena_start = runtime_mheap.arena_start; wp = *_wp; wbuf = *_wbuf; @@ -377,7 +407,7 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf // obj belongs to interval [mheap.arena_start, mheap.arena_used). if(Debug > 1) { - if(obj < runtime_mheap->arena_start || obj >= runtime_mheap->arena_used) + if(obj < runtime_mheap.arena_start || obj >= runtime_mheap.arena_used) runtime_throw("object is outside of mheap"); } @@ -398,8 +428,11 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf bits = xbits >> shift; // Pointing at the beginning of a block? 
- if((bits & (bitAllocated|bitBlockBoundary)) != 0) + if((bits & (bitAllocated|bitBlockBoundary)) != 0) { + if(CollectStats) + runtime_xadd64(&gcstats.flushptrbuf.foundbit, 1); goto found; + } ti = 0; @@ -410,6 +443,8 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf obj = (byte*)obj - (shift-j)*PtrSize; shift = j; bits = xbits>>shift; + if(CollectStats) + runtime_xadd64(&gcstats.flushptrbuf.foundword, 1); goto found; } } @@ -420,15 +455,13 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf x = k; if(sizeof(void*) == 8) x -= (uintptr)arena_start>>PageShift; - s = runtime_mheap->map[x]; - if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) + s = runtime_mheap.spans[x]; + if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse) continue; p = (byte*)((uintptr)s->start<<PageShift); if(s->sizeclass == 0) { obj = p; } else { - if((byte*)obj >= (byte*)s->limit) - continue; size = s->elemsize; int32 i = ((byte*)obj - p)/size; obj = p+i*size; @@ -440,6 +473,8 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf shift = off % wordsPerBitmapWord; xbits = *bitp; bits = xbits >> shift; + if(CollectStats) + runtime_xadd64(&gcstats.flushptrbuf.foundspan, 1); found: // Now we have bits, bitp, and shift correct for @@ -460,7 +495,7 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf } // If object has no pointers, don't need to scan further. - if((bits & bitNoPointers) != 0) + if((bits & bitNoScan) != 0) continue; // Ask span about size class. @@ -468,7 +503,7 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf x = (uintptr)obj >> PageShift; if(sizeof(void*) == 8) x -= (uintptr)arena_start>>PageShift; - s = runtime_mheap->map[x]; + s = runtime_mheap.spans[x]; PREFETCH(obj); @@ -552,9 +587,6 @@ flushobjbuf(Obj *objbuf, Obj **objbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_ static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR}; #if 0 -// Hashmap iterator program -static uintptr mapProg[2] = {0, GC_MAP_NEXT}; - // Hchan program static uintptr chanProg[2] = {0, GC_CHAN}; #endif @@ -578,7 +610,7 @@ checkptr(void *obj, uintptr objti) if(!Debug) runtime_throw("checkptr is debug only"); - if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used) + if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used) return; type = runtime_gettype(obj); t = (Type*)(type & ~(uintptr)(PtrSize-1)); @@ -586,8 +618,8 @@ checkptr(void *obj, uintptr objti) return; x = (uintptr)obj >> PageShift; if(sizeof(void*) == 8) - x -= (uintptr)(runtime_mheap->arena_start)>>PageShift; - s = runtime_mheap->map[x]; + x -= (uintptr)(runtime_mheap.arena_start)>>PageShift; + s = runtime_mheap.spans[x]; objstart = (byte*)((uintptr)s->start<<PageShift); if(s->sizeclass != 0) { i = ((byte*)obj - objstart)/s->elemsize; @@ -595,8 +627,11 @@ checkptr(void *obj, uintptr objti) } tisize = *(uintptr*)objti; // Sanity check for object size: it should fit into the memory block. 
- if((byte*)obj + tisize > objstart + s->elemsize) + if((byte*)obj + tisize > objstart + s->elemsize) { + runtime_printf("object of type '%S' at %p/%p does not fit in block %p/%p\n", + *t->string, obj, tisize, objstart, s->elemsize); runtime_throw("invalid gc type info"); + } if(obj != objstart) return; // If obj points to the beginning of the memory block, @@ -613,7 +648,7 @@ checkptr(void *obj, uintptr objti) for(j = 1; pc1[j] != GC_END && pc2[j] != GC_END; j++) { if(pc1[j] != pc2[j]) { runtime_printf("invalid gc type info for '%s' at %p, type info %p, block info %p\n", - t->string ? (const int8*)t->string->str : (const int8*)"?", j, pc1[j], pc2[j]); + t->string ? (const int8*)t->string->str : (const int8*)"?", j, pc1[j], pc2[j]); runtime_throw("invalid gc type info"); } } @@ -638,7 +673,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) uintptr n, i, end_b, elemsize, size, ti, objti, count /* , type */; uintptr *pc, precise_type, nominal_size; #if 0 - uintptr *map_ret, mapkey_size, mapval_size, mapkey_ti, mapval_ti, *chan_ret, chancap; + uintptr *chan_ret, chancap; #endif void *obj; const Type *t; @@ -650,11 +685,6 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) Eface *eface; Iface *iface; #if 0 - Hmap *hmap; - MapType *maptype; - bool mapkey_kind, mapval_kind; - struct hash_gciter map_iter; - struct hash_gciter_data d; Hchan *chan; ChanType *chantype; #endif @@ -663,8 +693,8 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) runtime_throw("scanblock: size of Workbuf is suboptimal"); // Memory arena parameters. - arena_start = runtime_mheap->arena_start; - arena_used = runtime_mheap->arena_used; + arena_start = runtime_mheap.arena_start; + arena_used = runtime_mheap.arena_used; stack_ptr = stack+nelem(stack)-1; @@ -685,10 +715,6 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // (Silence the compiler) #if 0 - map_ret = nil; - mapkey_size = mapval_size = 0; - mapkey_kind = mapval_kind = false; - mapkey_ti = mapval_ti = 0; chan = nil; chantype = nil; chan_ret = nil; @@ -759,23 +785,6 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) stack_top.elemsize = pc[0]; stack_top.loop_or_ret = pc+1; break; - case TypeInfo_Map: - hmap = (Hmap*)b; - maptype = (MapType*)t; - if(hash_gciter_init(hmap, &map_iter)) { - mapkey_size = maptype->key->size; - mapkey_kind = maptype->key->kind; - mapkey_ti = (uintptr)maptype->key->gc | PRECISE; - mapval_size = maptype->elem->size; - mapval_kind = maptype->elem->kind; - mapval_ti = (uintptr)maptype->elem->gc | PRECISE; - - map_ret = nil; - pc = mapProg; - } else { - goto next_block; - } - break; case TypeInfo_Chan: chan = (Hchan*)b; chantype = (ChanType*)t; @@ -985,79 +994,6 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) pc = (uintptr*)((byte*)pc + *(int32*)(pc+2)); // target of the CALL instruction continue; -#if 0 - case GC_MAP_PTR: - hmap = *(Hmap**)(stack_top.b + pc[1]); - if(hmap == nil) { - pc += 3; - continue; - } - if(markonly(hmap)) { - maptype = (MapType*)pc[2]; - if(hash_gciter_init(hmap, &map_iter)) { - mapkey_size = maptype->key->size; - mapkey_kind = maptype->key->kind; - mapkey_ti = (uintptr)maptype->key->gc | PRECISE; - mapval_size = maptype->elem->size; - mapval_kind = maptype->elem->kind; - mapval_ti = (uintptr)maptype->elem->gc | PRECISE; - - // Start mapProg. 
- map_ret = pc+3; - pc = mapProg+1; - } else { - pc += 3; - } - } else { - pc += 3; - } - continue; - - case GC_MAP_NEXT: - // Add all keys and values to buffers, mark all subtables. - while(hash_gciter_next(&map_iter, &d)) { - // buffers: reserve space for 2 objects. - if(ptrbufpos+2 >= ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); - if(objbufpos+2 >= objbuf_end) - flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); - - if(d.st != nil) - markonly(d.st); - - if(d.key_data != nil) { - if(!(mapkey_kind & KindNoPointers) || d.indirectkey) { - if(!d.indirectkey) - *objbufpos++ = (Obj){d.key_data, mapkey_size, mapkey_ti}; - else { - if(Debug) { - obj = *(void**)d.key_data; - if(!(arena_start <= obj && obj < arena_used)) - runtime_throw("scanblock: inconsistent hashmap"); - } - *ptrbufpos++ = (struct PtrTarget){*(void**)d.key_data, mapkey_ti}; - } - } - if(!(mapval_kind & KindNoPointers) || d.indirectval) { - if(!d.indirectval) - *objbufpos++ = (Obj){d.val_data, mapval_size, mapval_ti}; - else { - if(Debug) { - obj = *(void**)d.val_data; - if(!(arena_start <= obj && obj < arena_used)) - runtime_throw("scanblock: inconsistent hashmap"); - } - *ptrbufpos++ = (struct PtrTarget){*(void**)d.val_data, mapval_ti}; - } - } - } - } - if(map_ret == nil) - goto next_block; - pc = map_ret; - continue; -#endif - case GC_REGION: obj = (void*)(stack_top.b + pc[1]); size = pc[2]; @@ -1071,7 +1007,6 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) #if 0 case GC_CHAN_PTR: - // Similar to GC_MAP_PTR chan = *(Hchan**)(stack_top.b + pc[1]); if(chan == nil) { pc += 3; @@ -1191,14 +1126,14 @@ debug_scanblock(byte *b, uintptr n) obj = (byte*)vp[i]; // Words outside the arena cannot be pointers. - if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used) + if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used) continue; // Round down to word boundary. obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); // Consult span table to find beginning. - s = runtime_MHeap_LookupMaybe(runtime_mheap, obj); + s = runtime_MHeap_LookupMaybe(&runtime_mheap, obj); if(s == nil) continue; @@ -1207,15 +1142,13 @@ debug_scanblock(byte *b, uintptr n) if(s->sizeclass == 0) { obj = p; } else { - if((byte*)obj >= (byte*)s->limit) - continue; int32 i = ((byte*)obj - p)/size; obj = p+i*size; } // Now that we know the object header, reload bits. - off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start; - bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)obj - (uintptr*)runtime_mheap.arena_start; + bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; xbits = *bitp; bits = xbits >> shift; @@ -1230,7 +1163,7 @@ debug_scanblock(byte *b, uintptr n) runtime_printf("found unmarked block %p in %p\n", obj, vp+i); // If object has no pointers, don't need to scan further. 
- if((bits & bitNoPointers) != 0) + if((bits & bitNoScan) != 0) continue; debug_scanblock(obj, size); @@ -1320,7 +1253,7 @@ getempty(Workbuf *b) runtime_lock(&work); if(work.nchunk < sizeof *b) { work.nchunk = 1<<20; - work.chunk = runtime_SysAlloc(work.nchunk); + work.chunk = runtime_SysAlloc(work.nchunk, &mstats.gc_sys); if(work.chunk == nil) runtime_throw("runtime: cannot allocate memory"); } @@ -1416,12 +1349,12 @@ addroot(Obj obj) cap = PageSize/sizeof(Obj); if(cap < 2*work.rootcap) cap = 2*work.rootcap; - new = (Obj*)runtime_SysAlloc(cap*sizeof(Obj)); + new = (Obj*)runtime_SysAlloc(cap*sizeof(Obj), &mstats.gc_sys); if(new == nil) runtime_throw("runtime: cannot allocate memory"); if(work.roots != nil) { runtime_memmove(new, work.roots, work.rootcap*sizeof(Obj)); - runtime_SysFree(work.roots, work.rootcap*sizeof(Obj)); + runtime_SysFree(work.roots, work.rootcap*sizeof(Obj), &mstats.gc_sys); } work.roots = new; work.rootcap = cap; @@ -1560,8 +1493,8 @@ addroots(void) runtime_time_scan(addroot); // MSpan.types - allspans = runtime_mheap->allspans; - for(spanidx=0; spanidx<runtime_mheap->nspan; spanidx++) { + allspans = runtime_mheap.allspans; + for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) { s = allspans[spanidx]; if(s->state == MSpanInUse) { // The garbage collector ignores type pointers stored in MSpan.types: @@ -1589,10 +1522,7 @@ addroots(void) case Gdead: break; case Grunning: - if(gp != runtime_g()) - runtime_throw("mark - world not stopped"); - addstackroots(gp); - break; + runtime_throw("mark - world not stopped"); case Grunnable: case Gsyscall: case Gwaiting: @@ -1614,10 +1544,11 @@ handlespecial(byte *p, uintptr size) { FuncVal *fn; const struct __go_func_type *ft; + const struct __go_ptr_type *ot; FinBlock *block; Finalizer *f; - if(!runtime_getfinalizer(p, true, &fn, &ft)) { + if(!runtime_getfinalizer(p, true, &fn, &ft, &ot)) { runtime_setblockspecial(p, false); runtime_MProf_Free(p, size); return false; @@ -1626,9 +1557,7 @@ handlespecial(byte *p, uintptr size) runtime_lock(&finlock); if(finq == nil || finq->cnt == finq->cap) { if(finc == nil) { - finc = runtime_SysAlloc(PageSize); - if(finc == nil) - runtime_throw("runtime: cannot allocate memory"); + finc = runtime_persistentalloc(PageSize, 0, &mstats.gc_sys); finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; finc->alllink = allfin; allfin = finc; @@ -1642,6 +1571,7 @@ handlespecial(byte *p, uintptr size) finq->cnt++; f->fn = fn; f->ft = ft; + f->ot = ot; f->arg = p; runtime_unlock(&finlock); return true; @@ -1668,10 +1598,10 @@ sweepspan(ParFor *desc, uint32 idx) m = runtime_m(); USED(&desc); - s = runtime_mheap->allspans[idx]; + s = runtime_mheap.allspans[idx]; if(s->state != MSpanInUse) return; - arena_start = runtime_mheap->arena_start; + arena_start = runtime_mheap.arena_start; p = (byte*)(s->start << PageShift); cl = s->sizeclass; size = s->elemsize; @@ -1735,9 +1665,9 @@ sweepspan(ParFor *desc, uint32 idx) // Free large span. runtime_unmarkspan(p, 1<<PageShift); *(uintptr*)p = (uintptr)0xdeaddeaddeaddeadll; // needs zeroing - runtime_MHeap_Free(runtime_mheap, s, 1); - c->local_alloc -= size; - c->local_nfree++; + runtime_MHeap_Free(&runtime_mheap, s, 1); + c->local_nlargefree++; + c->local_largefree += size; } else { // Free small object. 
switch(compression) { @@ -1758,12 +1688,9 @@ sweepspan(ParFor *desc, uint32 idx) } if(nfree) { - c->local_by_size[cl].nfree += nfree; - c->local_alloc -= size * nfree; - c->local_nfree += nfree; + c->local_nsmallfree[cl] += nfree; c->local_cachealloc -= nfree * size; - c->local_objects -= nfree; - runtime_MCentral_FreeSpan(&runtime_mheap->central[cl], s, nfree, head.next, end); + runtime_MCentral_FreeSpan(&runtime_mheap.central[cl], s, nfree, head.next, end); } } @@ -1777,10 +1704,10 @@ dumpspan(uint32 idx) MSpan *s; bool allocated, special; - s = runtime_mheap->allspans[idx]; + s = runtime_mheap.allspans[idx]; if(s->state != MSpanInUse) return; - arena_start = runtime_mheap->arena_start; + arena_start = runtime_mheap.arena_start; p = (byte*)(s->start << PageShift); sizeclass = s->sizeclass; size = s->elemsize; @@ -1838,7 +1765,7 @@ runtime_memorydump(void) { uint32 spanidx; - for(spanidx=0; spanidx<runtime_mheap->nspan; spanidx++) { + for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) { dumpspan(spanidx); } } @@ -1880,13 +1807,28 @@ runtime_gchelper(void) static int32 gcpercent = GcpercentUnknown; static void -cachestats(GCStats *stats) +cachestats(void) +{ + MCache *c; + P *p, **pp; + + for(pp=runtime_allp; (p=*pp) != nil; pp++) { + c = p->mcache; + if(c==nil) + continue; + runtime_purgecachedstats(c); + } +} + +static void +updatememstats(GCStats *stats) { M *mp; + MSpan *s; MCache *c; P *p, **pp; uint32 i; - uint64 stacks_inuse; + uint64 stacks_inuse, smallfree; uint64 *src, *dst; if(stats) @@ -1902,29 +1844,80 @@ cachestats(GCStats *stats) runtime_memclr((byte*)&mp->gcstats, sizeof(mp->gcstats)); } } + mstats.stacks_inuse = stacks_inuse; + mstats.mcache_inuse = runtime_mheap.cachealloc.inuse; + mstats.mspan_inuse = runtime_mheap.spanalloc.inuse; + mstats.sys = mstats.heap_sys + mstats.stacks_sys + mstats.mspan_sys + + mstats.mcache_sys + mstats.buckhash_sys + mstats.gc_sys + mstats.other_sys; + + // Calculate memory allocator stats. + // During program execution we only count number of frees and amount of freed memory. + // Current number of alive object in the heap and amount of alive heap memory + // are calculated by scanning all spans. + // Total number of mallocs is calculated as number of frees plus number of alive objects. + // Similarly, total amount of allocated memory is calculated as amount of freed memory + // plus amount of alive heap memory. + mstats.alloc = 0; + mstats.total_alloc = 0; + mstats.nmalloc = 0; + mstats.nfree = 0; + for(i = 0; i < nelem(mstats.by_size); i++) { + mstats.by_size[i].nmalloc = 0; + mstats.by_size[i].nfree = 0; + } + + // Flush MCache's to MCentral. for(pp=runtime_allp; (p=*pp) != nil; pp++) { c = p->mcache; if(c==nil) continue; - runtime_purgecachedstats(c); - for(i=0; i<nelem(c->local_by_size); i++) { - mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc; - c->local_by_size[i].nmalloc = 0; - mstats.by_size[i].nfree += c->local_by_size[i].nfree; - c->local_by_size[i].nfree = 0; + runtime_MCache_ReleaseAll(c); + } + + // Aggregate local stats. + cachestats(); + + // Scan all spans and count number of alive objects. + for(i = 0; i < runtime_mheap.nspan; i++) { + s = runtime_mheap.allspans[i]; + if(s->state != MSpanInUse) + continue; + if(s->sizeclass == 0) { + mstats.nmalloc++; + mstats.alloc += s->elemsize; + } else { + mstats.nmalloc += s->ref; + mstats.by_size[s->sizeclass].nmalloc += s->ref; + mstats.alloc += s->ref*s->elemsize; } } - mstats.stacks_inuse = stacks_inuse; + + // Aggregate by size class. 
+ smallfree = 0; + mstats.nfree = runtime_mheap.nlargefree; + for(i = 0; i < nelem(mstats.by_size); i++) { + mstats.nfree += runtime_mheap.nsmallfree[i]; + mstats.by_size[i].nfree = runtime_mheap.nsmallfree[i]; + mstats.by_size[i].nmalloc += runtime_mheap.nsmallfree[i]; + smallfree += runtime_mheap.nsmallfree[i] * runtime_class_to_size[i]; + } + mstats.nmalloc += mstats.nfree; + + // Calculate derived stats. + mstats.total_alloc = mstats.alloc + runtime_mheap.largefree + smallfree; + mstats.heap_alloc = mstats.alloc; + mstats.heap_objects = mstats.nmalloc - mstats.nfree; } // Structure of arguments passed to function gc(). -// This allows the arguments to be passed via reflect_call. +// This allows the arguments to be passed via runtime_mcall. struct gc_args { - int32 force; + int64 start_time; // start time of GC in ns (just before stoptheworld) }; static void gc(struct gc_args *args); +static void mgc(G *gp); static int32 readgogc(void) @@ -1943,8 +1936,9 @@ void runtime_gc(int32 force) { M *m; - const byte *p; - struct gc_args a, *ap; + G *g; + struct gc_args a; + int32 i; // The atomic operations are not atomic if the uint64s // are not aligned on uint64 boundaries. This has been @@ -1967,30 +1961,77 @@ runtime_gc(int32 force) // while holding a lock. The next mallocgc // without a lock will do the gc instead. m = runtime_m(); - if(!mstats.enablegc || m->locks > 0 || runtime_panicking) + if(!mstats.enablegc || runtime_g() == m->g0 || m->locks > 0 || runtime_panicking) return; if(gcpercent == GcpercentUnknown) { // first time through - gcpercent = readgogc(); - - p = runtime_getenv("GOGCTRACE"); - if(p != nil) - gctrace = runtime_atoi(p); + runtime_lock(&runtime_mheap); + if(gcpercent == GcpercentUnknown) + gcpercent = readgogc(); + runtime_unlock(&runtime_mheap); } if(gcpercent < 0) return; - // Run gc on a bigger stack to eliminate - // a potentially large number of calls to runtime_morestack. - // But not when using gccgo. - a.force = force; - ap = &a; - gc(ap); + runtime_semacquire(&runtime_worldsema, false); + if(!force && mstats.heap_alloc < mstats.next_gc) { + // typically threads which lost the race to grab + // worldsema exit here when gc is done. + runtime_semrelease(&runtime_worldsema); + return; + } - if(gctrace > 1 && !force) { - a.force = 1; - gc(&a); + // Ok, we're doing it! Stop everybody else + a.start_time = runtime_nanotime(); + m->gcing = 1; + runtime_stoptheworld(); + + // Run gc on the g0 stack. We do this so that the g stack + // we're currently running on will no longer change. Cuts + // the root set down a bit (g0 stacks are not scanned, and + // we don't need to scan gc's internal state). Also an + // enabler for copyable stacks. + for(i = 0; i < (runtime_debug.gctrace > 1 ? 
2 : 1); i++) { + // switch to g0, call gc(&a), then switch back + g = runtime_g(); + g->param = &a; + g->status = Gwaiting; + g->waitreason = "garbage collection"; + runtime_mcall(mgc); + // record a new start time in case we're going around again + a.start_time = runtime_nanotime(); } + + // all done + m->gcing = 0; + m->locks++; + runtime_semrelease(&runtime_worldsema); + runtime_starttheworld(); + m->locks--; + + // now that gc is done, kick off finalizer thread if needed + if(finq != nil) { + runtime_lock(&finlock); + // kick off or wake up goroutine to run queued finalizers + if(fing == nil) + fing = __go_go(runfinq, nil); + else if(fingwait) { + fingwait = 0; + runtime_ready(fing); + } + runtime_unlock(&finlock); + } + // give the queued finalizers, if any, a chance to run + runtime_gosched(); +} + +static void +mgc(G *gp) +{ + gc(gp->param); + gp->param = nil; + gp->status = Grunning; + runtime_gogo(gp); } static void @@ -2004,29 +2045,20 @@ gc(struct gc_args *args) uint32 i; // Eface eface; - runtime_semacquire(&runtime_worldsema); - if(!args->force && mstats.heap_alloc < mstats.next_gc) { - runtime_semrelease(&runtime_worldsema); - return; - } - m = runtime_m(); - t0 = runtime_nanotime(); - - m->gcing = 1; - runtime_stoptheworld(); + t0 = args->start_time; if(CollectStats) runtime_memclr((byte*)&gcstats, sizeof(gcstats)); for(mp=runtime_allm; mp; mp=mp->alllink) - runtime_settype_flush(mp, false); + runtime_settype_flush(mp); heap0 = 0; obj0 = 0; - if(gctrace) { - cachestats(nil); + if(runtime_debug.gctrace) { + updatememstats(nil); heap0 = mstats.heap_alloc; obj0 = mstats.nmalloc - mstats.nfree; } @@ -2050,7 +2082,7 @@ gc(struct gc_args *args) work.nproc = runtime_gcprocs(); addroots(); runtime_parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot); - runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap->nspan, nil, true, sweepspan); + runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap.nspan, nil, true, sweepspan); if(work.nproc > 1) { runtime_noteclear(&work.alldone); runtime_helpgc(work.nproc); @@ -2076,29 +2108,8 @@ gc(struct gc_args *args) if(work.nproc > 1) runtime_notesleep(&work.alldone); - cachestats(&stats); - - stats.nprocyield += work.sweepfor->nprocyield; - stats.nosyield += work.sweepfor->nosyield; - stats.nsleep += work.sweepfor->nsleep; - - mstats.next_gc = mstats.heap_alloc+(mstats.heap_alloc-runtime_stacks_sys)*gcpercent/100; - m->gcing = 0; - - if(finq != nil) { - m->locks++; // disable gc during the mallocs in newproc - // kick off or wake up goroutine to run queued finalizers - if(fing == nil) - fing = __go_go(runfinq, nil); - else if(fingwait) { - fingwait = 0; - runtime_ready(fing); - } - m->locks--; - } - - heap1 = mstats.heap_alloc; - obj1 = mstats.nmalloc - mstats.nfree; + cachestats(); + mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; t4 = runtime_nanotime(); mstats.last_gc = t4; @@ -2108,7 +2119,15 @@ gc(struct gc_args *args) if(mstats.debuggc) runtime_printf("pause %D\n", t4-t0); - if(gctrace) { + if(runtime_debug.gctrace) { + updatememstats(&stats); + heap1 = mstats.heap_alloc; + obj1 = mstats.nmalloc - mstats.nfree; + + stats.nprocyield += work.sweepfor->nprocyield; + stats.nosyield += work.sweepfor->nosyield; + stats.nsleep += work.sweepfor->nsleep; + runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects," " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n", mstats.numgc, work.nproc, (t2-t1)/1000000, (t3-t2)/1000000, (t1-t0+t4-t3)/1000000, @@ -2137,16 +2156,13 @@ gc(struct 
gc_args *args) runtime_printf("\ttotal:\t%D\n", ninstr); runtime_printf("putempty: %D, getfull: %D\n", gcstats.putempty, gcstats.getfull); + + runtime_printf("markonly base lookup: bit %D word %D span %D\n", gcstats.markonly.foundbit, gcstats.markonly.foundword, gcstats.markonly.foundspan); + runtime_printf("flushptrbuf base lookup: bit %D word %D span %D\n", gcstats.flushptrbuf.foundbit, gcstats.flushptrbuf.foundword, gcstats.flushptrbuf.foundspan); } } runtime_MProf_GC(); - runtime_semrelease(&runtime_worldsema); - runtime_starttheworld(); - - // give the queued finalizers, if any, a chance to run - if(finq != nil) - runtime_gosched(); } void runtime_ReadMemStats(MStats *) @@ -2161,15 +2177,17 @@ runtime_ReadMemStats(MStats *stats) // because stoptheworld can only be used by // one goroutine at a time, and there might be // a pending garbage collection already calling it. - runtime_semacquire(&runtime_worldsema); + runtime_semacquire(&runtime_worldsema, false); m = runtime_m(); m->gcing = 1; runtime_stoptheworld(); - cachestats(nil); + updatememstats(nil); *stats = mstats; m->gcing = 0; + m->locks++; runtime_semrelease(&runtime_worldsema); runtime_starttheworld(); + m->locks--; } void runtime_debug_readGCStats(Slice*) @@ -2187,7 +2205,7 @@ runtime_debug_readGCStats(Slice *pauses) // Pass back: pauses, last gc (absolute time), number of gc, total pause ns. p = (uint64*)pauses->array; - runtime_lock(runtime_mheap); + runtime_lock(&runtime_mheap); n = mstats.numgc; if(n > nelem(mstats.pause_ns)) n = nelem(mstats.pause_ns); @@ -2202,7 +2220,7 @@ runtime_debug_readGCStats(Slice *pauses) p[n] = mstats.last_gc; p[n+1] = mstats.numgc; p[n+2] = mstats.pause_total_ns; - runtime_unlock(runtime_mheap); + runtime_unlock(&runtime_mheap); pauses->__count = n+3; } @@ -2214,14 +2232,14 @@ runtime_debug_setGCPercent(intgo in) { intgo out; - runtime_lock(runtime_mheap); + runtime_lock(&runtime_mheap); if(gcpercent == GcpercentUnknown) gcpercent = readgogc(); out = gcpercent; if(in < 0) in = -1; gcpercent = in; - runtime_unlock(runtime_mheap); + runtime_unlock(&runtime_mheap); return out; } @@ -2235,6 +2253,8 @@ gchelperstart(void) runtime_throw("gchelperstart: bad m->helpgc"); if(runtime_xchg(&bufferList[m->helpgc].busy, 1)) runtime_throw("gchelperstart: already busy"); + if(runtime_g() != m->g0) + runtime_throw("gchelper not running on g0 stack"); } static void @@ -2243,33 +2263,51 @@ runfinq(void* dummy __attribute__ ((unused))) Finalizer *f; FinBlock *fb, *next; uint32 i; + Eface ef; + Iface iface; for(;;) { - // There's no need for a lock in this section - // because it only conflicts with the garbage - // collector, and the garbage collector only - // runs when everyone else is stopped, and - // runfinq only stops at the gosched() or - // during the calls in the for loop. 
+ runtime_lock(&finlock); fb = finq; finq = nil; if(fb == nil) { fingwait = 1; - runtime_park(nil, nil, "finalizer wait"); + runtime_park(runtime_unlock, &finlock, "finalizer wait"); continue; } + runtime_unlock(&finlock); if(raceenabled) runtime_racefingo(); for(; fb; fb=next) { next = fb->next; for(i=0; i<(uint32)fb->cnt; i++) { + const Type *fint; void *param; f = &fb->fin[i]; - param = &f->arg; + fint = ((const Type**)f->ft->__in.array)[0]; + if(fint->kind == KindPtr) { + // direct use of pointer + param = &f->arg; + } else if(((const InterfaceType*)fint)->__methods.__count == 0) { + // convert to empty interface + ef.type = (const Type*)f->ot; + ef.__object = f->arg; + param = &ef; + } else { + // convert to interface with methods + iface.__methods = __go_convert_interface_2((const Type*)fint, + (const Type*)f->ot, + 1); + iface.__object = f->arg; + if(iface.__methods == nil) + runtime_throw("invalid type conversion in runfinq"); + param = &iface; + } reflect_call(f->ft, f->fn, 0, 0, ¶m, nil); f->fn = nil; f->arg = nil; + f->ot = nil; } fb->cnt = 0; fb->next = finc; @@ -2280,28 +2318,28 @@ runfinq(void* dummy __attribute__ ((unused))) } // mark the block at v of size n as allocated. -// If noptr is true, mark it as having no pointers. +// If noscan is true, mark it as not needing scanning. void -runtime_markallocated(void *v, uintptr n, bool noptr) +runtime_markallocated(void *v, uintptr n, bool noscan) { uintptr *b, obits, bits, off, shift; if(0) runtime_printf("markallocated %p+%p\n", v, n); - if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) + if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) runtime_throw("markallocated: bad pointer"); - off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; // word offset - b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; // word offset + b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { obits = *b; bits = (obits & ~(bitMask<<shift)) | (bitAllocated<<shift); - if(noptr) - bits |= bitNoPointers<<shift; - if(runtime_singleproc) { + if(noscan) + bits |= bitNoScan<<shift; + if(runtime_gomaxprocs == 1) { *b = bits; break; } else { @@ -2319,19 +2357,19 @@ runtime_markfreed(void *v, uintptr n) uintptr *b, obits, bits, off, shift; if(0) - runtime_printf("markallocated %p+%p\n", v, n); + runtime_printf("markfreed %p+%p\n", v, n); - if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) - runtime_throw("markallocated: bad pointer"); + if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) + runtime_throw("markfreed: bad pointer"); - off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; // word offset - b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; // word offset + b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { obits = *b; bits = (obits & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); - if(runtime_singleproc) { + if(runtime_gomaxprocs == 1) { *b = bits; break; } else { @@ -2351,11 +2389,11 @@ runtime_checkfreed(void *v, uintptr n) if(!runtime_checking) return; - if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) + if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v 
< runtime_mheap.arena_start) return; // not allocated, so okay - off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; // word offset - b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; // word offset + b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; bits = *b>>shift; @@ -2374,7 +2412,7 @@ runtime_markspan(void *v, uintptr size, uintptr n, bool leftover) uintptr *b, off, shift; byte *p; - if((byte*)v+size*n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) + if((byte*)v+size*n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) runtime_throw("markspan: bad pointer"); p = v; @@ -2385,8 +2423,8 @@ runtime_markspan(void *v, uintptr size, uintptr n, bool leftover) // the entire span, and each bitmap word has bits for only // one span, so no other goroutines are changing these // bitmap words. - off = (uintptr*)p - (uintptr*)runtime_mheap->arena_start; // word offset - b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)p - (uintptr*)runtime_mheap.arena_start; // word offset + b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; *b = (*b & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); } @@ -2398,14 +2436,14 @@ runtime_unmarkspan(void *v, uintptr n) { uintptr *p, *b, off; - if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) + if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) runtime_throw("markspan: bad pointer"); p = v; - off = p - (uintptr*)runtime_mheap->arena_start; // word offset + off = p - (uintptr*)runtime_mheap.arena_start; // word offset if(off % wordsPerBitmapWord != 0) runtime_throw("markspan: unaligned pointer"); - b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; n /= PtrSize; if(n%wordsPerBitmapWord != 0) runtime_throw("unmarkspan: unaligned length"); @@ -2426,8 +2464,8 @@ runtime_blockspecial(void *v) if(DebugMark) return true; - off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; - b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; + b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; return (*b & (bitSpecial<<shift)) != 0; @@ -2441,8 +2479,8 @@ runtime_setblockspecial(void *v, bool s) if(DebugMark) return; - off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; - b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; + b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { @@ -2451,7 +2489,7 @@ runtime_setblockspecial(void *v, bool s) bits = obits | (bitSpecial<<shift); else bits = obits & ~(bitSpecial<<shift); - if(runtime_singleproc) { + if(runtime_gomaxprocs == 1) { *b = bits; break; } else { @@ -2476,13 +2514,13 @@ runtime_MHeap_MapBits(MHeap *h) uintptr n; n = (h->arena_used - h->arena_start) / wordsPerBitmapWord; - n = (n+bitmapChunk-1) & ~(bitmapChunk-1); + n = ROUND(n, bitmapChunk); if(h->bitmap_mapped >= n) return; page_size = getpagesize(); n = (n+page_size-1) & ~(page_size-1); - runtime_SysMap(h->arena_start - n, n - h->bitmap_mapped); + 
runtime_SysMap(h->arena_start - n, n - h->bitmap_mapped, &mstats.gc_sys); h->bitmap_mapped = n; } diff --git a/libgo/runtime/mgc0.h b/libgo/runtime/mgc0.h index d14fb37c209..f8abe6c9c1c 100644 --- a/libgo/runtime/mgc0.h +++ b/libgo/runtime/mgc0.h @@ -26,7 +26,6 @@ enum { GC_ARRAY_START, // Start an array with a fixed length. Args: (off, len, elemsize) GC_ARRAY_NEXT, // The next element of an array. Args: none GC_CALL, // Call a subroutine. Args: (off, objgcrel) - GC_MAP_PTR, // Go map. Args: (off, MapType*) GC_CHAN_PTR, // Go channel. Args: (off, ChanType*) GC_STRING, // Go string. Args: (off) GC_EFACE, // interface{}. Args: (off) diff --git a/libgo/runtime/mheap.c b/libgo/runtime/mheap.c index b4d94b68559..1b6cfd3dcde 100644 --- a/libgo/runtime/mheap.c +++ b/libgo/runtime/mheap.c @@ -36,12 +36,12 @@ RecordSpan(void *vh, byte *p) cap = 64*1024/sizeof(all[0]); if(cap < h->nspancap*3/2) cap = h->nspancap*3/2; - all = (MSpan**)runtime_SysAlloc(cap*sizeof(all[0])); + all = (MSpan**)runtime_SysAlloc(cap*sizeof(all[0]), &mstats.other_sys); if(all == nil) runtime_throw("runtime: cannot allocate memory"); if(h->allspans) { runtime_memmove(all, h->allspans, h->nspancap*sizeof(all[0])); - runtime_SysFree(h->allspans, h->nspancap*sizeof(all[0])); + runtime_SysFree(h->allspans, h->nspancap*sizeof(all[0]), &mstats.other_sys); } h->allspans = all; h->nspancap = cap; @@ -51,12 +51,12 @@ RecordSpan(void *vh, byte *p) // Initialize the heap; fetch memory using alloc. void -runtime_MHeap_Init(MHeap *h, void *(*alloc)(uintptr)) +runtime_MHeap_Init(MHeap *h) { uint32 i; - runtime_FixAlloc_Init(&h->spanalloc, sizeof(MSpan), alloc, RecordSpan, h); - runtime_FixAlloc_Init(&h->cachealloc, sizeof(MCache), alloc, nil, nil); + runtime_FixAlloc_Init(&h->spanalloc, sizeof(MSpan), RecordSpan, h, &mstats.mspan_sys); + runtime_FixAlloc_Init(&h->cachealloc, sizeof(MCache), nil, nil, &mstats.mcache_sys); // h->mapcache needs no init for(i=0; i<nelem(h->free); i++) runtime_MSpanList_Init(&h->free[i]); @@ -65,6 +65,23 @@ runtime_MHeap_Init(MHeap *h, void *(*alloc)(uintptr)) runtime_MCentral_Init(&h->central[i], i); } +void +runtime_MHeap_MapSpans(MHeap *h) +{ + uintptr n; + + // Map spans array, PageSize at a time. + n = (uintptr)h->arena_used; + if(sizeof(void*) == 8) + n -= (uintptr)h->arena_start; + n = n / PageSize * sizeof(h->spans[0]); + n = ROUND(n, PageSize); + if(h->spans_mapped >= n) + return; + runtime_SysMap((byte*)h->spans + h->spans_mapped, n - h->spans_mapped, &mstats.other_sys); + h->spans_mapped = n; +} + // Allocate a new span of npage pages from the heap // and record its size class in the HeapMap and HeapMapCache. MSpan* @@ -73,7 +90,8 @@ runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 MSpan *s; runtime_lock(h); - runtime_purgecachedstats(runtime_m()->mcache); + mstats.heap_alloc += runtime_m()->mcache->local_cachealloc; + runtime_m()->mcache->local_cachealloc = 0; s = MHeap_AllocLocked(h, npage, sizeclass); if(s != nil) { mstats.heap_inuse += npage<<PageShift; @@ -138,6 +156,7 @@ HaveSpan: // is just a unique constant not seen elsewhere in the // runtime, as a clue in case it turns up unexpectedly in // memory or in a stack trace. + runtime_SysUsed((void*)(s->start<<PageShift), s->npages<<PageShift); *(uintptr*)(s->start<<PageShift) = (uintptr)0xbeadbeadbeadbeadULL; } s->npreleased = 0; @@ -145,17 +164,15 @@ HaveSpan: if(s->npages > npage) { // Trim extra and put it back in the heap. 
t = runtime_FixAlloc_Alloc(&h->spanalloc); - mstats.mspan_inuse = h->spanalloc.inuse; - mstats.mspan_sys = h->spanalloc.sys; runtime_MSpan_Init(t, s->start + npage, s->npages - npage); s->npages = npage; p = t->start; if(sizeof(void*) == 8) p -= ((uintptr)h->arena_start>>PageShift); if(p > 0) - h->map[p-1] = s; - h->map[p] = t; - h->map[p+t->npages-1] = t; + h->spans[p-1] = s; + h->spans[p] = t; + h->spans[p+t->npages-1] = t; *(uintptr*)(t->start<<PageShift) = *(uintptr*)(s->start<<PageShift); // copy "needs zeroing" mark t->state = MSpanInUse; MHeap_FreeLocked(h, t); @@ -172,7 +189,7 @@ HaveSpan: if(sizeof(void*) == 8) p -= ((uintptr)h->arena_start>>PageShift); for(n=0; n<npage; n++) - h->map[p+n] = s; + h->spans[p+n] = s; return s; } @@ -232,19 +249,16 @@ MHeap_Grow(MHeap *h, uintptr npage) return false; } } - mstats.heap_sys += ask; // Create a fake "in use" span and free it, so that the // right coalescing happens. s = runtime_FixAlloc_Alloc(&h->spanalloc); - mstats.mspan_inuse = h->spanalloc.inuse; - mstats.mspan_sys = h->spanalloc.sys; runtime_MSpan_Init(s, (uintptr)v>>PageShift, ask>>PageShift); p = s->start; if(sizeof(void*) == 8) p -= ((uintptr)h->arena_start>>PageShift); - h->map[p] = s; - h->map[p + s->npages - 1] = s; + h->spans[p] = s; + h->spans[p + s->npages - 1] = s; s->state = MSpanInUse; MHeap_FreeLocked(h, s); return true; @@ -261,7 +275,7 @@ runtime_MHeap_Lookup(MHeap *h, void *v) p = (uintptr)v; if(sizeof(void*) == 8) p -= (uintptr)h->arena_start; - return h->map[p >> PageShift]; + return h->spans[p >> PageShift]; } // Look up the span at the given address. @@ -283,10 +297,8 @@ runtime_MHeap_LookupMaybe(MHeap *h, void *v) q = p; if(sizeof(void*) == 8) q -= (uintptr)h->arena_start >> PageShift; - s = h->map[q]; - if(s == nil || p < s->start || p - s->start >= s->npages) - return nil; - if(s->state != MSpanInUse) + s = h->spans[q]; + if(s == nil || p < s->start || (byte*)v >= s->limit || s->state != MSpanInUse) return nil; return s; } @@ -296,7 +308,8 @@ void runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct) { runtime_lock(h); - runtime_purgecachedstats(runtime_m()->mcache); + mstats.heap_alloc += runtime_m()->mcache->local_cachealloc; + runtime_m()->mcache->local_cachealloc = 0; mstats.heap_inuse -= s->npages<<PageShift; if(acct) { mstats.heap_alloc -= s->npages<<PageShift; @@ -313,8 +326,6 @@ MHeap_FreeLocked(MHeap *h, MSpan *s) MSpan *t; PageID p; - if(s->types.sysalloc) - runtime_settype_sysfree(s); s->types.compression = MTypes_Empty; if(s->state != MSpanInUse || s->ref != 0) { @@ -334,31 +345,31 @@ MHeap_FreeLocked(MHeap *h, MSpan *s) p = s->start; if(sizeof(void*) == 8) p -= (uintptr)h->arena_start >> PageShift; - if(p > 0 && (t = h->map[p-1]) != nil && t->state != MSpanInUse) { - tp = (uintptr*)(t->start<<PageShift); - *tp |= *sp; // propagate "needs zeroing" mark + if(p > 0 && (t = h->spans[p-1]) != nil && t->state != MSpanInUse) { + if(t->npreleased == 0) { // cant't touch this otherwise + tp = (uintptr*)(t->start<<PageShift); + *tp |= *sp; // propagate "needs zeroing" mark + } s->start = t->start; s->npages += t->npages; s->npreleased = t->npreleased; // absorb released pages p -= t->npages; - h->map[p] = s; + h->spans[p] = s; runtime_MSpanList_Remove(t); t->state = MSpanDead; runtime_FixAlloc_Free(&h->spanalloc, t); - mstats.mspan_inuse = h->spanalloc.inuse; - mstats.mspan_sys = h->spanalloc.sys; } - if(p+s->npages < nelem(h->map) && (t = h->map[p+s->npages]) != nil && t->state != MSpanInUse) { - tp = (uintptr*)(t->start<<PageShift); - *sp |= *tp; // 
propagate "needs zeroing" mark + if((p+s->npages)*sizeof(h->spans[0]) < h->spans_mapped && (t = h->spans[p+s->npages]) != nil && t->state != MSpanInUse) { + if(t->npreleased == 0) { // cant't touch this otherwise + tp = (uintptr*)(t->start<<PageShift); + *sp |= *tp; // propagate "needs zeroing" mark + } s->npages += t->npages; s->npreleased += t->npreleased; - h->map[p + s->npages - 1] = s; + h->spans[p + s->npages - 1] = s; runtime_MSpanList_Remove(t); t->state = MSpanDead; runtime_FixAlloc_Free(&h->spanalloc, t); - mstats.mspan_inuse = h->spanalloc.inuse; - mstats.mspan_sys = h->spanalloc.sys; } // Insert s into appropriate list. @@ -388,7 +399,7 @@ scavengelist(MSpan *list, uint64 now, uint64 limit) sumreleased = 0; for(s=list->next; s != list; s=s->next) { - if((now - s->unusedsince) > limit) { + if((now - s->unusedsince) > limit && s->npreleased != s->npages) { released = (s->npages - s->npreleased) << PageShift; mstats.heap_released += released; sumreleased += released; @@ -399,19 +410,26 @@ scavengelist(MSpan *list, uint64 now, uint64 limit) return sumreleased; } -static uintptr -scavenge(uint64 now, uint64 limit) +static void +scavenge(int32 k, uint64 now, uint64 limit) { uint32 i; uintptr sumreleased; MHeap *h; - h = runtime_mheap; + h = &runtime_mheap; sumreleased = 0; for(i=0; i < nelem(h->free); i++) sumreleased += scavengelist(&h->free[i], now, limit); sumreleased += scavengelist(&h->large, now, limit); - return sumreleased; + + if(runtime_debug.gctrace > 0) { + if(sumreleased > 0) + runtime_printf("scvg%d: %D MB released\n", k, (uint64)sumreleased>>20); + runtime_printf("scvg%d: inuse: %D, idle: %D, sys: %D, released: %D, consumed: %D (MB)\n", + k, mstats.heap_inuse>>20, mstats.heap_idle>>20, mstats.heap_sys>>20, + mstats.heap_released>>20, (mstats.heap_sys - mstats.heap_released)>>20); + } } // Release (part of) unused memory to OS. 
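[Editor's note] Before the scavenger-loop hunk that follows, it may help to see the release policy of scavengelist in isolation. This is a simplified sketch rather than the runtime's code: the span list here is NULL-terminated instead of the circular MSpanList, madvise(MADV_DONTNEED) stands in for runtime_SysUnused, the page size is assumed to be 4K, and the mstats updates are left out.

#include <stdint.h>
#include <sys/mman.h>

enum { PageShift = 12 };          /* assumption: 4K pages for the sketch */

typedef struct Span Span;
struct Span {
	Span      *next;
	void      *base;          /* start address of the span's memory */
	uintptr_t  npages;        /* pages in the span */
	uintptr_t  npreleased;    /* pages already returned to the OS */
	uint64_t   unusedsince;   /* ns timestamp when the span went idle */
};

/* Release spans that have been idle longer than limit and still hold
   unreleased pages; returns the number of bytes handed back. */
static uintptr_t
scavenge_list(Span *list, uint64_t now, uint64_t limit)
{
	uintptr_t sumreleased, released;
	Span *s;

	sumreleased = 0;
	for(s = list; s != NULL; s = s->next) {
		if((now - s->unusedsince) <= limit || s->npreleased == s->npages)
			continue;
		released = (s->npages - s->npreleased) << PageShift;
		madvise(s->base, s->npages << PageShift, MADV_DONTNEED);  /* ~ runtime_SysUnused */
		s->npreleased = s->npages;
		sumreleased += released;
	}
	return sumreleased;
}

The extra s->npreleased == s->npages test corresponds to the guard the patch adds to scavengelist above, so a span whose pages were already handed back is not advised again on every pass.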
@@ -424,9 +442,6 @@ runtime_MHeap_Scavenger(void* dummy) MHeap *h; uint64 tick, now, forcegc, limit; uint32 k; - uintptr sumreleased; - const byte *env; - bool trace; Note note, *notep; USED(dummy); @@ -446,17 +461,10 @@ runtime_MHeap_Scavenger(void* dummy) else tick = limit/2; - trace = false; - env = runtime_getenv("GOGCTRACE"); - if(env != nil) - trace = runtime_atoi(env) > 0; - - h = runtime_mheap; + h = &runtime_mheap; for(k=0;; k++) { runtime_noteclear(&note); - runtime_entersyscallblock(); - runtime_notetsleep(&note, tick); - runtime_exitsyscall(); + runtime_notetsleepg(&note, tick); runtime_lock(h); now = runtime_nanotime(); @@ -468,24 +476,14 @@ runtime_MHeap_Scavenger(void* dummy) runtime_noteclear(&note); notep = &note; __go_go(forcegchelper, (void*)notep); - runtime_entersyscallblock(); - runtime_notesleep(&note); - runtime_exitsyscall(); - if(trace) + runtime_notetsleepg(&note, -1); + if(runtime_debug.gctrace > 0) runtime_printf("scvg%d: GC forced\n", k); runtime_lock(h); now = runtime_nanotime(); } - sumreleased = scavenge(now, limit); + scavenge(k, now, limit); runtime_unlock(h); - - if(trace) { - if(sumreleased > 0) - runtime_printf("scvg%d: %p MB released\n", k, sumreleased>>20); - runtime_printf("scvg%d: inuse: %D, idle: %D, sys: %D, released: %D, consumed: %D (MB)\n", - k, mstats.heap_inuse>>20, mstats.heap_idle>>20, mstats.heap_sys>>20, - mstats.heap_released>>20, (mstats.heap_sys - mstats.heap_released)>>20); - } } } @@ -495,9 +493,9 @@ void runtime_debug_freeOSMemory(void) { runtime_gc(1); - runtime_lock(runtime_mheap); - scavenge(~(uintptr)0, 0); - runtime_unlock(runtime_mheap); + runtime_lock(&runtime_mheap); + scavenge(-1, ~(uintptr)0, 0); + runtime_unlock(&runtime_mheap); } // Initialize a new span with the given start and npages. diff --git a/libgo/runtime/mprof.goc b/libgo/runtime/mprof.goc index 73d937908c6..7507dfc9173 100644 --- a/libgo/runtime/mprof.goc +++ b/libgo/runtime/mprof.goc @@ -14,44 +14,11 @@ package runtime #include "go-string.h" // NOTE(rsc): Everything here could use cas if contention became an issue. -static Lock proflock, alloclock; +static Lock proflock; // All memory allocations are local and do not escape outside of the profiler. // The profiler is forbidden from referring to garbage-collected memory. -static byte *pool; // memory allocation pool -static uintptr poolfree; // number of bytes left in the pool -enum { - Chunk = 32*PageSize, // initial size of the pool -}; - -// Memory allocation local to this file. -// There is no way to return the allocated memory back to the OS. -static void* -allocate(uintptr size) -{ - void *v; - - if(size == 0) - return nil; - - if(size >= Chunk/2) - return runtime_SysAlloc(size); - - runtime_lock(&alloclock); - if(size > poolfree) { - pool = runtime_SysAlloc(Chunk); - if(pool == nil) - runtime_throw("runtime: cannot allocate memory"); - poolfree = Chunk; - } - v = pool; - pool += size; - poolfree -= size; - runtime_unlock(&alloclock); - return v; -} - enum { MProf, BProf }; // profile types // Per-call-stack profiling information. @@ -104,10 +71,9 @@ stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) Bucket *b; if(buckhash == nil) { - buckhash = runtime_SysAlloc(BuckHashSize*sizeof buckhash[0]); + buckhash = runtime_SysAlloc(BuckHashSize*sizeof buckhash[0], &mstats.buckhash_sys); if(buckhash == nil) runtime_throw("runtime: cannot allocate memory"); - mstats.buckhash_sys += BuckHashSize*sizeof buckhash[0]; } // Hash stack.
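In the mprof.goc hunks above, the file-local allocate() pool (and its alloclock) is removed, and runtime_SysAlloc now takes a pointer to the statistic it should charge (&mstats.buckhash_sys for the bucket hash), so accounting happens inside the allocator instead of by hand at each call site. A minimal C sketch of that allocate-and-charge pattern — sys_alloc() is a stand-in built on calloc and the counters merely mimic mstats fields:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static uint64_t buckhash_sys;                   /* analogue of mstats.buckhash_sys */
static uint64_t other_sys;                      /* analogue of mstats.other_sys */

/* Allocate zeroed memory and bill it to the given statistic in one step,
 * so a caller can no longer allocate without updating the counter. */
static void*
sys_alloc(size_t n, uint64_t *stat)
{
	void *v = calloc(1, n);
	if(v != NULL)
		*stat += n;
	return v;
}

int
main(void)
{
	enum { BuckHashSize = 179999 };         /* illustrative table size */
	void *buckhash = sys_alloc(BuckHashSize*sizeof(void*), &buckhash_sys);
	void *scratch  = sys_alloc(4096, &other_sys);
	printf("buckhash_sys=%llu bytes, other_sys=%llu bytes\n",
	       (unsigned long long)buckhash_sys, (unsigned long long)other_sys);
	free(scratch);
	free(buckhash);
	return 0;
}

The remaining mprof hunks below switch the bucket and address-hash allocations to runtime_persistentalloc with the same counter.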
@@ -137,9 +103,7 @@ stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) if(!alloc) return nil; - b = allocate(sizeof *b + nstk*sizeof stk[0]); - if(b == nil) - runtime_throw("runtime: cannot allocate memory"); + b = runtime_persistentalloc(sizeof *b + nstk*sizeof stk[0], 0, &mstats.buckhash_sys); bucketmem += sizeof *b + nstk*sizeof stk[0]; runtime_memmove(b->stk, stk, nstk*sizeof stk[0]); b->typ = typ; @@ -241,7 +205,7 @@ setaddrbucket(uintptr addr, Bucket *b) if(ah->addr == (addr>>AddrHashShift)) goto found; - ah = allocate(sizeof *ah); + ah = runtime_persistentalloc(sizeof *ah, 0, &mstats.buckhash_sys); addrmem += sizeof *ah; ah->next = addrhash[h]; ah->addr = addr>>AddrHashShift; @@ -249,7 +213,7 @@ setaddrbucket(uintptr addr, Bucket *b) found: if((e = addrfree) == nil) { - e = allocate(64*sizeof *e); + e = runtime_persistentalloc(64*sizeof *e, 0, &mstats.buckhash_sys); addrmem += 64*sizeof *e; for(i=0; i+1<64; i++) e[i].next = &e[i+1]; @@ -296,16 +260,10 @@ found: void runtime_MProf_Malloc(void *p, uintptr size) { - M *m; int32 nstk; Location stk[32]; Bucket *b; - m = runtime_m(); - if(m->nomemprof > 0) - return; - - m->nomemprof++; nstk = runtime_callers(1, stk, 32); runtime_lock(&proflock); b = stkbucket(MProf, stk, nstk, true); @@ -313,22 +271,14 @@ runtime_MProf_Malloc(void *p, uintptr size) b->recent_alloc_bytes += size; setaddrbucket((uintptr)p, b); runtime_unlock(&proflock); - m = runtime_m(); - m->nomemprof--; } // Called when freeing a profiled block. void runtime_MProf_Free(void *p, uintptr size) { - M *m; Bucket *b; - m = runtime_m(); - if(m->nomemprof > 0) - return; - - m->nomemprof++; runtime_lock(&proflock); b = getaddrbucket((uintptr)p); if(b != nil) { @@ -336,8 +286,6 @@ runtime_MProf_Free(void *p, uintptr size) b->recent_free_bytes += size; } runtime_unlock(&proflock); - m = runtime_m(); - m->nomemprof--; } int64 runtime_blockprofilerate; // in CPU ticks @@ -347,7 +295,17 @@ void runtime_SetBlockProfileRate(intgo) __asm__ (GOSYM_PREFIX "runtime.SetBlockP void runtime_SetBlockProfileRate(intgo rate) { - runtime_atomicstore64((uint64*)&runtime_blockprofilerate, rate * runtime_tickspersecond() / (1000*1000*1000)); + int64 r; + + if(rate <= 0) + r = 0; // disable profiling + else { + // convert ns to cycles, use float64 to prevent overflow during multiplication + r = (float64)rate*runtime_tickspersecond()/(1000*1000*1000); + if(r == 0) + r = 1; + } + runtime_atomicstore64((uint64*)&runtime_blockprofilerate, r); } void @@ -510,10 +468,10 @@ func Stack(b Slice, all bool) (n int) { bool enablegc; sp = runtime_getcallersp(&b); - pc = runtime_getcallerpc(&b); + pc = (byte*)(uintptr)runtime_getcallerpc(&b); if(all) { - runtime_semacquire(&runtime_worldsema); + runtime_semacquire(&runtime_worldsema, false); runtime_m()->gcing = 1; runtime_stoptheworld(); enablegc = mstats.enablegc; @@ -530,7 +488,7 @@ func Stack(b Slice, all bool) (n int) { USED(sp); runtime_goroutineheader(g); runtime_traceback(); - runtime_goroutinetrailer(g); + runtime_printcreatedby(g); if(all) runtime_tracebackothers(g); n = b.__count - g->writenbuf; @@ -572,7 +530,7 @@ func GoroutineProfile(b Slice) (n int, ok bool) { ok = false; n = runtime_gcount(); if(n <= b.__count) { - runtime_semacquire(&runtime_worldsema); + runtime_semacquire(&runtime_worldsema, false); runtime_m()->gcing = 1; runtime_stoptheworld(); @@ -598,5 +556,5 @@ func GoroutineProfile(b Slice) (n int, ok bool) { void runtime_mprofinit(void) { - addrhash = allocate((1<<AddrHashBits)*sizeof *addrhash); + addrhash = 
runtime_persistentalloc((1<<AddrHashBits)*sizeof *addrhash, 0, &mstats.buckhash_sys); } diff --git a/libgo/runtime/msize.c b/libgo/runtime/msize.c index 3b5591c1b17..745a76958c8 100644 --- a/libgo/runtime/msize.c +++ b/libgo/runtime/msize.c @@ -31,7 +31,6 @@ int32 runtime_class_to_size[NumSizeClasses]; int32 runtime_class_to_allocnpages[NumSizeClasses]; -int32 runtime_class_to_transfercount[NumSizeClasses]; // The SizeToClass lookup is implemented using two arrays, // one mapping sizes <= 1024 to their class and one mapping @@ -42,17 +41,17 @@ int32 runtime_class_to_transfercount[NumSizeClasses]; // size divided by 128 (rounded up). The arrays are filled in // by InitSizes. -static int32 size_to_class8[1024/8 + 1]; -static int32 size_to_class128[(MaxSmallSize-1024)/128 + 1]; +int8 runtime_size_to_class8[1024/8 + 1]; +int8 runtime_size_to_class128[(MaxSmallSize-1024)/128 + 1]; -int32 -runtime_SizeToClass(int32 size) +static int32 +SizeToClass(int32 size) { if(size > MaxSmallSize) runtime_throw("SizeToClass - invalid size"); if(size > 1024-8) - return size_to_class128[(size-1024+127) >> 7]; - return size_to_class8[(size+7)>>3]; + return runtime_size_to_class128[(size-1024+127) >> 7]; + return runtime_size_to_class8[(size+7)>>3]; } void @@ -111,16 +110,16 @@ runtime_InitSizes(void) nextsize = 0; for (sizeclass = 1; sizeclass < NumSizeClasses; sizeclass++) { for(; nextsize < 1024 && nextsize <= runtime_class_to_size[sizeclass]; nextsize+=8) - size_to_class8[nextsize/8] = sizeclass; + runtime_size_to_class8[nextsize/8] = sizeclass; if(nextsize >= 1024) for(; nextsize <= runtime_class_to_size[sizeclass]; nextsize += 128) - size_to_class128[(nextsize-1024)/128] = sizeclass; + runtime_size_to_class128[(nextsize-1024)/128] = sizeclass; } // Double-check SizeToClass. if(0) { for(n=0; n < MaxSmallSize; n++) { - sizeclass = runtime_SizeToClass(n); + sizeclass = SizeToClass(n); if(sizeclass < 1 || sizeclass >= NumSizeClasses || runtime_class_to_size[sizeclass] < n) { runtime_printf("size=%d sizeclass=%d runtime_class_to_size=%d\n", n, sizeclass, runtime_class_to_size[sizeclass]); runtime_printf("incorrect SizeToClass"); @@ -137,16 +136,6 @@ runtime_InitSizes(void) // Copy out for statistics table. for(i=0; i<nelem(runtime_class_to_size); i++) mstats.by_size[i].size = runtime_class_to_size[i]; - - // Initialize the runtime_class_to_transfercount table. 
- for(sizeclass = 1; sizeclass < NumSizeClasses; sizeclass++) { - n = 64*1024 / runtime_class_to_size[sizeclass]; - if(n < 2) - n = 2; - if(n > 32) - n = 32; - runtime_class_to_transfercount[sizeclass] = n; - } return; dump: @@ -157,12 +146,14 @@ dump: runtime_printf(" %d", runtime_class_to_size[sizeclass]); runtime_printf("\n\n"); runtime_printf("size_to_class8:"); - for(i=0; i<nelem(size_to_class8); i++) - runtime_printf(" %d=>%d(%d)\n", i*8, size_to_class8[i], runtime_class_to_size[size_to_class8[i]]); + for(i=0; i<nelem(runtime_size_to_class8); i++) + runtime_printf(" %d=>%d(%d)\n", i*8, runtime_size_to_class8[i], + runtime_class_to_size[runtime_size_to_class8[i]]); runtime_printf("\n"); runtime_printf("size_to_class128:"); - for(i=0; i<nelem(size_to_class128); i++) - runtime_printf(" %d=>%d(%d)\n", i*128, size_to_class128[i], runtime_class_to_size[size_to_class128[i]]); + for(i=0; i<nelem(runtime_size_to_class128); i++) + runtime_printf(" %d=>%d(%d)\n", i*128, runtime_size_to_class128[i], + runtime_class_to_size[runtime_size_to_class128[i]]); runtime_printf("\n"); } runtime_throw("InitSizes failed"); diff --git a/libgo/runtime/netpoll.goc b/libgo/runtime/netpoll.goc index a0bd735f85c..02705734dd8 100644 --- a/libgo/runtime/netpoll.goc +++ b/libgo/runtime/netpoll.goc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build darwin linux +// +build darwin dragonfly freebsd linux netbsd openbsd windows package net @@ -19,7 +19,7 @@ package net // Integrated network poller (platform-independent part). // A particular implementation (epoll/kqueue) must define the following functions: // void runtime_netpollinit(void); // to initialize the poller -// int32 runtime_netpollopen(int32 fd, PollDesc *pd); // to arm edge-triggered notifications +// int32 runtime_netpollopen(uintptr fd, PollDesc *pd); // to arm edge-triggered notifications // and associate fd with pd. // An implementation must call the following function to denote that the pd is ready. // void runtime_netpollready(G **gpp, PollDesc *pd, int32 mode); @@ -30,7 +30,7 @@ struct PollDesc { PollDesc* link; // in pollcache, protected by pollcache.Lock Lock; // protectes the following fields - int32 fd; + uintptr fd; bool closing; uintptr seq; // protects from stale timers and ready notifications G* rg; // G waiting for read or READY (binary semaphore) @@ -52,8 +52,8 @@ static struct // seq is incremented when deadlines are changed or descriptor is reused. } pollcache; -static void netpollblock(PollDesc*, int32); -static G* netpollunblock(PollDesc*, int32); +static bool netpollblock(PollDesc*, int32); +static G* netpollunblock(PollDesc*, int32, bool); static void deadline(int64, Eface); static void readDeadline(int64, Eface); static void writeDeadline(int64, Eface); @@ -68,7 +68,7 @@ func runtime_pollServerInit() { runtime_netpollinit(); } -func runtime_pollOpen(fd int) (pd *PollDesc, errno int) { +func runtime_pollOpen(fd uintptr) (pd *PollDesc, errno int) { pd = allocPollDesc(); runtime_lock(pd); if(pd->wg != nil && pd->wg != READY) @@ -117,18 +117,35 @@ ret: func runtime_pollWait(pd *PollDesc, mode int) (err int) { runtime_lock(pd); err = checkerr(pd, mode); - if(err) - goto ret; - netpollblock(pd, mode); - err = checkerr(pd, mode); -ret: + if(err == 0) { + while(!netpollblock(pd, mode)) { + err = checkerr(pd, mode); + if(err != 0) + break; + // Can happen if timeout has fired and unblocked us, + // but before we had a chance to run, timeout has been reset. 
+ // Pretend it has not happened and retry. + } + } + runtime_unlock(pd); +} + +func runtime_pollWaitCanceled(pd *PollDesc, mode int) { + runtime_lock(pd); + // wait for ioready, ignore closing or timeouts. + while(!netpollblock(pd, mode)) + ; runtime_unlock(pd); } func runtime_pollSetDeadline(pd *PollDesc, d int64, mode int) { + G *rg, *wg; + runtime_lock(pd); - if(pd->closing) - goto ret; + if(pd->closing) { + runtime_unlock(pd); + return; + } pd->seq++; // invalidate current timers // Reset current timers. if(pd->rt.fv) { @@ -140,9 +157,8 @@ func runtime_pollSetDeadline(pd *PollDesc, d int64, mode int) { pd->wt.fv = nil; } // Setup new timers. - if(d != 0 && d <= runtime_nanotime()) { + if(d != 0 && d <= runtime_nanotime()) d = -1; - } if(mode == 'r' || mode == 'r'+'w') pd->rd = d; if(mode == 'w' || mode == 'r'+'w') @@ -172,8 +188,18 @@ func runtime_pollSetDeadline(pd *PollDesc, d int64, mode int) { runtime_addtimer(&pd->wt); } } -ret: + // If we set the new deadline in the past, unblock currently pending IO if any. + rg = nil; + wg = nil; + if(pd->rd < 0) + rg = netpollunblock(pd, 'r', false); + if(pd->wd < 0) + wg = netpollunblock(pd, 'w', false); runtime_unlock(pd); + if(rg) + runtime_ready(rg); + if(wg) + runtime_ready(wg); } func runtime_pollUnblock(pd *PollDesc) { @@ -184,8 +210,8 @@ func runtime_pollUnblock(pd *PollDesc) { runtime_throw("runtime_pollUnblock: already closing"); pd->closing = true; pd->seq++; - rg = netpollunblock(pd, 'r'); - wg = netpollunblock(pd, 'w'); + rg = netpollunblock(pd, 'r', false); + wg = netpollunblock(pd, 'w', false); if(pd->rt.fv) { runtime_deltimer(&pd->rt); pd->rt.fv = nil; @@ -201,6 +227,12 @@ func runtime_pollUnblock(pd *PollDesc) { runtime_ready(wg); } +uintptr +runtime_netpollfd(PollDesc *pd) +{ + return pd->fd; +} + // make pd ready, newly runnable goroutines (if any) are enqueued info gpp list void runtime_netpollready(G **gpp, PollDesc *pd, int32 mode) @@ -210,9 +242,9 @@ runtime_netpollready(G **gpp, PollDesc *pd, int32 mode) rg = wg = nil; runtime_lock(pd); if(mode == 'r' || mode == 'r'+'w') - rg = netpollunblock(pd, 'r'); + rg = netpollunblock(pd, 'r', true); if(mode == 'w' || mode == 'r'+'w') - wg = netpollunblock(pd, 'w'); + wg = netpollunblock(pd, 'w', true); runtime_unlock(pd); if(rg) { rg->schedlink = *gpp; @@ -234,7 +266,8 @@ checkerr(PollDesc *pd, int32 mode) return 0; } -static void +// returns true if IO is ready, or false if timedout or closed +static bool netpollblock(PollDesc *pd, int32 mode) { G **gpp; @@ -244,17 +277,20 @@ netpollblock(PollDesc *pd, int32 mode) gpp = &pd->wg; if(*gpp == READY) { *gpp = nil; - return; + return true; } if(*gpp != nil) - runtime_throw("epoll: double wait"); + runtime_throw("netpollblock: double wait"); *gpp = runtime_g(); runtime_park(runtime_unlock, &pd->Lock, "IO wait"); runtime_lock(pd); + if(runtime_g()->param) + return true; + return false; } static G* -netpollunblock(PollDesc *pd, int32 mode) +netpollunblock(PollDesc *pd, int32 mode, bool ioready) { G **gpp, *old; @@ -264,10 +300,15 @@ netpollunblock(PollDesc *pd, int32 mode) if(*gpp == READY) return nil; if(*gpp == nil) { - *gpp = READY; + // Only set READY for ioready. runtime_pollWait + // will check for timeout/cancel before waiting. 
+ if(ioready) + *gpp = READY; return nil; } old = *gpp; + // pass unblock reason onto blocked g + old->param = (void*)(uintptr)ioready; *gpp = nil; return old; } @@ -296,14 +337,14 @@ deadlineimpl(int64 now, Eface arg, bool read, bool write) runtime_throw("deadlineimpl: inconsistent read deadline"); pd->rd = -1; pd->rt.fv = nil; - rg = netpollunblock(pd, 'r'); + rg = netpollunblock(pd, 'r', false); } if(write) { if(pd->wd <= 0 || (pd->wt.fv == nil && !read)) runtime_throw("deadlineimpl: inconsistent write deadline"); pd->wd = -1; pd->wt.fv = nil; - wg = netpollunblock(pd, 'w'); + wg = netpollunblock(pd, 'w', false); } runtime_unlock(pd); if(rg) @@ -343,7 +384,7 @@ allocPollDesc(void) n = 1; // Must be in non-GC memory because can be referenced // only from epoll/kqueue internals. - pd = runtime_SysAlloc(n*sizeof(*pd)); + pd = runtime_persistentalloc(n*sizeof(*pd), 0, &mstats.other_sys); for(i = 0; i < n; i++) { pd[i].link = pollcache.first; pollcache.first = &pd[i]; diff --git a/libgo/runtime/netpoll_epoll.c b/libgo/runtime/netpoll_epoll.c index 98c5cbeb587..b98aa818c89 100644 --- a/libgo/runtime/netpoll_epoll.c +++ b/libgo/runtime/netpoll_epoll.c @@ -94,24 +94,24 @@ runtime_netpollinit(void) } int32 -runtime_netpollopen(int32 fd, PollDesc *pd) +runtime_netpollopen(uintptr fd, PollDesc *pd) { EpollEvent ev; int32 res; ev.events = EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET; ev.data.ptr = (void*)pd; - res = runtime_epollctl(epfd, EPOLL_CTL_ADD, fd, &ev); + res = runtime_epollctl(epfd, EPOLL_CTL_ADD, (int32)fd, &ev); return -res; } int32 -runtime_netpollclose(int32 fd) +runtime_netpollclose(uintptr fd) { EpollEvent ev; int32 res; - res = runtime_epollctl(epfd, EPOLL_CTL_DEL, fd, &ev); + res = runtime_epollctl(epfd, EPOLL_CTL_DEL, (int32)fd, &ev); return -res; } diff --git a/libgo/runtime/netpoll_kqueue.c b/libgo/runtime/netpoll_kqueue.c index 9b79b2020df..78901611884 100644 --- a/libgo/runtime/netpoll_kqueue.c +++ b/libgo/runtime/netpoll_kqueue.c @@ -2,10 +2,11 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build darwin +// +build darwin dragonfly freebsd netbsd openbsd #include "runtime.h" #include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" // Integrated network poller (kqueue-based implementation). @@ -27,7 +28,7 @@ runtime_netpollinit(void) } int32 -runtime_netpollopen(int32 fd, PollDesc *pd) +runtime_netpollopen(uintptr fd, PollDesc *pd) { Kevent ev[2]; int32 n; @@ -35,30 +36,22 @@ runtime_netpollopen(int32 fd, PollDesc *pd) // Arm both EVFILT_READ and EVFILT_WRITE in edge-triggered mode (EV_CLEAR) // for the whole fd lifetime. The notifications are automatically unregistered // when fd is closed. 
- ev[0].ident = fd; + ev[0].ident = (uint32)fd; ev[0].filter = EVFILT_READ; - ev[0].flags = EV_ADD|EV_RECEIPT|EV_CLEAR; + ev[0].flags = EV_ADD|EV_CLEAR; ev[0].fflags = 0; ev[0].data = 0; - ev[0].udata = (byte*)pd; + ev[0].udata = (kevent_udata)pd; ev[1] = ev[0]; ev[1].filter = EVFILT_WRITE; - n = runtime_kevent(kq, ev, 2, ev, 2, nil); + n = runtime_kevent(kq, ev, 2, nil, 0, nil); if(n < 0) return -n; - if(n != 2 || - (ev[0].flags&EV_ERROR) == 0 || ev[0].ident != fd || ev[0].filter != EVFILT_READ || - (ev[1].flags&EV_ERROR) == 0 || ev[1].ident != fd || ev[1].filter != EVFILT_WRITE) - return EFAULT; // just to mark out from other errors - if(ev[0].data != 0) - return ev[0].data; - if(ev[1].data != 0) - return ev[1].data; return 0; } int32 -runtime_netpollclose(int32 fd) +runtime_netpollclose(uintptr fd) { // Don't need to unregister because calling close() // on fd will remove any kevents that reference the descriptor. @@ -74,7 +67,7 @@ runtime_netpoll(bool block) static int32 lasterr; Kevent events[64], *ev; Timespec ts, *tp; - int32 n, i; + int32 n, i, mode; G *gp; if(kq == -1) @@ -97,10 +90,13 @@ retry: } for(i = 0; i < n; i++) { ev = &events[i]; + mode = 0; if(ev->filter == EVFILT_READ) - runtime_netpollready(&gp, (PollDesc*)ev->udata, 'r'); + mode += 'r'; if(ev->filter == EVFILT_WRITE) - runtime_netpollready(&gp, (PollDesc*)ev->udata, 'w'); + mode += 'w'; + if(mode) + runtime_netpollready(&gp, (PollDesc*)ev->udata, mode); } if(block && gp == nil) goto retry; diff --git a/libgo/runtime/netpoll_stub.c b/libgo/runtime/netpoll_stub.c index e28e38e2643..84eef754c8d 100644 --- a/libgo/runtime/netpoll_stub.c +++ b/libgo/runtime/netpoll_stub.c @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-// +build freebsd netbsd openbsd plan9 windows +// +build plan9 #include "runtime.h" diff --git a/libgo/runtime/panic.c b/libgo/runtime/panic.c index 7d79256cf41..7a8d95b1ade 100644 --- a/libgo/runtime/panic.c +++ b/libgo/runtime/panic.c @@ -38,7 +38,7 @@ runtime_startpanic(void) M *m; m = runtime_m(); - if(runtime_mheap == 0 || runtime_mheap->cachealloc.size == 0) { // very early + if(runtime_mheap.cachealloc.size == 0) { // very early runtime_printf("runtime: panic before malloc heap initialized\n"); m->mallocing = 1; // tell rest of panic not to try to malloc } else if(m->mcache == nil) // can happen if called from signal handler or throw @@ -48,8 +48,13 @@ runtime_startpanic(void) runtime_exit(3); } m->dying = 1; + if(runtime_g() != nil) + runtime_g()->writebuf = nil; runtime_xadd(&runtime_panicking, 1); runtime_lock(&paniclk); + if(runtime_debug.schedtrace > 0 || runtime_debug.scheddetail > 0) + runtime_schedtrace(true); + runtime_freezetheworld(); } void @@ -58,18 +63,22 @@ runtime_dopanic(int32 unused __attribute__ ((unused))) G *g; static bool didothers; bool crash; + int32 t; g = runtime_g(); if(g->sig != 0) runtime_printf("[signal %x code=%p addr=%p]\n", g->sig, (void*)g->sigcode0, (void*)g->sigcode1); - if(runtime_gotraceback(&crash)){ + if((t = runtime_gotraceback(&crash)) > 0){ if(g != runtime_m()->g0) { runtime_printf("\n"); runtime_goroutineheader(g); runtime_traceback(); - runtime_goroutinetrailer(g); + runtime_printcreatedby(g); + } else if(t >= 2 || runtime_m()->throwing > 0) { + runtime_printf("\nruntime stack:\n"); + runtime_traceback(); } if(!didothers) { didothers = true; @@ -113,11 +122,15 @@ runtime_panicstring(const char *s) { Eface err; + if(runtime_m()->mallocing) { + runtime_printf("panic: %s\n", s); + runtime_throw("panic during malloc"); + } if(runtime_m()->gcing) { runtime_printf("panic: %s\n", s); runtime_throw("panic during gc"); } - runtime_newErrorString(runtime_gostringnocopy((const byte*)s), &err); + runtime_newErrorCString(s, &err); runtime_panic(err); } diff --git a/libgo/runtime/parfor.c b/libgo/runtime/parfor.c index c0e40f5081b..9489d8dc2ec 100644 --- a/libgo/runtime/parfor.c +++ b/libgo/runtime/parfor.c @@ -151,9 +151,9 @@ runtime_parfordo(ParFor *desc) if(victim >= tid) victim++; victimpos = &desc->thr[victim].pos; - pos = runtime_atomicload64(victimpos); for(;;) { // See if it has any work. + pos = runtime_atomicload64(victimpos); begin = (uint32)pos; end = (uint32)(pos>>32); if(begin+1 >= end) { @@ -166,7 +166,7 @@ runtime_parfordo(ParFor *desc) } begin2 = begin + (end-begin)/2; newpos = (uint64)begin | (uint64)begin2<<32; - if(runtime_cas64(victimpos, &pos, newpos)) { + if(runtime_cas64(victimpos, pos, newpos)) { begin = begin2; break; } diff --git a/libgo/runtime/print.c b/libgo/runtime/print.c index f5c6e82840e..766ddbdc499 100644 --- a/libgo/runtime/print.c +++ b/libgo/runtime/print.c @@ -5,6 +5,7 @@ #include <stdarg.h> #include "runtime.h" #include "array.h" +#include "go-type.h" //static Lock debuglock; @@ -13,7 +14,7 @@ static void go_vprintf(const char*, va_list); // write to goroutine-local buffer if diverting output, // or else standard error. 
static void -gwrite(const void *v, int32 n) +gwrite(const void *v, intgo n) { G* g = runtime_g(); @@ -301,8 +302,6 @@ runtime_printpointer(void *p) void runtime_printstring(String v) { - // extern uint32 runtime_maxstring; - // if(v.len > runtime_maxstring) { // gwrite("[string too long]", 17); // return; diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c index 0e77a3e0603..ab7cde43863 100644 --- a/libgo/runtime/proc.c +++ b/libgo/runtime/proc.c @@ -231,8 +231,8 @@ kickoff(void) } // Switch context to a different goroutine. This is like longjmp. -static void runtime_gogo(G*) __attribute__ ((noinline)); -static void +void runtime_gogo(G*) __attribute__ ((noinline)); +void runtime_gogo(G* newg) { #ifdef USING_SPLIT_STACK @@ -249,8 +249,8 @@ runtime_gogo(G* newg) // setjmp. Because getcontext always returns 0, unlike setjmp, we use // g->fromgogo as a code. It will be true if we got here via // setcontext. g == nil the first time this is called in a new m. -static void runtime_mcall(void (*)(G*)) __attribute__ ((noinline)); -static void +void runtime_mcall(void (*)(G*)) __attribute__ ((noinline)); +void runtime_mcall(void (*pfn)(G*)) { M *mp; @@ -365,8 +365,9 @@ struct Sched { uint64 goidgen; M* midle; // idle m's waiting for work int32 nmidle; // number of idle m's waiting for work - int32 mlocked; // number of locked m's waiting for work + int32 nmidlelocked; // number of locked m's waiting for work int32 mcount; // number of m's that have been created + int32 maxmcount; // maximum number of m's allowed (or die) P* pidle; // idle P's uint32 npidle; @@ -381,6 +382,7 @@ struct Sched { Lock gflock; G* gfree; + uint32 gcwaiting; // gc is waiting to run int32 stopwait; Note stopnote; uint32 sysmonwait; @@ -396,10 +398,8 @@ enum { MaxGomaxprocs = 1<<8 }; Sched runtime_sched; int32 runtime_gomaxprocs; -bool runtime_singleproc; -bool runtime_iscgo = true; uint32 runtime_needextram = 1; -uint32 runtime_gcwaiting; +bool runtime_iscgo = true; M runtime_m0; G runtime_g0; // idle goroutine for m0 G* runtime_allg; @@ -409,6 +409,7 @@ P** runtime_allp; M* runtime_extram; int8* runtime_goos; int32 runtime_ncpu; +bool runtime_precisestack; static int32 newprocs; void* runtime_mstart(void*); @@ -431,21 +432,22 @@ static void wakep(void); static void stoplockedm(void); static void startlockedm(G*); static void sysmon(void); -static uint32 retake(uint32*); -static void inclocked(int32); +static uint32 retake(int64); +static void incidlelocked(int32); static void checkdead(void); static void exitsyscall0(G*); static void park0(G*); -static void gosched0(G*); static void goexit0(G*); static void gfput(P*, G*); static G* gfget(P*); static void gfpurge(P*); static void globrunqput(G*); -static G* globrunqget(P*); +static G* globrunqget(P*, int32); static P* pidleget(void); static void pidleput(P*); static void injectglist(G*); +static bool preemptall(void); +static bool exitsyscallfast(void); // The bootstrap sequence is: // @@ -460,6 +462,7 @@ runtime_schedinit(void) { int32 n, procs; const byte *p; + Eface i; m = &runtime_m0; g = &runtime_g0; @@ -470,18 +473,22 @@ runtime_schedinit(void) initcontext(); inittlssize(); - m->nomemprof++; + runtime_sched.maxmcount = 10000; + runtime_precisestack = 0; + runtime_mprofinit(); runtime_mallocinit(); mcommoninit(m); + + // Initialize the itable value for newErrorCString, + // so that the next time it gets called, possibly + // in a fault during a garbage collection, it will not + // need to allocated memory. 
+ runtime_newErrorCString(0, &i); runtime_goargs(); runtime_goenvs(); - - // For debugging: - // Allocate internal symbol table representation now, - // so that we don't need to call malloc when we crash. - // runtime_findfunc(0); + runtime_parsedebugvars(); runtime_sched.lastpoll = runtime_nanotime(); procs = 1; @@ -496,16 +503,26 @@ runtime_schedinit(void) // Can not enable GC until all roots are registered. // mstats.enablegc = 1; - m->nomemprof--; + + // if(raceenabled) + // g->racectx = runtime_raceinit(); } extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main"); extern void main_main(void) __asm__ (GOSYM_PREFIX "main.main"); +static void +initDone(void *arg __attribute__ ((unused))) { + runtime_unlockOSThread(); +}; + // The main goroutine. void runtime_main(void* dummy __attribute__((unused))) { + Defer d; + _Bool frame; + newm(sysmon, nil); // Lock the main goroutine onto this, the main OS thread, @@ -515,10 +532,24 @@ runtime_main(void* dummy __attribute__((unused))) // by calling runtime.LockOSThread during initialization // to preserve the lock. runtime_lockOSThread(); + + // Defer unlock so that runtime.Goexit during init does the unlock too. + d.__pfn = initDone; + d.__next = g->defer; + d.__arg = (void*)-1; + d.__panic = g->panic; + d.__retaddr = nil; + d.__frame = &frame; + g->defer = &d; + if(m != &runtime_m0) runtime_throw("runtime_main not on m0"); __go_go(runtime_MHeap_Scavenger, nil); main_init(); + + if(g->defer != &d || d.__pfn != initDone) + runtime_throw("runtime: bad defer entry after init"); + g->defer = d.__next; runtime_unlockOSThread(); // For gccgo we have to wait until after main is initialized @@ -574,7 +605,7 @@ runtime_goroutineheader(G *gp) } void -runtime_goroutinetrailer(G *g) +runtime_printcreatedby(G *g) { if(g != nil && g->gopc != 0 && g->goid != 1) { String fn; @@ -604,8 +635,28 @@ runtime_tracebackothers(G * volatile me) tb.gp = me; traceback = runtime_gotraceback(nil); + + // Show the current goroutine first, if we haven't already. + if((gp = m->curg) != nil && gp != me) { + runtime_printf("\n"); + runtime_goroutineheader(gp); + gp->traceback = &tb; + +#ifdef USING_SPLIT_STACK + __splitstack_getcontext(&me->stack_context[0]); +#endif + getcontext(&me->context); + + if(gp->traceback != nil) { + runtime_gogo(gp); + } + + runtime_printtrace(tb.locbuf, tb.c, false); + runtime_printcreatedby(gp); + } + for(gp = runtime_allg; gp != nil; gp = gp->alllink) { - if(gp == me || gp->status == Gdead) + if(gp == me || gp == m->curg || gp->status == Gdead) continue; if(gp->issystem && traceback < 2) continue; @@ -620,25 +671,38 @@ runtime_tracebackothers(G * volatile me) // This means that if g is running or in a syscall, we // can't reliably print a stack trace. FIXME. 
- if(gp->status == Gsyscall || gp->status == Grunning) { - runtime_printf("no stack trace available\n"); - runtime_goroutinetrailer(gp); - continue; - } - gp->traceback = &tb; + if(gp->status == Grunning) { + runtime_printf("\tgoroutine running on other thread; stack unavailable\n"); + runtime_printcreatedby(gp); + } else if(gp->status == Gsyscall) { + runtime_printf("\tgoroutine in C code; stack unavailable\n"); + runtime_printcreatedby(gp); + } else { + gp->traceback = &tb; #ifdef USING_SPLIT_STACK - __splitstack_getcontext(&me->stack_context[0]); + __splitstack_getcontext(&me->stack_context[0]); #endif - getcontext(&me->context); + getcontext(&me->context); - if(gp->traceback != nil) { - runtime_gogo(gp); + if(gp->traceback != nil) { + runtime_gogo(gp); + } + + runtime_printtrace(tb.locbuf, tb.c, false); + runtime_printcreatedby(gp); } + } +} - runtime_printtrace(tb.locbuf, tb.c, false); - runtime_goroutinetrailer(gp); +static void +checkmcount(void) +{ + // sched lock is held + if(runtime_sched.mcount > runtime_sched.maxmcount) { + runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched.maxmcount); + runtime_throw("thread exhaustion"); } } @@ -669,7 +733,7 @@ mcommoninit(M *mp) runtime_lock(&runtime_sched); mp->id = runtime_sched.mcount++; - + checkmcount(); runtime_mpreinit(mp); // Add to runtime_allm so garbage collector doesn't free m @@ -686,6 +750,7 @@ void runtime_ready(G *gp) { // Mark runnable. + m->locks++; // disable preemption because it can be holding p in a local var if(gp->status != Gwaiting) { runtime_printf("goroutine %D has status %d\n", gp->goid, gp->status); runtime_throw("bad g->status in ready"); @@ -694,6 +759,7 @@ runtime_ready(G *gp) runqput(m->p, gp); if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0) // TODO: fast atomic wakep(); + m->locks--; } int32 @@ -753,6 +819,34 @@ runtime_helpgc(int32 nproc) runtime_unlock(&runtime_sched); } +// Similar to stoptheworld but best-effort and can be called several times. +// There is no reverse operation, used during crashing. +// This function must not lock any mutexes. 
+void +runtime_freezetheworld(void) +{ + int32 i; + + if(runtime_gomaxprocs == 1) + return; + // stopwait and preemption requests can be lost + // due to races with concurrently executing threads, + // so try several times + for(i = 0; i < 5; i++) { + // this should tell the scheduler to not start any new goroutines + runtime_sched.stopwait = 0x7fffffff; + runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1); + // this should stop running goroutines + if(!preemptall()) + break; // no running goroutines + runtime_usleep(1000); + } + // to be sure + runtime_usleep(1000); + preemptall(); + runtime_usleep(1000); +} + void runtime_stoptheworld(void) { @@ -763,7 +857,8 @@ runtime_stoptheworld(void) runtime_lock(&runtime_sched); runtime_sched.stopwait = runtime_gomaxprocs; - runtime_atomicstore((uint32*)&runtime_gcwaiting, 1); + runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1); + preemptall(); // stop current P m->p->status = Pgcstop; runtime_sched.stopwait--; @@ -782,7 +877,7 @@ runtime_stoptheworld(void) wait = runtime_sched.stopwait > 0; runtime_unlock(&runtime_sched); - // wait for remaining P's to stop voluntary + // wait for remaining P's to stop voluntarily if(wait) { runtime_notesleep(&runtime_sched.stopnote); runtime_noteclear(&runtime_sched.stopnote); @@ -810,6 +905,7 @@ runtime_starttheworld(void) G *gp; bool add; + m->locks++; // disable preemption because it can be holding p in a local var gp = runtime_netpoll(false); // non-blocking injectglist(gp); add = needaddgcproc(); @@ -819,7 +915,7 @@ runtime_starttheworld(void) newprocs = 0; } else procresize(runtime_gomaxprocs); - runtime_gcwaiting = 0; + runtime_sched.gcwaiting = 0; p1 = nil; while((p = pidleget()) != nil) { @@ -829,16 +925,9 @@ runtime_starttheworld(void) pidleput(p); break; } - mp = mget(); - if(mp == nil) { - p->link = p1; - p1 = p; - continue; - } - if(mp->nextp) - runtime_throw("starttheworld: inconsistent mp->nextp"); - mp->nextp = p; - runtime_notewakeup(&mp->park); + p->m = mget(); + p->link = p1; + p1 = p; } if(runtime_sched.sysmonwait) { runtime_sched.sysmonwait = false; @@ -849,8 +938,18 @@ runtime_starttheworld(void) while(p1) { p = p1; p1 = p1->link; - add = false; - newm(nil, p); + if(p->m) { + mp = p->m; + p->m = nil; + if(mp->nextp) + runtime_throw("starttheworld: inconsistent mp->nextp"); + mp->nextp = p; + runtime_notewakeup(&mp->park); + } else { + // Start M to run P. Do not start another M below. + newm(nil, p); + add = false; + } } if(add) { @@ -863,6 +962,7 @@ runtime_starttheworld(void) // the maximum number of procs. newm(mhelpgc, nil); } + m->locks--; } // Called to start an M. @@ -909,11 +1009,8 @@ runtime_mstart(void* mp) // Install signal handlers; after minit so that minit can // prepare the thread to be able to handle the signals. - if(m == &runtime_m0) { + if(m == &runtime_m0) runtime_initsig(); - if(runtime_iscgo) - runtime_newextram(); - } if(m->mstartfn) m->mstartfn(); @@ -1015,6 +1112,14 @@ runtime_needm(void) { M *mp; + if(runtime_needextram) { + // Can happen if C/C++ code calls Go from a global ctor. + // Can not throw, because scheduler is not initialized yet. + runtime_write(2, "fatal error: cgo callback before cgo call\n", + sizeof("fatal error: cgo callback before cgo call\n")-1); + runtime_exit(1); + } + // Lock extra list, take head, unlock popped list. 
// nilokay=false is safe here because of the invariant above, // that the extra list always contains or will soon contain @@ -1090,6 +1195,7 @@ runtime_newextram(void) mp->locked = LockInternal; mp->lockedg = gp; gp->lockedm = mp; + gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1); // put on allg for garbage collector runtime_lock(&runtime_sched); if(runtime_lastg == nil) @@ -1325,7 +1431,7 @@ handoffp(P *p) return; } runtime_lock(&runtime_sched); - if(runtime_gcwaiting) { + if(runtime_sched.gcwaiting) { p->status = Pgcstop; if(--runtime_sched.stopwait == 0) runtime_notewakeup(&runtime_sched.stopnote); @@ -1373,7 +1479,7 @@ stoplockedm(void) p = releasep(); handoffp(p); } - inclocked(1); + incidlelocked(1); // Wait until another thread schedules lockedg again. runtime_notesleep(&m->park); runtime_noteclear(&m->park); @@ -1396,7 +1502,7 @@ startlockedm(G *gp) if(mp->nextp) runtime_throw("startlockedm: m has p"); // directly handoff current P to the locked m - inclocked(-1); + incidlelocked(-1); p = releasep(); mp->nextp = p; runtime_notewakeup(&mp->park); @@ -1410,7 +1516,7 @@ gcstopm(void) { P *p; - if(!runtime_gcwaiting) + if(!runtime_sched.gcwaiting) runtime_throw("gcstopm: not waiting for gc"); if(m->spinning) { m->spinning = false; @@ -1437,7 +1543,7 @@ execute(G *gp) runtime_throw("execute: bad g status"); } gp->status = Grunning; - m->p->tick++; + m->p->schedtick++; m->curg = gp; gp->m = m; @@ -1459,7 +1565,7 @@ findrunnable(void) int32 i; top: - if(runtime_gcwaiting) { + if(runtime_sched.gcwaiting) { gcstopm(); goto top; } @@ -1470,7 +1576,7 @@ top: // global runq if(runtime_sched.runqsize) { runtime_lock(&runtime_sched); - gp = globrunqget(m->p); + gp = globrunqget(m->p, 0); runtime_unlock(&runtime_sched); if(gp) return gp; @@ -1493,7 +1599,7 @@ top: } // random steal from other P's for(i = 0; i < 2*runtime_gomaxprocs; i++) { - if(runtime_gcwaiting) + if(runtime_sched.gcwaiting) goto top; p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs]; if(p == m->p) @@ -1506,12 +1612,12 @@ top: stop: // return P and block runtime_lock(&runtime_sched); - if(runtime_gcwaiting) { + if(runtime_sched.gcwaiting) { runtime_unlock(&runtime_sched); goto top; } if(runtime_sched.runqsize) { - gp = globrunqget(m->p); + gp = globrunqget(m->p, 0); runtime_unlock(&runtime_sched); return gp; } @@ -1561,6 +1667,25 @@ stop: goto top; } +static void +resetspinning(void) +{ + int32 nmspinning; + + if(m->spinning) { + m->spinning = false; + nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1); + if(nmspinning < 0) + runtime_throw("findrunnable: negative nmspinning"); + } else + nmspinning = runtime_atomicload(&runtime_sched.nmspinning); + + // M wakeup policy is deliberately somewhat conservative (see nmspinning handling), + // so see if we need to wakeup another P here. + if (nmspinning == 0 && runtime_atomicload(&runtime_sched.npidle) > 0) + wakep(); +} + // Injects the list of runnable G's into the scheduler. // Can run concurrently with GC. static void @@ -1590,33 +1715,44 @@ static void schedule(void) { G *gp; + uint32 tick; if(m->locks) runtime_throw("schedule: holding locks"); top: - if(runtime_gcwaiting) { + if(runtime_sched.gcwaiting) { gcstopm(); goto top; } - gp = runqget(m->p); - if(gp == nil) - gp = findrunnable(); - - if(m->spinning) { - m->spinning = false; - runtime_xadd(&runtime_sched.nmspinning, -1); + gp = nil; + // Check the global runnable queue once in a while to ensure fairness. 
+ // Otherwise two goroutines can completely occupy the local runqueue + // by constantly respawning each other. + tick = m->p->schedtick; + // This is a fancy way to say tick%61==0, + // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors. + if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) { + runtime_lock(&runtime_sched); + gp = globrunqget(m->p, 1); + runtime_unlock(&runtime_sched); + if(gp) + resetspinning(); + } + if(gp == nil) { + gp = runqget(m->p); + if(gp && m->spinning) + runtime_throw("schedule: spinning with local work"); + } + if(gp == nil) { + gp = findrunnable(); // blocks until work is available + resetspinning(); } - - // M wakeup policy is deliberately somewhat conservative (see nmspinning handling), - // so see if we need to wakeup another M here. - if (m->p->runqhead != m->p->runqtail && - runtime_atomicload(&runtime_sched.nmspinning) == 0 && - runtime_atomicload(&runtime_sched.npidle) > 0) // TODO: fast atomic - wakep(); if(gp->lockedm) { + // Hands off own p to the locked m, + // then blocks waiting for a new p. startlockedm(gp); goto top; } @@ -1658,12 +1794,12 @@ park0(G *gp) void runtime_gosched(void) { - runtime_mcall(gosched0); + runtime_mcall(runtime_gosched0); } // runtime_gosched continuation on g0. -static void -gosched0(G *gp) +void +runtime_gosched0(G *gp) { gp->status = Grunnable; gp->m = nil; @@ -1679,6 +1815,9 @@ gosched0(G *gp) } // Finishes execution of the current goroutine. +// Need to mark it as nosplit, because it runs with sp > stackbase (as runtime_lessstack). +// Since it does not return it does not matter. But if it is preempted +// at the split stack check, GC will complain about inconsistent sp. void runtime_goexit(void) { @@ -1698,7 +1837,7 @@ goexit0(G *gp) m->curg = nil; m->lockedg = nil; if(m->locked & ~LockExternal) { - runtime_printf("invalid m->locked = %d", m->locked); + runtime_printf("invalid m->locked = %d\n", m->locked); runtime_throw("internal lockOSThread error"); } m->locked = 0; @@ -1720,10 +1859,11 @@ void runtime_entersyscall(void) __attribute__ ((no_split_stack)); void runtime_entersyscall() { - if(m->profilehz > 0) - runtime_setprof(false); + // Disable preemption because during this function g is in Gsyscall status, + // but can have inconsistent g->sched, do not let GC observe it. + m->locks++; - // Leave SP around for gc and traceback. + // Leave SP around for GC and traceback. #ifdef USING_SPLIT_STACK g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size, &g->gcnext_segment, &g->gcnext_sp, @@ -1752,10 +1892,9 @@ runtime_entersyscall() } m->mcache = nil; - m->p->tick++; m->p->m = nil; runtime_atomicstore(&m->p->status, Psyscall); - if(runtime_gcwaiting) { + if(runtime_sched.gcwaiting) { runtime_lock(&runtime_sched); if (runtime_sched.stopwait > 0 && runtime_cas(&m->p->status, Psyscall, Pgcstop)) { if(--runtime_sched.stopwait == 0) @@ -1763,6 +1902,8 @@ runtime_entersyscall() } runtime_unlock(&runtime_sched); } + + m->locks--; } // The same as runtime_entersyscall(), but with a hint that the syscall is blocking. @@ -1771,10 +1912,9 @@ runtime_entersyscallblock(void) { P *p; - if(m->profilehz > 0) - runtime_setprof(false); + m->locks++; // see comment in entersyscall - // Leave SP around for gc and traceback. + // Leave SP around for GC and traceback. 
#ifdef USING_SPLIT_STACK g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size, &g->gcnext_segment, &g->gcnext_sp, @@ -1792,7 +1932,9 @@ runtime_entersyscallblock(void) p = releasep(); handoffp(p); if(g->isbackground) // do not consider blocked scavenger for deadlock detection - inclocked(1); + incidlelocked(1); + + m->locks--; } // The goroutine g exited its system call. @@ -1803,19 +1945,16 @@ void runtime_exitsyscall(void) { G *gp; - P *p; - // Check whether the profiler needs to be turned on. - if(m->profilehz > 0) - runtime_setprof(true); + m->locks++; // see comment in entersyscall gp = g; - // Try to re-acquire the last P. - if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) { + if(gp->isbackground) // do not consider blocked scavenger for deadlock detection + incidlelocked(-1); + + if(exitsyscallfast()) { // There's a cpu for us, so we can run. - m->mcache = m->p->mcache; - m->p->m = m; - m->p->tick++; + m->p->syscalltick++; gp->status = Grunning; // Garbage collector isn't running (since we are), // so okay to clear gcstack and gcsp. @@ -1824,27 +1963,11 @@ runtime_exitsyscall(void) #endif gp->gcnext_sp = nil; runtime_memclr(&gp->gcregs, sizeof gp->gcregs); + m->locks--; return; } - if(gp->isbackground) // do not consider blocked scavenger for deadlock detection - inclocked(-1); - // Try to get any other idle P. - m->p = nil; - if(runtime_sched.pidle) { - runtime_lock(&runtime_sched); - p = pidleget(); - runtime_unlock(&runtime_sched); - if(p) { - acquirep(p); -#ifdef USING_SPLIT_STACK - gp->gcstack = nil; -#endif - gp->gcnext_sp = nil; - runtime_memclr(&gp->gcregs, sizeof gp->gcregs); - return; - } - } + m->locks--; // Call the scheduler. runtime_mcall(exitsyscall0); @@ -1860,6 +1983,43 @@ runtime_exitsyscall(void) #endif gp->gcnext_sp = nil; runtime_memclr(&gp->gcregs, sizeof gp->gcregs); + m->p->syscalltick++; +} + +static bool +exitsyscallfast(void) +{ + P *p; + + // Freezetheworld sets stopwait but does not retake P's. + if(runtime_sched.stopwait) { + m->p = nil; + return false; + } + + // Try to re-acquire the last P. + if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) { + // There's a cpu for us, so we can run. + m->mcache = m->p->mcache; + m->p->m = m; + return true; + } + // Try to get any other idle P. + m->p = nil; + if(runtime_sched.pidle) { + runtime_lock(&runtime_sched); + p = pidleget(); + if(p && runtime_atomicload(&runtime_sched.sysmonwait)) { + runtime_atomicstore(&runtime_sched.sysmonwait, 0); + runtime_notewakeup(&runtime_sched.sysmonnote); + } + runtime_unlock(&runtime_sched); + if(p) { + acquirep(p); + return true; + } + } + return false; } // runtime_exitsyscall slow path on g0. @@ -1876,6 +2036,10 @@ exitsyscall0(G *gp) p = pidleget(); if(p == nil) globrunqput(gp); + else if(runtime_atomicload(&runtime_sched.sysmonwait)) { + runtime_atomicstore(&runtime_sched.sysmonwait, 0); + runtime_notewakeup(&runtime_sched.sysmonnote); + } runtime_unlock(&runtime_sched); if(p) { acquirep(p); @@ -1890,6 +2054,33 @@ exitsyscall0(G *gp) schedule(); // Never returns. } +// Called from syscall package before fork. +void syscall_runtime_BeforeFork(void) + __asm__(GOSYM_PREFIX "syscall.runtime_BeforeFork"); +void +syscall_runtime_BeforeFork(void) +{ + // Fork can hang if preempted with signals frequently enough (see issue 5517). + // Ensure that we stay on the same M where we disable profiling. 
+ m->locks++; + if(m->profilehz != 0) + runtime_resetcpuprofiler(0); +} + +// Called from syscall package after fork in parent. +void syscall_runtime_AfterFork(void) + __asm__(GOSYM_PREFIX "syscall.runtime_AfterFork"); +void +syscall_runtime_AfterFork(void) +{ + int32 hz; + + hz = runtime_sched.profilehz; + if(hz != 0) + runtime_resetcpuprofiler(hz); + m->locks--; +} + // Allocate a new g, with a stack big enough for stacksize bytes. G* runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize) @@ -1919,9 +2110,16 @@ runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize) /* For runtime package testing. */ + +// Create a new g running fn with siz bytes of arguments. +// Put it on the queue of g's waiting to run. +// The compiler turns a go statement into a call to this. +// Cannot split the stack because it assumes that the arguments +// are available sequentially after &fn; they would not be +// copied if a stack split occurred. It's OK for this to call +// functions that split the stack. void runtime_testing_entersyscall(void) __asm__ (GOSYM_PREFIX "runtime.entersyscall"); - void runtime_testing_entersyscall() { @@ -1944,6 +2142,7 @@ __go_go(void (*fn)(void*), void* arg) size_t spsize; G *newg; +//runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret); m->locks++; // disable preemption because it can be holding p in a local var if((newg = gfget(m->p)) != nil) { @@ -2099,7 +2298,7 @@ runtime_gomaxprocsfunc(int32 n) } runtime_unlock(&runtime_sched); - runtime_semacquire(&runtime_worldsema); + runtime_semacquire(&runtime_worldsema, false); m->gcing = 1; runtime_stoptheworld(); newprocs = n; @@ -2110,8 +2309,11 @@ runtime_gomaxprocsfunc(int32 n) return ret; } +// lockOSThread is called by runtime.LockOSThread and runtime.lockOSThread below +// after they modify m->locked. Do not allow preemption during this call, +// or else the m might be different in this function than in the caller. static void -LockOSThread(void) +lockOSThread(void) { m->lockedg = g; g->lockedm = m; @@ -2122,18 +2324,22 @@ void runtime_LockOSThread(void) { m->locked |= LockExternal; - LockOSThread(); + lockOSThread(); } void runtime_lockOSThread(void) { m->locked += LockInternal; - LockOSThread(); + lockOSThread(); } + +// unlockOSThread is called by runtime.UnlockOSThread and runtime.unlockOSThread below +// after they update m->locked. Do not allow preemption during this call, +// or else the m might be in different in this function than in the caller. static void -UnlockOSThread(void) +unlockOSThread(void) { if(m->locked != 0) return; @@ -2147,7 +2353,7 @@ void runtime_UnlockOSThread(void) { m->locked &= ~LockExternal; - UnlockOSThread(); + unlockOSThread(); } void @@ -2156,7 +2362,7 @@ runtime_unlockOSThread(void) if(m->locked < LockInternal) runtime_throw("runtime: internal error: misuse of lockOSThread/unlockOSThread"); m->locked -= LockInternal; - UnlockOSThread(); + unlockOSThread(); } bool @@ -2176,13 +2382,6 @@ runtime_golockedOSThread(void) return runtime_lockedOSThread(); } -// for testing of wire, unwire -uint32 -runtime_mid() -{ - return m->id; -} - intgo runtime_NumGoroutine (void) __asm__ (GOSYM_PREFIX "runtime.NumGoroutine"); @@ -2227,28 +2426,42 @@ static struct { Location locbuf[100]; } prof; +static void +System(void) +{ +} + // Called if we receive a SIGPROF signal. void runtime_sigprof() { int32 n, i; + bool traceback; - // Windows does profiling in a dedicated thread w/o m. 
- if(!Windows && (m == nil || m->mcache == nil)) - return; if(prof.fn == nil || prof.hz == 0) return; - + traceback = true; + // Windows does profiling in a dedicated thread w/o m. + if(!Windows && (m == nil || m->mcache == nil)) + traceback = false; + runtime_lock(&prof); if(prof.fn == nil) { runtime_unlock(&prof); return; } - n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf)); - for(i = 0; i < n; i++) - prof.pcbuf[i] = prof.locbuf[i].pc; - if(n > 0) - prof.fn(prof.pcbuf, n); + n = 0; + if(traceback) { + n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf)); + for(i = 0; i < n; i++) + prof.pcbuf[i] = prof.locbuf[i].pc; + } + if (!traceback || n <= 0) { + n = 2; + prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n); + prof.pcbuf[1] = (uintptr)System + 1; + } + prof.fn(prof.pcbuf, n); runtime_unlock(&prof); } @@ -2264,7 +2477,11 @@ runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) if(fn == nil) hz = 0; - // Stop profiler on this cpu so that it is safe to lock prof. + // Disable preemption, otherwise we can be rescheduled to another thread + // that has profiling enabled. + m->locks++; + + // Stop profiler on this thread so that it is safe to lock prof. // if a profiling signal came in while we had prof locked, // it would deadlock. runtime_resetcpuprofiler(0); @@ -2279,6 +2496,8 @@ runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) if(hz != 0) runtime_resetcpuprofiler(hz); + + m->locks--; } // Change number of processors. The world is stopped, sched is locked. @@ -2296,7 +2515,8 @@ procresize(int32 new) for(i = 0; i < new; i++) { p = runtime_allp[i]; if(p == nil) { - p = (P*)runtime_mallocgc(sizeof(*p), 0, 0, 1); + p = (P*)runtime_mallocgc(sizeof(*p), 0, FlagNoInvokeGC); + p->id = i; p->status = Pgcstop; runtime_atomicstorep(&runtime_allp[i], p); } @@ -2308,7 +2528,7 @@ procresize(int32 new) } if(p->runq == nil) { p->runqsize = 128; - p->runq = (G**)runtime_mallocgc(p->runqsize*sizeof(G*), 0, 0, 1); + p->runq = (G**)runtime_mallocgc(p->runqsize*sizeof(G*), 0, FlagNoInvokeGC); } } @@ -2351,7 +2571,6 @@ procresize(int32 new) p->status = Pidle; pidleput(p); } - runtime_singleproc = new == 1; runtime_atomicstore((uint32*)&runtime_gomaxprocs, new); } @@ -2393,10 +2612,10 @@ releasep(void) } static void -inclocked(int32 v) +incidlelocked(int32 v) { runtime_lock(&runtime_sched); - runtime_sched.mlocked += v; + runtime_sched.nmidlelocked += v; if(v > 0) checkdead(); runtime_unlock(&runtime_sched); @@ -2411,12 +2630,12 @@ checkdead(void) int32 run, grunning, s; // -1 for sysmon - run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.mlocked - 1 - countextra(); + run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra(); if(run > 0) return; if(run < 0) { - runtime_printf("checkdead: nmidle=%d mlocked=%d mcount=%d\n", - runtime_sched.nmidle, runtime_sched.mlocked, runtime_sched.mcount); + runtime_printf("checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n", + runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount); runtime_throw("checkdead: inconsistent counts"); } grunning = 0; @@ -2441,10 +2660,10 @@ static void sysmon(void) { uint32 idle, delay; - int64 now, lastpoll; + int64 now, lastpoll, lasttrace; G *gp; - uint32 ticks[MaxGomaxprocs]; + lasttrace = 0; idle = 0; // how many cycles in succession we had not wokeup somebody delay = 0; for(;;) { @@ -2455,9 +2674,10 @@ sysmon(void) if(delay > 10*1000) // up to 10ms delay = 10*1000; runtime_usleep(delay); - if(runtime_gcwaiting || 
runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) { // TODO: fast atomic + if(runtime_debug.schedtrace <= 0 && + (runtime_sched.gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs)) { // TODO: fast atomic runtime_lock(&runtime_sched); - if(runtime_atomicload(&runtime_gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) { + if(runtime_atomicload(&runtime_sched.gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) { runtime_atomicstore(&runtime_sched.sysmonwait, 1); runtime_unlock(&runtime_sched); runtime_notesleep(&runtime_sched.sysmonnote); @@ -2470,53 +2690,198 @@ sysmon(void) // poll network if not polled for more than 10ms lastpoll = runtime_atomicload64(&runtime_sched.lastpoll); now = runtime_nanotime(); - if(lastpoll != 0 && lastpoll + 10*1000*1000 > now) { + if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) { + runtime_cas64(&runtime_sched.lastpoll, lastpoll, now); gp = runtime_netpoll(false); // non-blocking - injectglist(gp); + if(gp) { + // Need to decrement number of idle locked M's + // (pretending that one more is running) before injectglist. + // Otherwise it can lead to the following situation: + // injectglist grabs all P's but before it starts M's to run the P's, + // another M returns from syscall, finishes running its G, + // observes that there is no work to do and no other running M's + // and reports deadlock. + incidlelocked(-1); + injectglist(gp); + incidlelocked(1); + } } // retake P's blocked in syscalls - if(retake(ticks)) + // and preempt long running G's + if(retake(now)) idle = 0; else idle++; + + if(runtime_debug.schedtrace > 0 && lasttrace + runtime_debug.schedtrace*1000000ll <= now) { + lasttrace = now; + runtime_schedtrace(runtime_debug.scheddetail); + } } } +typedef struct Pdesc Pdesc; +struct Pdesc +{ + uint32 schedtick; + int64 schedwhen; + uint32 syscalltick; + int64 syscallwhen; +}; +static Pdesc pdesc[MaxGomaxprocs]; + static uint32 -retake(uint32 *ticks) +retake(int64 now) { uint32 i, s, n; int64 t; P *p; + Pdesc *pd; n = 0; for(i = 0; i < (uint32)runtime_gomaxprocs; i++) { p = runtime_allp[i]; if(p==nil) continue; - t = p->tick; - if(ticks[i] != t) { - ticks[i] = t; - continue; - } + pd = &pdesc[i]; s = p->status; - if(s != Psyscall) - continue; - if(p->runqhead == p->runqtail && runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0) // TODO: fast atomic - continue; - // Need to increment number of locked M's before the CAS. - // Otherwise the M from which we retake can exit the syscall, - // increment nmidle and report deadlock. - inclocked(-1); - if(runtime_cas(&p->status, s, Pidle)) { - n++; - handoffp(p); + if(s == Psyscall) { + // Retake P from syscall if it's there for more than 1 sysmon tick (20us). + // But only if there is other work to do. + t = p->syscalltick; + if(pd->syscalltick != t) { + pd->syscalltick = t; + pd->syscallwhen = now; + continue; + } + if(p->runqhead == p->runqtail && + runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0) + continue; + // Need to decrement number of idle locked M's + // (pretending that one more is running) before the CAS. + // Otherwise the M from which we retake can exit the syscall, + // increment nmidle and report deadlock. 
+ incidlelocked(-1); + if(runtime_cas(&p->status, s, Pidle)) { + n++; + handoffp(p); + } + incidlelocked(1); + } else if(s == Prunning) { + // Preempt G if it's running for more than 10ms. + t = p->schedtick; + if(pd->schedtick != t) { + pd->schedtick = t; + pd->schedwhen = now; + continue; + } + if(pd->schedwhen + 10*1000*1000 > now) + continue; + // preemptone(p); } - inclocked(1); } return n; } +// Tell all goroutines that they have been preempted and they should stop. +// This function is purely best-effort. It can fail to inform a goroutine if a +// processor just started running it. +// No locks need to be held. +// Returns true if preemption request was issued to at least one goroutine. +static bool +preemptall(void) +{ + return false; +} + +void +runtime_schedtrace(bool detailed) +{ + static int64 starttime; + int64 now; + int64 id1, id2, id3; + int32 i, q, t, h, s; + const char *fmt; + M *mp, *lockedm; + G *gp, *lockedg; + P *p; + + now = runtime_nanotime(); + if(starttime == 0) + starttime = now; + + runtime_lock(&runtime_sched); + runtime_printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d idlethreads=%d runqueue=%d", + (now-starttime)/1000000, runtime_gomaxprocs, runtime_sched.npidle, runtime_sched.mcount, + runtime_sched.nmidle, runtime_sched.runqsize); + if(detailed) { + runtime_printf(" gcwaiting=%d nmidlelocked=%d nmspinning=%d stopwait=%d sysmonwait=%d\n", + runtime_sched.gcwaiting, runtime_sched.nmidlelocked, runtime_sched.nmspinning, + runtime_sched.stopwait, runtime_sched.sysmonwait); + } + // We must be careful while reading data from P's, M's and G's. + // Even if we hold schedlock, most data can be changed concurrently. + // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. + for(i = 0; i < runtime_gomaxprocs; i++) { + p = runtime_allp[i]; + if(p == nil) + continue; + mp = p->m; + t = p->runqtail; + h = p->runqhead; + s = p->runqsize; + q = t - h; + if(q < 0) + q += s; + if(detailed) + runtime_printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d/%d gfreecnt=%d\n", + i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, q, s, p->gfreecnt); + else { + // In non-detailed mode format lengths of per-P run queues as: + // [len1 len2 len3 len4] + fmt = " %d"; + if(runtime_gomaxprocs == 1) + fmt = " [%d]\n"; + else if(i == 0) + fmt = " [%d"; + else if(i == runtime_gomaxprocs-1) + fmt = " %d]\n"; + runtime_printf(fmt, q); + } + } + if(!detailed) { + runtime_unlock(&runtime_sched); + return; + } + for(mp = runtime_allm; mp; mp = mp->alllink) { + p = mp->p; + gp = mp->curg; + lockedg = mp->lockedg; + id1 = -1; + if(p) + id1 = p->id; + id2 = -1; + if(gp) + id2 = gp->goid; + id3 = -1; + if(lockedg) + id3 = lockedg->goid; + runtime_printf(" M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d" + " locks=%d dying=%d helpgc=%d spinning=%d lockedg=%D\n", + mp->id, id1, id2, + mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc, + mp->spinning, id3); + } + for(gp = runtime_allg; gp; gp = gp->alllink) { + mp = gp->m; + lockedm = gp->lockedm; + runtime_printf(" G%D: status=%d(%s) m=%d lockedm=%d\n", + gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1, + lockedm ? lockedm->id : -1); + } + runtime_unlock(&runtime_sched); +} + // Put mp on midle list. // Sched must be locked. static void @@ -2559,7 +2924,7 @@ globrunqput(G *gp) // Try get a batch of G's from the global runnable queue. // Sched must be locked. 
static G* -globrunqget(P *p) +globrunqget(P *p, int32 max) { G *gp, *gp1; int32 n; @@ -2569,6 +2934,8 @@ globrunqget(P *p) n = runtime_sched.runqsize/runtime_gomaxprocs+1; if(n > runtime_sched.runqsize) n = runtime_sched.runqsize; + if(max > 0 && n > max) + n = max; runtime_sched.runqsize -= n; if(runtime_sched.runqsize == 0) runtime_sched.runqtail = nil; @@ -2827,6 +3194,22 @@ runtime_testSchedLocalQueueSteal(void) } } +intgo runtime_debug_setMaxThreads(intgo) + __asm__(GOSYM_PREFIX "runtime_debug.setMaxThreads"); + +intgo +runtime_debug_setMaxThreads(intgo in) +{ + intgo out; + + runtime_lock(&runtime_sched); + out = runtime_sched.maxmcount; + runtime_sched.maxmcount = in; + checkmcount(); + runtime_unlock(&runtime_sched); + return out; +} + void runtime_proc_scan(void (*addroot)(Obj)) { @@ -2852,3 +3235,11 @@ __go_get_closure(void) { return g->closure; } + +// Return whether we are waiting for a GC. This gc toolchain uses +// preemption instead. +bool +runtime_gcwaiting(void) +{ + return runtime_sched.gcwaiting; +} diff --git a/libgo/runtime/race.h b/libgo/runtime/race.h index 3357bed312d..884245cedad 100644 --- a/libgo/runtime/race.h +++ b/libgo/runtime/race.h @@ -16,14 +16,14 @@ uintptr runtime_raceinit(void); void runtime_racefini(void); void runtime_racemapshadow(void *addr, uintptr size); -void runtime_racemalloc(void *p, uintptr sz, void *pc); +void runtime_racemalloc(void *p, uintptr sz); void runtime_racefree(void *p); uintptr runtime_racegostart(void *pc); void runtime_racegoend(void); void runtime_racewritepc(void *addr, void *callpc, void *pc); void runtime_racereadpc(void *addr, void *callpc, void *pc); -void runtime_racewriterangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc); -void runtime_racereadrangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc); +void runtime_racewriterangepc(void *addr, uintptr sz, void *callpc, void *pc); +void runtime_racereadrangepc(void *addr, uintptr sz, void *callpc, void *pc); void runtime_racefingo(void); void runtime_raceacquire(void *addr); void runtime_raceacquireg(G *gp, void *addr); diff --git a/libgo/runtime/runtime.c b/libgo/runtime/runtime.c index 1ff6d00e299..56fc045eac8 100644 --- a/libgo/runtime/runtime.c +++ b/libgo/runtime/runtime.c @@ -124,11 +124,12 @@ TestAtomic64(void) z64 = 42; x64 = 0; PREFETCH(&z64); - if(runtime_cas64(&z64, &x64, 1)) + if(runtime_cas64(&z64, x64, 1)) runtime_throw("cas64 failed"); - if(x64 != 42) + if(x64 != 0) runtime_throw("cas64 failed"); - if(!runtime_cas64(&z64, &x64, 1)) + x64 = 42; + if(!runtime_cas64(&z64, x64, 1)) runtime_throw("cas64 failed"); if(x64 != 42 || z64 != 1) runtime_throw("cas64 failed"); @@ -279,3 +280,79 @@ runtime_signalstack(byte *p, int32 n) if(sigaltstack(&st, nil) < 0) *(int *)0xf1 = 0xf1; } + +DebugVars runtime_debug; + +static struct { + const char* name; + int32* value; +} dbgvar[] = { + {"gctrace", &runtime_debug.gctrace}, + {"schedtrace", &runtime_debug.schedtrace}, + {"scheddetail", &runtime_debug.scheddetail}, +}; + +void +runtime_parsedebugvars(void) +{ + const byte *p; + intgo i, n; + + p = runtime_getenv("GODEBUG"); + if(p == nil) + return; + for(;;) { + for(i=0; i<(intgo)nelem(dbgvar); i++) { + n = runtime_findnull((const byte*)dbgvar[i].name); + if(runtime_mcmp(p, dbgvar[i].name, n) == 0 && p[n] == '=') + *dbgvar[i].value = runtime_atoi(p+n+1); + } + p = (const byte *)runtime_strstr((const char *)p, ","); + if(p == nil) + break; + p++; + } +} + +// Poor mans 64-bit division. 
+// This is a very special function, do not use it if you are not sure what you are doing. +// int64 division is lowered into _divv() call on 386, which does not fit into nosplit functions. +// Handles overflow in a time-specific manner. +int32 +runtime_timediv(int64 v, int32 div, int32 *rem) +{ + int32 res, bit; + + if(v >= (int64)div*0x7fffffffLL) { + if(rem != nil) + *rem = 0; + return 0x7fffffff; + } + res = 0; + for(bit = 30; bit >= 0; bit--) { + if(v >= ((int64)div<<bit)) { + v = v - ((int64)div<<bit); + res += 1<<bit; + } + } + if(rem != nil) + *rem = v; + return res; +} + +// Setting the max stack size doesn't really do anything for gccgo. + +uintptr runtime_maxstacksize = 1<<20; // enough until runtime.main sets it for real + +intgo runtime_debug_setMaxStack(intgo) + __asm__ (GOSYM_PREFIX "runtime_debug.setMaxStack"); + +intgo +runtime_debug_setMaxStack(intgo in) +{ + intgo out; + + out = runtime_maxstacksize; + runtime_maxstacksize = in; + return out; +} diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h index d2e7d4c11bc..e82e83231e6 100644 --- a/libgo/runtime/runtime.h +++ b/libgo/runtime/runtime.h @@ -72,6 +72,7 @@ typedef struct ParFor ParFor; typedef struct ParForThread ParForThread; typedef struct CgoMal CgoMal; typedef struct PollDesc PollDesc; +typedef struct DebugVars DebugVars; typedef struct __go_open_array Slice; typedef struct __go_interface Iface; @@ -82,6 +83,7 @@ typedef struct __go_panic_stack Panic; typedef struct __go_ptr_type PtrType; typedef struct __go_func_type FuncType; +typedef struct __go_interface_type InterfaceType; typedef struct __go_map_type MapType; typedef struct __go_channel_type ChanType; @@ -206,21 +208,20 @@ struct G void* param; // passed parameter on wakeup bool fromgogo; // reached from gogo int16 status; - int64 goid; uint32 selgen; // valid sudog pointer + int64 goid; const char* waitreason; // if status==Gwaiting G* schedlink; bool ispanic; bool issystem; // do not output in stack dump bool isbackground; // ignore in deadlock detector - bool blockingsyscall; // hint that the next syscall will block M* m; // for debuggers, but offset not hard-coded M* lockedm; int32 sig; int32 writenbuf; byte* writebuf; - // DeferChunk *dchunk; - // DeferChunk *dchunknext; + // DeferChunk* dchunk; + // DeferChunk* dchunknext; uintptr sigcode0; uintptr sigcode1; // uintptr sigpc; @@ -243,6 +244,7 @@ struct M size_t gsignalstacksize; void (*mstartfn)(void); G* curg; // current running goroutine + G* caughtsig; // goroutine running during fatal signal P* p; // attached P for executing Go code (nil if not executing Go code) P* nextp; int32 id; @@ -250,11 +252,9 @@ struct M int32 throwing; int32 gcing; int32 locks; - int32 nomemprof; int32 dying; int32 profilehz; int32 helpgc; - bool blockingsyscall; bool spinning; uint32 fastrand; uint64 ncgocall; // number of cgo calls in total @@ -289,10 +289,12 @@ struct P { Lock; - uint32 status; // one of Pidle/Prunning/... + int32 id; + uint32 status; // one of Pidle/Prunning/... P* link; - uint32 tick; // incremented on every scheduler or system call - M* m; // back-link to associated M (nil if idle) + uint32 schedtick; // incremented on every scheduler call + uint32 syscalltick; // incremented on every system call + M* m; // back-link to associated M (nil if idle) MCache* mcache; // Queue of runnable goroutines. 
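Because runtime_timediv only shifts and subtracts, it is safe in nosplit paths where a real int64 division would be lowered into a library call; one consumer is the futexsleep rewrite in thread-linux.c further down. A minimal sketch of that use, with illustrative values:

	struct timespec ts;
	int32 nsec;
	int64 ns = 2500000000LL;                              // 2.5 second timeout

	ts.tv_sec = runtime_timediv(ns, 1000000000, &nsec);   // 2
	ts.tv_nsec = nsec;                                    // 500000000

	// Oversized inputs saturate rather than overflow: once
	// v >= (int64)div * 0x7fffffff, the result is 0x7fffffff and *rem is 0.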
@@ -308,9 +310,13 @@ struct P byte pad[64]; }; -// The m->locked word holds a single bit saying whether -// external calls to LockOSThread are in effect, and then a counter -// of the internal nesting depth of lockOSThread / unlockOSThread. +// The m->locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread. +// The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active. +// External locks are not recursive; a second lock is silently ignored. +// The upper bits of m->lockedcount record the nesting depth of calls to lockOSThread +// (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal). +// Internal locks can be recursive. For instance, a lock for cgo can occur while the main +// goroutine is holding the lock during the initialization phase. enum { LockExternal = 1, @@ -333,19 +339,16 @@ enum SigIgnored = 1<<6, // the signal was ignored before we registered for it }; -#ifndef NSIG -#define NSIG 32 -#endif - -// NOTE(rsc): keep in sync with extern.go:/type.Func. -// Eventually, the loaded symbol table should be closer to this form. +// Layout of in-memory per-function information prepared by linker +// See http://golang.org/s/go12symtab. +// Keep in sync with linker and with ../../libmach/sym.c +// and with package debug/gosym. struct Func { String name; uintptr entry; // entry pc }; - #ifdef GOOS_windows enum { Windows = 1 @@ -372,7 +375,7 @@ struct Timers // If this struct changes, adjust ../time/sleep.go:/runtimeTimer. struct Timer { - int32 i; // heap index + int32 i; // heap index // Timer wakes up at when, and then at when+period, ... (period > 0 only) // each time calling f(now, arg) in the timer goroutine, so f must be @@ -420,6 +423,16 @@ struct CgoMal void *alloc; }; +// Holds variables parsed from GODEBUG env var. 
+struct DebugVars +{ + int32 gctrace; + int32 schedtrace; + int32 scheddetail; +}; + +extern bool runtime_precisestack; + /* * defined macros * you need super-gopher-guru privilege @@ -453,12 +466,11 @@ extern M* runtime_allm; extern P** runtime_allp; extern int32 runtime_gomaxprocs; extern uint32 runtime_needextram; -extern bool runtime_singleproc; extern uint32 runtime_panicking; -extern uint32 runtime_gcwaiting; // gc is waiting to run extern int8* runtime_goos; extern int32 runtime_ncpu; extern void (*runtime_sysargs)(int32, uint8**); +extern DebugVars runtime_debug; /* * common functions and data @@ -466,11 +478,13 @@ extern void (*runtime_sysargs)(int32, uint8**); #define runtime_strcmp(s1, s2) __builtin_strcmp((s1), (s2)) #define runtime_strstr(s1, s2) __builtin_strstr((s1), (s2)) intgo runtime_findnull(const byte*); +intgo runtime_findnullw(const uint16*); void runtime_dump(byte*, int32); /* * very low level c-called */ +void runtime_gogo(G*); struct __go_func_type; void runtime_args(int32, byte**); void runtime_osinit(); @@ -492,14 +506,13 @@ void runtime_sigenable(uint32 sig); void runtime_sigdisable(uint32 sig); int32 runtime_gotraceback(bool *crash); void runtime_goroutineheader(G*); -void runtime_goroutinetrailer(G*); void runtime_printtrace(Location*, int32, bool); #define runtime_open(p, f, m) open((p), (f), (m)) #define runtime_read(d, v, n) read((d), (v), (n)) #define runtime_write(d, v, n) write((d), (v), (n)) #define runtime_close(d) close(d) #define runtime_cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) -#define runtime_cas64(pval, pold, new) __atomic_compare_exchange_n (pval, pold, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) +#define runtime_cas64(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) #define runtime_casp(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) // Don't confuse with XADD x86 instruction, // this one is actually 'addx', that is, add-and-fetch. 
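With the macro switched to __sync_bool_compare_and_swap, runtime_cas64 now has the same value-based contract as runtime_cas and runtime_casp: the old value is passed by value, the return reports whether the swap happened, and nothing is written back through a pointer. A minimal illustration of the calling pattern, mirroring the TestAtomic64 fixup above:

	uint64 x, old;

	x = 42;
	old = runtime_atomicload64(&x);
	// Succeeds: x still holds old (42), so it is swapped to 1.
	if(!runtime_cas64(&x, old, 1))
		runtime_throw("cas64 example: expected success");
	// Fails and leaves x untouched: x is now 1, which no longer matches old.
	if(runtime_cas64(&x, old, 7))
		runtime_throw("cas64 example: expected failure");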
@@ -530,17 +543,21 @@ void runtime_mallocinit(void); void runtime_mprofinit(void); #define runtime_malloc(s) __go_alloc(s) #define runtime_free(p) __go_free(p) -bool runtime_addfinalizer(void*, FuncVal *fn, const struct __go_func_type *); +bool runtime_addfinalizer(void*, FuncVal *fn, const struct __go_func_type *, const struct __go_ptr_type *); #define runtime_getcallersp(p) __builtin_frame_address(1) int32 runtime_mcount(void); int32 runtime_gcount(void); +void runtime_mcall(void(*)(G*)); uint32 runtime_fastrand1(void); +int32 runtime_timediv(int64, int32, int32*); void runtime_setmg(M*, G*); void runtime_newextram(void); #define runtime_exit(s) exit(s) #define runtime_breakpoint() __builtin_trap() void runtime_gosched(void); +void runtime_gosched0(G*); +void runtime_schedtrace(bool); void runtime_park(void(*)(Lock*), Lock*, const char*); void runtime_tsleep(int64, const char*); M* runtime_newm(void); @@ -555,6 +572,8 @@ int32 runtime_callers(int32, Location*, int32); int64 runtime_nanotime(void); void runtime_dopanic(int32) __attribute__ ((noreturn)); void runtime_startpanic(void); +void runtime_freezetheworld(void); +void runtime_unwindstack(G*, byte*); void runtime_sigprof(); void runtime_resetcpuprofiler(int32); void runtime_setcpuprofilerate(void(*)(uintptr*, int32), int32); @@ -567,10 +586,14 @@ void runtime_addtimer(Timer*); bool runtime_deltimer(Timer*); G* runtime_netpoll(bool); void runtime_netpollinit(void); -int32 runtime_netpollopen(int32, PollDesc*); -int32 runtime_netpollclose(int32); +int32 runtime_netpollopen(uintptr, PollDesc*); +int32 runtime_netpollclose(uintptr); void runtime_netpollready(G**, PollDesc*, int32); +uintptr runtime_netpollfd(PollDesc*); void runtime_crash(void); +void runtime_parsedebugvars(void); +void _rt0_go(void); +void* runtime_funcdata(Func*, int32); void runtime_stoptheworld(void); void runtime_starttheworld(void); @@ -603,11 +626,15 @@ void runtime_unlock(Lock*); * wake up early, it must wait to call noteclear until it * can be sure that no other goroutine is calling * notewakeup. + * + * notesleep/notetsleep are generally called on g0, + * notetsleepg is similar to notetsleep but is called on user g. */ void runtime_noteclear(Note*); void runtime_notesleep(Note*); void runtime_notewakeup(Note*); -void runtime_notetsleep(Note*, int64); +bool runtime_notetsleep(Note*, int64); // false - timeout +bool runtime_notetsleepg(Note*, int64); // false - timeout /* * low-level synchronization for implementing the above @@ -698,11 +725,13 @@ void runtime_newTypeAssertionError(const String*, const String*, const String*, __asm__ (GOSYM_PREFIX "runtime.NewTypeAssertionError"); void runtime_newErrorString(String, Eface*) __asm__ (GOSYM_PREFIX "runtime.NewErrorString"); +void runtime_newErrorCString(const char*, Eface*) + __asm__ (GOSYM_PREFIX "runtime.NewErrorCString"); /* * wrapped for go users */ -void runtime_semacquire(uint32 volatile *); +void runtime_semacquire(uint32 volatile *, bool); void runtime_semrelease(uint32 volatile *); int32 runtime_gomaxprocsfunc(int32 n); void runtime_procyield(uint32); @@ -711,19 +740,10 @@ void runtime_lockOSThread(void); void runtime_unlockOSThread(void); bool runtime_showframe(String, bool); +void runtime_printcreatedby(G*); uintptr runtime_memlimit(void); -// If appropriate, ask the operating system to control whether this -// thread should receive profiling signals. This is only necessary on OS X. 
-// An operating system should not deliver a profiling signal to a -// thread that is not actually executing (what good is that?), but that's -// what OS X prefers to do. When profiling is turned on, we mask -// away the profiling signal when threads go to sleep, so that OS X -// is forced to deliver the signal to a thread that's actually running. -// This is a no-op on other systems. -void runtime_setprof(bool); - #define ISNAN(f) __builtin_isnan(f) enum @@ -763,3 +783,6 @@ int32 getproccount(void); void __go_set_closure(void*); void* __go_get_closure(void); + +bool runtime_gcwaiting(void); +void runtime_badsignal(int); diff --git a/libgo/runtime/sema.goc b/libgo/runtime/sema.goc index be971bd1265..f5d5bc89e3d 100644 --- a/libgo/runtime/sema.goc +++ b/libgo/runtime/sema.goc @@ -21,22 +21,23 @@ package sync #include "runtime.h" #include "arch.h" -typedef struct Sema Sema; -struct Sema +typedef struct SemaWaiter SemaWaiter; +struct SemaWaiter { uint32 volatile* addr; G* g; int64 releasetime; - Sema* prev; - Sema* next; + int32 nrelease; // -1 for acquire + SemaWaiter* prev; + SemaWaiter* next; }; typedef struct SemaRoot SemaRoot; struct SemaRoot { Lock; - Sema* head; - Sema* tail; + SemaWaiter* head; + SemaWaiter* tail; // Number of waiters. Read w/o the lock. uint32 volatile nwait; }; @@ -58,7 +59,7 @@ semroot(uint32 volatile *addr) } static void -semqueue(SemaRoot *root, uint32 volatile *addr, Sema *s) +semqueue(SemaRoot *root, uint32 volatile *addr, SemaWaiter *s) { s->g = runtime_g(); s->addr = addr; @@ -72,7 +73,7 @@ semqueue(SemaRoot *root, uint32 volatile *addr, Sema *s) } static void -semdequeue(SemaRoot *root, Sema *s) +semdequeue(SemaRoot *root, SemaWaiter *s) { if(s->next) s->next->prev = s->prev; @@ -97,10 +98,10 @@ cansemacquire(uint32 volatile *addr) return 0; } -static void -semacquireimpl(uint32 volatile *addr, int32 profile) +void +runtime_semacquire(uint32 volatile *addr, bool profile) { - Sema s; // Needs to be allocated on stack, otherwise garbage collector could deallocate it + SemaWaiter s; // Needs to be allocated on stack, otherwise garbage collector could deallocate it SemaRoot *root; int64 t0; @@ -145,15 +146,9 @@ semacquireimpl(uint32 volatile *addr, int32 profile) } void -runtime_semacquire(uint32 volatile *addr) -{ - semacquireimpl(addr, 0); -} - -void runtime_semrelease(uint32 volatile *addr) { - Sema *s; + SemaWaiter *s; SemaRoot *root; root = semroot(addr); @@ -188,10 +183,117 @@ runtime_semrelease(uint32 volatile *addr) } } +// TODO(dvyukov): move to netpoll.goc once it's used by all OSes. +void net_runtime_Semacquire(uint32 *addr) + __asm__ (GOSYM_PREFIX "net.runtime_Semacquire"); + +void net_runtime_Semacquire(uint32 *addr) +{ + runtime_semacquire(addr, true); +} + +void net_runtime_Semrelease(uint32 *addr) + __asm__ (GOSYM_PREFIX "net.runtime_Semrelease"); + +void net_runtime_Semrelease(uint32 *addr) +{ + runtime_semrelease(addr); +} + func runtime_Semacquire(addr *uint32) { - semacquireimpl(addr, 1); + runtime_semacquire(addr, true); } func runtime_Semrelease(addr *uint32) { runtime_semrelease(addr); } + +typedef struct SyncSema SyncSema; +struct SyncSema +{ + Lock; + SemaWaiter* head; + SemaWaiter* tail; +}; + +func runtime_Syncsemcheck(size uintptr) { + if(size != sizeof(SyncSema)) { + runtime_printf("bad SyncSema size: sync:%D runtime:%D\n", (int64)size, (int64)sizeof(SyncSema)); + runtime_throw("bad SyncSema size"); + } +} + +// Syncsemacquire waits for a pairing Syncsemrelease on the same semaphore s. 
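In other words SyncSema is a strict rendezvous rather than a counter: a release carries an nrelease count that successive acquirers consume one by one, and whichever side arrives first parks until enough of the other side has shown up. A rough timeline under that contract (goroutine labels and ordering illustrative; the acquire half of the exchange is implemented next):

	// G1: Syncsemrelease(s, 2)  -> no acquirer queued; G1 parks ("semarelease") with nrelease=2
	// G2: Syncsemacquire(s)     -> consumes one pending release (2 -> 1) and returns at once
	// G3: Syncsemacquire(s)     -> consumes the last one (1 -> 0), readies G1, returns
	// G4: Syncsemacquire(s)     -> nothing pending; G4 parks ("semacquire")
	// G5: Syncsemrelease(s, 1)  -> pairs with G4, readies it, and returns without parking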
+func runtime_Syncsemacquire(s *SyncSema) { + SemaWaiter w, *wake; + int64 t0; + + w.g = runtime_g(); + w.nrelease = -1; + w.next = nil; + w.releasetime = 0; + t0 = 0; + if(runtime_blockprofilerate > 0) { + t0 = runtime_cputicks(); + w.releasetime = -1; + } + + runtime_lock(s); + if(s->head && s->head->nrelease > 0) { + // have pending release, consume it + wake = nil; + s->head->nrelease--; + if(s->head->nrelease == 0) { + wake = s->head; + s->head = wake->next; + if(s->head == nil) + s->tail = nil; + } + runtime_unlock(s); + if(wake) + runtime_ready(wake->g); + } else { + // enqueue itself + if(s->tail == nil) + s->head = &w; + else + s->tail->next = &w; + s->tail = &w; + runtime_park(runtime_unlock, s, "semacquire"); + if(t0) + runtime_blockevent(w.releasetime - t0, 2); + } +} + +// Syncsemrelease waits for n pairing Syncsemacquire on the same semaphore s. +func runtime_Syncsemrelease(s *SyncSema, n uint32) { + SemaWaiter w, *wake; + + w.g = runtime_g(); + w.nrelease = (int32)n; + w.next = nil; + w.releasetime = 0; + + runtime_lock(s); + while(w.nrelease > 0 && s->head && s->head->nrelease < 0) { + // have pending acquire, satisfy it + wake = s->head; + s->head = wake->next; + if(s->head == nil) + s->tail = nil; + if(wake->releasetime) + wake->releasetime = runtime_cputicks(); + runtime_ready(wake->g); + w.nrelease--; + } + if(w.nrelease > 0) { + // enqueue itself + if(s->tail == nil) + s->head = &w; + else + s->tail->next = &w; + s->tail = &w; + runtime_park(runtime_unlock, s, "semarelease"); + } else + runtime_unlock(s); +} diff --git a/libgo/runtime/signal_unix.c b/libgo/runtime/signal_unix.c index 5a506c8af3d..ea0a58f2ea2 100644 --- a/libgo/runtime/signal_unix.c +++ b/libgo/runtime/signal_unix.c @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build darwin freebsd linux openbsd netbsd +// +build darwin dragonfly freebsd linux openbsd netbsd #include <sys/time.h> @@ -100,13 +100,11 @@ runtime_resetcpuprofiler(int32 hz) runtime_memclr((byte*)&it, sizeof it); if(hz == 0) { runtime_setitimer(ITIMER_PROF, &it, nil); - runtime_setprof(false); } else { it.it_interval.tv_sec = 0; it.it_interval.tv_usec = 1000000 / hz; it.it_value = it.it_interval; runtime_setitimer(ITIMER_PROF, &it, nil); - runtime_setprof(true); } runtime_m()->profilehz = hz; } diff --git a/libgo/runtime/sigqueue.goc b/libgo/runtime/sigqueue.goc index 8657216d3f4..6769b239dc3 100644 --- a/libgo/runtime/sigqueue.goc +++ b/libgo/runtime/sigqueue.goc @@ -107,9 +107,7 @@ func signal_recv() (m uint32) { new = HASWAITER; if(runtime_cas(&sig.state, old, new)) { if (new == HASWAITER) { - runtime_entersyscallblock(); - runtime_notesleep(&sig); - runtime_exitsyscall(); + runtime_notetsleepg(&sig, -1); runtime_noteclear(&sig); } break; @@ -157,3 +155,10 @@ func signal_disable(s uint32) { sig.wanted[s/32] &= ~(1U<<(s&31)); runtime_sigdisable(s); } + +// This runs on a foreign stack, without an m or a g. No stack split. 
+void +runtime_badsignal(int sig) +{ + __go_sigsend(sig); +} diff --git a/libgo/runtime/string.goc b/libgo/runtime/string.goc index 64ed4f6ebaa..a7446e93c45 100644 --- a/libgo/runtime/string.goc +++ b/libgo/runtime/string.goc @@ -21,6 +21,18 @@ runtime_findnull(const byte *s) return __builtin_strlen((const char*) s); } +intgo +runtime_findnullw(const uint16 *s) +{ + intgo l; + + if(s == nil) + return 0; + for(l=0; s[l]!=0; l++) + ; + return l; +} + static String gostringsize(intgo l, byte** pmem) { @@ -32,7 +44,7 @@ gostringsize(intgo l, byte** pmem) return runtime_emptystring; } // leave room for NUL for C runtime (e.g., callers of getenv) - mem = runtime_mallocgc(l+1, FlagNoPointers, 1, 0); + mem = runtime_mallocgc(l+1, 0, FlagNoScan|FlagNoZero); s.str = mem; s.len = l; mem[l] = 0; @@ -63,6 +75,15 @@ runtime_gostringnocopy(const byte *str) return s; } +String runtime_cstringToGo(byte*) + __asm__ (GOSYM_PREFIX "runtime.cstringToGo"); + +String +runtime_cstringToGo(byte *str) +{ + return runtime_gostringnocopy(str); +} + enum { Runeself = 0x80, diff --git a/libgo/runtime/thread-linux.c b/libgo/runtime/thread-linux.c index 13d23c47b07..ae56261e6f5 100644 --- a/libgo/runtime/thread-linux.c +++ b/libgo/runtime/thread-linux.c @@ -4,6 +4,7 @@ #include "runtime.h" #include "defs.h" +#include "signal_unix.h" // Linux futex. // @@ -33,25 +34,22 @@ typedef struct timespec Timespec; void runtime_futexsleep(uint32 *addr, uint32 val, int64 ns) { - Timespec ts, *tsp; - - if(ns < 0) - tsp = nil; - else { - ts.tv_sec = ns/1000000000LL; - ts.tv_nsec = ns%1000000000LL; - // Avoid overflow - if(ts.tv_sec > 1<<30) - ts.tv_sec = 1<<30; - tsp = &ts; - } + Timespec ts; + int32 nsec; // Some Linux kernels have a bug where futex of // FUTEX_WAIT returns an internal error code // as an errno. Libpthread ignores the return value // here, and so can we: as it says a few lines up, // spurious wakeups are allowed. - syscall(__NR_futex, addr, FUTEX_WAIT, val, tsp, nil, 0); + + if(ns < 0) { + syscall(__NR_futex, addr, FUTEX_WAIT, val, nil, nil, 0); + return; + } + ts.tv_sec = runtime_timediv(ns, 1000000000LL, &nsec); + ts.tv_nsec = nsec; + syscall(__NR_futex, addr, FUTEX_WAIT, val, &ts, nil, 0); } // If any procs are sleeping on addr, wake up at most cnt. diff --git a/libgo/runtime/time.goc b/libgo/runtime/time.goc index 8d12fe01080..e4e35ec0846 100644 --- a/libgo/runtime/time.goc +++ b/libgo/runtime/time.goc @@ -12,8 +12,13 @@ package time #include "malloc.h" #include "race.h" +enum { + debug = 0, +}; + static Timers timers; static void addtimer(Timer*); +static void dumptimers(const char*); // Package time APIs. // Godoc uses the comments in package time, not these. @@ -92,6 +97,11 @@ addtimer(Timer *t) int32 n; Timer **nt; + // when must never be negative; otherwise timerproc will overflow + // during its delta calculation and never expire other timers. + if(t->when < 0) + t->when = (int64)((1ULL<<63)-1); + if(timers.len >= timers.cap) { // Grow slice. n = 16; @@ -121,8 +131,13 @@ addtimer(Timer *t) timers.timerproc = __go_go(timerproc, nil); timers.timerproc->issystem = true; } + if(debug) + dumptimers("addtimer"); } +// Used to force a dereference before the lock is acquired. +static int32 gi; + // Delete timer t from the heap. // Do not need to update the timerproc: // if it wakes up early, no big deal. @@ -131,6 +146,11 @@ runtime_deltimer(Timer *t) { int32 i; + // Dereference t so that any panic happens before the lock is held. + // Discard result, because t might be moving in the heap. 
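	// Writing the value into the global gi is what keeps the compiler
	// from treating the read of t->i below as dead and deleting it.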
+ i = t->i; + gi = i; + runtime_lock(&timers); // t may not be registered anymore and may have @@ -152,6 +172,8 @@ runtime_deltimer(Timer *t) siftup(i); siftdown(i); } + if(debug) + dumptimers("deltimer"); runtime_unlock(&timers); return true; } @@ -170,6 +192,7 @@ timerproc(void* dummy __attribute__ ((unused))) for(;;) { runtime_lock(&timers); + timers.sleeping = false; now = runtime_nanotime(); for(;;) { if(timers.len == 0) { @@ -210,9 +233,7 @@ timerproc(void* dummy __attribute__ ((unused))) timers.sleeping = true; runtime_noteclear(&timers.waitnote); runtime_unlock(&timers); - runtime_entersyscallblock(); - runtime_notetsleep(&timers.waitnote, delta); - runtime_exitsyscall(); + runtime_notetsleepg(&timers.waitnote, delta); } } @@ -222,18 +243,20 @@ static void siftup(int32 i) { int32 p; + int64 when; Timer **t, *tmp; t = timers.t; + when = t[i]->when; + tmp = t[i]; while(i > 0) { - p = (i-1)/2; // parent - if(t[i]->when >= t[p]->when) + p = (i-1)/4; // parent + if(when >= t[p]->when) break; - tmp = t[i]; t[i] = t[p]; - t[p] = tmp; t[i]->i = i; - t[p]->i = p; + t[p] = tmp; + tmp->i = p; i = p; } } @@ -241,29 +264,61 @@ siftup(int32 i) static void siftdown(int32 i) { - int32 c, len; + int32 c, c3, len; + int64 when, w, w3; Timer **t, *tmp; t = timers.t; len = timers.len; + when = t[i]->when; + tmp = t[i]; for(;;) { - c = i*2 + 1; // left child + c = i*4 + 1; // left child + c3 = c + 2; // mid child if(c >= len) { break; } - if(c+1 < len && t[c+1]->when < t[c]->when) + w = t[c]->when; + if(c+1 < len && t[c+1]->when < w) { + w = t[c+1]->when; c++; - if(t[c]->when >= t[i]->when) + } + if(c3 < len) { + w3 = t[c3]->when; + if(c3+1 < len && t[c3+1]->when < w3) { + w3 = t[c3+1]->when; + c3++; + } + if(w3 < w) { + w = w3; + c = c3; + } + } + if(w >= when) break; - tmp = t[i]; t[i] = t[c]; - t[c] = tmp; t[i]->i = i; - t[c]->i = c; + t[c] = tmp; + tmp->i = c; i = c; } } +static void +dumptimers(const char *msg) +{ + Timer *t; + int32 i; + + runtime_printf("timers: %s\n", msg); + for(i = 0; i < timers.len; i++) { + t = timers.t[i]; + runtime_printf("\t%d\t%p:\ti %d when %D period %D fn %p\n", + i, t, t->i, t->when, t->period, t->fv->fn); + } + runtime_printf("\n"); +} + void runtime_time_scan(void (*addroot)(Obj)) { |
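One more note on the reshaped timer heap: siftup and siftdown above move the timers to a 4-ary layout, so the parent of slot i is (i-1)/4 and its children are 4*i+1 through 4*i+4, with siftdown comparing the left child and the c+2 mid child and then their right-hand neighbours to find the smallest when. A small index sketch under that layout (illustrative):

	// 4-ary heap indexing used by the timer code above:
	//   parent(i)   = (i-1)/4
	//   children(i) = 4*i+1, 4*i+2, 4*i+3, 4*i+4
	//
	// With 11 timers (slots 0..10):
	//   slot 0 -> children 1,2,3,4
	//   slot 1 -> children 5,6,7,8
	//   slot 2 -> children 9,10      (the rest fall outside len)
	//   parent(7) = (7-1)/4 = 1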