summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuss Cox <rsc@golang.org>2014-12-01 16:42:41 -0500
committerRuss Cox <rsc@golang.org>2014-12-01 16:42:41 -0500
commit5bfd239aab779ff234ad60fbd27d1effdc2db2d2 (patch)
tree4277d6c6536acfd2e29204cb282938cc4f963d86
parent523a94031f3956c9bafe1c7f33bb25fb4e21993e (diff)
downloadgo-5bfd239aab779ff234ad60fbd27d1effdc2db2d2.tar.gz
[release-branch.go1.4] runtime: fix hang in GC due to shrinkstack vs netpoll race
??? CL 179680043 / 752cd9199639 runtime: fix hang in GC due to shrinkstack vs netpoll race During garbage collection, after scanning a stack, we think about shrinking it to reclaim some memory. The shrinking code (called while the world is stopped) checked that the status was Gwaiting or Grunnable and then changed the state to Gcopystack, to essentially lock the stack so that no other GC thread is scanning it. The same locking happens for stack growth (and is more necessary there). oldstatus = runtime?readgstatus(gp); oldstatus &= ~Gscan; if(oldstatus == Gwaiting || oldstatus == Grunnable) runtime?casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable else runtime?throw("copystack: bad status, not Gwaiting or Grunnable"); Unfortunately, "stop the world" doesn't stop everything. It stops all normal goroutine execution, but the network polling thread is still blocked in epoll and may wake up. If it does, and it chooses a goroutine to mark runnable, and that goroutine is the one whose stack is shrinking, then it can happen that between readgstatus and casgstatus, the status changes from Gwaiting to Grunnable. casgstatus assumes that if the status is not what is expected, it is a transient change (like from Gwaiting to Gscanwaiting and back, or like from Gwaiting to Gcopystack and back), and it loops until the status has been restored to the expected value. In this case, the status has changed semi-permanently from Gwaiting to Grunnable - it won't change again until the GC is done and the world can continue, but the GC is waiting for the status to change back. This wedges the program. To fix, call a special variant of casgstatus that accepts either Gwaiting or Grunnable as valid statuses. Without the fix bug with the extra check+throw in casgstatus, the program below dies in a few seconds (2-10) with GOMAXPROCS=8 on a 2012 Retina MacBook Pro. With the fix, it runs for minutes and minutes. package main import ( "io" "log" "net" "runtime" ) func main() { const N = 100 for i := 0; i < N; i++ { l, err := net.Listen("tcp", "127.0.0.1:0") if err != nil { log.Fatal(err) } ch := make(chan net.Conn, 1) go func() { var err error c1, err := net.Dial("tcp", l.Addr().String()) if err != nil { log.Fatal(err) } ch <- c1 }() c2, err := l.Accept() if err != nil { log.Fatal(err) } c1 := <-ch l.Close() go netguy(c1, c2) go netguy(c2, c1) c1.Write(make([]byte, 100)) } for { runtime.GC() } } func netguy(r, w net.Conn) { buf := make([]byte, 100) for { bigstack(1000) _, err := io.ReadFull(r, buf) if err != nil { log.Fatal(err) } w.Write(buf) } } var g int func bigstack(n int) { var buf [100]byte if n > 0 { bigstack(n - 1) } g = int(buf[0]) + int(buf[99]) } Fixes issue 9186. LGTM=rlh R=austin, rlh CC=dvyukov, golang-codereviews, iant, khr, r https://codereview.appspot.com/179680043 ??? TBR=rlh CC=golang-codereviews https://codereview.appspot.com/184030043
-rw-r--r--src/runtime/proc.c32
-rw-r--r--src/runtime/runtime.h2
-rw-r--r--src/runtime/stack.c7
3 files changed, 35 insertions, 6 deletions
diff --git a/src/runtime/proc.c b/src/runtime/proc.c
index 91e3fe16d..8462c4b1d 100644
--- a/src/runtime/proc.c
+++ b/src/runtime/proc.c
@@ -402,6 +402,7 @@ runtime·castogscanstatus(G *gp, uint32 oldval, uint32 newval)
static void badcasgstatus(void);
static void helpcasgstatus(void);
+static void badgstatusrunnable(void);
// If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
// and casfromgscanstatus instead.
@@ -423,6 +424,10 @@ runtime·casgstatus(G *gp, uint32 oldval, uint32 newval)
// loop if gp->atomicstatus is in a scan state giving
// GC time to finish and change the state to oldval.
while(!runtime·cas(&gp->atomicstatus, oldval, newval)) {
+ if(oldval == Gwaiting && gp->atomicstatus == Grunnable) {
+ fn = badgstatusrunnable;
+ runtime·onM(&fn);
+ }
// Help GC if needed.
if(gp->preemptscan && !gp->gcworkdone && (oldval == Grunning || oldval == Gsyscall)) {
gp->preemptscan = false;
@@ -434,6 +439,33 @@ runtime·casgstatus(G *gp, uint32 oldval, uint32 newval)
}
static void
+badgstatusrunnable(void)
+{
+ runtime·throw("casgstatus: waiting for Gwaiting but is Grunnable");
+}
+
+// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
+// Returns old status. Cannot call casgstatus directly, because we are racing with an
+// async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus,
+// it might have become Grunnable by the time we get to the cas. If we called casgstatus,
+// it would loop waiting for the status to go back to Gwaiting, which it never will.
+#pragma textflag NOSPLIT
+uint32
+runtime·casgcopystack(G *gp)
+{
+ uint32 oldstatus;
+
+ for(;;) {
+ oldstatus = runtime·readgstatus(gp) & ~Gscan;
+ if(oldstatus != Gwaiting && oldstatus != Grunnable)
+ runtime·throw("copystack: bad status, not Gwaiting or Grunnable");
+ if(runtime·cas(&gp->atomicstatus, oldstatus, Gcopystack))
+ break;
+ }
+ return oldstatus;
+}
+
+static void
badcasgstatus(void)
{
uint32 oldval, newval;
diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h
index 977c4547d..177a1287e 100644
--- a/src/runtime/runtime.h
+++ b/src/runtime/runtime.h
@@ -666,6 +666,8 @@ enum {
uint32 runtime·readgstatus(G*);
void runtime·casgstatus(G*, uint32, uint32);
+void runtime·casgstatus(G*, uint32, uint32);
+uint32 runtime·casgcopystack(G*);
void runtime·quiesce(G*);
bool runtime·stopg(G*);
void runtime·restartg(G*);
diff --git a/src/runtime/stack.c b/src/runtime/stack.c
index 072bc242b..cb9557243 100644
--- a/src/runtime/stack.c
+++ b/src/runtime/stack.c
@@ -637,12 +637,7 @@ copystack(G *gp, uintptr newsize)
}
runtime·memmove((byte*)new.hi - used, (byte*)old.hi - used, used);
- oldstatus = runtime·readgstatus(gp);
- oldstatus &= ~Gscan;
- if(oldstatus == Gwaiting || oldstatus == Grunnable)
- runtime·casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable
- else
- runtime·throw("copystack: bad status, not Gwaiting or Grunnable");
+ oldstatus = runtime·casgcopystack(gp); // cas from Gwaiting or Grunnable to Gcopystack, return old status
// Swap out old stack for new one
gp->stack = new;