From 5756b3560141d0c09c4a27d2025f5438f49f59f2 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 10 Sep 2020 21:20:46 +0000 Subject: runtime: align 12-byte objects to 8 bytes on 32-bit systems Currently on 32-bit systems 8-byte fields in a struct have an alignment of 4 bytes, which means that atomic instructions may fault. This issue is tracked in #36606. Our current workaround is to allocate memory and put any such atomically accessed fields at the beginning of the object. This workaround fails because the tiny allocator might not align the object right. This case specifically only happens with 12-byte objects because a type's size is rounded up to its alignment. So if e.g. we have a type like: type obj struct { a uint64 b byte } then its size will be 12 bytes, because "a" will require a 4 byte alignment. This argument may be extended to all objects of size 9-15 bytes. So, make this workaround work by specifically aligning such objects to 8 bytes on 32-bit systems. This change leaves a TODO to remove the code once #36606 gets resolved. It also adds a test which will presumably no longer be necessary (the compiler should enforce the right alignment) when it gets resolved as well. Fixes #37262. Change-Id: I3a34e5b014b3c37ed2e5e75e62d71d8640aa42bc Reviewed-on: https://go-review.googlesource.com/c/go/+/254057 Reviewed-by: Cherry Zhang Reviewed-by: Austin Clements Run-TryBot: Cherry Zhang TryBot-Result: Go Bot Trust: Michael Knyszek --- src/runtime/malloc.go | 8 +++++++ src/runtime/malloc_test.go | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 4fa14996c2..c71f856f09 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1016,6 +1016,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // Align tiny pointer for required (conservative) alignment. if size&7 == 0 { off = alignUp(off, 8) + } else if sys.PtrSize == 4 && size == 12 { + // Conservatively align 12-byte objects to 8 bytes on 32-bit + // systems so that objects whose first field is a 64-bit + // value is aligned to 8 bytes and does not cause a fault on + // atomic access. See issue 37262. + // TODO(mknyszek): Remove this workaround if/when issue 36606 + // is resolved. + off = alignUp(off, 8) } else if size&3 == 0 { off = alignUp(off, 4) } else if size&1 == 0 { diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index 5c97f548fd..4ba94d0494 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -12,8 +12,10 @@ import ( "os" "os/exec" "reflect" + "runtime" . "runtime" "strings" + "sync/atomic" "testing" "time" "unsafe" @@ -168,6 +170,61 @@ func TestTinyAlloc(t *testing.T) { } } +var ( + tinyByteSink *byte + tinyUint32Sink *uint32 + tinyObj12Sink *obj12 +) + +type obj12 struct { + a uint64 + b uint32 +} + +func TestTinyAllocIssue37262(t *testing.T) { + // Try to cause an alignment access fault + // by atomically accessing the first 64-bit + // value of a tiny-allocated object. + // See issue 37262 for details. + + // GC twice, once to reach a stable heap state + // and again to make sure we finish the sweep phase. + runtime.GC() + runtime.GC() + + // Make 1-byte allocations until we get a fresh tiny slot. 
+ aligned := false + for i := 0; i < 16; i++ { + tinyByteSink = new(byte) + if uintptr(unsafe.Pointer(tinyByteSink))&0xf == 0xf { + aligned = true + break + } + } + if !aligned { + t.Fatal("unable to get a fresh tiny slot") + } + + // Create a 4-byte object so that the current + // tiny slot is partially filled. + tinyUint32Sink = new(uint32) + + // Create a 12-byte object, which fits into the + // tiny slot. If it actually gets place there, + // then the field "a" will be improperly aligned + // for atomic access on 32-bit architectures. + // This won't be true if issue 36606 gets resolved. + tinyObj12Sink = new(obj12) + + // Try to atomically access "x.a". + atomic.StoreUint64(&tinyObj12Sink.a, 10) + + // Clear the sinks. + tinyByteSink = nil + tinyUint32Sink = nil + tinyObj12Sink = nil +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() -- cgit v1.2.1 From fe2cfb74ba6352990f5b41260b99e80f78e4a90a Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Thu, 1 Oct 2020 14:49:33 -0700 Subject: all: drop 387 support My last 387 CL. So sad ... ... ... ... not! Fixes #40255 Change-Id: I8d4ddb744b234b8adc735db2f7c3c7b6d8bbdfa4 Reviewed-on: https://go-review.googlesource.com/c/go/+/258957 Trust: Keith Randall Run-TryBot: Keith Randall TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/runtime/mkpreempt.go | 33 +++++++++------------------------ src/runtime/preempt_386.s | 45 ++++++++++++++++++--------------------------- src/runtime/vlrt.go | 5 ++--- 3 files changed, 29 insertions(+), 54 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index c2e14cdcd6..c5bfb0f207 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -190,40 +190,25 @@ func (l *layout) restore() { func gen386() { p("PUSHFL") - // Save general purpose registers. + // Assign stack offsets. var l = layout{sp: "SP"} for _, reg := range regNames386 { - if reg == "SP" || strings.HasPrefix(reg, "X") { + if reg == "SP" { continue } - l.add("MOVL", reg, 4) - } - - // Save the 387 state. - l.addSpecial( - "FSAVE %d(SP)\nFLDCW runtime·controlWord64(SB)", - "FRSTOR %d(SP)", - 108) - - // Save SSE state only if supported. 
- lSSE := layout{stack: l.stack, sp: "SP"} - for i := 0; i < 8; i++ { - lSSE.add("MOVUPS", fmt.Sprintf("X%d", i), 16) + if strings.HasPrefix(reg, "X") { + l.add("MOVUPS", reg, 16) + } else { + l.add("MOVL", reg, 4) + } } - p("ADJSP $%d", lSSE.stack) + p("ADJSP $%d", l.stack) p("NOP SP") l.save() - p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse") - lSSE.save() - label("nosse:") p("CALL ·asyncPreempt2(SB)") - p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse2") - lSSE.restore() - label("nosse2:") l.restore() - p("ADJSP $%d", -lSSE.stack) - + p("ADJSP $%d", -l.stack) p("POPFL") p("RET") } diff --git a/src/runtime/preempt_386.s b/src/runtime/preempt_386.s index a00ac8f385..5c9b8ea224 100644 --- a/src/runtime/preempt_386.s +++ b/src/runtime/preempt_386.s @@ -5,7 +5,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 PUSHFL - ADJSP $264 + ADJSP $156 NOP SP MOVL AX, 0(SP) MOVL CX, 4(SP) @@ -14,32 +14,23 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVL BP, 16(SP) MOVL SI, 20(SP) MOVL DI, 24(SP) - FSAVE 28(SP) - FLDCW runtime·controlWord64(SB) - CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 - JNE nosse - MOVUPS X0, 136(SP) - MOVUPS X1, 152(SP) - MOVUPS X2, 168(SP) - MOVUPS X3, 184(SP) - MOVUPS X4, 200(SP) - MOVUPS X5, 216(SP) - MOVUPS X6, 232(SP) - MOVUPS X7, 248(SP) -nosse: + MOVUPS X0, 28(SP) + MOVUPS X1, 44(SP) + MOVUPS X2, 60(SP) + MOVUPS X3, 76(SP) + MOVUPS X4, 92(SP) + MOVUPS X5, 108(SP) + MOVUPS X6, 124(SP) + MOVUPS X7, 140(SP) CALL ·asyncPreempt2(SB) - CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 - JNE nosse2 - MOVUPS 248(SP), X7 - MOVUPS 232(SP), X6 - MOVUPS 216(SP), X5 - MOVUPS 200(SP), X4 - MOVUPS 184(SP), X3 - MOVUPS 168(SP), X2 - MOVUPS 152(SP), X1 - MOVUPS 136(SP), X0 -nosse2: - FRSTOR 28(SP) + MOVUPS 140(SP), X7 + MOVUPS 124(SP), X6 + MOVUPS 108(SP), X5 + MOVUPS 92(SP), X4 + MOVUPS 76(SP), X3 + MOVUPS 60(SP), X2 + MOVUPS 44(SP), X1 + MOVUPS 28(SP), X0 MOVL 24(SP), DI MOVL 20(SP), SI MOVL 16(SP), BP @@ -47,6 +38,6 @@ nosse2: MOVL 8(SP), DX MOVL 4(SP), CX MOVL 0(SP), AX - ADJSP $-264 + ADJSP $-156 POPFL RET diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go index 38e0b32801..996c0611fd 100644 --- a/src/runtime/vlrt.go +++ b/src/runtime/vlrt.go @@ -263,7 +263,7 @@ func slowdodiv(n, d uint64) (q, r uint64) { return q, n } -// Floating point control word values for GOARCH=386 GO386=387. +// Floating point control word values. // Bits 0-5 are bits to disable floating-point exceptions. // Bits 8-9 are the precision control: // 0 = single precision a.k.a. float32 @@ -273,6 +273,5 @@ func slowdodiv(n, d uint64) (q, r uint64) { // 3 = round toward zero var ( controlWord64 uint16 = 0x3f + 2<<8 + 0<<10 - controlWord32 = 0x3f + 0<<8 + 0<<10 - controlWord64trunc = 0x3f + 2<<8 + 3<<10 + controlWord64trunc uint16 = 0x3f + 2<<8 + 3<<10 ) -- cgit v1.2.1 From d888f1d5c06828e9d7b0166f770a443f6315c2d1 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 1 Oct 2020 16:06:03 -0400 Subject: runtime: add debugging to TestTimePprof We've seen timeouts of TestTimePprof, but the tracebacks aren't useful because goroutines are running on other threads. Add GOTRACEBACK=crash to catch these in the future. For #41120. Change-Id: I97318172ef78d0cbab10df5e4ffcbfeadff579e3 Reviewed-on: https://go-review.googlesource.com/c/go/+/258802 Trust: Austin Clements Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Bryan C. 
Mills --- src/runtime/crash_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 34f30c9a37..eae4f538c1 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -667,7 +667,9 @@ func TestBadTraceback(t *testing.T) { } func TestTimePprof(t *testing.T) { - fn := runTestProg(t, "testprog", "TimeProf") + // Pass GOTRACEBACK for issue #41120 to try to get more + // information on timeout. + fn := runTestProg(t, "testprog", "TimeProf", "GOTRACEBACK=crash") fn = strings.TrimSpace(fn) defer os.Remove(fn) -- cgit v1.2.1 From f89d05eb7ba1885474d03bb62f0a36a2d3cf56ea Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 1 Oct 2020 10:58:47 -0400 Subject: runtime: update and tidy cgo callback description The documentation on how cgo callbacks (C -> Go calls) works internally has gotten somewhat stale. This CL refreshes it. Change-Id: I1ab66225c9da52d698d97ebeb4f3c7b9b5ee97db Reviewed-on: https://go-review.googlesource.com/c/go/+/258937 Trust: Austin Clements Reviewed-by: Ian Lance Taylor Reviewed-by: Cherry Zhang --- src/runtime/cgocall.go | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index 427ed0ffb9..0b69ff3233 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -35,44 +35,48 @@ // cgo writes a gcc-compiled function named GoF (not p.GoF, since gcc doesn't // know about packages). The gcc-compiled C function f calls GoF. // -// GoF calls crosscall2(_cgoexp_GoF, frame, framesize). Crosscall2 -// (in cgo/gcc_$GOARCH.S, a gcc-compiled assembly file) is a two-argument -// adapter from the gcc function call ABI to the 6c function call ABI. -// It is called from gcc to call 6c functions. In this case it calls -// _cgoexp_GoF(frame, framesize), still running on m->g0's stack +// GoF calls crosscall2(_cgoexp_GoF, frame, framesize, ctxt). +// Crosscall2 (in cgo/asm_$GOARCH.s) is a four-argument adapter from +// the gcc function call ABI to the gc function call ABI. +// It is called from gcc to call gc functions. In this case it calls +// _cgoexp_GoF(frame, framesize), still running on m.g0's stack // and outside the $GOMAXPROCS limit. Thus, this code cannot yet // call arbitrary Go code directly and must be careful not to allocate -// memory or use up m->g0's stack. +// memory or use up m.g0's stack. // -// _cgoexp_GoF calls runtime.cgocallback(p.GoF, frame, framesize, ctxt). +// _cgoexp_GoF (generated by cmd/cgo) calls +// runtime.cgocallback(funcPC(p.GoF), frame, framesize, ctxt). // (The reason for having _cgoexp_GoF instead of writing a crosscall3 // to make this call directly is that _cgoexp_GoF, because it is compiled -// with 6c instead of gcc, can refer to dotted names like +// with gc instead of gcc, can refer to dotted names like // runtime.cgocallback and p.GoF.) // -// runtime.cgocallback (in asm_$GOARCH.s) switches from m->g0's -// stack to the original g (m->curg)'s stack, on which it calls +// runtime.cgocallback (in asm_$GOARCH.s) turns the raw PC of p.GoF +// into a Go function value and calls runtime.cgocallback_gofunc. +// +// runtime.cgocallback_gofunc (in asm_$GOARCH.s) switches from m.g0's +// stack to the original g (m.curg)'s stack, on which it calls // runtime.cgocallbackg(p.GoF, frame, framesize). 
// As part of the stack switch, runtime.cgocallback saves the current -// SP as m->g0->sched.sp, so that any use of m->g0's stack during the +// SP as m.g0.sched.sp, so that any use of m.g0's stack during the // execution of the callback will be done below the existing stack frames. -// Before overwriting m->g0->sched.sp, it pushes the old value on the -// m->g0 stack, so that it can be restored later. +// Before overwriting m.g0.sched.sp, it pushes the old value on the +// m.g0 stack, so that it can be restored later. // // runtime.cgocallbackg (below) is now running on a real goroutine -// stack (not an m->g0 stack). First it calls runtime.exitsyscall, which will +// stack (not an m.g0 stack). First it calls runtime.exitsyscall, which will // block until the $GOMAXPROCS limit allows running this goroutine. // Once exitsyscall has returned, it is safe to do things like call the memory // allocator or invoke the Go callback function p.GoF. runtime.cgocallbackg -// first defers a function to unwind m->g0.sched.sp, so that if p.GoF -// panics, m->g0.sched.sp will be restored to its old value: the m->g0 stack -// and the m->curg stack will be unwound in lock step. +// first defers a function to unwind m.g0.sched.sp, so that if p.GoF +// panics, m.g0.sched.sp will be restored to its old value: the m.g0 stack +// and the m.curg stack will be unwound in lock step. // Then it calls p.GoF. Finally it pops but does not execute the deferred // function, calls runtime.entersyscall, and returns to runtime.cgocallback. // // After it regains control, runtime.cgocallback switches back to -// m->g0's stack (the pointer is still in m->g0.sched.sp), restores the old -// m->g0.sched.sp value from the stack, and returns to _cgoexp_GoF. +// m.g0's stack (the pointer is still in m.g0.sched.sp), restores the old +// m.g0.sched.sp value from the stack, and returns to _cgoexp_GoF. // // _cgoexp_GoF immediately returns to crosscall2, which restores the // callee-save registers for gcc and returns to GoF, which returns to f. -- cgit v1.2.1 From 869c02ce1f635960bfc2f06bb52e2b4e17eaa199 Mon Sep 17 00:00:00 2001 From: Elias Naur Date: Wed, 16 Sep 2020 15:23:58 +0200 Subject: misc/ios: add support for running programs on the iOS simulator Update the README to mention the emulator. Remove reference to gomobile while here; there are multiple ways to develop for iOS today, including using the c-archive buildmode directly. Updates #38485 Change-Id: Iccef75e646ea8e1b9bc3fc37419cc2d6bf3dfdf4 Reviewed-on: https://go-review.googlesource.com/c/go/+/255257 Run-TryBot: Elias Naur TryBot-Result: Go Bot Trust: Elias Naur Reviewed-by: Cherry Zhang --- src/runtime/cgo/gcc_darwin_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c index fd7d4084c9..9ea43ae4af 100644 --- a/src/runtime/cgo/gcc_darwin_arm64.c +++ b/src/runtime/cgo/gcc_darwin_arm64.c @@ -131,7 +131,7 @@ init_working_dir() fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", dir); } - // The test harness in go_darwin_arm_exec passes the relative working directory + // The test harness in go_ios_exec passes the relative working directory // in the GoExecWrapperWorkingDirectory property of the app bundle. 
CFStringRef wd_ref = CFBundleGetValueForInfoDictionaryKey(bundle, CFSTR("GoExecWrapperWorkingDirectory")); if (wd_ref != NULL) { -- cgit v1.2.1 From 9dc65d7dc9268d5150174ec55cc4753fe18f554c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Sat, 3 Oct 2020 16:44:22 -0400 Subject: runtime: correct signature of call16 The signature of call16 is currently missing the "typ" parameter. This CL fixes this. This wasn't caught by vet because call16 is defined by macro expansion (see #17544), and we didn't notice the mismatch with the other call* functions because call16 is defined only on 32-bit architectures and lives alone in stubs32.go. Unfortunately, this means its GC signature is also wrong: the "arg" parameter is treated as a scalar rather than a pointer, so GC won't trace it and stack copying won't adjust it. This turns out to matter in exactly one case right now: on 32-bit architectures (which are the only architectures where call16 is defined), a stack-allocated defer of a function with a 16-byte or smaller argument frame including a non-empty result area can corrupt memory if the deferred function grows the stack and is invoked during a panic. Whew. All other current uses of reflectcall pass a heap-allocated "arg" frame (which happens to be reachable from other stack roots, so tracing isn't a problem). Curiously, in 2016, the signatures of all call* functions were wrong in exactly this way. CL 31654 fixed all of them in stubs.go, but missed the one in stubs32.go. Fixes #41795. Change-Id: I31e3c0df201f79ee5707eeb8dc4ff0d13fc10ada Reviewed-on: https://go-review.googlesource.com/c/go/+/259338 Trust: Austin Clements Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/runtime/stubs32.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/stubs32.go b/src/runtime/stubs32.go index a7f52f6b9e..c4715fe989 100644 --- a/src/runtime/stubs32.go +++ b/src/runtime/stubs32.go @@ -11,4 +11,4 @@ import "unsafe" // Declarations for runtime services implemented in C or assembly that // are only present on 32 bit systems. -func call16(fn, arg unsafe.Pointer, n, retoffset uint32) +func call16(typ, fn, arg unsafe.Pointer, n, retoffset uint32) -- cgit v1.2.1 From 40bff82885b8de850f909f38357c53670562f815 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Sat, 3 Oct 2020 20:40:49 -0400 Subject: runtime: define and use call16 everywhere Currently, runtime.call16 is defined and used only on 32-bit architectures, while 64-bit architectures all start at call32 and go up from there. This led to unnecessary complexity because call16's prototype needed to be in a different file, separate from all of the other call* prototypes, which in turn led to it getting out of sync with the other call* prototypes. This CL adds call16 on 64-bit architectures, bringing them all into sync, and moves the call16 prototype to live with the others. Prior to CL 31655 (in 2016), call16 couldn't be implemented on 64-bit architectures because it needed at least four words of argument space to invoke "callwritebarrier" after copying back the results. CL 31655 changed the way call* invoked the write barrier in preparation for the hybrid barrier; since the hybrid barrier had to be invoked prior to copying back results, it needed a different solution that didn't reuse call*'s stack space. At this point, call16 was no longer a problem on 64-bit, but we never added it. Until now. 
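As a rough illustration of the failure mode behind the call16 signature fix in CL 259338 above (and the cleanup in this CL), the following hypothetical Go sketch shows the program shape that commit message describes; it is not code from either CL, and it assumes a 32-bit platform (e.g. 386):

    package main

    func grow(n int) {
        if n > 0 {
            grow(n - 1) // recurse enough to force stack growth
        }
    }

    func work() {
        x := new(int)
        // Deferred call with a pointer argument and a non-empty
        // result area. On a 32-bit platform the argument frame is
        // 12 bytes (a 4-byte *int plus an 8-byte error result), so
        // the panic machinery runs it through a small call* stub.
        defer func(p *int) error {
            grow(1 << 10) // grow the stack inside the deferred call
            *p = 42
            return nil
        }(x)
        panic("boom")
    }

    func main() {
        defer func() { recover() }()
        work()
    }

With the old prototype that omitted "typ", the "arg" frame pointer passed to call16 was recorded as a scalar, so the garbage collector would not trace it and stack copying would not adjust it while the deferred call grew the stack.
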
Change-Id: Id10ade0e4f75c6ea76afa6229ddaee2b994c27dd Reviewed-on: https://go-review.googlesource.com/c/go/+/259339 Trust: Austin Clements Reviewed-by: Cherry Zhang --- src/runtime/asm_amd64.s | 2 ++ src/runtime/asm_arm64.s | 2 ++ src/runtime/asm_mips64x.s | 1 + src/runtime/asm_ppc64x.s | 2 ++ src/runtime/asm_riscv64.s | 1 + src/runtime/asm_s390x.s | 2 ++ src/runtime/asm_wasm.s | 2 ++ src/runtime/stubs.go | 1 + src/runtime/stubs32.go | 14 -------------- 9 files changed, 13 insertions(+), 14 deletions(-) delete mode 100644 src/runtime/stubs32.go (limited to 'src/runtime') diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index fa25c55b96..256f4112cd 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -470,6 +470,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 TEXT ·reflectcall(SB), NOSPLIT, $0-32 MOVLQZX argsize+24(FP), CX + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -537,6 +538,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0 CALL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 6b3d1e779e..5eda3063d7 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -331,6 +331,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWU argsize+24(FP), R16 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -416,6 +417,7 @@ TEXT callRet<>(SB), NOSPLIT, $40-0 // These have 8 added to make the overall frame size a multiple of 16, // as required by the ABI. (There is another +8 for the saved LR.) 
+CALLFN(·call16, 24 ) CALLFN(·call32, 40 ) CALLFN(·call64, 72 ) CALLFN(·call128, 136 ) diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s index 7330f40e85..0ff1b24225 100644 --- a/src/runtime/asm_mips64x.s +++ b/src/runtime/asm_mips64x.s @@ -294,6 +294,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWU argsize+24(FP), R1 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index 23387a2165..603058a61b 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -372,6 +372,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWZ argsize+24(FP), R3 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -478,6 +479,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0 BL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s index 8f6c8773eb..4084ced7f8 100644 --- a/src/runtime/asm_riscv64.s +++ b/src/runtime/asm_riscv64.s @@ -342,6 +342,7 @@ TEXT reflect·call(SB), NOSPLIT, $0-0 // func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32) TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWU argsize+24(FP), T0 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s index cb39451faa..46a434119b 100644 --- a/src/runtime/asm_s390x.s +++ b/src/runtime/asm_s390x.s @@ -383,6 +383,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT, $-8-32 MOVWZ argsize+24(FP), R3 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -461,6 +462,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0 BL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s index 7d88beb537..1275af136b 100644 --- a/src/runtime/asm_wasm.s +++ b/src/runtime/asm_wasm.s @@ -308,6 +308,7 @@ TEXT ·reflectcall(SB), NOSPLIT, $0-32 MOVW argsize+24(FP), R0 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -398,6 +399,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0 CALL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index b891a12fdd..bd2514e862 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -271,6 +271,7 @@ func return0() // in asm_*.s // not called directly; definitions here supply type information for traceback. +func call16(typ, fn, arg unsafe.Pointer, n, retoffset uint32) func call32(typ, fn, arg unsafe.Pointer, n, retoffset uint32) func call64(typ, fn, arg unsafe.Pointer, n, retoffset uint32) func call128(typ, fn, arg unsafe.Pointer, n, retoffset uint32) diff --git a/src/runtime/stubs32.go b/src/runtime/stubs32.go deleted file mode 100644 index c4715fe989..0000000000 --- a/src/runtime/stubs32.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build 386 arm mips mipsle - -package runtime - -import "unsafe" - -// Declarations for runtime services implemented in C or assembly that -// are only present on 32 bit systems. - -func call16(typ, fn, arg unsafe.Pointer, n, retoffset uint32) -- cgit v1.2.1 From a517c3422e808ae51533a0700e05d59e8a799136 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Mon, 5 Oct 2020 12:17:30 -0400 Subject: runtime: clean up runtime.call* frame sizes on ARM64 ARM64 used to require that all assembly frame sizes were of the form 16*N+8 because ARM64 requires 16-byte SP alignment and the assembler added an 8 byte LR slot. This made all of the runtime.call* frame sizes wonky. The assembler now rounds up the frame size appropriately after adding any additional slots it needs, so this is no longer necessary. This CL cleans up the frame sizes of these functions so they look the way you'd expect and match all other architectures. Change-Id: I47819092296b8983c43eadf2e66c7c1e0d518555 Reviewed-on: https://go-review.googlesource.com/c/go/+/259448 Trust: Austin Clements Reviewed-by: Cherry Zhang --- src/runtime/asm_arm64.s | 56 ++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 5eda3063d7..1f46d1962c 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -415,35 +415,33 @@ TEXT callRet<>(SB), NOSPLIT, $40-0 BL runtime·reflectcallmove(SB) RET -// These have 8 added to make the overall frame size a multiple of 16, -// as required by the ABI. (There is another +8 for the saved LR.) -CALLFN(·call16, 24 ) -CALLFN(·call32, 40 ) -CALLFN(·call64, 72 ) -CALLFN(·call128, 136 ) -CALLFN(·call256, 264 ) -CALLFN(·call512, 520 ) -CALLFN(·call1024, 1032 ) -CALLFN(·call2048, 2056 ) -CALLFN(·call4096, 4104 ) -CALLFN(·call8192, 8200 ) -CALLFN(·call16384, 16392 ) -CALLFN(·call32768, 32776 ) -CALLFN(·call65536, 65544 ) -CALLFN(·call131072, 131080 ) -CALLFN(·call262144, 262152 ) -CALLFN(·call524288, 524296 ) -CALLFN(·call1048576, 1048584 ) -CALLFN(·call2097152, 2097160 ) -CALLFN(·call4194304, 4194312 ) -CALLFN(·call8388608, 8388616 ) -CALLFN(·call16777216, 16777224 ) -CALLFN(·call33554432, 33554440 ) -CALLFN(·call67108864, 67108872 ) -CALLFN(·call134217728, 134217736 ) -CALLFN(·call268435456, 268435464 ) -CALLFN(·call536870912, 536870920 ) -CALLFN(·call1073741824, 1073741832 ) +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) // func memhash32(p unsafe.Pointer, h uintptr) uintptr TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24 -- cgit v1.2.1 From 28e549dec3954b36d0c83442be913d8709d7e5ae Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Sat, 12 Sep 2020 12:33:24 -0400 
Subject: runtime: use sigaltstack on macOS/ARM64 Currently we don't use sigaltstack on darwin/arm64, as is not supported on iOS. However, it is supported on macOS. Use it. (iOS remains unchanged.) Change-Id: Icc154c5e2edf2dbdc8ca68741ad9157fc15a72ee Reviewed-on: https://go-review.googlesource.com/c/go/+/256917 Trust: Cherry Zhang Reviewed-by: Ian Lance Taylor --- src/runtime/mkpreempt.go | 7 ++----- src/runtime/os_darwin.go | 8 ++++---- src/runtime/preempt_arm64.s | 3 --- src/runtime/stack.go | 2 +- src/runtime/sys_darwin_arm64.s | 22 ++++++++++++++++++---- 5 files changed, 25 insertions(+), 17 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index c5bfb0f207..40683bb9d9 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -340,12 +340,9 @@ func genARM64() { p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux) p("SUB $8, RSP, R29") // set up new frame pointer p("#endif") - // On darwin, save the LR again after decrementing SP. We run the - // signal handler on the G stack (as it doesn't support SA_ONSTACK), + // On iOS, save the LR again after decrementing SP. We run the + // signal handler on the G stack (as it doesn't support sigaltstack), // so any writes below SP may be clobbered. - p("#ifdef GOOS_darwin") - p("MOVD R30, (RSP)") - p("#endif") p("#ifdef GOOS_ios") p("MOVD R30, (RSP)") p("#endif") diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index 01c40b4813..394bd6fb0f 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -289,9 +289,9 @@ func mpreinit(mp *m) { // Called to initialize a new m (including the bootstrap m). // Called on the new thread, cannot allocate memory. func minit() { - // The alternate signal stack is buggy on arm64. + // iOS does not support alternate signal stack. // The signal handler handles it directly. - if GOARCH != "arm64" { + if !(GOOS == "ios" && GOARCH == "arm64") { minitSignalStack() } minitSignalMask() @@ -301,9 +301,9 @@ func minit() { // Called from dropm to undo the effect of an minit. //go:nosplit func unminit() { - // The alternate signal stack is buggy on arm64. + // iOS does not support alternate signal stack. // See minit. - if GOARCH != "arm64" { + if !(GOOS == "ios" && GOARCH == "arm64") { unminitSignals() } } diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s index d0e77659c3..36ee13282c 100644 --- a/src/runtime/preempt_arm64.s +++ b/src/runtime/preempt_arm64.s @@ -10,9 +10,6 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVD R29, -8(RSP) SUB $8, RSP, R29 #endif - #ifdef GOOS_darwin - MOVD R30, (RSP) - #endif #ifdef GOOS_ios MOVD R30, (RSP) #endif diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 3802cd049e..2afc2635aa 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -66,7 +66,7 @@ const ( // to each stack below the usual guard area for OS-specific // purposes like signal handling. Used on Windows, Plan 9, // and iOS because they do not use a separate stack. 
- _StackSystem = sys.GoosWindows*512*sys.PtrSize + sys.GoosPlan9*512 + (sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64*1024 + _StackSystem = sys.GoosWindows*512*sys.PtrSize + sys.GoosPlan9*512 + sys.GoosIos*sys.GoarchArm64*1024 // The minimum size of stack used by Go code _StackMin = 2048 diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s index 585d4f2c64..427cb17781 100644 --- a/src/runtime/sys_darwin_arm64.s +++ b/src/runtime/sys_darwin_arm64.s @@ -202,6 +202,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$192 BEQ 2(PC) BL runtime·load_g(SB) +#ifdef GOOS_ios MOVD RSP, R6 CMP $0, g BEQ nog @@ -226,16 +227,21 @@ nog: // Switch to gsignal stack. MOVD R6, RSP - // Call sigtrampgo. + // Save arguments. MOVW R0, (8*1)(RSP) MOVD R1, (8*2)(RSP) MOVD R2, (8*3)(RSP) +#endif + + // Call sigtrampgo. MOVD $runtime·sigtrampgo(SB), R11 BL (R11) +#ifdef GOOS_ios // Switch to old stack. MOVD (8*4)(RSP), R5 MOVD R5, RSP +#endif // Restore callee-save registers. MOVD (8*4)(RSP), R19 @@ -329,12 +335,20 @@ TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0 ADD $16, RSP RET -// sigaltstack on iOS is not supported and will always -// run the signal handler on the main stack, so our sigtramp has -// to do the stack switch ourselves. TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0 +#ifdef GOOS_ios + // sigaltstack on iOS is not supported and will always + // run the signal handler on the main stack, so our sigtramp has + // to do the stack switch ourselves. MOVW $43, R0 BL libc_exit(SB) +#else + MOVD 8(R0), R1 // arg 2 old + MOVD 0(R0), R0 // arg 1 new + CALL libc_sigaltstack(SB) + CBZ R0, 2(PC) + BL notok<>(SB) +#endif RET // Thread related functions -- cgit v1.2.1 From a739306ca7d9ea3a98acca59b853fe889f04c28c Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Thu, 17 Sep 2020 10:53:10 -0400 Subject: runtime: enable more address bits on macOS/ARM64 Apparently macOS/ARM64 has 47-bit addresses, instead of 33-bit as on ios/ARM64. Enable more address bits. Updates #38485. Change-Id: I8aa64ba22a3933e3d9c4fffd17d902b5f31c30e3 Reviewed-on: https://go-review.googlesource.com/c/go/+/256918 Trust: Cherry Zhang Reviewed-by: Ian Lance Taylor Reviewed-by: Michael Knyszek --- src/runtime/malloc.go | 8 ++++---- src/runtime/mpagealloc_32bit.go | 4 ++-- src/runtime/mpagealloc_64bit.go | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index c71f856f09..f7e9b7c4b4 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -198,7 +198,7 @@ const ( // mips32 only has access to the low 2GB of virtual memory, so // we further limit it to 31 bits. // - // On darwin/arm64, although 64-bit pointers are presumably + // On ios/arm64, although 64-bit pointers are presumably // available, pointers are truncated to 33 bits. Furthermore, // only the top 4 GiB of the address space are actually available // to the application, but we allow the whole 33 bits anyway for @@ -207,7 +207,7 @@ const ( // arenaBaseOffset to offset into the top 4 GiB. // // WebAssembly currently has a limit of 4GB linear memory. 
- heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64 + heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosIos*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosIos*sys.GoarchArm64 // maxAlloc is the maximum size of an allocation. On 64-bit, // it's theoretically possible to allocate 1<= 0; i-- { var p uintptr switch { - case GOARCH == "arm64" && (GOOS == "darwin" || GOOS == "ios"): + case GOARCH == "arm64" && GOOS == "ios": p = uintptr(i)<<40 | uintptrMask&(0x0013<<28) case GOARCH == "arm64": p = uintptr(i)<<40 | uintptrMask&(0x0040<<32) diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go index 6658a900ac..90f1e54d6c 100644 --- a/src/runtime/mpagealloc_32bit.go +++ b/src/runtime/mpagealloc_32bit.go @@ -2,14 +2,14 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build 386 arm mips mipsle wasm darwin,arm64 +// +build 386 arm mips mipsle wasm ios,arm64 // wasm is a treated as a 32-bit architecture for the purposes of the page // allocator, even though it has 64-bit pointers. This is because any wasm // pointer always has its top 32 bits as zero, so the effective heap address // space is only 2^32 bytes in size (see heapAddrBits). -// darwin/arm64 is treated as a 32-bit architecture for the purposes of the +// ios/arm64 is treated as a 32-bit architecture for the purposes of the // page allocator, even though it has 64-bit pointers and a 33-bit address // space (see heapAddrBits). The 33 bit address space cannot be rounded up // to 64 bits because there are too many summary levels to fit in just 33 diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go index 831626e4b2..a1691ba802 100644 --- a/src/runtime/mpagealloc_64bit.go +++ b/src/runtime/mpagealloc_64bit.go @@ -2,9 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 !darwin,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x +// +build amd64 !ios,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x -// See mpagealloc_32bit.go for why darwin/arm64 is excluded here. +// See mpagealloc_32bit.go for why ios/arm64 is excluded here. package runtime -- cgit v1.2.1 From db428ad7b61ed757671162054252b4326045e96c Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Thu, 17 Sep 2020 15:02:26 -0400 Subject: all: enable more tests on macOS/ARM64 Updates #38485. 
Change-Id: Iac96f5ffe88521fcb11eab306d0df6463bdce046 Reviewed-on: https://go-review.googlesource.com/c/go/+/256920 Trust: Cherry Zhang Reviewed-by: Dmitri Shuralyov Reviewed-by: Ian Lance Taylor --- src/runtime/debug/panic_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/debug/panic_test.go b/src/runtime/debug/panic_test.go index 93be216985..b67a3de4f9 100644 --- a/src/runtime/debug/panic_test.go +++ b/src/runtime/debug/panic_test.go @@ -20,8 +20,8 @@ func TestPanicOnFault(t *testing.T) { if runtime.GOARCH == "s390x" { t.Skip("s390x fault addresses are missing the low order bits") } - if (runtime.GOOS == "darwin" || runtime.GOOS == "ios") && runtime.GOARCH == "arm64" { - t.Skip("darwin/arm64 doesn't provide fault addresses") + if runtime.GOOS == "ios" { + t.Skip("iOS doesn't provide fault addresses") } m, err := syscall.Mmap(-1, 0, 0x1000, syscall.PROT_READ /* Note: no PROT_WRITE */, syscall.MAP_SHARED|syscall.MAP_ANON) if err != nil { -- cgit v1.2.1 From 3923460dda205721d9bee2714a7f0dd403082a90 Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Sat, 3 Oct 2020 16:18:43 -0400 Subject: runtime/cgo: only build xx_cgo_panicmem on iOS On iOS, when running under lldb, we install xx_cgo_panicmem as EXC_BAD_ACCESS handler so we can get a proper Go panic for SIGSEGV. Only build it on iOS. Updates #38485. Change-Id: I801c477439e05920a4bb8fdf5eae6f4923ab8274 Reviewed-on: https://go-review.googlesource.com/c/go/+/259440 Trust: Cherry Zhang Reviewed-by: Ian Lance Taylor --- src/runtime/cgo/gcc_signal2_darwin_arm64.c | 11 -- src/runtime/cgo/gcc_signal2_ios_arm64.c | 11 ++ src/runtime/cgo/gcc_signal_darwin_arm64.c | 213 ----------------------------- src/runtime/cgo/gcc_signal_darwin_lldb.c | 12 -- src/runtime/cgo/gcc_signal_darwin_nolldb.c | 12 ++ src/runtime/cgo/gcc_signal_ios_arm64.c | 213 +++++++++++++++++++++++++++++ src/runtime/cgo/signal_darwin_arm64.go | 10 -- src/runtime/cgo/signal_darwin_arm64.s | 56 -------- src/runtime/cgo/signal_ios_arm64.go | 10 ++ src/runtime/cgo/signal_ios_arm64.s | 56 ++++++++ 10 files changed, 302 insertions(+), 302 deletions(-) delete mode 100644 src/runtime/cgo/gcc_signal2_darwin_arm64.c create mode 100644 src/runtime/cgo/gcc_signal2_ios_arm64.c delete mode 100644 src/runtime/cgo/gcc_signal_darwin_arm64.c delete mode 100644 src/runtime/cgo/gcc_signal_darwin_lldb.c create mode 100644 src/runtime/cgo/gcc_signal_darwin_nolldb.c create mode 100644 src/runtime/cgo/gcc_signal_ios_arm64.c delete mode 100644 src/runtime/cgo/signal_darwin_arm64.go delete mode 100644 src/runtime/cgo/signal_darwin_arm64.s create mode 100644 src/runtime/cgo/signal_ios_arm64.go create mode 100644 src/runtime/cgo/signal_ios_arm64.s (limited to 'src/runtime') diff --git a/src/runtime/cgo/gcc_signal2_darwin_arm64.c b/src/runtime/cgo/gcc_signal2_darwin_arm64.c deleted file mode 100644 index 5b8a18ffd6..0000000000 --- a/src/runtime/cgo/gcc_signal2_darwin_arm64.c +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build lldb - -// Used by gcc_signal_darwin_arm64.c when doing the test build during cgo. -// We hope that for real binaries the definition provided by Go will take precedence -// and the linker will drop this .o file altogether, which is why this definition -// is all by itself in its own file. 
-void __attribute__((weak)) xx_cgo_panicmem(void) {} diff --git a/src/runtime/cgo/gcc_signal2_ios_arm64.c b/src/runtime/cgo/gcc_signal2_ios_arm64.c new file mode 100644 index 0000000000..5b8a18ffd6 --- /dev/null +++ b/src/runtime/cgo/gcc_signal2_ios_arm64.c @@ -0,0 +1,11 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build lldb + +// Used by gcc_signal_darwin_arm64.c when doing the test build during cgo. +// We hope that for real binaries the definition provided by Go will take precedence +// and the linker will drop this .o file altogether, which is why this definition +// is all by itself in its own file. +void __attribute__((weak)) xx_cgo_panicmem(void) {} diff --git a/src/runtime/cgo/gcc_signal_darwin_arm64.c b/src/runtime/cgo/gcc_signal_darwin_arm64.c deleted file mode 100644 index 6519edd4cc..0000000000 --- a/src/runtime/cgo/gcc_signal_darwin_arm64.c +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Emulation of the Unix signal SIGSEGV. -// -// On iOS, Go tests and apps under development are run by lldb. -// The debugger uses a task-level exception handler to intercept signals. -// Despite having a 'handle' mechanism like gdb, lldb will not allow a -// SIGSEGV to pass to the running program. For Go, this means we cannot -// generate a panic, which cannot be recovered, and so tests fail. -// -// We work around this by registering a thread-level mach exception handler -// and intercepting EXC_BAD_ACCESS. The kernel offers thread handlers a -// chance to resolve exceptions before the task handler, so we can generate -// the panic and avoid lldb's SIGSEGV handler. -// -// The dist tool enables this by build flag when testing. - -// +build lldb - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "libcgo.h" -#include "libcgo_unix.h" - -void xx_cgo_panicmem(void); -uintptr_t x_cgo_panicmem = (uintptr_t)xx_cgo_panicmem; - -static pthread_mutex_t mach_exception_handler_port_set_mu; -static mach_port_t mach_exception_handler_port_set = MACH_PORT_NULL; - -kern_return_t -catch_exception_raise( - mach_port_t exception_port, - mach_port_t thread, - mach_port_t task, - exception_type_t exception, - exception_data_t code_vector, - mach_msg_type_number_t code_count) -{ - kern_return_t ret; - arm_unified_thread_state_t thread_state; - mach_msg_type_number_t state_count = ARM_UNIFIED_THREAD_STATE_COUNT; - - // Returning KERN_SUCCESS intercepts the exception. - // - // Returning KERN_FAILURE lets the exception fall through to the - // next handler, which is the standard signal emulation code - // registered on the task port. - - if (exception != EXC_BAD_ACCESS) { - return KERN_FAILURE; - } - - ret = thread_get_state(thread, ARM_UNIFIED_THREAD_STATE, (thread_state_t)&thread_state, &state_count); - if (ret) { - fprintf(stderr, "runtime/cgo: thread_get_state failed: %d\n", ret); - abort(); - } - - // Bounce call to sigpanic through asm that makes it look like - // we call sigpanic directly from the faulting code. 
-#ifdef __arm64__ - thread_state.ts_64.__x[1] = thread_state.ts_64.__lr; - thread_state.ts_64.__x[2] = thread_state.ts_64.__pc; - thread_state.ts_64.__pc = x_cgo_panicmem; -#else - thread_state.ts_32.__r[1] = thread_state.ts_32.__lr; - thread_state.ts_32.__r[2] = thread_state.ts_32.__pc; - thread_state.ts_32.__pc = x_cgo_panicmem; -#endif - - if (0) { - // Useful debugging logic when panicmem is broken. - // - // Sends the first SIGSEGV and lets lldb catch the - // second one, avoiding a loop that locks up iOS - // devices requiring a hard reboot. - fprintf(stderr, "runtime/cgo: caught exc_bad_access\n"); - fprintf(stderr, "__lr = %llx\n", thread_state.ts_64.__lr); - fprintf(stderr, "__pc = %llx\n", thread_state.ts_64.__pc); - static int pass1 = 0; - if (pass1) { - return KERN_FAILURE; - } - pass1 = 1; - } - - ret = thread_set_state(thread, ARM_UNIFIED_THREAD_STATE, (thread_state_t)&thread_state, state_count); - if (ret) { - fprintf(stderr, "runtime/cgo: thread_set_state failed: %d\n", ret); - abort(); - } - - return KERN_SUCCESS; -} - -void -darwin_arm_init_thread_exception_port() -{ - // Called by each new OS thread to bind its EXC_BAD_ACCESS exception - // to mach_exception_handler_port_set. - int ret; - mach_port_t port = MACH_PORT_NULL; - - ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); - if (ret) { - fprintf(stderr, "runtime/cgo: mach_port_allocate failed: %d\n", ret); - abort(); - } - ret = mach_port_insert_right( - mach_task_self(), - port, - port, - MACH_MSG_TYPE_MAKE_SEND); - if (ret) { - fprintf(stderr, "runtime/cgo: mach_port_insert_right failed: %d\n", ret); - abort(); - } - - ret = thread_set_exception_ports( - mach_thread_self(), - EXC_MASK_BAD_ACCESS, - port, - EXCEPTION_DEFAULT, - THREAD_STATE_NONE); - if (ret) { - fprintf(stderr, "runtime/cgo: thread_set_exception_ports failed: %d\n", ret); - abort(); - } - - ret = pthread_mutex_lock(&mach_exception_handler_port_set_mu); - if (ret) { - fprintf(stderr, "runtime/cgo: pthread_mutex_lock failed: %d\n", ret); - abort(); - } - ret = mach_port_move_member( - mach_task_self(), - port, - mach_exception_handler_port_set); - if (ret) { - fprintf(stderr, "runtime/cgo: mach_port_move_member failed: %d\n", ret); - abort(); - } - ret = pthread_mutex_unlock(&mach_exception_handler_port_set_mu); - if (ret) { - fprintf(stderr, "runtime/cgo: pthread_mutex_unlock failed: %d\n", ret); - abort(); - } -} - -static void* -mach_exception_handler(void *port) -{ - // Calls catch_exception_raise. - extern boolean_t exc_server(); - mach_msg_server(exc_server, 2048, (mach_port_t)port, 0); - abort(); // never returns -} - -void -darwin_arm_init_mach_exception_handler() -{ - pthread_mutex_init(&mach_exception_handler_port_set_mu, NULL); - - // Called once per process to initialize a mach port server, listening - // for EXC_BAD_ACCESS thread exceptions. - int ret; - pthread_t thr = NULL; - pthread_attr_t attr; - sigset_t ign, oset; - - ret = mach_port_allocate( - mach_task_self(), - MACH_PORT_RIGHT_PORT_SET, - &mach_exception_handler_port_set); - if (ret) { - fprintf(stderr, "runtime/cgo: mach_port_allocate failed for port_set: %d\n", ret); - abort(); - } - - // Block all signals to the exception handler thread - sigfillset(&ign); - pthread_sigmask(SIG_SETMASK, &ign, &oset); - - // Start a thread to handle exceptions. 
- uintptr_t port_set = (uintptr_t)mach_exception_handler_port_set; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); - ret = _cgo_try_pthread_create(&thr, &attr, mach_exception_handler, (void*)port_set); - - pthread_sigmask(SIG_SETMASK, &oset, nil); - - if (ret) { - fprintf(stderr, "runtime/cgo: pthread_create failed: %d\n", ret); - abort(); - } - pthread_attr_destroy(&attr); -} diff --git a/src/runtime/cgo/gcc_signal_darwin_lldb.c b/src/runtime/cgo/gcc_signal_darwin_lldb.c deleted file mode 100644 index 0ccdae324e..0000000000 --- a/src/runtime/cgo/gcc_signal_darwin_lldb.c +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !lldb -// +build darwin -// +build arm64 - -#include - -void darwin_arm_init_thread_exception_port() {} -void darwin_arm_init_mach_exception_handler() {} diff --git a/src/runtime/cgo/gcc_signal_darwin_nolldb.c b/src/runtime/cgo/gcc_signal_darwin_nolldb.c new file mode 100644 index 0000000000..26be71bd1d --- /dev/null +++ b/src/runtime/cgo/gcc_signal_darwin_nolldb.c @@ -0,0 +1,12 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !lldb !ios +// +build darwin +// +build arm64 + +#include + +void darwin_arm_init_thread_exception_port() {} +void darwin_arm_init_mach_exception_handler() {} diff --git a/src/runtime/cgo/gcc_signal_ios_arm64.c b/src/runtime/cgo/gcc_signal_ios_arm64.c new file mode 100644 index 0000000000..6519edd4cc --- /dev/null +++ b/src/runtime/cgo/gcc_signal_ios_arm64.c @@ -0,0 +1,213 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Emulation of the Unix signal SIGSEGV. +// +// On iOS, Go tests and apps under development are run by lldb. +// The debugger uses a task-level exception handler to intercept signals. +// Despite having a 'handle' mechanism like gdb, lldb will not allow a +// SIGSEGV to pass to the running program. For Go, this means we cannot +// generate a panic, which cannot be recovered, and so tests fail. +// +// We work around this by registering a thread-level mach exception handler +// and intercepting EXC_BAD_ACCESS. The kernel offers thread handlers a +// chance to resolve exceptions before the task handler, so we can generate +// the panic and avoid lldb's SIGSEGV handler. +// +// The dist tool enables this by build flag when testing. + +// +build lldb + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "libcgo.h" +#include "libcgo_unix.h" + +void xx_cgo_panicmem(void); +uintptr_t x_cgo_panicmem = (uintptr_t)xx_cgo_panicmem; + +static pthread_mutex_t mach_exception_handler_port_set_mu; +static mach_port_t mach_exception_handler_port_set = MACH_PORT_NULL; + +kern_return_t +catch_exception_raise( + mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + exception_data_t code_vector, + mach_msg_type_number_t code_count) +{ + kern_return_t ret; + arm_unified_thread_state_t thread_state; + mach_msg_type_number_t state_count = ARM_UNIFIED_THREAD_STATE_COUNT; + + // Returning KERN_SUCCESS intercepts the exception. 
+ // + // Returning KERN_FAILURE lets the exception fall through to the + // next handler, which is the standard signal emulation code + // registered on the task port. + + if (exception != EXC_BAD_ACCESS) { + return KERN_FAILURE; + } + + ret = thread_get_state(thread, ARM_UNIFIED_THREAD_STATE, (thread_state_t)&thread_state, &state_count); + if (ret) { + fprintf(stderr, "runtime/cgo: thread_get_state failed: %d\n", ret); + abort(); + } + + // Bounce call to sigpanic through asm that makes it look like + // we call sigpanic directly from the faulting code. +#ifdef __arm64__ + thread_state.ts_64.__x[1] = thread_state.ts_64.__lr; + thread_state.ts_64.__x[2] = thread_state.ts_64.__pc; + thread_state.ts_64.__pc = x_cgo_panicmem; +#else + thread_state.ts_32.__r[1] = thread_state.ts_32.__lr; + thread_state.ts_32.__r[2] = thread_state.ts_32.__pc; + thread_state.ts_32.__pc = x_cgo_panicmem; +#endif + + if (0) { + // Useful debugging logic when panicmem is broken. + // + // Sends the first SIGSEGV and lets lldb catch the + // second one, avoiding a loop that locks up iOS + // devices requiring a hard reboot. + fprintf(stderr, "runtime/cgo: caught exc_bad_access\n"); + fprintf(stderr, "__lr = %llx\n", thread_state.ts_64.__lr); + fprintf(stderr, "__pc = %llx\n", thread_state.ts_64.__pc); + static int pass1 = 0; + if (pass1) { + return KERN_FAILURE; + } + pass1 = 1; + } + + ret = thread_set_state(thread, ARM_UNIFIED_THREAD_STATE, (thread_state_t)&thread_state, state_count); + if (ret) { + fprintf(stderr, "runtime/cgo: thread_set_state failed: %d\n", ret); + abort(); + } + + return KERN_SUCCESS; +} + +void +darwin_arm_init_thread_exception_port() +{ + // Called by each new OS thread to bind its EXC_BAD_ACCESS exception + // to mach_exception_handler_port_set. + int ret; + mach_port_t port = MACH_PORT_NULL; + + ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); + if (ret) { + fprintf(stderr, "runtime/cgo: mach_port_allocate failed: %d\n", ret); + abort(); + } + ret = mach_port_insert_right( + mach_task_self(), + port, + port, + MACH_MSG_TYPE_MAKE_SEND); + if (ret) { + fprintf(stderr, "runtime/cgo: mach_port_insert_right failed: %d\n", ret); + abort(); + } + + ret = thread_set_exception_ports( + mach_thread_self(), + EXC_MASK_BAD_ACCESS, + port, + EXCEPTION_DEFAULT, + THREAD_STATE_NONE); + if (ret) { + fprintf(stderr, "runtime/cgo: thread_set_exception_ports failed: %d\n", ret); + abort(); + } + + ret = pthread_mutex_lock(&mach_exception_handler_port_set_mu); + if (ret) { + fprintf(stderr, "runtime/cgo: pthread_mutex_lock failed: %d\n", ret); + abort(); + } + ret = mach_port_move_member( + mach_task_self(), + port, + mach_exception_handler_port_set); + if (ret) { + fprintf(stderr, "runtime/cgo: mach_port_move_member failed: %d\n", ret); + abort(); + } + ret = pthread_mutex_unlock(&mach_exception_handler_port_set_mu); + if (ret) { + fprintf(stderr, "runtime/cgo: pthread_mutex_unlock failed: %d\n", ret); + abort(); + } +} + +static void* +mach_exception_handler(void *port) +{ + // Calls catch_exception_raise. + extern boolean_t exc_server(); + mach_msg_server(exc_server, 2048, (mach_port_t)port, 0); + abort(); // never returns +} + +void +darwin_arm_init_mach_exception_handler() +{ + pthread_mutex_init(&mach_exception_handler_port_set_mu, NULL); + + // Called once per process to initialize a mach port server, listening + // for EXC_BAD_ACCESS thread exceptions. 
+ int ret; + pthread_t thr = NULL; + pthread_attr_t attr; + sigset_t ign, oset; + + ret = mach_port_allocate( + mach_task_self(), + MACH_PORT_RIGHT_PORT_SET, + &mach_exception_handler_port_set); + if (ret) { + fprintf(stderr, "runtime/cgo: mach_port_allocate failed for port_set: %d\n", ret); + abort(); + } + + // Block all signals to the exception handler thread + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + // Start a thread to handle exceptions. + uintptr_t port_set = (uintptr_t)mach_exception_handler_port_set; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + ret = _cgo_try_pthread_create(&thr, &attr, mach_exception_handler, (void*)port_set); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (ret) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %d\n", ret); + abort(); + } + pthread_attr_destroy(&attr); +} diff --git a/src/runtime/cgo/signal_darwin_arm64.go b/src/runtime/cgo/signal_darwin_arm64.go deleted file mode 100644 index 3425c448c4..0000000000 --- a/src/runtime/cgo/signal_darwin_arm64.go +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package cgo - -import _ "unsafe" - -//go:cgo_export_static xx_cgo_panicmem xx_cgo_panicmem -func xx_cgo_panicmem() diff --git a/src/runtime/cgo/signal_darwin_arm64.s b/src/runtime/cgo/signal_darwin_arm64.s deleted file mode 100644 index 1ae00d13f3..0000000000 --- a/src/runtime/cgo/signal_darwin_arm64.s +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// xx_cgo_panicmem is the entrypoint for SIGSEGV as intercepted via a -// mach thread port as EXC_BAD_ACCESS. As the segfault may have happened -// in C code, we first need to load_g then call xx_cgo_panicmem. -// -// R1 - LR at moment of fault -// R2 - PC at moment of fault -TEXT xx_cgo_panicmem(SB),NOSPLIT|NOFRAME,$0 - // If in external C code, we need to load the g register. - BL runtime·load_g(SB) - CMP $0, g - BNE ongothread - - // On a foreign thread. - // TODO(crawshaw): call badsignal - MOVD.W $0, -16(RSP) - MOVW $139, R1 - MOVW R1, 8(RSP) - B runtime·exit(SB) - -ongothread: - // Trigger a SIGSEGV panic. - // - // The goal is to arrange the stack so it looks like the runtime - // function sigpanic was called from the PC that faulted. It has - // to be sigpanic, as the stack unwinding code in traceback.go - // looks explicitly for it. - // - // To do this we call into runtime·setsigsegv, which sets the - // appropriate state inside the g object. We give it the faulting - // PC on the stack, then put it in the LR before calling sigpanic. - - // Build a 32-byte stack frame for us for this call. - // Saved LR (none available) is at the bottom, - // then the PC argument for setsigsegv, - // then a copy of the LR for us to restore. - MOVD.W $0, -32(RSP) - MOVD R1, 8(RSP) - MOVD R2, 16(RSP) - BL runtime·setsigsegv(SB) - MOVD 8(RSP), R1 - MOVD 16(RSP), R2 - - // Build a 16-byte stack frame for the simulated - // call to sigpanic, by taking 16 bytes away from the - // 32-byte stack frame above. - // The saved LR in this frame is the LR at time of fault, - // and the LR on entry to sigpanic is the PC at time of fault. 
- MOVD.W R1, 16(RSP) - MOVD R2, R30 - B runtime·sigpanic(SB) diff --git a/src/runtime/cgo/signal_ios_arm64.go b/src/runtime/cgo/signal_ios_arm64.go new file mode 100644 index 0000000000..3425c448c4 --- /dev/null +++ b/src/runtime/cgo/signal_ios_arm64.go @@ -0,0 +1,10 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cgo + +import _ "unsafe" + +//go:cgo_export_static xx_cgo_panicmem xx_cgo_panicmem +func xx_cgo_panicmem() diff --git a/src/runtime/cgo/signal_ios_arm64.s b/src/runtime/cgo/signal_ios_arm64.s new file mode 100644 index 0000000000..1ae00d13f3 --- /dev/null +++ b/src/runtime/cgo/signal_ios_arm64.s @@ -0,0 +1,56 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// xx_cgo_panicmem is the entrypoint for SIGSEGV as intercepted via a +// mach thread port as EXC_BAD_ACCESS. As the segfault may have happened +// in C code, we first need to load_g then call xx_cgo_panicmem. +// +// R1 - LR at moment of fault +// R2 - PC at moment of fault +TEXT xx_cgo_panicmem(SB),NOSPLIT|NOFRAME,$0 + // If in external C code, we need to load the g register. + BL runtime·load_g(SB) + CMP $0, g + BNE ongothread + + // On a foreign thread. + // TODO(crawshaw): call badsignal + MOVD.W $0, -16(RSP) + MOVW $139, R1 + MOVW R1, 8(RSP) + B runtime·exit(SB) + +ongothread: + // Trigger a SIGSEGV panic. + // + // The goal is to arrange the stack so it looks like the runtime + // function sigpanic was called from the PC that faulted. It has + // to be sigpanic, as the stack unwinding code in traceback.go + // looks explicitly for it. + // + // To do this we call into runtime·setsigsegv, which sets the + // appropriate state inside the g object. We give it the faulting + // PC on the stack, then put it in the LR before calling sigpanic. + + // Build a 32-byte stack frame for us for this call. + // Saved LR (none available) is at the bottom, + // then the PC argument for setsigsegv, + // then a copy of the LR for us to restore. + MOVD.W $0, -32(RSP) + MOVD R1, 8(RSP) + MOVD R2, 16(RSP) + BL runtime·setsigsegv(SB) + MOVD 8(RSP), R1 + MOVD 16(RSP), R2 + + // Build a 16-byte stack frame for the simulated + // call to sigpanic, by taking 16 bytes away from the + // 32-byte stack frame above. + // The saved LR in this frame is the LR at time of fault, + // and the LR on entry to sigpanic is the PC at time of fault. + MOVD.W R1, 16(RSP) + MOVD R2, R30 + B runtime·sigpanic(SB) -- cgit v1.2.1 From 67edc0ed81947a55adbcd0c9d2317abb93ac9510 Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Tue, 6 Oct 2020 22:07:15 -0400 Subject: runtime: restore SSE guard in asyncPreempt on 386 So we don't use SSE instructions under GO386=softfloat. 
Change-Id: I8ecc92340ee567f84a22501df2543ec041d25ef2 Reviewed-on: https://go-review.googlesource.com/c/go/+/260137 Trust: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Go Bot Reviewed-by: Keith Randall --- src/runtime/mkpreempt.go | 28 ++++++++++++++++++---------- src/runtime/preempt_386.s | 6 ++++++ 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 40683bb9d9..76237bc31b 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -189,26 +189,34 @@ func (l *layout) restore() { func gen386() { p("PUSHFL") - - // Assign stack offsets. + // Save general purpose registers. var l = layout{sp: "SP"} for _, reg := range regNames386 { - if reg == "SP" { + if reg == "SP" || strings.HasPrefix(reg, "X") { continue } - if strings.HasPrefix(reg, "X") { - l.add("MOVUPS", reg, 16) - } else { - l.add("MOVL", reg, 4) - } + l.add("MOVL", reg, 4) } - p("ADJSP $%d", l.stack) + // Save SSE state only if supported. + lSSE := layout{stack: l.stack, sp: "SP"} + for i := 0; i < 8; i++ { + lSSE.add("MOVUPS", fmt.Sprintf("X%d", i), 16) + } + + p("ADJSP $%d", lSSE.stack) p("NOP SP") l.save() + p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse") + lSSE.save() + label("nosse:") p("CALL ·asyncPreempt2(SB)") + p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse2") + lSSE.restore() + label("nosse2:") l.restore() - p("ADJSP $%d", -l.stack) + p("ADJSP $%d", -lSSE.stack) + p("POPFL") p("RET") } diff --git a/src/runtime/preempt_386.s b/src/runtime/preempt_386.s index 5c9b8ea224..c3a5fa1f36 100644 --- a/src/runtime/preempt_386.s +++ b/src/runtime/preempt_386.s @@ -14,6 +14,8 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVL BP, 16(SP) MOVL SI, 20(SP) MOVL DI, 24(SP) + CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 + JNE nosse MOVUPS X0, 28(SP) MOVUPS X1, 44(SP) MOVUPS X2, 60(SP) @@ -22,7 +24,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVUPS X5, 108(SP) MOVUPS X6, 124(SP) MOVUPS X7, 140(SP) +nosse: CALL ·asyncPreempt2(SB) + CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 + JNE nosse2 MOVUPS 140(SP), X7 MOVUPS 124(SP), X6 MOVUPS 108(SP), X5 @@ -31,6 +36,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVUPS 60(SP), X2 MOVUPS 44(SP), X1 MOVUPS 28(SP), X0 +nosse2: MOVL 24(SP), DI MOVL 20(SP), SI MOVL 16(SP), BP -- cgit v1.2.1 From ade5161f51f2b7239705047875dc36c35139b253 Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Tue, 6 Oct 2020 21:13:16 -0400 Subject: crypto/x509: use macOS/AMD64 implementation on macOS/ARM64 Updates #38485. Change-Id: I0582a53171ce803ca1b0237cfa9bc022fc1da6f9 Reviewed-on: https://go-review.googlesource.com/c/go/+/260340 Trust: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Go Bot Reviewed-by: Ian Lance Taylor --- src/runtime/sys_darwin_arm64.s | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s index 427cb17781..f8d6f28dc7 100644 --- a/src/runtime/sys_darwin_arm64.s +++ b/src/runtime/sys_darwin_arm64.s @@ -707,3 +707,23 @@ TEXT runtime·syscall6X(SB),NOSPLIT,$0 MOVD R0, 72(R2) // save err ok: RET + +// syscallNoErr is like syscall6 but does not check for errors, and +// only returns one value, for use with standard C ABI library functions. 
+TEXT runtime·syscallNoErr(SB),NOSPLIT,$0 + SUB $16, RSP // push structure pointer + MOVD R0, (RSP) + + MOVD 0(R0), R12 // fn + MOVD 16(R0), R1 // a2 + MOVD 24(R0), R2 // a3 + MOVD 32(R0), R3 // a4 + MOVD 40(R0), R4 // a5 + MOVD 48(R0), R5 // a6 + MOVD 8(R0), R0 // a1 + BL (R12) + + MOVD (RSP), R2 // pop structure pointer + ADD $16, RSP + MOVD R0, 56(R2) // save r1 + RET -- cgit v1.2.1 From f8df205e74d5122c43f41923280451641e566ee2 Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Wed, 7 Oct 2020 18:29:51 -0400 Subject: all: enable more tests on macOS/ARM64 On macOS, we can do "go build", can exec, and have the source tree available, so we can enable more tests. Skip ones that don't work. Most of them are due to that it requires external linking (for now) and some tests don't work with external linking (e.g. runtime deadlock detection). For them, helper functions CanInternalLink/MustInternalLink are introduced. I still want to have internal linking implemented, but it is still a good idea to identify which tests don't work with external linking. Updates #38485. Change-Id: I6b14697573cf3f371daf54b9ddd792acf232f2f2 Reviewed-on: https://go-review.googlesource.com/c/go/+/260719 Trust: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Go Bot Reviewed-by: Brad Fitzpatrick Reviewed-by: Than McIntosh --- src/runtime/crash_test.go | 21 +++++++++++++++++++++ src/runtime/time_test.go | 4 ++++ 2 files changed, 25 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index eae4f538c1..5e22b7593e 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -181,6 +181,9 @@ func TestCrashHandler(t *testing.T) { } func testDeadlock(t *testing.T, name string) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", name) want := "fatal error: all goroutines are asleep - deadlock!\n" if !strings.HasPrefix(output, want) { @@ -205,6 +208,9 @@ func TestLockedDeadlock2(t *testing.T) { } func TestGoexitDeadlock(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", "GoexitDeadlock") want := "no goroutines (main called runtime.Goexit) - deadlock!" if !strings.Contains(output, want) { @@ -290,6 +296,9 @@ func TestRecursivePanic4(t *testing.T) { } func TestGoexitCrash(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", "GoexitExit") want := "no goroutines (main called runtime.Goexit) - deadlock!" if !strings.Contains(output, want) { @@ -348,6 +357,9 @@ func TestBreakpoint(t *testing.T) { } func TestGoexitInPanic(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + // see issue 8774: this code used to trigger an infinite recursion output := runTestProg(t, "testprog", "GoexitInPanic") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" @@ -412,6 +424,9 @@ func TestPanicAfterGoexit(t *testing.T) { } func TestRecoveredPanicAfterGoexit(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", "RecoveredPanicAfterGoexit") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" 
if !strings.HasPrefix(output, want) { @@ -420,6 +435,9 @@ func TestRecoveredPanicAfterGoexit(t *testing.T) { } func TestRecoverBeforePanicAfterGoexit(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + t.Parallel() output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" @@ -429,6 +447,9 @@ func TestRecoverBeforePanicAfterGoexit(t *testing.T) { } func TestRecoverBeforePanicAfterGoexit2(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + t.Parallel() output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit2") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" diff --git a/src/runtime/time_test.go b/src/runtime/time_test.go index a8dab7db8e..afd9af2af4 100644 --- a/src/runtime/time_test.go +++ b/src/runtime/time_test.go @@ -20,6 +20,10 @@ func TestFakeTime(t *testing.T) { t.Skip("faketime not supported on windows") } + // Faketime is advanced in checkdead. External linking brings in cgo, + // causing checkdead not working. + testenv.MustInternalLink(t) + t.Parallel() exe, err := buildTestProg(t, "testfaketime", "-tags=faketime") -- cgit v1.2.1 From 8f26b57f9afc238bdecb9b7030bc2f4364093885 Mon Sep 17 00:00:00 2001 From: Cuong Manh Le Date: Sat, 3 Oct 2020 01:23:47 +0700 Subject: cmd/compile: split exported/non-exported methods for interface type Currently, mhdr/methods is emitted with the same len/cap. There's no way to distinguish between exported and non-exported methods statically. This CL splits mhdr/methods into two parts, use "len" for number of exported methods, and "cap" for all methods. This fixes the bug in issue #22075, which intends to return the number of exported methods but currently return all methods. Note that with this encoding, we still can access either all/exported-only/non-exported-only methods: mhdr[:cap(mhdr)] // all methods mhdr // exported methods mhdr[len(mhdr):cap(mhdr)] // non-exported methods Thank to Matthew Dempsky (@mdempsky) for suggesting this encoding. 
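To make the encoding concrete, here is a minimal, self-contained sketch of the
same len/cap trick using an ordinary slice. The element type and method names
below are invented purely for illustration; they do not reflect the runtime's
actual imethod layout.

    package main

    import "fmt"

    type method struct{ name string }

    func main() {
        // Backing array holds all methods, exported ones first.
        all := []method{{"Read"}, {"Write"}, {"close"}, {"reset"}}
        numExported := 2

        // The encoding: len = exported methods, cap = all methods.
        mhdr := all[:numExported:len(all)]

        fmt.Println(len(mhdr), "exported,", cap(mhdr), "total")
        fmt.Println("exported:    ", mhdr)                      // mhdr
        fmt.Println("all:         ", mhdr[:cap(mhdr)])          // mhdr[:cap(mhdr)]
        fmt.Println("non-exported:", mhdr[len(mhdr):cap(mhdr)]) // mhdr[len(mhdr):cap(mhdr)]
    }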
Fixes #22075 Change-Id: If662adb03ccff27407d55a5578a0ed05a15e7cdd Reviewed-on: https://go-review.googlesource.com/c/go/+/259237 Trust: Cuong Manh Le Run-TryBot: Cuong Manh Le TryBot-Result: Go Bot Reviewed-by: Cherry Zhang Reviewed-by: Matthew Dempsky --- src/runtime/alg.go | 2 +- src/runtime/iface.go | 12 +++++++----- src/runtime/mfinal.go | 4 ++-- src/runtime/type.go | 26 ++++++++++++++++++++------ 4 files changed, 30 insertions(+), 14 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/alg.go b/src/runtime/alg.go index 0af48ab25c..4a98b84e4a 100644 --- a/src/runtime/alg.go +++ b/src/runtime/alg.go @@ -185,7 +185,7 @@ func typehash(t *_type, p unsafe.Pointer, h uintptr) uintptr { return strhash(p, h) case kindInterface: i := (*interfacetype)(unsafe.Pointer(t)) - if len(i.mhdr) == 0 { + if i.isEmpty() { return nilinterhash(p, h) } return interhash(p, h) diff --git a/src/runtime/iface.go b/src/runtime/iface.go index 0504b89363..f8b7d429a3 100644 --- a/src/runtime/iface.go +++ b/src/runtime/iface.go @@ -31,16 +31,17 @@ func itabHashFunc(inter *interfacetype, typ *_type) uintptr { } func getitab(inter *interfacetype, typ *_type, canfail bool) *itab { - if len(inter.mhdr) == 0 { + if inter.isEmpty() { throw("internal error - misuse of itab") } + imethods := inter.methods() // easy case if typ.tflag&tflagUncommon == 0 { if canfail { return nil } - name := inter.typ.nameOff(inter.mhdr[0].name) + name := inter.typ.nameOff(imethods[0].name) panic(&TypeAssertionError{nil, typ, &inter.typ, name.name()}) } @@ -63,7 +64,7 @@ func getitab(inter *interfacetype, typ *_type, canfail bool) *itab { } // Entry doesn't exist yet. Make a new entry & add it. - m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr)-1)*sys.PtrSize, 0, &memstats.other_sys)) + m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(imethods)-1)*sys.PtrSize, 0, &memstats.other_sys)) m.inter = inter m._type = typ // The hash is used in type switches. However, compiler statically generates itab's @@ -197,7 +198,8 @@ func (m *itab) init() string { // and interface names are unique, // so can iterate over both in lock step; // the loop is O(ni+nt) not O(ni*nt). 
- ni := len(inter.mhdr) + imethods := inter.methods() + ni := len(imethods) nt := int(x.mcount) xmhdr := (*[1 << 16]method)(add(unsafe.Pointer(x), uintptr(x.moff)))[:nt:nt] j := 0 @@ -205,7 +207,7 @@ func (m *itab) init() string { var fun0 unsafe.Pointer imethods: for k := 0; k < ni; k++ { - i := &inter.mhdr[k] + i := &imethods[k] itype := inter.typ.typeOff(i.ityp) name := inter.typ.nameOff(i.name) iname := name.name() diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index cd6196dcab..6676ae6736 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -210,7 +210,7 @@ func runfinq() { // set up with empty interface (*eface)(frame)._type = &f.ot.typ (*eface)(frame).data = f.arg - if len(ityp.mhdr) != 0 { + if !ityp.isEmpty() { // convert to interface with methods // this conversion is guaranteed to succeed - we checked in SetFinalizer *(*iface)(frame) = assertE2I(ityp, *(*eface)(frame)) @@ -394,7 +394,7 @@ func SetFinalizer(obj interface{}, finalizer interface{}) { } case fint.kind&kindMask == kindInterface: ityp := (*interfacetype)(unsafe.Pointer(fint)) - if len(ityp.mhdr) == 0 { + if ityp.isEmpty() { // ok - satisfies empty interface goto okarg } diff --git a/src/runtime/type.go b/src/runtime/type.go index 81455f3532..36492619e1 100644 --- a/src/runtime/type.go +++ b/src/runtime/type.go @@ -366,7 +366,19 @@ type imethod struct { type interfacetype struct { typ _type pkgpath name - mhdr []imethod + // expMethods contains all interface methods. + // + // - len(expMethods) returns number of exported methods. + // - cap(expMethods) returns all interface methods, including both exported/non-exported methods. + expMethods []imethod +} + +func (it *interfacetype) methods() []imethod { + return it.expMethods[:cap(it.expMethods)] +} + +func (it *interfacetype) isEmpty() bool { + return cap(it.expMethods) == 0 } type maptype struct { @@ -664,13 +676,15 @@ func typesEqual(t, v *_type, seen map[_typePair]struct{}) bool { if it.pkgpath.name() != iv.pkgpath.name() { return false } - if len(it.mhdr) != len(iv.mhdr) { + itmethods := it.methods() + ivmethods := iv.methods() + if len(itmethods) != len(ivmethods) { return false } - for i := range it.mhdr { - tm := &it.mhdr[i] - vm := &iv.mhdr[i] - // Note the mhdr array can be relocated from + for i := range itmethods { + tm := &itmethods[i] + vm := &ivmethods[i] + // Note the expMethods array can be relocated from // another module. See #17724. tname := resolveNameOff(unsafe.Pointer(tm), tm.name) vname := resolveNameOff(unsafe.Pointer(vm), vm.name) -- cgit v1.2.1 From 39b527691495902279da7ac8405a070ded7dd4a2 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 7 Jul 2020 09:07:16 -0400 Subject: net: remove dependency on math/rand Like we did for sync, let the runtime give net random numbers, to avoid forcing an import of math/rand for DNS. 
Change-Id: Iab3e64121d687d288a3961a8ccbcebe589047253 Reviewed-on: https://go-review.googlesource.com/c/go/+/241258 Trust: Russ Cox Run-TryBot: Russ Cox TryBot-Result: Go Bot Reviewed-by: Ian Lance Taylor --- src/runtime/stubs.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index bd2514e862..6290142a41 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -130,6 +130,9 @@ func fastrandn(n uint32) uint32 { //go:linkname sync_fastrand sync.fastrand func sync_fastrand() uint32 { return fastrand() } +//go:linkname net_fastrand net.fastrand +func net_fastrand() uint32 { return fastrand() } + // in internal/bytealg/equal_*.s //go:noescape func memequal(a, b unsafe.Pointer, size uintptr) bool -- cgit v1.2.1 From e08059f4fcacce2ff18d1cfc5fa48c942f8d46aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=B6hrmann?= Date: Tue, 13 Oct 2020 08:26:36 +0200 Subject: runtime: remove unused alg constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CL 191198 removed algarray from the runtime which used these constants as indices. Change-Id: Ia669cf410372ef5113dadccd115a39ff8d47e5c8 Reviewed-on: https://go-review.googlesource.com/c/go/+/261364 Reviewed-by: Keith Randall Run-TryBot: Martin Möhrmann TryBot-Result: Go Bot Trust: Emmanuel Odeke Trust: Martin Möhrmann --- src/runtime/alg.go | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/alg.go b/src/runtime/alg.go index 4a98b84e4a..2ec3fc3658 100644 --- a/src/runtime/alg.go +++ b/src/runtime/alg.go @@ -15,25 +15,6 @@ const ( c1 = uintptr((8-sys.PtrSize)/4*3267000013 + (sys.PtrSize-4)/4*23344194077549503) ) -// type algorithms - known to compiler -const ( - alg_NOEQ = iota - alg_MEM0 - alg_MEM8 - alg_MEM16 - alg_MEM32 - alg_MEM64 - alg_MEM128 - alg_STRING - alg_INTER - alg_NILINTER - alg_FLOAT32 - alg_FLOAT64 - alg_CPLX64 - alg_CPLX128 - alg_max -) - func memhash0(p unsafe.Pointer, h uintptr) uintptr { return h } -- cgit v1.2.1 From 7c58ef732efd9bf0d0882bb95371ce1909924a75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=B6hrmann?= Date: Mon, 14 Sep 2020 16:55:34 +0200 Subject: runtime: implement GODEBUG=inittrace=1 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting inittrace=1 causes the runtime to emit a single line to standard error for each package with init work, summarizing the execution time and memory allocation. The emitted debug information for init functions can be used to find bottlenecks or regressions in Go startup performance. Packages with no init function work (user defined or compiler generated) are omitted. Tracing plugin inits is not supported as they can execute concurrently. This would make the implementation of tracing more complex while adding support for a very rare use case. Plugin inits can be traced separately by testing a main package importing the plugins package imports explicitly. $ GODEBUG=inittrace=1 go test init internal/bytealg @0.008 ms, 0 ms clock, 0 bytes, 0 allocs init runtime @0.059 ms, 0.026 ms clock, 0 bytes, 0 allocs init math @0.19 ms, 0.001 ms clock, 0 bytes, 0 allocs init errors @0.22 ms, 0.004 ms clock, 0 bytes, 0 allocs init strconv @0.24 ms, 0.002 ms clock, 32 bytes, 2 allocs init sync @0.28 ms, 0.003 ms clock, 16 bytes, 1 allocs init unicode @0.44 ms, 0.11 ms clock, 23328 bytes, 24 allocs ... 
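For anyone who wants to try the flag, a minimal sketch follows. The program and
the imported package are arbitrary choices; any package with init work in its
dependency graph will produce trace lines.

    // inittrace_demo.go
    package main

    // Blank-import something with non-trivial init work among its
    // dependencies; net/http is just a convenient example.
    import _ "net/http"

    func main() {}

Build it and run it with the variable set:

    go build -o demo inittrace_demo.go
    GODEBUG=inittrace=1 ./demo

Each package with init work then prints one line to standard error in the
format shown above.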
Inspired by stapelberg@google.com who instrumented doInit in a prototype to measure init times with GDB. Fixes #41378 Change-Id: Ic37c6a0cfc95488de9e737f5e346b8dbb39174e1 Reviewed-on: https://go-review.googlesource.com/c/go/+/254659 Trust: Martin Möhrmann Run-TryBot: Martin Möhrmann TryBot-Result: Go Bot Reviewed-by: Keith Randall --- src/runtime/extern.go | 13 ++++++++++ src/runtime/malloc.go | 56 ++++++++++++++++++++++++--------------- src/runtime/proc.go | 69 ++++++++++++++++++++++++++++++++++++++++++++----- src/runtime/runtime1.go | 13 ++++++++-- src/runtime/symtab.go | 16 ++++++++++++ 5 files changed, 138 insertions(+), 29 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/extern.go b/src/runtime/extern.go index 7316503ed2..b75507b8f8 100644 --- a/src/runtime/extern.go +++ b/src/runtime/extern.go @@ -78,6 +78,19 @@ It is a comma-separated list of name=val pairs setting these named variables: If the line ends with "(forced)", this GC was forced by a runtime.GC() call. + inittrace: setting inittrace=1 causes the runtime to emit a single line to standard + error for each package with init work, summarizing the execution time and memory + allocation. No information is printed for inits executed as part of plugin loading + and for packages without both user defined and compiler generated init work. + The format of this line is subject to change. Currently, it is: + init # @#ms, # ms clock, # bytes, # allocs + where the fields are as follows: + init # the package name + @# ms time in milliseconds when the init started since program start + # clock wall-clock time for package initialization work + # bytes memory allocated on the heap + # allocs number of heap allocations + madvdontneed: setting madvdontneed=1 will use MADV_DONTNEED instead of MADV_FREE on Linux when returning memory to the kernel. This is less efficient, but causes RSS numbers to drop diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index f7e9b7c4b4..b19d1f2671 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -909,27 +909,34 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { return unsafe.Pointer(&zerobase) } - if debug.sbrk != 0 { - align := uintptr(16) - if typ != nil { - // TODO(austin): This should be just - // align = uintptr(typ.align) - // but that's only 4 on 32-bit platforms, - // even if there's a uint64 field in typ (see #599). - // This causes 64-bit atomic accesses to panic. - // Hence, we use stricter alignment that matches - // the normal allocator better. - if size&7 == 0 { - align = 8 - } else if size&3 == 0 { - align = 4 - } else if size&1 == 0 { - align = 2 - } else { - align = 1 + if debug.malloc { + if debug.sbrk != 0 { + align := uintptr(16) + if typ != nil { + // TODO(austin): This should be just + // align = uintptr(typ.align) + // but that's only 4 on 32-bit platforms, + // even if there's a uint64 field in typ (see #599). + // This causes 64-bit atomic accesses to panic. + // Hence, we use stricter alignment that matches + // the normal allocator better. + if size&7 == 0 { + align = 8 + } else if size&3 == 0 { + align = 4 + } else if size&1 == 0 { + align = 2 + } else { + align = 1 + } } + return persistentalloc(size, align, &memstats.other_sys) + } + + if inittrace.active && inittrace.id == getg().goid { + // Init functions are executed sequentially in a single Go routine. 
+ inittrace.allocs += 1 } - return persistentalloc(size, align, &memstats.other_sys) } // assistG is the G to charge for this allocation, or nil if @@ -1136,8 +1143,15 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { mp.mallocing = 0 releasem(mp) - if debug.allocfreetrace != 0 { - tracealloc(x, size, typ) + if debug.malloc { + if debug.allocfreetrace != 0 { + tracealloc(x, size, typ) + } + + if inittrace.active && inittrace.id == getg().goid { + // Init functions are executed sequentially in a single Go routine. + inittrace.bytes += uint64(size) + } } if rate := MemProfileRate; rate > 0 { diff --git a/src/runtime/proc.go b/src/runtime/proc.go index a1e2ed0680..4872480314 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -154,11 +154,20 @@ func main() { throw("runtime.main not on m0") } - doInit(&runtime_inittask) // must be before defer - if nanotime() == 0 { + // Record when the world started. + // Must be before doInit for tracing init. + runtimeInitTime = nanotime() + if runtimeInitTime == 0 { throw("nanotime returning zero") } + if debug.inittrace != 0 { + inittrace.id = getg().goid + inittrace.active = true + } + + doInit(&runtime_inittask) // Must be before defer. + // Defer unlock so that runtime.Goexit during init does the unlock too. needUnlock := true defer func() { @@ -167,9 +176,6 @@ func main() { } }() - // Record when the world started. - runtimeInitTime = nanotime() - gcenable() main_init_done = make(chan bool) @@ -196,6 +202,10 @@ func main() { doInit(&main_inittask) + // Disable init tracing after main init done to avoid overhead + // of collecting statistics in malloc and newproc + inittrace.active = false + close(main_init_done) needUnlock = false @@ -5665,6 +5675,17 @@ type initTask struct { // followed by nfns pcs, one per init function to run } +// inittrace stores statistics for init functions which are +// updated by malloc and newproc when active is true. +var inittrace tracestat + +type tracestat struct { + active bool // init tracing activation status + id int64 // init go routine id + allocs uint64 // heap allocations + bytes uint64 // heap allocated bytes +} + func doInit(t *initTask) { switch t.state { case 2: // fully initialized @@ -5673,16 +5694,52 @@ func doInit(t *initTask) { throw("recursive call during initialization - linker skew") default: // not initialized yet t.state = 1 // initialization in progress + for i := uintptr(0); i < t.ndeps; i++ { p := add(unsafe.Pointer(t), (3+i)*sys.PtrSize) t2 := *(**initTask)(p) doInit(t2) } + + if t.nfns == 0 { + t.state = 2 // initialization done + return + } + + var ( + start int64 + before tracestat + ) + + if inittrace.active { + start = nanotime() + // Load stats non-atomically since tracinit is updated only by this init go routine. + before = inittrace + } + + firstFunc := add(unsafe.Pointer(t), (3+t.ndeps)*sys.PtrSize) for i := uintptr(0); i < t.nfns; i++ { - p := add(unsafe.Pointer(t), (3+t.ndeps+i)*sys.PtrSize) + p := add(firstFunc, i*sys.PtrSize) f := *(*func())(unsafe.Pointer(&p)) f() } + + if inittrace.active { + end := nanotime() + // Load stats non-atomically since tracinit is updated only by this init go routine. 
+ after := inittrace + + pkg := funcpkgpath(findfunc(funcPC(firstFunc))) + + var sbuf [24]byte + print("init ", pkg, " @") + print(string(fmtNSAsMS(sbuf[:], uint64(start-runtimeInitTime))), " ms, ") + print(string(fmtNSAsMS(sbuf[:], uint64(end-start))), " ms clock, ") + print(string(itoa(sbuf[:], after.bytes-before.bytes)), " bytes, ") + print(string(itoa(sbuf[:], after.allocs-before.allocs)), " allocs") + print("\n") + } + t.state = 2 // initialization done } } diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go index 7c893aa25c..0f182ac58e 100644 --- a/src/runtime/runtime1.go +++ b/src/runtime/runtime1.go @@ -300,7 +300,6 @@ type dbgVar struct { // existing int var for that value, which may // already have an initial value. var debug struct { - allocfreetrace int32 cgocheck int32 clobberfree int32 efence int32 @@ -311,13 +310,20 @@ var debug struct { gctrace int32 invalidptr int32 madvdontneed int32 // for Linux; issue 28466 - sbrk int32 scavenge int32 scavtrace int32 scheddetail int32 schedtrace int32 tracebackancestors int32 asyncpreemptoff int32 + + // debug.malloc is used as a combined debug check + // in the malloc function and should be set + // if any of the below debug options is != 0. + malloc bool + allocfreetrace int32 + inittrace int32 + sbrk int32 } var dbgvars = []dbgVar{ @@ -339,6 +345,7 @@ var dbgvars = []dbgVar{ {"schedtrace", &debug.schedtrace}, {"tracebackancestors", &debug.tracebackancestors}, {"asyncpreemptoff", &debug.asyncpreemptoff}, + {"inittrace", &debug.inittrace}, } func parsedebugvars() { @@ -378,6 +385,8 @@ func parsedebugvars() { } } + debug.malloc = (debug.allocfreetrace | debug.inittrace | debug.sbrk) != 0 + setTraceback(gogetenv("GOTRACEBACK")) traceback_env = traceback_cache } diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go index a14f5c13d9..84637376bf 100644 --- a/src/runtime/symtab.go +++ b/src/runtime/symtab.go @@ -844,6 +844,22 @@ func funcname(f funcInfo) string { return gostringnocopy(cfuncname(f)) } +func funcpkgpath(f funcInfo) string { + name := funcname(f) + i := len(name) - 1 + for ; i > 0; i-- { + if name[i] == '/' { + break + } + } + for ; i < len(name); i++ { + if name[i] == '.' { + break + } + } + return name[:i] +} + func cfuncnameFromNameoff(f funcInfo, nameoff int32) *byte { if !f.valid() { return nil -- cgit v1.2.1 From e4ec30965b9ca629922e83b8d335224ae4bdf062 Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Mon, 12 Oct 2020 13:44:21 -0400 Subject: cmd/link: support internal linking on darwin/arm64 Add support of internal linking on darwin/arm64 (macOS). Still incomplete. Pure Go binaries work. Cgo doesn't. TLS is not set up when cgo is not used (as before) (so asynchronous preemption is not enabled). Internal linking is not enabled by default but can be requested via -ldflags=-linkmode=internal. Updates #38485. Change-Id: I1e0c81b6028edcb1ac26dcdafeb9bb3f788cf732 Reviewed-on: https://go-review.googlesource.com/c/go/+/261643 Trust: Cherry Zhang Reviewed-by: Than McIntosh --- src/runtime/rt0_darwin_arm64.s | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/rt0_darwin_arm64.s b/src/runtime/rt0_darwin_arm64.s index e3972f4924..0040361215 100644 --- a/src/runtime/rt0_darwin_arm64.s +++ b/src/runtime/rt0_darwin_arm64.s @@ -4,11 +4,14 @@ #include "textflag.h" -// No need for _rt0_arm64_darwin as darwin/arm64 only -// supports external linking. 
TEXT _rt0_arm64_darwin(SB),NOSPLIT|NOFRAME,$0 - MOVD $42, R0 - BL libc_exit(SB) + MOVD $runtime·rt0_go(SB), R2 + BL (R2) +exit: + MOVD $0, R0 + MOVD $1, R16 // sys_exit + SVC $0x80 + B exit // When linking with -buildmode=c-archive or -buildmode=c-shared, // this symbol is called from a global initialization function. @@ -86,11 +89,6 @@ GLOBL _rt0_arm64_darwin_lib_argc<>(SB),NOPTR, $8 DATA _rt0_arm64_darwin_lib_argv<>(SB)/8, $0 GLOBL _rt0_arm64_darwin_lib_argv<>(SB),NOPTR, $8 +// external linking entry point. TEXT main(SB),NOSPLIT|NOFRAME,$0 - MOVD $runtime·rt0_go(SB), R2 - BL (R2) -exit: - MOVD $0, R0 - MOVD $1, R16 // sys_exit - SVC $0x80 - B exit + JMP _rt0_arm64_darwin(SB) -- cgit v1.2.1 From 2517f4946b42b8deedb864c884f1b41311d45850 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 14 Oct 2020 17:18:27 -0400 Subject: runtime: remove debugCachedWork debugCachedWork and all of its dependent fields and code were added to aid in debugging issue #27993. Now that the source of the problem is known and mitigated (via the extra work check after STW in gcMarkDone), these extra checks are no longer required and simply make the code more difficult to follow. Remove it all. Updates #27993 Change-Id: I594beedd5ca61733ba9cc9eaad8f80ea92df1a0d Reviewed-on: https://go-review.googlesource.com/c/go/+/262350 Trust: Michael Pratt Run-TryBot: Michael Pratt TryBot-Result: Go Bot Reviewed-by: Austin Clements --- src/runtime/mgc.go | 121 ++++++++++--------------------------------------- src/runtime/mgcwork.go | 74 ------------------------------ src/runtime/mwbbuf.go | 32 +------------ src/runtime/panic.go | 9 ---- 4 files changed, 26 insertions(+), 210 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index bd87144355..0a4d5616a5 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -1407,19 +1407,6 @@ func gcStart(trigger gcTrigger) { // This is protected by markDoneSema. var gcMarkDoneFlushed uint32 -// debugCachedWork enables extra checks for debugging premature mark -// termination. -// -// For debugging issue #27993. -const debugCachedWork = false - -// gcWorkPauseGen is for debugging the mark completion algorithm. -// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen. -// Only used if debugCachedWork is true. -// -// For debugging issue #27993. -var gcWorkPauseGen uint32 = 1 - // gcMarkDone transitions the GC from mark to mark termination if all // reachable objects have been marked (that is, there are no grey // objects and can be no more in the future). Otherwise, it flushes @@ -1475,15 +1462,7 @@ top: // Flush the write barrier buffer, since this may add // work to the gcWork. wbBufFlush1(_p_) - // For debugging, shrink the write barrier - // buffer so it flushes immediately. - // wbBuf.reset will keep it at this size as - // long as throwOnGCWork is set. - if debugCachedWork { - b := &_p_.wbBuf - b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) - b.debugGen = gcWorkPauseGen - } + // Flush the gcWork, since this may create global work // and set the flushedWork flag. // @@ -1494,29 +1473,12 @@ top: if _p_.gcw.flushedWork { atomic.Xadd(&gcMarkDoneFlushed, 1) _p_.gcw.flushedWork = false - } else if debugCachedWork { - // For debugging, freeze the gcWork - // until we know whether we've reached - // completion or not. If we think - // we've reached completion, but - // there's a paused gcWork, then - // that's a bug. - _p_.gcw.pauseGen = gcWorkPauseGen - // Capture the G's stack. 
- for i := range _p_.gcw.pauseStack { - _p_.gcw.pauseStack[i] = 0 - } - callers(1, _p_.gcw.pauseStack[:]) } }) casgstatus(gp, _Gwaiting, _Grunning) }) if gcMarkDoneFlushed != 0 { - if debugCachedWork { - // Release paused gcWorks. - atomic.Xadd(&gcWorkPauseGen, 1) - } // More grey objects were discovered since the // previous termination check, so there may be more // work to do. Keep going. It's possible the @@ -1526,13 +1488,6 @@ top: goto top } - if debugCachedWork { - throwOnGCWork = true - // Release paused gcWorks. If there are any, they - // should now observe throwOnGCWork and panic. - atomic.Xadd(&gcWorkPauseGen, 1) - } - // There was no global work, no local work, and no Ps // communicated work since we took markDoneSema. Therefore // there are no grey objects and no more objects can be @@ -1549,59 +1504,33 @@ top: // below. The important thing is that the wb remains active until // all marking is complete. This includes writes made by the GC. - if debugCachedWork { - // For debugging, double check that no work was added after we - // went around above and disable write barrier buffering. + // There is sometimes work left over when we enter mark termination due + // to write barriers performed after the completion barrier above. + // Detect this and resume concurrent mark. This is obviously + // unfortunate. + // + // See issue #27993 for details. + // + // Switch to the system stack to call wbBufFlush1, though in this case + // it doesn't matter because we're non-preemptible anyway. + restart := false + systemstack(func() { for _, p := range allp { - gcw := &p.gcw - if !gcw.empty() { - printlock() - print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork) - if gcw.wbuf1 == nil { - print(" wbuf1=") - } else { - print(" wbuf1.n=", gcw.wbuf1.nobj) - } - if gcw.wbuf2 == nil { - print(" wbuf2=") - } else { - print(" wbuf2.n=", gcw.wbuf2.nobj) - } - print("\n") - if gcw.pauseGen == gcw.putGen { - println("runtime: checkPut already failed at this generation") - } - throw("throwOnGCWork") + wbBufFlush1(p) + if !p.gcw.empty() { + restart = true + break } } - } else { - // For unknown reasons (see issue #27993), there is - // sometimes work left over when we enter mark - // termination. Detect this and resume concurrent - // mark. This is obviously unfortunate. - // - // Switch to the system stack to call wbBufFlush1, - // though in this case it doesn't matter because we're - // non-preemptible anyway. - restart := false + }) + if restart { + getg().m.preemptoff = "" systemstack(func() { - for _, p := range allp { - wbBufFlush1(p) - if !p.gcw.empty() { - restart = true - break - } - } + now := startTheWorldWithSema(true) + work.pauseNS += now - work.pauseStart }) - if restart { - getg().m.preemptoff = "" - systemstack(func() { - now := startTheWorldWithSema(true) - work.pauseNS += now - work.pauseStart - }) - semrelease(&worldsema) - goto top - } + semrelease(&worldsema) + goto top } // Disable assists and background workers. We must do @@ -2085,7 +2014,7 @@ func gcMark(start_time int64) { // ensured all reachable objects were marked, all of // these must be pointers to black objects. Hence we // can just discard the write barrier buffer. - if debug.gccheckmark > 0 || throwOnGCWork { + if debug.gccheckmark > 0 { // For debugging, flush the buffer and make // sure it really was all marked. wbBufFlush1(p) @@ -2117,8 +2046,6 @@ func gcMark(start_time int64) { gcw.dispose() } - throwOnGCWork = false - cachestats() // Update the marked heap stat. 
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go index 46101657d5..51e0fe9219 100644 --- a/src/runtime/mgcwork.go +++ b/src/runtime/mgcwork.go @@ -22,13 +22,6 @@ const ( workbufAlloc = 32 << 10 ) -// throwOnGCWork causes any operations that add pointers to a gcWork -// buffer to throw. -// -// TODO(austin): This is a temporary debugging measure for issue -// #27993. To be removed before release. -var throwOnGCWork bool - func init() { if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 { throw("bad workbufAlloc") @@ -93,17 +86,6 @@ type gcWork struct { // termination check. Specifically, this indicates that this // gcWork may have communicated work to another gcWork. flushedWork bool - - // pauseGen causes put operations to spin while pauseGen == - // gcWorkPauseGen if debugCachedWork is true. - pauseGen uint32 - - // putGen is the pauseGen of the last putGen. - putGen uint32 - - // pauseStack is the stack at which this P was paused if - // debugCachedWork is true. - pauseStack [16]uintptr } // Most of the methods of gcWork are go:nowritebarrierrec because the @@ -122,60 +104,10 @@ func (w *gcWork) init() { w.wbuf2 = wbuf2 } -func (w *gcWork) checkPut(ptr uintptr, ptrs []uintptr) { - if debugCachedWork { - alreadyFailed := w.putGen == w.pauseGen - w.putGen = w.pauseGen - if !canPreemptM(getg().m) { - // If we were to spin, the runtime may - // deadlock. Since we can't be preempted, the - // spin could prevent gcMarkDone from - // finishing the ragged barrier, which is what - // releases us from the spin. - return - } - for atomic.Load(&gcWorkPauseGen) == w.pauseGen { - } - if throwOnGCWork { - printlock() - if alreadyFailed { - println("runtime: checkPut already failed at this generation") - } - println("runtime: late gcWork put") - if ptr != 0 { - gcDumpObject("ptr", ptr, ^uintptr(0)) - } - for _, ptr := range ptrs { - gcDumpObject("ptrs", ptr, ^uintptr(0)) - } - println("runtime: paused at") - for _, pc := range w.pauseStack { - if pc == 0 { - break - } - f := findfunc(pc) - if f.valid() { - // Obviously this doesn't - // relate to ancestor - // tracebacks, but this - // function prints what we - // want. - printAncestorTracebackFuncInfo(f, pc) - } else { - println("\tunknown PC ", hex(pc), "\n") - } - } - throw("throwOnGCWork") - } - } -} - // put enqueues a pointer for the garbage collector to trace. // obj must point to the beginning of a heap object or an oblet. //go:nowritebarrierrec func (w *gcWork) put(obj uintptr) { - w.checkPut(obj, nil) - flushed := false wbuf := w.wbuf1 // Record that this may acquire the wbufSpans or heap lock to @@ -214,8 +146,6 @@ func (w *gcWork) put(obj uintptr) { // otherwise it returns false and the caller needs to call put. 
//go:nowritebarrierrec func (w *gcWork) putFast(obj uintptr) bool { - w.checkPut(obj, nil) - wbuf := w.wbuf1 if wbuf == nil { return false @@ -237,8 +167,6 @@ func (w *gcWork) putBatch(obj []uintptr) { return } - w.checkPut(0, obj) - flushed := false wbuf := w.wbuf1 if wbuf == nil { @@ -360,12 +288,10 @@ func (w *gcWork) balance() { return } if wbuf := w.wbuf2; wbuf.nobj != 0 { - w.checkPut(0, wbuf.obj[:wbuf.nobj]) putfull(wbuf) w.flushedWork = true w.wbuf2 = getempty() } else if wbuf := w.wbuf1; wbuf.nobj > 4 { - w.checkPut(0, wbuf.obj[:wbuf.nobj]) w.wbuf1 = handoff(wbuf) w.flushedWork = true // handoff did putfull } else { diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go index 632769c114..6efc00007d 100644 --- a/src/runtime/mwbbuf.go +++ b/src/runtime/mwbbuf.go @@ -57,12 +57,6 @@ type wbBuf struct { // on. This must be a multiple of wbBufEntryPointers because // the write barrier only checks for overflow once per entry. buf [wbBufEntryPointers * wbBufEntries]uintptr - - // debugGen causes the write barrier buffer to flush after - // every write barrier if equal to gcWorkPauseGen. This is for - // debugging #27993. This is only set if debugCachedWork is - // set. - debugGen uint32 } const ( @@ -86,7 +80,7 @@ const ( func (b *wbBuf) reset() { start := uintptr(unsafe.Pointer(&b.buf[0])) b.next = start - if writeBarrier.cgo || (debugCachedWork && (throwOnGCWork || b.debugGen == atomic.Load(&gcWorkPauseGen))) { + if writeBarrier.cgo { // Effectively disable the buffer by forcing a flush // on every barrier. b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) @@ -204,32 +198,10 @@ func wbBufFlush(dst *uintptr, src uintptr) { // Switch to the system stack so we don't have to worry about // the untyped stack slots or safe points. systemstack(func() { - if debugCachedWork { - // For debugging, include the old value of the - // slot and some other data in the traceback. - wbBuf := &getg().m.p.ptr().wbBuf - var old uintptr - if dst != nil { - // dst may be nil in direct calls to wbBufFlush. - old = *dst - } - wbBufFlush1Debug(old, wbBuf.buf[0], wbBuf.buf[1], &wbBuf.buf[0], wbBuf.next) - } else { - wbBufFlush1(getg().m.p.ptr()) - } + wbBufFlush1(getg().m.p.ptr()) }) } -// wbBufFlush1Debug is a temporary function for debugging issue -// #27993. It exists solely to add some context to the traceback. -// -//go:nowritebarrierrec -//go:systemstack -//go:noinline -func wbBufFlush1Debug(old, buf1, buf2 uintptr, start *uintptr, next uintptr) { - wbBufFlush1(getg().m.p.ptr()) -} - // wbBufFlush1 flushes p's write barrier buffer to the GC work queue. // // This must not have write barriers because it is part of the write diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 6050a34d29..aed17d6fc6 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -421,15 +421,6 @@ func newdefer(siz int32) *_defer { total := roundupsize(totaldefersize(uintptr(siz))) d = (*_defer)(mallocgc(total, deferType, true)) }) - if debugCachedWork { - // Duplicate the tail below so if there's a - // crash in checkPut we can tell if d was just - // allocated or came from the pool. - d.siz = siz - d.link = gp._defer - gp._defer = d - return d - } } d.siz = siz d.heap = true -- cgit v1.2.1 From 64fb6ae95f1c322486cbfb758552bb8439a8e6e8 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Wed, 14 Oct 2020 16:03:48 -0700 Subject: runtime: stop preemption during syscall.Exec on Darwin On current macOS versions a program that receives a signal during an execve can fail with a SIGILL signal. 
This appears to be a macOS kernel bug. It has been reported to Apple. This CL partially works around the problem by using execLock to not send preemption signals during execve. Of course some other stray signal could occur, but at least we can avoid exacerbating the problem. We can't simply disable signals, as that would mean that the exec'ed process would start with all signals blocked, which it likely does not expect. Fixes #41702 Change-Id: I91b0add967b315671ddcf73269c4d30136e579b4 Reviewed-on: https://go-review.googlesource.com/c/go/+/262438 Trust: Ian Lance Taylor Run-TryBot: Ian Lance Taylor TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/runtime/signal_unix.go | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index a3d6f34c88..c228de47b4 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -356,6 +356,13 @@ func preemptM(mp *m) { // required). return } + + // On Darwin, don't try to preempt threads during exec. + // Issue #41702. + if GOOS == "darwin" { + execLock.rlock() + } + if atomic.Cas(&mp.signalPending, 0, 1) { // If multiple threads are preempting the same M, it may send many // signals to the same M such that it hardly make progress, causing @@ -364,6 +371,10 @@ func preemptM(mp *m) { // Only send a signal if there isn't already one pending. signalM(mp, sigPreempt) } + + if GOOS == "darwin" { + execLock.runlock() + } } // sigFetchG fetches the value of G safely when running in a signal handler. -- cgit v1.2.1 From afba990169f41d9026c923da5235584db32cab67 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 15 Oct 2020 16:11:10 -0400 Subject: runtime/internal/atomic: drop package prefixes This drops package prefixes from the assembly code on 386 and arm. In addition to just being nicer, this allows the assembler to automatically pick up the argument stack map from the Go signatures of these functions. This doesn't matter right now because these functions never call back out to Go, but prepares us for the next CL. Change-Id: I90fed7d4dd63ad49274529c62804211b6390e2e9 Reviewed-on: https://go-review.googlesource.com/c/go/+/262777 Trust: Austin Clements Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Cherry Zhang Reviewed-by: Michael Knyszek --- src/runtime/funcdata.h | 6 +-- src/runtime/internal/atomic/asm_386.s | 76 ++++++++++++++++----------------- src/runtime/internal/atomic/asm_arm.s | 80 +++++++++++++++++------------------ 3 files changed, 81 insertions(+), 81 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h index 0fb50ddfba..cd76c06992 100644 --- a/src/runtime/funcdata.h +++ b/src/runtime/funcdata.h @@ -32,9 +32,9 @@ // defines the pointer map for the function's arguments. // GO_ARGS should be the first instruction in a function that uses it. // It can be omitted if there are no arguments at all. -// GO_ARGS is inserted implicitly by the linker for any function -// that also has a Go prototype and therefore is usually not necessary -// to write explicitly. +// GO_ARGS is inserted implicitly by the linker for any function whose +// name starts with a middle-dot and that also has a Go prototype; it +// is therefore usually not necessary to write explicitly. 
#define GO_ARGS FUNCDATA $FUNCDATA_ArgsPointerMaps, go_args_stackmap(SB) // GO_RESULTS_INITIALIZED indicates that the assembly function diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s index 9b9dc14a60..357ca95625 100644 --- a/src/runtime/internal/atomic/asm_386.s +++ b/src/runtime/internal/atomic/asm_386.s @@ -11,7 +11,7 @@ // return 1; // }else // return 0; -TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13 +TEXT ·Cas(SB), NOSPLIT, $0-13 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -20,32 +20,32 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13 SETEQ ret+12(FP) RET -TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-13 - JMP runtime∕internal∕atomic·Cas(SB) +TEXT ·Casuintptr(SB), NOSPLIT, $0-13 + JMP ·Cas(SB) -TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-13 - JMP runtime∕internal∕atomic·Cas(SB) +TEXT ·CasRel(SB), NOSPLIT, $0-13 + JMP ·Cas(SB) -TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-8 - JMP runtime∕internal∕atomic·Load(SB) +TEXT ·Loaduintptr(SB), NOSPLIT, $0-8 + JMP ·Load(SB) -TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-8 - JMP runtime∕internal∕atomic·Load(SB) +TEXT ·Loaduint(SB), NOSPLIT, $0-8 + JMP ·Load(SB) -TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8 - JMP runtime∕internal∕atomic·Store(SB) +TEXT ·Storeuintptr(SB), NOSPLIT, $0-8 + JMP ·Store(SB) -TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12 - JMP runtime∕internal∕atomic·Xadd(SB) +TEXT ·Xadduintptr(SB), NOSPLIT, $0-12 + JMP ·Xadd(SB) -TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-12 - JMP runtime∕internal∕atomic·Load64(SB) +TEXT ·Loadint64(SB), NOSPLIT, $0-12 + JMP ·Load64(SB) -TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20 - JMP runtime∕internal∕atomic·Xadd64(SB) +TEXT ·Xaddint64(SB), NOSPLIT, $0-20 + JMP ·Xadd64(SB) -// bool runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new) +// bool ·Cas64(uint64 *val, uint64 old, uint64 new) // Atomically: // if(*val == *old){ // *val = new; @@ -53,7 +53,7 @@ TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20 // } else { // return 0; // } -TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21 +TEXT ·Cas64(SB), NOSPLIT, $0-21 MOVL ptr+0(FP), BP TESTL $7, BP JZ 2(PC) @@ -74,7 +74,7 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21 // return 1; // }else // return 0; -TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13 +TEXT ·Casp1(SB), NOSPLIT, $0-13 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -87,7 +87,7 @@ TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13 // Atomically: // *val += delta; // return *val; -TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12 +TEXT ·Xadd(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL delta+4(FP), AX MOVL AX, CX @@ -97,7 +97,7 @@ TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12 MOVL AX, ret+8(FP) RET -TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-20 +TEXT ·Xadd64(SB), NOSPLIT, $0-20 // no XADDQ so use CMPXCHG8B loop MOVL ptr+0(FP), BP TESTL $7, BP @@ -133,17 +133,17 @@ addloop: MOVL CX, ret_hi+16(FP) RET -TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12 +TEXT ·Xchg(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL new+4(FP), AX XCHGL AX, 0(BX) MOVL AX, ret+8(FP) RET -TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12 - JMP runtime∕internal∕atomic·Xchg(SB) +TEXT ·Xchguintptr(SB), NOSPLIT, $0-12 + JMP ·Xchg(SB) -TEXT runtime∕internal∕atomic·Xchg64(SB),NOSPLIT,$0-20 +TEXT ·Xchg64(SB),NOSPLIT,$0-20 // no XCHGQ so use CMPXCHG8B loop MOVL 
ptr+0(FP), BP TESTL $7, BP @@ -171,23 +171,23 @@ swaploop: MOVL DX, ret_hi+16(FP) RET -TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8 +TEXT ·StorepNoWB(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX XCHGL AX, 0(BX) RET -TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8 +TEXT ·Store(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX XCHGL AX, 0(BX) RET -TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-8 - JMP runtime∕internal∕atomic·Store(SB) +TEXT ·StoreRel(SB), NOSPLIT, $0-8 + JMP ·Store(SB) // uint64 atomicload64(uint64 volatile* addr); -TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12 +TEXT ·Load64(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), AX TESTL $7, AX JZ 2(PC) @@ -197,8 +197,8 @@ TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12 EMMS RET -// void runtime∕internal∕atomic·Store64(uint64 volatile* addr, uint64 v); -TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12 +// void ·Store64(uint64 volatile* addr, uint64 v); +TEXT ·Store64(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), AX TESTL $7, AX JZ 2(PC) @@ -214,23 +214,23 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12 XADDL AX, (SP) RET -// void runtime∕internal∕atomic·Or8(byte volatile*, byte); -TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5 +// void ·Or8(byte volatile*, byte); +TEXT ·Or8(SB), NOSPLIT, $0-5 MOVL ptr+0(FP), AX MOVB val+4(FP), BX LOCK ORB BX, (AX) RET -// void runtime∕internal∕atomic·And8(byte volatile*, byte); -TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5 +// void ·And8(byte volatile*, byte); +TEXT ·And8(SB), NOSPLIT, $0-5 MOVL ptr+0(FP), AX MOVB val+4(FP), BX LOCK ANDB BX, (AX) RET -TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-5 +TEXT ·Store8(SB), NOSPLIT, $0-5 MOVL ptr+0(FP), BX MOVB val+4(FP), AX XCHGB AX, 0(BX) diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s index d4ef11560e..db1267423d 100644 --- a/src/runtime/internal/atomic/asm_arm.s +++ b/src/runtime/internal/atomic/asm_arm.s @@ -12,13 +12,13 @@ // }else // return 0; // -// To implement runtime∕internal∕atomic·cas in sys_$GOOS_arm.s +// To implement ·cas in sys_$GOOS_arm.s // using the native instructions, use: // -// TEXT runtime∕internal∕atomic·cas(SB),NOSPLIT,$0 -// B runtime∕internal∕atomic·armcas(SB) +// TEXT ·cas(SB),NOSPLIT,$0 +// B ·armcas(SB) // -TEXT runtime∕internal∕atomic·armcas(SB),NOSPLIT,$0-13 +TEXT ·armcas(SB),NOSPLIT,$0-13 MOVW ptr+0(FP), R1 MOVW old+4(FP), R2 MOVW new+8(FP), R3 @@ -50,44 +50,44 @@ casfail: // stubs -TEXT runtime∕internal∕atomic·Loadp(SB),NOSPLIT|NOFRAME,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·Casuintptr(SB),NOSPLIT,$0-13 - B runtime∕internal∕atomic·Cas(SB) +TEXT ·Casuintptr(SB),NOSPLIT,$0-13 + B ·Cas(SB) -TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0-13 - B runtime∕internal∕atomic·Cas(SB) +TEXT ·Casp1(SB),NOSPLIT,$0-13 + B ·Cas(SB) -TEXT runtime∕internal∕atomic·CasRel(SB),NOSPLIT,$0-13 - B runtime∕internal∕atomic·Cas(SB) +TEXT ·CasRel(SB),NOSPLIT,$0-13 + B ·Cas(SB) -TEXT runtime∕internal∕atomic·Loaduintptr(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·Loaduintptr(SB),NOSPLIT,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·Loaduint(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·Loaduint(SB),NOSPLIT,$0-8 + B ·Load(SB) -TEXT 
runtime∕internal∕atomic·Storeuintptr(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Store(SB) +TEXT ·Storeuintptr(SB),NOSPLIT,$0-8 + B ·Store(SB) -TEXT runtime∕internal∕atomic·StorepNoWB(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Store(SB) +TEXT ·StorepNoWB(SB),NOSPLIT,$0-8 + B ·Store(SB) -TEXT runtime∕internal∕atomic·StoreRel(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Store(SB) +TEXT ·StoreRel(SB),NOSPLIT,$0-8 + B ·Store(SB) -TEXT runtime∕internal∕atomic·Xadduintptr(SB),NOSPLIT,$0-12 - B runtime∕internal∕atomic·Xadd(SB) +TEXT ·Xadduintptr(SB),NOSPLIT,$0-12 + B ·Xadd(SB) -TEXT runtime∕internal∕atomic·Loadint64(SB),NOSPLIT,$0-12 - B runtime∕internal∕atomic·Load64(SB) +TEXT ·Loadint64(SB),NOSPLIT,$0-12 + B ·Load64(SB) -TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20 - B runtime∕internal∕atomic·Xadd64(SB) +TEXT ·Xaddint64(SB),NOSPLIT,$0-20 + B ·Xadd64(SB) // 64-bit atomics // The native ARM implementations use LDREXD/STREXD, which are @@ -95,7 +95,7 @@ TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20 // On older ARM, we use Go implementations which simulate 64-bit // atomics with locks. -TEXT armCas64<>(SB),NOSPLIT,$0-21 +TEXT armCas64<>(SB),NOSPLIT,$0-21 MOVW addr+0(FP), R1 // make unaligned atomic access panic AND.S $7, R1, R2 @@ -128,7 +128,7 @@ cas64fail: MOVBU R0, swapped+20(FP) RET -TEXT armXadd64<>(SB),NOSPLIT,$0-20 +TEXT armXadd64<>(SB),NOSPLIT,$0-20 MOVW addr+0(FP), R1 // make unaligned atomic access panic AND.S $7, R1, R2 @@ -154,7 +154,7 @@ add64loop: MOVW R5, new_hi+16(FP) RET -TEXT armXchg64<>(SB),NOSPLIT,$0-20 +TEXT armXchg64<>(SB),NOSPLIT,$0-20 MOVW addr+0(FP), R1 // make unaligned atomic access panic AND.S $7, R1, R2 @@ -178,7 +178,7 @@ swap64loop: MOVW R5, old_hi+16(FP) RET -TEXT armLoad64<>(SB),NOSPLIT,$0-12 +TEXT armLoad64<>(SB),NOSPLIT,$0-12 MOVW addr+0(FP), R1 // make unaligned atomic access panic AND.S $7, R1, R2 @@ -192,7 +192,7 @@ TEXT armLoad64<>(SB),NOSPLIT,$0-12 MOVW R3, val_hi+8(FP) RET -TEXT armStore64<>(SB),NOSPLIT,$0-12 +TEXT armStore64<>(SB),NOSPLIT,$0-12 MOVW addr+0(FP), R1 // make unaligned atomic access panic AND.S $7, R1, R2 @@ -213,35 +213,35 @@ store64loop: DMB MB_ISH RET -TEXT ·Cas64(SB),NOSPLIT,$0-21 +TEXT ·Cas64(SB),NOSPLIT,$0-21 MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armCas64<>(SB) JMP ·goCas64(SB) -TEXT ·Xadd64(SB),NOSPLIT,$0-20 +TEXT ·Xadd64(SB),NOSPLIT,$0-20 MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armXadd64<>(SB) JMP ·goXadd64(SB) -TEXT ·Xchg64(SB),NOSPLIT,$0-20 +TEXT ·Xchg64(SB),NOSPLIT,$0-20 MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armXchg64<>(SB) JMP ·goXchg64(SB) -TEXT ·Load64(SB),NOSPLIT,$0-12 +TEXT ·Load64(SB),NOSPLIT,$0-12 MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armLoad64<>(SB) JMP ·goLoad64(SB) -TEXT ·Store64(SB),NOSPLIT,$0-12 +TEXT ·Store64(SB),NOSPLIT,$0-12 MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) -- cgit v1.2.1 From 83317d9e3cb0674f71d1118d8814aefb31ac1239 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 15 Oct 2020 15:52:58 -0400 Subject: runtime/internal/atomic: panic nicely on unaligned 64-bit atomics On 386 and arm, unaligned 64-bit atomics aren't safe, so we check for this and panic. Currently, we panic by dereferencing nil, which may be expedient but is pretty user-hostile since it gives no hint of what the actual problem was. This CL replaces this with an actual panic. The only subtlety here is now the atomic assembly implementations are calling back into Go, so they have to play nicely with stack maps and stack scanning. 
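To make the user-visible effect concrete, here is a hedged sketch of the kind
of code that can reach this check on 386 and arm. The struct layout is
illustrative; whether the 64-bit field is actually misaligned depends on where
the variable ends up, and it is assumed here that sync/atomic's 64-bit
operations forward to these runtime routines on those ports.

    package main

    import "sync/atomic"

    type counters struct {
        n uint32
        v uint64 // on 32-bit platforms this field may be only 4-byte aligned
    }

    func main() {
        var x counters
        // Before this CL a misaligned address crashed with a bare nil
        // pointer dereference; after it, the runtime panics with a
        // descriptive unaligned-atomic message.
        atomic.AddUint64(&x.v, 1)
    }

As for the stack-scanning subtlety itself: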
On 386, this just requires declaring NO_LOCAL_POINTERS. On arm, this is somewhat more complicated: first, we have to move the alignment check into the functions that have Go signatures. Then we have to support both the tail call from these functions to the underlying implementation (which requires that they have no frame) and the call into Go to panic (which requires that they have a frame). We resolve this by forcing them to have no frame and setting up the frame manually just before the panic call. Change-Id: I19f1e860045df64088013db37a18acea47342c69 Reviewed-on: https://go-review.googlesource.com/c/go/+/262778 Trust: Austin Clements Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Cherry Zhang Reviewed-by: Michael Knyszek --- src/runtime/internal/atomic/asm_386.s | 17 ++++-- src/runtime/internal/atomic/asm_arm.s | 89 +++++++++++++++++++---------- src/runtime/internal/atomic/atomic_mipsx.go | 2 +- src/runtime/internal/atomic/atomic_test.go | 9 ++- src/runtime/internal/atomic/unaligned.go | 9 +++ 5 files changed, 88 insertions(+), 38 deletions(-) create mode 100644 src/runtime/internal/atomic/unaligned.go (limited to 'src/runtime') diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s index 357ca95625..bcefff373f 100644 --- a/src/runtime/internal/atomic/asm_386.s +++ b/src/runtime/internal/atomic/asm_386.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "textflag.h" +#include "funcdata.h" // bool Cas(int32 *val, int32 old, int32 new) // Atomically: @@ -44,7 +45,6 @@ TEXT ·Loadint64(SB), NOSPLIT, $0-12 TEXT ·Xaddint64(SB), NOSPLIT, $0-20 JMP ·Xadd64(SB) - // bool ·Cas64(uint64 *val, uint64 old, uint64 new) // Atomically: // if(*val == *old){ @@ -54,10 +54,11 @@ TEXT ·Xaddint64(SB), NOSPLIT, $0-20 // return 0; // } TEXT ·Cas64(SB), NOSPLIT, $0-21 + NO_LOCAL_POINTERS MOVL ptr+0(FP), BP TESTL $7, BP JZ 2(PC) - MOVL 0, BP // crash with nil ptr deref + CALL ·panicUnaligned(SB) MOVL old_lo+4(FP), AX MOVL old_hi+8(FP), DX MOVL new_lo+12(FP), BX @@ -98,11 +99,12 @@ TEXT ·Xadd(SB), NOSPLIT, $0-12 RET TEXT ·Xadd64(SB), NOSPLIT, $0-20 + NO_LOCAL_POINTERS // no XADDQ so use CMPXCHG8B loop MOVL ptr+0(FP), BP TESTL $7, BP JZ 2(PC) - MOVL 0, AX // crash when unaligned + CALL ·panicUnaligned(SB) // DI:SI = delta MOVL delta_lo+4(FP), SI MOVL delta_hi+8(FP), DI @@ -144,11 +146,12 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-12 JMP ·Xchg(SB) TEXT ·Xchg64(SB),NOSPLIT,$0-20 + NO_LOCAL_POINTERS // no XCHGQ so use CMPXCHG8B loop MOVL ptr+0(FP), BP TESTL $7, BP JZ 2(PC) - MOVL 0, AX // crash when unaligned + CALL ·panicUnaligned(SB) // CX:BX = new MOVL new_lo+4(FP), BX MOVL new_hi+8(FP), CX @@ -188,10 +191,11 @@ TEXT ·StoreRel(SB), NOSPLIT, $0-8 // uint64 atomicload64(uint64 volatile* addr); TEXT ·Load64(SB), NOSPLIT, $0-12 + NO_LOCAL_POINTERS MOVL ptr+0(FP), AX TESTL $7, AX JZ 2(PC) - MOVL 0, AX // crash with nil ptr deref + CALL ·panicUnaligned(SB) MOVQ (AX), M0 MOVQ M0, ret+4(FP) EMMS @@ -199,10 +203,11 @@ TEXT ·Load64(SB), NOSPLIT, $0-12 // void ·Store64(uint64 volatile* addr, uint64 v); TEXT ·Store64(SB), NOSPLIT, $0-12 + NO_LOCAL_POINTERS MOVL ptr+0(FP), AX TESTL $7, AX JZ 2(PC) - MOVL 0, AX // crash with nil ptr deref + CALL ·panicUnaligned(SB) // MOVQ and EMMS were introduced on the Pentium MMX. 
MOVQ val+4(FP), M0 MOVQ M0, (AX) diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s index db1267423d..c3d1d9025d 100644 --- a/src/runtime/internal/atomic/asm_arm.s +++ b/src/runtime/internal/atomic/asm_arm.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "textflag.h" +#include "funcdata.h" // bool armcas(int32 *val, int32 old, int32 new) // Atomically: @@ -96,11 +97,7 @@ TEXT ·Xaddint64(SB),NOSPLIT,$0-20 // atomics with locks. TEXT armCas64<>(SB),NOSPLIT,$0-21 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. + // addr is already in R1 MOVW old_lo+4(FP), R2 MOVW old_hi+8(FP), R3 MOVW new_lo+12(FP), R4 @@ -129,11 +126,7 @@ cas64fail: RET TEXT armXadd64<>(SB),NOSPLIT,$0-20 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. + // addr is already in R1 MOVW delta_lo+4(FP), R2 MOVW delta_hi+8(FP), R3 @@ -155,11 +148,7 @@ add64loop: RET TEXT armXchg64<>(SB),NOSPLIT,$0-20 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. + // addr is already in R1 MOVW new_lo+4(FP), R2 MOVW new_hi+8(FP), R3 @@ -179,11 +168,7 @@ swap64loop: RET TEXT armLoad64<>(SB),NOSPLIT,$0-12 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. + // addr is already in R1 LDREXD (R1), R2 // loads R2 and R3 DMB MB_ISH @@ -193,11 +178,7 @@ TEXT armLoad64<>(SB),NOSPLIT,$0-12 RET TEXT armStore64<>(SB),NOSPLIT,$0-12 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. + // addr is already in R1 MOVW val_lo+4(FP), R2 MOVW val_hi+8(FP), R3 @@ -213,35 +194,83 @@ store64loop: DMB MB_ISH RET -TEXT ·Cas64(SB),NOSPLIT,$0-21 +// The following functions all panic if their address argument isn't +// 8-byte aligned. Since we're calling back into Go code to do this, +// we have to cooperate with stack unwinding. In the normal case, the +// functions tail-call into the appropriate implementation, which +// means they must not open a frame. Hence, when they go down the +// panic path, at that point they push the LR to create a real frame +// (they don't need to pop it because panic won't return). 
+ +TEXT ·Cas64(SB),NOSPLIT,$-4-21 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armCas64<>(SB) JMP ·goCas64(SB) -TEXT ·Xadd64(SB),NOSPLIT,$0-20 +TEXT ·Xadd64(SB),NOSPLIT,$-4-20 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armXadd64<>(SB) JMP ·goXadd64(SB) -TEXT ·Xchg64(SB),NOSPLIT,$0-20 +TEXT ·Xchg64(SB),NOSPLIT,$-4-20 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armXchg64<>(SB) JMP ·goXchg64(SB) -TEXT ·Load64(SB),NOSPLIT,$0-12 +TEXT ·Load64(SB),NOSPLIT,$-4-12 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armLoad64<>(SB) JMP ·goLoad64(SB) -TEXT ·Store64(SB),NOSPLIT,$0-12 +TEXT ·Store64(SB),NOSPLIT,$-4-12 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go index 0e2d77ade1..b99bfe7dbf 100644 --- a/src/runtime/internal/atomic/atomic_mipsx.go +++ b/src/runtime/internal/atomic/atomic_mipsx.go @@ -34,7 +34,7 @@ func spinUnlock(state *uint32) func lockAndCheck(addr *uint64) { // ensure 8-byte alignment if uintptr(unsafe.Pointer(addr))&7 != 0 { - addr = nil + panicUnaligned() } // force dereference before taking lock _ = *addr diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go index b0a8fa0610..a9f95077c0 100644 --- a/src/runtime/internal/atomic/atomic_test.go +++ b/src/runtime/internal/atomic/atomic_test.go @@ -73,8 +73,15 @@ func TestXadduintptrOnUint64(t *testing.T) { func shouldPanic(t *testing.T, name string, f func()) { defer func() { - if recover() == nil { + // Check that all GC maps are sane. + runtime.GC() + + err := recover() + want := "unaligned 64-bit atomic operation" + if err == nil { t.Errorf("%s did not panic", name) + } else if s, _ := err.(string); s != want { + t.Errorf("%s: wanted panic %q, got %q", name, want, err) } }() f() diff --git a/src/runtime/internal/atomic/unaligned.go b/src/runtime/internal/atomic/unaligned.go new file mode 100644 index 0000000000..a859de4144 --- /dev/null +++ b/src/runtime/internal/atomic/unaligned.go @@ -0,0 +1,9 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package atomic + +func panicUnaligned() { + panic("unaligned 64-bit atomic operation") +} -- cgit v1.2.1 From 9cec50f50c29f5ef7264bf06ee7ac0991b4b36d6 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 16 Oct 2020 19:35:09 +0200 Subject: internal/poll, net, syscall: use accept4 on illumos Illumos supports the accept4 syscall, use it in internal/poll.accept like on other platforms. Add Accept4 to package syscall despite the package being frozen. The other option would have been to add this to internal/syscall/unix, but adding it to syscall avoids duplicating a lot of code in internal/poll and net/internal/socktest. Also, all other platforms supporting the accept4 syscall already export Accept4. Follow CL 97196, CL 40895 and CL 94295 Change-Id: I13b32f0163a683840c02b16722730d9dfdb98f56 Reviewed-on: https://go-review.googlesource.com/c/go/+/256101 Trust: Tobias Klauser Run-TryBot: Tobias Klauser TryBot-Result: Go Bot Reviewed-by: Brad Fitzpatrick --- src/runtime/cgo/cgo.go | 1 + 1 file changed, 1 insertion(+) (limited to 'src/runtime') diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go index c02b837978..4d2caf6c4f 100644 --- a/src/runtime/cgo/cgo.go +++ b/src/runtime/cgo/cgo.go @@ -21,6 +21,7 @@ package cgo #cgo openbsd LDFLAGS: -lpthread #cgo aix LDFLAGS: -Wl,-berok #cgo solaris LDFLAGS: -lxnet +#cgo illumos LDFLAGS: -lsocket // Issue 35247. #cgo darwin CFLAGS: -Wno-nullability-completeness -- cgit v1.2.1 From 05739d6f17c57f09264272621b88725a463234d0 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 15 Oct 2020 14:39:12 -0700 Subject: runtime: wait for preemption signals before syscall.Exec Fixes #41702 Fixes #42023 Change-Id: If07f40b1d73b8f276ee28ffb8b7214175e56c24d Reviewed-on: https://go-review.googlesource.com/c/go/+/262817 Trust: Ian Lance Taylor Trust: Bryan C. Mills Run-TryBot: Ian Lance Taylor TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/runtime/proc.go | 21 +++++++++++++++++++++ src/runtime/signal_unix.go | 11 +++++++++++ 2 files changed, 32 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 4872480314..e1de70a997 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1311,6 +1311,14 @@ found: checkdead() unlock(&sched.lock) + if GOOS == "darwin" { + // Make sure pendingPreemptSignals is correct when an M exits. + // For #41702. + if atomic.Load(&m.signalPending) != 0 { + atomic.Xadd(&pendingPreemptSignals, -1) + } + } + if osStack { // Return from mstart and let the system thread // library free the g0 stack and terminate the thread. @@ -3510,11 +3518,24 @@ func syscall_runtime_AfterForkInChild() { inForkedChild = false } +// pendingPreemptSignals is the number of preemption signals +// that have been sent but not received. This is only used on Darwin. +// For #41702. +var pendingPreemptSignals uint32 + // Called from syscall package before Exec. //go:linkname syscall_runtime_BeforeExec syscall.runtime_BeforeExec func syscall_runtime_BeforeExec() { // Prevent thread creation during exec. execLock.lock() + + // On Darwin, wait for all pending preemption signals to + // be received. See issue #41702. + if GOOS == "darwin" { + for int32(atomic.Load(&pendingPreemptSignals)) > 0 { + osyield() + } + } } // Called from syscall package after Exec. 
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index c228de47b4..e8b6f95d8f 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -335,6 +335,10 @@ func doSigPreempt(gp *g, ctxt *sigctxt) { // Acknowledge the preemption. atomic.Xadd(&gp.m.preemptGen, 1) atomic.Store(&gp.m.signalPending, 0) + + if GOOS == "darwin" { + atomic.Xadd(&pendingPreemptSignals, -1) + } } const preemptMSupported = true @@ -364,6 +368,10 @@ func preemptM(mp *m) { } if atomic.Cas(&mp.signalPending, 0, 1) { + if GOOS == "darwin" { + atomic.Xadd(&pendingPreemptSignals, 1) + } + // If multiple threads are preempting the same M, it may send many // signals to the same M such that it hardly make progress, causing // live-lock problem. Apparently this could happen on darwin. See @@ -435,6 +443,9 @@ func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) { // no non-Go signal handler for sigPreempt. // The default behavior for sigPreempt is to ignore // the signal, so badsignal will be a no-op anyway. + if GOOS == "darwin" { + atomic.Xadd(&pendingPreemptSignals, -1) + } return } c.fixsigcode(sig) -- cgit v1.2.1 From f1e3c8f14232cde8da8666ad68df493563287634 Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Wed, 14 Oct 2020 19:26:20 -0400 Subject: runtime/cgo: build iOS-specific code only on iOS Don't build them on macOS/ARM64. Updates #38485. Change-Id: I9fbea838fdce52db22742487926879761dea0d6a Reviewed-on: https://go-review.googlesource.com/c/go/+/262559 Trust: Cherry Zhang Run-TryBot: Cherry Zhang Reviewed-by: Ian Lance Taylor --- src/runtime/cgo/gcc_darwin_arm64.c | 21 ++++++++++++++++----- src/runtime/cgo/gcc_signal_darwin_nolldb.c | 12 ------------ src/runtime/cgo/gcc_signal_ios_nolldb.c | 12 ++++++++++++ 3 files changed, 28 insertions(+), 17 deletions(-) delete mode 100644 src/runtime/cgo/gcc_signal_darwin_nolldb.c create mode 100644 src/runtime/cgo/gcc_signal_ios_nolldb.c (limited to 'src/runtime') diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c index 9ea43ae4af..dbe848b4ee 100644 --- a/src/runtime/cgo/gcc_darwin_arm64.c +++ b/src/runtime/cgo/gcc_darwin_arm64.c @@ -10,12 +10,16 @@ #include #include -#include -#include - #include "libcgo.h" #include "libcgo_unix.h" +#include + +#if TARGET_OS_IPHONE +#include +#include +#endif + #define magic (0xc476c475c47957UL) // inittls allocates a thread-local storage slot for g. @@ -87,14 +91,18 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); +#if TARGET_OS_IPHONE darwin_arm_init_thread_exception_port(); +#endif crosscall1(ts.fn, setg_gcc, (void*)ts.g); return nil; } +#if TARGET_OS_IPHONE + // init_working_dir sets the current working directory to the app root. -// By default darwin/arm64 processes start in "/". +// By default ios/arm64 processes start in "/". static void init_working_dir() { @@ -145,6 +153,8 @@ init_working_dir() } } +#endif // TARGET_OS_IPHONE + void x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) { @@ -161,8 +171,9 @@ x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) // yes, tlsbase from mrs might not be correctly aligned. 
inittls(tlsg, (void**)((uintptr)tlsbase & ~7)); +#if TARGET_OS_IPHONE darwin_arm_init_mach_exception_handler(); darwin_arm_init_thread_exception_port(); - init_working_dir(); +#endif } diff --git a/src/runtime/cgo/gcc_signal_darwin_nolldb.c b/src/runtime/cgo/gcc_signal_darwin_nolldb.c deleted file mode 100644 index 26be71bd1d..0000000000 --- a/src/runtime/cgo/gcc_signal_darwin_nolldb.c +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !lldb !ios -// +build darwin -// +build arm64 - -#include - -void darwin_arm_init_thread_exception_port() {} -void darwin_arm_init_mach_exception_handler() {} diff --git a/src/runtime/cgo/gcc_signal_ios_nolldb.c b/src/runtime/cgo/gcc_signal_ios_nolldb.c new file mode 100644 index 0000000000..cfa4025414 --- /dev/null +++ b/src/runtime/cgo/gcc_signal_ios_nolldb.c @@ -0,0 +1,12 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !lldb +// +build ios +// +build arm64 + +#include + +void darwin_arm_init_thread_exception_port() {} +void darwin_arm_init_mach_exception_handler() {} -- cgit v1.2.1 From 689a7a13780dc7a5138215aa4d369bdcf789fee8 Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Fri, 16 Oct 2020 19:11:42 -0400 Subject: runtime/cgo: fix build tag placement vet warning Change-Id: Ie6583b46213caae897fc2189d4973c88759f5f4b Reviewed-on: https://go-review.googlesource.com/c/go/+/263258 Trust: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Go Bot Reviewed-by: Ian Lance Taylor --- src/runtime/cgo/gcc_libinit_windows.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src/runtime') diff --git a/src/runtime/cgo/gcc_libinit_windows.c b/src/runtime/cgo/gcc_libinit_windows.c index 9fd7d36bfb..2732248bdc 100644 --- a/src/runtime/cgo/gcc_libinit_windows.c +++ b/src/runtime/cgo/gcc_libinit_windows.c @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. // +build cgo + #define WIN64_LEAN_AND_MEAN #include #include -- cgit v1.2.1 From 515e6a9b12dfe654c86cfd070ee5d6ac144fe116 Mon Sep 17 00:00:00 2001 From: Alex Brainman Date: Sun, 19 Jul 2020 16:06:48 +1000 Subject: runtime: use CreateWaitableTimerEx to implement usleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @jstarks suggested that recent versions of Windows provide access to high resolution timers. See https://github.com/golang/go/issues/8687#issuecomment-656259353 for details. I tried to run this C program on my Windows 10 computer ``` #include #include #pragma comment(lib, "Winmm.lib") // Apparently this is already defined when I use msvc cl. 
//#define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION = 0x00000002; int usleep(HANDLE timer, LONGLONG d) { LARGE_INTEGER liDueTime; DWORD ret; LARGE_INTEGER StartingTime, EndingTime, ElapsedMicroseconds; LARGE_INTEGER Frequency; QueryPerformanceFrequency(&Frequency); QueryPerformanceCounter(&StartingTime); liDueTime.QuadPart = d; liDueTime.QuadPart = liDueTime.QuadPart * 10; // us into 100 of ns units liDueTime.QuadPart = -liDueTime.QuadPart; // negative for relative dure time if (!SetWaitableTimer(timer, &liDueTime, 0, NULL, NULL, 0)) { printf("SetWaitableTimer failed: errno=%d\n", GetLastError()); return 1; } ret = WaitForSingleObject(timer, INFINITE); if (ret != WAIT_OBJECT_0) { printf("WaitForSingleObject failed: ret=%d errno=%d\n", ret, GetLastError()); return 1; } QueryPerformanceCounter(&EndingTime); ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; ElapsedMicroseconds.QuadPart *= 1000000; ElapsedMicroseconds.QuadPart /= Frequency.QuadPart; printf("delay is %lld us - slept for %lld us\n", d, ElapsedMicroseconds.QuadPart); return 0; } int testTimer(DWORD createFlag) { HANDLE timer; timer = CreateWaitableTimerEx(NULL, NULL, createFlag, TIMER_ALL_ACCESS); if (timer == NULL) { printf("CreateWaitableTimerEx failed: errno=%d\n", GetLastError()); return 1; } usleep(timer, 1000LL); usleep(timer, 100LL); usleep(timer, 10LL); usleep(timer, 1LL); CloseHandle(timer); return 0; } int main() { printf("\n1. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is off - timeBeginPeriod is off\n"); testTimer(0); printf("\n2. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is on - timeBeginPeriod is off\n"); testTimer(CREATE_WAITABLE_TIMER_HIGH_RESOLUTION); timeBeginPeriod(1); printf("\n3. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is off - timeBeginPeriod is on\n"); testTimer(0); printf("\n4. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is on - timeBeginPeriod is on\n"); testTimer(CREATE_WAITABLE_TIMER_HIGH_RESOLUTION); } ``` and I see this output ``` 1. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is off - timeBeginPeriod is off delay is 1000 us - slept for 4045 us delay is 100 us - slept for 3915 us delay is 10 us - slept for 3291 us delay is 1 us - slept for 2234 us 2. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is on - timeBeginPeriod is off delay is 1000 us - slept for 1076 us delay is 100 us - slept for 569 us delay is 10 us - slept for 585 us delay is 1 us - slept for 17 us 3. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is off - timeBeginPeriod is on delay is 1000 us - slept for 742 us delay is 100 us - slept for 893 us delay is 10 us - slept for 414 us delay is 1 us - slept for 920 us 4. CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is on - timeBeginPeriod is on delay is 1000 us - slept for 1466 us delay is 100 us - slept for 559 us delay is 10 us - slept for 535 us delay is 1 us - slept for 5 us ``` That shows, that indeed using CREATE_WAITABLE_TIMER_HIGH_RESOLUTION will provide sleeps as low as about 500 microseconds, while our current approach provides about 1 millisecond sleep. New approach also does not require for timeBeginPeriod to be on, so this change solves long standing problem with go programs draining laptop battery, because it calls timeBeginPeriod. This change will only run on systems where CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag is available. If not available, the runtime will fallback to original code that uses timeBeginPeriod. 
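Whether a particular machine takes the new path can be probed from ordinary Go code with the same API call the runtime now makes at startup. A standalone sketch (Windows only, illustrative; the constants are local to the snippet and mirror the values used in os_windows.go):

```
package main

import (
	"fmt"
	"syscall"
)

const (
	_CREATE_WAITABLE_TIMER_HIGH_RESOLUTION = 0x00000002
	_SYNCHRONIZE                           = 0x00100000
	_TIMER_QUERY_STATE                     = 0x0001
	_TIMER_MODIFY_STATE                    = 0x0002
)

func main() {
	k32 := syscall.NewLazyDLL("kernel32.dll")
	createTimer := k32.NewProc("CreateWaitableTimerExW")
	h, _, callErr := createTimer.Call(0, 0,
		_CREATE_WAITABLE_TIMER_HIGH_RESOLUTION,
		_SYNCHRONIZE|_TIMER_QUERY_STATE|_TIMER_MODIFY_STATE)
	if h == 0 {
		// Older Windows versions reject the flag; the runtime falls back
		// to the timeBeginPeriod-based path in that case.
		fmt.Println("high resolution timer not available:", callErr)
		return
	}
	syscall.CloseHandle(syscall.Handle(h))
	fmt.Println("high resolution timer available")
}
```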
This is how this change affects benchmark reported in issue #14790 name               old time/op  new time/op  delta ChanToSyscallPing  1.05ms ± 2%  0.68ms ±11%  -35.43%  (p=0.000 n=10+10) The benchmark was run with GOMAXPROCS set to 1. Fixes #8687 Updates #14790 Change-Id: I5b97ba58289c088c17c05292e12e45285c467eae Reviewed-on: https://go-review.googlesource.com/c/go/+/248699 Run-TryBot: Alex Brainman TryBot-Result: Go Bot Trust: Alex Brainman Reviewed-by: Austin Clements --- src/runtime/os_windows.go | 73 +++++++++++++++++++++++++++++++++++++++-- src/runtime/sys_windows_386.s | 36 ++++++++++++++++++++ src/runtime/sys_windows_amd64.s | 32 ++++++++++++++++++ src/runtime/sys_windows_arm.s | 5 +++ 4 files changed, 144 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go index a62e941229..9dd140c952 100644 --- a/src/runtime/os_windows.go +++ b/src/runtime/os_windows.go @@ -21,6 +21,7 @@ const ( //go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll" //go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll" //go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll" +//go:cgo_import_dynamic runtime._CreateWaitableTimerExW CreateWaitableTimerExW%4 "kernel32.dll" //go:cgo_import_dynamic runtime._DuplicateHandle DuplicateHandle%7 "kernel32.dll" //go:cgo_import_dynamic runtime._ExitProcess ExitProcess%1 "kernel32.dll" //go:cgo_import_dynamic runtime._FreeEnvironmentStringsW FreeEnvironmentStringsW%1 "kernel32.dll" @@ -68,6 +69,7 @@ var ( _CreateIoCompletionPort, _CreateThread, _CreateWaitableTimerA, + _CreateWaitableTimerExW, _DuplicateHandle, _ExitProcess, _FreeEnvironmentStringsW, @@ -151,6 +153,8 @@ type mOS struct { waitsema uintptr // semaphore for parking on locks resumesema uintptr // semaphore to indicate suspend/resume + highResTimer uintptr // high resolution timer handle used in usleep + // preemptExtLock synchronizes preemptM with entry/exit from // external C code. // @@ -402,11 +406,21 @@ const osRelaxMinNS = 60 * 1e6 // osRelax is called by the scheduler when transitioning to and from // all Ps being idle. // -// On Windows, it adjusts the system-wide timer resolution. Go needs a +// Some versions of Windows have high resolution timer. For those +// versions osRelax is noop. +// For Windows versions without high resolution timer, osRelax +// adjusts the system-wide timer resolution. Go needs a // high resolution timer while running and there's little extra cost // if we're already using the CPU, but if all Ps are idle there's no // need to consume extra power to drive the high-res timer. func osRelax(relax bool) uint32 { + if haveHighResTimer { + // If the high resolution timer is available, the runtime uses the timer + // to sleep for short durations. This means there's no need to adjust + // the global clock frequency. + return 0 + } + if relax { return uint32(stdcall1(_timeEndPeriod, 1)) } else { @@ -414,6 +428,42 @@ func osRelax(relax bool) uint32 { } } +// haveHighResTimer indicates that the CreateWaitableTimerEx +// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag is available. +var haveHighResTimer = false + +// createHighResTimer calls CreateWaitableTimerEx with +// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag to create high +// resolution timer. createHighResTimer returns new timer +// handle or 0, if CreateWaitableTimerEx failed. 
+func createHighResTimer() uintptr { + const ( + // As per @jstarks, see + // https://github.com/golang/go/issues/8687#issuecomment-656259353 + _CREATE_WAITABLE_TIMER_HIGH_RESOLUTION = 0x00000002 + + _SYNCHRONIZE = 0x00100000 + _TIMER_QUERY_STATE = 0x0001 + _TIMER_MODIFY_STATE = 0x0002 + ) + return stdcall4(_CreateWaitableTimerExW, 0, 0, + _CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, + _SYNCHRONIZE|_TIMER_QUERY_STATE|_TIMER_MODIFY_STATE) +} + +func initHighResTimer() { + if GOARCH == "arm" { + // TODO: Not yet implemented. + return + } + h := createHighResTimer() + if h != 0 { + haveHighResTimer = true + usleep2Addr = unsafe.Pointer(funcPC(usleep2HighRes)) + stdcall1(_CloseHandle, h) + } +} + func osinit() { asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall)) usleep2Addr = unsafe.Pointer(funcPC(usleep2)) @@ -429,6 +479,7 @@ func osinit() { stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1) + initHighResTimer() timeBeginPeriodRetValue = osRelax(false) ncpu = getproccount() @@ -844,9 +895,20 @@ func minit() { var thandle uintptr stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS) + // Configure usleep timer, if possible. + var timer uintptr + if haveHighResTimer { + timer = createHighResTimer() + if timer == 0 { + print("runtime: CreateWaitableTimerEx failed; errno=", getlasterror(), "\n") + throw("CreateWaitableTimerEx when creating timer failed") + } + } + mp := getg().m lock(&mp.threadLock) mp.thread = thandle + mp.highResTimer = timer unlock(&mp.threadLock) // Query the true stack base from the OS. Currently we're @@ -884,6 +946,10 @@ func unminit() { lock(&mp.threadLock) stdcall1(_CloseHandle, mp.thread) mp.thread = 0 + if mp.highResTimer != 0 { + stdcall1(_CloseHandle, mp.highResTimer) + mp.highResTimer = 0 + } unlock(&mp.threadLock) } @@ -976,9 +1042,12 @@ func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr { return stdcall(fn) } -// in sys_windows_386.s and sys_windows_amd64.s +// In sys_windows_386.s and sys_windows_amd64.s. func onosstack(fn unsafe.Pointer, arg uint32) + +// These are not callable functions. They should only be called via onosstack. func usleep2(usec uint32) +func usleep2HighRes(usec uint32) func switchtothread() var usleep2Addr unsafe.Pointer diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s index 9e1f40925d..4ac1527ab1 100644 --- a/src/runtime/sys_windows_386.s +++ b/src/runtime/sys_windows_386.s @@ -428,6 +428,42 @@ TEXT runtime·usleep2(SB),NOSPLIT,$20 MOVL BP, SP RET +// Runs on OS stack. duration (in 100ns units) is in BX. +TEXT runtime·usleep2HighRes(SB),NOSPLIT,$36 + // Want negative 100ns units. + NEGL BX + MOVL $-1, hi-4(SP) + MOVL BX, lo-8(SP) + + get_tls(CX) + MOVL g(CX), CX + MOVL g_m(CX), CX + MOVL (m_mOS+mOS_highResTimer)(CX), CX + MOVL CX, saved_timer-12(SP) + + MOVL $0, fResume-16(SP) + MOVL $0, lpArgToCompletionRoutine-20(SP) + MOVL $0, pfnCompletionRoutine-24(SP) + MOVL $0, lPeriod-28(SP) + LEAL lo-8(SP), BX + MOVL BX, lpDueTime-32(SP) + MOVL CX, hTimer-36(SP) + MOVL SP, BP + MOVL runtime·_SetWaitableTimer(SB), AX + CALL AX + MOVL BP, SP + + MOVL $0, ptime-28(SP) + MOVL $0, alertable-32(SP) + MOVL saved_timer-12(SP), CX + MOVL CX, handle-36(SP) + MOVL SP, BP + MOVL runtime·_NtWaitForSingleObject(SB), AX + CALL AX + MOVL BP, SP + + RET + // Runs on OS stack. 
TEXT runtime·switchtothread(SB),NOSPLIT,$0 MOVL SP, BP diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index 6c8eecd4e7..847542592b 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -457,6 +457,38 @@ TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$48 MOVQ 40(SP), SP RET +// Runs on OS stack. duration (in 100ns units) is in BX. +TEXT runtime·usleep2HighRes(SB),NOSPLIT|NOFRAME,$72 + MOVQ SP, AX + ANDQ $~15, SP // alignment as per Windows requirement + MOVQ AX, 64(SP) + + get_tls(CX) + MOVQ g(CX), CX + MOVQ g_m(CX), CX + MOVQ (m_mOS+mOS_highResTimer)(CX), CX // hTimer + MOVQ CX, 48(SP) // save hTimer for later + // Want negative 100ns units. + NEGQ BX + LEAQ 56(SP), DX // lpDueTime + MOVQ BX, (DX) + MOVQ $0, R8 // lPeriod + MOVQ $0, R9 // pfnCompletionRoutine + MOVQ $0, AX + MOVQ AX, 32(SP) // lpArgToCompletionRoutine + MOVQ AX, 40(SP) // fResume + MOVQ runtime·_SetWaitableTimer(SB), AX + CALL AX + + MOVQ 48(SP), CX // handle + MOVQ $0, DX // alertable + MOVQ $0, R8 // ptime + MOVQ runtime·_NtWaitForSingleObject(SB), AX + CALL AX + + MOVQ 64(SP), SP + RET + // Runs on OS stack. TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0 MOVQ SP, AX diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s index 256b5ff7f0..57415e1306 100644 --- a/src/runtime/sys_windows_arm.s +++ b/src/runtime/sys_windows_arm.s @@ -468,6 +468,11 @@ TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$0 MOVW R4, R13 // Restore SP MOVM.IA.W (R13), [R4, R15] // pop {R4, pc} +// Runs on OS stack. Duration (in 100ns units) is in R0. +// TODO: neeeds to be implemented properly. +TEXT runtime·usleep2HighRes(SB),NOSPLIT|NOFRAME,$0 + B runtime·abort(SB) + // Runs on OS stack. TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0 MOVM.DB.W [R4, R14], (R13) // push {R4, lr} -- cgit v1.2.1 From 0040adfd0f98e1012837b0317fabf69cf1f8855b Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Fri, 16 Oct 2020 21:39:36 -0400 Subject: runtime: define ios/arm64 entry points Updates #38485. Change-Id: I030346c7f0c3ce89209588525b210284fdea4efd Reviewed-on: https://go-review.googlesource.com/c/go/+/263638 Trust: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Go Bot Reviewed-by: Than McIntosh --- src/runtime/rt0_ios_arm64.s | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/runtime/rt0_ios_arm64.s (limited to 'src/runtime') diff --git a/src/runtime/rt0_ios_arm64.s b/src/runtime/rt0_ios_arm64.s new file mode 100644 index 0000000000..dcc83656e2 --- /dev/null +++ b/src/runtime/rt0_ios_arm64.s @@ -0,0 +1,14 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// internal linking executable entry point. +// ios/arm64 only supports external linking. +TEXT _rt0_arm64_ios(SB),NOSPLIT|NOFRAME,$0 + UNDEF + +// library entry point. +TEXT _rt0_arm64_ios_lib(SB),NOSPLIT|NOFRAME,$0 + JMP _rt0_arm64_darwin_lib(SB) -- cgit v1.2.1 From 2754d911641c3a4569f48d61c541fc2ac395d23b Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 19 Oct 2020 17:55:55 +0000 Subject: runtime: add lock rank partial-order edge between fin and mheap finlock may be held across a write barrier, which could then acquire the mheap lock. Notably, this occurs in the mp.unlockf write in gopark where finlock is held by the finalizer goroutines and is going to sleep. Fixes #42062. 
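The edge being added records that holding finlock is now an admissible state when the mheap lock is taken. Roughly, the rank checker walks the locks currently held and requires each of their ranks to appear in the acquired lock's allowed-predecessor list; a self-contained sketch of that invariant (simplified, with made-up rank names; the real checker in the runtime uses its own types and throws on a violation):

```
package main

import "fmt"

type lockRank int

const (
	rankFin lockRank = iota
	rankMheap
)

// allowedBefore[r] lists ranks that may already be held when a lock of
// rank r is acquired. This change effectively adds rankFin to rankMheap's
// list, so a write barrier taken while finlock is held is no longer flagged.
var allowedBefore = map[lockRank][]lockRank{
	rankMheap: {rankFin},
}

func checkAcquire(held []lockRank, acquiring lockRank) error {
	for _, h := range held {
		ok := false
		for _, a := range allowedBefore[acquiring] {
			if h == a {
				ok = true
				break
			}
		}
		if !ok {
			return fmt.Errorf("lock ordering problem: rank %d held while acquiring rank %d", h, acquiring)
		}
	}
	return nil
}

func main() {
	// finlock held by a finalizer goroutine going to sleep, then the write
	// barrier acquires the mheap lock.
	fmt.Println(checkAcquire([]lockRank{rankFin}, rankMheap)) // <nil> with the new edge
}
```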
Change-Id: Icf76637ae6fc12795436272633dca3d473780875 Reviewed-on: https://go-review.googlesource.com/c/go/+/263678 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Dan Scales --- src/runtime/lockrank.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go index 042f10b1d3..0cbbfc4f45 100644 --- a/src/runtime/lockrank.go +++ b/src/runtime/lockrank.go @@ -231,7 +231,7 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankDefer: {}, lockRankSudog: {lockRankNotifyList, lockRankHchan}, lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog}, - lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine}, + lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankFin, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine}, lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial}, -- cgit v1.2.1 From 7bb721b9384bdd196befeaed593b185f7f2a5589 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 7 Jul 2020 13:49:21 -0400 Subject: all: update references to symbols moved from os to io/fs The old os references are still valid, but update our code to reflect best practices and get used to the new locations. Code compiled with the bootstrap toolchain (cmd/asm, cmd/dist, cmd/compile, debug/elf) must remain Go 1.4-compatible and is excluded. For #41190. 
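Within src/runtime the change is confined to testdata/testprogcgo/exec.go, where error inspection that named *os.PathError now names *fs.PathError; the old spellings keep compiling. A small standalone example of the updated style (illustrative, not taken from the CL):

```
package main

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
)

func main() {
	_, err := os.Open("no-such-file")

	var pe *fs.PathError // previously written as *os.PathError
	fmt.Println(errors.As(err, &pe))            // true
	fmt.Println(errors.Is(err, fs.ErrNotExist)) // true; os.IsNotExist(err) still works too
}
```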
Change-Id: I8f9526977867c10a221e2f392f78d7dec073f1bd Reviewed-on: https://go-review.googlesource.com/c/go/+/243907 Trust: Russ Cox Run-TryBot: Russ Cox TryBot-Result: Go Bot Reviewed-by: Rob Pike --- src/runtime/testdata/testprogcgo/exec.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/testdata/testprogcgo/exec.go b/src/runtime/testdata/testprogcgo/exec.go index 94da5dc526..15723c7369 100644 --- a/src/runtime/testdata/testprogcgo/exec.go +++ b/src/runtime/testdata/testprogcgo/exec.go @@ -31,6 +31,7 @@ import "C" import ( "fmt" + "io/fs" "os" "os/exec" "os/signal" @@ -98,7 +99,7 @@ func CgoExecSignalMask() { // isEAGAIN reports whether err is an EAGAIN error from a process execution. func isEAGAIN(err error) bool { - if p, ok := err.(*os.PathError); ok { + if p, ok := err.(*fs.PathError); ok { err = p.Err } return err == syscall.EAGAIN -- cgit v1.2.1 From de932da453f68b8fc04e9c2ab25136748173c806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=B6hrmann?= Date: Tue, 13 Oct 2020 22:30:23 +0200 Subject: internal/cpu: consolidate arm64 feature detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move code to detect and mask arm64 CPU features from runtime to internal/cpu. Change-Id: Ib784e2ff056e8def125d68827b852f07a3eff0db Reviewed-on: https://go-review.googlesource.com/c/go/+/261878 Trust: Martin Möhrmann Trust: Tobias Klauser Run-TryBot: Tobias Klauser TryBot-Result: Go Bot Reviewed-by: Tobias Klauser Reviewed-by: Benny Siegert --- src/runtime/auxv_none.go | 1 - src/runtime/os_freebsd_arm64.go | 143 --------------------------------------- src/runtime/os_freebsd_noauxv.go | 2 +- src/runtime/os_linux_arm64.go | 14 +--- src/runtime/os_netbsd.go | 1 - src/runtime/os_netbsd_386.go | 3 - src/runtime/os_netbsd_amd64.go | 3 - src/runtime/os_netbsd_arm.go | 3 - src/runtime/os_netbsd_arm64.go | 12 +--- src/runtime/os_openbsd_arm64.go | 11 --- src/runtime/sys_freebsd_arm64.s | 21 ------ 11 files changed, 3 insertions(+), 211 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/auxv_none.go b/src/runtime/auxv_none.go index 3a560a1793..3ca617b21e 100644 --- a/src/runtime/auxv_none.go +++ b/src/runtime/auxv_none.go @@ -7,7 +7,6 @@ // +build !dragonfly // +build !freebsd // +build !netbsd -// +build !openbsd !arm64 // +build !solaris package runtime diff --git a/src/runtime/os_freebsd_arm64.go b/src/runtime/os_freebsd_arm64.go index 51ebf9d478..b5b25f0dc5 100644 --- a/src/runtime/os_freebsd_arm64.go +++ b/src/runtime/os_freebsd_arm64.go @@ -4,149 +4,6 @@ package runtime -import "internal/cpu" - -const ( - hwcap_FP = 1 << 0 - hwcap_ASIMD = 1 << 1 - hwcap_EVTSTRM = 1 << 2 - hwcap_AES = 1 << 3 - hwcap_PMULL = 1 << 4 - hwcap_SHA1 = 1 << 5 - hwcap_SHA2 = 1 << 6 - hwcap_CRC32 = 1 << 7 - hwcap_ATOMICS = 1 << 8 - hwcap_FPHP = 1 << 9 - hwcap_ASIMDHP = 1 << 10 - hwcap_CPUID = 1 << 11 - hwcap_ASIMDRDM = 1 << 12 - hwcap_JSCVT = 1 << 13 - hwcap_FCMA = 1 << 14 - hwcap_LRCPC = 1 << 15 - hwcap_DCPOP = 1 << 16 - hwcap_SHA3 = 1 << 17 - hwcap_SM3 = 1 << 18 - hwcap_SM4 = 1 << 19 - hwcap_ASIMDDP = 1 << 20 - hwcap_SHA512 = 1 << 21 - hwcap_SVE = 1 << 22 - hwcap_ASIMDFHM = 1 << 23 -) - -func getisar0() uint64 -func getisar1() uint64 -func getpfr0() uint64 - -// no hwcap support on FreeBSD aarch64, we need to retrieve the info from -// ID_AA64ISAR0_EL1, ID_AA64ISAR1_EL1 and ID_AA64PFR0_EL1 -func archauxv(tag, val uintptr) { - var isar0, isar1, pfr0 uint64 - - isar0 = getisar0() - isar1 = getisar1() - pfr0 = 
getpfr0() - - // ID_AA64ISAR0_EL1 - switch extractBits(isar0, 4, 7) { - case 1: - cpu.HWCap |= hwcap_AES - case 2: - cpu.HWCap |= hwcap_PMULL | hwcap_AES - } - - switch extractBits(isar0, 8, 11) { - case 1: - cpu.HWCap |= hwcap_SHA1 - } - - switch extractBits(isar0, 12, 15) { - case 1: - cpu.HWCap |= hwcap_SHA2 - case 2: - cpu.HWCap |= hwcap_SHA2 | hwcap_SHA512 - } - - switch extractBits(isar0, 16, 19) { - case 1: - cpu.HWCap |= hwcap_CRC32 - } - - switch extractBits(isar0, 20, 23) { - case 2: - cpu.HWCap |= hwcap_ATOMICS - } - - switch extractBits(isar0, 28, 31) { - case 1: - cpu.HWCap |= hwcap_ASIMDRDM - } - - switch extractBits(isar0, 32, 35) { - case 1: - cpu.HWCap |= hwcap_SHA3 - } - - switch extractBits(isar0, 36, 39) { - case 1: - cpu.HWCap |= hwcap_SM3 - } - - switch extractBits(isar0, 40, 43) { - case 1: - cpu.HWCap |= hwcap_SM4 - } - - switch extractBits(isar0, 44, 47) { - case 1: - cpu.HWCap |= hwcap_ASIMDDP - } - - // ID_AA64ISAR1_EL1 - switch extractBits(isar1, 0, 3) { - case 1: - cpu.HWCap |= hwcap_DCPOP - } - - switch extractBits(isar1, 12, 15) { - case 1: - cpu.HWCap |= hwcap_JSCVT - } - - switch extractBits(isar1, 16, 19) { - case 1: - cpu.HWCap |= hwcap_FCMA - } - - switch extractBits(isar1, 20, 23) { - case 1: - cpu.HWCap |= hwcap_LRCPC - } - - // ID_AA64PFR0_EL1 - switch extractBits(pfr0, 16, 19) { - case 0: - cpu.HWCap |= hwcap_FP - case 1: - cpu.HWCap |= hwcap_FP | hwcap_FPHP - } - - switch extractBits(pfr0, 20, 23) { - case 0: - cpu.HWCap |= hwcap_ASIMD - case 1: - cpu.HWCap |= hwcap_ASIMD | hwcap_ASIMDHP - } - - switch extractBits(pfr0, 32, 35) { - case 1: - cpu.HWCap |= hwcap_SVE - } -} - -func extractBits(data uint64, start, end uint) uint { - return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) -} - //go:nosplit func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed fastrand(). diff --git a/src/runtime/os_freebsd_noauxv.go b/src/runtime/os_freebsd_noauxv.go index c6a49927c8..01efb9b7c9 100644 --- a/src/runtime/os_freebsd_noauxv.go +++ b/src/runtime/os_freebsd_noauxv.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // +build freebsd -// +build !arm,!arm64 +// +build !arm package runtime diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go index 19968dc164..c5fd742048 100644 --- a/src/runtime/os_linux_arm64.go +++ b/src/runtime/os_linux_arm64.go @@ -11,19 +11,7 @@ import "internal/cpu" func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: - // arm64 doesn't have a 'cpuid' instruction equivalent and relies on - // HWCAP/HWCAP2 bits for hardware capabilities. - hwcap := uint(val) - if GOOS == "android" { - // The Samsung S9+ kernel reports support for atomics, but not all cores - // actually support them, resulting in SIGILL. See issue #28431. - // TODO(elias.naur): Only disable the optimization on bad chipsets. 
- const hwcap_ATOMICS = 1 << 8 - hwcap &= ^uint(hwcap_ATOMICS) - } - cpu.HWCap = hwcap - case _AT_HWCAP2: - cpu.HWCap2 = uint(val) + cpu.HWCap = uint(val) } } diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go index c4c3d8e2fe..f7f90cedc1 100644 --- a/src/runtime/os_netbsd.go +++ b/src/runtime/os_netbsd.go @@ -359,7 +359,6 @@ func sysargs(argc int32, argv **byte) { // now argv+n is auxv auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize)) sysauxv(auxv[:]) - archauxv(auxv[:]) } const ( diff --git a/src/runtime/os_netbsd_386.go b/src/runtime/os_netbsd_386.go index c203af9cef..037f7e36dc 100644 --- a/src/runtime/os_netbsd_386.go +++ b/src/runtime/os_netbsd_386.go @@ -14,6 +14,3 @@ func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintp mc.__gregs[_REG_EDX] = uint32(uintptr(unsafe.Pointer(gp))) mc.__gregs[_REG_ESI] = uint32(fn) } - -func archauxv(auxv []uintptr) { -} diff --git a/src/runtime/os_netbsd_amd64.go b/src/runtime/os_netbsd_amd64.go index ea9d125492..5118b0c4ff 100644 --- a/src/runtime/os_netbsd_amd64.go +++ b/src/runtime/os_netbsd_amd64.go @@ -14,6 +14,3 @@ func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintp mc.__gregs[_REG_R9] = uint64(uintptr(unsafe.Pointer(gp))) mc.__gregs[_REG_R12] = uint64(fn) } - -func archauxv(auxv []uintptr) { -} diff --git a/src/runtime/os_netbsd_arm.go b/src/runtime/os_netbsd_arm.go index 646da9dc0b..b5ec23e45b 100644 --- a/src/runtime/os_netbsd_arm.go +++ b/src/runtime/os_netbsd_arm.go @@ -32,6 +32,3 @@ func cputicks() int64 { // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. return nanotime() } - -func archauxv(auxv []uintptr) { -} diff --git a/src/runtime/os_netbsd_arm64.go b/src/runtime/os_netbsd_arm64.go index ae2638c778..8d21b0a430 100644 --- a/src/runtime/os_netbsd_arm64.go +++ b/src/runtime/os_netbsd_arm64.go @@ -4,10 +4,7 @@ package runtime -import ( - "internal/cpu" - "unsafe" -) +import "unsafe" func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintptr) { // Machine dependent mcontext initialisation for LWP. @@ -24,10 +21,3 @@ func cputicks() int64 { // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. return nanotime() } - -func archauxv(auxv []uintptr) { - // NetBSD does not supply AT_HWCAP, however we still need to initialise cpu.HWCaps. - // For now specify the bare minimum until we add some form of capabilities - // detection. See issue https://golang.org/issue/30824#issuecomment-494901591 - cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP -} diff --git a/src/runtime/os_openbsd_arm64.go b/src/runtime/os_openbsd_arm64.go index d559a2a3e5..d71de7d196 100644 --- a/src/runtime/os_openbsd_arm64.go +++ b/src/runtime/os_openbsd_arm64.go @@ -4,20 +4,9 @@ package runtime -import ( - "internal/cpu" -) - //go:nosplit func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. return nanotime() } - -func sysargs(argc int32, argv **byte) { - // OpenBSD does not have auxv, however we still need to initialise cpu.HWCaps. - // For now specify the bare minimum until we add some form of capabilities - // detection. See issue #31746. 
- cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP -} diff --git a/src/runtime/sys_freebsd_arm64.s b/src/runtime/sys_freebsd_arm64.s index 2330f2ffe2..8a4f9b7fa1 100644 --- a/src/runtime/sys_freebsd_arm64.s +++ b/src/runtime/sys_freebsd_arm64.s @@ -515,24 +515,3 @@ TEXT runtime·getCntxct(SB),NOSPLIT,$0 MOVW R0, ret+8(FP) RET - -// func getisar0() uint64 -TEXT runtime·getisar0(SB),NOSPLIT,$0 - // get Instruction Set Attributes 0 into R0 - MRS ID_AA64ISAR0_EL1, R0 - MOVD R0, ret+0(FP) - RET - -// func getisar1() uint64 -TEXT runtime·getisar1(SB),NOSPLIT,$0 - // get Instruction Set Attributes 1 into R0 - MRS ID_AA64ISAR1_EL1, R0 - MOVD R0, ret+0(FP) - RET - -// func getpfr0() uint64 -TEXT runtime·getpfr0(SB),NOSPLIT,$0 - // get Processor Feature Register 0 into R0 - MRS ID_AA64PFR0_EL1, R0 - MOVD R0, ret+0(FP) - RET -- cgit v1.2.1 From 05f5ae74bc95b0d77a512029bc1a6739c5d0f181 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Tue, 20 Oct 2020 12:57:14 +0000 Subject: runtime: fix scavenging tests for pallocChunkBytes huge pages and larger Currently the scavenging tests implicitly assume that the system huge page size is always strictly less than 4 MiB, or pallocChunkBytes. This leads to failures on systems with huge pages of this size, and larger. Filter out those tests on such platforms and add a test for the 4 MiB case. The scavenger is already equipped to handle this case. Huge page sizes > 4 MiB are effectively ignored, so also add a test case to ensure that happens. Unfortunately we can't actually run these tests in our CI because they require the platform to provide the right huge page size, but we really should just parameterize this value so we can test it (there's a TODO about this already). Fixes #42053. Change-Id: Ia576cbf67e178a14a178a893967efbed27d6eb17 Reviewed-on: https://go-review.googlesource.com/c/go/+/263837 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Keith Randall Trust: Michael Knyszek --- src/runtime/mgcscavenge_test.go | 53 +++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go index 7f619b1e7d..250343077f 100644 --- a/src/runtime/mgcscavenge_test.go +++ b/src/runtime/mgcscavenge_test.go @@ -235,26 +235,43 @@ func TestPallocDataFindScavengeCandidate(t *testing.T) { if PhysHugePageSize > uintptr(PageSize) { // Check hugepage preserving behavior. bits := uint(PhysHugePageSize / uintptr(PageSize)) - tests["PreserveHugePageBottom"] = test{ - alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}}, - min: 1, - max: 3, // Make it so that max would have us try to break the huge page. - want: BitRange{0, bits + 2}, - } - if 3*bits < PallocChunkPages { - // We need at least 3 huge pages in a chunk for this test to make sense. - tests["PreserveHugePageMiddle"] = test{ - alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}}, + if bits < PallocChunkPages { + tests["PreserveHugePageBottom"] = test{ + alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}}, min: 1, - max: 12, // Make it so that max would have us try to break the huge page. - want: BitRange{bits, bits + 10}, + max: 3, // Make it so that max would have us try to break the huge page. + want: BitRange{0, bits + 2}, + } + if 3*bits < PallocChunkPages { + // We need at least 3 huge pages in a chunk for this test to make sense. 
+ tests["PreserveHugePageMiddle"] = test{ + alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}}, + min: 1, + max: 12, // Make it so that max would have us try to break the huge page. + want: BitRange{bits, bits + 10}, + } + } + tests["PreserveHugePageTop"] = test{ + alloc: []BitRange{{0, PallocChunkPages - bits}}, + min: 1, + max: 1, // Even one page would break a huge page in this case. + want: BitRange{PallocChunkPages - bits, bits}, + } + } else if bits == PallocChunkPages { + tests["PreserveHugePageAll"] = test{ + min: 1, + max: 1, // Even one page would break a huge page in this case. + want: BitRange{0, PallocChunkPages}, + } + } else { + // The huge page size is greater than pallocChunkPages, so it should + // be effectively disabled. There's no way we can possible scavenge + // a huge page out of this bitmap chunk. + tests["PreserveHugePageNone"] = test{ + min: 1, + max: 1, + want: BitRange{PallocChunkPages - 1, 1}, } - } - tests["PreserveHugePageTop"] = test{ - alloc: []BitRange{{0, PallocChunkPages - bits}}, - min: 1, - max: 1, // Even one page would break a huge page in this case. - want: BitRange{PallocChunkPages - bits, bits}, } } for name, v := range tests { -- cgit v1.2.1 From 1b09d430678d4a6f73b2443463d11f75851aba8a Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Fri, 16 Oct 2020 00:49:02 -0400 Subject: all: update references to symbols moved from io/ioutil to io The old ioutil references are still valid, but update our code to reflect best practices and get used to the new locations. Code compiled with the bootstrap toolchain (cmd/asm, cmd/dist, cmd/compile, debug/elf) must remain Go 1.4-compatible and is excluded. Also excluded vendored code. For #41190. Change-Id: I6d86f2bf7bc37a9d904b6cee3fe0c7af6d94d5b1 Reviewed-on: https://go-review.googlesource.com/c/go/+/263142 Trust: Russ Cox Run-TryBot: Russ Cox TryBot-Result: Go Bot Reviewed-by: Emmanuel Odeke --- src/runtime/crash_unix_test.go | 2 +- src/runtime/testdata/testprogcgo/eintr.go | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go index 8ef52aba48..fc87f37408 100644 --- a/src/runtime/crash_unix_test.go +++ b/src/runtime/crash_unix_test.go @@ -241,7 +241,7 @@ func TestPanicSystemstack(t *testing.T) { } // Get traceback. - tb, err := ioutil.ReadAll(pr) + tb, err := io.ReadAll(pr) if err != nil { t.Fatal("reading traceback from pipe: ", err) } diff --git a/src/runtime/testdata/testprogcgo/eintr.go b/src/runtime/testdata/testprogcgo/eintr.go index 791ff1bedc..1722a75eb9 100644 --- a/src/runtime/testdata/testprogcgo/eintr.go +++ b/src/runtime/testdata/testprogcgo/eintr.go @@ -32,7 +32,6 @@ import ( "errors" "fmt" "io" - "io/ioutil" "log" "net" "os" @@ -242,5 +241,5 @@ func testExec(wg *sync.WaitGroup) { // Block blocks until stdin is closed. func Block() { - io.Copy(ioutil.Discard, os.Stdin) + io.Copy(io.Discard, os.Stdin) } -- cgit v1.2.1 From 7f736694fe9b254efa7155a0a5da87c2c18e6078 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Tue, 20 Oct 2020 12:54:20 -0700 Subject: runtime: use GOTRACEBACK=system for TestCgoExecSignalMask Try to get a bit more information to understand #42093. For #42093 Change-Id: I818feb08d7561151d52eba3e88c418b55b9f9c1e Reviewed-on: https://go-review.googlesource.com/c/go/+/264018 Trust: Ian Lance Taylor Run-TryBot: Ian Lance Taylor Reviewed-by: Bryan C. 
Mills TryBot-Result: Go Bot --- src/runtime/crash_cgo_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go index 4872189f16..b200984050 100644 --- a/src/runtime/crash_cgo_test.go +++ b/src/runtime/crash_cgo_test.go @@ -154,7 +154,7 @@ func TestCgoExecSignalMask(t *testing.T) { case "windows", "plan9": t.Skipf("skipping signal mask test on %s", runtime.GOOS) } - got := runTestProg(t, "testprogcgo", "CgoExecSignalMask") + got := runTestProg(t, "testprogcgo", "CgoExecSignalMask", "GOTRACEBACK=system") want := "OK\n" if got != want { t.Errorf("expected %q, got %v", want, got) -- cgit v1.2.1 From 15ead857dbc638b9d83a7686acf0dc746fc45918 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Wed, 9 Sep 2020 17:24:23 -0500 Subject: cmd/compiler,cmd/go,sync: add internal {LoadAcq,StoreRel}64 on ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an internal atomic intrinsic for load with acquire semantics (extending LoadAcq to 64b) and add LoadAcquintptr for internal use within the sync package. For other arches, this remaps to the appropriate atomic.Load{,64} intrinsic which should not alter code generation. Similarly, add StoreRel{uintptr,64} for consistency, and inline. Finally, add an exception to allow sync to directly use the runtime/internal/atomic package which avoids more convoluted workarounds (contributed by Lynn Boger). In an extreme example, sync.(*Pool).pin consumes 20% of wall time during fmt tests. This is reduced to 5% on ppc64le/power9. From the fmt benchmarks on ppc64le: name old time/op new time/op delta SprintfPadding 468ns ± 0% 451ns ± 0% -3.63% SprintfEmpty 73.3ns ± 0% 51.9ns ± 0% -29.20% SprintfString 135ns ± 0% 122ns ± 0% -9.63% SprintfTruncateString 232ns ± 0% 214ns ± 0% -7.76% SprintfTruncateBytes 216ns ± 0% 202ns ± 0% -6.48% SprintfSlowParsingPath 162ns ± 0% 142ns ± 0% -12.35% SprintfQuoteString 1.00µs ± 0% 0.99µs ± 0% -1.39% SprintfInt 117ns ± 0% 104ns ± 0% -11.11% SprintfIntInt 190ns ± 0% 175ns ± 0% -7.89% SprintfPrefixedInt 232ns ± 0% 212ns ± 0% -8.62% SprintfFloat 270ns ± 0% 255ns ± 0% -5.56% SprintfComplex 1.01µs ± 0% 0.99µs ± 0% -1.68% SprintfBoolean 127ns ± 0% 111ns ± 0% -12.60% SprintfHexString 220ns ± 0% 198ns ± 0% -10.00% SprintfHexBytes 261ns ± 0% 252ns ± 0% -3.45% SprintfBytes 600ns ± 0% 590ns ± 0% -1.67% SprintfStringer 684ns ± 0% 658ns ± 0% -3.80% SprintfStructure 2.57µs ± 0% 2.57µs ± 0% -0.12% ManyArgs 669ns ± 0% 646ns ± 0% -3.44% FprintInt 140ns ± 0% 136ns ± 0% -2.86% FprintfBytes 184ns ± 0% 181ns ± 0% -1.63% FprintIntNoAlloc 140ns ± 0% 136ns ± 0% -2.86% ScanInts 929µs ± 0% 921µs ± 0% -0.79% ScanRecursiveInt 122ms ± 0% 121ms ± 0% -0.11% ScanRecursiveIntReaderWrapper 122ms ± 0% 122ms ± 0% -0.18% Change-Id: I4d66780261b57b06ef600229e475462e7313f0d6 Reviewed-on: https://go-review.googlesource.com/c/go/+/253748 Run-TryBot: Lynn Boger Reviewed-by: Lynn Boger Reviewed-by: Keith Randall Trust: Lynn Boger TryBot-Result: Go Bot --- src/runtime/internal/atomic/asm_386.s | 3 +++ src/runtime/internal/atomic/asm_amd64.s | 6 ++++++ src/runtime/internal/atomic/asm_arm.s | 6 ++++++ src/runtime/internal/atomic/asm_mips64x.s | 6 ++++++ src/runtime/internal/atomic/asm_mipsx.s | 3 +++ src/runtime/internal/atomic/asm_ppc64x.s | 13 +++++++++++++ src/runtime/internal/atomic/atomic_386.go | 9 +++++++++ src/runtime/internal/atomic/atomic_amd64.go | 18 ++++++++++++++++++ src/runtime/internal/atomic/atomic_arm.go | 6 ++++++ 
src/runtime/internal/atomic/atomic_arm64.go | 12 ++++++++++++ src/runtime/internal/atomic/atomic_arm64.s | 14 ++++++++++++++ src/runtime/internal/atomic/atomic_mips64x.go | 12 ++++++++++++ src/runtime/internal/atomic/atomic_mips64x.s | 8 ++++++++ src/runtime/internal/atomic/atomic_mipsx.go | 6 ++++++ src/runtime/internal/atomic/atomic_ppc64x.go | 12 ++++++++++++ src/runtime/internal/atomic/atomic_ppc64x.s | 20 ++++++++++++++++++++ src/runtime/internal/atomic/atomic_riscv64.go | 12 ++++++++++++ src/runtime/internal/atomic/atomic_riscv64.s | 12 ++++++++++++ src/runtime/internal/atomic/atomic_s390x.go | 24 ++++++++++++++++++++++++ src/runtime/internal/atomic/atomic_wasm.go | 24 ++++++++++++++++++++++++ 20 files changed, 226 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s index bcefff373f..7ebf675ac5 100644 --- a/src/runtime/internal/atomic/asm_386.s +++ b/src/runtime/internal/atomic/asm_386.s @@ -189,6 +189,9 @@ TEXT ·Store(SB), NOSPLIT, $0-8 TEXT ·StoreRel(SB), NOSPLIT, $0-8 JMP ·Store(SB) +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-8 + JMP runtime∕internal∕atomic·Store(SB) + // uint64 atomicload64(uint64 volatile* addr); TEXT ·Load64(SB), NOSPLIT, $0-12 NO_LOCAL_POINTERS diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s index 90c56424c9..80fb31285d 100644 --- a/src/runtime/internal/atomic/asm_amd64.s +++ b/src/runtime/internal/atomic/asm_amd64.s @@ -136,6 +136,12 @@ TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12 TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12 JMP runtime∕internal∕atomic·Store(SB) +TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16 + JMP runtime∕internal∕atomic·Store64(SB) + +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP runtime∕internal∕atomic·Store64(SB) + TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9 MOVQ ptr+0(FP), BX MOVB val+8(FP), AX diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s index c3d1d9025d..274925ed60 100644 --- a/src/runtime/internal/atomic/asm_arm.s +++ b/src/runtime/internal/atomic/asm_arm.s @@ -57,6 +57,9 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-8 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8 B ·Load(SB) +TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-8 + B ·Load(SB) + TEXT ·Casuintptr(SB),NOSPLIT,$0-13 B ·Cas(SB) @@ -81,6 +84,9 @@ TEXT ·StorepNoWB(SB),NOSPLIT,$0-8 TEXT ·StoreRel(SB),NOSPLIT,$0-8 B ·Store(SB) +TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8 + B ·Store(SB) + TEXT ·Xadduintptr(SB),NOSPLIT,$0-12 B ·Xadd(SB) diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s index 3290fb726a..03fb822929 100644 --- a/src/runtime/internal/atomic/asm_mips64x.s +++ b/src/runtime/internal/atomic/asm_mips64x.s @@ -158,6 +158,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16 TEXT ·StoreRel(SB), NOSPLIT, $0-12 JMP ·Store(SB) +TEXT ·StoreRel64(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + +TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + TEXT ·Store(SB), NOSPLIT, $0-12 MOVV ptr+0(FP), R1 MOVW val+8(FP), R2 diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s index 62811a6599..63bb548825 100644 --- a/src/runtime/internal/atomic/asm_mipsx.s +++ b/src/runtime/internal/atomic/asm_mipsx.s @@ -122,6 +122,9 @@ TEXT ·StorepNoWB(SB),NOSPLIT,$0-8 TEXT ·StoreRel(SB),NOSPLIT,$0-8 JMP ·Store(SB) +TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8 + JMP ·Store(SB) + // void 
Or8(byte volatile*, byte); TEXT ·Or8(SB),NOSPLIT,$0-5 MOVW ptr+0(FP), R1 diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s index 06dc931bf4..c0237de4d0 100644 --- a/src/runtime/internal/atomic/asm_ppc64x.s +++ b/src/runtime/internal/atomic/asm_ppc64x.s @@ -83,12 +83,18 @@ TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25 TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT|NOFRAME, $0-16 BR runtime∕internal∕atomic·Load64(SB) +TEXT runtime∕internal∕atomic·LoadAcquintptr(SB), NOSPLIT|NOFRAME, $0-16 + BR runtime∕internal∕atomic·LoadAcq64(SB) + TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT|NOFRAME, $0-16 BR runtime∕internal∕atomic·Load64(SB) TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16 BR runtime∕internal∕atomic·Store64(SB) +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16 + BR runtime∕internal∕atomic·StoreRel64(SB) + TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24 BR runtime∕internal∕atomic·Xadd64(SB) @@ -191,6 +197,13 @@ TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12 MOVW R4, 0(R3) RET +TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16 + MOVD ptr+0(FP), R3 + MOVD val+8(FP), R4 + LWSYNC + MOVD R4, 0(R3) + RET + // void runtime∕internal∕atomic·Or8(byte volatile*, byte); TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9 MOVD ptr+0(FP), R3 diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go index 8d002ebfe3..06ce6a5356 100644 --- a/src/runtime/internal/atomic/atomic_386.go +++ b/src/runtime/internal/atomic/atomic_386.go @@ -30,6 +30,12 @@ func LoadAcq(ptr *uint32) uint32 { return *ptr } +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + //go:noescape func Xadd64(ptr *uint64, delta int64) uint64 @@ -83,5 +89,8 @@ func Store64(ptr *uint64, val uint64) //go:noescape func StoreRel(ptr *uint32, val uint32) +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + // NO go:noescape annotation; see atomic_pointer.go. func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) diff --git a/src/runtime/internal/atomic/atomic_amd64.go b/src/runtime/internal/atomic/atomic_amd64.go index 14b8101720..1b71a16d94 100644 --- a/src/runtime/internal/atomic/atomic_amd64.go +++ b/src/runtime/internal/atomic/atomic_amd64.go @@ -35,6 +35,18 @@ func LoadAcq(ptr *uint32) uint32 { return *ptr } +//go:nosplit +//go:noinline +func LoadAcq64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + //go:noescape func Xadd(ptr *uint32, delta int32) uint32 @@ -85,6 +97,12 @@ func Store64(ptr *uint64, val uint64) //go:noescape func StoreRel(ptr *uint32, val uint32) +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + // StorepNoWB performs *ptr = val atomically and without a write // barrier. 
// diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go index 95713afcc1..67d529c1cb 100644 --- a/src/runtime/internal/atomic/atomic_arm.go +++ b/src/runtime/internal/atomic/atomic_arm.go @@ -81,6 +81,9 @@ func Store(addr *uint32, v uint32) //go:noescape func StoreRel(addr *uint32, v uint32) +//go:noescape +func StoreReluintptr(addr *uintptr, v uintptr) + //go:nosplit func goCas64(addr *uint64, old, new uint64) bool { if uintptr(unsafe.Pointer(addr))&7 != 0 { @@ -194,6 +197,9 @@ func Load8(addr *uint8) uint8 //go:noescape func LoadAcq(addr *uint32) uint32 +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + //go:noescape func Cas64(addr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go index 26ca94d54c..c9b4322fe9 100644 --- a/src/runtime/internal/atomic/atomic_arm64.go +++ b/src/runtime/internal/atomic/atomic_arm64.go @@ -41,6 +41,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer //go:noescape func LoadAcq(addr *uint32) uint32 +//go:noescape +func LoadAcq64(ptr *uint64) uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + //go:noescape func Or8(ptr *uint8, val uint8) @@ -67,3 +73,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) //go:noescape func StoreRel(ptr *uint32, val uint32) + +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s index a2eb7568d2..36c7698b18 100644 --- a/src/runtime/internal/atomic/atomic_arm64.s +++ b/src/runtime/internal/atomic/atomic_arm64.s @@ -36,12 +36,26 @@ TEXT ·Loadp(SB),NOSPLIT,$0-16 TEXT ·LoadAcq(SB),NOSPLIT,$0-12 B ·Load(SB) +// uint64 runtime∕internal∕atomic·LoadAcquintptr(uint64 volatile* addr) +TEXT ·LoadAcq64(SB),NOSPLIT,$0-16 + B ·Load64(SB) + +// uintptr runtime∕internal∕atomic·LoadAcq64(uintptr volatile* addr) +TEXT ·LoadAcquintptr(SB),NOSPLIT,$0-16 + B ·Load64(SB) + TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16 B runtime∕internal∕atomic·Store64(SB) TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12 B runtime∕internal∕atomic·Store(SB) +TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16 + B runtime∕internal∕atomic·Store64(SB) + +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16 + B runtime∕internal∕atomic·Store64(SB) + TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12 MOVD ptr+0(FP), R0 MOVW val+8(FP), R1 diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go index 1d9977850b..fca2242514 100644 --- a/src/runtime/internal/atomic/atomic_mips64x.go +++ b/src/runtime/internal/atomic/atomic_mips64x.go @@ -41,6 +41,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer //go:noescape func LoadAcq(ptr *uint32) uint32 +//go:noescape +func LoadAcq64(ptr *uint64) uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + //go:noescape func And8(ptr *uint8, val uint8) @@ -69,3 +75,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) //go:noescape func StoreRel(ptr *uint32, val uint32) + +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) diff --git a/src/runtime/internal/atomic/atomic_mips64x.s b/src/runtime/internal/atomic/atomic_mips64x.s index 1ed90937c9..125c0c221c 100644 --- a/src/runtime/internal/atomic/atomic_mips64x.s +++ 
b/src/runtime/internal/atomic/atomic_mips64x.s @@ -47,3 +47,11 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16 // uint32 runtime∕internal∕atomic·LoadAcq(uint32 volatile* ptr) TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12 JMP atomic·Load(SB) + +// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr) +TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16 + JMP atomic·Load64(SB) + +// uintptr runtime∕internal∕atomic·LoadAcquintptr(uintptr volatile* ptr) +TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16 + JMP atomic·Load64(SB) diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go index b99bfe7dbf..be1e6a038b 100644 --- a/src/runtime/internal/atomic/atomic_mipsx.go +++ b/src/runtime/internal/atomic/atomic_mipsx.go @@ -132,6 +132,9 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer //go:noescape func LoadAcq(ptr *uint32) uint32 +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + //go:noescape func And8(ptr *uint8, val uint8) @@ -150,5 +153,8 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) //go:noescape func StoreRel(ptr *uint32, val uint32) +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + //go:noescape func CasRel(addr *uint32, old, new uint32) bool diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go index a48ecf5ee8..e759bb27a2 100644 --- a/src/runtime/internal/atomic/atomic_ppc64x.go +++ b/src/runtime/internal/atomic/atomic_ppc64x.go @@ -41,6 +41,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer //go:noescape func LoadAcq(ptr *uint32) uint32 +//go:noescape +func LoadAcq64(ptr *uint64) uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + //go:noescape func And8(ptr *uint8, val uint8) @@ -67,5 +73,11 @@ func Store64(ptr *uint64, val uint64) //go:noescape func StoreRel(ptr *uint32, val uint32) +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + // NO go:noescape annotation; see atomic_pointer.go. func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) diff --git a/src/runtime/internal/atomic/atomic_ppc64x.s b/src/runtime/internal/atomic/atomic_ppc64x.s index c2f696fb34..b79cdbca34 100644 --- a/src/runtime/internal/atomic/atomic_ppc64x.s +++ b/src/runtime/internal/atomic/atomic_ppc64x.s @@ -6,6 +6,15 @@ #include "textflag.h" + +// For more details about how various memory models are +// enforced on POWER, the following paper provides more +// details about how they enforce C/C++ like models. This +// gives context about why the strange looking code +// sequences below work. 
+// +// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html + // uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr) TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12 MOVD ptr+0(FP), R3 @@ -56,5 +65,16 @@ TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$-8-12 MOVWZ 0(R3), R3 CMPW R3, R3, CR7 BC 4, 30, 1(PC) // bne- cr7, 0x4 + ISYNC MOVW R3, ret+8(FP) RET + +// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr) +TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$-8-16 + MOVD ptr+0(FP), R3 + MOVD 0(R3), R3 + CMP R3, R3, CR7 + BC 4, 30, 1(PC) // bne- cr7, 0x4 + ISYNC + MOVD R3, ret+8(FP) + RET diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go index d52512369e..617bc1a3eb 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.go +++ b/src/runtime/internal/atomic/atomic_riscv64.go @@ -39,6 +39,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer //go:noescape func LoadAcq(ptr *uint32) uint32 +//go:noescape +func LoadAcq64(ptr *uint64) uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + //go:noescape func Or8(ptr *uint8, val uint8) @@ -65,3 +71,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) //go:noescape func StoreRel(ptr *uint32, val uint32) + +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s index d005325ca3..db139d690a 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.s +++ b/src/runtime/internal/atomic/atomic_riscv64.s @@ -150,6 +150,12 @@ TEXT ·Xaddint64(SB),NOSPLIT,$0-24 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12 JMP ·Load(SB) +TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16 + JMP ·Load64(SB) + +TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16 + JMP ·Load64(SB) + // func Loadp(ptr unsafe.Pointer) unsafe.Pointer TEXT ·Loadp(SB),NOSPLIT,$0-16 JMP ·Load64(SB) @@ -161,6 +167,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16 TEXT ·StoreRel(SB), NOSPLIT, $0-12 JMP ·Store(SB) +TEXT ·StoreRel64(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + +TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + // func Xchg(ptr *uint32, new uint32) uint32 TEXT ·Xchg(SB), NOSPLIT, $0-20 MOV ptr+0(FP), A0 diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go index 4d73b39baf..b649caa39f 100644 --- a/src/runtime/internal/atomic/atomic_s390x.go +++ b/src/runtime/internal/atomic/atomic_s390x.go @@ -41,6 +41,18 @@ func LoadAcq(ptr *uint32) uint32 { return *ptr } +//go:nosplit +//go:noinline +func LoadAcq64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + //go:noescape func Store(ptr *uint32, val uint32) @@ -59,6 +71,18 @@ func StoreRel(ptr *uint32, val uint32) { *ptr = val } +//go:nosplit +//go:noinline +func StoreRel64(ptr *uint64, val uint64) { + *ptr = val +} + +//go:nosplit +//go:noinline +func StoreReluintptr(ptr *uintptr, val uintptr) { + *ptr = val +} + //go:noescape func And8(ptr *uint8, val uint8) diff --git a/src/runtime/internal/atomic/atomic_wasm.go b/src/runtime/internal/atomic/atomic_wasm.go index 2c0c3a8174..60a4942884 100644 --- a/src/runtime/internal/atomic/atomic_wasm.go +++ b/src/runtime/internal/atomic/atomic_wasm.go @@ -45,6 +45,18 @@ func LoadAcq(ptr *uint32) uint32 { return *ptr } +//go:nosplit +//go:noinline +func LoadAcq64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func 
LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + //go:nosplit //go:noinline func Load8(ptr *uint8) uint8 { @@ -141,6 +153,18 @@ func StoreRel(ptr *uint32, val uint32) { *ptr = val } +//go:nosplit +//go:noinline +func StoreRel64(ptr *uint64, val uint64) { + *ptr = val +} + +//go:nosplit +//go:noinline +func StoreReluintptr(ptr *uintptr, val uintptr) { + *ptr = val +} + //go:nosplit //go:noinline func Store8(ptr *uint8, val uint8) { -- cgit v1.2.1 From 6f45b39e4dbabf0b179a60ffacf434e55b2d5eab Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Tue, 19 May 2020 18:55:31 +1000 Subject: cmd/compile,cmd/internal/obj/riscv: move g register on riscv64 The original riscv64 port used the thread pointer (TP aka X4) register for the g pointer, however this register is also used when TLS support is required, resulting in a conflict (for example, when a signal is received we have no way of readily knowing if X4 contains a pointer to the TCB or a pointer to a g). In order to support cgo, free up the X4 register by moving g to X27. This unfortunately means that the X4 register is unused in non-cgo mode, however the alternative is to not support cgo on this platform. Update #36641 Change-Id: Idcaf3e8ccbe42972a1b8943aeefde7149d9c960a Reviewed-on: https://go-review.googlesource.com/c/go/+/263477 Trust: Joel Sing Reviewed-by: Cherry Zhang --- src/runtime/asm_riscv64.s | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s index 4084ced7f8..a136085084 100644 --- a/src/runtime/asm_riscv64.s +++ b/src/runtime/asm_riscv64.s @@ -519,12 +519,12 @@ flush: MOV T1, 16(X2) // Also second argument to wbBufFlush // TODO: Optimise - // R3 is g. - // R4 already saved (T0) - // R5 already saved (T1) - // R9 already saved (A0) - // R10 already saved (A1) - // R30 is tmp register. + // X5 already saved (T0) + // X6 already saved (T1) + // X10 already saved (A0) + // X11 already saved (A1) + // X27 is g. + // X31 is tmp register. MOV X0, 24(X2) MOV X1, 32(X2) MOV X2, 40(X2) -- cgit v1.2.1 From 9a49f772575f8009355ac7d64d51ef41bc960c3d Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 20 Oct 2020 20:08:30 -0700 Subject: runtime/race: update race .syso files Fixes #39186 Change-Id: I624ab73b3083f190978c09716672ce1b712a5c81 Reviewed-on: https://go-review.googlesource.com/c/go/+/264082 Run-TryBot: Keith Randall TryBot-Result: Go Bot Reviewed-by: Ian Lance Taylor Trust: Keith Randall --- src/runtime/race/README | 14 +++++++------- src/runtime/race/race_darwin_amd64.syso | Bin 449292 -> 451280 bytes src/runtime/race/race_freebsd_amd64.syso | Bin 579744 -> 583264 bytes src/runtime/race/race_linux_amd64.syso | Bin 521752 -> 525176 bytes src/runtime/race/race_linux_arm64.syso | Bin 500584 -> 505224 bytes src/runtime/race/race_linux_ppc64le.syso | Bin 623824 -> 624648 bytes src/runtime/race/race_netbsd_amd64.syso | Bin 602664 -> 609424 bytes src/runtime/race/race_windows_amd64.syso | Bin 458427 -> 461185 bytes 8 files changed, 7 insertions(+), 7 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/race/README b/src/runtime/race/README index 34485f0fb2..b36d82ccfd 100644 --- a/src/runtime/race/README +++ b/src/runtime/race/README @@ -4,10 +4,10 @@ the LLVM project (https://github.com/llvm/llvm-project/tree/master/compiler-rt). To update the .syso files use golang.org/x/build/cmd/racebuild. 
-race_darwin_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_freebsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_linux_amd64.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf. -race_linux_ppc64le.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf. -race_netbsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_windows_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_linux_arm64.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf. +race_darwin_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_freebsd_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_linux_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_linux_ppc64le.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_netbsd_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_windows_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_linux_arm64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. 
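For context, these .syso objects are the prebuilt ThreadSanitizer runtime that the -race build mode links in; the lines above only record which LLVM and Go revisions produced them. A minimal program of the kind the updated runtime should still report (illustrative only, not derived from this change):

	// racy.go: build or run with the -race flag, e.g. go run -race racy.go
	package main

	import (
		"fmt"
		"sync"
	)

	func main() {
		var wg sync.WaitGroup
		n := 0
		for i := 0; i < 2; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				n++ // unsynchronized write: flagged by the race detector
			}()
		}
		wg.Wait()
		fmt.Println(n)
	}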
diff --git a/src/runtime/race/race_darwin_amd64.syso b/src/runtime/race/race_darwin_amd64.syso index d03a593f5a..3f95ecc8ee 100644 Binary files a/src/runtime/race/race_darwin_amd64.syso and b/src/runtime/race/race_darwin_amd64.syso differ diff --git a/src/runtime/race/race_freebsd_amd64.syso b/src/runtime/race/race_freebsd_amd64.syso index 573591c56f..2a5b46f4ce 100644 Binary files a/src/runtime/race/race_freebsd_amd64.syso and b/src/runtime/race/race_freebsd_amd64.syso differ diff --git a/src/runtime/race/race_linux_amd64.syso b/src/runtime/race/race_linux_amd64.syso index d31f85df56..e00398c964 100644 Binary files a/src/runtime/race/race_linux_amd64.syso and b/src/runtime/race/race_linux_amd64.syso differ diff --git a/src/runtime/race/race_linux_arm64.syso b/src/runtime/race/race_linux_arm64.syso index 7c74171b0f..9dae738700 100644 Binary files a/src/runtime/race/race_linux_arm64.syso and b/src/runtime/race/race_linux_arm64.syso differ diff --git a/src/runtime/race/race_linux_ppc64le.syso b/src/runtime/race/race_linux_ppc64le.syso index a3c72bec55..b562656d56 100644 Binary files a/src/runtime/race/race_linux_ppc64le.syso and b/src/runtime/race/race_linux_ppc64le.syso differ diff --git a/src/runtime/race/race_netbsd_amd64.syso b/src/runtime/race/race_netbsd_amd64.syso index 54e276bcff..11af16f046 100644 Binary files a/src/runtime/race/race_netbsd_amd64.syso and b/src/runtime/race/race_netbsd_amd64.syso differ diff --git a/src/runtime/race/race_windows_amd64.syso b/src/runtime/race/race_windows_amd64.syso index abaf42649f..9fbf9b4391 100644 Binary files a/src/runtime/race/race_windows_amd64.syso and b/src/runtime/race/race_windows_amd64.syso differ -- cgit v1.2.1 From b4a06b20897fe7ea3be715cb51040a2ccc52c15b Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Tue, 14 Jul 2020 20:27:27 +0000 Subject: runtime: define the AddrRange used for testing in terms of addrRange Currently the AddrRange used for testing is defined separately from addrRange in the runtime, making it difficult to test it as well as addrRanges. Redefine AddrRange in terms of addrRange instead. For #40191. Change-Id: I3aa5b8df3e4c9a3c494b46ab802dd574b2488141 Reviewed-on: https://go-review.googlesource.com/c/go/+/242677 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt Reviewed-by: Austin Clements --- src/runtime/export_test.go | 30 ++++++++++--- src/runtime/mpagealloc_test.go | 98 +++++++++++++++++++++--------------------- 2 files changed, 72 insertions(+), 56 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index f2fa11dc98..25b251f4ba 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -749,10 +749,7 @@ func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) { func (p *PageAlloc) InUse() []AddrRange { ranges := make([]AddrRange, 0, len(p.inUse.ranges)) for _, r := range p.inUse.ranges { - ranges = append(ranges, AddrRange{ - Base: r.base.addr(), - Limit: r.limit.addr(), - }) + ranges = append(ranges, AddrRange{r}) } return ranges } @@ -763,10 +760,29 @@ func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData { return (*PallocData)((*pageAlloc)(p).tryChunkOf(ci)) } -// AddrRange represents a range over addresses. -// Specifically, it represents the range [Base, Limit). +// AddrRange is a wrapper around addrRange for testing. type AddrRange struct { - Base, Limit uintptr + addrRange +} + +// MakeAddrRange creates a new address range. 
+func MakeAddrRange(base, limit uintptr) AddrRange { + return AddrRange{makeAddrRange(base, limit)} +} + +// Base returns the virtual base address of the address range. +func (a AddrRange) Base() uintptr { + return a.addrRange.base.addr() +} + +// Base returns the virtual address of the limit of the address range. +func (a AddrRange) Limit() uintptr { + return a.addrRange.limit.addr() +} + +// Equals returns true if the two address ranges are exactly equal. +func (a AddrRange) Equals(b AddrRange) bool { + return a == b } // BitRange represents a range over a bitmap. diff --git a/src/runtime/mpagealloc_test.go b/src/runtime/mpagealloc_test.go index 65ba71d459..5d979fa95b 100644 --- a/src/runtime/mpagealloc_test.go +++ b/src/runtime/mpagealloc_test.go @@ -54,7 +54,7 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), }, }, "Contiguous2": { @@ -63,7 +63,7 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 1, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)), }, }, "Contiguous5": { @@ -75,7 +75,7 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 4, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)), }, }, "Discontiguous": { @@ -85,9 +85,9 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 4, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, - {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)}, - {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)), }, }, "Mixed": { @@ -98,8 +98,8 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 4, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)}, - {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)), }, }, "WildlyDiscontiguous": { @@ -110,9 +110,9 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 0x21, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)}, - {PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)}, - {PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)), }, }, "ManyDiscontiguous": { @@ -129,39 +129,39 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 64, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, - {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)}, - {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, - {PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)}, - {PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)}, - {PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)}, - {PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)}, - 
{PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)}, - {PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)}, - {PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)}, - {PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)}, - {PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)}, - {PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)}, - {PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)}, - {PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)}, - {PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)}, - {PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)}, - {PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)}, - {PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)}, - {PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)}, - {PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)}, - {PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)}, - {PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)}, - {PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)}, - {PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)}, - {PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)}, - {PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)}, - {PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)}, - {PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)}, - {PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)}, - {PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)}, - {PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)}, - {PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+48, 0), 
PageBase(BaseChunkIdx+49, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)), }, }, } @@ -172,8 +172,8 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 0x100000, // constant translates to O(TiB) }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, - {PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)), }, } } @@ -197,7 +197,7 @@ func TestPageAllocGrow(t *testing.T) { t.Fail() } else { for i := range want { - if want[i] != got[i] { + if !want[i].Equals(got[i]) { t.Fail() break } @@ -207,11 +207,11 @@ func TestPageAllocGrow(t *testing.T) { t.Logf("found inUse mismatch") t.Logf("got:") for i, r := range got { - t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit) + t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit()) } t.Logf("want:") for i, r := range want { - t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit) + t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit()) } } }) -- cgit v1.2.1 From f8aecbbff5b85e67fee95033b3a14f3df665ea18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=B6hrmann?= Date: Tue, 20 Oct 2020 14:21:07 +0200 Subject: runtime: move s390x HWCap CPU feature detection to internal/cpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change-Id: I7d9e31c3b342731ddd7329962426fdfc80e9ed87 Reviewed-on: https://go-review.googlesource.com/c/go/+/263803 Trust: Martin Möhrmann Run-TryBot: Martin Möhrmann TryBot-Result: Go Bot Reviewed-by: Tobias Klauser --- src/runtime/os_linux_s390x.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go index ee18fd1dc2..b9651f186c 100644 --- a/src/runtime/os_linux_s390x.go +++ b/src/runtime/os_linux_s390x.go @@ -6,15 +6,10 @@ package runtime import "internal/cpu" -const ( - // bit masks taken from bits/hwcap.h - _HWCAP_S390_VX = 2048 // vector facility -) - func archauxv(tag, val uintptr) { switch tag { - case _AT_HWCAP: // CPU capability bit flags - cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0 + case _AT_HWCAP: + cpu.HWCap = uint(val) } } -- cgit v1.2.1 From 431d58da69e8c36d654876e7808f971c5667649c Mon Sep 17 00:00:00 2001 From: Elias Naur Date: Tue, 20 Oct 2020 11:01:46 +0200 Subject: all: add GOOS=ios GOARCH=amd64 target for the ios simulator The Go toolchain has supported the simulator for years, but always in buildmode=c-archive which is intrinsically externally linked and PIE. This CL moves that support from GOOS=darwin GOARCH=amd64 -tags=ios to just GOOS=ios GOARCH=amd64 to match the change for iOS devices. This change also forces external linking and defaults to buildmode=pie to support Go binaries in the default buildmode to run on the simulator. CL 255257 added the necessary support to the exec wrapper. 
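As a rough illustration of what the new target means for build constraints (the file, package, and function names below are hypothetical, not part of this change): simulator-only code can now be selected by GOOS/GOARCH alone, for example via the _ios_amd64 file-name suffix, instead of combining GOOS=darwin with a custom ios build tag.

	// stub_ios_amd64.go: the _ios_amd64 suffix restricts this file to
	// GOOS=ios GOARCH=amd64 (the simulator); no +build comment is needed.
	package sensors // hypothetical package

	// usingSimulator reports that hardware-backed features are stubbed out
	// when the binary is built for the iOS simulator.
	func usingSimulator() bool { return true }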
Updates #38485 Fixes #42100 Change-Id: I6e6ee0e8d421be53b31e3d403880e5b9b880d031 Reviewed-on: https://go-review.googlesource.com/c/go/+/263798 Reviewed-by: Austin Clements Reviewed-by: Cherry Zhang Trust: Elias Naur --- src/runtime/rt0_ios_amd64.s | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/runtime/rt0_ios_amd64.s (limited to 'src/runtime') diff --git a/src/runtime/rt0_ios_amd64.s b/src/runtime/rt0_ios_amd64.s new file mode 100644 index 0000000000..c6990324f4 --- /dev/null +++ b/src/runtime/rt0_ios_amd64.s @@ -0,0 +1,14 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// internal linking executable entry point. +// ios/amd64 only supports external linking. +TEXT _rt0_amd64_ios(SB),NOSPLIT|NOFRAME,$0 + UNDEF + +// library entry point. +TEXT _rt0_amd64_ios_lib(SB),NOSPLIT|NOFRAME,$0 + JMP _rt0_amd64_darwin_lib(SB) -- cgit v1.2.1 From ad61343f886cc5ce677e7bd62385144b2ba7b8f5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 8 Oct 2020 14:38:39 -0400 Subject: runtime/internal/atomic: add 32-bit And/Or These will be used in a following CL to perform larger bit clear and bit set than And8/Or8. Change-Id: I60f7b1099e29b69eb64add77564faee862880a8d Reviewed-on: https://go-review.googlesource.com/c/go/+/260977 Run-TryBot: Michael Pratt TryBot-Result: Go Bot Reviewed-by: Cherry Zhang Trust: Michael Pratt --- src/runtime/internal/atomic/asm_386.s | 16 ++++ src/runtime/internal/atomic/asm_amd64.s | 16 ++++ src/runtime/internal/atomic/asm_mips64x.s | 26 ++++++ src/runtime/internal/atomic/asm_mipsx.s | 26 ++++++ src/runtime/internal/atomic/asm_ppc64x.s | 30 ++++++- src/runtime/internal/atomic/asm_s390x.s | 22 ++++- src/runtime/internal/atomic/atomic_386.go | 6 ++ src/runtime/internal/atomic/atomic_amd64.go | 6 ++ src/runtime/internal/atomic/atomic_arm.go | 20 +++++ src/runtime/internal/atomic/atomic_arm64.go | 6 ++ src/runtime/internal/atomic/atomic_arm64.s | 19 ++++ src/runtime/internal/atomic/atomic_mips64x.go | 6 ++ src/runtime/internal/atomic/atomic_mipsx.go | 6 ++ src/runtime/internal/atomic/atomic_ppc64x.go | 6 ++ src/runtime/internal/atomic/atomic_riscv64.go | 6 ++ src/runtime/internal/atomic/atomic_riscv64.s | 14 +++ src/runtime/internal/atomic/atomic_s390x.go | 6 ++ src/runtime/internal/atomic/atomic_test.go | 119 +++++++++++++++++++++++++- src/runtime/internal/atomic/atomic_wasm.go | 12 +++ src/runtime/internal/atomic/bench_test.go | 40 +++++++++ 20 files changed, 400 insertions(+), 8 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s index 7ebf675ac5..d82faef1f0 100644 --- a/src/runtime/internal/atomic/asm_386.s +++ b/src/runtime/internal/atomic/asm_386.s @@ -243,3 +243,19 @@ TEXT ·Store8(SB), NOSPLIT, $0-5 MOVB val+4(FP), AX XCHGB AX, 0(BX) RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), AX + MOVL val+4(FP), BX + LOCK + ORL BX, (AX) + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), AX + MOVL val+4(FP), BX + LOCK + ANDL BX, (AX) + RET diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s index 80fb31285d..2cf7c55870 100644 --- a/src/runtime/internal/atomic/asm_amd64.s +++ b/src/runtime/internal/atomic/asm_amd64.s @@ -169,3 +169,19 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9 LOCK ANDB BX, (AX) RET + +// 
func Or(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ORL BX, (AX) + RET + +// func And(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ANDL BX, (AX) + RET diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s index 03fb822929..a515683ebb 100644 --- a/src/runtime/internal/atomic/asm_mips64x.s +++ b/src/runtime/internal/atomic/asm_mips64x.s @@ -243,3 +243,29 @@ TEXT ·And8(SB), NOSPLIT, $0-9 BEQ R4, -4(PC) SYNC RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVV ptr+0(FP), R1 + MOVW val+8(FP), R2 + + SYNC + LL (R1), R3 + OR R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVV ptr+0(FP), R1 + MOVW val+8(FP), R2 + + SYNC + LL (R1), R3 + AND R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s index 63bb548825..2b2cfabe08 100644 --- a/src/runtime/internal/atomic/asm_mipsx.s +++ b/src/runtime/internal/atomic/asm_mipsx.s @@ -172,3 +172,29 @@ try_and8: BEQ R4, try_and8 SYNC RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-8 + MOVW ptr+0(FP), R1 + MOVW val+4(FP), R2 + + SYNC + LL (R1), R3 + OR R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-8 + MOVW ptr+0(FP), R1 + MOVW val+4(FP), R2 + + SYNC + LL (R1), R3 + AND R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s index c0237de4d0..bb009ab34d 100644 --- a/src/runtime/internal/atomic/asm_ppc64x.s +++ b/src/runtime/internal/atomic/asm_ppc64x.s @@ -222,8 +222,32 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9 MOVBZ val+8(FP), R4 LWSYNC again: - LBAR (R3),R6 - AND R4,R6 - STBCCC R6,(R3) + LBAR (R3), R6 + AND R4, R6 + STBCCC R6, (R3) + BNE again + RET + +// func Or(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LWSYNC +again: + LWAR (R3), R6 + OR R4, R6 + STWCCC R6, (R3) + BNE again + RET + +// func And(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LWSYNC +again: + LWAR (R3),R6 + AND R4, R6 + STWCCC R6, (R3) BNE again RET diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s index 9a19bc0ece..daf1f3cc9f 100644 --- a/src/runtime/internal/atomic/asm_s390x.s +++ b/src/runtime/internal/atomic/asm_s390x.s @@ -174,8 +174,8 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24 // func Or8(addr *uint8, v uint8) TEXT ·Or8(SB), NOSPLIT, $0-9 - MOVD ptr+0(FP), R3 - MOVBZ val+8(FP), R4 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 // We don't have atomic operations that work on individual bytes so we // need to align addr down to a word boundary and create a mask // containing v to OR with the entire word atomically. @@ -188,8 +188,8 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 // func And8(addr *uint8, v uint8) TEXT ·And8(SB), NOSPLIT, $0-9 - MOVD ptr+0(FP), R3 - MOVBZ val+8(FP), R4 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 // We don't have atomic operations that work on individual bytes so we // need to align addr down to a word boundary and create a mask // containing v to AND with the entire word atomically. 
@@ -200,3 +200,17 @@ TEXT ·And8(SB), NOSPLIT, $0-9 RLL R5, R4, R4 // R4 = rotl(R4, R5) LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic) RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LAO R4, R6, 0(R3) // R6 = *R3; *R3 |= R4; (atomic) + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic) + RET diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go index 06ce6a5356..1bfcb1143d 100644 --- a/src/runtime/internal/atomic/atomic_386.go +++ b/src/runtime/internal/atomic/atomic_386.go @@ -69,6 +69,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape diff --git a/src/runtime/internal/atomic/atomic_amd64.go b/src/runtime/internal/atomic/atomic_amd64.go index 1b71a16d94..e36eb83a11 100644 --- a/src/runtime/internal/atomic/atomic_amd64.go +++ b/src/runtime/internal/atomic/atomic_amd64.go @@ -77,6 +77,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go index 67d529c1cb..546b3d6120 100644 --- a/src/runtime/internal/atomic/atomic_arm.go +++ b/src/runtime/internal/atomic/atomic_arm.go @@ -182,6 +182,26 @@ func And8(addr *uint8, v uint8) { } } +//go:nosplit +func Or(addr *uint32, v uint32) { + for { + old := *addr + if Cas(addr, old, old|v) { + return + } + } +} + +//go:nosplit +func And(addr *uint32, v uint32) { + for { + old := *addr + if Cas(addr, old, old&v) { + return + } + } +} + //go:nosplit func armcas(ptr *uint32, old, new uint32) bool diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go index c9b4322fe9..d49bee8936 100644 --- a/src/runtime/internal/atomic/atomic_arm64.go +++ b/src/runtime/internal/atomic/atomic_arm64.go @@ -53,6 +53,12 @@ func Or8(ptr *uint8, val uint8) //go:noescape func And8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s index 36c7698b18..0cf3c40223 100644 --- a/src/runtime/internal/atomic/atomic_arm64.s +++ b/src/runtime/internal/atomic/atomic_arm64.s @@ -164,3 +164,22 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 CBNZ R3, -3(PC) RET +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R0 + MOVW val+8(FP), R1 + LDAXRW (R0), R2 + AND R1, R2 + STLXRW R2, (R0), R3 + CBNZ R3, -3(PC) + RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R0 + MOVW val+8(FP), R1 + LDAXRW (R0), R2 + ORR R1, R2 + STLXRW R2, (R0), R3 + CBNZ R3, -3(PC) + RET diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go index fca2242514..b0109d72b0 100644 --- a/src/runtime/internal/atomic/atomic_mips64x.go +++ b/src/runtime/internal/atomic/atomic_mips64x.go @@ -55,6 +55,12 @@ func Or8(ptr 
*uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go index be1e6a038b..1336b50121 100644 --- a/src/runtime/internal/atomic/atomic_mipsx.go +++ b/src/runtime/internal/atomic/atomic_mipsx.go @@ -141,6 +141,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Store(ptr *uint32, val uint32) diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go index e759bb27a2..e4b109f0ec 100644 --- a/src/runtime/internal/atomic/atomic_ppc64x.go +++ b/src/runtime/internal/atomic/atomic_ppc64x.go @@ -55,6 +55,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go index 617bc1a3eb..8f24d61625 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.go +++ b/src/runtime/internal/atomic/atomic_riscv64.go @@ -51,6 +51,12 @@ func Or8(ptr *uint8, val uint8) //go:noescape func And8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s index db139d690a..74c896cea6 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.s +++ b/src/runtime/internal/atomic/atomic_riscv64.s @@ -242,3 +242,17 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 SLL A2, A1 AMOORW A1, (A0), ZERO RET + +// func And(ptr *uint32, val uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOV ptr+0(FP), A0 + MOVW val+8(FP), A1 + AMOANDW A1, (A0), ZERO + RET + +// func Or(ptr *uint32, val uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOV ptr+0(FP), A0 + MOVW val+8(FP), A1 + AMOORW A1, (A0), ZERO + RET diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go index b649caa39f..a058d60102 100644 --- a/src/runtime/internal/atomic/atomic_s390x.go +++ b/src/runtime/internal/atomic/atomic_s390x.go @@ -91,6 +91,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Xadd(ptr *uint32, delta int32) uint32 diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go index a9f95077c0..c9c2eba248 100644 --- a/src/runtime/internal/atomic/atomic_test.go +++ b/src/runtime/internal/atomic/atomic_test.go @@ -150,6 +150,45 @@ func TestAnd8(t *testing.T) { } } +func TestAnd(t *testing.T) { + // Basic sanity check. 
+ x := uint32(0xffffffff) + for i := uint32(0); i < 32; i++ { + atomic.And(&x, ^(1 << i)) + if r := uint32(0xffffffff) << (i + 1); x != r { + t.Fatalf("clearing bit %#x: want %#x, got %#x", uint32(1< Date: Thu, 1 Oct 2020 15:21:37 -0400 Subject: runtime: don't attempt to steal from idle Ps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Work stealing is a scalability bottleneck in the scheduler. Since each P has a work queue, work stealing must look at every P to determine if there is any work. The number of Ps scales linearly with GOMAXPROCS (i.e., the number of Ps _is_ GOMAXPROCS), thus this work scales linearly with GOMAXPROCS. Work stealing is a later attempt by a P to find work before it goes idle. Since the P has no work of its own, extra costs here tend not to directly affect application-level benchmarks. Where they show up is extra CPU usage by the process as a whole. These costs get particularly expensive for applications that transition between blocked and running frequently. Long term, we need a more scalable approach in general, but for now we can make a simple observation: idle Ps ([1]) cannot possibly have anything in their runq, so we need not bother checking at all. We track idle Ps via a new global bitmap, updated in pidleput/pidleget. This is already a slow path (requires sched.lock), so we don't expect high contention there. Using a single bitmap avoids the need to touch every P to read p.status. Currently, the bitmap approach is not significantly better than reading p.status. However, in a future CL I'd like to apply a similiar optimization to timers. Once done, findrunnable would not touch most Ps at all (in mostly idle programs), which will avoid memory latency to pull those Ps into cache. When reading this bitmap, we are racing with Ps going in and out of idle, so there are a few cases to consider: 1. _Prunning -> _Pidle: Running P goes idle after we check the bitmap. In this case, we will try to steal (and find nothing) so there is no harm. 2. _Pidle -> _Prunning while spinning: A P that starts running may queue new work that we miss. This is OK: (a) that P cannot go back to sleep without completing its work, and (b) more fundamentally, we will recheck after we drop our P. 3. _Pidle -> _Prunning after spinning: After spinning, we really can miss work from a newly woken P. (a) above still applies here as well, but this is also the same delicate dance case described in findrunnable: if nothing is spinning anymore, the other P will unpark a thread to run the work it submits. Benchmark results from WakeupParallel/syscall/pair/race/1ms (see golang.org/cl/228577): name old msec new msec delta Perf-task-clock-8 250 ± 1% 247 ± 4% ~ (p=0.690 n=5+5) Perf-task-clock-16 258 ± 2% 259 ± 2% ~ (p=0.841 n=5+5) Perf-task-clock-32 284 ± 2% 270 ± 4% -4.94% (p=0.032 n=5+5) Perf-task-clock-64 326 ± 3% 303 ± 2% -6.92% (p=0.008 n=5+5) Perf-task-clock-128 407 ± 2% 363 ± 5% -10.69% (p=0.008 n=5+5) Perf-task-clock-256 561 ± 1% 481 ± 1% -14.20% (p=0.016 n=4+5) Perf-task-clock-512 840 ± 5% 683 ± 2% -18.70% (p=0.008 n=5+5) Perf-task-clock-1024 1.38k ±14% 1.07k ± 2% -21.85% (p=0.008 n=5+5) [1] "Idle Ps" here refers to _Pidle Ps in the sched.pidle list. In other contexts, Ps may temporarily transition through _Pidle (e.g., in handoffp); those Ps may have work. 
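The scheme described above can be sketched in a small standalone program, using only sync/atomic rather than the runtime-internal atomic.Or/And added earlier in this series; the type and method names here are illustrative and are not the runtime's own, and the real mask is written only together with the _Pidle list under sched.lock:

	// idlemask_sketch.go: a minimal, self-contained model of a one-bit-per-P
	// idle mask that readers can consult without taking any lock.
	package main

	import (
		"fmt"
		"sync/atomic"
	)

	type idleMask []uint32

	// read reports whether P id is marked idle. Lock-free for readers.
	func (m idleMask) read(id uint32) bool {
		word, bit := id/32, uint32(1)<<(id%32)
		return atomic.LoadUint32(&m[word])&bit != 0
	}

	// set marks P id idle. In the runtime this happens together with the
	// sched.pidle update under sched.lock; the CAS loop stands in for the
	// internal atomic.Or.
	func (m idleMask) set(id uint32) {
		word, bit := id/32, uint32(1)<<(id%32)
		for {
			old := atomic.LoadUint32(&m[word])
			if atomic.CompareAndSwapUint32(&m[word], old, old|bit) {
				return
			}
		}
	}

	// clear marks P id runnable again (stand-in for the internal atomic.And).
	func (m idleMask) clear(id uint32) {
		word, bit := id/32, uint32(1)<<(id%32)
		for {
			old := atomic.LoadUint32(&m[word])
			if atomic.CompareAndSwapUint32(&m[word], old, old&^bit) {
				return
			}
		}
	}

	func main() {
		m := make(idleMask, 2) // 64 bits: enough for GOMAXPROCS up to 64
		m.set(37)
		fmt.Println(m.read(37), m.read(3)) // true false
		m.clear(37)
		fmt.Println(m.read(37)) // false
	}

A work-stealing loop can then skip any P whose bit is set, which is exactly the check the patch below adds in findrunnable.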
Updates #28808 Updates #18237 Change-Id: Ieeb958bd72e7d8fb375b0b1f414e8d7378b14e29 Reviewed-on: https://go-review.googlesource.com/c/go/+/259578 Run-TryBot: Michael Pratt TryBot-Result: Go Bot Reviewed-by: Michael Knyszek Reviewed-by: Austin Clements Trust: Michael Pratt --- src/runtime/proc.go | 78 +++++++++++++++++++++++++++++++++++++++++++++---- src/runtime/runtime2.go | 12 ++++++-- 2 files changed, 82 insertions(+), 8 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index e1de70a997..d088b969c8 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -2295,8 +2295,12 @@ top: if _p_ == p2 { continue } - if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil { - return gp, false + + // Don't bother to attempt to steal if p2 is idle. + if !idlepMask.read(enum.position()) { + if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil { + return gp, false + } } // Consider stealing timers from p2. @@ -2307,8 +2311,13 @@ top: // and is not marked for preemption. If p2 is running // and not being preempted we assume it will handle its // own timers. + // // If we're still looking for work after checking all // the P's, then go ahead and steal from an active P. + // + // TODO(prattmic): Maintain a global look-aside similar + // to idlepMask to avoid looking at p2 if it can't + // possibly have timers. if i > 2 || (i > 1 && shouldStealTimers(p2)) { tnow, w, ran := checkTimers(p2, now) now = tnow @@ -2379,6 +2388,9 @@ stop: // safe-points. We don't need to snapshot the contents because // everything up to cap(allp) is immutable. allpSnapshot := allp + // Also snapshot idlepMask. Value changes are OK, but we can't allow + // len to change out from under us. + idlepMaskSnapshot := idlepMask // return P and block lock(&sched.lock) @@ -2419,8 +2431,8 @@ stop: } // check all runqueues once again - for _, _p_ := range allpSnapshot { - if !runqempty(_p_) { + for id, _p_ := range allpSnapshot { + if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(_p_) { lock(&sched.lock) _p_ = pidleget() unlock(&sched.lock) @@ -4398,6 +4410,8 @@ func procresize(nprocs int32) *p { } sched.procresizetime = now + maskWords := (nprocs+31) / 32 + // Grow allp if necessary. if nprocs > int32(len(allp)) { // Synchronize with retake, which could be running @@ -4412,6 +4426,15 @@ func procresize(nprocs int32) *p { copy(nallp, allp[:cap(allp)]) allp = nallp } + + if maskWords <= int32(cap(idlepMask)) { + idlepMask = idlepMask[:maskWords] + } else { + nidlepMask := make([]uint32, maskWords) + // No need to copy beyond len, old Ps are irrelevant. + copy(nidlepMask, idlepMask) + idlepMask = nidlepMask + } unlock(&allpLock) } @@ -4470,6 +4493,7 @@ func procresize(nprocs int32) *p { if int32(len(allp)) != nprocs { lock(&allpLock) allp = allp[:nprocs] + idlepMask = idlepMask[:maskWords] unlock(&allpLock) } @@ -5153,8 +5177,46 @@ func globrunqget(_p_ *p, max int32) *g { return gp } -// Put p to on _Pidle list. +// pIdleMask is a bitmap of of Ps in the _Pidle list, one bit per P. +type pIdleMask []uint32 + +// read returns true if P id is in the _Pidle list, and thus cannot have work. +func (p pIdleMask) read(id uint32) bool { + word := id / 32 + mask := uint32(1) << (id % 32) + return (atomic.Load(&p[word]) & mask) != 0 +} + +// set sets P id as idle in mask. +// +// Must be called only for a P owned by the caller. 
In order to maintain +// consistency, a P going idle must the idle mask simultaneously with updates +// to the idle P list under the sched.lock, otherwise a racing pidleget may +// clear the mask before pidleput sets the mask, corrupting the bitmap. +// +// N.B., procresize takes ownership of all Ps in stopTheWorldWithSema. +func (p pIdleMask) set(id int32) { + word := id / 32 + mask := uint32(1) << (id % 32) + atomic.Or(&p[word], mask) +} + +// clear sets P id as non-idle in mask. +// +// See comment on set. +func (p pIdleMask) clear(id int32) { + word := id / 32 + mask := uint32(1) << (id % 32) + atomic.And(&p[word], ^mask) +} + +// pidleput puts p to on the _Pidle list. +// +// This releases ownership of p. Once sched.lock is released it is no longer +// safe to use p. +// // sched.lock must be held. +// // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func pidleput(_p_ *p) { @@ -5163,13 +5225,16 @@ func pidleput(_p_ *p) { if !runqempty(_p_) { throw("pidleput: P has non-empty run queue") } + idlepMask.set(_p_.id) _p_.link = sched.pidle sched.pidle.set(_p_) atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic } -// Try get a p from _Pidle list. +// pidleget tries to get a p from the _Pidle list, acquiring ownership. +// // sched.lock must be held. +// // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func pidleget() *p { @@ -5177,6 +5242,7 @@ func pidleget() *p { _p_ := sched.pidle.ptr() if _p_ != nil { + idlepMask.clear(_p_.id) sched.pidle = _p_.link atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 519872b8e2..0758a35e01 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -1035,14 +1035,22 @@ func (w waitReason) String() string { var ( allglen uintptr allm *m - allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable - allpLock mutex // Protects P-less reads of allp and all writes gomaxprocs int32 ncpu int32 forcegc forcegcstate sched schedt newprocs int32 + // allpLock protects P-less reads and size changes of allp and + // idlepMask, and all writes to allp. + allpLock mutex + // len(allp) == gomaxprocs; may change at safe points, otherwise + // immutable. + allp []*p + // Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must + // be atomic. Length may change at safe points. + idlepMask pIdleMask + // Information about what cpu features are available. // Packages outside the runtime should not use these // as they are not an external api. -- cgit v1.2.1 From e313fd7448ed0dabf98dc725bee2361e905f208b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 9 Jul 2020 16:20:48 -0400 Subject: runtime: drop unused work.ndone field This field is unused since golang.org/cl/134785 and thus can be trivially removed. Change-Id: I1a87f8e78ffdf662440409404f0251c40bc56a4f Reviewed-on: https://go-review.googlesource.com/c/go/+/241741 Trust: Michael Pratt Run-TryBot: Michael Pratt TryBot-Result: Go Bot Reviewed-by: Michael Knyszek --- src/runtime/mgc.go | 1 - 1 file changed, 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 0a4d5616a5..65ac654b14 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -983,7 +983,6 @@ var work struct { nproc uint32 tstart int64 nwait uint32 - ndone uint32 // Number of roots of various root types. Set by gcMarkRootPrepare. 
nFlushCacheRoots int -- cgit v1.2.1 From ad642727247383079c8546ca365172859641a800 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 25 Aug 2020 12:34:02 -0400 Subject: runtime: rename pageAlloc receiver The history of pageAlloc using 's' as a receiver are lost to the depths of time (perhaps it used to be called summary?), but it doesn't make much sense anymore. Rename it to 'p'. Generated with: $ cd src/runtime $ grep -R -b "func (s \*pageAlloc" . | awk -F : '{ print $1 ":#" $2+6 }' | xargs -n 1 -I {} env GOROOT=$(pwd)/../../ gorename -offset {} -to p -v $ grep -R -b "func (s \*pageAlloc" . | awk -F : '{ print $1 ":#" $2+6 }' | xargs -n 1 -I {} env GOROOT=$(pwd)/../../ GOARCH=386 gorename -offset {} -to p -v $ GOROOT=$(pwd)/../../ gorename -offset mpagecache.go:#2397 -to p -v ($2+6 to advance past "func (".) Plus manual comment fixups. Change-Id: I2d521a1cbf6ebe2ef6aae92e654bfc33c63d1aa9 Reviewed-on: https://go-review.googlesource.com/c/go/+/250517 Trust: Michael Pratt Run-TryBot: Michael Pratt TryBot-Result: Go Bot Reviewed-by: Michael Knyszek --- src/runtime/mgcscavenge.go | 102 ++++++++++---------- src/runtime/mpagealloc.go | 200 ++++++++++++++++++++-------------------- src/runtime/mpagealloc_32bit.go | 14 +-- src/runtime/mpagealloc_64bit.go | 30 +++--- src/runtime/mpagecache.go | 42 ++++----- 5 files changed, 194 insertions(+), 194 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go index 9d6f551768..34646828e5 100644 --- a/src/runtime/mgcscavenge.go +++ b/src/runtime/mgcscavenge.go @@ -390,13 +390,13 @@ func bgscavenge(c chan int) { // // Returns the amount of memory scavenged in bytes. // -// s.mheapLock must be held, but may be temporarily released if +// p.mheapLock must be held, but may be temporarily released if // mayUnlock == true. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { +func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { var ( addrs addrRange gen uint32 @@ -404,17 +404,17 @@ func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { released := uintptr(0) for released < nbytes { if addrs.size() == 0 { - if addrs, gen = s.scavengeReserve(); addrs.size() == 0 { + if addrs, gen = p.scavengeReserve(); addrs.size() == 0 { break } } - r, a := s.scavengeOne(addrs, nbytes-released, mayUnlock) + r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock) released += r addrs = a } // Only unreserve the space which hasn't been scavenged or searched // to ensure we always make progress. - s.scavengeUnreserve(addrs, gen) + p.scavengeUnreserve(addrs, gen) return released } @@ -440,46 +440,46 @@ func printScavTrace(gen uint32, released uintptr, forced bool) { // scavengeStartGen starts a new scavenge generation, resetting // the scavenger's search space to the full in-use address space. // -// s.mheapLock must be held. +// p.mheapLock must be held. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. 
// //go:systemstack -func (s *pageAlloc) scavengeStartGen() { +func (p *pageAlloc) scavengeStartGen() { if debug.scavtrace > 0 { - printScavTrace(s.scav.gen, s.scav.released, false) + printScavTrace(p.scav.gen, p.scav.released, false) } - s.inUse.cloneInto(&s.scav.inUse) + p.inUse.cloneInto(&p.scav.inUse) // Pick the new starting address for the scavenger cycle. var startAddr offAddr - if s.scav.scavLWM.lessThan(s.scav.freeHWM) { + if p.scav.scavLWM.lessThan(p.scav.freeHWM) { // The "free" high watermark exceeds the "scavenged" low watermark, // so there are free scavengable pages in parts of the address space // that the scavenger already searched, the high watermark being the // highest one. Pick that as our new starting point to ensure we // see those pages. - startAddr = s.scav.freeHWM + startAddr = p.scav.freeHWM } else { // The "free" high watermark does not exceed the "scavenged" low // watermark. This means the allocator didn't free any memory in // the range we scavenged last cycle, so we might as well continue // scavenging from where we were. - startAddr = s.scav.scavLWM + startAddr = p.scav.scavLWM } - s.scav.inUse.removeGreaterEqual(startAddr.addr()) + p.scav.inUse.removeGreaterEqual(startAddr.addr()) - // reservationBytes may be zero if s.inUse.totalBytes is small, or if + // reservationBytes may be zero if p.inUse.totalBytes is small, or if // scavengeReservationShards is large. This case is fine as the scavenger // will simply be turned off, but it does mean that scavengeReservationShards, // in concert with pallocChunkBytes, dictates the minimum heap size at which // the scavenger triggers. In practice this minimum is generally less than an // arena in size, so virtually every heap has the scavenger on. - s.scav.reservationBytes = alignUp(s.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards - s.scav.gen++ - s.scav.released = 0 - s.scav.freeHWM = minOffAddr - s.scav.scavLWM = maxOffAddr + p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards + p.scav.gen++ + p.scav.released = 0 + p.scav.freeHWM = minOffAddr + p.scav.scavLWM = maxOffAddr } // scavengeReserve reserves a contiguous range of the address space @@ -489,19 +489,19 @@ func (s *pageAlloc) scavengeStartGen() { // // Returns the reserved range and the scavenge generation number for it. // -// s.mheapLock must be held. +// p.mheapLock must be held. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavengeReserve() (addrRange, uint32) { +func (p *pageAlloc) scavengeReserve() (addrRange, uint32) { // Start by reserving the minimum. - r := s.scav.inUse.removeLast(s.scav.reservationBytes) + r := p.scav.inUse.removeLast(p.scav.reservationBytes) // Return early if the size is zero; we don't want to use // the bogus address below. if r.size() == 0 { - return r, s.scav.gen + return r, p.scav.gen } // The scavenger requires that base be aligned to a @@ -511,27 +511,27 @@ func (s *pageAlloc) scavengeReserve() (addrRange, uint32) { newBase := alignDown(r.base.addr(), pallocChunkBytes) // Remove from inUse however much extra we just pulled out. - s.scav.inUse.removeGreaterEqual(newBase) + p.scav.inUse.removeGreaterEqual(newBase) r.base = offAddr{newBase} - return r, s.scav.gen + return r, p.scav.gen } // scavengeUnreserve returns an unscavenged portion of a range that was // previously reserved with scavengeReserve. 
// -// s.mheapLock must be held. +// p.mheapLock must be held. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { - if r.size() == 0 || gen != s.scav.gen { +func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { + if r.size() == 0 || gen != p.scav.gen { return } if r.base.addr()%pallocChunkBytes != 0 { throw("unreserving unaligned region") } - s.scav.inUse.add(r) + p.scav.inUse.add(r) } // scavengeOne walks over address range work until it finds @@ -545,13 +545,13 @@ func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { // // work's base address must be aligned to pallocChunkBytes. // -// s.mheapLock must be held, but may be temporarily released if +// p.mheapLock must be held, but may be temporarily released if // mayUnlock == true. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) { +func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) { // Defensively check if we've recieved an empty address range. // If so, just return. if work.size() == 0 { @@ -586,12 +586,12 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // Helpers for locking and unlocking only if mayUnlock == true. lockHeap := func() { if mayUnlock { - lock(s.mheapLock) + lock(p.mheapLock) } } unlockHeap := func() { if mayUnlock { - unlock(s.mheapLock) + unlock(p.mheapLock) } } @@ -602,14 +602,14 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // by subtracting 1. maxAddr := work.limit.addr() - 1 maxChunk := chunkIndex(maxAddr) - if s.summary[len(s.summary)-1][maxChunk].max() >= uint(minPages) { + if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) { // We only bother looking for a candidate if there at least // minPages free pages at all. - base, npages := s.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) + base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) // If we found something, scavenge it and return! if npages != 0 { - work.limit = offAddr{s.scavengeRangeLocked(maxChunk, base, npages)} + work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)} return uintptr(npages) * pageSize, work } } @@ -631,7 +631,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // that's fine. We're being optimistic anyway. // Check quickly if there are enough free pages at all. - if s.summary[len(s.summary)-1][i].max() < uint(minPages) { + if p.summary[len(p.summary)-1][i].max() < uint(minPages) { continue } @@ -641,7 +641,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // avoid races with heap growth. It may or may not be possible to also // see a nil pointer in this case if we do race with heap growth, but // just defensively ignore the nils. This operation is optimistic anyway. 
- l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&s.chunks[i.l1()]))) + l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()]))) if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) { return i, true } @@ -670,10 +670,10 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui } // Find, verify, and scavenge if we can. - chunk := s.chunkOf(candidateChunkIdx) + chunk := p.chunkOf(candidateChunkIdx) base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) if npages > 0 { - work.limit = offAddr{s.scavengeRangeLocked(candidateChunkIdx, base, npages)} + work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)} return uintptr(npages) * pageSize, work } @@ -690,21 +690,21 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // // Returns the base address of the scavenged region. // -// s.mheapLock must be held. -func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { - s.chunkOf(ci).scavenged.setRange(base, npages) +// p.mheapLock must be held. +func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { + p.chunkOf(ci).scavenged.setRange(base, npages) // Compute the full address for the start of the range. addr := chunkBase(ci) + uintptr(base)*pageSize // Update the scavenge low watermark. - if oAddr := (offAddr{addr}); oAddr.lessThan(s.scav.scavLWM) { - s.scav.scavLWM = oAddr + if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) { + p.scav.scavLWM = oAddr } // Only perform the actual scavenging if we're not in a test. // It's dangerous to do so otherwise. - if s.test { + if p.test { return addr } sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go index c90a6378bd..560babed03 100644 --- a/src/runtime/mpagealloc.go +++ b/src/runtime/mpagealloc.go @@ -299,7 +299,7 @@ type pageAlloc struct { test bool } -func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) { +func (p *pageAlloc) init(mheapLock *mutex, sysStat *uint64) { if levelLogPages[0] > logMaxPackedValue { // We can't represent 1< s.end { - s.end = end + if end > p.end { + p.end = end } // Note that [base, limit) will never overlap with any existing // range inUse because grow only ever adds never-used memory // regions to the page allocator. - s.inUse.add(makeAddrRange(base, limit)) + p.inUse.add(makeAddrRange(base, limit)) // A grow operation is a lot like a free operation, so if our - // chunk ends up below s.searchAddr, update s.searchAddr to the + // chunk ends up below p.searchAddr, update p.searchAddr to the // new address, just like in free. - if b := (offAddr{base}); b.lessThan(s.searchAddr) { - s.searchAddr = b + if b := (offAddr{base}); b.lessThan(p.searchAddr) { + p.searchAddr = b } // Add entries into chunks, which is sparse, if needed. Then, @@ -387,21 +387,21 @@ func (s *pageAlloc) grow(base, size uintptr) { // Newly-grown memory is always considered scavenged. // Set all the bits in the scavenged bitmaps high. for c := chunkIndex(base); c < chunkIndex(limit); c++ { - if s.chunks[c.l1()] == nil { + if p.chunks[c.l1()] == nil { // Create the necessary l2 entry. // // Store it atomically to avoid races with readers which // don't acquire the heap lock. 
- r := sysAlloc(unsafe.Sizeof(*s.chunks[0]), s.sysStat) - atomic.StorepNoWB(unsafe.Pointer(&s.chunks[c.l1()]), r) + r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat) + atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r) } - s.chunkOf(c).scavenged.setRange(0, pallocChunkPages) + p.chunkOf(c).scavenged.setRange(0, pallocChunkPages) } // Update summaries accordingly. The grow acts like a free, so // we need to ensure this newly-free memory is visible in the // summaries. - s.update(base, size/pageSize, true, false) + p.update(base, size/pageSize, true, false) } // update updates heap metadata. It must be called each time the bitmap @@ -411,8 +411,8 @@ func (s *pageAlloc) grow(base, size uintptr) { // a contiguous allocation or free between addr and addr+npages. alloc indicates // whether the operation performed was an allocation or a free. // -// s.mheapLock must be held. -func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { +// p.mheapLock must be held. +func (p *pageAlloc) update(base, npages uintptr, contig, alloc bool) { // base, limit, start, and end are inclusive. limit := base + npages*pageSize - 1 sc, ec := chunkIndex(base), chunkIndex(limit) @@ -421,23 +421,23 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { if sc == ec { // Fast path: the allocation doesn't span more than one chunk, // so update this one and if the summary didn't change, return. - x := s.summary[len(s.summary)-1][sc] - y := s.chunkOf(sc).summarize() + x := p.summary[len(p.summary)-1][sc] + y := p.chunkOf(sc).summarize() if x == y { return } - s.summary[len(s.summary)-1][sc] = y + p.summary[len(p.summary)-1][sc] = y } else if contig { // Slow contiguous path: the allocation spans more than one chunk // and at least one summary is guaranteed to change. - summary := s.summary[len(s.summary)-1] + summary := p.summary[len(p.summary)-1] // Update the summary for chunk sc. - summary[sc] = s.chunkOf(sc).summarize() + summary[sc] = p.chunkOf(sc).summarize() // Update the summaries for chunks in between, which are // either totally allocated or freed. - whole := s.summary[len(s.summary)-1][sc+1 : ec] + whole := p.summary[len(p.summary)-1][sc+1 : ec] if alloc { // Should optimize into a memclr. for i := range whole { @@ -450,22 +450,22 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { } // Update the summary for chunk ec. - summary[ec] = s.chunkOf(ec).summarize() + summary[ec] = p.chunkOf(ec).summarize() } else { // Slow general path: the allocation spans more than one chunk // and at least one summary is guaranteed to change. // // We can't assume a contiguous allocation happened, so walk over // every chunk in the range and manually recompute the summary. - summary := s.summary[len(s.summary)-1] + summary := p.summary[len(p.summary)-1] for c := sc; c <= ec; c++ { - summary[c] = s.chunkOf(c).summarize() + summary[c] = p.chunkOf(c).summarize() } } // Walk up the radix tree and update the summaries appropriately. changed := true - for l := len(s.summary) - 2; l >= 0 && changed; l-- { + for l := len(p.summary) - 2; l >= 0 && changed; l-- { // Update summaries at level l from summaries at level l+1. changed = false @@ -479,12 +479,12 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { // Iterate over each block, updating the corresponding summary in the less-granular level. 
for i := lo; i < hi; i++ { - children := s.summary[l+1][i<= s.end { + if chunkIndex(p.searchAddr.addr()) >= p.end { return 0, 0 } // If npages has a chance of fitting in the chunk where the searchAddr is, // search it directly. searchAddr := minOffAddr - if pallocChunkPages-chunkPageIndex(s.searchAddr.addr()) >= uint(npages) { + if pallocChunkPages-chunkPageIndex(p.searchAddr.addr()) >= uint(npages) { // npages is guaranteed to be no greater than pallocChunkPages here. - i := chunkIndex(s.searchAddr.addr()) - if max := s.summary[len(s.summary)-1][i].max(); max >= uint(npages) { - j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr.addr())) + i := chunkIndex(p.searchAddr.addr()) + if max := p.summary[len(p.summary)-1][i].max(); max >= uint(npages) { + j, searchIdx := p.chunkOf(i).find(npages, chunkPageIndex(p.searchAddr.addr())) if j == ^uint(0) { print("runtime: max = ", max, ", npages = ", npages, "\n") - print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr.addr()), ", s.searchAddr = ", hex(s.searchAddr.addr()), "\n") + print("runtime: searchIdx = ", chunkPageIndex(p.searchAddr.addr()), ", p.searchAddr = ", hex(p.searchAddr.addr()), "\n") throw("bad summary data") } addr = chunkBase(i) + uintptr(j)*pageSize @@ -813,7 +813,7 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { } // We failed to use a searchAddr for one reason or another, so try // the slow path. - addr, searchAddr = s.find(npages) + addr, searchAddr = p.find(npages) if addr == 0 { if npages == 1 { // We failed to find a single free page, the smallest unit @@ -821,41 +821,41 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { // exhausted. Otherwise, the heap still might have free // space in it, just not enough contiguous space to // accommodate npages. - s.searchAddr = maxSearchAddr + p.searchAddr = maxSearchAddr } return 0, 0 } Found: // Go ahead and actually mark the bits now that we have an address. - scav = s.allocRange(addr, npages) + scav = p.allocRange(addr, npages) // If we found a higher searchAddr, we know that all the // heap memory before that searchAddr in an offset address space is - // allocated, so bump s.searchAddr up to the new one. - if s.searchAddr.lessThan(searchAddr) { - s.searchAddr = searchAddr + // allocated, so bump p.searchAddr up to the new one. + if p.searchAddr.lessThan(searchAddr) { + p.searchAddr = searchAddr } return addr, scav } // free returns npages worth of memory starting at base back to the page heap. // -// s.mheapLock must be held. -func (s *pageAlloc) free(base, npages uintptr) { - // If we're freeing pages below the s.searchAddr, update searchAddr. - if b := (offAddr{base}); b.lessThan(s.searchAddr) { - s.searchAddr = b +// p.mheapLock must be held. +func (p *pageAlloc) free(base, npages uintptr) { + // If we're freeing pages below the p.searchAddr, update searchAddr. + if b := (offAddr{base}); b.lessThan(p.searchAddr) { + p.searchAddr = b } // Update the free high watermark for the scavenger. limit := base + npages*pageSize - 1 - if offLimit := (offAddr{limit}); s.scav.freeHWM.lessThan(offLimit) { - s.scav.freeHWM = offLimit + if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) { + p.scav.freeHWM = offLimit } if npages == 1 { // Fast path: we're clearing a single bit, and we know exactly // where it is, so mark it directly. 
i := chunkIndex(base) - s.chunkOf(i).free1(chunkPageIndex(base)) + p.chunkOf(i).free1(chunkPageIndex(base)) } else { // Slow path: we're clearing more bits so we may need to iterate. sc, ec := chunkIndex(base), chunkIndex(limit) @@ -863,17 +863,17 @@ func (s *pageAlloc) free(base, npages uintptr) { if sc == ec { // The range doesn't cross any chunk boundaries. - s.chunkOf(sc).free(si, ei+1-si) + p.chunkOf(sc).free(si, ei+1-si) } else { // The range crosses at least one chunk boundary. - s.chunkOf(sc).free(si, pallocChunkPages-si) + p.chunkOf(sc).free(si, pallocChunkPages-si) for c := sc + 1; c < ec; c++ { - s.chunkOf(c).freeAll() + p.chunkOf(c).freeAll() } - s.chunkOf(ec).free(0, ei+1) + p.chunkOf(ec).free(0, ei+1) } } - s.update(base, npages, true, false) + p.update(base, npages, true, false) } const ( diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go index 90f1e54d6c..331dadade9 100644 --- a/src/runtime/mpagealloc_32bit.go +++ b/src/runtime/mpagealloc_32bit.go @@ -60,7 +60,7 @@ var levelLogPages = [summaryLevels]uint{ } // See mpagealloc_64bit.go for details. -func (s *pageAlloc) sysInit() { +func (p *pageAlloc) sysInit() { // Calculate how much memory all our entries will take up. // // This should be around 12 KiB or less. @@ -76,7 +76,7 @@ func (s *pageAlloc) sysInit() { throw("failed to reserve page summary memory") } // There isn't much. Just map it and mark it as used immediately. - sysMap(reservation, totalSize, s.sysStat) + sysMap(reservation, totalSize, p.sysStat) sysUsed(reservation, totalSize) // Iterate over the reservation and cut it up into slices. @@ -88,29 +88,29 @@ func (s *pageAlloc) sysInit() { // Put this reservation into a slice. sl := notInHeapSlice{(*notInHeap)(reservation), 0, entries} - s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) + p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) reservation = add(reservation, uintptr(entries)*pallocSumBytes) } } // See mpagealloc_64bit.go for details. -func (s *pageAlloc) sysGrow(base, limit uintptr) { +func (p *pageAlloc) sysGrow(base, limit uintptr) { if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 { print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n") throw("sysGrow bounds not aligned to pallocChunkBytes") } // Walk up the tree and update the summary slices. - for l := len(s.summary) - 1; l >= 0; l-- { + for l := len(p.summary) - 1; l >= 0; l-- { // Figure out what part of the summary array this new address space needs. // Note that we need to align the ranges to the block width (1< len(s.summary[l]) { - s.summary[l] = s.summary[l][:hi] + if hi > len(p.summary[l]) { + p.summary[l] = p.summary[l][:hi] } } } diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go index a1691ba802..ffacb46c18 100644 --- a/src/runtime/mpagealloc_64bit.go +++ b/src/runtime/mpagealloc_64bit.go @@ -67,7 +67,7 @@ var levelLogPages = [summaryLevels]uint{ // sysInit performs architecture-dependent initialization of fields // in pageAlloc. pageAlloc should be uninitialized except for sysStat // if any runtime statistic should be updated. -func (s *pageAlloc) sysInit() { +func (p *pageAlloc) sysInit() { // Reserve memory for each level. This will get mapped in // as R/W by setArenas. for l, shift := range levelShift { @@ -82,21 +82,21 @@ func (s *pageAlloc) sysInit() { // Put this reservation into a slice. 
sl := notInHeapSlice{(*notInHeap)(r), 0, entries} - s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) + p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) } } // sysGrow performs architecture-dependent operations on heap // growth for the page allocator, such as mapping in new memory // for summaries. It also updates the length of the slices in -// s.summary. +// p.summary. // // base is the base of the newly-added heap memory and limit is // the first address past the end of the newly-added heap memory. // Both must be aligned to pallocChunkBytes. // -// The caller must update s.start and s.end after calling sysGrow. -func (s *pageAlloc) sysGrow(base, limit uintptr) { +// The caller must update p.start and p.end after calling sysGrow. +func (p *pageAlloc) sysGrow(base, limit uintptr) { if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 { print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n") throw("sysGrow bounds not aligned to pallocChunkBytes") @@ -111,12 +111,12 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { } // summaryRangeToSumAddrRange converts a range of indices in any - // level of s.summary into page-aligned addresses which cover that + // level of p.summary into page-aligned addresses which cover that // range of indices. summaryRangeToSumAddrRange := func(level, sumIdxBase, sumIdxLimit int) addrRange { baseOffset := alignDown(uintptr(sumIdxBase)*pallocSumBytes, physPageSize) limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize) - base := unsafe.Pointer(&s.summary[level][0]) + base := unsafe.Pointer(&p.summary[level][0]) return addrRange{ offAddr{uintptr(add(base, baseOffset))}, offAddr{uintptr(add(base, limitOffset))}, @@ -140,10 +140,10 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { // // This will be used to look at what memory in the summary array is already // mapped before and after this new range. - inUseIndex := s.inUse.findSucc(base) + inUseIndex := p.inUse.findSucc(base) // Walk up the radix tree and map summaries in as needed. - for l := range s.summary { + for l := range p.summary { // Figure out what part of the summary array this new address space needs. needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, makeAddrRange(base, limit)) // Update the summary slices with a new upper-bound. This ensures // we get tight bounds checks on at least the top bound. // // We must do this regardless of whether we map new memory. - if needIdxLimit > len(s.summary[l]) { - s.summary[l] = s.summary[l][:needIdxLimit] + if needIdxLimit > len(p.summary[l]) { + p.summary[l] = p.summary[l][:needIdxLimit] } // Compute the needed address range in the summary array for level l. need := summaryRangeToSumAddrRange(l, needIdxBase, needIdxLimit) // Prune need down to what needs to be newly mapped. Some parts of it may // already be mapped by what inUse describes due to page alignment requirements // for mapping. prune's invariants are guaranteed by the fact that this // function will never be asked to remap the same memory twice. if inUseIndex > 0 { - need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex-1])) + need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex-1])) } - if inUseIndex < len(s.inUse.ranges) { - need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex])) + if inUseIndex < len(p.inUse.ranges) { + need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex])) } // It's possible that after our pruning above, there's nothing new to map. if need.size() == 0 { continue } // Map and commit need. 
- sysMap(unsafe.Pointer(need.base.addr()), need.size(), s.sysStat) + sysMap(unsafe.Pointer(need.base.addr()), need.size(), p.sysStat) sysUsed(unsafe.Pointer(need.base.addr()), need.size()) } } diff --git a/src/runtime/mpagecache.go b/src/runtime/mpagecache.go index 683a997136..5f76501a1c 100644 --- a/src/runtime/mpagecache.go +++ b/src/runtime/mpagecache.go @@ -71,8 +71,8 @@ func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) { // into s. Then, it clears the cache, such that empty returns // true. // -// s.mheapLock must be held or the world must be stopped. -func (c *pageCache) flush(s *pageAlloc) { +// p.mheapLock must be held or the world must be stopped. +func (c *pageCache) flush(p *pageAlloc) { if c.empty() { return } @@ -83,18 +83,18 @@ func (c *pageCache) flush(s *pageAlloc) { // slower, safer thing by iterating over each bit individually. for i := uint(0); i < 64; i++ { if c.cache&(1<= s.end { + if chunkIndex(p.searchAddr.addr()) >= p.end { return pageCache{} } c := pageCache{} - ci := chunkIndex(s.searchAddr.addr()) // chunk index - if s.summary[len(s.summary)-1][ci] != 0 { + ci := chunkIndex(p.searchAddr.addr()) // chunk index + if p.summary[len(p.summary)-1][ci] != 0 { // Fast path: there's free pages at or near the searchAddr address. - chunk := s.chunkOf(ci) - j, _ := chunk.find(1, chunkPageIndex(s.searchAddr.addr())) + chunk := p.chunkOf(ci) + j, _ := chunk.find(1, chunkPageIndex(p.searchAddr.addr())) if j == ^uint(0) { throw("bad summary data") } @@ -126,15 +126,15 @@ func (s *pageAlloc) allocToCache() pageCache { } else { // Slow path: the searchAddr address had nothing there, so go find // the first free page the slow way. - addr, _ := s.find(1) + addr, _ := p.find(1) if addr == 0 { // We failed to find adequate free space, so mark the searchAddr as OoM // and return an empty pageCache. - s.searchAddr = maxSearchAddr + p.searchAddr = maxSearchAddr return pageCache{} } ci := chunkIndex(addr) - chunk := s.chunkOf(ci) + chunk := p.chunkOf(ci) c = pageCache{ base: alignDown(addr, 64*pageSize), cache: ^chunk.pages64(chunkPageIndex(addr)), @@ -143,19 +143,19 @@ func (s *pageAlloc) allocToCache() pageCache { } // Set the bits as allocated and clear the scavenged bits. - s.allocRange(c.base, pageCachePages) + p.allocRange(c.base, pageCachePages) // Update as an allocation, but note that it's not contiguous. - s.update(c.base, pageCachePages, false, true) + p.update(c.base, pageCachePages, false, true) // Set the search address to the last page represented by the cache. // Since all of the pages in this block are going to the cache, and we // searched for the first free page, we can confidently start at the // next page. // - // However, s.searchAddr is not allowed to point into unmapped heap memory + // However, p.searchAddr is not allowed to point into unmapped heap memory // unless it is maxSearchAddr, so make it the last page as opposed to // the page after. - s.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)} + p.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)} return c } -- cgit v1.2.1 From d1b1145cace8b968307f9311ff611e4bb810710c Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Mon, 9 Dec 2019 21:50:16 -0800 Subject: syscall: support POSIX semantics for Linux syscalls This change adds two new methods for invoking system calls under Linux: syscall.AllThreadsSyscall() and syscall.AllThreadsSyscall6(). These system call wrappers ensure that all OSThreads mirror a common system call. 
The wrappers serialize execution of the runtime to ensure no race conditions where any Go code observes a non-atomic OS state change. As such, the syscalls have higher runtime overhead than regular system calls, and only need to be used where such thread (or 'm' in the parlance of the runtime sources) consistency is required. The new support is used to enable these functions under Linux: syscall.Setegid(), syscall.Seteuid(), syscall.Setgroups(), syscall.Setgid(), syscall.Setregid(), syscall.Setreuid(), syscall.Setresgid(), syscall.Setresuid() and syscall.Setuid(). They work identically to their glibc counterparts. Extensive discussion of the background issue addressed in this patch can be found here: https://github.com/golang/go/issues/1435 In the case where cgo is used, the C runtime can launch pthreads that are not managed by the Go runtime. As such, the added syscall.AllThreadsSyscall*() return ENOTSUP when cgo is enabled. However, for the 9 syscall.Set*() functions listed above, when cgo is active, these functions redirect to invoke their C.set*() equivalents in glibc, which wraps the raw system calls with a nptl:setxid fixup mechanism. This achieves POSIX semantics for these functions in the combined Go and C runtime. As a side note, the glibc/nptl:setxid support (2019-11-30) does not extend to all security related system calls under Linux so using native Go (CGO_ENABLED=0) and these AllThreadsSyscall*()s, where needed, will yield more well defined/consistent behavior over all threads of a Go program. That is, using the syscall.AllThreadsSyscall*() wrappers for things like setting state through SYS_PRCTL and SYS_CAPSET etc. Fixes #1435 Change-Id: Ib1a3e16b9180f64223196a32fc0f9dce14d9105c Reviewed-on: https://go-review.googlesource.com/c/go/+/210639 Trust: Emmanuel Odeke Trust: Ian Lance Taylor Trust: Michael Pratt Run-TryBot: Emmanuel Odeke Reviewed-by: Michael Pratt Reviewed-by: Austin Clements --- src/runtime/cgo/linux.go | 74 ++++++++++++++ src/runtime/cgo/linux_syscall.c | 85 ++++++++++++++++ src/runtime/cgocall.go | 16 +++ src/runtime/proc.go | 210 +++++++++++++++++++++++++++++++++++++++- src/runtime/runtime2.go | 12 +++ 5 files changed, 392 insertions(+), 5 deletions(-) create mode 100644 src/runtime/cgo/linux.go create mode 100644 src/runtime/cgo/linux_syscall.c (limited to 'src/runtime') diff --git a/src/runtime/cgo/linux.go b/src/runtime/cgo/linux.go new file mode 100644 index 0000000000..76c0192c20 --- /dev/null +++ b/src/runtime/cgo/linux.go @@ -0,0 +1,74 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Linux system call wrappers that provide POSIX semantics through the +// corresponding cgo->libc (nptl) wrappers for various system calls. + +// +build linux + +package cgo + +import "unsafe" + +// Each of the following entries is needed to ensure that the +// syscall.syscall_linux code can conditionally call these +// function pointers: +// +// 1. find the C-defined function start +// 2. force the local byte alias to be mapped to that location +// 3. 
map the Go pointer to the function to the syscall package + +//go:cgo_import_static _cgo_libc_setegid +//go:linkname _cgo_libc_setegid _cgo_libc_setegid +//go:linkname cgo_libc_setegid syscall.cgo_libc_setegid +var _cgo_libc_setegid byte +var cgo_libc_setegid = unsafe.Pointer(&_cgo_libc_setegid) + +//go:cgo_import_static _cgo_libc_seteuid +//go:linkname _cgo_libc_seteuid _cgo_libc_seteuid +//go:linkname cgo_libc_seteuid syscall.cgo_libc_seteuid +var _cgo_libc_seteuid byte +var cgo_libc_seteuid = unsafe.Pointer(&_cgo_libc_seteuid) + +//go:cgo_import_static _cgo_libc_setregid +//go:linkname _cgo_libc_setregid _cgo_libc_setregid +//go:linkname cgo_libc_setregid syscall.cgo_libc_setregid +var _cgo_libc_setregid byte +var cgo_libc_setregid = unsafe.Pointer(&_cgo_libc_setregid) + +//go:cgo_import_static _cgo_libc_setresgid +//go:linkname _cgo_libc_setresgid _cgo_libc_setresgid +//go:linkname cgo_libc_setresgid syscall.cgo_libc_setresgid +var _cgo_libc_setresgid byte +var cgo_libc_setresgid = unsafe.Pointer(&_cgo_libc_setresgid) + +//go:cgo_import_static _cgo_libc_setresuid +//go:linkname _cgo_libc_setresuid _cgo_libc_setresuid +//go:linkname cgo_libc_setresuid syscall.cgo_libc_setresuid +var _cgo_libc_setresuid byte +var cgo_libc_setresuid = unsafe.Pointer(&_cgo_libc_setresuid) + +//go:cgo_import_static _cgo_libc_setreuid +//go:linkname _cgo_libc_setreuid _cgo_libc_setreuid +//go:linkname cgo_libc_setreuid syscall.cgo_libc_setreuid +var _cgo_libc_setreuid byte +var cgo_libc_setreuid = unsafe.Pointer(&_cgo_libc_setreuid) + +//go:cgo_import_static _cgo_libc_setgroups +//go:linkname _cgo_libc_setgroups _cgo_libc_setgroups +//go:linkname cgo_libc_setgroups syscall.cgo_libc_setgroups +var _cgo_libc_setgroups byte +var cgo_libc_setgroups = unsafe.Pointer(&_cgo_libc_setgroups) + +//go:cgo_import_static _cgo_libc_setgid +//go:linkname _cgo_libc_setgid _cgo_libc_setgid +//go:linkname cgo_libc_setgid syscall.cgo_libc_setgid +var _cgo_libc_setgid byte +var cgo_libc_setgid = unsafe.Pointer(&_cgo_libc_setgid) + +//go:cgo_import_static _cgo_libc_setuid +//go:linkname _cgo_libc_setuid _cgo_libc_setuid +//go:linkname cgo_libc_setuid syscall.cgo_libc_setuid +var _cgo_libc_setuid byte +var cgo_libc_setuid = unsafe.Pointer(&_cgo_libc_setuid) diff --git a/src/runtime/cgo/linux_syscall.c b/src/runtime/cgo/linux_syscall.c new file mode 100644 index 0000000000..c8e91918a1 --- /dev/null +++ b/src/runtime/cgo/linux_syscall.c @@ -0,0 +1,85 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +#ifndef _GNU_SOURCE // setres[ug]id() API. +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include "libcgo.h" + +/* + * Assumed POSIX compliant libc system call wrappers. For linux, the + * glibc/nptl/setxid mechanism ensures that POSIX semantics are + * honored for all pthreads (by default), and this in turn with cgo + * ensures that all Go threads launched with cgo are kept in sync for + * these function calls. + */ + +// argset_t matches runtime/cgocall.go:argset. +typedef struct { + uintptr_t* args; + uintptr_t retval; +} argset_t; + +// libc backed posix-compliant syscalls. 
+ +#define SET_RETVAL(fn) \ + uintptr_t ret = (uintptr_t) fn ; \ + if (ret == -1) { \ + x->retval = (uintptr_t) errno; \ + } else \ + x->retval = ret + +void +_cgo_libc_setegid(argset_t* x) { + SET_RETVAL(setegid((gid_t) x->args[0])); +} + +void +_cgo_libc_seteuid(argset_t* x) { + SET_RETVAL(seteuid((uid_t) x->args[0])); +} + +void +_cgo_libc_setgid(argset_t* x) { + SET_RETVAL(setgid((gid_t) x->args[0])); +} + +void +_cgo_libc_setgroups(argset_t* x) { + SET_RETVAL(setgroups((size_t) x->args[0], (const gid_t *) x->args[1])); +} + +void +_cgo_libc_setregid(argset_t* x) { + SET_RETVAL(setregid((gid_t) x->args[0], (gid_t) x->args[1])); +} + +void +_cgo_libc_setresgid(argset_t* x) { + SET_RETVAL(setresgid((gid_t) x->args[0], (gid_t) x->args[1], + (gid_t) x->args[2])); +} + +void +_cgo_libc_setresuid(argset_t* x) { + SET_RETVAL(setresuid((uid_t) x->args[0], (uid_t) x->args[1], + (uid_t) x->args[2])); +} + +void +_cgo_libc_setreuid(argset_t* x) { + SET_RETVAL(setreuid((uid_t) x->args[0], (uid_t) x->args[1])); +} + +void +_cgo_libc_setuid(argset_t* x) { + SET_RETVAL(setuid((uid_t) x->args[0])); +} diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index 0b69ff3233..7ab42a0ed0 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -93,6 +93,22 @@ import ( // Length must match arg.Max in x_cgo_callers in runtime/cgo/gcc_traceback.c. type cgoCallers [32]uintptr +// argset matches runtime/cgo/linux_syscall.c:argset_t +type argset struct { + args unsafe.Pointer + retval uintptr +} + +// wrapper for syscall package to call cgocall for libc (cgo) calls. +//go:linkname syscall_cgocaller syscall.cgocaller +//go:nosplit +//go:uintptrescapes +func syscall_cgocaller(fn unsafe.Pointer, args ...uintptr) uintptr { + as := argset{args: unsafe.Pointer(&args[0])} + cgocall(fn, unsafe.Pointer(&as)) + return as.retval +} + // Call from Go to C. // // This must be nosplit because it's used for syscalls on some diff --git a/src/runtime/proc.go b/src/runtime/proc.go index d088b969c8..aeacb23391 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -137,6 +137,10 @@ func main() { mainStarted = true if GOARCH != "wasm" { // no threads on wasm yet, so no sysmon + // For runtime_syscall_doAllThreadsSyscall, we + // register sysmon is not ready for the world to be + // stopped. + atomic.Store(&sched.sysmonStarting, 1) systemstack(func() { newm(sysmon, nil, -1) }) @@ -153,6 +157,7 @@ func main() { if g.m != &m0 { throw("runtime.main not on m0") } + m0.doesPark = true // Record when the world started. // Must be before doInit for tracing init. @@ -1226,6 +1231,21 @@ func mstartm0() { initsig(false) } +// mPark causes a thread to park itself - temporarily waking for +// fixups but otherwise waiting to be fully woken. This is the +// only way that m's should park themselves. +//go:nosplit +func mPark() { + g := getg() + for { + notesleep(&g.m.park) + noteclear(&g.m.park) + if !mDoFixup() { + return + } + } +} + // mexit tears down and exits the current thread. // // Don't call this directly to exit the thread, since it must run at @@ -1257,7 +1277,7 @@ func mexit(osStack bool) { sched.nmfreed++ checkdead() unlock(&sched.lock) - notesleep(&m.park) + mPark() throw("locked m0 woke up") } @@ -1424,6 +1444,127 @@ func forEachP(fn func(*p)) { releasem(mp) } +// syscall_runtime_doAllThreadsSyscall serializes Go execution and +// executes a specified fn() call on all m's. +// +// The boolean argument to fn() indicates whether the function's +// return value will be consulted or not. 
That is, fn(true) should +// return true if fn() succeeds, and fn(true) should return false if +// it failed. When fn(false) is called, its return status will be +// ignored. +// +// syscall_runtime_doAllThreadsSyscall first invokes fn(true) on a +// single, coordinating, m, and only if it returns true does it go on +// to invoke fn(false) on all of the other m's known to the process. +// +//go:linkname syscall_runtime_doAllThreadsSyscall syscall.runtime_doAllThreadsSyscall +func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) { + if iscgo { + panic("doAllThreadsSyscall not supported with cgo enabled") + } + if fn == nil { + return + } + for atomic.Load(&sched.sysmonStarting) != 0 { + osyield() + } + stopTheWorldGC("doAllThreadsSyscall") + if atomic.Load(&newmHandoff.haveTemplateThread) != 0 { + // Ensure that there are no in-flight thread + // creations: don't want to race with allm. + lock(&newmHandoff.lock) + for !newmHandoff.waiting { + unlock(&newmHandoff.lock) + osyield() + lock(&newmHandoff.lock) + } + unlock(&newmHandoff.lock) + } + if netpollinited() { + netpollBreak() + } + _g_ := getg() + if raceenabled { + // For m's running without racectx, we loan out the + // racectx of this call. + lock(&mFixupRace.lock) + mFixupRace.ctx = _g_.racectx + unlock(&mFixupRace.lock) + } + if ok := fn(true); ok { + tid := _g_.m.procid + for mp := allm; mp != nil; mp = mp.alllink { + if mp.procid == tid { + // This m has already completed fn() + // call. + continue + } + // Be wary of mp's without procid values if + // they are known not to park. If they are + // marked as parking with a zero procid, then + // they will be racing with this code to be + // allocated a procid and we will annotate + // them with the need to execute the fn when + // they acquire a procid to run it. + if mp.procid == 0 && !mp.doesPark { + // Reaching here, we are either + // running Windows, or cgo linked + // code. Neither of which are + // currently supported by this API. + throw("unsupported runtime environment") + } + // stopTheWorldGC() doesn't guarantee stopping + // all the threads, so we lock here to avoid + // the possibility of racing with mp. + lock(&mp.mFixup.lock) + mp.mFixup.fn = fn + if mp.doesPark { + // For non-service threads this will + // cause the wakeup to be short lived + // (once the mutex is unlocked). The + // next real wakeup will occur after + // startTheWorldGC() is called. + notewakeup(&mp.park) + } + unlock(&mp.mFixup.lock) + } + for { + done := true + for mp := allm; done && mp != nil; mp = mp.alllink { + if mp.procid == tid { + continue + } + lock(&mp.mFixup.lock) + done = done && (mp.mFixup.fn == nil) + unlock(&mp.mFixup.lock) + } + if done { + break + } + // if needed force sysmon and/or newmHandoff to wakeup. + lock(&sched.lock) + if atomic.Load(&sched.sysmonwait) != 0 { + atomic.Store(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + lock(&newmHandoff.lock) + if newmHandoff.waiting { + newmHandoff.waiting = false + notewakeup(&newmHandoff.wake) + } + unlock(&newmHandoff.lock) + osyield() + } + } + if raceenabled { + lock(&mFixupRace.lock) + mFixupRace.ctx = 0 + unlock(&mFixupRace.lock) + } + startTheWorldGC() +} + // runSafePointFn runs the safe point function, if any, for this P. 
// This should be called like // @@ -1816,6 +1957,7 @@ var newmHandoff struct { //go:nowritebarrierrec func newm(fn func(), _p_ *p, id int64) { mp := allocm(_p_, fn, id) + mp.doesPark = (_p_ != nil) mp.nextp.set(_p_) mp.sigmask = initSigmask if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" { @@ -1888,6 +2030,57 @@ func startTemplateThread() { releasem(mp) } +// mFixupRace is used to temporarily borrow the race context from the +// coordinating m during a syscall_runtime_doAllThreadsSyscall and +// loan it out to each of the m's of the runtime so they can execute a +// mFixup.fn in that context. +var mFixupRace struct { + lock mutex + ctx uintptr +} + +// mDoFixup runs any outstanding fixup function for the running m. +// Returns true if a fixup was outstanding and actually executed. +// +//go:nosplit +func mDoFixup() bool { + _g_ := getg() + lock(&_g_.m.mFixup.lock) + fn := _g_.m.mFixup.fn + if fn != nil { + if gcphase != _GCoff { + // We can't have a write barrier in this + // context since we may not have a P, but we + // clear fn to signal that we've executed the + // fixup. As long as fn is kept alive + // elsewhere, technically we should have no + // issues with the GC, but fn is likely + // generated in a different package altogether + // that may change independently. Just assert + // the GC is off so this lack of write barrier + // is more obviously safe. + throw("GC must be disabled to protect validity of fn value") + } + *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0 + if _g_.racectx != 0 || !raceenabled { + fn(false) + } else { + // temporarily acquire the context of the + // originator of the + // syscall_runtime_doAllThreadsSyscall and + // block others from using it for the duration + // of the fixup call. + lock(&mFixupRace.lock) + _g_.racectx = mFixupRace.ctx + fn(false) + _g_.racectx = 0 + unlock(&mFixupRace.lock) + } + } + unlock(&_g_.m.mFixup.lock) + return fn != nil +} + // templateThread is a thread in a known-good state that exists solely // to start new threads in known-good states when the calling thread // may not be in a good state. @@ -1924,6 +2117,7 @@ func templateThread() { noteclear(&newmHandoff.wake) unlock(&newmHandoff.lock) notesleep(&newmHandoff.wake) + mDoFixup() } } @@ -1945,8 +2139,7 @@ func stopm() { lock(&sched.lock) mput(_g_.m) unlock(&sched.lock) - notesleep(&_g_.m.park) - noteclear(&_g_.m.park) + mPark() acquirep(_g_.m.nextp.ptr()) _g_.m.nextp = 0 } @@ -2106,8 +2299,7 @@ func stoplockedm() { } incidlelocked(1) // Wait until another thread schedules lockedg again. - notesleep(&_g_.m.park) - noteclear(&_g_.m.park) + mPark() status := readgstatus(_g_.m.lockedg.ptr()) if status&^_Gscan != _Grunnable { print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n") @@ -4715,9 +4907,14 @@ func sysmon() { checkdead() unlock(&sched.lock) + // For syscall_runtime_doAllThreadsSyscall, sysmon is + // sufficiently up to participate in fixups. + atomic.Store(&sched.sysmonStarting, 0) + lasttrace := int64(0) idle := 0 // how many cycles in succession we had not wokeup somebody delay := uint32(0) + for { if idle == 0 { // start with 20us sleep... 
delay = 20 @@ -4728,6 +4925,7 @@ func sysmon() { delay = 10 * 1000 } usleep(delay) + mDoFixup() now := nanotime() next, _ := timeSleepUntil() if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { @@ -4747,6 +4945,7 @@ func sysmon() { osRelax(true) } notetsleep(&sched.sysmonnote, sleep) + mDoFixup() if shouldRelax { osRelax(false) } @@ -4795,6 +4994,7 @@ func sysmon() { incidlelocked(1) } } + mDoFixup() if next < now { // There are timers that should have already run, // perhaps because there is an unpreemptible P. diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 0758a35e01..21dd7b3949 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -528,6 +528,7 @@ type m struct { ncgo int32 // number of cgo calls currently in progress cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily cgoCallers *cgoCallers // cgo traceback if crashing in cgo call + doesPark bool // non-P running threads: sysmon and newmHandoff never use .park park note alllink *m // on allm schedlink muintptr @@ -544,6 +545,13 @@ type m struct { syscalltick uint32 freelink *m // on sched.freem + // mFixup is used to synchronize OS related m state (credentials etc) + // use mutex to access. + mFixup struct { + lock mutex + fn func(bool) bool + } + // these are here because they are too large to be on the stack // of low-level NOSPLIT functions. libcall libcall @@ -768,6 +776,10 @@ type schedt struct { sysmonwait uint32 sysmonnote note + // While true, sysmon not ready for mFixup calls. + // Accessed atomically. + sysmonStarting uint32 + // safepointFn should be called on each P at the next GC // safepoint if p.runSafePointFn is set. safePointFn func(*p) -- cgit v1.2.1 From db185e543fe471c522790b7d93291e786dc54a84 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 7 Jul 2020 17:55:40 -0400 Subject: runtime: drop redundant gcBlackenEnabled reset This reset of gcBlackenEnabled is a no-op because it was already reset almost immediately before in gcMarkDone, which is the only caller of gcMarkTermination. Adjust the comment to clarify setGCPhase a bit more. We are coming from _GCmark, so write barriers are already enabled. Change-Id: Ieac2dadf33c3c5a44e8a25a499dea8cfe03b8d73 Reviewed-on: https://go-review.googlesource.com/c/go/+/241357 Run-TryBot: Michael Pratt TryBot-Result: Go Bot Trust: Michael Pratt Reviewed-by: Michael Knyszek --- src/runtime/mgc.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 65ac654b14..c42c7fbd29 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -1558,10 +1558,10 @@ top: gcMarkTermination(nextTriggerRatio) } +// World must be stopped and mark assists and background workers must be +// disabled. func gcMarkTermination(nextTriggerRatio float64) { - // World is stopped. - // Start marktermination which includes enabling the write barrier. - atomic.Store(&gcBlackenEnabled, 0) + // Start marktermination (write barrier remains enabled for now). setGCPhase(_GCmarktermination) work.heap1 = memstats.heap_live -- cgit v1.2.1 From e01a1c01f830e2398b773b803dce3238b1107ce9 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Tue, 14 Jul 2020 21:39:52 +0000 Subject: runtime: add tests for addrRanges.findSucc This change adds a test suite for addrRanges.findSucc so we can change the implementation more safely. For #40191. 
Change-Id: I14a834b6d54836cbc676eb0edb292ba6176705cc Reviewed-on: https://go-review.googlesource.com/c/go/+/242678 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Austin Clements Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 24 +++++++ src/runtime/mranges_test.go | 172 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 src/runtime/mranges_test.go (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 25b251f4ba..605bcb2294 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -785,6 +785,30 @@ func (a AddrRange) Equals(b AddrRange) bool { return a == b } +// AddrRanges is a wrapper around addrRanges for testing. +type AddrRanges struct { + addrRanges +} + +// MakeAddrRanges creates a new addrRanges populated with +// the ranges in a. +func MakeAddrRanges(a ...AddrRange) AddrRanges { + // Methods that manipulate the backing store of addrRanges.ranges should + // not be used on the result from this function (e.g. add) since they may + // trigger reallocation. + ranges := make([]addrRange, 0, len(a)) + for _, r := range a { + ranges = append(ranges, r.addrRange) + } + return AddrRanges{addrRanges{ranges: ranges, sysStat: new(uint64)}} +} + +// FindSucc returns the successor to base. See addrRanges.findSucc +// for more details. +func (a *AddrRanges) FindSucc(base uintptr) int { + return a.findSucc(base) +} + // BitRange represents a range over a bitmap. type BitRange struct { I, N uint // bit index and length in bits diff --git a/src/runtime/mranges_test.go b/src/runtime/mranges_test.go new file mode 100644 index 0000000000..3a9023adfa --- /dev/null +++ b/src/runtime/mranges_test.go @@ -0,0 +1,172 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + . 
"runtime" + "testing" +) + +func TestAddrRangesFindSucc(t *testing.T) { + var large []AddrRange + for i := 0; i < 100; i++ { + large = append(large, MakeAddrRange(5+uintptr(i)*5, 5+uintptr(i)*5+3)) + } + + type testt struct { + name string + base uintptr + expect int + ranges []AddrRange + } + tests := []testt{ + { + name: "Empty", + base: 12, + expect: 0, + ranges: []AddrRange{}, + }, + { + name: "OneBefore", + base: 12, + expect: 0, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "OneWithin", + base: 14, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "OneAfterLimit", + base: 16, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "OneAfter", + base: 17, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "ThreeBefore", + base: 3, + expect: 0, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "ThreeAfter", + base: 24, + expect: 3, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "ThreeBetween", + base: 11, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "ThreeWithin", + base: 9, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "Zero", + base: 0, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(0, 10), + }, + }, + { + name: "Max", + base: ^uintptr(0), + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(^uintptr(0)-5, ^uintptr(0)), + }, + }, + { + name: "LargeBefore", + base: 2, + expect: 0, + ranges: large, + }, + { + name: "LargeAfter", + base: 5 + uintptr(len(large))*5 + 30, + expect: len(large), + ranges: large, + }, + { + name: "LargeBetweenLow", + base: 14, + expect: 2, + ranges: large, + }, + { + name: "LargeBetweenHigh", + base: 249, + expect: 49, + ranges: large, + }, + { + name: "LargeWithinLow", + base: 25, + expect: 5, + ranges: large, + }, + { + name: "LargeWithinHigh", + base: 396, + expect: 79, + ranges: large, + }, + { + name: "LargeWithinMiddle", + base: 250, + expect: 50, + ranges: large, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + a := MakeAddrRanges(test.ranges...) + i := a.FindSucc(test.base) + if i != test.expect { + t.Fatalf("expected %d, got %d", test.expect, i) + } + }) + } +} -- cgit v1.2.1 From fe70866d1dc8c44ab19180ecab2b5c5b8628265a Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Wed, 15 Jul 2020 18:56:39 +0000 Subject: runtime: throw on zero-sized range passed to addrRanges.add addrRanges represents a set of addresses. Currently, passing in a zero-sized range will cause that range to be added to the list, even though it doesn't represent any address (addrRanges.contains will still always return false, and findSucc will give surprising results). We could ignore this input, but it's almost always a bug for the calling code to pass in a zero-sized range, so just throw. 
Change-Id: I8ed09e15b79a3a33e2d0cf5ed55f9e497388e7a5 Reviewed-on: https://go-review.googlesource.com/c/go/+/242817 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt Reviewed-by: Austin Clements --- src/runtime/mranges.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go index 2c0eb2c2dd..1109f506a6 100644 --- a/src/runtime/mranges.go +++ b/src/runtime/mranges.go @@ -218,7 +218,7 @@ func (a *addrRanges) contains(addr uintptr) bool { // add inserts a new address range to a. // -// r must not overlap with any address range in a. +// r must not overlap with any address range in a and r.size() must be > 0. func (a *addrRanges) add(r addrRange) { // The copies in this function are potentially expensive, but this data // structure is meant to represent the Go heap. At worst, copying this @@ -229,6 +229,12 @@ func (a *addrRanges) add(r addrRange) { // of 16) and Go heaps are usually mostly contiguous, so the chance that // an addrRanges even grows to that size is extremely low. + // An empty range has no effect on the set of addresses represented + // by a, but passing a zero-sized range is almost always a bug. + if r.size() == 0 { + print("runtime: range = {", hex(r.base.addr()), ", ", hex(r.limit.addr()), "}\n") + throw("attempted to add zero-sized address range") + } // Because we assume r is not currently represented in a, // findSucc gives us our insertion index. i := a.findSucc(r.base.addr()) -- cgit v1.2.1 From 64dc25b2db5d5be55f3f2fde3daac6c8a2873235 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Tue, 14 Jul 2020 21:41:12 +0000 Subject: runtime: add tests for addrRanges.add Change-Id: I249deb482df74068b0538e9d773b9a87bc5a6df3 Reviewed-on: https://go-review.googlesource.com/c/go/+/242681 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Austin Clements Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 62 +++++++++++++++++++++++++- src/runtime/mranges_test.go | 103 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 605bcb2294..e65b7b8ea7 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -785,22 +785,64 @@ func (a AddrRange) Equals(b AddrRange) bool { return a == b } +// Size returns the size in bytes of the address range. +func (a AddrRange) Size() uintptr { + return a.addrRange.size() +} + // AddrRanges is a wrapper around addrRanges for testing. type AddrRanges struct { addrRanges + mutable bool +} + +// NewAddrRanges creates a new empty addrRanges. +// +// Note that this initializes addrRanges just like in the +// runtime, so its memory is persistentalloc'd. Call this +// function sparingly since the memory it allocates is +// leaked. +// +// This AddrRanges is mutable, so we can test methods like +// Add. +func NewAddrRanges() AddrRanges { + r := addrRanges{} + r.init(new(uint64)) + return AddrRanges{r, true} } // MakeAddrRanges creates a new addrRanges populated with // the ranges in a. +// +// The returned AddrRanges is immutable, so methods like +// Add will fail. func MakeAddrRanges(a ...AddrRange) AddrRanges { // Methods that manipulate the backing store of addrRanges.ranges should // not be used on the result from this function (e.g. add) since they may - // trigger reallocation. + // trigger reallocation. 
That would normally be fine, except the new + // backing store won't come from the heap, but from persistentalloc, so + // we'll leak some memory implicitly. ranges := make([]addrRange, 0, len(a)) + total := uintptr(0) for _, r := range a { ranges = append(ranges, r.addrRange) + total += r.Size() + } + return AddrRanges{addrRanges{ + ranges: ranges, + totalBytes: total, + sysStat: new(uint64), + }, false} +} + +// Ranges returns a copy of the ranges described by the +// addrRanges. +func (a *AddrRanges) Ranges() []AddrRange { + result := make([]AddrRange, 0, len(a.addrRanges.ranges)) + for _, r := range a.addrRanges.ranges { + result = append(result, AddrRange{r}) } - return AddrRanges{addrRanges{ranges: ranges, sysStat: new(uint64)}} + return result } // FindSucc returns the successor to base. See addrRanges.findSucc @@ -809,6 +851,22 @@ func (a *AddrRanges) FindSucc(base uintptr) int { return a.findSucc(base) } +// Add adds a new AddrRange to the AddrRanges. +// +// The AddrRange must be mutable (i.e. created by NewAddrRanges), +// otherwise this method will throw. +func (a *AddrRanges) Add(r AddrRange) { + if !a.mutable { + throw("attempt to mutate immutable AddrRanges") + } + a.add(r.addrRange) +} + +// TotalBytes returns the totalBytes field of the addrRanges. +func (a *AddrRanges) TotalBytes() uintptr { + return a.addrRanges.totalBytes +} + // BitRange represents a range over a bitmap. type BitRange struct { I, N uint // bit index and length in bits diff --git a/src/runtime/mranges_test.go b/src/runtime/mranges_test.go index 3a9023adfa..ed439c56c2 100644 --- a/src/runtime/mranges_test.go +++ b/src/runtime/mranges_test.go @@ -9,6 +9,109 @@ import ( "testing" ) +func validateAddrRanges(t *testing.T, a *AddrRanges, want ...AddrRange) { + ranges := a.Ranges() + if len(ranges) != len(want) { + t.Errorf("want %v, got %v", want, ranges) + t.Fatal("different lengths") + } + gotTotalBytes := uintptr(0) + wantTotalBytes := uintptr(0) + for i := range ranges { + gotTotalBytes += ranges[i].Size() + wantTotalBytes += want[i].Size() + if ranges[i].Base() >= ranges[i].Limit() { + t.Error("empty range found") + } + // Ensure this is equivalent to what we want. + if !ranges[i].Equals(want[i]) { + t.Errorf("range %d: got [0x%x, 0x%x), want [0x%x, 0x%x)", i, + ranges[i].Base(), ranges[i].Limit(), + want[i].Base(), want[i].Limit(), + ) + } + if i != 0 { + // Ensure the ranges are sorted. + if ranges[i-1].Base() >= ranges[i].Base() { + t.Errorf("ranges %d and %d are out of sorted order", i-1, i) + } + // Check for a failure to coalesce. + if ranges[i-1].Limit() == ranges[i].Base() { + t.Errorf("ranges %d and %d should have coalesced", i-1, i) + } + // Check if any ranges overlap. Because the ranges are sorted + // by base, it's sufficient to just check neighbors. + if ranges[i-1].Limit() > ranges[i].Base() { + t.Errorf("ranges %d and %d overlap", i-1, i) + } + } + } + if wantTotalBytes != gotTotalBytes { + t.Errorf("expected %d total bytes, got %d", wantTotalBytes, gotTotalBytes) + } + if b := a.TotalBytes(); b != gotTotalBytes { + t.Errorf("inconsistent total bytes: want %d, got %d", gotTotalBytes, b) + } + if t.Failed() { + t.Errorf("addrRanges: %v", ranges) + t.Fatal("detected bad addrRanges") + } +} + +func TestAddrRangesAdd(t *testing.T) { + a := NewAddrRanges() + + // First range. + a.Add(MakeAddrRange(512, 1024)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 1024), + ) + + // Coalesce up. 
+ a.Add(MakeAddrRange(1024, 2048)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 2048), + ) + + // Add new independent range. + a.Add(MakeAddrRange(4096, 8192)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 2048), + MakeAddrRange(4096, 8192), + ) + + // Coalesce down. + a.Add(MakeAddrRange(3776, 4096)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 2048), + MakeAddrRange(3776, 8192), + ) + + // Coalesce up and down. + a.Add(MakeAddrRange(2048, 3776)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 8192), + ) + + // Push a bunch of independent ranges to the end to try and force growth. + expectedRanges := []AddrRange{MakeAddrRange(512, 8192)} + for i := uintptr(0); i < 64; i++ { + dRange := MakeAddrRange(8192+(i+1)*2048, 8192+(i+1)*2048+10) + a.Add(dRange) + expectedRanges = append(expectedRanges, dRange) + validateAddrRanges(t, &a, expectedRanges...) + } + + // Push a bunch of independent ranges to the beginning to try and force growth. + var bottomRanges []AddrRange + for i := uintptr(0); i < 63; i++ { + dRange := MakeAddrRange(8+i*8, 8+i*8+4) + a.Add(dRange) + bottomRanges = append(bottomRanges, dRange) + validateAddrRanges(t, &a, append(bottomRanges, expectedRanges...)...) + } +} + func TestAddrRangesFindSucc(t *testing.T) { var large []AddrRange for i := 0; i < 100; i++ { -- cgit v1.2.1 From bc0b198bd75a8eef45d0965531ba6fa127d0e8ec Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 15 Oct 2020 01:43:51 +0000 Subject: runtime: dump the status of lockedg on error The dumpgstatus() will dump current g's status anyway. When lockedg's status is bad, it's more helpful to dump lockedg's status as well than dumping current g's status twice. Change-Id: If5248cb94b9cdcbf4ceea07562237e1d6ee28489 GitHub-Last-Rev: da814c51ff42f56fb28582f088f4d72b500061fe GitHub-Pull-Request: golang/go#40248 Reviewed-on: https://go-review.googlesource.com/c/go/+/243097 Reviewed-by: Keith Randall Trust: Emmanuel Odeke Run-TryBot: Emmanuel Odeke TryBot-Result: Go Bot --- src/runtime/proc.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index aeacb23391..83d2a524e0 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -2302,8 +2302,8 @@ func stoplockedm() { mPark() status := readgstatus(_g_.m.lockedg.ptr()) if status&^_Gscan != _Grunnable { - print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n") - dumpgstatus(_g_) + print("runtime:stoplockedm: lockedg (atomicstatus=", status, ") is not Grunnable or Gscanrunnable\n") + dumpgstatus(_g_.m.lockedg.ptr()) throw("stoplockedm: not runnable") } acquirep(_g_.m.nextp.ptr()) -- cgit v1.2.1 From 30c18878730434027dbefd343aad74963a1fdc48 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 1 Oct 2020 17:22:38 -0400 Subject: runtime,cmd/cgo: simplify C -> Go call path This redesigns the way calls work from C to exported Go functions. It removes several steps from the call path, makes cmd/cgo no longer sensitive to the Go calling convention, and eliminates the use of reflectcall from cgo. In order to avoid generating a large amount of FFI glue between the C and Go ABIs, the cgo tool has long depended on generating a C function that marshals the arguments into a struct, and then the actual ABI switch happens in functions with fixed signatures that simply take a pointer to this struct. In a way, this CL simply pushes this idea further. 
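As a concrete (if simplified) illustration of that pattern, here is a
pure-Go sketch -- an analogy only, not code that cmd/cgo actually
generates -- in which arguments and result slots are packed into a
single struct and a trampoline with a fixed signature sees only a
function and a frame pointer, never the callee's real signature:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// addFrame plays the role of the marshalled argument struct:
	// inputs plus a slot for the result.
	type addFrame struct {
		a, b int32 // arguments
		r    int32 // result, filled in by the wrapper
	}

	// addWrapper has the fixed signature the trampoline expects:
	// it unpacks the frame, calls the real function, and writes
	// the result back into the frame.
	func addWrapper(frame unsafe.Pointer) {
		f := (*addFrame)(frame)
		f.r = add(f.a, f.b)
	}

	func add(a, b int32) int32 { return a + b }

	// trampoline stands in for the fixed-signature, ABI-switching
	// functions: it only ever handles func(unsafe.Pointer) plus a
	// frame pointer, regardless of the callee's actual signature.
	func trampoline(fn func(unsafe.Pointer), frame unsafe.Pointer) {
		fn(frame)
	}

	func main() {
		f := addFrame{a: 2, b: 40}
		trampoline(addWrapper, unsafe.Pointer(&f))
		fmt.Println(f.r) // prints 42
	}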
Currently, the cgo tool generates this argument struct in the exact layout of the Go stack frame and depends on reflectcall to unpack it into the appropriate Go call (even though it's actually reflectcall'ing a function generated by cgo). In this CL, we decouple this struct from the Go stack layout. Instead, cgo generates a Go function that takes the struct, unpacks it, and calls the exported function. Since this generated function has a generic signature (like the rest of the call path), we don't need reflectcall and can instead depend on the Go compiler itself to implement the call to the exported Go function. One complication is that syscall.NewCallback on Windows, which converts a Go function into a C function pointer, depends on cgocallback's current dynamic calling approach since the signatures of the callbacks aren't known statically. For this specific case, we continue to depend on reflectcall. Really, the current approach makes some overly simplistic assumptions about translating the C ABI to the Go ABI. Now we're at least in a much better position to do a proper ABI translation. For comparison, the current cgo call path looks like: GoF (generated C function) -> crosscall2 (in cgo/asm_*.s) -> _cgoexp_GoF (generated Go function) -> cgocallback (in asm_*.s) -> cgocallback_gofunc (in asm_*.s) -> cgocallbackg (in cgocall.go) -> cgocallbackg1 (in cgocall.go) -> reflectcall (in asm_*.s) -> _cgoexpwrap_GoF (generated Go function) -> p.GoF Now the call path looks like: GoF (generated C function) -> crosscall2 (in cgo/asm_*.s) -> cgocallback (in asm_*.s) -> cgocallbackg (in cgocall.go) -> cgocallbackg1 (in cgocall.go) -> _cgoexp_GoF (generated Go function) -> p.GoF Notably: 1. We combine _cgoexp_GoF and _cgoexpwrap_GoF and move the combined operation to the end of the sequence. This combined function also handles reflectcall's previous role. 2. We combined cgocallback and cgocallback_gofunc since the only purpose of having both was to convert a raw PC into a Go function value. We instead construct the Go function value in cgocallbackg1. 3. cgocallbackg1 no longer reaches backwards through the stack to get the arguments to cgocallback_gofunc. Instead, we just pass the arguments down. 4. Currently, we need an explicit msanwrite to mark the results struct as written because reflectcall doesn't do this. Now, the results are written by regular Go assignments, so the Go compiler generates the necessary MSAN annotations. This also means we no longer need to track the size of the arguments frame. Updates #40724, since now we don't need to teach cgo about the register ABI or change how it uses reflectcall. 
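For readers unfamiliar with the export machinery, a minimal,
hypothetical example of the kind of program that exercises this path
follows. The name Double and the round trip through C.Double are
illustrative only, not taken from this CL, and assume the usual
//export setup (cgo enabled, a C toolchain available):

	package main

	/*
	// Declaration of the gcc-compiled wrapper that cmd/cgo generates
	// for the exported Go function below (the "GoF" in the
	// description above).
	extern int Double(int x);
	*/
	import "C"
	import "fmt"

	//export Double
	func Double(x C.int) C.int {
		return x * 2
	}

	func main() {
		// Calling through C exercises the C -> Go path described
		// above: crosscall2 -> cgocallback -> cgocallbackg ->
		// the generated _cgoexp wrapper for Double -> Double.
		fmt.Println(C.Double(21)) // prints 42
	}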
Change-Id: I7840489a2597962aeb670e0c1798a16a7359c94f Reviewed-on: https://go-review.googlesource.com/c/go/+/258938 Trust: Austin Clements Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/runtime/asm_386.s | 56 ++++++---------- src/runtime/asm_amd64.s | 55 ++++++---------- src/runtime/asm_arm.s | 58 ++++++----------- src/runtime/asm_arm64.s | 48 +++++--------- src/runtime/asm_mips64x.s | 54 ++++++---------- src/runtime/asm_mipsx.s | 54 ++++++---------- src/runtime/asm_ppc64x.s | 55 ++++++---------- src/runtime/asm_riscv64.s | 5 +- src/runtime/asm_s390x.s | 54 ++++++---------- src/runtime/asm_wasm.s | 5 +- src/runtime/cgo/asm_386.s | 12 ++-- src/runtime/cgo/asm_amd64.s | 20 +++--- src/runtime/cgo/asm_arm.s | 61 ++++++++--------- src/runtime/cgo/asm_arm64.s | 19 +++--- src/runtime/cgo/asm_mips64x.s | 19 +++--- src/runtime/cgo/asm_mipsx.s | 19 +++--- src/runtime/cgo/asm_ppc64x.s | 20 +++--- src/runtime/cgo/asm_s390x.s | 14 ++-- src/runtime/cgo/callbacks.go | 29 ++++----- src/runtime/cgocall.go | 140 +++++++++++----------------------------- src/runtime/proc.go | 2 +- src/runtime/race/output_test.go | 2 +- src/runtime/stubs.go | 25 ++++--- src/runtime/symtab.go | 2 +- src/runtime/sys_windows_386.s | 67 ++++++++----------- src/runtime/sys_windows_amd64.s | 53 +++++++-------- src/runtime/sys_windows_arm.s | 3 + src/runtime/syscall_windows.go | 42 ++++++++++-- src/runtime/traceback.go | 2 +- 29 files changed, 412 insertions(+), 583 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index 11863fba39..a54b68e03d 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -702,25 +702,9 @@ nosave: MOVL AX, ret+8(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$16-16 - LEAL fn+0(FP), AX - MOVL AX, 0(SP) - MOVL frame+4(FP), AX - MOVL AX, 4(SP) - MOVL framesize+8(FP), AX - MOVL AX, 8(SP) - MOVL ctxt+12(FP), AX - MOVL AX, 12(SP) - MOVL $runtime·cgocallback_gofunc(SB), AX - CALL AX - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16 +TEXT ·cgocallback(SB),NOSPLIT,$16-12 // Frame size must match commented places below NO_LOCAL_POINTERS // If g is nil, Go did not create the current thread. @@ -780,34 +764,36 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. // - // In the new goroutine, 4(SP) holds the saved oldm (DX) register. - // 8(SP) is unused. 
+ // In the new goroutine, 12(SP) holds the saved oldm (DX) register. MOVL m_curg(BP), SI MOVL SI, g(CX) MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI MOVL (g_sched+gobuf_pc)(SI), BP - MOVL BP, -4(DI) - MOVL ctxt+12(FP), CX - LEAL -(4+12)(DI), SP - MOVL DX, 4(SP) - MOVL CX, 0(SP) + MOVL BP, -4(DI) // "push" return PC on the g stack + // Gather our arguments into registers. + MOVL fn+0(FP), AX + MOVL frame+4(FP), BX + MOVL ctxt+8(FP), CX + LEAL -(4+16)(DI), SP // Must match declared frame size + MOVL DX, 12(SP) + MOVL AX, 0(SP) + MOVL BX, 4(SP) + MOVL CX, 8(SP) CALL runtime·cgocallbackg(SB) - MOVL 4(SP), DX + MOVL 12(SP), DX // Restore g->sched (== m->curg->sched) from saved values. get_tls(CX) MOVL g(CX), SI - MOVL 12(SP), BP + MOVL 16(SP), BP // Must match declared frame size MOVL BP, (g_sched+gobuf_pc)(SI) - LEAL (12+4)(SP), DI + LEAL (16+4)(SP), DI // Must match declared frame size MOVL DI, (g_sched+gobuf_sp)(SI) // Switch back to m->g0's stack and restore m->g0->sched.sp. diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 256f4112cd..3d5d9c4d58 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -691,25 +691,9 @@ nosave: MOVL AX, ret+16(FP) RET -// func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - LEAQ fn+0(FP), AX - MOVQ AX, 0(SP) - MOVQ frame+8(FP), AX - MOVQ AX, 8(SP) - MOVQ framesize+16(FP), AX - MOVQ AX, 16(SP) - MOVQ ctxt+24(FP), AX - MOVQ AX, 24(SP) - MOVQ $runtime·cgocallback_gofunc(SB), AX - CALL AX - RET - -// func cgocallback_gofunc(fn, frame, framesize, ctxt uintptr) +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$32-24 NO_LOCAL_POINTERS // If g is nil, Go did not create the current thread. @@ -769,37 +753,40 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. // - // In the new goroutine, 8(SP) holds the saved R8. + // In the new goroutine, 24(SP) holds the saved R8. MOVQ m_curg(BX), SI MOVQ SI, g(CX) MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI MOVQ (g_sched+gobuf_pc)(SI), BX - MOVQ BX, -8(DI) + MOVQ BX, -8(DI) // "push" return PC on the g stack + // Gather our arguments into registers. 
+ MOVQ fn+0(FP), BX + MOVQ frame+8(FP), CX + MOVQ ctxt+16(FP), DX // Compute the size of the frame, including return PC and, if // GOEXPERIMENT=framepointer, the saved base pointer - MOVQ ctxt+24(FP), BX - LEAQ fv+0(FP), AX - SUBQ SP, AX - SUBQ AX, DI + LEAQ fn+0(FP), AX + SUBQ SP, AX // AX is our actual frame size + SUBQ AX, DI // Allocate the same frame size on the g stack MOVQ DI, SP - MOVQ R8, 8(SP) + MOVQ R8, 24(SP) MOVQ BX, 0(SP) + MOVQ CX, 8(SP) + MOVQ DX, 16(SP) CALL runtime·cgocallbackg(SB) - MOVQ 8(SP), R8 + MOVQ 24(SP), R8 // Compute the size of the frame again. FP and SP have // completely different values here than they did above, // but only their difference matters. - LEAQ fv+0(FP), AX + LEAQ fn+0(FP), AX SUBQ SP, AX // Restore g->sched (== m->curg->sched) from saved values. diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s index 51a50c604c..c54b4eb006 100644 --- a/src/runtime/asm_arm.s +++ b/src/runtime/asm_arm.s @@ -643,25 +643,9 @@ nosave: MOVW R0, ret+8(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$16-16 - MOVW $fn+0(FP), R0 - MOVW R0, 4(R13) - MOVW frame+4(FP), R0 - MOVW R0, 8(R13) - MOVW framesize+8(FP), R0 - MOVW R0, 12(R13) - MOVW ctxt+12(FP), R0 - MOVW R0, 16(R13) - MOVW $runtime·cgocallback_gofunc(SB), R0 - BL (R0) - RET - -// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-16 +TEXT ·cgocallback(SB),NOSPLIT,$12-12 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -686,7 +670,7 @@ needm: MOVW $runtime·needm(SB), R0 BL (R0) - // Set m->sched.sp = SP, so that if a panic happens + // Set m->g0->sched.sp = SP, so that if a panic happens // during the function we are about to execute, it will // have a valid SP to run on the g0 stack. // The next few lines (after the havem label) @@ -706,10 +690,10 @@ havem: // Save current m->g0->sched.sp on stack and then set it to SP. // Save current sp in m->g0->sched.sp in preparation for // switch back to m->curg stack. - // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP). + // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-12(SP). MOVW m_g0(R8), R3 MOVW (g_sched+gobuf_sp)(R3), R4 - MOVW R4, savedsp-8(SP) + MOVW R4, savedsp-12(SP) // must match frame size MOVW R13, (g_sched+gobuf_sp)(R3) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -718,30 +702,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -4(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. 
+ // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVW m_curg(R8), R0 BL setg<>(SB) MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVW (g_sched+gobuf_pc)(g), R5 - MOVW R5, -12(R4) - MOVW ctxt+12(FP), R0 - MOVW R0, -8(R4) - MOVW $-12(R4), R13 + MOVW R5, -(12+4)(R4) // "saved LR"; must match frame size + // Gather our arguments into registers. + MOVW fn+0(FP), R1 + MOVW frame+4(FP), R2 + MOVW ctxt+8(FP), R3 + MOVW $-(12+4)(R4), R13 // switch stack; must match frame size + MOVW R1, 4(R13) + MOVW R2, 8(R13) + MOVW R3, 12(R13) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVW 0(R13), R5 MOVW R5, (g_sched+gobuf_pc)(g) - MOVW $12(R13), R4 + MOVW $(12+4)(R13), R4 // must match frame size MOVW R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -751,7 +735,7 @@ havem: MOVW m_g0(R8), R0 BL setg<>(SB) MOVW (g_sched+gobuf_sp)(g), R13 - MOVW savedsp-8(SP), R4 + MOVW savedsp-12(SP), R4 // must match frame size MOVW R4, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 1f46d1962c..a45e342478 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -958,25 +958,9 @@ nosave: MOVD R0, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$40-32 - MOVD $fn+0(FP), R0 - MOVD R0, 8(RSP) - MOVD frame+8(FP), R0 - MOVD R0, 16(RSP) - MOVD framesize+16(FP), R0 - MOVD R0, 24(RSP) - MOVD ctxt+24(FP), R0 - MOVD R0, 32(RSP) - MOVD $runtime·cgocallback_gofunc(SB), R0 - BL (R0) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load g from thread-local storage. @@ -1001,7 +985,7 @@ needm: MOVD $runtime·needm(SB), R0 BL (R0) - // Set m->sched.sp = SP, so that if a panic happens + // Set m->g0->sched.sp = SP, so that if a panic happens // during the function we are about to execute, it will // have a valid SP to run on the g0 stack. // The next few lines (after the havem label) @@ -1037,16 +1021,11 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. 
+ // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVD m_curg(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 @@ -1054,10 +1033,15 @@ havem: MOVD R5, -48(R4) MOVD (g_sched+gobuf_bp)(g), R5 MOVD R5, -56(R4) - MOVD ctxt+24(FP), R0 - MOVD R0, -40(R4) + // Gather our arguments into registers. + MOVD fn+0(FP), R1 + MOVD frame+8(FP), R2 + MOVD ctxt+16(FP), R3 MOVD $-48(R4), R0 // maintain 16-byte SP alignment - MOVD R0, RSP + MOVD R0, RSP // switch stack + MOVD R1, 8(RSP) + MOVD R2, 16(RSP) + MOVD R3, 24(RSP) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s index 0ff1b24225..19781f7885 100644 --- a/src/runtime/asm_mips64x.s +++ b/src/runtime/asm_mips64x.s @@ -471,25 +471,9 @@ g0: MOVW R2, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - MOVV $fn+0(FP), R1 - MOVV R1, 8(R29) - MOVV frame+8(FP), R1 - MOVV R1, 16(R29) - MOVV framesize+16(FP), R1 - MOVV R1, 24(R29) - MOVV ctxt+24(FP), R1 - MOVV R1, 32(R29) - MOVV $runtime·cgocallback_gofunc(SB), R1 - JAL (R1) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -537,7 +521,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 8(R29) aka savedsp-16(SP). MOVV m_g0(R3), R1 MOVV (g_sched+gobuf_sp)(R1), R2 - MOVV R2, savedsp-16(SP) + MOVV R2, savedsp-24(SP) // must match frame size MOVV R29, (g_sched+gobuf_sp)(R1) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -546,30 +530,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVV m_curg(R3), g JAL runtime·save_g(SB) MOVV (g_sched+gobuf_sp)(g), R2 // prepare stack as R2 MOVV (g_sched+gobuf_pc)(g), R4 - MOVV R4, -24(R2) - MOVV ctxt+24(FP), R1 - MOVV R1, -16(R2) - MOVV $-24(R2), R29 + MOVV R4, -(24+8)(R2) // "saved LR"; must match frame size + // Gather our arguments into registers. 
+ MOVV fn+0(FP), R5 + MOVV frame+8(FP), R6 + MOVV ctxt+16(FP), R7 + MOVV $-(24+8)(R2), R29 // switch stack; must match frame size + MOVV R5, 8(R29) + MOVV R6, 16(R29) + MOVV R7, 24(R29) JAL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVV 0(R29), R4 MOVV R4, (g_sched+gobuf_pc)(g) - MOVV $24(R29), R2 + MOVV $(24+8)(R29), R2 // must match frame size MOVV R2, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -579,7 +563,7 @@ havem: MOVV m_g0(R3), g JAL runtime·save_g(SB) MOVV (g_sched+gobuf_sp)(g), R29 - MOVV savedsp-16(SP), R2 + MOVV savedsp-24(SP), R2 // must match frame size MOVV R2, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s index aca0510b69..ee87d81436 100644 --- a/src/runtime/asm_mipsx.s +++ b/src/runtime/asm_mipsx.s @@ -472,25 +472,9 @@ g0: MOVW R2, ret+8(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$16-16 - MOVW $fn+0(FP), R1 - MOVW R1, 4(R29) - MOVW frame+4(FP), R1 - MOVW R1, 8(R29) - MOVW framesize+8(FP), R1 - MOVW R1, 12(R29) - MOVW ctxt+12(FP), R1 - MOVW R1, 16(R29) - MOVW $runtime·cgocallback_gofunc(SB), R1 - JAL (R1) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-16 +TEXT ·cgocallback(SB),NOSPLIT,$12-12 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -538,7 +522,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 4(R29) aka savedsp-8(SP). MOVW m_g0(R3), R1 MOVW (g_sched+gobuf_sp)(R1), R2 - MOVW R2, savedsp-8(SP) + MOVW R2, savedsp-12(SP) // must match frame size MOVW R29, (g_sched+gobuf_sp)(R1) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -547,30 +531,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -4(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVW m_curg(R3), g JAL runtime·save_g(SB) MOVW (g_sched+gobuf_sp)(g), R2 // prepare stack as R2 MOVW (g_sched+gobuf_pc)(g), R4 - MOVW R4, -12(R2) - MOVW ctxt+12(FP), R1 - MOVW R1, -8(R2) - MOVW $-12(R2), R29 + MOVW R4, -(12+4)(R2) // "saved LR"; must match frame size + // Gather our arguments into registers. 
+ MOVW fn+0(FP), R5 + MOVW frame+4(FP), R6 + MOVW ctxt+8(FP), R7 + MOVW $-(12+4)(R2), R29 // switch stack; must match frame size + MOVW R5, 4(R29) + MOVW R6, 8(R29) + MOVW R7, 12(R29) JAL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVW 0(R29), R4 MOVW R4, (g_sched+gobuf_pc)(g) - MOVW $12(R29), R2 + MOVW $(12+4)(R29), R2 // must match frame size MOVW R2, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -580,7 +564,7 @@ havem: MOVW m_g0(R3), g JAL runtime·save_g(SB) MOVW (g_sched+gobuf_sp)(g), R29 - MOVW savedsp-8(SP), R2 + MOVW savedsp-12(SP), R2 // must match frame size MOVW R2, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index 603058a61b..dc34c0e4c8 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -651,26 +651,9 @@ g0: MOVW R3, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - MOVD $fn+0(FP), R3 - MOVD R3, FIXED_FRAME+0(R1) - MOVD frame+8(FP), R3 - MOVD R3, FIXED_FRAME+8(R1) - MOVD framesize+16(FP), R3 - MOVD R3, FIXED_FRAME+16(R1) - MOVD ctxt+24(FP), R3 - MOVD R3, FIXED_FRAME+24(R1) - MOVD $runtime·cgocallback_gofunc(SB), R12 - MOVD R12, CTR - BL (CTR) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -721,7 +704,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP). MOVD m_g0(R8), R3 MOVD (g_sched+gobuf_sp)(R3), R4 - MOVD R4, savedsp-16(SP) + MOVD R4, savedsp-24(SP) // must match frame size MOVD R1, (g_sched+gobuf_sp)(R3) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -730,30 +713,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVD m_curg(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -(FIXED_FRAME+16)(R4) - MOVD ctxt+24(FP), R3 - MOVD R3, -16(R4) - MOVD $-(FIXED_FRAME+16)(R4), R1 + MOVD R5, -(24+FIXED_FRAME)(R4) // "saved LR"; must match frame size + // Gather our arguments into registers. 
+ MOVD fn+0(FP), R5 + MOVD frame+8(FP), R6 + MOVD ctxt+16(FP), R7 + MOVD $-(24+FIXED_FRAME)(R4), R1 // switch stack; must match frame size + MOVD R5, FIXED_FRAME+0(R1) + MOVD R6, FIXED_FRAME+8(R1) + MOVD R7, FIXED_FRAME+16(R1) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVD 0(R1), R5 MOVD R5, (g_sched+gobuf_pc)(g) - MOVD $(FIXED_FRAME+16)(R1), R4 + MOVD $(24+FIXED_FRAME)(R1), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -763,7 +746,7 @@ havem: MOVD m_g0(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R1 - MOVD savedsp-16(SP), R4 + MOVD savedsp-24(SP), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s index a136085084..fd01fd6f07 100644 --- a/src/runtime/asm_riscv64.s +++ b/src/runtime/asm_riscv64.s @@ -453,8 +453,9 @@ TEXT runtime·goexit(SB),NOSPLIT|NOFRAME|TOPFRAME,$0-0 // traceback from goexit1 must hit code range of goexit MOV ZERO, ZERO // NOP -// func cgocallback_gofunc(fv uintptr, frame uintptr, framesize, ctxt uintptr) -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32 +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) +// See cgocall.go for more details. +TEXT ·cgocallback(SB),NOSPLIT,$0-24 // TODO(jsing): Add support for cgo - issue #36641. WORD $0 // crash diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s index 46a434119b..7baef37324 100644 --- a/src/runtime/asm_s390x.s +++ b/src/runtime/asm_s390x.s @@ -575,25 +575,9 @@ g0: MOVW R2, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - MOVD $fn+0(FP), R3 - MOVD R3, 8(R15) - MOVD frame+8(FP), R3 - MOVD R3, 16(R15) - MOVD framesize+16(FP), R3 - MOVD R3, 24(R15) - MOVD ctxt+24(FP), R3 - MOVD R3, 32(R15) - MOVD $runtime·cgocallback_gofunc(SB), R3 - BL (R3) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -641,7 +625,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP). MOVD m_g0(R8), R3 MOVD (g_sched+gobuf_sp)(R3), R4 - MOVD R4, savedsp-16(SP) + MOVD R4, savedsp-24(SP) // must match frame size MOVD R15, (g_sched+gobuf_sp)(R3) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -650,30 +634,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). 
+ // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVD m_curg(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -24(R4) - MOVD ctxt+24(FP), R5 - MOVD R5, -16(R4) - MOVD $-24(R4), R15 + MOVD R5, -(24+8)(R4) // "saved LR"; must match frame size + // Gather our arguments into registers. + MOVD fn+0(FP), R1 + MOVD frame+8(FP), R2 + MOVD ctxt+16(FP), R3 + MOVD $-(24+8)(R4), R15 // switch stack; must match frame size + MOVD R1, 8(R15) + MOVD R2, 16(R15) + MOVD R3, 24(R15) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVD 0(R15), R5 MOVD R5, (g_sched+gobuf_pc)(g) - MOVD $24(R15), R4 + MOVD $(24+8)(R15), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -683,7 +667,7 @@ havem: MOVD m_g0(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R15 - MOVD savedsp-16(SP), R4 + MOVD savedsp-24(SP), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s index 1275af136b..67e81adf0b 100644 --- a/src/runtime/asm_wasm.s +++ b/src/runtime/asm_wasm.s @@ -288,9 +288,6 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 TEXT ·asmcgocall(SB), NOSPLIT, $0-0 UNDEF -TEXT ·cgocallback_gofunc(SB), NOSPLIT, $16-32 - UNDEF - #define DISPATCH(NAME, MAXSIZE) \ Get R0; \ I64Const $MAXSIZE; \ @@ -432,7 +429,7 @@ TEXT runtime·goexit(SB), NOSPLIT, $0-0 CALL runtime·goexit1(SB) // does not return UNDEF -TEXT runtime·cgocallback(SB), NOSPLIT, $32-32 +TEXT runtime·cgocallback(SB), NOSPLIT, $0-24 UNDEF // gcWriteBarrier performs a heap pointer write and informs the GC. diff --git a/src/runtime/cgo/asm_386.s b/src/runtime/cgo/asm_386.s index 7293c20bf8..2e7e9512e2 100644 --- a/src/runtime/cgo/asm_386.s +++ b/src/runtime/cgo/asm_386.s @@ -5,8 +5,9 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT,$28-16 MOVL BP, 24(SP) MOVL BX, 20(SP) @@ -15,12 +16,11 @@ TEXT crosscall2(SB),NOSPLIT,$28-16 MOVL ctxt+12(FP), AX MOVL AX, 8(SP) - MOVL n+8(FP), AX - MOVL AX, 4(SP) MOVL a+4(FP), AX - MOVL AX, 0(SP) + MOVL AX, 4(SP) MOVL fn+0(FP), AX - CALL AX + MOVL AX, 0(SP) + CALL runtime·cgocallback(SB) MOVL 12(SP), DI MOVL 16(SP), SI diff --git a/src/runtime/cgo/asm_amd64.s b/src/runtime/cgo/asm_amd64.s index 06c538b9bc..5dc8e2d235 100644 --- a/src/runtime/cgo/asm_amd64.s +++ b/src/runtime/cgo/asm_amd64.s @@ -5,8 +5,10 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. 
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. +// This signature is known to SWIG, so we can't change it. #ifndef GOOS_windows TEXT crosscall2(SB),NOSPLIT,$0x50-0 /* keeps stack pointer 32-byte aligned */ #else @@ -33,11 +35,12 @@ TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */ MOVUPS X14, 0xe0(SP) MOVUPS X15, 0xf0(SP) - MOVQ DX, 0x0(SP) /* arg */ - MOVQ R8, 0x8(SP) /* argsize (includes padding) */ + MOVQ CX, 0x0(SP) /* fn */ + MOVQ DX, 0x8(SP) /* arg */ + // Skip n in R8. MOVQ R9, 0x10(SP) /* ctxt */ - CALL CX /* fn */ + CALL runtime·cgocallback(SB) MOVQ 0x48(SP), DI MOVQ 0x50(SP), SI @@ -52,11 +55,12 @@ TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */ MOVUPS 0xe0(SP), X14 MOVUPS 0xf0(SP), X15 #else - MOVQ SI, 0x0(SP) /* arg */ - MOVQ DX, 0x8(SP) /* argsize (includes padding) */ + MOVQ DI, 0x0(SP) /* fn */ + MOVQ SI, 0x8(SP) /* arg */ + // Skip n in DX. MOVQ CX, 0x10(SP) /* ctxt */ - CALL DI /* fn */ + CALL runtime·cgocallback(SB) #endif MOVQ 0x18(SP), BX diff --git a/src/runtime/cgo/asm_arm.s b/src/runtime/cgo/asm_arm.s index 60132c14a8..ea55e173c1 100644 --- a/src/runtime/cgo/asm_arm.s +++ b/src/runtime/cgo/asm_arm.s @@ -5,51 +5,52 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 - /* - * We still need to save all callee save register as before, and then - * push 3 args for fn (R1, R2, R3). - * Also note that at procedure entry in gc world, 4(R13) will be the - * first arg, so we must push another dummy reg (R0) for 0(R13). - * Additionally, runtime·load_g will clobber R0, so we need to save R0 - * nevertheless. - */ SUB $(8*9), R13 // Reserve space for the floating point registers. - MOVM.WP [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13) + // The C arguments arrive in R0, R1, R2, and R3. We want to + // pass R0, R1, and R3 to Go, so we push those on the stack. + // Also, save C callee-save registers R4-R12. + MOVM.WP [R0, R1, R3, R4, R5, R6, R7, R8, R9, g, R11, R12], (R13) + // Finally, save the link register R14. This also puts the + // arguments we pushed for cgocallback where they need to be, + // starting at 4(R13). + MOVW.W R14, -4(R13) // Skip floating point registers on GOARM < 6. MOVB runtime·goarm(SB), R11 CMP $6, R11 BLT skipfpsave - MOVD F8, (14*4+8*1)(R13) - MOVD F9, (14*4+8*2)(R13) - MOVD F10, (14*4+8*3)(R13) - MOVD F11, (14*4+8*4)(R13) - MOVD F12, (14*4+8*5)(R13) - MOVD F13, (14*4+8*6)(R13) - MOVD F14, (14*4+8*7)(R13) - MOVD F15, (14*4+8*8)(R13) + MOVD F8, (13*4+8*1)(R13) + MOVD F9, (13*4+8*2)(R13) + MOVD F10, (13*4+8*3)(R13) + MOVD F11, (13*4+8*4)(R13) + MOVD F12, (13*4+8*5)(R13) + MOVD F13, (13*4+8*6)(R13) + MOVD F14, (13*4+8*7)(R13) + MOVD F15, (13*4+8*8)(R13) skipfpsave: BL runtime·load_g(SB) - MOVW R15, R14 // R15 is PC. - MOVW 0(R13), R15 + // We set up the arguments to cgocallback when saving registers above. 
+ BL runtime·cgocallback(SB) MOVB runtime·goarm(SB), R11 CMP $6, R11 BLT skipfprest - MOVD (14*4+8*1)(R13), F8 - MOVD (14*4+8*2)(R13), F9 - MOVD (14*4+8*3)(R13), F10 - MOVD (14*4+8*4)(R13), F11 - MOVD (14*4+8*5)(R13), F12 - MOVD (14*4+8*6)(R13), F13 - MOVD (14*4+8*7)(R13), F14 - MOVD (14*4+8*8)(R13), F15 + MOVD (13*4+8*1)(R13), F8 + MOVD (13*4+8*2)(R13), F9 + MOVD (13*4+8*3)(R13), F10 + MOVD (13*4+8*4)(R13), F11 + MOVD (13*4+8*5)(R13), F12 + MOVD (13*4+8*6)(R13), F13 + MOVD (13*4+8*7)(R13), F14 + MOVD (13*4+8*8)(R13), F15 skipfprest: - MOVM.IAW (R13), [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14] + MOVW.P 4(R13), R14 + MOVM.IAW (R13), [R0, R1, R3, R4, R5, R6, R7, R8, R9, g, R11, R12] ADD $(8*9), R13 MOVW R14, R15 diff --git a/src/runtime/cgo/asm_arm64.s b/src/runtime/cgo/asm_arm64.s index ce56f9b1c7..1cb25cf89e 100644 --- a/src/runtime/cgo/asm_arm64.s +++ b/src/runtime/cgo/asm_arm64.s @@ -5,19 +5,20 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 /* * We still need to save all callee save register as before, and then - * push 3 args for fn (R1, R2, R3). + * push 3 args for fn (R0, R1, R3), skipping R2. * Also note that at procedure entry in gc world, 8(RSP) will be the * first arg. * TODO(minux): use LDP/STP here if it matters. */ SUB $(8*24), RSP - MOVD R1, (8*1)(RSP) - MOVD R2, (8*2)(RSP) + MOVD R0, (8*1)(RSP) + MOVD R1, (8*2)(RSP) MOVD R3, (8*3)(RSP) MOVD R19, (8*4)(RSP) MOVD R20, (8*5)(RSP) @@ -40,15 +41,11 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 FMOVD F14, (8*22)(RSP) FMOVD F15, (8*23)(RSP) - MOVD R0, R19 - // Initialize Go ABI environment BL runtime·load_g(SB) - BL (R19) - MOVD (8*1)(RSP), R1 - MOVD (8*2)(RSP), R2 - MOVD (8*3)(RSP), R3 + BL runtime·cgocallback(SB) + MOVD (8*4)(RSP), R19 MOVD (8*5)(RSP), R20 MOVD (8*6)(RSP), R21 diff --git a/src/runtime/cgo/asm_mips64x.s b/src/runtime/cgo/asm_mips64x.s index 1235852dbe..e51cdf3d12 100644 --- a/src/runtime/cgo/asm_mips64x.s +++ b/src/runtime/cgo/asm_mips64x.s @@ -6,14 +6,14 @@ #include "textflag.h" -/* - * void crosscall2(void (*fn)(void*, int32, uintptr), void*, int32, uintptr) - * Save registers and call fn with two arguments. - */ +// Called by C code generated by cmd/cgo. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 /* * We still need to save all callee save register as before, and then - * push 3 args for fn (R5, R6, R7). + * push 3 args for fn (R4, R5, R7), skipping R6. * Also note that at procedure entry in gc world, 8(R29) will be the * first arg. 
*/ @@ -22,9 +22,9 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 #else ADDV $(-8*15), R29 #endif - MOVV R5, (8*1)(R29) // void* - MOVW R6, (8*2)(R29) // int32 - MOVV R7, (8*3)(R29) // uintptr + MOVV R4, (8*1)(R29) // fn unsafe.Pointer + MOVV R5, (8*2)(R29) // a unsafe.Pointer + MOVV R7, (8*3)(R29) // ctxt uintptr MOVV R16, (8*4)(R29) MOVV R17, (8*5)(R29) MOVV R18, (8*6)(R29) @@ -52,7 +52,8 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 SRLV $32, R31, RSB SLLV $32, RSB JAL runtime·load_g(SB) - JAL (R4) + + JAL runtime·cgocallback(SB) MOVV (8*4)(R29), R16 MOVV (8*5)(R29), R17 diff --git a/src/runtime/cgo/asm_mipsx.s b/src/runtime/cgo/asm_mipsx.s index e3090da223..1127c8beb4 100644 --- a/src/runtime/cgo/asm_mipsx.s +++ b/src/runtime/cgo/asm_mipsx.s @@ -6,14 +6,14 @@ #include "textflag.h" -/* - * void crosscall2(void (*fn)(void*, int32, uintptr), void*, int32, uintptr) - * Save registers and call fn with two arguments. - */ +// Called by C code generated by cmd/cgo. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 /* * We still need to save all callee save register as before, and then - * push 3 args for fn (R5, R6, R7). + * push 3 args for fn (R4, R5, R7), skipping R6. * Also note that at procedure entry in gc world, 4(R29) will be the * first arg. */ @@ -25,9 +25,9 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 #else SUBU $(4*14-16), R29 // For soft-float, no FPR. #endif - MOVW R5, (4*1)(R29) - MOVW R6, (4*2)(R29) - MOVW R7, (4*3)(R29) + MOVW R4, (4*1)(R29) // fn unsafe.Pointer + MOVW R5, (4*2)(R29) // a unsafe.Pointer + MOVW R7, (4*3)(R29) // ctxt uintptr MOVW R16, (4*4)(R29) MOVW R17, (4*5)(R29) MOVW R18, (4*6)(R29) @@ -47,7 +47,8 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 MOVD F30, (4*14+8*5)(R29) #endif JAL runtime·load_g(SB) - JAL (R4) + + JAL runtime·cgocallback(SB) MOVW (4*4)(R29), R16 MOVW (4*5)(R29), R17 diff --git a/src/runtime/cgo/asm_ppc64x.s b/src/runtime/cgo/asm_ppc64x.s index 3876f9389c..f4efc1e67d 100644 --- a/src/runtime/cgo/asm_ppc64x.s +++ b/src/runtime/cgo/asm_ppc64x.s @@ -8,8 +8,9 @@ #include "asm_ppc64x.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 // Start with standard C stack frame layout and linkage MOVD LR, R0 @@ -26,19 +27,18 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 BL runtime·reginit(SB) BL runtime·load_g(SB) - MOVD R3, R12 #ifdef GOARCH_ppc64 // ppc64 use elf ABI v1. we must get the real entry address from // first slot of the function descriptor before call. // Same for AIX. 
- MOVD 8(R12), R2 - MOVD (R12), R12 + MOVD 8(R3), R2 + MOVD (R3), R3 #endif - MOVD R12, CTR - MOVD R4, FIXED_FRAME+0(R1) - MOVW R5, FIXED_FRAME+8(R1) - MOVD R6, FIXED_FRAME+16(R1) - BL (CTR) + MOVD R3, FIXED_FRAME+0(R1) // fn unsafe.Pointer + MOVD R4, FIXED_FRAME+8(R1) // a unsafe.Pointer + // Skip R5 = n uint32 + MOVD R6, FIXED_FRAME+16(R1) // ctxt uintptr + BL runtime·cgocallback(SB) ADD $(288+3*8+FIXED_FRAME), R1 diff --git a/src/runtime/cgo/asm_s390x.s b/src/runtime/cgo/asm_s390x.s index 7eab8f652a..8bf16e75e2 100644 --- a/src/runtime/cgo/asm_s390x.s +++ b/src/runtime/cgo/asm_s390x.s @@ -5,8 +5,9 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 // Start with standard C stack frame layout and linkage. @@ -29,10 +30,11 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 // Initialize Go ABI environment. BL runtime·load_g(SB) - MOVD R3, 8(R15) // arg1 - MOVW R4, 16(R15) // arg2 - MOVD R5, 24(R15) // arg3 - BL (R2) // fn(arg1, arg2, arg3) + MOVD R2, 8(R15) // fn unsafe.Pointer + MOVD R3, 16(R15) // a unsafe.Pointer + // Skip R4 = n uint32 + MOVD R5, 24(R15) // ctxt uintptr + BL runtime·cgocallback(SB) FMOVD 32(R15), F8 FMOVD 40(R15), F9 diff --git a/src/runtime/cgo/callbacks.go b/src/runtime/cgo/callbacks.go index 14a218ec92..cd8b795387 100644 --- a/src/runtime/cgo/callbacks.go +++ b/src/runtime/cgo/callbacks.go @@ -9,20 +9,18 @@ import "unsafe" // These utility functions are available to be called from code // compiled with gcc via crosscall2. -// cgocallback is defined in runtime -//go:linkname _runtime_cgocallback runtime.cgocallback -func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr, uintptr) - // The declaration of crosscall2 is: -// void crosscall2(void (*fn)(void *, int), void *, int); +// void crosscall2(void (*fn)(void *), void *, int); // // We need to export the symbol crosscall2 in order to support // callbacks from shared libraries. This applies regardless of // linking mode. // -// Compatibility note: crosscall2 actually takes four arguments, but -// it works to call it with three arguments when calling _cgo_panic. -// That is supported for backward compatibility. +// Compatibility note: SWIG uses crosscall2 in exactly one situation: +// to call _cgo_panic using the pattern shown below. We need to keep +// that pattern working. In particular, crosscall2 actually takes four +// arguments, but it works to call it with three arguments when +// calling _cgo_panic. //go:cgo_export_static crosscall2 //go:cgo_export_dynamic crosscall2 @@ -34,21 +32,18 @@ func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr, uintptr) // crosscall2(_cgo_panic, &a, sizeof a); // /* The function call will not return. */ +// TODO: We should export a regular C function to panic, change SWIG +// to use that instead of the above pattern, and then we can drop +// backwards-compatibility from crosscall2 and stop exporting it. 
+ //go:linkname _runtime_cgo_panic_internal runtime._cgo_panic_internal func _runtime_cgo_panic_internal(p *byte) //go:linkname _cgo_panic _cgo_panic //go:cgo_export_static _cgo_panic //go:cgo_export_dynamic _cgo_panic -//go:nosplit -//go:norace -func _cgo_panic(a unsafe.Pointer, n int32) { - f := _runtime_cgo_panic_internal - type funcval struct { - pc unsafe.Pointer - } - fv := *(**funcval)(unsafe.Pointer(&f)) - _runtime_cgocallback(fv.pc, a, uintptr(n), 0) +func _cgo_panic(a *struct{ cstr *byte }) { + _runtime_cgo_panic_internal(a.cstr) } //go:cgo_import_static x_cgo_init diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index 7ab42a0ed0..9bca279318 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -35,31 +35,25 @@ // cgo writes a gcc-compiled function named GoF (not p.GoF, since gcc doesn't // know about packages). The gcc-compiled C function f calls GoF. // -// GoF calls crosscall2(_cgoexp_GoF, frame, framesize, ctxt). -// Crosscall2 (in cgo/asm_$GOARCH.s) is a four-argument adapter from -// the gcc function call ABI to the gc function call ABI. -// It is called from gcc to call gc functions. In this case it calls -// _cgoexp_GoF(frame, framesize), still running on m.g0's stack -// and outside the $GOMAXPROCS limit. Thus, this code cannot yet -// call arbitrary Go code directly and must be careful not to allocate -// memory or use up m.g0's stack. +// GoF initializes "frame", a structure containing all of its +// arguments and slots for p.GoF's results. It calls +// crosscall2(_cgoexp_GoF, frame, framesize, ctxt) using the gcc ABI. // -// _cgoexp_GoF (generated by cmd/cgo) calls -// runtime.cgocallback(funcPC(p.GoF), frame, framesize, ctxt). -// (The reason for having _cgoexp_GoF instead of writing a crosscall3 -// to make this call directly is that _cgoexp_GoF, because it is compiled -// with gc instead of gcc, can refer to dotted names like -// runtime.cgocallback and p.GoF.) +// crosscall2 (in cgo/asm_$GOARCH.s) is a four-argument adapter from +// the gcc function call ABI to the gc function call ABI. At this +// point we're in the Go runtime, but we're still running on m.g0's +// stack and outside the $GOMAXPROCS limit. crosscall2 calls +// runtime.cgocallback(_cgoexp_GoF, frame, ctxt) using the gc ABI. +// (crosscall2's framesize argument is no longer used, but there's one +// case where SWIG calls crosscall2 directly and expects to pass this +// argument. See _cgo_panic.) // -// runtime.cgocallback (in asm_$GOARCH.s) turns the raw PC of p.GoF -// into a Go function value and calls runtime.cgocallback_gofunc. -// -// runtime.cgocallback_gofunc (in asm_$GOARCH.s) switches from m.g0's -// stack to the original g (m.curg)'s stack, on which it calls -// runtime.cgocallbackg(p.GoF, frame, framesize). -// As part of the stack switch, runtime.cgocallback saves the current -// SP as m.g0.sched.sp, so that any use of m.g0's stack during the -// execution of the callback will be done below the existing stack frames. +// runtime.cgocallback (in asm_$GOARCH.s) switches from m.g0's stack +// to the original g (m.curg)'s stack, on which it calls +// runtime.cgocallbackg(_cgoexp_GoF, frame, ctxt). As part of the +// stack switch, runtime.cgocallback saves the current SP as +// m.g0.sched.sp, so that any use of m.g0's stack during the execution +// of the callback will be done below the existing stack frames. // Before overwriting m.g0.sched.sp, it pushes the old value on the // m.g0 stack, so that it can be restored later. 
// @@ -67,19 +61,26 @@ // stack (not an m.g0 stack). First it calls runtime.exitsyscall, which will // block until the $GOMAXPROCS limit allows running this goroutine. // Once exitsyscall has returned, it is safe to do things like call the memory -// allocator or invoke the Go callback function p.GoF. runtime.cgocallbackg +// allocator or invoke the Go callback function. runtime.cgocallbackg // first defers a function to unwind m.g0.sched.sp, so that if p.GoF // panics, m.g0.sched.sp will be restored to its old value: the m.g0 stack // and the m.curg stack will be unwound in lock step. -// Then it calls p.GoF. Finally it pops but does not execute the deferred -// function, calls runtime.entersyscall, and returns to runtime.cgocallback. +// Then it calls _cgoexp_GoF(frame). +// +// _cgoexp_GoF, which was generated by cmd/cgo, unpacks the arguments +// from frame, calls p.GoF, writes the results back to frame, and +// returns. Now we start unwinding this whole process. +// +// runtime.cgocallbackg pops but does not execute the deferred +// function to unwind m.g0.sched.sp, calls runtime.entersyscall, and +// returns to runtime.cgocallback. // // After it regains control, runtime.cgocallback switches back to // m.g0's stack (the pointer is still in m.g0.sched.sp), restores the old -// m.g0.sched.sp value from the stack, and returns to _cgoexp_GoF. +// m.g0.sched.sp value from the stack, and returns to crosscall2. // -// _cgoexp_GoF immediately returns to crosscall2, which restores the -// callee-save registers for gcc and returns to GoF, which returns to f. +// crosscall2 restores the callee-save registers for gcc and returns +// to GoF, which unpacks any result values and returns to f. package runtime @@ -196,7 +197,7 @@ func cgocall(fn, arg unsafe.Pointer) int32 { // Call from C back to Go. //go:nosplit -func cgocallbackg(ctxt uintptr) { +func cgocallbackg(fn, frame unsafe.Pointer, ctxt uintptr) { gp := getg() if gp != gp.m.curg { println("runtime: bad g in cgocallback") @@ -224,7 +225,7 @@ func cgocallbackg(ctxt uintptr) { osPreemptExtExit(gp.m) - cgocallbackg1(ctxt) + cgocallbackg1(fn, frame, ctxt) // At this point unlockOSThread has been called. // The following code must not change to a different m. @@ -239,7 +240,7 @@ func cgocallbackg(ctxt uintptr) { gp.m.syscall = syscall } -func cgocallbackg1(ctxt uintptr) { +func cgocallbackg1(fn, frame unsafe.Pointer, ctxt uintptr) { gp := getg() if gp.m.needextram || atomic.Load(&extraMWaiters) > 0 { gp.m.needextram = false @@ -283,79 +284,16 @@ func cgocallbackg1(ctxt uintptr) { raceacquire(unsafe.Pointer(&racecgosync)) } - type args struct { - fn *funcval - arg unsafe.Pointer - argsize uintptr - } - var cb *args - - // Location of callback arguments depends on stack frame layout - // and size of stack frame of cgocallback_gofunc. - sp := gp.m.g0.sched.sp - switch GOARCH { - default: - throw("cgocallbackg is unimplemented on arch") - case "arm": - // On arm, stack frame is two words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "arm64": - // On arm64, stack frame is four words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - // Additional two words (16-byte alignment) are for saving FP. - cb = (*args)(unsafe.Pointer(sp + 7*sys.PtrSize)) - case "amd64": - // On amd64, stack frame is two words, plus caller PC and BP. 
- cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "386": - // On 386, stack frame is three words, plus caller PC. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "ppc64", "ppc64le", "s390x": - // On ppc64 and s390x, the callback arguments are in the arguments area of - // cgocallback's stack frame. The stack looks like this: - // +--------------------+------------------------------+ - // | | ... | - // | cgoexp_$fn +------------------------------+ - // | | fixed frame area | - // +--------------------+------------------------------+ - // | | arguments area | - // | cgocallback +------------------------------+ <- sp + 2*minFrameSize + 2*ptrSize - // | | fixed frame area | - // +--------------------+------------------------------+ <- sp + minFrameSize + 2*ptrSize - // | | local variables (2 pointers) | - // | cgocallback_gofunc +------------------------------+ <- sp + minFrameSize - // | | fixed frame area | - // +--------------------+------------------------------+ <- sp - cb = (*args)(unsafe.Pointer(sp + 2*sys.MinFrameSize + 2*sys.PtrSize)) - case "mips64", "mips64le": - // On mips64x, stack frame is two words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "mips", "mipsle": - // On mipsx, stack frame is two words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - } - - // Invoke callback. - // NOTE(rsc): passing nil for argtype means that the copying of the - // results back into cb.arg happens without any corresponding write barriers. - // For cgo, cb.arg points into a C stack frame and therefore doesn't - // hold any pointers that the GC can find anyway - the write barrier - // would be a no-op. - reflectcall(nil, unsafe.Pointer(cb.fn), cb.arg, uint32(cb.argsize), 0) + // Invoke callback. This function is generated by cmd/cgo and + // will unpack the argument frame and call the Go function. + var cb func(frame unsafe.Pointer) + cbFV := funcval{uintptr(fn)} + *(*unsafe.Pointer)(unsafe.Pointer(&cb)) = noescape(unsafe.Pointer(&cbFV)) + cb(frame) if raceenabled { racereleasemerge(unsafe.Pointer(&racecgosync)) } - if msanenabled { - // Tell msan that we wrote to the entire argument block. - // This tells msan that we set the results. - // Since we have already called the function it doesn't - // matter that we are writing to the non-result parameters. - msanwrite(cb.arg, cb.argsize) - } // Do not unwind m->g0->sched.sp. // Our caller, cgocallback, will do that. diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 83d2a524e0..c629fd45f0 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -4243,7 +4243,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) { // First, it may be that the g switch has no PC update, because the SP // either corresponds to a user g throughout (as in asmcgocall) // or because it has been arranged to look like a user g frame - // (as in cgocallback_gofunc). In this case, since the entire + // (as in cgocallback). In this case, since the entire // transition is a g+SP update, a partial transition updating just one of // those will be detected by the stack bounds check. 
// diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go index d3e7762175..b4b8936c7c 100644 --- a/src/runtime/race/output_test.go +++ b/src/runtime/race/output_test.go @@ -309,7 +309,7 @@ Read at 0x[0-9,a-f]+ by main goroutine: Previous write at 0x[0-9,a-f]+ by goroutine [0-9]: main\.goCallback\(\) .*/main\.go:27 \+0x[0-9,a-f]+ - main._cgoexpwrap_[0-9a-z]+_goCallback\(\) + _cgoexp_[0-9a-z]+_goCallback\(\) .*_cgo_gotypes\.go:[0-9]+ \+0x[0-9,a-f]+ Goroutine [0-9] \(running\) created at: diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index 6290142a41..d77cb4d460 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -148,7 +148,13 @@ func noescape(p unsafe.Pointer) unsafe.Pointer { return unsafe.Pointer(x ^ 0) } -func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr) +// Not all cgocallback frames are actually cgocallback, +// so not all have these arguments. Mark them uintptr so that the GC +// does not misinterpret memory when the arguments are not present. +// cgocallback is not called from Go, only from crosscall2. +// This in turn calls cgocallbackg, which is where we'll find +// pointer-declared arguments. +func cgocallback(fn, frame, ctxt uintptr) func gogo(buf *gobuf) func gosave(buf *gobuf) @@ -163,10 +169,11 @@ func breakpoint() // back into arg+retoffset before returning. If copying result bytes back, // the caller should pass the argument frame type as argtype, so that // call can execute appropriate write barriers during the copy. -// Package reflect passes a frame type. In package runtime, there is only -// one call that copies results back, in cgocallbackg1, and it does NOT pass a -// frame type, meaning there are no write barriers invoked. See that call -// site for justification. +// +// Package reflect always passes a frame type. In package runtime, +// Windows callbacks are the only use of this that copies results +// back, and those cannot have pointers in their results, so runtime +// passes nil for the frame type. // // Package reflect accesses this symbol through a linkname. func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32) @@ -187,14 +194,6 @@ type neverCallThisFunction struct{} // prematurely and if there is leftover state it may panic. func goexit(neverCallThisFunction) -// Not all cgocallback_gofunc frames are actually cgocallback_gofunc, -// so not all have these arguments. Mark them uintptr so that the GC -// does not misinterpret memory when the arguments are not present. -// cgocallback_gofunc is not called from go, only from cgocallback, -// so the arguments will be found via cgocallback's pointer-declared arguments. -// See the assembly implementations for more details. -func cgocallback_gofunc(fv, frame, framesize, ctxt uintptr) - // publicationBarrier performs a store/store barrier (a "publication" // or "export" barrier). 
Some form of synchronization is required // between initializing an object and making that object accessible to diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go index 84637376bf..932fba3de0 100644 --- a/src/runtime/symtab.go +++ b/src/runtime/symtab.go @@ -326,7 +326,7 @@ const ( funcID_gcBgMarkWorker funcID_systemstack_switch funcID_systemstack - funcID_cgocallback_gofunc + funcID_cgocallback funcID_gogo funcID_externalthreadhandler funcID_debugCallV1 diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s index 4ac1527ab1..2e5e82879c 100644 --- a/src/runtime/sys_windows_386.s +++ b/src/runtime/sys_windows_386.s @@ -239,7 +239,7 @@ GLOBL runtime·cbctxts(SB), NOPTR, $4 TEXT runtime·callbackasm1(SB),NOSPLIT,$0 MOVL 0(SP), AX // will use to find our callback context - // remove return address from stack, we are not returning there + // remove return address from stack, we are not returning to callbackasm, but to its caller. ADDL $4, SP // address to callback parameters into CX @@ -251,50 +251,35 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 PUSHL BP PUSHL BX - // determine index into runtime·cbctxts table + // Go ABI requires DF flag to be cleared. + CLD + + // determine index into runtime·cbs table SUBL $runtime·callbackasm(SB), AX MOVL $0, DX MOVL $5, BX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long DIVL BX - - // find correspondent runtime·cbctxts table entry - MOVL runtime·cbctxts(SB), BX - MOVL -4(BX)(AX*4), BX - - // extract callback context - MOVL wincallbackcontext_gobody(BX), AX - MOVL wincallbackcontext_argsize(BX), DX - - // preserve whatever's at the memory location that - // the callback will use to store the return value - PUSHL 0(CX)(DX*1) - - // extend argsize by size of return value - ADDL $4, DX - - // remember how to restore stack on return - MOVL wincallbackcontext_restorestack(BX), BX - PUSHL BX - - // call target Go function - PUSHL DX // argsize (including return value) - PUSHL CX // callback parameters - PUSHL AX // address of target Go function - CLD - CALL runtime·cgocallback_gofunc(SB) - POPL AX - POPL CX - POPL DX - - // how to restore stack on return - POPL BX - - // return value into AX (as per Windows spec) - // and restore previously preserved value - MOVL -4(CX)(DX*1), AX - POPL -4(CX)(DX*1) - - MOVL BX, CX // cannot use BX anymore + SUBL $1, AX // subtract 1 because return PC is to the next slot + + // Create a struct callbackArgs on our stack. + SUBL $(12+callbackArgs__size), SP + MOVL AX, (12+callbackArgs_index)(SP) // callback index + MOVL CX, (12+callbackArgs_args)(SP) // address of args vector + MOVL $0, (12+callbackArgs_result)(SP) // result + LEAL 12(SP), AX // AX = &callbackArgs{...} + + // Call cgocallback, which will call callbackWrap(frame). + MOVL $0, 8(SP) // context + MOVL AX, 4(SP) // frame (address of callbackArgs) + LEAL ·callbackWrap(SB), AX + MOVL AX, 0(SP) // PC of function to call + CALL runtime·cgocallback(SB) + + // Get callback result. + MOVL (12+callbackArgs_result)(SP), AX + // Get popRet. 
+ MOVL (12+callbackArgs_retPop)(SP), CX // Can't use a callee-save register + ADDL $(12+callbackArgs__size), SP // restore registers as required for windows callback POPL BX diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index 847542592b..e9ec99a51d 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -291,31 +291,20 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 MOVQ DX, (16+8)(SP) MOVQ R8, (16+16)(SP) MOVQ R9, (16+24)(SP) + // R8 = address of args vector + LEAQ (16+0)(SP), R8 - // remove return address from stack, we are not returning there + // remove return address from stack, we are not returning to callbackasm, but to its caller. MOVQ 0(SP), AX ADDQ $8, SP - // determine index into runtime·cbctxts table + // determine index into runtime·cbs table MOVQ $runtime·callbackasm(SB), DX SUBQ DX, AX MOVQ $0, DX MOVQ $5, CX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long DIVL CX - - // find correspondent runtime·cbctxts table entry - MOVQ runtime·cbctxts(SB), CX - MOVQ -8(CX)(AX*8), AX - - // extract callback context - MOVQ wincallbackcontext_argsize(AX), DX - MOVQ wincallbackcontext_gobody(AX), AX - - // preserve whatever's at the memory location that - // the callback will use to store the return value - LEAQ 8(SP), CX // args vector, skip return address - PUSHQ 0(CX)(DX*1) // store 8 bytes from just after the args array - ADDQ $8, DX // extend argsize by size of return value + SUBQ $1, AX // subtract 1 because return PC is to the next slot // DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved // as required by windows callback convention. @@ -330,18 +319,25 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 MOVQ R14, 8(SP) MOVQ R15, 0(SP) - // prepare call stack. use SUBQ to hide from stack frame checks - // cgocallback(Go func, void *frame, uintptr framesize) - SUBQ $24, SP - MOVQ DX, 16(SP) // argsize (including return value) - MOVQ CX, 8(SP) // callback parameters - MOVQ AX, 0(SP) // address of target Go function + // Go ABI requires DF flag to be cleared. CLD - CALL runtime·cgocallback_gofunc(SB) - MOVQ 0(SP), AX - MOVQ 8(SP), CX - MOVQ 16(SP), DX - ADDQ $24, SP + + // Create a struct callbackArgs on our stack to be passed as + // the "frame" to cgocallback and on to callbackWrap. + SUBQ $(24+callbackArgs__size), SP + MOVQ AX, (24+callbackArgs_index)(SP) // callback index + MOVQ R8, (24+callbackArgs_args)(SP) // address of args vector + MOVQ $0, (24+callbackArgs_result)(SP) // result + LEAQ 24(SP), AX + // Call cgocallback, which will call callbackWrap(frame). + MOVQ $0, 16(SP) // context + MOVQ AX, 8(SP) // frame (address of callbackArgs) + LEAQ ·callbackWrap(SB), BX + MOVQ BX, 0(SP) // PC of function value to call (callbackWrap) + CALL ·cgocallback(SB) + // Get callback result. + MOVQ (24+callbackArgs_result)(SP), AX + ADDQ $(24+callbackArgs__size), SP // restore registers as required for windows callback MOVQ 0(SP), R15 @@ -355,8 +351,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 ADDQ $64, SP POPFQ - MOVQ -8(CX)(DX*1), AX // return value - POPQ -8(CX)(DX*1) // restore bytes just after the args + // The return value was placed in AX above. 
RET // uint32 tstart_stdcall(M *newm); diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s index 57415e1306..3fc6d27cb0 100644 --- a/src/runtime/sys_windows_arm.s +++ b/src/runtime/sys_windows_arm.s @@ -314,6 +314,9 @@ TEXT runtime·externalthreadhandler(SB),NOSPLIT|NOFRAME,$0 GLOBL runtime·cbctxts(SB), NOPTR, $4 TEXT runtime·callbackasm1(SB),NOSPLIT|NOFRAME,$0 + // TODO(austin): This needs to be converted to match changes + // in cgocallback, but I have no way to test. See CL 258938, + // and callbackasm1 on amd64 and 386. MOVM.DB.W [R4-R11, R14], (R13) // push {r4-r11, lr} SUB $36, R13 // space for locals diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go index 0e2fcfb02d..ff43e7cbed 100644 --- a/src/runtime/syscall_windows.go +++ b/src/runtime/syscall_windows.go @@ -5,6 +5,7 @@ package runtime import ( + "runtime/internal/sys" "unsafe" ) @@ -22,10 +23,7 @@ func (c *wincallbackcontext) setCleanstack(cleanstack bool) { c.cleanstack = cleanstack } -var ( - cbs callbacks - cbctxts **wincallbackcontext = &cbs.ctxt[0] // to simplify access to cbs.ctxt in sys_windows_*.s -) +var cbs callbacks func callbackasm() @@ -53,6 +51,8 @@ func callbackasmAddr(i int) uintptr { return funcPC(callbackasm) + uintptr(i*entrySize) } +const callbackMaxArgs = 64 + //go:linkname compileCallback syscall.compileCallback func compileCallback(fn eface, cleanstack bool) (code uintptr) { if fn._type == nil || (fn._type.kind&kindMask) != kindFunc { @@ -66,6 +66,9 @@ func compileCallback(fn eface, cleanstack bool) (code uintptr) { if ft.out()[0].size != uintptrSize { panic("compileCallback: expected function with one uintptr-sized result") } + if len(ft.in()) > callbackMaxArgs { + panic("compileCallback: too many function arguments") + } argsize := uintptr(0) for _, t := range ft.in() { if t.size > uintptrSize { @@ -106,6 +109,37 @@ func compileCallback(fn eface, cleanstack bool) (code uintptr) { return r } +type callbackArgs struct { + index uintptr + args *uintptr // Arguments in stdcall/cdecl convention, with registers spilled + // Below are out-args from callbackWrap + result uintptr + retPop uintptr // For 386 cdecl, how many bytes to pop on return +} + +// callbackWrap is called by callbackasm to invoke a registered C callback. +func callbackWrap(a *callbackArgs) { + c := cbs.ctxt[a.index] + a.retPop = c.restorestack + + // Convert from stdcall to Go ABI. We assume the stack layout + // is the same, and we just need to make room for the result. + // + // TODO: This isn't a good assumption. For example, a function + // that takes two uint16 arguments will be laid out + // differently by the stdcall and Go ABIs. We should implement + // proper ABI conversion. + var frame [callbackMaxArgs + 1]uintptr + memmove(unsafe.Pointer(&frame), unsafe.Pointer(a.args), c.argsize) + + // Even though this is copying back results, we can pass a nil + // type because those results must not require write barriers. + reflectcall(nil, c.gobody, noescape(unsafe.Pointer(&frame)), sys.PtrSize+uint32(c.argsize), uint32(c.argsize)) + + // Extract the result. 
+ a.result = frame[c.argsize/sys.PtrSize] +} + const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 // When available, this function will use LoadLibraryEx with the filename diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 94f4a44976..f3df152535 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -450,7 +450,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in } n++ - if f.funcID == funcID_cgocallback_gofunc && len(cgoCtxt) > 0 { + if f.funcID == funcID_cgocallback && len(cgoCtxt) > 0 { ctxt := cgoCtxt[len(cgoCtxt)-1] cgoCtxt = cgoCtxt[:len(cgoCtxt)-1] -- cgit v1.2.1 From c91dffbc9aeaacd087eb0c0c3f718739bc5f8c4a Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Wed, 7 Oct 2020 22:53:52 -0400 Subject: runtime: tidy cgocallback On amd64 and 386, we have a very roundabout way of remembering that we need to dropm on return that currently involves saving a zero to needm's argument slot and later bringing it back. Just store the zero. This also makes amd64 and 386 more consistent with cgocallback on all other platforms: rather than saving the old M to the G stack, they now save it to a named slot on the G0 stack. The needm function no longer needs a dummy argument to get the SP, so we drop that. Change-Id: I7e84bb4a5ff9552de70dcf41d8accf02310535e7 Reviewed-on: https://go-review.googlesource.com/c/go/+/263268 Trust: Austin Clements Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/runtime/asm_386.s | 18 +++++++----------- src/runtime/asm_amd64.s | 16 ++++++---------- src/runtime/proc.go | 6 +++--- src/runtime/signal_unix.go | 6 +++--- 4 files changed, 19 insertions(+), 27 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index a54b68e03d..fa3b1be339 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -704,7 +704,7 @@ nosave: // cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback(SB),NOSPLIT,$16-12 // Frame size must match commented places below +TEXT ·cgocallback(SB),NOSPLIT,$12-12 // Frame size must match commented places below NO_LOCAL_POINTERS // If g is nil, Go did not create the current thread. @@ -722,13 +722,12 @@ TEXT ·cgocallback(SB),NOSPLIT,$16-12 // Frame size must match commented places CMPL BP, $0 JEQ needm MOVL g_m(BP), BP - MOVL BP, DX // saved copy of oldm + MOVL BP, savedm-4(SP) // saved copy of oldm JMP havem needm: - MOVL $0, 0(SP) MOVL $runtime·needm(SB), AX CALL AX - MOVL 0(SP), DX + MOVL $0, savedm-4(SP) // dropm on return get_tls(CX) MOVL g(CX), BP MOVL g_m(BP), BP @@ -769,8 +768,6 @@ havem: // Once we switch to the curg stack, the pushed PC will appear // to be the return PC of cgocallback, so that the traceback // will seamlessly trace back into the earlier calls. - // - // In the new goroutine, 12(SP) holds the saved oldm (DX) register. MOVL m_curg(BP), SI MOVL SI, g(CX) MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI @@ -780,20 +777,18 @@ havem: MOVL fn+0(FP), AX MOVL frame+4(FP), BX MOVL ctxt+8(FP), CX - LEAL -(4+16)(DI), SP // Must match declared frame size - MOVL DX, 12(SP) + LEAL -(4+12)(DI), SP // Must match declared frame size MOVL AX, 0(SP) MOVL BX, 4(SP) MOVL CX, 8(SP) CALL runtime·cgocallbackg(SB) - MOVL 12(SP), DX // Restore g->sched (== m->curg->sched) from saved values. 
get_tls(CX) MOVL g(CX), SI - MOVL 16(SP), BP // Must match declared frame size + MOVL 12(SP), BP // Must match declared frame size MOVL BP, (g_sched+gobuf_pc)(SI) - LEAL (16+4)(SP), DI // Must match declared frame size + LEAL (12+4)(SP), DI // Must match declared frame size MOVL DI, (g_sched+gobuf_sp)(SI) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -809,6 +804,7 @@ havem: // If the m on entry was nil, we called needm above to borrow an m // for the duration of the call. Since the call is over, return it with dropm. + MOVL savedm-4(SP), DX CMPL DX, $0 JNE 3(PC) MOVL $runtime·dropm(SB), AX diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 3d5d9c4d58..19a3bb2d7d 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -693,7 +693,7 @@ nosave: // func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback(SB),NOSPLIT,$32-24 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // If g is nil, Go did not create the current thread. @@ -711,13 +711,12 @@ TEXT ·cgocallback(SB),NOSPLIT,$32-24 CMPQ BX, $0 JEQ needm MOVQ g_m(BX), BX - MOVQ BX, R8 // holds oldm until end of function + MOVQ BX, savedm-8(SP) // saved copy of oldm JMP havem needm: - MOVQ $0, 0(SP) - MOVQ $runtime·needm(SB), AX + MOVQ $runtime·needm(SB), AX CALL AX - MOVQ 0(SP), R8 + MOVQ $0, savedm-8(SP) // dropm on return get_tls(CX) MOVQ g(CX), BX MOVQ g_m(BX), BX @@ -758,8 +757,6 @@ havem: // Once we switch to the curg stack, the pushed PC will appear // to be the return PC of cgocallback, so that the traceback // will seamlessly trace back into the earlier calls. - // - // In the new goroutine, 24(SP) holds the saved R8. MOVQ m_curg(BX), SI MOVQ SI, g(CX) MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI @@ -776,12 +773,10 @@ havem: SUBQ AX, DI // Allocate the same frame size on the g stack MOVQ DI, SP - MOVQ R8, 24(SP) MOVQ BX, 0(SP) MOVQ CX, 8(SP) MOVQ DX, 16(SP) CALL runtime·cgocallbackg(SB) - MOVQ 24(SP), R8 // Compute the size of the frame again. FP and SP have // completely different values here than they did above, @@ -811,7 +806,8 @@ havem: // If the m on entry was nil, we called needm above to borrow an m // for the duration of the call. Since the call is over, return it with dropm. - CMPQ R8, $0 + MOVQ savedm-8(SP), BX + CMPQ BX, $0 JNE 3(PC) MOVQ $runtime·dropm(SB), AX CALL AX diff --git a/src/runtime/proc.go b/src/runtime/proc.go index c629fd45f0..ec4e6d8751 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1695,7 +1695,7 @@ func allocm(_p_ *p, fn func(), id int64) *m { // When the callback is done with the m, it calls dropm to // put the m back on the list. //go:nosplit -func needm(x byte) { +func needm() { if (iscgo || GOOS == "windows") && !cgoHasExtraM { // Can happen if C/C++ code calls Go from a global ctor. // Can also happen on Windows if a global ctor uses a @@ -1740,8 +1740,8 @@ func needm(x byte) { // which is more than enough for us. setg(mp.g0) _g_ := getg() - _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024 - _g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024 + _g_.stack.hi = getcallersp() + 1024 + _g_.stack.lo = getcallersp() - 32*1024 _g_.stackguard0 = _g_.stack.lo + _StackGuard // Initialize this thread to use the m. 
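The needm hunk above now derives the borrowed thread's guessed stack bounds from getcallersp() instead of the address of a dummy byte argument. As a rough standalone sketch of that bound-guessing step (illustrative only: guessStackBounds, its constants, and the sample address are not runtime names or values), the calculation amounts to:

	package main

	import "fmt"

	// guessStackBounds mimics how needm picks conservative bounds from a
	// single known stack address: assume little lives above it and at
	// most 32 KiB below it.
	func guessStackBounds(sp uintptr) (lo, hi uintptr) {
		hi = sp + 1024
		lo = sp - 32*1024
		return lo, hi
	}

	func main() {
		lo, hi := guessStackBounds(0x7fff0000)
		fmt.Printf("lo=%#x hi=%#x\n", lo, hi)
	}

The bounds remain deliberately conservative guesses; as the surrounding proc.go comment notes, 32 KiB is assumed to be more than enough for a C-created thread.
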
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index e8b6f95d8f..9318a9b8bc 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -504,14 +504,14 @@ func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool { sigaltstack(nil, &st) if st.ss_flags&_SS_DISABLE != 0 { setg(nil) - needm(0) + needm() noSignalStack(sig) dropm() } stsp := uintptr(unsafe.Pointer(st.ss_sp)) if sp < stsp || sp >= stsp+st.ss_size { setg(nil) - needm(0) + needm() sigNotOnStack(sig) dropm() } @@ -951,7 +951,7 @@ func badsignal(sig uintptr, c *sigctxt) { exit(2) *(*uintptr)(unsafe.Pointer(uintptr(123))) = 2 } - needm(0) + needm() if !sigsend(uint32(sig)) { // A foreign thread received the signal sig, and the // Go code does not want to handle it. -- cgit v1.2.1 From bda37a0b8a4e89318901a68492b79cf6531fa2ff Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Sat, 3 Oct 2020 19:52:08 -0400 Subject: runtime: tidy compileCallback This makes a few minor cleanups and simplifications to compileCallback. Change-Id: Ibebf4b5ed66fb68bba7c84129c127cd4d8a691fe Reviewed-on: https://go-review.googlesource.com/c/go/+/263269 Trust: Austin Clements Trust: Alex Brainman Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Alex Brainman --- src/runtime/runtime2.go | 8 ---- src/runtime/syscall_windows.go | 86 ++++++++++++++++++++++++++---------------- 2 files changed, 53 insertions(+), 41 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 21dd7b3949..7bac5fd38d 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -387,14 +387,6 @@ type libcall struct { err uintptr // error number } -// describes how to handle callback -type wincallbackcontext struct { - gobody unsafe.Pointer // go function to call - argsize uintptr // callback arguments size (in bytes) - restorestack uintptr // adjust stack on return by (in bytes) (386 only) - cleanstack bool -} - // Stack describes a Go execution stack. // The bounds of the stack are exactly [lo, hi), // with no implicit data structures on either side. diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go index ff43e7cbed..3a34d9ddba 100644 --- a/src/runtime/syscall_windows.go +++ b/src/runtime/syscall_windows.go @@ -9,22 +9,26 @@ import ( "unsafe" ) -type callbacks struct { - lock mutex - ctxt [cb_max]*wincallbackcontext - n int +// cbs stores all registered Go callbacks. +var cbs struct { + lock mutex + ctxt [cb_max]winCallback + index map[winCallbackKey]int + n int } -func (c *wincallbackcontext) isCleanstack() bool { - return c.cleanstack +// winCallback records information about a registered Go callback. +type winCallback struct { + fn *funcval // Go function + argsize uintptr // Callback arguments size (in bytes) + cdecl bool // C function uses cdecl calling convention } -func (c *wincallbackcontext) setCleanstack(cleanstack bool) { - c.cleanstack = cleanstack +type winCallbackKey struct { + fn *funcval + cdecl bool } -var cbs callbacks - func callbackasm() // callbackasmAddr returns address of runtime.callbackasm @@ -53,8 +57,20 @@ func callbackasmAddr(i int) uintptr { const callbackMaxArgs = 64 +// compileCallback converts a Go function fn into a C function pointer +// that can be passed to Windows APIs. +// +// On 386, if cdecl is true, the returned C function will use the +// cdecl calling convention; otherwise, it will use stdcall. On amd64, +// it always uses fastcall. On arm, it always uses the ARM convention. 
+// //go:linkname compileCallback syscall.compileCallback -func compileCallback(fn eface, cleanstack bool) (code uintptr) { +func compileCallback(fn eface, cdecl bool) (code uintptr) { + if GOARCH != "386" { + // cdecl is only meaningful on 386. + cdecl = false + } + if fn._type == nil || (fn._type.kind&kindMask) != kindFunc { panic("compileCallback: expected function with one uintptr-sized result") } @@ -77,36 +93,32 @@ func compileCallback(fn eface, cleanstack bool) (code uintptr) { argsize += uintptrSize } + key := winCallbackKey{(*funcval)(fn.data), cdecl} + lock(&cbs.lock) // We don't unlock this in a defer because this is used from the system stack. - n := cbs.n - for i := 0; i < n; i++ { - if cbs.ctxt[i].gobody == fn.data && cbs.ctxt[i].isCleanstack() == cleanstack { - r := callbackasmAddr(i) - unlock(&cbs.lock) - return r - } - } - if n >= cb_max { + // Check if this callback is already registered. + if n, ok := cbs.index[key]; ok { unlock(&cbs.lock) - throw("too many callback functions") + return callbackasmAddr(n) } - c := new(wincallbackcontext) - c.gobody = fn.data - c.argsize = argsize - c.setCleanstack(cleanstack) - if cleanstack && argsize != 0 { - c.restorestack = argsize - } else { - c.restorestack = 0 + // Register the callback. + if cbs.index == nil { + cbs.index = make(map[winCallbackKey]int) } + n := cbs.n + if n >= len(cbs.ctxt) { + unlock(&cbs.lock) + throw("too many callback functions") + } + c := winCallback{key.fn, argsize, cdecl} cbs.ctxt[n] = c + cbs.index[key] = n cbs.n++ - r := callbackasmAddr(n) unlock(&cbs.lock) - return r + return callbackasmAddr(n) } type callbackArgs struct { @@ -120,7 +132,15 @@ type callbackArgs struct { // callbackWrap is called by callbackasm to invoke a registered C callback. func callbackWrap(a *callbackArgs) { c := cbs.ctxt[a.index] - a.retPop = c.restorestack + if GOARCH == "386" { + if c.cdecl { + // In cdecl, the callee is responsible for + // popping its arguments. + a.retPop = c.argsize + } else { + a.retPop = 0 + } + } // Convert from stdcall to Go ABI. We assume the stack layout // is the same, and we just need to make room for the result. @@ -134,7 +154,7 @@ func callbackWrap(a *callbackArgs) { // Even though this is copying back results, we can pass a nil // type because those results must not require write barriers. - reflectcall(nil, c.gobody, noescape(unsafe.Pointer(&frame)), sys.PtrSize+uint32(c.argsize), uint32(c.argsize)) + reflectcall(nil, unsafe.Pointer(c.fn), noescape(unsafe.Pointer(&frame)), sys.PtrSize+uint32(c.argsize), uint32(c.argsize)) // Extract the result. a.result = frame[c.argsize/sys.PtrSize] -- cgit v1.2.1 From 614a8b7c8ad42ff8a9bc363f813af2aae046fd0c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Fri, 16 Oct 2020 22:22:20 -0400 Subject: runtime: tidy Windows callback test This simplifies the systematic test of Windows callbacks with different signatures and prepares it for expanded coverage of function signatures. It now returns a result from the Go function and threads it back through C. This simplifies things, but also previously the code could have succeeded by simply not calling the callbacks at all (though other tests would have caught that). It bundles together the C function description and the Go function it's intended to call. Now the test source generation and the test running both loop over a single slice of test functions. Since the C function and Go function are now bundled, it generates the C function by reflectively inspecting the signature of the Go function. 
For the moment, we keep the same test suite, which is entirely functions with "uintptr" arguments, but we'll expand this shortly. It now use sub-tests. This way tests automatically get useful diagnostic labels in failures and the tests don't have to catch panics on their own. It eliminates the DLL function argument. I honestly couldn't figure out what the point of this was, and it added what appeared to be an unnecessary loop level to the tests. Change-Id: I120dfd4785057cc2c392bd2c821302f276bd128e Reviewed-on: https://go-review.googlesource.com/c/go/+/263270 Trust: Austin Clements Trust: Alex Brainman Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Alex Brainman --- src/runtime/syscall_windows_test.go | 224 +++++++++++++++--------------------- 1 file changed, 90 insertions(+), 134 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go index 2e74546e38..cb942beb3e 100644 --- a/src/runtime/syscall_windows_test.go +++ b/src/runtime/syscall_windows_test.go @@ -9,11 +9,13 @@ import ( "fmt" "internal/syscall/windows/sysdll" "internal/testenv" + "io" "io/ioutil" "math" "os" "os/exec" "path/filepath" + "reflect" "runtime" "strconv" "strings" @@ -285,99 +287,85 @@ func TestCallbackInAnotherThread(t *testing.T) { } } -type cbDLLFunc int // int determines number of callback parameters - -func (f cbDLLFunc) stdcallName() string { - return fmt.Sprintf("stdcall%d", f) +type cbFunc struct { + goFunc interface{} } -func (f cbDLLFunc) cdeclName() string { - return fmt.Sprintf("cdecl%d", f) +func (f cbFunc) cName(cdecl bool) string { + name := "stdcall" + if cdecl { + name = "cdecl" + } + t := reflect.TypeOf(f.goFunc) + for i := 0; i < t.NumIn(); i++ { + name += "_" + t.In(i).Name() + } + return name } -func (f cbDLLFunc) buildOne(stdcall bool) string { - var funcname, attr string - if stdcall { - funcname = f.stdcallName() - attr = "__stdcall" - } else { - funcname = f.cdeclName() +func (f cbFunc) cSrc(w io.Writer, cdecl bool) { + // Construct a C function that takes a callback with + // f.goFunc's signature, and calls it with integers 1..N. 
+ funcname := f.cName(cdecl) + attr := "__stdcall" + if cdecl { attr = "__cdecl" } typename := "t" + funcname - p := make([]string, f) - for i := range p { - p[i] = "uintptr_t" - } - params := strings.Join(p, ",") - for i := range p { - p[i] = fmt.Sprintf("%d", i+1) - } - args := strings.Join(p, ",") - return fmt.Sprintf(` -typedef void %s (*%s)(%s); -void %s(%s f, uintptr_t n) { - uintptr_t i; - for(i=0;i\n\n" + f.buildOne(false) + f.buildOne(true) +func (f cbFunc) testOne(t *testing.T, dll *syscall.DLL, cdecl bool, cb uintptr) { + r1, _, _ := dll.MustFindProc(f.cName(cdecl)).Call(cb) + + want := 0 + for i := 0; i < reflect.TypeOf(f.goFunc).NumIn(); i++ { + want += i + 1 + } + if int(r1) != want { + t.Errorf("wanted result %d; got %d", want, r1) + } } -var cbFuncs = [...]interface{}{ - 2: func(i1, i2 uintptr) uintptr { - if i1+i2 != 3 { - panic("bad input") - } - return 0 - }, - 3: func(i1, i2, i3 uintptr) uintptr { - if i1+i2+i3 != 6 { - panic("bad input") - } - return 0 - }, - 4: func(i1, i2, i3, i4 uintptr) uintptr { - if i1+i2+i3+i4 != 10 { - panic("bad input") - } - return 0 - }, - 5: func(i1, i2, i3, i4, i5 uintptr) uintptr { - if i1+i2+i3+i4+i5 != 15 { - panic("bad input") - } - return 0 - }, - 6: func(i1, i2, i3, i4, i5, i6 uintptr) uintptr { - if i1+i2+i3+i4+i5+i6 != 21 { - panic("bad input") - } - return 0 - }, - 7: func(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr { - if i1+i2+i3+i4+i5+i6+i7 != 28 { - panic("bad input") - } - return 0 - }, - 8: func(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr { - if i1+i2+i3+i4+i5+i6+i7+i8 != 36 { - panic("bad input") - } - return 0 - }, - 9: func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr { - if i1+i2+i3+i4+i5+i6+i7+i8+i9 != 45 { - panic("bad input") - } - return 0 - }, +var cbFuncs = []cbFunc{ + {func(i1, i2 uintptr) uintptr { + return i1 + i2 + }}, + {func(i1, i2, i3 uintptr) uintptr { + return i1 + i2 + i3 + }}, + {func(i1, i2, i3, i4 uintptr) uintptr { + return i1 + i2 + i3 + i4 + }}, + {func(i1, i2, i3, i4, i5 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + }}, + {func(i1, i2, i3, i4, i5, i6 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + }}, + {func(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + i7 + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + }}, } type cbDLL struct { @@ -385,21 +373,23 @@ type cbDLL struct { buildArgs func(out, src string) []string } -func (d *cbDLL) buildSrc(t *testing.T, path string) { +func (d *cbDLL) makeSrc(t *testing.T, path string) { f, err := os.Create(path) if err != nil { t.Fatalf("failed to create source file: %v", err) } defer f.Close() - for i := 2; i < 10; i++ { - fmt.Fprint(f, cbDLLFunc(i).build()) + fmt.Fprintf(f, "#include \n\n") + for _, cbf := range cbFuncs { + cbf.cSrc(f, false) + cbf.cSrc(f, true) } } func (d *cbDLL) build(t *testing.T, dir string) string { srcname := d.name + ".c" - d.buildSrc(t, filepath.Join(dir, srcname)) + d.makeSrc(t, filepath.Join(dir, srcname)) outname := d.name + ".dll" args := d.buildArgs(outname, srcname) cmd := exec.Command(args[0], args[1:]...) 
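Between these hunks, it may help to see the reflection step that cbFunc.cName (earlier in this diff) relies on in isolation. The following standalone program is illustrative only: cNameFor and the sample callback are inventions for this sketch, not part of the test.

	package main

	import (
		"fmt"
		"reflect"
	)

	// cNameFor builds a C-side symbol name from a Go callback's parameter
	// types, mirroring the naming scheme used by the test's cbFunc.cName.
	func cNameFor(goFunc interface{}, cdecl bool) string {
		name := "stdcall"
		if cdecl {
			name = "cdecl"
		}
		t := reflect.TypeOf(goFunc)
		for i := 0; i < t.NumIn(); i++ {
			name += "_" + t.In(i).Name()
		}
		return name
	}

	func main() {
		cb := func(a, b uintptr) uintptr { return a + b }
		fmt.Println(cNameFor(cb, false)) // stdcall_uintptr_uintptr
		fmt.Println(cNameFor(cb, true))  // cdecl_uintptr_uintptr
	}

Since each parameter contributes its Go type name, every entry in cbFuncs gets its own stdcall and cdecl entry point in the generated DLL.
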
@@ -426,51 +416,6 @@ var cbDLLs = []cbDLL{ }, } -type cbTest struct { - n int // number of callback parameters - param uintptr // dll function parameter -} - -func (test *cbTest) run(t *testing.T, dllpath string) { - dll := syscall.MustLoadDLL(dllpath) - defer dll.Release() - cb := cbFuncs[test.n] - stdcall := syscall.NewCallback(cb) - f := cbDLLFunc(test.n) - test.runOne(t, dll, f.stdcallName(), stdcall) - cdecl := syscall.NewCallbackCDecl(cb) - test.runOne(t, dll, f.cdeclName(), cdecl) -} - -func (test *cbTest) runOne(t *testing.T, dll *syscall.DLL, proc string, cb uintptr) { - defer func() { - if r := recover(); r != nil { - t.Errorf("dll call %v(..., %d) failed: %v", proc, test.param, r) - } - }() - dll.MustFindProc(proc).Call(cb, test.param) -} - -var cbTests = []cbTest{ - {2, 1}, - {2, 10000}, - {3, 3}, - {4, 5}, - {4, 6}, - {5, 2}, - {6, 7}, - {6, 8}, - {7, 6}, - {8, 1}, - {9, 8}, - {9, 10000}, - {3, 4}, - {5, 3}, - {7, 7}, - {8, 2}, - {9, 9}, -} - func TestStdcallAndCDeclCallbacks(t *testing.T) { if _, err := exec.LookPath("gcc"); err != nil { t.Skip("skipping test: gcc is missing") @@ -482,10 +427,21 @@ func TestStdcallAndCDeclCallbacks(t *testing.T) { defer os.RemoveAll(tmp) for _, dll := range cbDLLs { - dllPath := dll.build(t, tmp) - for _, test := range cbTests { - test.run(t, dllPath) - } + t.Run(dll.name, func(t *testing.T) { + dllPath := dll.build(t, tmp) + dll := syscall.MustLoadDLL(dllPath) + defer dll.Release() + for _, cbf := range cbFuncs { + t.Run(cbf.cName(false), func(t *testing.T) { + stdcall := syscall.NewCallback(cbf.goFunc) + cbf.testOne(t, dll, false, stdcall) + }) + t.Run(cbf.cName(true), func(t *testing.T) { + cdecl := syscall.NewCallbackCDecl(cbf.goFunc) + cbf.testOne(t, dll, true, cdecl) + }) + } + }) } } -- cgit v1.2.1 From 532c199ee56cdbc2cfd12da1c1cfb3359b122c7c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Sat, 17 Oct 2020 18:42:03 -0400 Subject: runtime: fix sub-uintptr-sized Windows callback arguments The Windows callback support accepts Go functions with arguments that are uintptr-sized or smaller. However, it doesn't implement smaller arguments correctly. It assumes the Windows arguments layout is equivalent to the Go argument layout. This is often true, but because Windows C ABIs pad arguments to word size, while Go packs arguments, the layout is different if there are multiple sub-word-size arguments in a row. For example, a function with two uint16 arguments will have a two-word C argument frame, but only a 4 byte Go argument frame. There are also subtleties surrounding floating-point register arguments that it doesn't handle correctly. To fix this, when constructing a callback, we examine the Go function's signature to construct a mapping between the C argument frame and the Go argument frame. When the callback is invoked, we use this mapping to build the Go argument frame and copy the result back. This adds several test cases to TestStdcallAndCDeclCallbacks that exercise more complex function signatures. These all fail with the current code, but work with this CL. In addition to fixing these callback types, this is also a step toward the Go register ABI (#40724), which is going to make the ABI translation more complex. 
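To make the padding mismatch concrete, here is a toy standalone program, not part of this CL and with all names invented for the example, that contrasts the two frame sizes for a callback taking two uint16 arguments on 64-bit Windows:

	package main

	import (
		"fmt"
		"unsafe"
	)

	func main() {
		word := unsafe.Sizeof(uintptr(0)) // 8 on amd64
		// C side: stdcall/cdecl/fastcall pad each argument to a word,
		// so two uint16 arguments occupy two words on the C stack.
		cFrame := 2 * word
		// Go side: the Go ABI packs arguments, so the same two uint16s
		// occupy only four bytes of the Go argument frame.
		goFrame := 2 * unsafe.Sizeof(uint16(0))
		fmt.Println(cFrame, goFrame) // 16 4 on amd64
	}

Bridging exactly this gap is what the per-argument copy plan (abiMap) built in the diff below is for.
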
Change-Id: I19fb1681b659d9fd528ffd5e88912bebb95da052 Reviewed-on: https://go-review.googlesource.com/c/go/+/263271 Trust: Austin Clements Trust: Alex Brainman Run-TryBot: Austin Clements TryBot-Result: Go Bot Reviewed-by: Michael Knyszek Reviewed-by: Alex Brainman --- src/runtime/syscall_windows.go | 143 +++++++++++++++++++++++++++--------- src/runtime/syscall_windows_test.go | 32 +++++++- 2 files changed, 136 insertions(+), 39 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go index 3a34d9ddba..21f2452b5a 100644 --- a/src/runtime/syscall_windows.go +++ b/src/runtime/syscall_windows.go @@ -19,9 +19,32 @@ var cbs struct { // winCallback records information about a registered Go callback. type winCallback struct { - fn *funcval // Go function - argsize uintptr // Callback arguments size (in bytes) - cdecl bool // C function uses cdecl calling convention + fn *funcval // Go function + retPop uintptr // For 386 cdecl, how many bytes to pop on return + + // abiMap specifies how to translate from a C frame to a Go + // frame. This does not specify how to translate back because + // the result is always a uintptr. If the C ABI is fastcall, + // this assumes the four fastcall registers were first spilled + // to the shadow space. + abiMap []abiPart + // retOffset is the offset of the uintptr-sized result in the Go + // frame. + retOffset uintptr +} + +// abiPart encodes a step in translating between calling ABIs. +type abiPart struct { + src, dst uintptr + len uintptr +} + +func (a *abiPart) tryMerge(b abiPart) bool { + if a.src+a.len == b.src && a.dst+a.len == b.dst { + a.len += b.len + return true + } + return false } type winCallbackKey struct { @@ -55,7 +78,7 @@ func callbackasmAddr(i int) uintptr { return funcPC(callbackasm) + uintptr(i*entrySize) } -const callbackMaxArgs = 64 +const callbackMaxFrame = 64 * sys.PtrSize // compileCallback converts a Go function fn into a C function pointer // that can be passed to Windows APIs. @@ -75,22 +98,81 @@ func compileCallback(fn eface, cdecl bool) (code uintptr) { panic("compileCallback: expected function with one uintptr-sized result") } ft := (*functype)(unsafe.Pointer(fn._type)) + + // Check arguments and construct ABI translation. + var abiMap []abiPart + var src, dst uintptr + for _, t := range ft.in() { + if t.size > sys.PtrSize { + // We don't support this right now. In + // stdcall/cdecl, 64-bit ints and doubles are + // passed as two words (little endian); and + // structs are pushed on the stack. In + // fastcall, arguments larger than the word + // size are passed by reference. + panic("compileCallback: argument size is larger than uintptr") + } + if k := t.kind & kindMask; GOARCH == "amd64" && (k == kindFloat32 || k == kindFloat64) { + // In fastcall, floating-point arguments in + // the first four positions are passed in + // floating-point registers, which we don't + // currently spill. + panic("compileCallback: float arguments not supported") + } + + // The Go ABI aligns arguments. + dst = alignUp(dst, uintptr(t.align)) + // In the C ABI, we're already on a word boundary. + // Also, sub-word-sized fastcall register arguments + // are stored to the least-significant bytes of the + // argument word and all supported Windows + // architectures are little endian, so src is already + // pointing to the right place for smaller arguments. + + // Copy just the size of the argument. 
Note that this + // could be a small by-value struct, but C and Go + // struct layouts are compatible, so we can copy these + // directly, too. + part := abiPart{src, dst, t.size} + // Add this step to the adapter. + if len(abiMap) == 0 || !abiMap[len(abiMap)-1].tryMerge(part) { + abiMap = append(abiMap, part) + } + + // cdecl, stdcall, and fastcall pad arguments to word size. + src += sys.PtrSize + // The Go ABI packs arguments. + dst += t.size + } + // The Go ABI aligns the result to the word size. src is + // already aligned. + dst = alignUp(dst, sys.PtrSize) + retOffset := dst + if len(ft.out()) != 1 { panic("compileCallback: expected function with one uintptr-sized result") } - uintptrSize := unsafe.Sizeof(uintptr(0)) - if ft.out()[0].size != uintptrSize { + if ft.out()[0].size != sys.PtrSize { panic("compileCallback: expected function with one uintptr-sized result") } - if len(ft.in()) > callbackMaxArgs { - panic("compileCallback: too many function arguments") + if k := ft.out()[0].kind & kindMask; k == kindFloat32 || k == kindFloat64 { + // In cdecl and stdcall, float results are returned in + // ST(0). In fastcall, they're returned in XMM0. + // Either way, it's not AX. + panic("compileCallback: float results not supported") } - argsize := uintptr(0) - for _, t := range ft.in() { - if t.size > uintptrSize { - panic("compileCallback: argument size is larger than uintptr") - } - argsize += uintptrSize + // Make room for the uintptr-sized result. + dst += sys.PtrSize + + if dst > callbackMaxFrame { + panic("compileCallback: function argument frame too large") + } + + // For cdecl, the callee is responsible for popping its + // arguments from the C stack. + var retPop uintptr + if cdecl { + retPop = src } key := winCallbackKey{(*funcval)(fn.data), cdecl} @@ -112,7 +194,7 @@ func compileCallback(fn eface, cdecl bool) (code uintptr) { unlock(&cbs.lock) throw("too many callback functions") } - c := winCallback{key.fn, argsize, cdecl} + c := winCallback{key.fn, retPop, abiMap, retOffset} cbs.ctxt[n] = c cbs.index[key] = n cbs.n++ @@ -123,7 +205,7 @@ func compileCallback(fn eface, cdecl bool) (code uintptr) { type callbackArgs struct { index uintptr - args *uintptr // Arguments in stdcall/cdecl convention, with registers spilled + args unsafe.Pointer // Arguments in stdcall/cdecl convention, with registers spilled // Below are out-args from callbackWrap result uintptr retPop uintptr // For 386 cdecl, how many bytes to pop on return @@ -132,32 +214,21 @@ type callbackArgs struct { // callbackWrap is called by callbackasm to invoke a registered C callback. func callbackWrap(a *callbackArgs) { c := cbs.ctxt[a.index] - if GOARCH == "386" { - if c.cdecl { - // In cdecl, the callee is responsible for - // popping its arguments. - a.retPop = c.argsize - } else { - a.retPop = 0 - } - } + a.retPop = c.retPop - // Convert from stdcall to Go ABI. We assume the stack layout - // is the same, and we just need to make room for the result. - // - // TODO: This isn't a good assumption. For example, a function - // that takes two uint16 arguments will be laid out - // differently by the stdcall and Go ABIs. We should implement - // proper ABI conversion. - var frame [callbackMaxArgs + 1]uintptr - memmove(unsafe.Pointer(&frame), unsafe.Pointer(a.args), c.argsize) + // Convert from stdcall to Go ABI. 
+ var frame [callbackMaxFrame]byte + goArgs := unsafe.Pointer(&frame) + for _, part := range c.abiMap { + memmove(add(goArgs, part.dst), add(a.args, part.src), part.len) + } // Even though this is copying back results, we can pass a nil // type because those results must not require write barriers. - reflectcall(nil, unsafe.Pointer(c.fn), noescape(unsafe.Pointer(&frame)), sys.PtrSize+uint32(c.argsize), uint32(c.argsize)) + reflectcall(nil, unsafe.Pointer(c.fn), noescape(goArgs), uint32(c.retOffset)+sys.PtrSize, uint32(c.retOffset)) // Extract the result. - a.result = frame[c.argsize/sys.PtrSize] + a.result = *(*uintptr)(unsafe.Pointer(&frame[c.retOffset])) } const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go index cb942beb3e..7705d2a017 100644 --- a/src/runtime/syscall_windows_test.go +++ b/src/runtime/syscall_windows_test.go @@ -317,9 +317,13 @@ func (f cbFunc) cSrc(w io.Writer, cdecl bool) { cArgs := make([]string, t.NumIn()) for i := range cTypes { // We included stdint.h, so this works for all sized - // integer types. + // integer types, and uint8Pair_t. cTypes[i] = t.In(i).Name() + "_t" - cArgs[i] = fmt.Sprintf("%d", i+1) + if t.In(i).Name() == "uint8Pair" { + cArgs[i] = fmt.Sprintf("(uint8Pair_t){%d,1}", i) + } else { + cArgs[i] = fmt.Sprintf("%d", i+1) + } } fmt.Fprintf(w, ` typedef uintptr_t %s (*%s)(%s); @@ -341,6 +345,8 @@ func (f cbFunc) testOne(t *testing.T, dll *syscall.DLL, cdecl bool, cb uintptr) } } +type uint8Pair struct{ x, y uint8 } + var cbFuncs = []cbFunc{ {func(i1, i2 uintptr) uintptr { return i1 + i2 @@ -366,6 +372,23 @@ var cbFuncs = []cbFunc{ {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr { return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 }}, + + // Non-uintptr parameters. + {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint8) uintptr { + return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9) + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint16) uintptr { + return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9) + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 int8) uintptr { + return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9) + }}, + {func(i1 int8, i2 int16, i3 int32, i4, i5 uintptr) uintptr { + return uintptr(i1) + uintptr(i2) + uintptr(i3) + i4 + i5 + }}, + {func(i1, i2, i3, i4, i5 uint8Pair) uintptr { + return uintptr(i1.x + i1.y + i2.x + i2.y + i3.x + i3.y + i4.x + i4.y + i5.x + i5.y) + }}, } type cbDLL struct { @@ -380,7 +403,10 @@ func (d *cbDLL) makeSrc(t *testing.T, path string) { } defer f.Close() - fmt.Fprintf(f, "#include \n\n") + fmt.Fprint(f, ` +#include +typedef struct { uint8_t x, y; } uint8Pair_t; +`) for _, cbf := range cbFuncs { cbf.cSrc(f, false) cbf.cSrc(f, true) -- cgit v1.2.1 From 8cc280aa727bc7159adfdd083861472aa3066a35 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 20:13:49 +0000 Subject: runtime: define and enforce synchronization on heap_scan Currently heap_scan is mostly protected by the heap lock, but gcControllerState.revise sometimes accesses it without a lock. In an effort to make gcControllerState.revise callable from more contexts (and have its synchronization guarantees actually respected), make heap_scan atomically read from and written to, unless the world is stopped. Note that we don't update gcControllerState.revise's erroneous doc comment here because this change isn't about revise's guarantees, just about heap_scan. The comment is updated in a later change. 
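As a minimal sketch of the access policy described above (illustrative only: the type and method names below are stand-ins, not the runtime's), the pattern is an atomic add on the flush path and an atomic load on the reader path, with plain access reserved for stop-the-world windows:

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	type heapStats struct {
		// scan is read and written atomically, unless the world is stopped.
		scan uint64
	}

	// addScan is the concurrent writer path: fold in a signed delta with an
	// atomic add (two's-complement handles negative deltas).
	func (s *heapStats) addScan(delta int64) {
		atomic.AddUint64(&s.scan, uint64(delta))
	}

	// loadScan is the concurrent reader path, e.g. a pacer revision step.
	func (s *heapStats) loadScan() uint64 {
		return atomic.LoadUint64(&s.scan)
	}

	func main() {
		var s heapStats
		s.addScan(4096)
		fmt.Println(s.loadScan()) // 4096
	}

The change itself applies this to memstats.heap_scan with atomic.Xadd64 and atomic.Load64, as the diff that follows shows.
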
Change-Id: Iddbbeb954767c704c2bd1d221f36e6c4fc9948a6 Reviewed-on: https://go-review.googlesource.com/c/go/+/246960 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Emmanuel Odeke Reviewed-by: Michael Pratt --- src/runtime/mgc.go | 5 +++-- src/runtime/mheap.go | 4 ++-- src/runtime/mstats.go | 4 +++- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index c42c7fbd29..94539dd770 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -494,6 +494,7 @@ func (c *gcControllerState) revise() { gcpercent = 100000 } live := atomic.Load64(&memstats.heap_live) + scan := atomic.Load64(&memstats.heap_scan) // Assume we're under the soft goal. Pace GC to complete at // next_gc assuming the heap is in steady-state. @@ -508,7 +509,7 @@ func (c *gcControllerState) revise() { // // (This is a float calculation to avoid overflowing on // 100*heap_scan.) - scanWorkExpected := int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent)) + scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent)) if live > memstats.next_gc || c.scanWork > scanWorkExpected { // We're past the soft goal, or we've already done more scan @@ -518,7 +519,7 @@ func (c *gcControllerState) revise() { heapGoal = int64(float64(memstats.next_gc) * maxOvershoot) // Compute the upper bound on the scan work remaining. - scanWorkExpected = int64(memstats.heap_scan) + scanWorkExpected = int64(scan) } // Compute the remaining scan work estimate. diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 1a57bcd66e..124bbacd1d 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -1168,7 +1168,7 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS throw("mheap.allocSpan called with no P") } } - memstats.heap_scan += uint64(c.local_scan) + atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) c.local_scan = 0 memstats.tinyallocs += uint64(c.local_tinyallocs) c.local_tinyallocs = 0 @@ -1375,7 +1375,7 @@ func (h *mheap) freeSpan(s *mspan) { systemstack(func() { c := getg().m.p.ptr().mcache lock(&h.lock) - memstats.heap_scan += uint64(c.local_scan) + atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) c.local_scan = 0 memstats.tinyallocs += uint64(c.local_tinyallocs) c.local_tinyallocs = 0 diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index b95b332134..2c217ecf84 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -139,6 +139,8 @@ type mstats struct { // no-scan objects and no-scan tails of objects. // // Whenever this is updated, call gcController.revise(). + // + // Read and written atomically or with the world stopped. heap_scan uint64 // heap_marked is the number of bytes marked by the previous @@ -635,7 +637,7 @@ func flushallmcaches() { func purgecachedstats(c *mcache) { // Protected by either heap or GC lock. h := &mheap_ - memstats.heap_scan += uint64(c.local_scan) + atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) c.local_scan = 0 memstats.tinyallocs += uint64(c.local_tinyallocs) c.local_tinyallocs = 0 -- cgit v1.2.1 From 93d7d1685ee9e9f296e20f6c712796e54602e891 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 20:17:40 +0000 Subject: runtime: load gcControllerState.scanWork atomically in revise gcControllerState.scanWork's docs state that it must be accessed atomically during a GC cycle, but gcControllerState.revise does not do this (even when called with the heap lock held). 
This change makes it so that gcControllerState.revise accesses scanWork atomically and explicitly. Note that we don't update gcControllerState.revise's erroneous doc comment here because this change isn't about revise's guarantees, just about heap_scan. The comment is updated in a later change. Change-Id: Iafc3ad214e517190bfd8a219896d23da19f7659d Reviewed-on: https://go-review.googlesource.com/c/go/+/246961 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/mgc.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 94539dd770..4b9a6da3b3 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -495,6 +495,7 @@ func (c *gcControllerState) revise() { } live := atomic.Load64(&memstats.heap_live) scan := atomic.Load64(&memstats.heap_scan) + work := atomic.Loadint64(&c.scanWork) // Assume we're under the soft goal. Pace GC to complete at // next_gc assuming the heap is in steady-state. @@ -511,7 +512,7 @@ func (c *gcControllerState) revise() { // 100*heap_scan.) scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent)) - if live > memstats.next_gc || c.scanWork > scanWorkExpected { + if live > memstats.next_gc || work > scanWorkExpected { // We're past the soft goal, or we've already done more scan // work than we expected. Pace GC so that in the worst case it // will complete by the hard goal. @@ -529,7 +530,7 @@ func (c *gcControllerState) revise() { // (scanWork), so allocation will change this difference // slowly in the soft regime and not at all in the hard // regime. - scanWorkRemaining := scanWorkExpected - c.scanWork + scanWorkRemaining := scanWorkExpected - work if scanWorkRemaining < 1000 { // We set a somewhat arbitrary lower bound on // remaining scan work since if we aim a little high, -- cgit v1.2.1 From f5c6875f3228951afa1fcf2ec01c614e0fb7e2dd Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 20:24:56 +0000 Subject: runtime: make next_gc atomically accessed next_gc is mostly updated only during a STW, but may occasionally be updated by calls to e.g. debug.SetGCPercent. In this case the update is supposed to be protected by the heap lock, but in reality it's accessed by gcController.revise which may be called without the heap lock held (despite its documentation, which will be updated in a later change). Change the synchronization policy on next_gc so that it's atomically accessed when the world is not stopped to aid in making revise safe for concurrent use. Change-Id: I79657a72f91563f3241aaeda66e8a7757d399529 Reviewed-on: https://go-review.googlesource.com/c/go/+/246962 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/mgc.go | 13 +++++++------ src/runtime/mgcscavenge.go | 2 +- src/runtime/mstats.go | 10 ++++++++-- src/runtime/trace.go | 5 +++-- 4 files changed, 19 insertions(+), 11 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 4b9a6da3b3..5c565a5853 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -409,7 +409,8 @@ type gcControllerState struct { } // startCycle resets the GC controller's state and computes estimates -// for a new GC cycle. The caller must hold worldsema. +// for a new GC cycle. The caller must hold worldsema and the world +// must be stopped. 
func (c *gcControllerState) startCycle() { c.scanWork = 0 c.bgScanCredit = 0 @@ -499,7 +500,7 @@ func (c *gcControllerState) revise() { // Assume we're under the soft goal. Pace GC to complete at // next_gc assuming the heap is in steady-state. - heapGoal := int64(memstats.next_gc) + heapGoal := int64(atomic.Load64(&memstats.next_gc)) // Compute the expected scan work remaining. // @@ -512,12 +513,12 @@ func (c *gcControllerState) revise() { // 100*heap_scan.) scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent)) - if live > memstats.next_gc || work > scanWorkExpected { + if int64(live) > heapGoal || work > scanWorkExpected { // We're past the soft goal, or we've already done more scan // work than we expected. Pace GC so that in the worst case it // will complete by the hard goal. const maxOvershoot = 1.1 - heapGoal = int64(float64(memstats.next_gc) * maxOvershoot) + heapGoal = int64(float64(heapGoal) * maxOvershoot) // Compute the upper bound on the scan work remaining. scanWorkExpected = int64(scan) @@ -846,7 +847,7 @@ func gcSetTriggerRatio(triggerRatio float64) { // Commit to the trigger and goal. memstats.gc_trigger = trigger - memstats.next_gc = goal + atomic.Store64(&memstats.next_gc, goal) if trace.enabled { traceNextGC() } @@ -903,7 +904,7 @@ func gcSetTriggerRatio(triggerRatio float64) { // // mheap_.lock must be held or the world must be stopped. func gcEffectiveGrowthRatio() float64 { - egogc := float64(memstats.next_gc-memstats.heap_marked) / float64(memstats.heap_marked) + egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked) if egogc < 0 { // Shouldn't happen, but just in case. egogc = 0 diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go index 34646828e5..6328b295ca 100644 --- a/src/runtime/mgcscavenge.go +++ b/src/runtime/mgcscavenge.go @@ -123,7 +123,7 @@ func gcPaceScavenger() { return } // Compute our scavenging goal. - goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc) + goalRatio := float64(atomic.Load64(&memstats.next_gc)) / float64(memstats.last_next_gc) retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio) // Add retainExtraPercent overhead to retainedGoal. This calculation // looks strange but the purpose is to arrive at an integer division diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 2c217ecf84..8cc20552fb 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -57,9 +57,15 @@ type mstats struct { gc_sys uint64 // updated atomically or during STW other_sys uint64 // updated atomically or during STW - // Statistics about garbage collector. + // Statistics about the garbage collector. + + // next_gc is the goal heap_live for when next GC ends. + // Set to ^uint64(0) if disabled. + // + // Read and written atomically, unless the world is stopped. + next_gc uint64 + // Protected by mheap or stopping the world during GC. 
- next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled last_gc_unix uint64 // last gc (in unix time) pause_total_ns uint64 pause_ns [256]uint64 // circular buffer of recent gc pause lengths diff --git a/src/runtime/trace.go b/src/runtime/trace.go index 169b650eb4..d3ecd148be 100644 --- a/src/runtime/trace.go +++ b/src/runtime/trace.go @@ -13,6 +13,7 @@ package runtime import ( + "runtime/internal/atomic" "runtime/internal/sys" "unsafe" ) @@ -1146,11 +1147,11 @@ func traceHeapAlloc() { } func traceNextGC() { - if memstats.next_gc == ^uint64(0) { + if nextGC := atomic.Load64(&memstats.next_gc); nextGC == ^uint64(0) { // Heap-based triggering is disabled. traceEvent(traceEvNextGC, -1, 0) } else { - traceEvent(traceEvNextGC, -1, memstats.next_gc) + traceEvent(traceEvNextGC, -1, nextGC) } } -- cgit v1.2.1 From ce46f197b6c75281b77ee93338e2559671e28b01 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 20:48:06 +0000 Subject: runtime: access the assist ratio atomically This change makes it so that the GC assist ratio (the pair of gcControllerState fields assistBytesPerWork and assistWorkPerByte) is updated atomically. Note that the pair of fields are not updated together atomically, but that's OK. The code here was already racy for some time and in practice the assist ratio moves very slowly. The purpose of this change is so that we can document gcController.revise to be safe for concurrent use, which will be useful in further changes. Change-Id: Ie25d630207c88e4f85f2b8953f6a0051ebf1b4ea Reviewed-on: https://go-review.googlesource.com/c/go/+/246963 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/mgc.go | 51 ++++++++++++++++++++++++++++++++++++++++++-------- src/runtime/mgcmark.go | 17 +++++++++++------ src/runtime/proc.go | 3 ++- 3 files changed, 56 insertions(+), 15 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 5c565a5853..c54f893689 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -388,10 +388,24 @@ type gcControllerState struct { // bytes that should be performed by mutator assists. This is // computed at the beginning of each cycle and updated every // time heap_scan is updated. - assistWorkPerByte float64 + // + // Stored as a uint64, but it's actually a float64. Use + // float64frombits to get the value. + // + // Read and written atomically. + assistWorkPerByte uint64 // assistBytesPerWork is 1/assistWorkPerByte. - assistBytesPerWork float64 + // + // Stored as a uint64, but it's actually a float64. Use + // float64frombits to get the value. + // + // Read and written atomically. + // + // Note that because this is read and written independently + // from assistWorkPerByte users may notice a skew between + // the two values, and such a state should be safe. 
+ assistBytesPerWork uint64 // fractionalUtilizationGoal is the fraction of wall clock // time that should be spent in the fractional mark worker on @@ -470,7 +484,8 @@ func (c *gcControllerState) startCycle() { c.revise() if debug.gcpacertrace > 0 { - print("pacer: assist ratio=", c.assistWorkPerByte, + assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte)) + print("pacer: assist ratio=", assistRatio, " (scan ", memstats.heap_scan>>20, " MB in ", work.initialHeapLive>>20, "->", memstats.next_gc>>20, " MB)", @@ -480,9 +495,22 @@ func (c *gcControllerState) startCycle() { } // revise updates the assist ratio during the GC cycle to account for -// improved estimates. This should be called either under STW or -// whenever memstats.heap_scan, memstats.heap_live, or -// memstats.next_gc is updated (with mheap_.lock held). +// improved estimates. This should be called whenever memstats.heap_scan, +// memstats.heap_live, or memstats.next_gc is updated. It is safe to +// call concurrently, but it may race with other calls to revise. +// +// The result of this race is that the two assist ratio values may not line +// up or may be stale. In practice this is OK because the assist ratio +// moves slowly throughout a GC cycle, and the assist ratio is a best-effort +// heuristic anyway. Furthermore, no part of the heuristic depends on +// the two assist ratio values being exact reciprocals of one another, since +// the two values are used to convert values from different sources. +// +// The worst case result of this raciness is that we may miss a larger shift +// in the ratio (say, if we decide to pace more aggressively against the +// hard heap goal) but even this "hard goal" is best-effort (see #40460). +// The dedicated GC should ensure we don't exceed the hard goal by too much +// in the rare case we do exceed it. // // It should only be called when gcBlackenEnabled != 0 (because this // is when assists are enabled and the necessary statistics are @@ -555,8 +583,15 @@ func (c *gcControllerState) revise() { // Compute the mutator assist ratio so by the time the mutator // allocates the remaining heap bytes up to next_gc, it will // have done (or stolen) the remaining amount of scan work. - c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining) - c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining) + // Note that the assist ratio values are updated atomically + // but not together. This means there may be some degree of + // skew between the two values. This is generally OK as the + // values shift relatively slowly over the course of a GC + // cycle. + assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining) + assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining) + atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte)) + atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork)) } // endCycle computes the trigger ratio for the next cycle. diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 79df59d6d6..c71c0e58d3 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -400,11 +400,13 @@ retry: // balance positive. When the required amount of work is low, // we over-assist to build up credit for future allocations // and amortize the cost of assisting. 
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte)) + assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork)) debtBytes := -gp.gcAssistBytes - scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes)) + scanWork := int64(assistWorkPerByte * float64(debtBytes)) if scanWork < gcOverAssistWork { scanWork = gcOverAssistWork - debtBytes = int64(gcController.assistBytesPerWork * float64(scanWork)) + debtBytes = int64(assistBytesPerWork * float64(scanWork)) } // Steal as much credit as we can from the background GC's @@ -418,7 +420,7 @@ retry: if bgScanCredit > 0 { if bgScanCredit < scanWork { stolen = bgScanCredit - gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen)) + gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(stolen)) } else { stolen = scanWork gp.gcAssistBytes += debtBytes @@ -543,7 +545,8 @@ func gcAssistAlloc1(gp *g, scanWork int64) { // this scan work counts for. The "1+" is a poor man's // round-up, to ensure this adds credit even if // assistBytesPerWork is very low. - gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone)) + assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork)) + gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(workDone)) // If this is the last worker and we ran out of work, // signal a completion point. @@ -637,7 +640,8 @@ func gcFlushBgCredit(scanWork int64) { return } - scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork) + assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork)) + scanBytes := int64(float64(scanWork) * assistBytesPerWork) lock(&work.assistQueue.lock) for !work.assistQueue.q.empty() && scanBytes > 0 { @@ -670,7 +674,8 @@ func gcFlushBgCredit(scanWork int64) { if scanBytes > 0 { // Convert from scan bytes back to work. - scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte) + assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte)) + scanWork = int64(float64(scanBytes) * assistWorkPerByte) atomic.Xaddint64(&gcController.bgScanCredit, scanWork) } unlock(&work.assistQueue.lock) diff --git a/src/runtime/proc.go b/src/runtime/proc.go index ec4e6d8751..ebecc92745 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -3208,7 +3208,8 @@ func goexit0(gp *g) { // Flush assist credit to the global pool. This gives // better information to pacing if the application is // rapidly creating an exiting goroutines. - scanCredit := int64(gcController.assistWorkPerByte * float64(gp.gcAssistBytes)) + assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte)) + scanCredit := int64(assistWorkPerByte * float64(gp.gcAssistBytes)) atomic.Xaddint64(&gcController.bgScanCredit, scanCredit) gp.gcAssistBytes = 0 } -- cgit v1.2.1 From 42019613df2d9b6ad39e8ccf80861e75666025a0 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 21:02:05 +0000 Subject: runtime: make distributed/local malloc stats the source-of-truth This change makes it so that various local malloc stats (excluding heap_scan and local_tinyallocs) are no longer written first to mheap fields but are instead accessed directly from each mcache. This change is part of a move toward having stats be distributed, and cleaning up some old code related to the stats. Note that because there's no central source-of-truth, when an mcache dies, it must donate its stats to another mcache. 
It's always safe to donate to the mcache for the 0th P, so do that. Change-Id: I2556093dbc27357cb9621c9b97671f3c00aa1173 Reviewed-on: https://go-review.googlesource.com/c/go/+/246964 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 26 ++++++++++++++++++-------- src/runtime/mcache.go | 31 +++++++++++++++++++++++++++++-- src/runtime/mheap.go | 7 ++----- src/runtime/mstats.go | 41 +++++++++++++++++++---------------------- src/runtime/proc.go | 2 +- 5 files changed, 69 insertions(+), 38 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index e65b7b8ea7..d5a90ca65b 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -339,18 +339,28 @@ func ReadMemStatsSlow() (base, slow MemStats) { // Add in frees. readmemstats_m flushed the cached stats, so // these are up-to-date. - var smallFree uint64 - slow.Frees = mheap_.nlargefree - for i := range mheap_.nsmallfree { - slow.Frees += mheap_.nsmallfree[i] - bySize[i].Frees = mheap_.nsmallfree[i] - bySize[i].Mallocs += mheap_.nsmallfree[i] - smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i]) + var largeFree, smallFree uint64 + for _, p := range allp { + c := p.mcache + if c == nil { + continue + } + // Collect large allocation stats. + largeFree += uint64(c.local_largefree) + slow.Frees += uint64(c.local_nlargefree) + + // Collect per-sizeclass stats. + for i := 0; i < _NumSizeClasses; i++ { + slow.Frees += uint64(c.local_nsmallfree[i]) + bySize[i].Frees += uint64(c.local_nsmallfree[i]) + bySize[i].Mallocs += uint64(c.local_nsmallfree[i]) + smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i]) + } } slow.Frees += memstats.tinyallocs slow.Mallocs += slow.Frees - slow.TotalAlloc = slow.Alloc + mheap_.largefree + smallFree + slow.TotalAlloc = slow.Alloc + largeFree + smallFree for i := range slow.BySize { slow.BySize[i].Mallocs = bySize[i].Mallocs diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 7a7d33ccae..5baa7b3da8 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -41,7 +41,13 @@ type mcache struct { stackcache [_NumStackOrders]stackfreelist - // Local allocator stats, flushed during GC. + // Allocator stats (source-of-truth). + // Only the P that owns this mcache may write to these + // variables, so it's safe for that P to read non-atomically. + // + // When read with stats from other mcaches and with the world + // stopped, the result will accurately reflect the state of the + // application. local_largefree uintptr // bytes freed for large objects (>maxsmallsize) local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) @@ -97,7 +103,13 @@ func allocmcache() *mcache { return c } -func freemcache(c *mcache) { +// freemcache releases resources associated with this +// mcache and puts the object onto a free list. +// +// In some cases there is no way to simply release +// resources, such as statistics, so donate them to +// a different mcache (the recipient). +func freemcache(c *mcache, recipient *mcache) { systemstack(func() { c.releaseAll() stackcache_clear(c) @@ -109,11 +121,26 @@ func freemcache(c *mcache) { lock(&mheap_.lock) purgecachedstats(c) + // Donate anything else that's left. 
+ c.donate(recipient) mheap_.cachealloc.free(unsafe.Pointer(c)) unlock(&mheap_.lock) }) } +// donate flushes data and resources which have no global +// pool to another mcache. +func (c *mcache) donate(d *mcache) { + d.local_largefree += c.local_largefree + c.local_largefree = 0 + d.local_nlargefree += c.local_nlargefree + c.local_nlargefree = 0 + for i := range c.local_nsmallfree { + d.local_nsmallfree[i] += c.local_nsmallfree[i] + c.local_nsmallfree[i] = 0 + } +} + // refill acquires a new span of span class spc for c. This span will // have at least one free object. The current span in c must be full. // diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 124bbacd1d..1b41b204ab 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -129,11 +129,8 @@ type mheap struct { reclaimCredit uintptr // Malloc stats. - largealloc uint64 // bytes allocated for large objects - nlargealloc uint64 // number of large object allocations - largefree uint64 // bytes freed for large objects (>maxsmallsize) - nlargefree uint64 // number of frees for large objects (>maxsmallsize) - nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize) + largealloc uint64 // bytes allocated for large objects + nlargealloc uint64 // number of large object allocations // arenas is the heap arena map. It points to the metadata for // the heap for every arena frame of the entire usable virtual diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 8cc20552fb..d81d2ebe81 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -571,21 +571,27 @@ func updatememstats() { memstats.by_size[i].nmalloc += c.nmalloc totalAlloc += c.nmalloc * uint64(class_to_size[i]) } - // Collect per-sizeclass stats. - for i := 0; i < _NumSizeClasses; i++ { - if i == 0 { - memstats.nmalloc += mheap_.nlargealloc - totalAlloc += mheap_.largealloc - totalFree += mheap_.largefree - memstats.nfree += mheap_.nlargefree + + for _, p := range allp { + c := p.mcache + if c == nil { continue } - - // The mcache stats have been flushed to mheap_. - memstats.nfree += mheap_.nsmallfree[i] - memstats.by_size[i].nfree = mheap_.nsmallfree[i] - smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i]) + // Collect large allocation stats. + totalFree += uint64(c.local_largefree) + memstats.nfree += uint64(c.local_nlargefree) + + // Collect per-sizeclass stats. + for i := 0; i < _NumSizeClasses; i++ { + memstats.nfree += uint64(c.local_nsmallfree[i]) + memstats.by_size[i].nfree += uint64(c.local_nsmallfree[i]) + smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i]) + } } + // Collect remaining large allocation stats. + memstats.nmalloc += mheap_.nlargealloc + totalAlloc += mheap_.largealloc + totalFree += smallFree memstats.nfree += memstats.tinyallocs @@ -641,20 +647,11 @@ func flushallmcaches() { //go:nosplit func purgecachedstats(c *mcache) { - // Protected by either heap or GC lock. - h := &mheap_ + // Protected by heap lock. atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) c.local_scan = 0 memstats.tinyallocs += uint64(c.local_tinyallocs) c.local_tinyallocs = 0 - h.largefree += uint64(c.local_largefree) - c.local_largefree = 0 - h.nlargefree += uint64(c.local_nlargefree) - c.local_nlargefree = 0 - for i := 0; i < len(c.local_nsmallfree); i++ { - h.nsmallfree[i] += uint64(c.local_nsmallfree[i]) - c.local_nsmallfree[i] = 0 - } } // Atomically increases a given *system* memory stat. 
We are counting on this diff --git a/src/runtime/proc.go b/src/runtime/proc.go index ebecc92745..4f4cff38aa 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -4550,7 +4550,7 @@ func (pp *p) destroy() { pp.mspancache.len = 0 pp.pcache.flush(&mheap_.pages) }) - freemcache(pp.mcache) + freemcache(pp.mcache, allp[0].mcache) pp.mcache = nil gfpurge(pp) traceProcFree(pp) -- cgit v1.2.1 From e63716bc76d3264f669843434bc365a78f2141d2 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 21:10:29 +0000 Subject: runtime: make nlargealloc and largealloc mcache fields This change makes nlargealloc and largealloc into mcache fields just like nlargefree and largefree. These local fields become the new source-of-truth. This change also moves the accounting for these fields out of allocSpan (which is an inappropriate place for it -- this accounting generally happens much closer to the point of allocation) and into largeAlloc. This move is partially possible now that we can call gcController.revise at that point. Furthermore, this change moves largeAlloc into mcache.go and makes it a method of mcache. While there's a little bit of a mismatch here because largeAlloc barely interacts with the mcache, it helps solidify the mcache as the first allocation layer and provides a clear place to aggregate and manage statistics. Change-Id: I37b5e648710733bb4c04430b71e96700e438587a Reviewed-on: https://go-review.googlesource.com/c/go/+/246965 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/malloc.go | 33 +------------------------------ src/runtime/mcache.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++++--- src/runtime/mheap.go | 18 +---------------- src/runtime/mstats.go | 4 ++-- 4 files changed, 55 insertions(+), 54 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index b19d1f2671..ec601ccb39 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1082,9 +1082,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } } else { shouldhelpgc = true - systemstack(func() { - span = largeAlloc(size, needzero, noscan) - }) + span = c.largeAlloc(size, needzero, noscan) span.freeindex = 1 span.allocCount = 1 x = unsafe.Pointer(span.base()) @@ -1179,35 +1177,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { return x } -func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { - // print("largeAlloc size=", size, "\n") - - if size+_PageSize < size { - throw("out of memory") - } - npages := size >> _PageShift - if size&_PageMask != 0 { - npages++ - } - - // Deduct credit for this span allocation and sweep if - // necessary. mHeap_Alloc will also sweep npages, so this only - // pays the debt down to npage pages. - deductSweepCredit(npages*_PageSize, npages) - - spc := makeSpanClass(0, noscan) - s := mheap_.alloc(npages, spc, needzero) - if s == nil { - throw("out of memory") - } - // Put the large span in the mcentral swept list so that it's - // visible to the background sweeper. 
- mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) - s.limit = s.base() + size - heapBitsForAddr(s.base()).initSpan(s) - return s -} - // implementation of new builtin // compiler (both frontend and SSA backend) knows the signature // of this function diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 5baa7b3da8..3657c0b86a 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -10,6 +10,7 @@ import ( ) // Per-thread (in Go, per-P) cache for small objects. +// This includes a small object cache and local allocation stats. // No locking needed because it is per-thread (per-P). // // mcaches are allocated from non-GC'd memory, so any heap pointers @@ -48,9 +49,11 @@ type mcache struct { // When read with stats from other mcaches and with the world // stopped, the result will accurately reflect the state of the // application. - local_largefree uintptr // bytes freed for large objects (>maxsmallsize) - local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) - local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) + local_largealloc uintptr // bytes allocated for large objects + local_nlargealloc uintptr // number of large object allocations + local_largefree uintptr // bytes freed for large objects (>maxsmallsize) + local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) + local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) // flushGen indicates the sweepgen during which this mcache // was last flushed. If flushGen != mheap_.sweepgen, the spans @@ -131,6 +134,10 @@ func freemcache(c *mcache, recipient *mcache) { // donate flushes data and resources which have no global // pool to another mcache. func (c *mcache) donate(d *mcache) { + d.local_largealloc += c.local_largealloc + c.local_largealloc = 0 + d.local_nlargealloc += c.local_nlargealloc + c.local_nlargealloc = 0 d.local_largefree += c.local_largefree c.local_largefree = 0 d.local_nlargefree += c.local_nlargefree @@ -178,6 +185,47 @@ func (c *mcache) refill(spc spanClass) { c.alloc[spc] = s } +// largeAlloc allocates a span for a large object. +func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { + if size+_PageSize < size { + throw("out of memory") + } + npages := size >> _PageShift + if size&_PageMask != 0 { + npages++ + } + + // Deduct credit for this span allocation and sweep if + // necessary. mHeap_Alloc will also sweep npages, so this only + // pays the debt down to npage pages. + deductSweepCredit(npages*_PageSize, npages) + + spc := makeSpanClass(0, noscan) + s := mheap_.alloc(npages, spc, needzero) + if s == nil { + throw("out of memory") + } + c.local_largealloc += npages * pageSize + c.local_nlargealloc++ + + // Update heap_live and revise pacing if needed. + atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize)) + if trace.enabled { + // Trace that a heap alloc occurred because heap_live changed. + traceHeapAlloc() + } + if gcBlackenEnabled != 0 { + gcController.revise() + } + + // Put the large span in the mcentral swept list so that it's + // visible to the background sweeper. 
+ mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) + s.limit = s.base() + size + heapBitsForAddr(s.base()).initSpan(s) + return s +} + func (c *mcache) releaseAll() { for i := range c.alloc { s := c.alloc[i] diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 1b41b204ab..5635dc6784 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -128,10 +128,6 @@ type mheap struct { // This is accessed atomically. reclaimCredit uintptr - // Malloc stats. - largealloc uint64 // bytes allocated for large objects - nlargealloc uint64 // number of large object allocations - // arenas is the heap arena map. It points to the metadata for // the heap for every arena frame of the entire usable virtual // address space. @@ -1170,14 +1166,7 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS memstats.tinyallocs += uint64(c.local_tinyallocs) c.local_tinyallocs = 0 - // Do some additional accounting if it's a large allocation. - if spanclass.sizeclass() == 0 { - mheap_.largealloc += uint64(npages * pageSize) - mheap_.nlargealloc++ - atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize)) - } - - // Either heap_live or heap_scan could have been updated. + // heap_scan was been updated. if gcBlackenEnabled != 0 { gcController.revise() } @@ -1277,11 +1266,6 @@ HaveSpan: // Update related page sweeper stats. atomic.Xadd64(&h.pagesInUse, int64(npages)) - - if trace.enabled { - // Trace that a heap alloc occurred. - traceHeapAlloc() - } } // Make sure the newly allocated span will be observed diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index d81d2ebe81..d9acb361d5 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -578,6 +578,8 @@ func updatememstats() { continue } // Collect large allocation stats. + memstats.nmalloc += uint64(c.local_nlargealloc) + totalAlloc += uint64(c.local_largealloc) totalFree += uint64(c.local_largefree) memstats.nfree += uint64(c.local_nlargefree) @@ -589,8 +591,6 @@ func updatememstats() { } } // Collect remaining large allocation stats. - memstats.nmalloc += mheap_.nlargealloc - totalAlloc += mheap_.largealloc totalFree += smallFree -- cgit v1.2.1 From a5088e76f108f6470d2a9b3ac56a58ddb9376e4f Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 22:07:44 +0000 Subject: runtime: remove mcentral.nmalloc and add mcache.local_nsmallalloc This change removes mcentral.nmalloc and adds mcache.local_nsmallalloc which fulfills the same role but may be accessed non-atomically. It also moves responsibility for updating heap_live and local_nsmallalloc into mcache functions. As a result of this change, mcache is now the sole source-of-truth for malloc stats. It is also solely responsible for updating heap_live and performing the various operations required as a result of updating heap_live. The overall improvement here is in code organization: previously malloc stats were fairly scattered, and now they have one single home, and nearly all the required manipulations exist in a single file. 
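To make the pattern concrete, here is a minimal, self-contained sketch -- invented names, not the runtime's actual code -- of what "distributed stats as the source of truth" means: each P owns plain counters that only it writes on the allocation path, and a reader sums them while the world is stopped (assumed below) instead of maintaining a central, atomically updated total.

	package main

	import "fmt"

	const numSizeClasses = 68 // stand-in for the runtime's _NumSizeClasses; value illustrative

	// pCache stands in for an mcache: counters written only by the owning P,
	// so no atomics or locks are needed on the allocation fast path.
	type pCache struct {
		smallAllocCount [numSizeClasses]uintptr
	}

	// readStats stands in for a stats reader: with the world stopped,
	// plain reads of every P's counters yield a consistent total.
	func readStats(allP []*pCache) (nmalloc uint64) {
		for _, c := range allP {
			for _, n := range c.smallAllocCount {
				nmalloc += uint64(n)
			}
		}
		return nmalloc
	}

	func main() {
		p0, p1 := &pCache{}, &pCache{}
		p0.smallAllocCount[5] = 3
		p1.smallAllocCount[5] = 4
		fmt.Println(readStats([]*pCache{p0, p1})) // prints 7
	}

The sketch only shows the ownership/aggregation idea; it deliberately omits the heap_live and pacing interactions handled in the real change.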
Change-Id: I7e93fa297c1debf17e3f2a0d68aeed28a9c6af00 Reviewed-on: https://go-review.googlesource.com/c/go/+/246966 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/mcache.go | 34 ++++++++++++++++++++++++++++++++++ src/runtime/mcentral.go | 41 +---------------------------------------- src/runtime/mstats.go | 18 ++++++------------ 3 files changed, 41 insertions(+), 52 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 3657c0b86a..4d2ba6dff0 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -51,6 +51,7 @@ type mcache struct { // application. local_largealloc uintptr // bytes allocated for large objects local_nlargealloc uintptr // number of large object allocations + local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects local_largefree uintptr // bytes freed for large objects (>maxsmallsize) local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) @@ -138,6 +139,10 @@ func (c *mcache) donate(d *mcache) { c.local_largealloc = 0 d.local_nlargealloc += c.local_nlargealloc c.local_nlargealloc = 0 + for i := range c.local_nsmallalloc { + d.local_nsmallalloc[i] += c.local_nsmallalloc[i] + c.local_nsmallalloc[i] = 0 + } d.local_largefree += c.local_largefree c.local_largefree = 0 d.local_nlargefree += c.local_nlargefree @@ -182,6 +187,20 @@ func (c *mcache) refill(spc spanClass) { // sweeping in the next sweep phase. s.sweepgen = mheap_.sweepgen + 3 + // Assume all objects from this span will be allocated in the + // mcache. If it gets uncached, we'll adjust this. + c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount) + usedBytes := uintptr(s.allocCount) * s.elemsize + atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes)) + if trace.enabled { + // heap_live changed. + traceHeapAlloc() + } + if gcBlackenEnabled != 0 { + // heap_live changed. + gcController.revise() + } + c.alloc[spc] = s } @@ -227,9 +246,24 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { } func (c *mcache) releaseAll() { + sg := mheap_.sweepgen for i := range c.alloc { s := c.alloc[i] if s != &emptymspan { + // Adjust nsmallalloc in case the span wasn't fully allocated. + n := uintptr(s.nelems) - uintptr(s.allocCount) + c.local_nsmallalloc[spanClass(i).sizeclass()] -= n + if s.sweepgen != sg+1 { + // refill conservatively counted unallocated slots in heap_live. + // Undo this. + // + // If this span was cached before sweep, then + // heap_live was totally recomputed since + // caching this span, so we don't do this for + // stale spans. + atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize)) + } + // Release the span to the mcentral. mheap_.central[i].mcentral.uncacheSpan(s) c.alloc[i] = &emptymspan } diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index ed49e01677..97fe92c2ab 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -44,11 +44,6 @@ type mcentral struct { // encounter swept spans, and these should be ignored. partial [2]spanSet // list of spans with a free object full [2]spanSet // list of spans with no free objects - - // nmalloc is the cumulative count of objects allocated from - // this mcentral, assuming all spans in mcaches are - // fully-allocated. Written atomically, read under STW. 
- nmalloc uint64 } // Initialize a single central free list. @@ -178,19 +173,6 @@ havespan: if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems { throw("span has no free objects") } - // Assume all objects from this span will be allocated in the - // mcache. If it gets uncached, we'll adjust this. - atomic.Xadd64(&c.nmalloc, int64(n)) - usedBytes := uintptr(s.allocCount) * s.elemsize - atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes)) - if trace.enabled { - // heap_live changed. - traceHeapAlloc() - } - if gcBlackenEnabled != 0 { - // heap_live changed. - gcController.revise() - } freeByteBase := s.freeindex &^ (64 - 1) whichByte := freeByteBase / 8 // Init alloc bits cache. @@ -228,27 +210,6 @@ func (c *mcentral) uncacheSpan(s *mspan) { // Indicate that s is no longer cached. atomic.Store(&s.sweepgen, sg) } - n := int(s.nelems) - int(s.allocCount) - - // Fix up statistics. - if n > 0 { - // cacheSpan updated alloc assuming all objects on s - // were going to be allocated. Adjust for any that - // weren't. We must do this before potentially - // sweeping the span. - atomic.Xadd64(&c.nmalloc, -int64(n)) - - if !stale { - // (*mcentral).cacheSpan conservatively counted - // unallocated slots in heap_live. Undo this. - // - // If this span was cached before sweep, then - // heap_live was totally recomputed since - // caching this span, so we don't do this for - // stale spans. - atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize)) - } - } // Put the span in the appropriate place. if stale { @@ -256,7 +217,7 @@ func (c *mcentral) uncacheSpan(s *mspan) { // the right list. s.sweep(false) } else { - if n > 0 { + if int(s.nelems)-int(s.allocCount) > 0 { // Put it back on the partial swept list. c.partialSwept(sg).push(s) } else { diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index d9acb361d5..44cf17c85b 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -561,17 +561,6 @@ func updatememstats() { // Collect allocation stats. This is safe and consistent // because the world is stopped. var smallFree, totalAlloc, totalFree uint64 - // Collect per-spanclass stats. - for spc := range mheap_.central { - // The mcaches are now empty, so mcentral stats are - // up-to-date. - c := &mheap_.central[spc].mcentral - memstats.nmalloc += c.nmalloc - i := spanClass(spc).sizeclass() - memstats.by_size[i].nmalloc += c.nmalloc - totalAlloc += c.nmalloc * uint64(class_to_size[i]) - } - for _, p := range allp { c := p.mcache if c == nil { @@ -585,12 +574,17 @@ func updatememstats() { // Collect per-sizeclass stats. for i := 0; i < _NumSizeClasses; i++ { + // Malloc stats. + memstats.nmalloc += uint64(c.local_nsmallalloc[i]) + memstats.by_size[i].nmalloc += uint64(c.local_nsmallalloc[i]) + totalAlloc += uint64(c.local_nsmallalloc[i]) * uint64(class_to_size[i]) + + // Free stats. memstats.nfree += uint64(c.local_nsmallfree[i]) memstats.by_size[i].nfree += uint64(c.local_nsmallfree[i]) smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i]) } } - // Collect remaining large allocation stats. totalFree += smallFree -- cgit v1.2.1 From cca3d1e5533cb40beb9ef55bbc332b733adcc6ba Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 22:16:46 +0000 Subject: runtime: don't flush local_tinyallocs This change makes local_tinyallocs work like the rest of the malloc stats and doesn't flush local_tinyallocs, instead making that the source-of-truth. 
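For illustration only (hypothetical names, not the runtime's code), the difference between flushing a local counter into a global statistic and letting the local counter itself be the source of truth looks roughly like this:

	package main

	import "fmt"

	type cache struct {
		tinyAllocCount uintptr // written only by the owning P
	}

	// Old style: periodically fold the local count into a global and reset it,
	// which forces flush points and makes the global the source of truth.
	func flushOldStyle(global *uint64, c *cache) {
		*global += uint64(c.tinyAllocCount)
		c.tinyAllocCount = 0
	}

	// New style: never flush; a reader (with the world stopped, as assumed
	// here) derives the total directly from the per-P counters.
	func readTinyAllocs(caches []*cache) (total uint64) {
		for _, c := range caches {
			total += uint64(c.tinyAllocCount)
		}
		return total
	}

	func main() {
		cs := []*cache{{tinyAllocCount: 2}, {tinyAllocCount: 5}}
		fmt.Println(readTinyAllocs(cs)) // prints 7

		var global uint64
		flushOldStyle(&global, cs[0])
		fmt.Println(global, cs[0].tinyAllocCount) // prints 2 0
	}

Dropping the flush removes the need to zero the counter at arbitrary points, which is exactly why the commit can stop touching it outside the owning P.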
Change-Id: I3e6cb5f1b3d086e432ce7d456895511a48e3617a Reviewed-on: https://go-review.googlesource.com/c/go/+/246967 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 7 +++++-- src/runtime/mcache.go | 8 +++++--- src/runtime/mheap.go | 4 ---- src/runtime/mstats.go | 6 ++++-- 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index d5a90ca65b..d71b180f76 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -339,7 +339,7 @@ func ReadMemStatsSlow() (base, slow MemStats) { // Add in frees. readmemstats_m flushed the cached stats, so // these are up-to-date. - var largeFree, smallFree uint64 + var tinyAllocs, largeFree, smallFree uint64 for _, p := range allp { c := p.mcache if c == nil { @@ -349,6 +349,9 @@ func ReadMemStatsSlow() (base, slow MemStats) { largeFree += uint64(c.local_largefree) slow.Frees += uint64(c.local_nlargefree) + // Collect tiny allocation stats. + tinyAllocs += uint64(c.local_tinyallocs) + // Collect per-sizeclass stats. for i := 0; i < _NumSizeClasses; i++ { slow.Frees += uint64(c.local_nsmallfree[i]) @@ -357,7 +360,7 @@ func ReadMemStatsSlow() (base, slow MemStats) { smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i]) } } - slow.Frees += memstats.tinyallocs + slow.Frees += tinyAllocs slow.Mallocs += slow.Frees slow.TotalAlloc = slow.Alloc + largeFree + smallFree diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 4d2ba6dff0..fe603116a2 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -32,9 +32,8 @@ type mcache struct { // tiny is a heap pointer. Since mcache is in non-GC'd memory, // we handle it by clearing it in releaseAll during mark // termination. - tiny uintptr - tinyoffset uintptr - local_tinyallocs uintptr // number of tiny allocs not counted in other stats + tiny uintptr + tinyoffset uintptr // The rest is not accessed on every malloc. @@ -49,6 +48,7 @@ type mcache struct { // When read with stats from other mcaches and with the world // stopped, the result will accurately reflect the state of the // application. + local_tinyallocs uintptr // number of tiny allocs not counted in other stats local_largealloc uintptr // bytes allocated for large objects local_nlargealloc uintptr // number of large object allocations local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects @@ -151,6 +151,8 @@ func (c *mcache) donate(d *mcache) { d.local_nsmallfree[i] += c.local_nsmallfree[i] c.local_nsmallfree[i] = 0 } + d.local_tinyallocs += c.local_tinyallocs + c.local_tinyallocs = 0 } // refill acquires a new span of span class spc for c. This span will diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 5635dc6784..47f86ee38c 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -1163,8 +1163,6 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS } atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) c.local_scan = 0 - memstats.tinyallocs += uint64(c.local_tinyallocs) - c.local_tinyallocs = 0 // heap_scan was been updated. if gcBlackenEnabled != 0 { @@ -1358,8 +1356,6 @@ func (h *mheap) freeSpan(s *mspan) { lock(&h.lock) atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) c.local_scan = 0 - memstats.tinyallocs += uint64(c.local_tinyallocs) - c.local_tinyallocs = 0 if msanenabled { // Tell msan that this entire span is no longer in use. 
base := unsafe.Pointer(s.base()) diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 44cf17c85b..341906fced 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -550,6 +550,7 @@ func updatememstats() { memstats.total_alloc = 0 memstats.nmalloc = 0 memstats.nfree = 0 + memstats.tinyallocs = 0 for i := 0; i < len(memstats.by_size); i++ { memstats.by_size[i].nmalloc = 0 memstats.by_size[i].nfree = 0 @@ -572,6 +573,9 @@ func updatememstats() { totalFree += uint64(c.local_largefree) memstats.nfree += uint64(c.local_nlargefree) + // Collect tiny allocation stats. + memstats.tinyallocs += uint64(c.local_tinyallocs) + // Collect per-sizeclass stats. for i := 0; i < _NumSizeClasses; i++ { // Malloc stats. @@ -644,8 +648,6 @@ func purgecachedstats(c *mcache) { // Protected by heap lock. atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) c.local_scan = 0 - memstats.tinyallocs += uint64(c.local_tinyallocs) - c.local_tinyallocs = 0 } // Atomically increases a given *system* memory stat. We are counting on this -- cgit v1.2.1 From d677899e903c4741920846f1af2c14c56f6e710e Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 23 Jul 2020 22:36:58 +0000 Subject: runtime: flush local_scan directly and more often Now that local_scan is the last mcache-based statistic that is flushed by purgecachedstats, and heap_scan and gcController.revise may be interacted with concurrently, we don't need to flush heap_scan at arbitrary locations where the heap is locked, and we don't need purgecachedstats and cachestats anymore. Instead, we can flush local_scan at the same time we update heap_live in refill, so the two updates may share the same revise call. Clean up unused functions, remove code that would cause the heap to get locked in the allocSpan when it didn't need to (other than to flush local_scan), and flush local_scan explicitly in a few important places. Notably we need to flush local_scan whenever we flush the other stats, but it doesn't need to be donated anywhere, so have releaseAll do the flushing. Also, we need to flush local_scan before we set heap_scan at the end of a GC, which was previously handled by cachestats. Just do so explicitly -- it's not much code and it becomes a lot more clear why we need to do so. Change-Id: I35ac081784df7744d515479896a41d530653692d Reviewed-on: https://go-review.googlesource.com/c/go/+/246968 Run-TryBot: Michael Knyszek Trust: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/mcache.go | 22 ++++++++++++++++++++-- src/runtime/mgc.go | 14 ++++++++++++-- src/runtime/mheap.go | 49 +++---------------------------------------------- src/runtime/mstats.go | 25 ------------------------- 4 files changed, 35 insertions(+), 75 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index fe603116a2..b8e388cc4f 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -124,7 +124,6 @@ func freemcache(c *mcache, recipient *mcache) { // gcworkbuffree(c.gcworkbuf) lock(&mheap_.lock) - purgecachedstats(c) // Donate anything else that's left. c.donate(recipient) mheap_.cachealloc.free(unsafe.Pointer(c)) @@ -135,6 +134,8 @@ func freemcache(c *mcache, recipient *mcache) { // donate flushes data and resources which have no global // pool to another mcache. func (c *mcache) donate(d *mcache) { + // local_scan is handled separately because it's not + // like these stats -- it's used for GC pacing. 
d.local_largealloc += c.local_largealloc c.local_largealloc = 0 d.local_nlargealloc += c.local_nlargealloc @@ -192,14 +193,22 @@ func (c *mcache) refill(spc spanClass) { // Assume all objects from this span will be allocated in the // mcache. If it gets uncached, we'll adjust this. c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount) + + // Update heap_live with the same assumption. usedBytes := uintptr(s.allocCount) * s.elemsize atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes)) + + // While we're here, flush local_scan, since we have to call + // revise anyway. + atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) + c.local_scan = 0 + if trace.enabled { // heap_live changed. traceHeapAlloc() } if gcBlackenEnabled != 0 { - // heap_live changed. + // heap_live and heap_scan changed. gcController.revise() } @@ -248,6 +257,10 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { } func (c *mcache) releaseAll() { + // Take this opportunity to flush local_scan. + atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) + c.local_scan = 0 + sg := mheap_.sweepgen for i := range c.alloc { s := c.alloc[i] @@ -273,6 +286,11 @@ func (c *mcache) releaseAll() { // Clear tinyalloc pool. c.tiny = 0 c.tinyoffset = 0 + + // Updated heap_scan and possible heap_live. + if gcBlackenEnabled != 0 { + gcController.revise() + } } // prepareForSweep flushes c if the system has entered a new sweep phase diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index c54f893689..55554c117c 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -2083,11 +2083,21 @@ func gcMark(start_time int64) { gcw.dispose() } - cachestats() - // Update the marked heap stat. memstats.heap_marked = work.bytesMarked + // Flush local_scan from each mcache since we're about to modify + // heap_scan directly. If we were to flush this later, then local_scan + // might have incorrect information. + for _, p := range allp { + c := p.mcache + if c == nil { + continue + } + memstats.heap_scan += uint64(c.local_scan) + c.local_scan = 0 + } + // Update other GC heap size stats. This must happen after // cachestats (which flushes local statistics to these) and // flushallmcaches (which modifies heap_live). diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 47f86ee38c..40fd58b0ef 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -1102,23 +1102,11 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS base, scav = c.alloc(npages) if base != 0 { s = h.tryAllocMSpan() - - if s != nil && gcBlackenEnabled == 0 && (manual || spanclass.sizeclass() != 0) { + if s != nil { goto HaveSpan } - // We're either running duing GC, failed to acquire a mspan, - // or the allocation is for a large object. This means we - // have to lock the heap and do a bunch of extra work, - // so go down the HaveBaseLocked path. - // - // We must do this during GC to avoid skew with heap_scan - // since we flush mcache stats whenever we lock. - // - // TODO(mknyszek): It would be nice to not have to - // lock the heap if it's a large allocation, but - // it's fine for now. The critical section here is - // short and large object allocations are relatively - // infrequent. + // We have a base but no mspan, so we need + // to lock the heap. } } @@ -1145,30 +1133,6 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS // one now that we have the heap lock. 
s = h.allocMSpanLocked() } - if !manual { - // This is a heap span, so we should do some additional accounting - // which may only be done with the heap locked. - - // Transfer stats from mcache to global. - var c *mcache - if gp.m.p != 0 { - c = gp.m.p.ptr().mcache - } else { - // This case occurs while bootstrapping. - // See the similar code in mallocgc. - c = mcache0 - if c == nil { - throw("mheap.allocSpan called with no P") - } - } - atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) - c.local_scan = 0 - - // heap_scan was been updated. - if gcBlackenEnabled != 0 { - gcController.revise() - } - } unlock(&h.lock) HaveSpan: @@ -1352,20 +1316,13 @@ func (h *mheap) grow(npage uintptr) bool { // Free the span back into the heap. func (h *mheap) freeSpan(s *mspan) { systemstack(func() { - c := getg().m.p.ptr().mcache lock(&h.lock) - atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) - c.local_scan = 0 if msanenabled { // Tell msan that this entire span is no longer in use. base := unsafe.Pointer(s.base()) bytes := s.npages << _PageShift msanfree(base, bytes) } - if gcBlackenEnabled != 0 { - // heap_scan changed. - gcController.revise() - } h.freeSpanLocked(s, true, true) unlock(&h.lock) }) diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 341906fced..5eeb173640 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -556,9 +556,6 @@ func updatememstats() { memstats.by_size[i].nfree = 0 } - // Aggregate local stats. - cachestats() - // Collect allocation stats. This is safe and consistent // because the world is stopped. var smallFree, totalAlloc, totalFree uint64 @@ -602,21 +599,6 @@ func updatememstats() { memstats.heap_objects = memstats.nmalloc - memstats.nfree } -// cachestats flushes all mcache stats. -// -// The world must be stopped. -// -//go:nowritebarrier -func cachestats() { - for _, p := range allp { - c := p.mcache - if c == nil { - continue - } - purgecachedstats(c) - } -} - // flushmcache flushes the mcache of allp[i]. // // The world must be stopped. @@ -643,13 +625,6 @@ func flushallmcaches() { } } -//go:nosplit -func purgecachedstats(c *mcache) { - // Protected by heap lock. - atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) - c.local_scan = 0 -} - // Atomically increases a given *system* memory stat. We are counting on this // stat never overflowing a uintptr, so this function must only be used for // system memory stats. -- cgit v1.2.1 From c8638498008f9874dc5a48734418e0fbea08cee9 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Fri, 24 Jul 2020 19:58:31 +0000 Subject: runtime: rename mcache fields to match Go style This change renames a bunch of malloc statistics stored in the mcache that are all named with the "local_" prefix. It also renames largeAlloc to allocLarge to prevent a naming conflict, and next_sample because it would be the last mcache field with the old C naming style. 
Change-Id: I29695cb83b397a435ede7e9ad5c3c9be72767ea3 Reviewed-on: https://go-review.googlesource.com/c/go/+/246969 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 14 ++++---- src/runtime/malloc.go | 12 +++---- src/runtime/mcache.go | 78 ++++++++++++++++++++--------------------- src/runtime/mgc.go | 8 ++--- src/runtime/mgcsweep.go | 6 ++-- src/runtime/mstats.go | 22 ++++++------ src/runtime/pprof/mprof_test.go | 2 +- 7 files changed, 71 insertions(+), 71 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index d71b180f76..47cbc286f6 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -346,18 +346,18 @@ func ReadMemStatsSlow() (base, slow MemStats) { continue } // Collect large allocation stats. - largeFree += uint64(c.local_largefree) - slow.Frees += uint64(c.local_nlargefree) + largeFree += uint64(c.largeFree) + slow.Frees += uint64(c.largeFreeCount) // Collect tiny allocation stats. - tinyAllocs += uint64(c.local_tinyallocs) + tinyAllocs += uint64(c.tinyAllocCount) // Collect per-sizeclass stats. for i := 0; i < _NumSizeClasses; i++ { - slow.Frees += uint64(c.local_nsmallfree[i]) - bySize[i].Frees += uint64(c.local_nsmallfree[i]) - bySize[i].Mallocs += uint64(c.local_nsmallfree[i]) - smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i]) + slow.Frees += uint64(c.smallFreeCount[i]) + bySize[i].Frees += uint64(c.smallFreeCount[i]) + bySize[i].Mallocs += uint64(c.smallFreeCount[i]) + smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i]) } } slow.Frees += tinyAllocs diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index ec601ccb39..0f48d7f68e 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1040,7 +1040,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // The object fits into existing tiny block. x = unsafe.Pointer(c.tiny + off) c.tinyoffset = off + size - c.local_tinyallocs++ + c.tinyAllocCount++ mp.mallocing = 0 releasem(mp) return x @@ -1082,7 +1082,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } } else { shouldhelpgc = true - span = c.largeAlloc(size, needzero, noscan) + span = c.allocLarge(size, needzero, noscan) span.freeindex = 1 span.allocCount = 1 x = unsafe.Pointer(span.base()) @@ -1111,7 +1111,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } else { scanSize = typ.ptrdata } - c.local_scan += scanSize + c.scanAlloc += scanSize } // Ensure that the stores above that initialize x to @@ -1153,8 +1153,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } if rate := MemProfileRate; rate > 0 { - if rate != 1 && size < c.next_sample { - c.next_sample -= size + if rate != 1 && size < c.nextSample { + c.nextSample -= size } else { mp := acquirem() profilealloc(mp, x, size) @@ -1221,7 +1221,7 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { throw("profilealloc called with no P") } } - c.next_sample = nextSample() + c.nextSample = nextSample() mProf_Malloc(x, size) } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index b8e388cc4f..c3e0e5e1f7 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -20,8 +20,8 @@ import ( type mcache struct { // The following members are accessed on every malloc, // so they are grouped here for better caching. 
- next_sample uintptr // trigger heap sample after allocating this many bytes - local_scan uintptr // bytes of scannable heap allocated + nextSample uintptr // trigger heap sample after allocating this many bytes + scanAlloc uintptr // bytes of scannable heap allocated // Allocator cache for tiny objects w/o pointers. // See "Tiny allocator" comment in malloc.go. @@ -48,13 +48,13 @@ type mcache struct { // When read with stats from other mcaches and with the world // stopped, the result will accurately reflect the state of the // application. - local_tinyallocs uintptr // number of tiny allocs not counted in other stats - local_largealloc uintptr // bytes allocated for large objects - local_nlargealloc uintptr // number of large object allocations - local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects - local_largefree uintptr // bytes freed for large objects (>maxsmallsize) - local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) - local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) + tinyAllocCount uintptr // number of tiny allocs not counted in other stats + largeAlloc uintptr // bytes allocated for large objects + largeAllocCount uintptr // number of large object allocations + smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects + largeFree uintptr // bytes freed for large objects (>maxSmallSize) + largeFreeCount uintptr // number of frees for large objects (>maxSmallSize) + smallFreeCount [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize) // flushGen indicates the sweepgen during which this mcache // was last flushed. If flushGen != mheap_.sweepgen, the spans @@ -103,7 +103,7 @@ func allocmcache() *mcache { for i := range c.alloc { c.alloc[i] = &emptymspan } - c.next_sample = nextSample() + c.nextSample = nextSample() return c } @@ -134,26 +134,26 @@ func freemcache(c *mcache, recipient *mcache) { // donate flushes data and resources which have no global // pool to another mcache. func (c *mcache) donate(d *mcache) { - // local_scan is handled separately because it's not + // scanAlloc is handled separately because it's not // like these stats -- it's used for GC pacing. - d.local_largealloc += c.local_largealloc - c.local_largealloc = 0 - d.local_nlargealloc += c.local_nlargealloc - c.local_nlargealloc = 0 - for i := range c.local_nsmallalloc { - d.local_nsmallalloc[i] += c.local_nsmallalloc[i] - c.local_nsmallalloc[i] = 0 + d.largeAlloc += c.largeAlloc + c.largeAlloc = 0 + d.largeAllocCount += c.largeAllocCount + c.largeAllocCount = 0 + for i := range c.smallAllocCount { + d.smallAllocCount[i] += c.smallAllocCount[i] + c.smallAllocCount[i] = 0 } - d.local_largefree += c.local_largefree - c.local_largefree = 0 - d.local_nlargefree += c.local_nlargefree - c.local_nlargefree = 0 - for i := range c.local_nsmallfree { - d.local_nsmallfree[i] += c.local_nsmallfree[i] - c.local_nsmallfree[i] = 0 + d.largeFree += c.largeFree + c.largeFree = 0 + d.largeFreeCount += c.largeFreeCount + c.largeFreeCount = 0 + for i := range c.smallFreeCount { + d.smallFreeCount[i] += c.smallFreeCount[i] + c.smallFreeCount[i] = 0 } - d.local_tinyallocs += c.local_tinyallocs - c.local_tinyallocs = 0 + d.tinyAllocCount += c.tinyAllocCount + c.tinyAllocCount = 0 } // refill acquires a new span of span class spc for c. This span will @@ -192,16 +192,16 @@ func (c *mcache) refill(spc spanClass) { // Assume all objects from this span will be allocated in the // mcache. 
If it gets uncached, we'll adjust this. - c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount) + c.smallAllocCount[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount) // Update heap_live with the same assumption. usedBytes := uintptr(s.allocCount) * s.elemsize atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes)) - // While we're here, flush local_scan, since we have to call + // While we're here, flush scanAlloc, since we have to call // revise anyway. - atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) - c.local_scan = 0 + atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc)) + c.scanAlloc = 0 if trace.enabled { // heap_live changed. @@ -215,8 +215,8 @@ func (c *mcache) refill(spc spanClass) { c.alloc[spc] = s } -// largeAlloc allocates a span for a large object. -func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { +// allocLarge allocates a span for a large object. +func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan { if size+_PageSize < size { throw("out of memory") } @@ -235,8 +235,8 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { if s == nil { throw("out of memory") } - c.local_largealloc += npages * pageSize - c.local_nlargealloc++ + c.largeAlloc += npages * pageSize + c.largeAllocCount++ // Update heap_live and revise pacing if needed. atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize)) @@ -257,9 +257,9 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { } func (c *mcache) releaseAll() { - // Take this opportunity to flush local_scan. - atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan)) - c.local_scan = 0 + // Take this opportunity to flush scanAlloc. + atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc)) + c.scanAlloc = 0 sg := mheap_.sweepgen for i := range c.alloc { @@ -267,7 +267,7 @@ func (c *mcache) releaseAll() { if s != &emptymspan { // Adjust nsmallalloc in case the span wasn't fully allocated. n := uintptr(s.nelems) - uintptr(s.allocCount) - c.local_nsmallalloc[spanClass(i).sizeclass()] -= n + c.smallAllocCount[spanClass(i).sizeclass()] -= n if s.sweepgen != sg+1 { // refill conservatively counted unallocated slots in heap_live. // Undo this. diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 55554c117c..540c376f1c 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -2086,16 +2086,16 @@ func gcMark(start_time int64) { // Update the marked heap stat. memstats.heap_marked = work.bytesMarked - // Flush local_scan from each mcache since we're about to modify - // heap_scan directly. If we were to flush this later, then local_scan + // Flush scanAlloc from each mcache since we're about to modify + // heap_scan directly. If we were to flush this later, then scanAlloc // might have incorrect information. for _, p := range allp { c := p.mcache if c == nil { continue } - memstats.heap_scan += uint64(c.local_scan) - c.local_scan = 0 + memstats.heap_scan += uint64(c.scanAlloc) + c.scanAlloc = 0 } // Update other GC heap size stats. This must happen after diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 6b8c56ce35..7103b08455 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -503,7 +503,7 @@ func (s *mspan) sweep(preserve bool) bool { // wasn't totally filled, but then swept, still has all of its // free slots zeroed. 
s.needzero = 1 - c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed) + c.smallFreeCount[spc.sizeclass()] += uintptr(nfreed) } if !preserve { // The caller may not have removed this span from whatever @@ -548,8 +548,8 @@ func (s *mspan) sweep(preserve bool) bool { } else { mheap_.freeSpan(s) } - c.local_nlargefree++ - c.local_largefree += size + c.largeFreeCount++ + c.largeFree += size return true } diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 5eeb173640..64687c24e5 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -565,25 +565,25 @@ func updatememstats() { continue } // Collect large allocation stats. - memstats.nmalloc += uint64(c.local_nlargealloc) - totalAlloc += uint64(c.local_largealloc) - totalFree += uint64(c.local_largefree) - memstats.nfree += uint64(c.local_nlargefree) + memstats.nmalloc += uint64(c.largeAllocCount) + totalAlloc += uint64(c.largeAlloc) + totalFree += uint64(c.largeFree) + memstats.nfree += uint64(c.largeFreeCount) // Collect tiny allocation stats. - memstats.tinyallocs += uint64(c.local_tinyallocs) + memstats.tinyallocs += uint64(c.tinyAllocCount) // Collect per-sizeclass stats. for i := 0; i < _NumSizeClasses; i++ { // Malloc stats. - memstats.nmalloc += uint64(c.local_nsmallalloc[i]) - memstats.by_size[i].nmalloc += uint64(c.local_nsmallalloc[i]) - totalAlloc += uint64(c.local_nsmallalloc[i]) * uint64(class_to_size[i]) + memstats.nmalloc += uint64(c.smallAllocCount[i]) + memstats.by_size[i].nmalloc += uint64(c.smallAllocCount[i]) + totalAlloc += uint64(c.smallAllocCount[i]) * uint64(class_to_size[i]) // Free stats. - memstats.nfree += uint64(c.local_nsmallfree[i]) - memstats.by_size[i].nfree += uint64(c.local_nsmallfree[i]) - smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i]) + memstats.nfree += uint64(c.smallFreeCount[i]) + memstats.by_size[i].nfree += uint64(c.smallFreeCount[i]) + smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i]) } } diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go index f253f07def..c11a45fd69 100644 --- a/src/runtime/pprof/mprof_test.go +++ b/src/runtime/pprof/mprof_test.go @@ -70,7 +70,7 @@ func TestMemoryProfiler(t *testing.T) { runtime.MemProfileRate = oldRate }() - // Allocate a meg to ensure that mcache.next_sample is updated to 1. + // Allocate a meg to ensure that mcache.nextSample is updated to 1. for i := 0; i < 1024; i++ { memSink = make([]byte, 1024) } -- cgit v1.2.1 From dc02578ac8bb27359c7d0451ca249e47bdef2a9e Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Wed, 29 Jul 2020 19:00:37 +0000 Subject: runtime: make the span allocation purpose more explicit This change modifies mheap's span allocation API to have each caller declare a purpose, defined as a new enum called spanAllocType. The purpose behind this change is two-fold: 1. Tight control over who gets to allocate heap memory is, generally speaking, a good thing. Every codepath that allocates heap memory places additional implicit restrictions on the allocator. A notable example of a restriction is work bufs coming from heap memory: write barriers are not allowed in allocation paths because then we could have a situation where the allocator calls into the allocator. 2. Memory statistic updating is explicit. Instead of passing an opaque pointer for statistic updating, which places restrictions on how that statistic may be updated, we use the spanAllocType to determine which statistic to update and how. 
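As a rough illustration of point 2 -- a sketch with invented names, not the actual runtime API -- declaring a small allocation-purpose enum lets the allocator own the bookkeeping, rather than having each caller pass a pointer to whichever statistic it wants bumped:

	package main

	import "fmt"

	type spanAllocKind uint8 // illustrative stand-in for a spanAllocType-style enum

	const (
		allocHeap spanAllocKind = iota
		allocStack
		allocWorkBuf
	)

	type stats struct {
		heapInuse, stacksInuse, gcSys uint64
	}

	// Old shape: the caller passes an opaque stat pointer, so the allocator
	// cannot reason about who is allocating or why.
	func allocSpanOld(st *stats, stat *uint64, nbytes uint64) {
		*stat += nbytes
	}

	// New shape: the caller declares a purpose and the allocator decides
	// which statistic to update, keeping all accounting in one place.
	func allocSpanNew(st *stats, kind spanAllocKind, nbytes uint64) {
		switch kind {
		case allocHeap:
			st.heapInuse += nbytes
		case allocStack:
			st.stacksInuse += nbytes
		case allocWorkBuf:
			st.gcSys += nbytes
		}
	}

	func main() {
		var st stats
		allocSpanOld(&st, &st.heapInuse, 8192)
		allocSpanNew(&st, allocStack, 4096)
		fmt.Println(st.heapInuse, st.stacksInuse, st.gcSys) // prints 8192 4096 0
	}
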
We also take this opportunity to group all the statistic updating code together, which should make the accounting code a little easier to follow. Change-Id: Ic0b0898959ba2a776f67122f0e36c9d7d60e3085 Reviewed-on: https://go-review.googlesource.com/c/go/+/246970 Trust: Michael Knyszek Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt --- src/runtime/mbitmap.go | 4 +-- src/runtime/mgcwork.go | 4 +-- src/runtime/mheap.go | 78 +++++++++++++++++++++++++++++++++++++------------- src/runtime/stack.go | 12 ++++---- 4 files changed, 68 insertions(+), 30 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 51c3625c3d..fbfaae0f93 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -1868,12 +1868,12 @@ func materializeGCProg(ptrdata uintptr, prog *byte) *mspan { bitmapBytes := divRoundUp(ptrdata, 8*sys.PtrSize) // Compute the number of pages needed for bitmapBytes. pages := divRoundUp(bitmapBytes, pageSize) - s := mheap_.allocManual(pages, &memstats.gc_sys) + s := mheap_.allocManual(pages, spanAllocPtrScalarBits) runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1) return s } func dematerializeGCProg(s *mspan) { - mheap_.freeManual(s, &memstats.gc_sys) + mheap_.freeManual(s, spanAllocPtrScalarBits) } func dumpGCProg(p *byte) { diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go index 51e0fe9219..b3a068661e 100644 --- a/src/runtime/mgcwork.go +++ b/src/runtime/mgcwork.go @@ -371,7 +371,7 @@ func getempty() *workbuf { } if s == nil { systemstack(func() { - s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys) + s = mheap_.allocManual(workbufAlloc/pageSize, spanAllocWorkBuf) }) if s == nil { throw("out of memory") @@ -473,7 +473,7 @@ func freeSomeWbufs(preemptible bool) bool { break } work.wbufSpans.free.remove(span) - mheap_.freeManual(span, &memstats.gc_sys) + mheap_.freeManual(span, spanAllocWorkBuf) } }) more := !work.wbufSpans.free.isEmpty() diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 40fd58b0ef..df659e222b 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -861,6 +861,22 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { return nFreed } +// spanAllocType represents the type of allocation to make, or +// the type of allocation to be freed. +type spanAllocType uint8 + +const ( + spanAllocHeap spanAllocType = iota // heap span + spanAllocStack // stack span + spanAllocPtrScalarBits // unrolled GC prog bitmap span + spanAllocWorkBuf // work buf span +) + +// manual returns true if the span allocation is manually managed. +func (s spanAllocType) manual() bool { + return s != spanAllocHeap +} + // alloc allocates a new span of npage pages from the GC'd heap. // // spanclass indicates the span's size class and scannability. @@ -877,7 +893,7 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan if h.sweepdone == 0 { h.reclaim(npages) } - s = h.allocSpan(npages, false, spanclass, &memstats.heap_inuse) + s = h.allocSpan(npages, spanAllocHeap, spanclass) }) if s != nil { @@ -902,9 +918,15 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan // allocManual must be called on the system stack because it may // acquire the heap lock via allocSpan. See mheap for details. // +// If new code is written to call allocManual, do NOT use an +// existing spanAllocType value and instead declare a new one. 
+// //go:systemstack -func (h *mheap) allocManual(npages uintptr, stat *uint64) *mspan { - return h.allocSpan(npages, true, 0, stat) +func (h *mheap) allocManual(npages uintptr, typ spanAllocType) *mspan { + if !typ.manual() { + throw("manual span allocation called with non-manually-managed type") + } + return h.allocSpan(npages, typ, 0) } // setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize)) @@ -1066,7 +1088,7 @@ func (h *mheap) freeMSpanLocked(s *mspan) { // allocSpan allocates an mspan which owns npages worth of memory. // -// If manual == false, allocSpan allocates a heap span of class spanclass +// If typ.manual() == false, allocSpan allocates a heap span of class spanclass // and updates heap accounting. If manual == true, allocSpan allocates a // manually-managed span (spanclass is ignored), and the caller is // responsible for any accounting related to its use of the span. Either @@ -1081,7 +1103,7 @@ func (h *mheap) freeMSpanLocked(s *mspan) { // the heap lock and because it must block GC transitions. // //go:systemstack -func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysStat *uint64) (s *mspan) { +func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass) (s *mspan) { // Function-global state. gp := getg() base, scav := uintptr(0), uintptr(0) @@ -1143,12 +1165,10 @@ HaveSpan: s.needzero = 1 } nbytes := npages * pageSize - if manual { + if typ.manual() { s.manualFreeList = 0 s.nelems = 0 s.limit = s.base() + s.npages*pageSize - // Manually managed memory doesn't count toward heap_sys. - mSysStatDec(&memstats.heap_sys, s.npages*pageSize) s.state.set(mSpanManual) } else { // We must set span properties before the span is published anywhere @@ -1205,7 +1225,18 @@ HaveSpan: mSysStatDec(&memstats.heap_released, scav) } // Update stats. - mSysStatInc(sysStat, nbytes) + switch typ { + case spanAllocHeap: + mSysStatInc(&memstats.heap_inuse, nbytes) + case spanAllocStack: + mSysStatInc(&memstats.stacks_inuse, nbytes) + case spanAllocPtrScalarBits, spanAllocWorkBuf: + mSysStatInc(&memstats.gc_sys, nbytes) + } + if typ.manual() { + // Manually managed memory doesn't count toward heap_sys. + mSysStatDec(&memstats.heap_sys, nbytes) + } mSysStatDec(&memstats.heap_idle, nbytes) // Publish the span in various locations. @@ -1217,7 +1248,7 @@ HaveSpan: // before that happens) or pageInUse is updated. h.setSpans(s.base(), npages, s) - if !manual { + if !typ.manual() { // Mark in-use span in arena page bitmap. // // This publishes the span to the page sweeper, so @@ -1323,13 +1354,13 @@ func (h *mheap) freeSpan(s *mspan) { bytes := s.npages << _PageShift msanfree(base, bytes) } - h.freeSpanLocked(s, true, true) + h.freeSpanLocked(s, spanAllocHeap) unlock(&h.lock) }) } // freeManual frees a manually-managed span returned by allocManual. -// stat must be the same as the stat passed to the allocManual that +// typ must be the same as the spanAllocType passed to the allocManual that // allocated s. // // This must only be called when gcphase == _GCoff. See mSpanState for @@ -1339,16 +1370,14 @@ func (h *mheap) freeSpan(s *mspan) { // the heap lock. See mheap for details. 
// //go:systemstack -func (h *mheap) freeManual(s *mspan, stat *uint64) { +func (h *mheap) freeManual(s *mspan, typ spanAllocType) { s.needzero = 1 lock(&h.lock) - mSysStatDec(stat, s.npages*pageSize) - mSysStatInc(&memstats.heap_sys, s.npages*pageSize) - h.freeSpanLocked(s, false, true) + h.freeSpanLocked(s, typ) unlock(&h.lock) } -func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) { +func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { switch s.state.get() { case mSpanManual: if s.allocCount != 0 { @@ -1368,12 +1397,21 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) { throw("mheap.freeSpanLocked - invalid span state") } - if acctinuse { + // Update stats. + // + // Mirrors the code in allocSpan. + switch typ { + case spanAllocHeap: mSysStatDec(&memstats.heap_inuse, s.npages*pageSize) + case spanAllocStack: + mSysStatDec(&memstats.stacks_inuse, s.npages*pageSize) + case spanAllocPtrScalarBits, spanAllocWorkBuf: + mSysStatDec(&memstats.gc_sys, s.npages*pageSize) } - if acctidle { - mSysStatInc(&memstats.heap_idle, s.npages*pageSize) + if typ.manual() { + mSysStatInc(&memstats.heap_sys, s.npages*pageSize) } + mSysStatInc(&memstats.heap_idle, s.npages*pageSize) // Mark the space as free. h.pages.free(s.base(), s.npages) diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 2afc2635aa..7b9dce5393 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -187,7 +187,7 @@ func stackpoolalloc(order uint8) gclinkptr { lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) if s == nil { // no free stacks. Allocate another span worth. - s = mheap_.allocManual(_StackCacheSize>>_PageShift, &memstats.stacks_inuse) + s = mheap_.allocManual(_StackCacheSize>>_PageShift, spanAllocStack) if s == nil { throw("out of memory") } @@ -251,7 +251,7 @@ func stackpoolfree(x gclinkptr, order uint8) { stackpool[order].item.span.remove(s) s.manualFreeList = 0 osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) } } @@ -396,7 +396,7 @@ func stackalloc(n uint32) stack { if s == nil { // Allocate a new stack from the heap. - s = mheap_.allocManual(npage, &memstats.stacks_inuse) + s = mheap_.allocManual(npage, spanAllocStack) if s == nil { throw("out of memory") } @@ -480,7 +480,7 @@ func stackfree(stk stack) { // Free the stack immediately if we're // sweeping. osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) } else { // If the GC is running, we can't return a // stack span to the heap because it could be @@ -1193,7 +1193,7 @@ func freeStackSpans() { list.remove(s) s.manualFreeList = 0 osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) } s = next } @@ -1207,7 +1207,7 @@ func freeStackSpans() { next := s.next stackLarge.free[i].remove(s) osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) s = next } } -- cgit v1.2.1 From 8ebc58452af3a586a3da1f68725bc83c78d4b073 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Wed, 29 Jul 2020 20:25:05 +0000 Subject: runtime: delineate which memstats are system stats with a type This change modifies the type of several mstats fields to be a new type: sysMemStat. This type has the same structure as the fields used to have. The purpose of this change is to make it very clear which stats may be used in various functions for accounting (usually the platform-specific sys* functions, but there are others). 
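As a rough picture of what such a wrapper looks like (an editor's sketch with
simplified, hypothetical code; the real type is added in mstats.go in the diff
below), the essential property is that every read and update goes through atomics:

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	// sysMemStat here is an editor's stand-in for the runtime type described
	// above: a 64-bit memory statistic that is only ever accessed atomically.
	type sysMemStat uint64

	// load atomically reads the current value of the statistic.
	func (s *sysMemStat) load() uint64 {
		return atomic.LoadUint64((*uint64)(s))
	}

	// add atomically adjusts the statistic by n, which may be negative.
	// Adding uint64(n) relies on two's-complement wraparound for subtraction.
	func (s *sysMemStat) add(n int64) {
		val := atomic.AddUint64((*uint64)(s), uint64(n))
		if int64(val) < 0 {
			panic("sysMemStat underflow")
		}
	}

	func main() {
		var heapSys sysMemStat
		heapSys.add(1 << 20)     // e.g. memory mapped for the heap
		heapSys.add(-(64 << 10)) // e.g. memory repurposed for non-heap use
		fmt.Println(heapSys.load())
	}
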
Currently there's an implicit understanding that the *uint64 value passed to these functions is some kind of statistic whose value is atomically managed. This understanding isn't inherently problematic, but we're about to change how some stats (which currently use mSysStatInc and mSysStatDec) work, so we want to make it very clear what the various requirements are around "sysStat". This change also removes mSysStatInc and mSysStatDec in favor of a method on sysMemStat. Note that those two functions were originally written the way they were because atomic 64-bit adds required a valid G on ARM, but this hasn't been the case for a very long time (since golang.org/cl/14204, but even before then it wasn't clear if mutexes required a valid G anymore). Today we implement 64-bit adds on ARM with a spinlock table. Change-Id: I4e9b37cf14afc2ae20cf736e874eb0064af086d7 Reviewed-on: https://go-review.googlesource.com/c/go/+/246971 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 4 +-- src/runtime/heapdump.go | 14 ++++---- src/runtime/malloc.go | 10 +++--- src/runtime/mem_aix.go | 12 +++---- src/runtime/mem_bsd.go | 12 +++---- src/runtime/mem_darwin.go | 12 +++---- src/runtime/mem_js.go | 10 +++--- src/runtime/mem_linux.go | 12 +++---- src/runtime/mem_plan9.go | 12 +++---- src/runtime/mem_windows.go | 12 +++---- src/runtime/mfixalloc.go | 4 +-- src/runtime/mgcscavenge.go | 4 +-- src/runtime/mheap.go | 28 ++++++++-------- src/runtime/mpagealloc.go | 4 +-- src/runtime/mranges.go | 4 +-- src/runtime/mstats.go | 82 +++++++++++++++++----------------------------- src/runtime/os_darwin.go | 3 +- 17 files changed, 109 insertions(+), 130 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 47cbc286f6..cb753ee819 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -820,7 +820,7 @@ type AddrRanges struct { // Add. 
func NewAddrRanges() AddrRanges { r := addrRanges{} - r.init(new(uint64)) + r.init(new(sysMemStat)) return AddrRanges{r, true} } @@ -844,7 +844,7 @@ func MakeAddrRanges(a ...AddrRange) AddrRanges { return AddrRanges{addrRanges{ ranges: ranges, totalBytes: total, - sysStat: new(uint64), + sysStat: new(sysMemStat), }, false} } diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 4c35309211..495ecc5164 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -548,20 +548,20 @@ func dumpmemstats() { dumpint(memstats.nmalloc) dumpint(memstats.nfree) dumpint(memstats.heap_alloc) - dumpint(memstats.heap_sys) + dumpint(memstats.heap_sys.load()) dumpint(memstats.heap_idle) dumpint(memstats.heap_inuse) dumpint(memstats.heap_released) dumpint(memstats.heap_objects) dumpint(memstats.stacks_inuse) - dumpint(memstats.stacks_sys) + dumpint(memstats.stacks_sys.load()) dumpint(memstats.mspan_inuse) - dumpint(memstats.mspan_sys) + dumpint(memstats.mspan_sys.load()) dumpint(memstats.mcache_inuse) - dumpint(memstats.mcache_sys) - dumpint(memstats.buckhash_sys) - dumpint(memstats.gc_sys) - dumpint(memstats.other_sys) + dumpint(memstats.mcache_sys.load()) + dumpint(memstats.buckhash_sys.load()) + dumpint(memstats.gc_sys.load()) + dumpint(memstats.other_sys.load()) dumpint(memstats.next_gc) dumpint(memstats.last_gc_unix) dumpint(memstats.pause_total_ns) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 0f48d7f68e..27d678d917 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1313,7 +1313,7 @@ var persistentChunks *notInHeap // The returned memory will be zeroed. // // Consider marking persistentalloc'd types go:notinheap. -func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { +func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { var p *notInHeap systemstack(func() { p = persistentalloc1(size, align, sysStat) @@ -1324,7 +1324,7 @@ func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { // Must run on system stack because stack growth can (re)invoke it. // See issue 9174. //go:systemstack -func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { +func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap { const ( maxBlock = 64 << 10 // VM reservation granularity is 64K on windows ) @@ -1383,8 +1383,8 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { } if sysStat != &memstats.other_sys { - mSysStatInc(sysStat, size) - mSysStatDec(&memstats.other_sys, size) + sysStat.add(int64(size)) + memstats.other_sys.add(-int64(size)) } return p } @@ -1425,7 +1425,7 @@ func (l *linearAlloc) init(base, size uintptr) { l.end = base + size } -func (l *linearAlloc) alloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { +func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { p := alignUp(l.next, align) if p+size > l.end { return nil diff --git a/src/runtime/mem_aix.go b/src/runtime/mem_aix.go index 7e145b072a..957aa4dcc2 100644 --- a/src/runtime/mem_aix.go +++ b/src/runtime/mem_aix.go @@ -11,7 +11,7 @@ import ( // Don't split the stack as this method may be invoked without a valid G, which // prevents us from allocating more stack. 
//go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { if err == _EACCES { @@ -24,7 +24,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { } return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return p } @@ -41,8 +41,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -59,8 +59,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { return p } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) // AIX does not allow mapping a range that is already mapped. // So, call mprotect to change permissions. diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go index 4d860e7bd3..bc672019fb 100644 --- a/src/runtime/mem_bsd.go +++ b/src/runtime/mem_bsd.go @@ -13,12 +13,12 @@ import ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return v } @@ -35,8 +35,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -65,8 +65,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { const _sunosEAGAIN = 11 const _ENOMEM = 12 -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) if err == _ENOMEM || ((GOOS == "solaris" || GOOS == "illumos") && err == _sunosEAGAIN) { diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go index 3b5d565b0f..7fccd2bb8e 100644 --- a/src/runtime/mem_darwin.go +++ b/src/runtime/mem_darwin.go @@ -11,12 +11,12 @@ import ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return v } @@ -39,8 +39,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. 
//go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -58,8 +58,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { const _ENOMEM = 12 -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) if err == _ENOMEM { diff --git a/src/runtime/mem_js.go b/src/runtime/mem_js.go index 092b3d4fa2..957ed36ffa 100644 --- a/src/runtime/mem_js.go +++ b/src/runtime/mem_js.go @@ -13,7 +13,7 @@ import ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { p := sysReserve(nil, n) sysMap(p, n, sysStat) return p @@ -31,8 +31,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) } func sysFault(v unsafe.Pointer, n uintptr) { @@ -80,6 +80,6 @@ func growMemory(pages int32) int32 // This allows the front-end to replace the old DataView object with a new one. func resetMemoryDataView() -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) } diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go index 59b0bca970..3436851091 100644 --- a/src/runtime/mem_linux.go +++ b/src/runtime/mem_linux.go @@ -17,7 +17,7 @@ const ( // Don't split the stack as this method may be invoked without a valid G, which // prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { if err == _EACCES { @@ -30,7 +30,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { } return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return p } @@ -144,8 +144,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. 
//go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -161,8 +161,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { return p } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) if err == _ENOMEM { diff --git a/src/runtime/mem_plan9.go b/src/runtime/mem_plan9.go index 4fea851cdd..53d8e6dffa 100644 --- a/src/runtime/mem_plan9.go +++ b/src/runtime/mem_plan9.go @@ -140,19 +140,19 @@ func sbrk(n uintptr) unsafe.Pointer { return unsafe.Pointer(bl) } -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { lock(&memlock) p := memAlloc(n) memCheck() unlock(&memlock) if p != nil { - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) } return p } -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) lock(&memlock) if uintptr(v)+n == bloc { // Address range being freed is at the end of memory, @@ -176,10 +176,10 @@ func sysUsed(v unsafe.Pointer, n uintptr) { func sysHugePage(v unsafe.Pointer, n uintptr) { } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { // sysReserve has already allocated all heap memory, // but has not adjusted stats. - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) } func sysFault(v unsafe.Pointer, n uintptr) { diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go index 165062ec27..3a805b9767 100644 --- a/src/runtime/mem_windows.go +++ b/src/runtime/mem_windows.go @@ -24,8 +24,8 @@ const ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { - mSysStatInc(sysStat, n) +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { + sysStat.add(int64(n)) return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_COMMIT|_MEM_RESERVE, _PAGE_READWRITE)) } @@ -97,8 +97,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. 
//go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) r := stdcall3(_VirtualFree, uintptr(v), 0, _MEM_RELEASE) if r == 0 { print("runtime: VirtualFree of ", n, " bytes failed with errno=", getlasterror(), "\n") @@ -124,6 +124,6 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_RESERVE, _PAGE_READWRITE)) } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) } diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go index f9dd6ca474..293c16b38b 100644 --- a/src/runtime/mfixalloc.go +++ b/src/runtime/mfixalloc.go @@ -32,7 +32,7 @@ type fixalloc struct { chunk uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers nchunk uint32 inuse uintptr // in-use bytes now - stat *uint64 + stat *sysMemStat zero bool // zero allocations } @@ -49,7 +49,7 @@ type mlink struct { // Initialize f to allocate objects of the given size, // using the allocator to obtain chunks of memory. -func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) { +func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *sysMemStat) { f.size = size f.first = first f.arg = arg diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go index 6328b295ca..8b1a0be353 100644 --- a/src/runtime/mgcscavenge.go +++ b/src/runtime/mgcscavenge.go @@ -100,7 +100,7 @@ const ( // heapRetained returns an estimate of the current heap RSS. func heapRetained() uint64 { - return atomic.Load64(&memstats.heap_sys) - atomic.Load64(&memstats.heap_released) + return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released) } // gcPaceScavenger updates the scavenger's pacing, particularly @@ -711,7 +711,7 @@ func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr // Update global accounting only when not in test, otherwise // the runtime's accounting will be wrong. - mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize) + atomic.Xadd64(&memstats.heap_released, int64(npages)*pageSize) return addr } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index df659e222b..27c1bfbcf1 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -1222,22 +1222,22 @@ HaveSpan: // sysUsed all the pages that are actually available // in the span since some of them might be scavenged. sysUsed(unsafe.Pointer(base), nbytes) - mSysStatDec(&memstats.heap_released, scav) + atomic.Xadd64(&memstats.heap_released, -int64(scav)) } // Update stats. switch typ { case spanAllocHeap: - mSysStatInc(&memstats.heap_inuse, nbytes) + atomic.Xadd64(&memstats.heap_inuse, int64(nbytes)) case spanAllocStack: - mSysStatInc(&memstats.stacks_inuse, nbytes) + atomic.Xadd64(&memstats.stacks_inuse, int64(nbytes)) case spanAllocPtrScalarBits, spanAllocWorkBuf: - mSysStatInc(&memstats.gc_sys, nbytes) + memstats.gc_sys.add(int64(nbytes)) } if typ.manual() { // Manually managed memory doesn't count toward heap_sys. - mSysStatDec(&memstats.heap_sys, nbytes) + memstats.heap_sys.add(-int64(nbytes)) } - mSysStatDec(&memstats.heap_idle, nbytes) + atomic.Xadd64(&memstats.heap_idle, -int64(nbytes)) // Publish the span in various locations. 
@@ -1314,8 +1314,8 @@ func (h *mheap) grow(npage uintptr) bool { // The allocation is always aligned to the heap arena // size which is always > physPageSize, so its safe to // just add directly to heap_released. - mSysStatInc(&memstats.heap_released, asize) - mSysStatInc(&memstats.heap_idle, asize) + atomic.Xadd64(&memstats.heap_released, int64(asize)) + atomic.Xadd64(&memstats.heap_idle, int64(asize)) // Recalculate nBase. // We know this won't overflow, because sysAlloc returned @@ -1400,18 +1400,20 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { // Update stats. // // Mirrors the code in allocSpan. + nbytes := s.npages * pageSize switch typ { case spanAllocHeap: - mSysStatDec(&memstats.heap_inuse, s.npages*pageSize) + atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes)) case spanAllocStack: - mSysStatDec(&memstats.stacks_inuse, s.npages*pageSize) + atomic.Xadd64(&memstats.stacks_inuse, -int64(nbytes)) case spanAllocPtrScalarBits, spanAllocWorkBuf: - mSysStatDec(&memstats.gc_sys, s.npages*pageSize) + memstats.gc_sys.add(-int64(nbytes)) } if typ.manual() { - mSysStatInc(&memstats.heap_sys, s.npages*pageSize) + // Manually managed memory doesn't count toward heap_sys, so add it back. + memstats.heap_sys.add(int64(nbytes)) } - mSysStatInc(&memstats.heap_idle, s.npages*pageSize) + atomic.Xadd64(&memstats.heap_idle, int64(nbytes)) // Mark the space as free. h.pages.free(s.base(), s.npages) diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go index 560babed03..2af1c97e0b 100644 --- a/src/runtime/mpagealloc.go +++ b/src/runtime/mpagealloc.go @@ -293,13 +293,13 @@ type pageAlloc struct { // sysStat is the runtime memstat to update when new system // memory is committed by the pageAlloc for allocation metadata. - sysStat *uint64 + sysStat *sysMemStat // Whether or not this struct is being used in tests. test bool } -func (p *pageAlloc) init(mheapLock *mutex, sysStat *uint64) { +func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) { if levelLogPages[0] > logMaxPackedValue { // We can't represent 1< 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) { + print("runtime: val=", val, " n=", n, "\n") + throw("sysMemStat overflow") } } diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index 394bd6fb0f..3f5bb7cf96 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -198,7 +198,6 @@ func newosproc(mp *m) { exit(1) } mp.g0.stack.hi = stacksize // for mstart - //mSysStatInc(&memstats.stacks_sys, stacksize) //TODO: do this? // Tell the pthread library we won't join with this thread. if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 { @@ -247,7 +246,7 @@ func newosproc0(stacksize uintptr, fn uintptr) { exit(1) } g0.stack.hi = stacksize // for mstart - mSysStatInc(&memstats.stacks_sys, stacksize) + memstats.stacks_sys.add(int64(stacksize)) // Tell the pthread library we won't join with this thread. if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 { -- cgit v1.2.1 From 39e335ac0618044bbd8ed2fca5e5b3583d8c444e Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Fri, 31 Jul 2020 21:32:26 +0000 Subject: runtime: copy in MemStats fields explicitly Currently MemStats is populated via an unsafe memmove from memstats, but this places unnecessary structural restrictions on memstats, is annoying to reason about, and tightly couples the two. Instead, just populate the fields of MemStats explicitly. 
Change-Id: I96f6a64326b1a91d4084e7b30169a4bbe6a331f9 Reviewed-on: https://go-review.googlesource.com/c/go/+/246972 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/mstats.go | 70 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 571a9c9ce3..466f33836c 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -12,8 +12,6 @@ import ( ) // Statistics. -// If you edit this structure, also edit type MemStats below. -// Their layouts must match exactly. // // For detailed descriptions see the documentation for MemStats. // Fields that differ from MemStats are further documented here. @@ -87,8 +85,6 @@ type mstats struct { // to 64 bits for atomic operations on 32 bit platforms. _ [1 - _NumSizeClasses%2]uint32 - // Statistics below here are not exported to MemStats directly. - last_gc_nanotime uint64 // last gc (monotonic time) tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly last_next_gc uint64 // next_gc for the previous GC @@ -430,20 +426,7 @@ type MemStats struct { } } -// Size of the trailing by_size array differs between mstats and MemStats, -// and all data after by_size is local to runtime, not exported. -// NumSizeClasses was changed, but we cannot change MemStats because of backward compatibility. -// sizeof_C_MStats is the size of the prefix of mstats that -// corresponds to MemStats. It should match Sizeof(MemStats{}). -var sizeof_C_MStats = unsafe.Offsetof(memstats.by_size) + 61*unsafe.Sizeof(memstats.by_size[0]) - func init() { - var memStats MemStats - if sizeof_C_MStats != unsafe.Sizeof(memStats) { - println(sizeof_C_MStats, unsafe.Sizeof(memStats)) - throw("MStats vs MemStatsType size mismatch") - } - if unsafe.Offsetof(memstats.heap_live)%8 != 0 { println(unsafe.Offsetof(memstats.heap_live)) throw("memstats.heap_live not aligned to 8 bytes") @@ -469,14 +452,55 @@ func ReadMemStats(m *MemStats) { func readmemstats_m(stats *MemStats) { updatememstats() - // The size of the trailing by_size array differs between - // mstats and MemStats. NumSizeClasses was changed, but we - // cannot change MemStats because of backward compatibility. - memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats) - + stats.Alloc = memstats.alloc + stats.TotalAlloc = memstats.total_alloc + stats.Sys = memstats.sys + stats.Mallocs = memstats.nmalloc + stats.Frees = memstats.nfree + stats.HeapAlloc = memstats.heap_alloc + stats.HeapSys = memstats.heap_sys.load() + stats.HeapIdle = memstats.heap_idle + stats.HeapInuse = memstats.heap_inuse + stats.HeapReleased = memstats.heap_released + stats.HeapObjects = memstats.heap_objects + stats.StackInuse = memstats.stacks_inuse // memstats.stacks_sys is only memory mapped directly for OS stacks. // Add in heap-allocated stack memory for user consumption. 
- stats.StackSys += stats.StackInuse + stats.StackSys = memstats.stacks_inuse + memstats.stacks_sys.load() + stats.MSpanInuse = memstats.mspan_inuse + stats.MSpanSys = memstats.mspan_sys.load() + stats.MCacheInuse = memstats.mcache_inuse + stats.MCacheSys = memstats.mcache_sys.load() + stats.BuckHashSys = memstats.buckhash_sys.load() + stats.GCSys = memstats.gc_sys.load() + stats.OtherSys = memstats.other_sys.load() + stats.NextGC = memstats.next_gc + stats.LastGC = memstats.last_gc_unix + stats.PauseTotalNs = memstats.pause_total_ns + stats.PauseNs = memstats.pause_ns + stats.PauseEnd = memstats.pause_end + stats.NumGC = memstats.numgc + stats.NumForcedGC = memstats.numforcedgc + stats.GCCPUFraction = memstats.gc_cpu_fraction + stats.EnableGC = true + + // Handle BySize. Copy N values, where N is + // the minimum of the lengths of the two arrays. + // Unfortunately copy() won't work here because + // the arrays have different structs. + // + // TODO(mknyszek): Consider renaming the fields + // of by_size's elements to align so we can use + // the copy built-in. + bySizeLen := len(stats.BySize) + if l := len(memstats.by_size); l < bySizeLen { + bySizeLen = l + } + for i := 0; i < bySizeLen; i++ { + stats.BySize[i].Size = memstats.by_size[i].size + stats.BySize[i].Mallocs = memstats.by_size[i].nmalloc + stats.BySize[i].Frees = memstats.by_size[i].nfree + } } //go:linkname readGCStats runtime/debug.readGCStats -- cgit v1.2.1 From ad863ba32a2ede207d708fa15897e9de1d14dd87 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 3 Aug 2020 19:23:30 +0000 Subject: runtime: break down memstats.gc_sys This change breaks apart gc_sys into three distinct pieces. Two of those pieces are pieces which come from heap_sys since they're allocated from the page heap. The rest comes from memory mapped from e.g. persistentalloc which better fits the purpose of a sysMemStat. Also, rename gc_sys to gcMiscSys. Change-Id: I098789170052511e7b31edbcdc9a53e5c24573f7 Reviewed-on: https://go-review.googlesource.com/c/go/+/246973 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/heapdump.go | 5 ++++- src/runtime/malloc.go | 6 +++--- src/runtime/mcheckmark.go | 2 +- src/runtime/mfinal.go | 2 +- src/runtime/mheap.go | 16 ++++++++++------ src/runtime/mspanset.go | 4 ++-- src/runtime/mstats.go | 31 ++++++++++++++++++------------- 7 files changed, 39 insertions(+), 27 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 495ecc5164..eed47930f0 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -540,6 +540,9 @@ func dumpms() { } func dumpmemstats() { + // These ints should be identical to the exported + // MemStats structure and should be ordered the same + // way too. 
dumpint(tagMemStats) dumpint(memstats.alloc) dumpint(memstats.total_alloc) @@ -560,7 +563,7 @@ func dumpmemstats() { dumpint(memstats.mcache_inuse) dumpint(memstats.mcache_sys.load()) dumpint(memstats.buckhash_sys.load()) - dumpint(memstats.gc_sys.load()) + dumpint(memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse) dumpint(memstats.other_sys.load()) dumpint(memstats.next_gc) dumpint(memstats.last_gc_unix) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 27d678d917..ee22bad58c 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -743,9 +743,9 @@ mapped: throw("arena already initialized") } var r *heapArena - r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys)) + r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys)) if r == nil { - r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys)) + r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys)) if r == nil { throw("out of memory allocating heap arena metadata") } @@ -757,7 +757,7 @@ mapped: if size == 0 { size = physPageSize } - newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gc_sys)) + newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gcMiscSys)) if newArray == nil { throw("out of memory allocating allArenas") } diff --git a/src/runtime/mcheckmark.go b/src/runtime/mcheckmark.go index 1fd8e4e78f..c0b028d715 100644 --- a/src/runtime/mcheckmark.go +++ b/src/runtime/mcheckmark.go @@ -41,7 +41,7 @@ func startCheckmarks() { if bitmap == nil { // Allocate bitmap on first use. - bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gc_sys)) + bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gcMiscSys)) if bitmap == nil { throw("out of memory allocating checkmarks bitmap") } diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index 6676ae6736..6ec5133be0 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -88,7 +88,7 @@ func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot lock(&finlock) if finq == nil || finq.cnt == uint32(len(finq.fin)) { if finc == nil { - finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys)) + finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gcMiscSys)) finc.alllink = allfin allfin = finc if finptrmask[0] == 0 { diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 27c1bfbcf1..1624a04b9d 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -713,7 +713,7 @@ func (h *mheap) init() { h.central[i].mcentral.init(spanClass(i)) } - h.pages.init(&h.lock, &memstats.gc_sys) + h.pages.init(&h.lock, &memstats.gcMiscSys) } // reclaim sweeps and reclaims at least npage pages into the heap. @@ -1230,8 +1230,10 @@ HaveSpan: atomic.Xadd64(&memstats.heap_inuse, int64(nbytes)) case spanAllocStack: atomic.Xadd64(&memstats.stacks_inuse, int64(nbytes)) - case spanAllocPtrScalarBits, spanAllocWorkBuf: - memstats.gc_sys.add(int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xadd64(&memstats.gcWorkBufInUse, int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xadd64(&memstats.gcProgPtrScalarBitsInUse, int64(nbytes)) } if typ.manual() { // Manually managed memory doesn't count toward heap_sys. 
@@ -1406,8 +1408,10 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes)) case spanAllocStack: atomic.Xadd64(&memstats.stacks_inuse, -int64(nbytes)) - case spanAllocPtrScalarBits, spanAllocWorkBuf: - memstats.gc_sys.add(-int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xadd64(&memstats.gcWorkBufInUse, -int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xadd64(&memstats.gcProgPtrScalarBitsInUse, -int64(nbytes)) } if typ.manual() { // Manually managed memory doesn't count toward heap_sys, so add it back. @@ -1956,7 +1960,7 @@ func newArenaMayUnlock() *gcBitsArena { var result *gcBitsArena if gcBitsArenas.free == nil { unlock(&gcBitsArenas.lock) - result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys)) + result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gcMiscSys)) if result == nil { throw("runtime: cannot allocate memory") } diff --git a/src/runtime/mspanset.go b/src/runtime/mspanset.go index 490eed4549..10d2596c38 100644 --- a/src/runtime/mspanset.go +++ b/src/runtime/mspanset.go @@ -102,7 +102,7 @@ retry: if newCap == 0 { newCap = spanSetInitSpineCap } - newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys) + newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gcMiscSys) if b.spineCap != 0 { // Blocks are allocated off-heap, so // no write barriers. @@ -283,7 +283,7 @@ func (p *spanSetBlockAlloc) alloc() *spanSetBlock { if s := (*spanSetBlock)(p.stack.pop()); s != nil { return s } - return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gc_sys)) + return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gcMiscSys)) } // free returns a spanSetBlock back to the pool. diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 466f33836c..967fe6e2be 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -44,15 +44,17 @@ type mstats struct { // Statistics about allocation of low-level fixed-size structures. // Protected by FixAlloc locks. - stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW - stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys - mspan_inuse uint64 // mspan structures - mspan_sys sysMemStat - mcache_inuse uint64 // mcache structures - mcache_sys sysMemStat - buckhash_sys sysMemStat // profiling bucket hash table - gc_sys sysMemStat // updated atomically or during STW - other_sys sysMemStat // updated atomically or during STW + stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW + stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys + mspan_inuse uint64 // mspan structures + mspan_sys sysMemStat + mcache_inuse uint64 // mcache structures + mcache_sys sysMemStat + buckhash_sys sysMemStat // profiling bucket hash table + gcWorkBufInUse uint64 // updated atomically or during STW + gcProgPtrScalarBitsInUse uint64 // updated atomically or during STW + gcMiscSys sysMemStat // updated atomically or during STW + other_sys sysMemStat // updated atomically or during STW // Statistics about the garbage collector. 
@@ -472,7 +474,10 @@ func readmemstats_m(stats *MemStats) { stats.MCacheInuse = memstats.mcache_inuse stats.MCacheSys = memstats.mcache_sys.load() stats.BuckHashSys = memstats.buckhash_sys.load() - stats.GCSys = memstats.gc_sys.load() + // MemStats defines GCSys as an aggregate of all memory related + // to the memory management system, but we track this memory + // at a more granular level in the runtime. + stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse stats.OtherSys = memstats.other_sys.load() stats.NextGC = memstats.next_gc stats.LastGC = memstats.last_gc_unix @@ -557,11 +562,11 @@ func updatememstats() { memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse) memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse) memstats.sys = memstats.heap_sys.load() + memstats.stacks_sys.load() + memstats.mspan_sys.load() + - memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gc_sys.load() + + memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gcMiscSys.load() + memstats.other_sys.load() - // We also count stacks_inuse as sys memory. - memstats.sys += memstats.stacks_inuse + // We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory. + memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse // Calculate memory allocator stats. // During program execution we only count number of frees and amount of freed memory. -- cgit v1.2.1 From c5dea8f38726572ddc161e5d169a453639edb7b1 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 3 Aug 2020 19:27:59 +0000 Subject: runtime: remove memstats.heap_idle This statistic is updated in many places but for MemStats may be computed from existing statistics. Specifically by definition heap_idle = heap_sys - heap_inuse since heap_sys is all memory allocated from the OS for use in the heap minus memory used for non-heap purposes. heap_idle is almost the same (since it explicitly includes memory that *could* be used for non-heap purposes) but also doesn't include memory that's actually used to hold heap objects. Although it has some utility as a sanity check, it complicates accounting and we want fewer, orthogonal statistics for upcoming metrics changes, so just drop it. Change-Id: I40af54a38e335f43249f6e218f35088bfd4380d1 Reviewed-on: https://go-review.googlesource.com/c/go/+/246974 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/heapdump.go | 2 +- src/runtime/mheap.go | 3 --- src/runtime/mstats.go | 19 +++++++++++++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index eed47930f0..f96475e848 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -552,7 +552,7 @@ func dumpmemstats() { dumpint(memstats.nfree) dumpint(memstats.heap_alloc) dumpint(memstats.heap_sys.load()) - dumpint(memstats.heap_idle) + dumpint(memstats.heap_sys.load() - memstats.heap_inuse) dumpint(memstats.heap_inuse) dumpint(memstats.heap_released) dumpint(memstats.heap_objects) diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 1624a04b9d..87d2fd495b 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -1239,7 +1239,6 @@ HaveSpan: // Manually managed memory doesn't count toward heap_sys. memstats.heap_sys.add(-int64(nbytes)) } - atomic.Xadd64(&memstats.heap_idle, -int64(nbytes)) // Publish the span in various locations. 
@@ -1317,7 +1316,6 @@ func (h *mheap) grow(npage uintptr) bool { // size which is always > physPageSize, so its safe to // just add directly to heap_released. atomic.Xadd64(&memstats.heap_released, int64(asize)) - atomic.Xadd64(&memstats.heap_idle, int64(asize)) // Recalculate nBase. // We know this won't overflow, because sysAlloc returned @@ -1417,7 +1415,6 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { // Manually managed memory doesn't count toward heap_sys, so add it back. memstats.heap_sys.add(int64(nbytes)) } - atomic.Xadd64(&memstats.heap_idle, int64(nbytes)) // Mark the space as free. h.pages.free(s.base(), s.npages) diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 967fe6e2be..43f74273f7 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -34,7 +34,6 @@ type mstats struct { // in manually-managed spans. heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above) heap_sys sysMemStat // virtual address space obtained from system for GC'd heap - heap_idle uint64 // bytes in idle spans heap_inuse uint64 // bytes in mSpanInUse spans heap_released uint64 // bytes released to the os @@ -461,7 +460,23 @@ func readmemstats_m(stats *MemStats) { stats.Frees = memstats.nfree stats.HeapAlloc = memstats.heap_alloc stats.HeapSys = memstats.heap_sys.load() - stats.HeapIdle = memstats.heap_idle + // By definition, HeapIdle is memory that was mapped + // for the heap but is not currently used to hold heap + // objects. It also specifically is memory that can be + // used for other purposes, like stacks, but this memory + // is subtracted out of HeapSys before it makes that + // transition. Put another way: + // + // heap_sys = bytes allocated from the OS for the heap - bytes ultimately used for non-heap purposes + // heap_idle = bytes allocated from the OS for the heap - bytes ultimately used for any purpose + // + // or + // + // heap_sys = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse + // heap_idle = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse - heap_inuse + // + // => heap_idle = heap_sys - heap_inuse + stats.HeapIdle = memstats.heap_sys.load() - memstats.heap_inuse stats.HeapInuse = memstats.heap_inuse stats.HeapReleased = memstats.heap_released stats.HeapObjects = memstats.heap_objects -- cgit v1.2.1 From ae585ee52c2437bfd0e955ad6fc8911bf292f51d Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 3 Aug 2020 19:31:23 +0000 Subject: runtime: remove memstats.heap_alloc memstats.heap_alloc is 100% a duplicate and unnecessary copy of memstats.alloc which exists because MemStats used to be populated from memstats via a memmove. 
Change-Id: I995489f61be39786e573b8494a8ab6d4ea8bed9c Reviewed-on: https://go-review.googlesource.com/c/go/+/246975 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/heapdump.go | 2 +- src/runtime/mstats.go | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index f96475e848..6fcd9746af 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -550,7 +550,7 @@ func dumpmemstats() { dumpint(memstats.nlookup) dumpint(memstats.nmalloc) dumpint(memstats.nfree) - dumpint(memstats.heap_alloc) + dumpint(memstats.alloc) dumpint(memstats.heap_sys.load()) dumpint(memstats.heap_sys.load() - memstats.heap_inuse) dumpint(memstats.heap_inuse) diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 43f74273f7..a6e38d1c1b 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -32,7 +32,6 @@ type mstats struct { // // Like MemStats, heap_sys and heap_inuse do not count memory // in manually-managed spans. - heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above) heap_sys sysMemStat // virtual address space obtained from system for GC'd heap heap_inuse uint64 // bytes in mSpanInUse spans heap_released uint64 // bytes released to the os @@ -112,11 +111,10 @@ type mstats struct { // heap_live is the number of bytes considered live by the GC. // That is: retained by the most recent GC plus allocated - // since then. heap_live <= heap_alloc, since heap_alloc - // includes unmarked objects that have not yet been swept (and - // hence goes up as we allocate and down as we sweep) while - // heap_live excludes these objects (and hence only goes up - // between GCs). + // since then. heap_live <= alloc, since alloc includes unmarked + // objects that have not yet been swept (and hence goes up as we + // allocate and down as we sweep) while heap_live excludes these + // objects (and hence only goes up between GCs). // // This is updated atomically without locking. To reduce // contention, this is updated only when obtaining a span from @@ -458,7 +456,7 @@ func readmemstats_m(stats *MemStats) { stats.Sys = memstats.sys stats.Mallocs = memstats.nmalloc stats.Frees = memstats.nfree - stats.HeapAlloc = memstats.heap_alloc + stats.HeapAlloc = memstats.alloc stats.HeapSys = memstats.heap_sys.load() // By definition, HeapIdle is memory that was mapped // for the heap but is not currently used to hold heap @@ -639,7 +637,6 @@ func updatememstats() { // Calculate derived stats. memstats.total_alloc = totalAlloc memstats.alloc = totalAlloc - totalFree - memstats.heap_alloc = memstats.alloc memstats.heap_objects = memstats.nmalloc - memstats.nfree } -- cgit v1.2.1 From c02134abb01e019683daf051029d66b15dd11213 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 3 Aug 2020 20:08:25 +0000 Subject: runtime: add helper for getting an mcache in allocation contexts This change adds a function getMCache which returns the current P's mcache if it's available, and otherwise tries to get mcache0 if we're bootstrapping. This function will come in handy as we need to replicate this behavior in multiple places in future changes. 
Change-Id: I536073d6f6dc6c6390269e613ead9f8bcb6e7f98 Reviewed-on: https://go-review.googlesource.com/c/go/+/246976 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/malloc.go | 25 ++----------------------- src/runtime/mcache.go | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 23 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index ee22bad58c..6383c34817 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -972,19 +972,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { shouldhelpgc := false dataSize := size - var c *mcache - if mp.p != 0 { - c = mp.p.ptr().mcache - } else { - // We will be called without a P while bootstrapping, - // in which case we use mcache0, which is set in mallocinit. - // mcache0 is cleared when bootstrapping is complete, - // by procresize. - c = mcache0 - if c == nil { - throw("malloc called with no P") - } - } + c := getMCache() var span *mspan var x unsafe.Pointer noscan := typ == nil || typ.ptrdata == 0 @@ -1212,16 +1200,7 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer { } func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { - var c *mcache - if mp.p != 0 { - c = mp.p.ptr().mcache - } else { - c = mcache0 - if c == nil { - throw("profilealloc called with no P") - } - } - c.nextSample = nextSample() + getMCache().nextSample = nextSample() mProf_Malloc(x, size) } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index c3e0e5e1f7..5564e4a47d 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -131,6 +131,29 @@ func freemcache(c *mcache, recipient *mcache) { }) } +// getMCache is a convenience function which tries to obtain an mcache. +// +// Must be running with a P when called (so the caller must be in a +// non-preemptible state) or must be called during bootstrapping. +func getMCache() *mcache { + // Grab the mcache, since that's where stats live. + pp := getg().m.p.ptr() + var c *mcache + if pp == nil { + // We will be called without a P while bootstrapping, + // in which case we use mcache0, which is set in mallocinit. + // mcache0 is cleared when bootstrapping is complete, + // by procresize. + c = mcache0 + if c == nil { + throw("getMCache called with no P or outside bootstrapping") + } + } else { + c = pp.mcache + } + return c +} + // donate flushes data and resources which have no global // pool to another mcache. func (c *mcache) donate(d *mcache) { -- cgit v1.2.1 From 2159c26ceb32bbfa86036431750c0752fca84ef6 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Tue, 14 Apr 2020 21:06:26 +0000 Subject: runtime/metrics: add package interface This change creates the runtime/metrics package and adds the initial interface as laid out in the design document. For #37112. 
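To make the shape of the interface concrete, here is a sketch (an editor's
illustration, not part of the CL) of how a consumer is expected to use the package
once metrics are registered and Read is implemented in follow-up CLs; today All
returns an empty slice and Read panics with "unimplemented":

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	func main() {
		// Discover every metric supported by this Go implementation.
		descs := metrics.All()

		// Sample all of them in one call, re-using the slice across calls.
		samples := make([]metrics.Sample, len(descs))
		for i := range samples {
			samples[i].Name = descs[i].Name
		}
		metrics.Read(samples)

		for _, s := range samples {
			switch s.Value.Kind() {
			case metrics.KindUint64:
				fmt.Printf("%s: %d\n", s.Name, s.Value.Uint64())
			case metrics.KindFloat64:
				fmt.Printf("%s: %g\n", s.Name, s.Value.Float64())
			case metrics.KindFloat64Histogram:
				h := s.Value.Float64Histogram()
				fmt.Printf("%s: histogram with %d counts\n", s.Name, len(h.Counts))
			default:
				// KindBad: the name is unknown to this runtime.
				fmt.Printf("%s: unsupported\n", s.Name)
			}
		}
	}
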
Change-Id: I202dcee08ab008dd63bf96f7a4162f5b5f813637 Reviewed-on: https://go-review.googlesource.com/c/go/+/247040 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/metrics/description.go | 52 ++++++++++++++++++++++++++++ src/runtime/metrics/doc.go | 49 +++++++++++++++++++++++++++ src/runtime/metrics/histogram.go | 30 +++++++++++++++++ src/runtime/metrics/sample.go | 29 ++++++++++++++++ src/runtime/metrics/value.go | 69 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 229 insertions(+) create mode 100644 src/runtime/metrics/description.go create mode 100644 src/runtime/metrics/doc.go create mode 100644 src/runtime/metrics/histogram.go create mode 100644 src/runtime/metrics/sample.go create mode 100644 src/runtime/metrics/value.go (limited to 'src/runtime') diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go new file mode 100644 index 0000000000..32bb950a72 --- /dev/null +++ b/src/runtime/metrics/description.go @@ -0,0 +1,52 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics + +// Description describes a runtime metric. +type Description struct { + // Name is the full name of the metric which includes the unit. + // + // The format of the metric may be described by the following regular expression. + // + // ^(?P/[^:]+):(?P[^:*\/]+(?:[*\/][^:*\/]+)*)$ + // + // The format splits the name into two components, separated by a colon: a path which always + // starts with a /, and a machine-parseable unit. The name may contain any valid Unicode + // codepoint in between / characters, but by convention will try to stick to lowercase + // characters and hyphens. An example of such a path might be "/memory/heap/free". + // + // The unit is by convention a series of lowercase English unit names (singular or plural) + // without prefixes delimited by '*' or '/'. The unit names may contain any valid Unicode + // codepoint that is not a delimiter. + // Examples of units might be "seconds", "bytes", "bytes/second", "cpu-seconds", + // "byte*cpu-seconds", and "bytes/second/second". + // + // A complete name might look like "/memory/heap/free:bytes". + Name string + + // Kind is the kind of value for this metric. + // + // The purpose of this field is to allow users to filter out metrics whose values are + // types which their application may not understand. + Kind ValueKind + + // Cumulative is whether or not the metric is cumulative. If a cumulative metric is just + // a single number, then it increases monotonically. If the metric is a distribution, + // then each bucket count increases monotonically. + // + // This flag thus indicates whether or not it's useful to compute a rate from this value. + Cumulative bool + + // StopTheWorld is whether or not the metric requires a stop-the-world + // event in order to collect it. + StopTheWorld bool +} + +var allDesc = []Description{} + +// All returns a slice of containing metric descriptions for all supported metrics. +func All() []Description { + return allDesc +} diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go new file mode 100644 index 0000000000..b48c22ba30 --- /dev/null +++ b/src/runtime/metrics/doc.go @@ -0,0 +1,49 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +/* +Package metrics provides a stable interface to access implementation-defined +metrics exported by the Go runtime. This package is similar to existing functions +like runtime.ReadMemStats and debug.ReadGCStats, but significantly more general. + +The set of metrics defined by this package may evolve as the runtime itself +evolves, and also enables variation across Go implementations, whose relevant +metric sets may not intersect. + +Interface + +Metrics are designated by a string key, rather than, for example, a field name in +a struct. The full list of supported metrics is always available in the slice of +Descriptions returned by All. Each Description also includes useful information +about the metric, such as how to display it (e.g. gauge vs. counter) and how difficult +or disruptive it is to obtain it (e.g. do you need to stop the world?). + +Thus, users of this API are encouraged to sample supported metrics defined by the +slice returned by All to remain compatible across Go versions. Of course, situations +arise where reading specific metrics is critical. For these cases, users are +encouranged to use build tags, and although metrics may be deprecated and removed, +users should consider this to be an exceptional and rare event, coinciding with a +very large change in a particular Go implementation. + +Each metric key also has a "kind" that describes the format of the metric's value. +In the interest of not breaking users of this package, the "kind" for a given metric +is guaranteed not to change. If it must change, then a new metric will be introduced +with a new key and a new "kind." + +Metric key format + +As mentioned earlier, metric keys are strings. Their format is simple and well-defined, +designed to be both human and machine readable. It is split into two components, +separated by a colon: a rooted path and a unit. The choice to include the unit in +the key is motivated by compatibility: if a metric's unit changes, its semantics likely +did also, and a new key should be introduced. + +For more details on the precise definition of the metric key's path and unit formats, see +the documentation of the Name field of the Description struct. + +Supported metrics + +TODO(mknyszek): List them here as they're added. +*/ +package metrics diff --git a/src/runtime/metrics/histogram.go b/src/runtime/metrics/histogram.go new file mode 100644 index 0000000000..e1364e1e26 --- /dev/null +++ b/src/runtime/metrics/histogram.go @@ -0,0 +1,30 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics + +// Float64Histogram represents a distribution of float64 values. +type Float64Histogram struct { + // Counts contains the weights for each histogram bucket. The length of + // Counts is equal to the length of Buckets (in the metric description) + // plus one to account for the implicit minimum bucket. + // + // Given N buckets, the following is the mathematical relationship between + // Counts and Buckets. + // count[0] is the weight of the range (-inf, bucket[0]) + // count[n] is the weight of the range [bucket[n], bucket[n+1]), for 0 < n < N-1 + // count[N-1] is the weight of the range [bucket[N-1], inf) + Counts []uint64 + + // Buckets contains the boundaries between histogram buckets, in increasing order. 
+ // + // Because this slice contains boundaries, there are len(Buckets)+1 counts: + // a count for all values less than the first boundary, a count covering each + // [slice[i], slice[i+1]) interval, and a count for all values greater than or + // equal to the last boundary. + // + // For a given metric name, the value of Buckets is guaranteed not to change + // between calls until program exit. + Buckets []float64 +} diff --git a/src/runtime/metrics/sample.go b/src/runtime/metrics/sample.go new file mode 100644 index 0000000000..c7a3fc424a --- /dev/null +++ b/src/runtime/metrics/sample.go @@ -0,0 +1,29 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics + +// Sample captures a single metric sample. +type Sample struct { + // Name is the name of the metric sampled. + // + // It must correspond to a name in one of the metric descriptions + // returned by Descriptions. + Name string + + // Value is the value of the metric sample. + Value Value +} + +// Read populates each Value field in the given slice of metric samples. +// +// Desired metrics should be present in the slice with the appropriate name. +// The user of this API is encouraged to re-use the same slice between calls. +// +// Metric values with names not appearing in the value returned by Descriptions +// will have the value populated as KindBad to indicate that the name is +// unknown. +func Read(m []Sample) { + panic("unimplemented") +} diff --git a/src/runtime/metrics/value.go b/src/runtime/metrics/value.go new file mode 100644 index 0000000000..0b056b4ea8 --- /dev/null +++ b/src/runtime/metrics/value.go @@ -0,0 +1,69 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics + +import ( + "math" + "unsafe" +) + +// ValueKind is a tag for a metric Value which indicates its type. +type ValueKind int + +const ( + // KindBad indicates that the Value has no type and should not be used. + KindBad ValueKind = iota + + // KindUint64 indicates that the type of the Value is a uint64. + KindUint64 + + // KindFloat64 indicates that the type of the Value is a float64. + KindFloat64 + + // KindFloat64Histogram indicates that the type of the Value is a *Float64Histogram. + KindFloat64Histogram +) + +// Value represents a metric value returned by the runtime. +type Value struct { + kind ValueKind + scalar uint64 // contains scalar values for scalar Kinds. + pointer unsafe.Pointer // contains non-scalar values. +} + +// Kind returns the a tag representing the kind of value this is. +func (v Value) Kind() ValueKind { + return v.kind +} + +// Uint64 returns the internal uint64 value for the metric. +// +// If v.Kind() != KindUint64, this method panics. +func (v Value) Uint64() uint64 { + if v.kind != KindUint64 { + panic("called Uint64 on non-uint64 metric value") + } + return v.scalar +} + +// Float64 returns the internal float64 value for the metric. +// +// If v.Kind() != KindFloat64, this method panics. +func (v Value) Float64() float64 { + if v.kind != KindFloat64 { + panic("called Float64 on non-float64 metric value") + } + return math.Float64frombits(v.scalar) +} + +// Float64Histogram returns the internal *Float64Histogram value for the metric. +// +// If v.Kind() != KindFloat64Histogram, this method panics. 
+func (v Value) Float64Histogram() *Float64Histogram { + if v.kind != KindFloat64Histogram { + panic("called Float64 on non-float64 metric value") + } + return (*Float64Histogram)(v.pointer) +} -- cgit v1.2.1 From fe7ff71185cf30f9bdee3e8d8897e8b6069ad02e Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 3 Aug 2020 20:11:04 +0000 Subject: runtime: add consistent heap statistics This change adds a global set of heap statistics which are similar to existing memory statistics. The purpose of these new statistics is to be able to read them and get a consistent result without stopping the world. The goal is to eventually replace as many of the existing memstats statistics with the sharded ones as possible. The consistent memory statistics use a tailor-made synchronization mechanism to allow writers (allocators) to proceed with minimal synchronization by using a sequence counter and a global generation counter to determine which set of statistics to update. Readers increment the global generation counter to effectively grab a snapshot of the statistics, and then iterate over all Ps using the sequence counter to ensure that they may safely read the snapshotted statistics. To keep statistics fresh, the reader also has a responsibility to merge sets of statistics. These consistent statistics are computed, but otherwise unused for now. Upcoming changes will integrate them with the rest of the codebase and will begin to phase out existing statistics. Change-Id: I637a11f2439e2049d7dccb8650c5d82500733ca5 Reviewed-on: https://go-review.googlesource.com/c/go/+/247037 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/mcache.go | 4 + src/runtime/mgcscavenge.go | 11 ++- src/runtime/mheap.go | 34 +++++++++ src/runtime/mstats.go | 184 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 230 insertions(+), 3 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 5564e4a47d..e27a1c9ec0 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -61,6 +61,10 @@ type mcache struct { // in this mcache are stale and need to the flushed so they // can be swept. This is done in acquirep. flushGen uint32 + + // statsSeq is a counter indicating whether this P is currently + // writing any stats. Its value is even when not, odd when it is. + statsSeq uint32 } // A gclink is a node in a linked list of blocks, like mlink, diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go index 8b1a0be353..5843ada981 100644 --- a/src/runtime/mgcscavenge.go +++ b/src/runtime/mgcscavenge.go @@ -711,7 +711,16 @@ func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr // Update global accounting only when not in test, otherwise // the runtime's accounting will be wrong. - atomic.Xadd64(&memstats.heap_released, int64(npages)*pageSize) + nbytes := int64(npages) * pageSize + atomic.Xadd64(&memstats.heap_released, nbytes) + + // Update consistent accounting too. + c := getMCache() + stats := memstats.heapStats.acquire(c) + atomic.Xaddint64(&stats.committed, -nbytes) + atomic.Xaddint64(&stats.released, nbytes) + memstats.heapStats.release(c) + return addr } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 87d2fd495b..d17b6fa284 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -1239,6 +1239,22 @@ HaveSpan: // Manually managed memory doesn't count toward heap_sys. memstats.heap_sys.add(-int64(nbytes)) } + // Update consistent stats. 
+ c := getMCache() + stats := memstats.heapStats.acquire(c) + atomic.Xaddint64(&stats.committed, int64(scav)) + atomic.Xaddint64(&stats.released, -int64(scav)) + switch typ { + case spanAllocHeap: + atomic.Xaddint64(&stats.inHeap, int64(nbytes)) + case spanAllocStack: + atomic.Xaddint64(&stats.inStacks, int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xaddint64(&stats.inPtrScalarBits, int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes)) + } + memstats.heapStats.release(c) // Publish the span in various locations. @@ -1316,6 +1332,10 @@ func (h *mheap) grow(npage uintptr) bool { // size which is always > physPageSize, so its safe to // just add directly to heap_released. atomic.Xadd64(&memstats.heap_released, int64(asize)) + c := getMCache() + stats := memstats.heapStats.acquire(c) + atomic.Xaddint64(&stats.released, int64(asize)) + memstats.heapStats.release(c) // Recalculate nBase. // We know this won't overflow, because sysAlloc returned @@ -1415,6 +1435,20 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { // Manually managed memory doesn't count toward heap_sys, so add it back. memstats.heap_sys.add(int64(nbytes)) } + // Update consistent stats. + c := getMCache() + stats := memstats.heapStats.acquire(c) + switch typ { + case spanAllocHeap: + atomic.Xaddint64(&stats.inHeap, -int64(nbytes)) + case spanAllocStack: + atomic.Xaddint64(&stats.inStacks, -int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xaddint64(&stats.inPtrScalarBits, -int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes)) + } + memstats.heapStats.release(c) // Mark the space as free. h.pages.free(s.base(), s.npages) diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index a6e38d1c1b..76546c0f0c 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -148,6 +148,9 @@ type mstats struct { // unlike heap_live, heap_marked does not change until the // next mark termination. heap_marked uint64 + + // heapStats is a set of statistics + heapStats consistentHeapStats } var memstats mstats @@ -426,10 +429,20 @@ type MemStats struct { } func init() { - if unsafe.Offsetof(memstats.heap_live)%8 != 0 { - println(unsafe.Offsetof(memstats.heap_live)) + if offset := unsafe.Offsetof(memstats.heap_live); offset%8 != 0 { + println(offset) throw("memstats.heap_live not aligned to 8 bytes") } + if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 { + println(offset) + throw("memstats.heapStats not aligned to 8 bytes") + } + // Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g. + // [3]heapStatsDelta) to be 8-byte aligned. + if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 { + println(size) + throw("heapStatsDelta not a multiple of 8 bytes in size") + } } // ReadMemStats populates m with memory allocator statistics. @@ -687,3 +700,170 @@ func (s *sysMemStat) add(n int64) { throw("sysMemStat overflow") } } + +// heapStatsDelta contains deltas of various runtime memory statistics +// that need to be updated together in order for them to be kept +// consistent with one another. 
+type heapStatsDelta struct { + committed int64 // byte delta of memory committed + released int64 // byte delta of released memory generated + inHeap int64 // byte delta of memory placed in the heap + inStacks int64 // byte delta of memory reserved for stacks + inWorkBufs int64 // byte delta of memory reserved for work bufs + inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits +} + +// merge adds in the deltas from b into a. +func (a *heapStatsDelta) merge(b *heapStatsDelta) { + a.committed += b.committed + a.released += b.released + a.inHeap += b.inHeap + a.inStacks += b.inStacks + a.inWorkBufs += b.inWorkBufs + a.inPtrScalarBits += b.inPtrScalarBits +} + +// consistentHeapStats represents a set of various memory statistics +// whose updates must be viewed completely to get a consistent +// state of the world. +// +// To write updates to memory stats use the acquire and release +// methods. To obtain a consistent global snapshot of these statistics, +// use read. +type consistentHeapStats struct { + // stats is a ring buffer of heapStatsDelta values. + // Writers always atomically update the delta at index gen. + // + // Readers operate by rotating gen (0 -> 1 -> 2 -> 0 -> ...) + // and synchronizing with writers by observing each mcache's + // statsSeq field. If the reader observes a P (to which the + // mcache is bound) not writing, it can be sure that it will + // pick up the new gen value the next time it writes. + // The reader then takes responsibility by clearing space + // in the ring buffer for the next reader to rotate gen to + // that space (i.e. it merges in values from index (gen-2) mod 3 + // to index (gen-1) mod 3, then clears the former). + // + // Note that this means only one reader can be reading at a time. + // There is no way for readers to synchronize. + // + // This process is why we need ring buffer of size 3 instead + // of 2: one is for the writers, one contains the most recent + // data, and the last one is clear so writers can begin writing + // to it the moment gen is updated. + stats [3]heapStatsDelta + + // gen represents the current index into which writers + // are writing, and can take on the value of 0, 1, or 2. + // This value is updated atomically. + gen uint32 +} + +// acquire returns a heapStatsDelta to be updated. In effect, +// it acquires the shard for writing. release must be called +// as soon as the relevant deltas are updated. c must be +// a valid mcache not being used by any other thread. +// +// The returned heapStatsDelta must be updated atomically. +// +// Note however, that this is unsafe to call concurrently +// with other writers and there must be only one writer +// at a time. +func (m *consistentHeapStats) acquire(c *mcache) *heapStatsDelta { + seq := atomic.Xadd(&c.statsSeq, 1) + if seq%2 == 0 { + // Should have been incremented to odd. + print("runtime: seq=", seq, "\n") + throw("bad sequence number") + } + gen := atomic.Load(&m.gen) % 3 + return &m.stats[gen] +} + +// release indicates that the writer is done modifying +// the delta. The value returned by the corresponding +// acquire must no longer be accessed or modified after +// release is called. +// +// The mcache passed here must be the same as the one +// passed to acquire. +func (m *consistentHeapStats) release(c *mcache) { + seq := atomic.Xadd(&c.statsSeq, 1) + if seq%2 != 0 { + // Should have been incremented to even. 
+ print("runtime: seq=", seq, "\n") + throw("bad sequence number") + } +} + +// unsafeRead aggregates the delta for this shard into out. +// +// Unsafe because it does so without any synchronization. The +// only safe time to call this is if the world is stopped or +// we're freezing the world or going down anyway (and we just +// want _some_ estimate). +func (m *consistentHeapStats) unsafeRead(out *heapStatsDelta) { + for i := range m.stats { + out.merge(&m.stats[i]) + } +} + +// unsafeClear clears the shard. +// +// Unsafe because the world must be stopped and values should +// be donated elsewhere before clearing. +func (m *consistentHeapStats) unsafeClear() { + for i := range m.stats { + m.stats[i] = heapStatsDelta{} + } +} + +// read takes a globally consistent snapshot of m +// and puts the aggregated value in out. Even though out is a +// heapStatsDelta, the resulting values should be complete and +// valid statistic values. +// +// Not safe to call concurrently. +func (m *consistentHeapStats) read(out *heapStatsDelta) { + // Getting preempted after this point is not safe because + // we read allp. We need to make sure a STW can't happen + // so it doesn't change out from under us. + mp := acquirem() + + // Rotate gen, effectively taking a snapshot of the state of + // these statistics at the point of the exchange by moving + // writers to the next set of deltas. + // + // This exchange is safe to do because we won't race + // with anyone else trying to update this value. + currGen := atomic.Load(&m.gen) + atomic.Xchg(&m.gen, (currGen+1)%3) + prevGen := currGen - 1 + if currGen == 0 { + prevGen = 2 + } + for _, p := range allp { + c := p.mcache + if c == nil { + continue + } + // Spin until there are no more writers. + for atomic.Load(&c.statsSeq)%2 != 0 { + } + } + + // At this point we've observed that each sequence + // number is even, so any future writers will observe + // the new gen value. That means it's safe to read from + // the other deltas in the stats buffer. + + // Perform our responsibilities and free up + // stats[prevGen] for the next time we want to take + // a snapshot. + m.stats[currGen].merge(&m.stats[prevGen]) + m.stats[prevGen] = heapStatsDelta{} + + // Finally, copy out the complete delta. + *out = m.stats[currGen] + releasem(mp) +} -- cgit v1.2.1 From f77a9025f1e4bf4bb3e2b582d13cce5f19c1ca51 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 3 Aug 2020 20:35:40 +0000 Subject: runtime: replace some memstats with consistent stats This change replaces stacks_inuse, gcWorkBufInUse and gcProgPtrScalarBitsInUse with their corresponding consistent stats. It also adds checks to make sure the rest of the sharded stats line up with existing stats in updatememstats. Change-Id: I17d0bd181aedb5c55e09c8dff18cef5b2a3a14e3 Reviewed-on: https://go-review.googlesource.com/c/go/+/247038 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/mheap.go | 18 ++----------- src/runtime/mstats.go | 73 ++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 29 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index d17b6fa284..14a73c0491 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -1225,15 +1225,8 @@ HaveSpan: atomic.Xadd64(&memstats.heap_released, -int64(scav)) } // Update stats. 
- switch typ { - case spanAllocHeap: + if typ == spanAllocHeap { atomic.Xadd64(&memstats.heap_inuse, int64(nbytes)) - case spanAllocStack: - atomic.Xadd64(&memstats.stacks_inuse, int64(nbytes)) - case spanAllocWorkBuf: - atomic.Xadd64(&memstats.gcWorkBufInUse, int64(nbytes)) - case spanAllocPtrScalarBits: - atomic.Xadd64(&memstats.gcProgPtrScalarBitsInUse, int64(nbytes)) } if typ.manual() { // Manually managed memory doesn't count toward heap_sys. @@ -1421,15 +1414,8 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { // // Mirrors the code in allocSpan. nbytes := s.npages * pageSize - switch typ { - case spanAllocHeap: + if typ == spanAllocHeap { atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes)) - case spanAllocStack: - atomic.Xadd64(&memstats.stacks_inuse, -int64(nbytes)) - case spanAllocWorkBuf: - atomic.Xadd64(&memstats.gcWorkBufInUse, -int64(nbytes)) - case spanAllocPtrScalarBits: - atomic.Xadd64(&memstats.gcProgPtrScalarBitsInUse, -int64(nbytes)) } if typ.manual() { // Manually managed memory doesn't count toward heap_sys, so add it back. diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 76546c0f0c..4363eff1e0 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -40,19 +40,25 @@ type mstats struct { // computed on the fly by updatememstats. heap_objects uint64 // total number of allocated objects + // Statistics about stacks. + stacks_inuse uint64 // bytes in manually-managed stack spans; computed by updatememstats + stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys + // Statistics about allocation of low-level fixed-size structures. // Protected by FixAlloc locks. - stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW - stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys - mspan_inuse uint64 // mspan structures - mspan_sys sysMemStat - mcache_inuse uint64 // mcache structures - mcache_sys sysMemStat - buckhash_sys sysMemStat // profiling bucket hash table - gcWorkBufInUse uint64 // updated atomically or during STW - gcProgPtrScalarBitsInUse uint64 // updated atomically or during STW + mspan_inuse uint64 // mspan structures + mspan_sys sysMemStat + mcache_inuse uint64 // mcache structures + mcache_sys sysMemStat + buckhash_sys sysMemStat // profiling bucket hash table + + // Statistics about GC overhead. + gcWorkBufInUse uint64 // computed by updatememstats + gcProgPtrScalarBitsInUse uint64 // computed by updatememstats gcMiscSys sysMemStat // updated atomically or during STW - other_sys sysMemStat // updated atomically or during STW + + // Miscellaneous statistics. + other_sys sysMemStat // updated atomically or during STW // Statistics about the garbage collector. @@ -577,6 +583,10 @@ func readGCStats_m(pauses *[]uint64) { *pauses = p[:n+n+3] } +// Updates the memstats structure. +// +// The world must be stopped. +// //go:nowritebarrier func updatememstats() { // Flush mcaches to mcentral before doing anything else. @@ -591,9 +601,6 @@ func updatememstats() { memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gcMiscSys.load() + memstats.other_sys.load() - // We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory. - memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse - // Calculate memory allocator stats. // During program execution we only count number of frees and amount of freed memory. 
// Current number of alive objects in the heap and amount of alive heap memory @@ -641,6 +648,9 @@ func updatememstats() { smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i]) } } + // Collect consistent stats, which are the source-of-truth in the some cases. + var consStats heapStatsDelta + memstats.heapStats.unsafeRead(&consStats) totalFree += smallFree @@ -651,6 +661,43 @@ func updatememstats() { memstats.total_alloc = totalAlloc memstats.alloc = totalAlloc - totalFree memstats.heap_objects = memstats.nmalloc - memstats.nfree + + memstats.stacks_inuse = uint64(consStats.inStacks) + memstats.gcWorkBufInUse = uint64(consStats.inWorkBufs) + memstats.gcProgPtrScalarBitsInUse = uint64(consStats.inPtrScalarBits) + + // We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory. + memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse + + // The world is stopped, so the consistent stats (after aggregation) + // should be identical to some combination of memstats. In particular: + // + // * heap_inuse == inHeap + // * heap_released == released + // * heap_sys - heap_released == committed - inStacks - inWorkBufs - inPtrScalarBits + // + // Check if that's actually true. + // + // TODO(mknyszek): Maybe don't throw here. It would be bad if a + // bug in otherwise benign accounting caused the whole application + // to crash. + if memstats.heap_inuse != uint64(consStats.inHeap) { + print("runtime: heap_inuse=", memstats.heap_inuse, "\n") + print("runtime: consistent value=", consStats.inHeap, "\n") + throw("heap_inuse and consistent stats are not equal") + } + if memstats.heap_released != uint64(consStats.released) { + print("runtime: heap_released=", memstats.heap_released, "\n") + print("runtime: consistent value=", consStats.released, "\n") + throw("heap_released and consistent stats are not equal") + } + globalRetained := memstats.heap_sys.load() - memstats.heap_released + consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits) + if globalRetained != consRetained { + print("runtime: global value=", globalRetained, "\n") + print("runtime: consistent value=", consRetained, "\n") + throw("measures of the retained heap are not equal") + } } // flushmcache flushes the mcache of allp[i]. -- cgit v1.2.1 From 79781e8dd382ac34e502ed6a088dff6860a08c05 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Tue, 4 Aug 2020 17:29:03 +0000 Subject: runtime: move malloc stats into consistentHeapStats This change moves the mcache-local malloc stats into the consistentHeapStats structure so the malloc stats can be managed consistently with the memory stats. The one exception here is tinyAllocs for which moving that into the global stats would incur several atomic writes on the fast path. Microbenchmarks for just one CPU core have shown a 50% loss in throughput. Since tiny allocation counnt isn't exposed anyway and is always blindly added to both allocs and frees, let that stay inconsistent and flush the tiny allocation count every so often. 
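The shape of that trade-off, sketched outside the runtime with hypothetical names (worker, local, global, hotPath, flush): the hot path bumps a counter that only its owning worker touches, and the pending count is folded into the shared atomic total only at an existing slow-path synchronization point (for tinyAllocs, that point is span refill and releaseAll):

	package main

	import "sync/atomic"

	// global is the shared total; it is only ever updated atomically.
	var global uint64

	// worker models per-P state: local is owned by exactly one worker,
	// so the hot path can bump it without atomics or contention.
	type worker struct {
		local uint64
	}

	// hotPath is the frequent operation; it touches only owned state.
	func (w *worker) hotPath() {
		w.local++
	}

	// flush runs on an existing slow path and publishes the pending count.
	func (w *worker) flush() {
		atomic.AddUint64(&global, w.local)
		w.local = 0
	}

	func main() {
		w := &worker{}
		for i := 0; i < 1000; i++ {
			w.hotPath()
		}
		w.flush()
		_ = atomic.LoadUint64(&global)
	}
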
Change-Id: I2a4b75f209c0e659b9c0db081a3287bf227c10ca Reviewed-on: https://go-review.googlesource.com/c/go/+/247039 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 37 ++++++++-------------- src/runtime/malloc.go | 2 +- src/runtime/mcache.go | 70 ++++++++++++++--------------------------- src/runtime/mgcsweep.go | 10 ++++-- src/runtime/mstats.go | 78 ++++++++++++++++++++++++++-------------------- src/runtime/proc.go | 2 +- 6 files changed, 90 insertions(+), 109 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index cb753ee819..ff901fd7be 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -337,33 +337,22 @@ func ReadMemStatsSlow() (base, slow MemStats) { } } - // Add in frees. readmemstats_m flushed the cached stats, so - // these are up-to-date. - var tinyAllocs, largeFree, smallFree uint64 - for _, p := range allp { - c := p.mcache - if c == nil { - continue - } - // Collect large allocation stats. - largeFree += uint64(c.largeFree) - slow.Frees += uint64(c.largeFreeCount) - - // Collect tiny allocation stats. - tinyAllocs += uint64(c.tinyAllocCount) - - // Collect per-sizeclass stats. - for i := 0; i < _NumSizeClasses; i++ { - slow.Frees += uint64(c.smallFreeCount[i]) - bySize[i].Frees += uint64(c.smallFreeCount[i]) - bySize[i].Mallocs += uint64(c.smallFreeCount[i]) - smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i]) - } + // Add in frees by just reading the stats for those directly. + var m heapStatsDelta + memstats.heapStats.unsafeRead(&m) + + // Collect per-sizeclass free stats. + var smallFree uint64 + for i := 0; i < _NumSizeClasses; i++ { + slow.Frees += uint64(m.smallFreeCount[i]) + bySize[i].Frees += uint64(m.smallFreeCount[i]) + bySize[i].Mallocs += uint64(m.smallFreeCount[i]) + smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i]) } - slow.Frees += tinyAllocs + slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount) slow.Mallocs += slow.Frees - slow.TotalAlloc = slow.Alloc + largeFree + smallFree + slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree for i := range slow.BySize { slow.BySize[i].Mallocs = bySize[i].Mallocs diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 6383c34817..d0b8c668c3 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1028,7 +1028,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // The object fits into existing tiny block. x = unsafe.Pointer(c.tiny + off) c.tinyoffset = off + size - c.tinyAllocCount++ + c.tinyAllocs++ mp.mallocing = 0 releasem(mp) return x diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index e27a1c9ec0..c9342a41c9 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -32,8 +32,12 @@ type mcache struct { // tiny is a heap pointer. Since mcache is in non-GC'd memory, // we handle it by clearing it in releaseAll during mark // termination. + // + // tinyAllocs is the number of tiny allocations performed + // by the P that owns this mcache. tiny uintptr tinyoffset uintptr + tinyAllocs uintptr // The rest is not accessed on every malloc. @@ -41,21 +45,6 @@ type mcache struct { stackcache [_NumStackOrders]stackfreelist - // Allocator stats (source-of-truth). - // Only the P that owns this mcache may write to these - // variables, so it's safe for that P to read non-atomically. 
- // - // When read with stats from other mcaches and with the world - // stopped, the result will accurately reflect the state of the - // application. - tinyAllocCount uintptr // number of tiny allocs not counted in other stats - largeAlloc uintptr // bytes allocated for large objects - largeAllocCount uintptr // number of large object allocations - smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects - largeFree uintptr // bytes freed for large objects (>maxSmallSize) - largeFreeCount uintptr // number of frees for large objects (>maxSmallSize) - smallFreeCount [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize) - // flushGen indicates the sweepgen during which this mcache // was last flushed. If flushGen != mheap_.sweepgen, the spans // in this mcache are stale and need to the flushed so they @@ -117,7 +106,7 @@ func allocmcache() *mcache { // In some cases there is no way to simply release // resources, such as statistics, so donate them to // a different mcache (the recipient). -func freemcache(c *mcache, recipient *mcache) { +func freemcache(c *mcache) { systemstack(func() { c.releaseAll() stackcache_clear(c) @@ -128,8 +117,6 @@ func freemcache(c *mcache, recipient *mcache) { // gcworkbuffree(c.gcworkbuf) lock(&mheap_.lock) - // Donate anything else that's left. - c.donate(recipient) mheap_.cachealloc.free(unsafe.Pointer(c)) unlock(&mheap_.lock) }) @@ -158,31 +145,6 @@ func getMCache() *mcache { return c } -// donate flushes data and resources which have no global -// pool to another mcache. -func (c *mcache) donate(d *mcache) { - // scanAlloc is handled separately because it's not - // like these stats -- it's used for GC pacing. - d.largeAlloc += c.largeAlloc - c.largeAlloc = 0 - d.largeAllocCount += c.largeAllocCount - c.largeAllocCount = 0 - for i := range c.smallAllocCount { - d.smallAllocCount[i] += c.smallAllocCount[i] - c.smallAllocCount[i] = 0 - } - d.largeFree += c.largeFree - c.largeFree = 0 - d.largeFreeCount += c.largeFreeCount - c.largeFreeCount = 0 - for i := range c.smallFreeCount { - d.smallFreeCount[i] += c.smallFreeCount[i] - c.smallFreeCount[i] = 0 - } - d.tinyAllocCount += c.tinyAllocCount - c.tinyAllocCount = 0 -} - // refill acquires a new span of span class spc for c. This span will // have at least one free object. The current span in c must be full. // @@ -219,12 +181,20 @@ func (c *mcache) refill(spc spanClass) { // Assume all objects from this span will be allocated in the // mcache. If it gets uncached, we'll adjust this. - c.smallAllocCount[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount) + stats := memstats.heapStats.acquire(c) + atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount)) + memstats.heapStats.release(c) // Update heap_live with the same assumption. usedBytes := uintptr(s.allocCount) * s.elemsize atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes)) + // Flush tinyAllocs. + if spc == tinySpanClass { + atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs)) + c.tinyAllocs = 0 + } + // While we're here, flush scanAlloc, since we have to call // revise anyway. 
atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc)) @@ -262,8 +232,10 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan { if s == nil { throw("out of memory") } - c.largeAlloc += npages * pageSize - c.largeAllocCount++ + stats := memstats.heapStats.acquire(c) + atomic.Xadduintptr(&stats.largeAlloc, npages*pageSize) + atomic.Xadduintptr(&stats.largeAllocCount, 1) + memstats.heapStats.release(c) // Update heap_live and revise pacing if needed. atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize)) @@ -294,7 +266,9 @@ func (c *mcache) releaseAll() { if s != &emptymspan { // Adjust nsmallalloc in case the span wasn't fully allocated. n := uintptr(s.nelems) - uintptr(s.allocCount) - c.smallAllocCount[spanClass(i).sizeclass()] -= n + stats := memstats.heapStats.acquire(c) + atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n) + memstats.heapStats.release(c) if s.sweepgen != sg+1 { // refill conservatively counted unallocated slots in heap_live. // Undo this. @@ -313,6 +287,8 @@ func (c *mcache) releaseAll() { // Clear tinyalloc pool. c.tiny = 0 c.tinyoffset = 0 + atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs)) + c.tinyAllocs = 0 // Updated heap_scan and possible heap_live. if gcBlackenEnabled != 0 { diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 7103b08455..9b77ce635c 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -503,7 +503,9 @@ func (s *mspan) sweep(preserve bool) bool { // wasn't totally filled, but then swept, still has all of its // free slots zeroed. s.needzero = 1 - c.smallFreeCount[spc.sizeclass()] += uintptr(nfreed) + stats := memstats.heapStats.acquire(c) + atomic.Xadduintptr(&stats.smallFreeCount[spc.sizeclass()], uintptr(nfreed)) + memstats.heapStats.release(c) } if !preserve { // The caller may not have removed this span from whatever @@ -548,8 +550,10 @@ func (s *mspan) sweep(preserve bool) bool { } else { mheap_.freeSpan(s) } - c.largeFreeCount++ - c.largeFree += size + stats := memstats.heapStats.acquire(c) + atomic.Xadduintptr(&stats.largeFreeCount, 1) + atomic.Xadduintptr(&stats.largeFree, size) + memstats.heapStats.release(c) return true } diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 4363eff1e0..a8eca85fe6 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -612,48 +612,36 @@ func updatememstats() { memstats.total_alloc = 0 memstats.nmalloc = 0 memstats.nfree = 0 - memstats.tinyallocs = 0 for i := 0; i < len(memstats.by_size); i++ { memstats.by_size[i].nmalloc = 0 memstats.by_size[i].nfree = 0 } - - // Collect allocation stats. This is safe and consistent - // because the world is stopped. - var smallFree, totalAlloc, totalFree uint64 - for _, p := range allp { - c := p.mcache - if c == nil { - continue - } - // Collect large allocation stats. - memstats.nmalloc += uint64(c.largeAllocCount) - totalAlloc += uint64(c.largeAlloc) - totalFree += uint64(c.largeFree) - memstats.nfree += uint64(c.largeFreeCount) - - // Collect tiny allocation stats. - memstats.tinyallocs += uint64(c.tinyAllocCount) - - // Collect per-sizeclass stats. - for i := 0; i < _NumSizeClasses; i++ { - // Malloc stats. - memstats.nmalloc += uint64(c.smallAllocCount[i]) - memstats.by_size[i].nmalloc += uint64(c.smallAllocCount[i]) - totalAlloc += uint64(c.smallAllocCount[i]) * uint64(class_to_size[i]) - - // Free stats. 
- memstats.nfree += uint64(c.smallFreeCount[i]) - memstats.by_size[i].nfree += uint64(c.smallFreeCount[i]) - smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i]) - } - } // Collect consistent stats, which are the source-of-truth in the some cases. var consStats heapStatsDelta memstats.heapStats.unsafeRead(&consStats) - totalFree += smallFree + // Collect large allocation stats. + totalAlloc := uint64(consStats.largeAlloc) + memstats.nmalloc += uint64(consStats.largeAllocCount) + totalFree := uint64(consStats.largeFree) + memstats.nfree += uint64(consStats.largeFreeCount) + + // Collect per-sizeclass stats. + for i := 0; i < _NumSizeClasses; i++ { + // Malloc stats. + a := uint64(consStats.smallAllocCount[i]) + totalAlloc += a * uint64(class_to_size[i]) + memstats.nmalloc += a + memstats.by_size[i].nmalloc = a + + // Free stats. + f := uint64(consStats.smallFreeCount[i]) + totalFree += f * uint64(class_to_size[i]) + memstats.nfree += f + memstats.by_size[i].nfree = f + } + // Account for tiny allocations. memstats.nfree += memstats.tinyallocs memstats.nmalloc += memstats.tinyallocs @@ -752,12 +740,25 @@ func (s *sysMemStat) add(n int64) { // that need to be updated together in order for them to be kept // consistent with one another. type heapStatsDelta struct { + // Memory stats. committed int64 // byte delta of memory committed released int64 // byte delta of released memory generated inHeap int64 // byte delta of memory placed in the heap inStacks int64 // byte delta of memory reserved for stacks inWorkBufs int64 // byte delta of memory reserved for work bufs inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits + + // Allocator stats. + largeAlloc uintptr // bytes allocated for large objects + largeAllocCount uintptr // number of large object allocations + smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects + largeFree uintptr // bytes freed for large objects (>maxSmallSize) + largeFreeCount uintptr // number of frees for large objects (>maxSmallSize) + smallFreeCount [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize) + + // Add a uint32 to ensure this struct is a multiple of 8 bytes in size. + // Only necessary on 32-bit platforms. + // _ [(sys.PtrSize / 4) % 2]uint32 } // merge adds in the deltas from b into a. 
@@ -768,6 +769,17 @@ func (a *heapStatsDelta) merge(b *heapStatsDelta) { a.inStacks += b.inStacks a.inWorkBufs += b.inWorkBufs a.inPtrScalarBits += b.inPtrScalarBits + + a.largeAlloc += b.largeAlloc + a.largeAllocCount += b.largeAllocCount + for i := range b.smallAllocCount { + a.smallAllocCount[i] += b.smallAllocCount[i] + } + a.largeFree += b.largeFree + a.largeFreeCount += b.largeFreeCount + for i := range b.smallFreeCount { + a.smallFreeCount[i] += b.smallFreeCount[i] + } } // consistentHeapStats represents a set of various memory statistics diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 4f4cff38aa..ebecc92745 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -4550,7 +4550,7 @@ func (pp *p) destroy() { pp.mspancache.len = 0 pp.pcache.flush(&mheap_.pages) }) - freemcache(pp.mcache, allp[0].mcache) + freemcache(pp.mcache) pp.mcache = nil gfpurge(pp) traceProcFree(pp) -- cgit v1.2.1 From b08dfbaa439e4e396b979e02ea2e7d36972e8b7a Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Wed, 1 Jul 2020 16:02:42 +0000 Subject: runtime,runtime/metrics: add memory metrics This change adds support for a variety of runtime memory metrics and contains the base implementation of Read for the runtime/metrics package, which lives in the runtime. It also adds testing infrastructure for the metrics package, and a bunch of format and documentation tests. For #37112. Change-Id: I16a2c4781eeeb2de0abcb045c15105f1210e2d8a Reviewed-on: https://go-review.googlesource.com/c/go/+/247041 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt Trust: Michael Knyszek --- src/runtime/export_test.go | 26 +++ src/runtime/metrics.go | 367 ++++++++++++++++++++++++++++++++ src/runtime/metrics/description.go | 80 ++++++- src/runtime/metrics/description_test.go | 125 +++++++++++ src/runtime/metrics/doc.go | 56 ++++- src/runtime/metrics/sample.go | 10 +- src/runtime/metrics_test.go | 114 ++++++++++ src/runtime/mstats.go | 3 +- 8 files changed, 776 insertions(+), 5 deletions(-) create mode 100644 src/runtime/metrics.go create mode 100644 src/runtime/metrics/description_test.go create mode 100644 src/runtime/metrics_test.go (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index ff901fd7be..d043fe3ee5 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -298,6 +298,32 @@ func (p *ProfBuf) Close() { (*profBuf)(p).close() } +func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) { + stopTheWorld("ReadMetricsSlow") + + // Initialize the metrics beforehand because this could + // allocate and skew the stats. + semacquire(&metricsSema) + initMetrics() + semrelease(&metricsSema) + + systemstack(func() { + // Read memstats first. It's going to flush + // the mcaches which readMetrics does not do, so + // going the other way around may result in + // inconsistent statistics. + readmemstats_m(memStats) + }) + + // Read metrics off the system stack. + // + // The only part of readMetrics that could allocate + // and skew the stats is initMetrics. + readMetrics(samplesp, len, cap) + + startTheWorld() +} + // ReadMemStatsSlow returns both the runtime-computed MemStats and // MemStats accumulated by scanning the heap. func ReadMemStatsSlow() (base, slow MemStats) { diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go new file mode 100644 index 0000000000..44b5a29751 --- /dev/null +++ b/src/runtime/metrics.go @@ -0,0 +1,367 @@ +// Copyright 2020 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Metrics implementation exported to runtime/metrics. + +import ( + "unsafe" +) + +var ( + // metrics is a map of runtime/metrics keys to + // data used by the runtime to sample each metric's + // value. + metricsSema uint32 = 1 + metricsInit bool + metrics map[string]metricData +) + +type metricData struct { + // deps is the set of runtime statistics that this metric + // depends on. Before compute is called, the statAggregate + // which will be passed must ensure() these dependencies. + deps statDepSet + + // compute is a function that populates a metricValue + // given a populated statAggregate structure. + compute func(in *statAggregate, out *metricValue) +} + +// initMetrics initializes the metrics map if it hasn't been yet. +// +// metricsSema must be held. +func initMetrics() { + if metricsInit { + return + } + metrics = map[string]metricData{ + "/memory/classes/heap/free:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.committed - in.heapStats.inHeap - + in.heapStats.inStacks - in.heapStats.inWorkBufs - + in.heapStats.inPtrScalarBits) + }, + }, + "/memory/classes/heap/objects:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.inObjects + }, + }, + "/memory/classes/heap/released:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.released) + }, + }, + "/memory/classes/heap/stacks:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inStacks) + }, + }, + "/memory/classes/heap/unused:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inHeap) - in.heapStats.inObjects + }, + }, + "/memory/classes/metadata/mcache/free:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mCacheSys - in.sysStats.mCacheInUse + }, + }, + "/memory/classes/metadata/mcache/inuse:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mCacheInUse + }, + }, + "/memory/classes/metadata/mspan/free:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mSpanSys - in.sysStats.mSpanInUse + }, + }, + "/memory/classes/metadata/mspan/inuse:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mSpanInUse + }, + }, + "/memory/classes/metadata/other:bytes": { + deps: makeStatDepSet(heapStatsDep, sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inWorkBufs+in.heapStats.inPtrScalarBits) + in.sysStats.gcMiscSys + }, + }, + "/memory/classes/os-stacks:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out 
*metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.stacksSys + }, + }, + "/memory/classes/other:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.otherSys + }, + }, + "/memory/classes/profiling/buckets:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.buckHashSys + }, + }, + "/memory/classes/total:bytes": { + deps: makeStatDepSet(heapStatsDep, sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.committed+in.heapStats.released) + + in.sysStats.stacksSys + in.sysStats.mSpanSys + + in.sysStats.mCacheSys + in.sysStats.buckHashSys + + in.sysStats.gcMiscSys + in.sysStats.otherSys + }, + }, + } + metricsInit = true +} + +// statDep is a dependency on a group of statistics +// that a metric might have. +type statDep uint + +const ( + heapStatsDep statDep = iota // corresponds to heapStatsAggregate + sysStatsDep // corresponds to sysStatsAggregate + numStatsDeps +) + +// statDepSet represents a set of statDeps. +// +// Under the hood, it's a bitmap. +type statDepSet [1]uint64 + +// makeStatDepSet creates a new statDepSet from a list of statDeps. +func makeStatDepSet(deps ...statDep) statDepSet { + var s statDepSet + for _, d := range deps { + s[d/64] |= 1 << (d % 64) + } + return s +} + +// differennce returns set difference of s from b as a new set. +func (s statDepSet) difference(b statDepSet) statDepSet { + var c statDepSet + for i := range s { + c[i] = s[i] &^ b[i] + } + return c +} + +// union returns the union of the two sets as a new set. +func (s statDepSet) union(b statDepSet) statDepSet { + var c statDepSet + for i := range s { + c[i] = s[i] | b[i] + } + return c +} + +// empty returns true if there are no dependencies in the set. +func (s *statDepSet) empty() bool { + for _, c := range s { + if c != 0 { + return false + } + } + return true +} + +// has returns true if the set contains a given statDep. +func (s *statDepSet) has(d statDep) bool { + return s[d/64]&(1<<(d%64)) != 0 +} + +// heapStatsAggregate represents memory stats obtained from the +// runtime. This set of stats is grouped together because they +// depend on each other in some way to make sense of the runtime's +// current heap memory use. They're also sharded across Ps, so it +// makes sense to grab them all at once. +type heapStatsAggregate struct { + heapStatsDelta + + // inObjects is the bytes of memory occupied by objects, + // derived from other values in heapStats. + inObjects uint64 +} + +// compute populates the heapStatsAggregate with values from the runtime. +func (a *heapStatsAggregate) compute() { + memstats.heapStats.read(&a.heapStatsDelta) + + // Calculate derived stats. + a.inObjects = uint64(a.largeAlloc - a.largeFree) + for i := range a.smallAllocCount { + a.inObjects += uint64(a.smallAllocCount[i]-a.smallFreeCount[i]) * uint64(class_to_size[i]) + } +} + +// sysStatsAggregate represents system memory stats obtained +// from the runtime. This set of stats is grouped together because +// they're all relatively cheap to acquire and generally independent +// of one another and other runtime memory stats. 
The fact that they +// may be acquired at different times, especially with respect to +// heapStatsAggregate, means there could be some skew, but because of +// these stats are independent, there's no real consistency issue here. +type sysStatsAggregate struct { + stacksSys uint64 + mSpanSys uint64 + mSpanInUse uint64 + mCacheSys uint64 + mCacheInUse uint64 + buckHashSys uint64 + gcMiscSys uint64 + otherSys uint64 +} + +// compute populates the sysStatsAggregate with values from the runtime. +func (a *sysStatsAggregate) compute() { + a.stacksSys = memstats.stacks_sys.load() + a.buckHashSys = memstats.buckhash_sys.load() + a.gcMiscSys = memstats.gcMiscSys.load() + a.otherSys = memstats.other_sys.load() + + systemstack(func() { + lock(&mheap_.lock) + a.mSpanSys = memstats.mspan_sys.load() + a.mSpanInUse = uint64(mheap_.spanalloc.inuse) + a.mCacheSys = memstats.mcache_sys.load() + a.mCacheInUse = uint64(mheap_.cachealloc.inuse) + unlock(&mheap_.lock) + }) +} + +// statAggregate is the main driver of the metrics implementation. +// +// It contains multiple aggregates of runtime statistics, as well +// as a set of these aggregates that it has populated. The aggergates +// are populated lazily by its ensure method. +type statAggregate struct { + ensured statDepSet + heapStats heapStatsAggregate + sysStats sysStatsAggregate +} + +// ensure populates statistics aggregates determined by deps if they +// haven't yet been populated. +func (a *statAggregate) ensure(deps *statDepSet) { + missing := deps.difference(a.ensured) + if missing.empty() { + return + } + for i := statDep(0); i < numStatsDeps; i++ { + if !missing.has(i) { + continue + } + switch i { + case heapStatsDep: + a.heapStats.compute() + case sysStatsDep: + a.sysStats.compute() + } + } + a.ensured = a.ensured.union(missing) +} + +// metricValidKind is a runtime copy of runtime/metrics.ValueKind and +// must be kept structurally identical to that type. +type metricKind int + +const ( + // These values must be kept identical to their corresponding Kind* values + // in the runtime/metrics package. + metricKindBad metricKind = iota + metricKindUint64 + metricKindFloat64 + metricKindFloat64Histogram +) + +// metricSample is a runtime copy of runtime/metrics.Sample and +// must be kept structurally identical to that type. +type metricSample struct { + name string + value metricValue +} + +// metricValue is a runtime copy of runtime/metrics.Sample and +// must be kept structurally identical to that type. +type metricValue struct { + kind metricKind + scalar uint64 // contains scalar values for scalar Kinds. + pointer unsafe.Pointer // contains non-scalar values. +} + +// agg is used by readMetrics, and is protected by metricsSema. +// +// Managed as a global variable because its pointer will be +// an argument to a dynamically-defined function, and we'd +// like to avoid it escaping to the heap. +var agg statAggregate + +// readMetrics is the implementation of runtime/metrics.Read. +// +//go:linkname readMetrics runtime/metrics.runtime_readMetrics +func readMetrics(samplesp unsafe.Pointer, len int, cap int) { + // Construct a slice from the args. + sl := slice{samplesp, len, cap} + samples := *(*[]metricSample)(unsafe.Pointer(&sl)) + + // Acquire the metricsSema but with handoff. This operation + // is expensive enough that queueing up goroutines and handing + // off between them will be noticably better-behaved. + semacquire1(&metricsSema, true, 0, 0) + + // Ensure the map is initialized. + initMetrics() + + // Clear agg defensively. 
+ agg = statAggregate{} + + // Sample. + for i := range samples { + sample := &samples[i] + data, ok := metrics[sample.name] + if !ok { + sample.value.kind = metricKindBad + continue + } + // Ensure we have all the stats we need. + // agg is populated lazily. + agg.ensure(&data.deps) + + // Compute the value based on the stats we have. + data.compute(&agg, &sample.value) + } + + semrelease(&metricsSema) +} diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go index 32bb950a72..2e7df7e09f 100644 --- a/src/runtime/metrics/description.go +++ b/src/runtime/metrics/description.go @@ -10,7 +10,7 @@ type Description struct { // // The format of the metric may be described by the following regular expression. // - // ^(?P/[^:]+):(?P[^:*\/]+(?:[*\/][^:*\/]+)*)$ + // ^(?P/[^:]+):(?P[^:*/]+(?:[*/][^:*/]+)*)$ // // The format splits the name into two components, separated by a colon: a path which always // starts with a /, and a machine-parseable unit. The name may contain any valid Unicode @@ -26,6 +26,9 @@ type Description struct { // A complete name might look like "/memory/heap/free:bytes". Name string + // Description is an English language sentence describing the metric. + Description string + // Kind is the kind of value for this metric. // // The purpose of this field is to allow users to filter out metrics whose values are @@ -44,7 +47,80 @@ type Description struct { StopTheWorld bool } -var allDesc = []Description{} +// The English language descriptions below must be kept in sync with the +// descriptions of each metric in doc.go. +var allDesc = []Description{ + { + Name: "/memory/classes/heap/free:bytes", + Description: "Memory that is available for allocation, and may be returned to the underlying system.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/objects:bytes", + Description: "Memory occupied by live objects and dead objects that have not yet been collected.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/released:bytes", + Description: "Memory that has been returned to the underlying system.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/stacks:bytes", + Description: "Memory allocated from the heap that is occupied by stacks.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/unused:bytes", + Description: "Memory that is unavailable for allocation, but cannot be returned to the underlying system.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mcache/free:bytes", + Description: "Memory that is reserved for runtime mcache structures, but not in-use.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mcache/inuse:bytes", + Description: "Memory that is occupied by runtime mcache structures that are currently being used.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mspan/free:bytes", + Description: "Memory that is reserved for runtime mspan structures, but not in-use.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mspan/inuse:bytes", + Description: "Memory that is occupied by runtime mspan structures that are currently being used.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/other:bytes", + Description: "Memory that is reserved for or used to hold runtime metadata.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/os-stacks:bytes", + Description: "Stack memory allocated by the underlying operating system.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/other:bytes", + Description: "Memory used by 
execution trace buffers, structures for debugging the runtime, finalizer and profiler specials, and more.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/profiling/buckets:bytes", + Description: "Memory that is used by the stack trace hash map used for profiling.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/total:bytes", + Description: "All memory mapped by the Go runtime into the current process as read-write. Note that this does not include memory mapped by code called via cgo or via the syscall package. Sum of all metrics in /memory/classes.", + Kind: KindUint64, + }, +} // All returns a slice of containing metric descriptions for all supported metrics. func All() []Description { diff --git a/src/runtime/metrics/description_test.go b/src/runtime/metrics/description_test.go new file mode 100644 index 0000000000..e966a281a1 --- /dev/null +++ b/src/runtime/metrics/description_test.go @@ -0,0 +1,125 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics_test + +import ( + "bufio" + "os" + "path/filepath" + "regexp" + "runtime" + "runtime/metrics" + "strings" + "testing" +) + +func TestDescriptionNameFormat(t *testing.T) { + r := regexp.MustCompile("^(?P/[^:]+):(?P[^:*/]+(?:[*/][^:*/]+)*)$") + descriptions := metrics.All() + for _, desc := range descriptions { + if !r.MatchString(desc.Name) { + t.Errorf("metrics %q does not match regexp %s", desc.Name, r) + } + } +} + +func extractMetricDocs(t *testing.T) map[string]string { + if runtime.GOOS == "android" { + t.Skip("no access to Go source on android") + } + + // Get doc.go. + _, filename, _, _ := runtime.Caller(0) + filename = filepath.Join(filepath.Dir(filename), "doc.go") + + f, err := os.Open(filename) + if err != nil { + t.Fatal(err) + } + const ( + stateSearch = iota // look for list of metrics + stateNextMetric // look for next metric + stateNextDescription // build description + ) + state := stateSearch + s := bufio.NewScanner(f) + result := make(map[string]string) + var metric string + var prevMetric string + var desc strings.Builder + for s.Scan() { + line := strings.TrimSpace(s.Text()) + switch state { + case stateSearch: + if line == "Supported metrics" { + state = stateNextMetric + } + case stateNextMetric: + // Ignore empty lines until we find a non-empty + // one. This will be our metric name. + if len(line) != 0 { + prevMetric = metric + metric = line + if prevMetric > metric { + t.Errorf("metrics %s and %s are out of lexicographical order", prevMetric, metric) + } + state = stateNextDescription + } + case stateNextDescription: + if len(line) == 0 || line == `*/` { + // An empty line means we're done. + // Write down the description and look + // for a new metric. + result[metric] = desc.String() + desc.Reset() + state = stateNextMetric + } else { + // As long as we're seeing data, assume that's + // part of the description and append it. + if desc.Len() != 0 { + // Turn previous newlines into spaces. 
+ desc.WriteString(" ") + } + desc.WriteString(line) + } + } + if line == `*/` { + break + } + } + if state == stateSearch { + t.Fatalf("failed to find supported metrics docs in %s", filename) + } + return result +} + +func TestDescriptionDocs(t *testing.T) { + docs := extractMetricDocs(t) + descriptions := metrics.All() + for _, d := range descriptions { + want := d.Description + got, ok := docs[d.Name] + if !ok { + t.Errorf("no docs found for metric %s", d.Name) + continue + } + if got != want { + t.Errorf("mismatched description and docs for metric %s", d.Name) + t.Errorf("want: %q, got %q", want, got) + continue + } + } + if len(docs) > len(descriptions) { + docsLoop: + for name, _ := range docs { + for _, d := range descriptions { + if name == d.Name { + continue docsLoop + } + } + t.Errorf("stale documentation for non-existent metric: %s", name) + } + } +} diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go index b48c22ba30..fb4e23a2b5 100644 --- a/src/runtime/metrics/doc.go +++ b/src/runtime/metrics/doc.go @@ -44,6 +44,60 @@ the documentation of the Name field of the Description struct. Supported metrics -TODO(mknyszek): List them here as they're added. + /memory/classes/heap/free:bytes + Memory that is available for allocation, and may be returned + to the underlying system. + + /memory/classes/heap/objects:bytes + Memory occupied by live objects and dead objects that have + not yet been collected. + + /memory/classes/heap/released:bytes + Memory that has been returned to the underlying system. + + /memory/classes/heap/stacks:bytes + Memory allocated from the heap that is occupied by stacks. + + /memory/classes/heap/unused:bytes + Memory that is unavailable for allocation, but cannot be + returned to the underlying system. + + /memory/classes/metadata/mcache/free:bytes + Memory that is reserved for runtime mcache structures, but + not in-use. + + /memory/classes/metadata/mcache/inuse:bytes + Memory that is occupied by runtime mcache structures that + are currently being used. + + /memory/classes/metadata/mspan/free:bytes + Memory that is reserved for runtime mspan structures, but + not in-use. + + /memory/classes/metadata/mspan/inuse:bytes + Memory that is occupied by runtime mspan structures that are + currently being used. + + /memory/classes/metadata/other:bytes + Memory that is reserved for or used to hold runtime + metadata. + + /memory/classes/os-stacks:bytes + Stack memory allocated by the underlying operating system. + + /memory/classes/other:bytes + Memory used by execution trace buffers, structures for + debugging the runtime, finalizer and profiler specials, and + more. + + /memory/classes/profiling/buckets:bytes + Memory that is used by the stack trace hash map used for + profiling. + + /memory/classes/total:bytes + All memory mapped by the Go runtime into the current process + as read-write. Note that this does not include memory mapped + by code called via cgo or via the syscall package. + Sum of all metrics in /memory/classes. */ package metrics diff --git a/src/runtime/metrics/sample.go b/src/runtime/metrics/sample.go index c7a3fc424a..b4b0979aa6 100644 --- a/src/runtime/metrics/sample.go +++ b/src/runtime/metrics/sample.go @@ -4,6 +4,11 @@ package metrics +import ( + _ "runtime" // depends on the runtime via a linkname'd function + "unsafe" +) + // Sample captures a single metric sample. type Sample struct { // Name is the name of the metric sampled. @@ -16,6 +21,9 @@ type Sample struct { Value Value } +// Implemented in the runtime. 
+func runtime_readMetrics(unsafe.Pointer, int, int) + // Read populates each Value field in the given slice of metric samples. // // Desired metrics should be present in the slice with the appropriate name. @@ -25,5 +33,5 @@ type Sample struct { // will have the value populated as KindBad to indicate that the name is // unknown. func Read(m []Sample) { - panic("unimplemented") + runtime_readMetrics(unsafe.Pointer(&m[0]), len(m), cap(m)) } diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go new file mode 100644 index 0000000000..f00aad07c4 --- /dev/null +++ b/src/runtime/metrics_test.go @@ -0,0 +1,114 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "runtime" + "runtime/metrics" + "strings" + "testing" + "unsafe" +) + +func prepareAllMetricsSamples() (map[string]metrics.Description, []metrics.Sample) { + all := metrics.All() + samples := make([]metrics.Sample, len(all)) + descs := make(map[string]metrics.Description) + for i := range all { + samples[i].Name = all[i].Name + descs[all[i].Name] = all[i] + } + return descs, samples +} + +func TestReadMetrics(t *testing.T) { + // Tests whether readMetrics produces values aligning + // with ReadMemStats while the world is stopped. + var mstats runtime.MemStats + _, samples := prepareAllMetricsSamples() + runtime.ReadMetricsSlow(&mstats, unsafe.Pointer(&samples[0]), len(samples), cap(samples)) + + checkUint64 := func(t *testing.T, m string, got, want uint64) { + t.Helper() + if got != want { + t.Errorf("metric %q: got %d, want %d", m, got, want) + } + } + + // Check to make sure the values we read line up with other values we read. + for i := range samples { + switch name := samples[i].Name; name { + case "/memory/classes/heap/free:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapIdle-mstats.HeapReleased) + case "/memory/classes/heap/released:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapReleased) + case "/memory/classes/heap/objects:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapAlloc) + case "/memory/classes/heap/unused:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapInuse-mstats.HeapAlloc) + case "/memory/classes/heap/stacks:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackInuse) + case "/memory/classes/metadata/mcache/free:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheSys-mstats.MCacheInuse) + case "/memory/classes/metadata/mcache/inuse:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheInuse) + case "/memory/classes/metadata/mspan/free:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanSys-mstats.MSpanInuse) + case "/memory/classes/metadata/mspan/inuse:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanInuse) + case "/memory/classes/metadata/other:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.GCSys) + case "/memory/classes/os-stacks:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackSys-mstats.StackInuse) + case "/memory/classes/other:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.OtherSys) + case "/memory/classes/profiling/buckets:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys) + case "/memory/classes/total:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys) + } + } +} + +func 
TestReadMetricsConsistency(t *testing.T) { + // Tests whether readMetrics produces consistent, sensible values. + // The values are read concurrently with the runtime doing other + // things (e.g. allocating) so what we read can't reasonably compared + // to runtime values. + + // Read all the supported metrics through the metrics package. + descs, samples := prepareAllMetricsSamples() + metrics.Read(samples) + + // Check to make sure the values we read make sense. + var totalVirtual struct { + got, want uint64 + } + for i := range samples { + kind := samples[i].Value.Kind() + if want := descs[samples[i].Name].Kind; kind != want { + t.Errorf("supported metric %q has unexpected kind: got %d, want %d", samples[i].Name, kind, want) + continue + } + if samples[i].Name != "/memory/classes/total:bytes" && strings.HasPrefix(samples[i].Name, "/memory/classes") { + v := samples[i].Value.Uint64() + totalVirtual.want += v + + // None of these stats should ever get this big. + // If they do, there's probably overflow involved, + // usually due to bad accounting. + if int64(v) < 0 { + t.Errorf("%q has high/negative value: %d", samples[i].Name, v) + } + } + switch samples[i].Name { + case "/memory/classes/total:bytes": + totalVirtual.got = samples[i].Value.Uint64() + } + } + if totalVirtual.got != totalVirtual.want { + t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want) + } +} diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index a8eca85fe6..512a06cffa 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -882,7 +882,8 @@ func (m *consistentHeapStats) unsafeClear() { // heapStatsDelta, the resulting values should be complete and // valid statistic values. // -// Not safe to call concurrently. +// Not safe to call concurrently. The world must be stopped +// or metricsSema must be held. func (m *consistentHeapStats) read(out *heapStatsDelta) { // Getting preempted after this point is not safe because // we read allp. We need to make sure a STW can't happen -- cgit v1.2.1 From 74e566ed1dc52f7ef58093aff936a0931537a1ad Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Wed, 5 Aug 2020 23:10:46 +0000 Subject: runtime: add readMetrics latency benchmark This change adds a new benchmark to the runtime tests for measuring the latency of the new metrics implementation, based on the ReadMemStats latency benchmark. readMetrics will have more metrics added to it in the future, and this benchmark will serve as a way to measure the cost of adding additional metrics. Change-Id: Ib05e3ed4afa49a70863fc0c418eab35b72263e24 Reviewed-on: https://go-review.googlesource.com/c/go/+/247042 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Emmanuel Odeke Reviewed-by: Michael Pratt --- src/runtime/gc_test.go | 17 ++++++++++++----- src/runtime/metrics_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go index 9edebdada6..7870f31ae9 100644 --- a/src/runtime/gc_test.go +++ b/src/runtime/gc_test.go @@ -518,7 +518,7 @@ func BenchmarkReadMemStats(b *testing.B) { hugeSink = nil } -func BenchmarkReadMemStatsLatency(b *testing.B) { +func applyGCLoad(b *testing.B) func() { // We’ll apply load to the runtime with maxProcs-1 goroutines // and use one more to actually benchmark. 
It doesn't make sense // to try to run this test with only 1 P (that's what @@ -563,6 +563,14 @@ func BenchmarkReadMemStatsLatency(b *testing.B) { runtime.KeepAlive(hold) }() } + return func() { + close(done) + wg.Wait() + } +} + +func BenchmarkReadMemStatsLatency(b *testing.B) { + stop := applyGCLoad(b) // Spend this much time measuring latencies. latencies := make([]time.Duration, 0, 1024) @@ -579,12 +587,11 @@ func BenchmarkReadMemStatsLatency(b *testing.B) { runtime.ReadMemStats(&ms) latencies = append(latencies, time.Now().Sub(start)) } - close(done) - // Make sure to stop the timer before we wait! The goroutines above - // are very heavy-weight and not easy to stop, so we could end up + // Make sure to stop the timer before we wait! The load created above + // is very heavy-weight and not easy to stop, so we could end up // confusing the benchmarking framework for small b.N. b.StopTimer() - wg.Wait() + stop() // Disable the default */op metrics. // ns/op doesn't mean anything because it's an average, but we diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go index f00aad07c4..d925b057b0 100644 --- a/src/runtime/metrics_test.go +++ b/src/runtime/metrics_test.go @@ -7,8 +7,10 @@ package runtime_test import ( "runtime" "runtime/metrics" + "sort" "strings" "testing" + "time" "unsafe" ) @@ -112,3 +114,39 @@ func TestReadMetricsConsistency(t *testing.T) { t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want) } } + +func BenchmarkReadMetricsLatency(b *testing.B) { + stop := applyGCLoad(b) + + // Spend this much time measuring latencies. + latencies := make([]time.Duration, 0, 1024) + _, samples := prepareAllMetricsSamples() + + // Hit metrics.Read continuously and measure. + b.ResetTimer() + for i := 0; i < b.N; i++ { + start := time.Now() + metrics.Read(samples) + latencies = append(latencies, time.Now().Sub(start)) + } + // Make sure to stop the timer before we wait! The load created above + // is very heavy-weight and not easy to stop, so we could end up + // confusing the benchmarking framework for small b.N. + b.StopTimer() + stop() + + // Disable the default */op metrics. + // ns/op doesn't mean anything because it's an average, but we + // have a sleep in our b.N loop above which skews this significantly. + b.ReportMetric(0, "ns/op") + b.ReportMetric(0, "B/op") + b.ReportMetric(0, "allocs/op") + + // Sort latencies then report percentiles. + sort.Slice(latencies, func(i, j int) bool { + return latencies[i] < latencies[j] + }) + b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns") + b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns") + b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns") +} -- cgit v1.2.1 From 07c3f65d53df7bb9f84bdbd2ab64c0ae12337e3e Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 6 Aug 2020 15:44:27 +0000 Subject: runtime,runtime/metrics: add heap object count metric For #37112. 
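As a rough usage sketch (not part of this change), the new count can be read through the runtime/metrics API built up in the preceding patches; all metric and type names below are the ones added in this series:

    package main

    import (
        "fmt"
        "runtime/metrics"
    )

    func main() {
        // Sample only the heap object count introduced by this change.
        s := []metrics.Sample{{Name: "/gc/heap/objects:objects"}}
        metrics.Read(s)
        if s[0].Value.Kind() == metrics.KindUint64 {
            fmt.Println("objects occupying heap memory:", s[0].Value.Uint64())
        }
    }

Internally the value is derived from the per-size-class allocation and free counts, as the metrics.go hunk below shows.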
Change-Id: Idd3dd5c84215ddd1ab05c2e76e848aa0a4d40fb0 Reviewed-on: https://go-review.googlesource.com/c/go/+/247043 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/metrics.go | 18 ++++++++++++++++-- src/runtime/metrics/description.go | 5 +++++ src/runtime/metrics/doc.go | 3 +++ src/runtime/metrics_test.go | 2 ++ 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 44b5a29751..cf619cca4b 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -38,6 +38,13 @@ func initMetrics() { return } metrics = map[string]metricData{ + "/gc/heap/objects:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.numObjects + }, + }, "/memory/classes/heap/free:bytes": { deps: makeStatDepSet(heapStatsDep), compute: func(in *statAggregate, out *metricValue) { @@ -210,9 +217,13 @@ func (s *statDepSet) has(d statDep) bool { type heapStatsAggregate struct { heapStatsDelta + // Derived from values in heapStatsDelta. + // inObjects is the bytes of memory occupied by objects, - // derived from other values in heapStats. inObjects uint64 + + // numObjects is the number of live objects in the heap. + numObjects uint64 } // compute populates the heapStatsAggregate with values from the runtime. @@ -221,8 +232,11 @@ func (a *heapStatsAggregate) compute() { // Calculate derived stats. a.inObjects = uint64(a.largeAlloc - a.largeFree) + a.numObjects = uint64(a.largeAllocCount - a.largeFreeCount) for i := range a.smallAllocCount { - a.inObjects += uint64(a.smallAllocCount[i]-a.smallFreeCount[i]) * uint64(class_to_size[i]) + n := uint64(a.smallAllocCount[i] - a.smallFreeCount[i]) + a.inObjects += n * uint64(class_to_size[i]) + a.numObjects += n } } diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go index 2e7df7e09f..47013e1451 100644 --- a/src/runtime/metrics/description.go +++ b/src/runtime/metrics/description.go @@ -50,6 +50,11 @@ type Description struct { // The English language descriptions below must be kept in sync with the // descriptions of each metric in doc.go. var allDesc = []Description{ + { + Name: "/gc/heap/objects:objects", + Description: "Number of objects, live or unswept, occupying heap memory.", + Kind: KindUint64, + }, { Name: "/memory/classes/heap/free:bytes", Description: "Memory that is available for allocation, and may be returned to the underlying system.", diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go index fb4e23a2b5..4ac44bb19c 100644 --- a/src/runtime/metrics/doc.go +++ b/src/runtime/metrics/doc.go @@ -44,6 +44,9 @@ the documentation of the Name field of the Description struct. Supported metrics + /gc/heap/objects:objects + Number of objects, live or unswept, occupying heap memory. + /memory/classes/heap/free:bytes Memory that is available for allocation, and may be returned to the underlying system. 
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go index d925b057b0..6c0be7dc0b 100644 --- a/src/runtime/metrics_test.go +++ b/src/runtime/metrics_test.go @@ -70,6 +70,8 @@ func TestReadMetrics(t *testing.T) { checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys) case "/memory/classes/total:bytes": checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys) + case "/gc/heap/objects:objects": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects) } } } -- cgit v1.2.1 From a8b28ebc87854fb6f2ba99f415f046dc2ff63604 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 6 Aug 2020 16:47:58 +0000 Subject: runtime,runtime/metrics: add heap goal and GC cycle metrics This change adds three new metrics: the heap goal, GC cycle count, and forced GC count. These metrics are identical to their MemStats counterparts. For #37112. Change-Id: I5a5e8dd550c0d646e5dcdbdf38274895e27cdd88 Reviewed-on: https://go-review.googlesource.com/c/go/+/247044 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/metrics.go | 51 ++++++++++++++++++++++++++++++++------ src/runtime/metrics/description.go | 23 +++++++++++++++++ src/runtime/metrics/doc.go | 12 +++++++++ src/runtime/metrics_test.go | 8 ++++++ 4 files changed, 86 insertions(+), 8 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index cf619cca4b..6595a4342c 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -7,6 +7,7 @@ package runtime // Metrics implementation exported to runtime/metrics. import ( + "runtime/internal/atomic" "unsafe" ) @@ -38,6 +39,34 @@ func initMetrics() { return } metrics = map[string]metricData{ + "/gc/cycles/automatic:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesDone - in.sysStats.gcCyclesForced + }, + }, + "/gc/cycles/forced:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesForced + }, + }, + "/gc/cycles/total:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesDone + }, + }, + "/gc/heap/goal:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.heapGoal + }, + }, "/gc/heap/objects:objects": { deps: makeStatDepSet(heapStatsDep), compute: func(in *statAggregate, out *metricValue) { @@ -248,14 +277,17 @@ func (a *heapStatsAggregate) compute() { // heapStatsAggregate, means there could be some skew, but because of // these stats are independent, there's no real consistency issue here. type sysStatsAggregate struct { - stacksSys uint64 - mSpanSys uint64 - mSpanInUse uint64 - mCacheSys uint64 - mCacheInUse uint64 - buckHashSys uint64 - gcMiscSys uint64 - otherSys uint64 + stacksSys uint64 + mSpanSys uint64 + mSpanInUse uint64 + mCacheSys uint64 + mCacheInUse uint64 + buckHashSys uint64 + gcMiscSys uint64 + otherSys uint64 + heapGoal uint64 + gcCyclesDone uint64 + gcCyclesForced uint64 } // compute populates the sysStatsAggregate with values from the runtime. 
@@ -264,6 +296,9 @@ func (a *sysStatsAggregate) compute() { a.buckHashSys = memstats.buckhash_sys.load() a.gcMiscSys = memstats.gcMiscSys.load() a.otherSys = memstats.other_sys.load() + a.heapGoal = atomic.Load64(&memstats.next_gc) + a.gcCyclesDone = uint64(memstats.numgc) + a.gcCyclesForced = uint64(memstats.numforcedgc) systemstack(func() { lock(&mheap_.lock) diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go index 47013e1451..66d229c270 100644 --- a/src/runtime/metrics/description.go +++ b/src/runtime/metrics/description.go @@ -50,6 +50,29 @@ type Description struct { // The English language descriptions below must be kept in sync with the // descriptions of each metric in doc.go. var allDesc = []Description{ + { + Name: "/gc/cycles/automatic:gc-cycles", + Description: "Count of completed GC cycles generated by the Go runtime.", + Kind: KindUint64, + Cumulative: true, + }, + { + Name: "/gc/cycles/forced:gc-cycles", + Description: "Count of completed forced GC cycles.", + Kind: KindUint64, + Cumulative: true, + }, + { + Name: "/gc/cycles/total:gc-cycles", + Description: "Count of all completed GC cycles.", + Kind: KindUint64, + Cumulative: true, + }, + { + Name: "/gc/heap/goal:bytes", + Description: "Heap size target for the end of the GC cycle.", + Kind: KindUint64, + }, { Name: "/gc/heap/objects:objects", Description: "Number of objects, live or unswept, occupying heap memory.", diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go index 4ac44bb19c..9b44e73ee6 100644 --- a/src/runtime/metrics/doc.go +++ b/src/runtime/metrics/doc.go @@ -44,6 +44,18 @@ the documentation of the Name field of the Description struct. Supported metrics + /gc/cycles/automatic:gc-cycles + Count of completed GC cycles generated by the Go runtime. + + /gc/cycles/forced:gc-cycles + Count of completed forced GC cycles. + + /gc/cycles/total:gc-cycles + Count of all completed GC cycles. + + /gc/heap/goal:bytes + Heap size target for the end of the GC cycle. + /gc/heap/objects:objects Number of objects, live or unswept, occupying heap memory. diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go index 6c0be7dc0b..3724760294 100644 --- a/src/runtime/metrics_test.go +++ b/src/runtime/metrics_test.go @@ -72,6 +72,14 @@ func TestReadMetrics(t *testing.T) { checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys) case "/gc/heap/objects:objects": checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects) + case "/gc/heap/goal:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.NextGC) + case "/gc/cycles/automatic:gc-cycles": + checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC-mstats.NumForcedGC)) + case "/gc/cycles/forced:gc-cycles": + checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumForcedGC)) + case "/gc/cycles/total:gc-cycles": + checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC)) } } } -- cgit v1.2.1 From 22d2b984a680900ebbec6268f93a839286b6f130 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 26 Oct 2020 19:35:23 +0000 Subject: runtime: make sysMemStats' methods nosplit sysMemStats are updated early on in runtime initialization, so triggering a stack growth would be bad. Mark them nosplit. Thank you so much to cherryyz@google.com for finding this fix! Fixes #42218. 
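For background, //go:nosplit suppresses the stack-growth check in a function's prologue, which is what makes such functions safe to call before the scheduler and stack machinery are fully set up. A purely illustrative sketch of the directive's placement (using sync/atomic rather than the runtime-internal atomic package; the package and function names here are made up):

    package stats

    import "sync/atomic"

    // load reads the counter without a stack-split check in its prologue.
    // The directive must sit in the comment block directly above the func.
    //go:nosplit
    func load(p *uint64) uint64 {
        return atomic.LoadUint64(p)
    }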
Change-Id: Ic62db76e6a4f829355d7eaabed1727c51adfbd0f Reviewed-on: https://go-review.googlesource.com/c/go/+/265157 Trust: Michael Knyszek Run-TryBot: Michael Knyszek Reviewed-by: Michael Pratt Reviewed-by: Cherry Zhang Reviewed-by: Austin Clements TryBot-Result: Go Bot --- src/runtime/mstats.go | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 512a06cffa..07f466ec49 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -720,11 +720,17 @@ func flushallmcaches() { type sysMemStat uint64 // load atomically reads the value of the stat. +// +// Must be nosplit as it is called in runtime initialization, e.g. newosproc0. +//go:nosplit func (s *sysMemStat) load() uint64 { return atomic.Load64((*uint64)(s)) } // add atomically adds the sysMemStat by n. +// +// Must be nosplit as it is called in runtime initialization, e.g. newosproc0. +//go:nosplit func (s *sysMemStat) add(n int64) { if s == nil { return -- cgit v1.2.1 From 8e2370bf7f0c992ce1ea5dc54b43551cea71a485 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 6 Aug 2020 19:04:46 +0000 Subject: runtime,runtime/metrics: add object size distribution metrics This change adds metrics for the distribution of objects allocated and freed by size, mirroring MemStats' BySize field. For #37112. Change-Id: Ibaf1812da93598b37265ec97abc6669c1a5efcbf Reviewed-on: https://go-review.googlesource.com/c/go/+/247045 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/metrics.go | 52 ++++++++++++++++++++++++++++++++++++++ src/runtime/metrics/description.go | 10 ++++++++ src/runtime/metrics/doc.go | 6 +++++ src/runtime/metrics_test.go | 36 ++++++++++++++++++++++++++ 4 files changed, 104 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 6595a4342c..32d8ab461c 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -18,6 +18,8 @@ var ( metricsSema uint32 = 1 metricsInit bool metrics map[string]metricData + + sizeClassBuckets []float64 ) type metricData struct { @@ -38,6 +40,10 @@ func initMetrics() { if metricsInit { return } + sizeClassBuckets = make([]float64, _NumSizeClasses) + for i := range sizeClassBuckets { + sizeClassBuckets[i] = float64(class_to_size[i]) + } metrics = map[string]metricData{ "/gc/cycles/automatic:gc-cycles": { deps: makeStatDepSet(sysStatsDep), @@ -60,6 +66,26 @@ func initMetrics() { out.scalar = in.sysStats.gcCyclesDone }, }, + "/gc/heap/allocs-by-size:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(sizeClassBuckets) + hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeAllocCount) + for i := range hist.buckets { + hist.counts[i] = uint64(in.heapStats.smallAllocCount[i]) + } + }, + }, + "/gc/heap/frees-by-size:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(sizeClassBuckets) + hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeFreeCount) + for i := range hist.buckets { + hist.counts[i] = uint64(in.heapStats.smallFreeCount[i]) + } + }, + }, "/gc/heap/goal:bytes": { deps: makeStatDepSet(sysStatsDep), compute: func(in *statAggregate, out *metricValue) { @@ -370,6 +396,32 @@ type metricValue struct { pointer unsafe.Pointer // contains non-scalar values. 
} +// float64HistOrInit tries to pull out an existing float64Histogram +// from the value, but if none exists, then it allocates one with +// the given buckets. +func (v *metricValue) float64HistOrInit(buckets []float64) *metricFloat64Histogram { + var hist *metricFloat64Histogram + if v.kind == metricKindFloat64Histogram && v.pointer != nil { + hist = (*metricFloat64Histogram)(v.pointer) + } else { + v.kind = metricKindFloat64Histogram + hist = new(metricFloat64Histogram) + v.pointer = unsafe.Pointer(hist) + } + hist.buckets = buckets + if len(hist.counts) != len(hist.buckets)+1 { + hist.counts = make([]uint64, len(buckets)+1) + } + return hist +} + +// metricFloat64Histogram is a runtime copy of runtime/metrics.Float64Histogram +// and must be kept structurally identical to that type. +type metricFloat64Histogram struct { + counts []uint64 + buckets []float64 +} + // agg is used by readMetrics, and is protected by metricsSema. // // Managed as a global variable because its pointer will be diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go index 66d229c270..e43904fc7d 100644 --- a/src/runtime/metrics/description.go +++ b/src/runtime/metrics/description.go @@ -68,6 +68,16 @@ var allDesc = []Description{ Kind: KindUint64, Cumulative: true, }, + { + Name: "/gc/heap/allocs-by-size:objects", + Description: "Distribution of all objects allocated by approximate size.", + Kind: KindFloat64Histogram, + }, + { + Name: "/gc/heap/frees-by-size:objects", + Description: "Distribution of all objects freed by approximate size.", + Kind: KindFloat64Histogram, + }, { Name: "/gc/heap/goal:bytes", Description: "Heap size target for the end of the GC cycle.", diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go index 9b44e73ee6..5045a5b4c1 100644 --- a/src/runtime/metrics/doc.go +++ b/src/runtime/metrics/doc.go @@ -53,6 +53,12 @@ Supported metrics /gc/cycles/total:gc-cycles Count of all completed GC cycles. + /gc/heap/allocs-by-size:objects + Distribution of all objects allocated by approximate size. + + /gc/heap/frees-by-size:objects + Distribution of all objects freed by approximate size. + /gc/heap/goal:bytes Heap size target for the end of the GC cycle. 
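The doc.go entries above describe the two new size-class histograms. As a consumer-side sketch (field names as exercised by the test added below, where the counts are cumulative and the allocation counts never fall below the free counts), the number of outstanding objects can be recovered like this:

    package main

    import (
        "fmt"
        "runtime/metrics"
    )

    func main() {
        s := []metrics.Sample{
            {Name: "/gc/heap/allocs-by-size:objects"},
            {Name: "/gc/heap/frees-by-size:objects"},
        }
        metrics.Read(s)
        allocs := s[0].Value.Float64Histogram()
        frees := s[1].Value.Float64Histogram()

        // Both histograms share the same buckets, so the per-bucket
        // difference sums to the number of objects still live.
        live := uint64(0)
        for i := range allocs.Counts {
            live += allocs.Counts[i] - frees.Counts[i]
        }
        fmt.Println("approximately live objects:", live)
    }

This is essentially the cross-check that TestReadMetricsConsistency performs against /gc/heap/objects:objects.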
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go index 3724760294..1a30810544 100644 --- a/src/runtime/metrics_test.go +++ b/src/runtime/metrics_test.go @@ -98,6 +98,10 @@ func TestReadMetricsConsistency(t *testing.T) { var totalVirtual struct { got, want uint64 } + var objects struct { + alloc, free *metrics.Float64Histogram + total uint64 + } for i := range samples { kind := samples[i].Value.Kind() if want := descs[samples[i].Name].Kind; kind != want { @@ -118,11 +122,43 @@ func TestReadMetricsConsistency(t *testing.T) { switch samples[i].Name { case "/memory/classes/total:bytes": totalVirtual.got = samples[i].Value.Uint64() + case "/gc/heap/objects:objects": + objects.total = samples[i].Value.Uint64() + case "/gc/heap/allocs-by-size:objects": + objects.alloc = samples[i].Value.Float64Histogram() + case "/gc/heap/frees-by-size:objects": + objects.free = samples[i].Value.Float64Histogram() } } if totalVirtual.got != totalVirtual.want { t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want) } + if len(objects.alloc.Buckets) != len(objects.free.Buckets) { + t.Error("allocs-by-size and frees-by-size buckets don't match in length") + } else if len(objects.alloc.Counts) != len(objects.free.Counts) { + t.Error("allocs-by-size and frees-by-size counts don't match in length") + } else { + for i := range objects.alloc.Buckets { + ba := objects.alloc.Buckets[i] + bf := objects.free.Buckets[i] + if ba != bf { + t.Errorf("bucket %d is different for alloc and free hists: %f != %f", i, ba, bf) + } + } + if !t.Failed() { + got, want := uint64(0), objects.total + for i := range objects.alloc.Counts { + if objects.alloc.Counts[i] < objects.free.Counts[i] { + t.Errorf("found more allocs than frees in object dist bucket %d", i) + continue + } + got += objects.alloc.Counts[i] - objects.free.Counts[i] + } + if got != want { + t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want) + } + } + } } func BenchmarkReadMetricsLatency(b *testing.B) { -- cgit v1.2.1 From 36c5edd8d9e6c13af26733e5c820eae0598203fe Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 6 Aug 2020 20:36:49 +0000 Subject: runtime: add timeHistogram type This change adds a concurrent HDR time histogram to the runtime with tests. It also adds a function to generate boundaries for use by the metrics package. For #37112. 
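The histogram added below indexes a duration by its highest set bit (super-bucket) and by the next timeHistSubBucketBits bits (sub-bucket). A standalone sketch of that index computation, using math/bits in place of the runtime-internal sys.Len64 and omitting the overflow bucket:

    package main

    import (
        "fmt"
        "math/bits"
    )

    const (
        subBucketBits = 4
        numSubBuckets = 1 << subBucketBits
    )

    // bucketOf mirrors the index computation in timeHistogram.record.
    func bucketOf(duration int64) (super, sub uint) {
        if duration < numSubBuckets {
            // Small values land in super-bucket 0, indexed directly.
            return 0, uint(duration)
        }
        super = uint(bits.Len64(uint64(duration))) - subBucketBits
        sub = uint((duration >> (super - 1)) % numSubBuckets)
        return super, sub
    }

    func main() {
        fmt.Println(bucketOf(0b10110))  // 22ns -> super-bucket 1, sub-bucket 6
        fmt.Println(bucketOf(0b100010)) // 34ns -> super-bucket 2, sub-bucket 1
    }

The two printed cases match the worked examples in the histogram.go comment below.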
Change-Id: Ifbef8ddce8e3a965a0dcd58ccd4915c282ae2098 Reviewed-on: https://go-review.googlesource.com/c/go/+/247046 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/export_test.go | 24 +++++++ src/runtime/histogram.go | 148 ++++++++++++++++++++++++++++++++++++++++++ src/runtime/histogram_test.go | 58 +++++++++++++++++ src/runtime/metrics.go | 2 + 4 files changed, 232 insertions(+) create mode 100644 src/runtime/histogram.go create mode 100644 src/runtime/histogram_test.go (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index d043fe3ee5..4ca0420d2a 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -1141,3 +1141,27 @@ func MSpanCountAlloc(ms *MSpan, bits []byte) int { s.gcmarkBits = nil return result } + +const ( + TimeHistSubBucketBits = timeHistSubBucketBits + TimeHistNumSubBuckets = timeHistNumSubBuckets + TimeHistNumSuperBuckets = timeHistNumSuperBuckets +) + +type TimeHistogram timeHistogram + +// Counts returns the counts for the given bucket, subBucket indices. +// Returns true if the bucket was valid, otherwise returns the counts +// for the overflow bucket and false. +func (th *TimeHistogram) Count(bucket, subBucket uint) (uint64, bool) { + t := (*timeHistogram)(th) + i := bucket*TimeHistNumSubBuckets + subBucket + if i >= uint(len(t.counts)) { + return t.overflow, false + } + return t.counts[i], true +} + +func (th *TimeHistogram) Record(duration int64) { + (*timeHistogram)(th).record(duration) +} diff --git a/src/runtime/histogram.go b/src/runtime/histogram.go new file mode 100644 index 0000000000..4020969eb9 --- /dev/null +++ b/src/runtime/histogram.go @@ -0,0 +1,148 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "runtime/internal/sys" +) + +const ( + // For the time histogram type, we use an HDR histogram. + // Values are placed in super-buckets based solely on the most + // significant set bit. Thus, super-buckets are power-of-2 sized. + // Values are then placed into sub-buckets based on the value of + // the next timeHistSubBucketBits most significant bits. Thus, + // sub-buckets are linear within a super-bucket. + // + // Therefore, the number of sub-buckets (timeHistNumSubBuckets) + // defines the error. This error may be computed as + // 1/timeHistNumSubBuckets*100%. For example, for 16 sub-buckets + // per super-bucket the error is approximately 6%. + // + // The number of super-buckets (timeHistNumSuperBuckets), on the + // other hand, defines the range. To reserve room for sub-buckets, + // bit timeHistSubBucketBits is the first bit considered for + // super-buckets, so super-bucket indicies are adjusted accordingly. + // + // As an example, consider 45 super-buckets with 16 sub-buckets. + // + // 00110 + // ^---- + // │ ^ + // │ └---- Lowest 4 bits -> sub-bucket 6 + // └------- Bit 4 unset -> super-bucket 0 + // + // 10110 + // ^---- + // │ ^ + // │ └---- Next 4 bits -> sub-bucket 6 + // └------- Bit 4 set -> super-bucket 1 + // 100010 + // ^----^ + // │ ^ └-- Lower bits ignored + // │ └---- Next 4 bits -> sub-bucket 1 + // └------- Bit 5 set -> super-bucket 2 + // + // Following this pattern, bucket 45 will have the bit 48 set. 
We don't + // have any buckets for higher values, so the highest sub-bucket will + // contain values of 2^48-1 nanoseconds or approx. 3 days. This range is + // more than enough to handle durations produced by the runtime. + timeHistSubBucketBits = 4 + timeHistNumSubBuckets = 1 << timeHistSubBucketBits + timeHistNumSuperBuckets = 45 + timeHistTotalBuckets = timeHistNumSuperBuckets*timeHistNumSubBuckets + 1 +) + +// timeHistogram represents a distribution of durations in +// nanoseconds. +// +// The accuracy and range of the histogram is defined by the +// timeHistSubBucketBits and timeHistNumSuperBuckets constants. +// +// It is an HDR histogram with exponentially-distributed +// buckets and linearly distributed sub-buckets. +// +// Counts in the histogram are updated atomically, so it is safe +// for concurrent use. It is also safe to read all the values +// atomically. +type timeHistogram struct { + counts [timeHistNumSuperBuckets * timeHistNumSubBuckets]uint64 + overflow uint64 +} + +// record adds the given duration to the distribution. +// +// Although the duration is an int64 to facilitate ease-of-use +// with e.g. nanotime, the duration must be non-negative. +func (h *timeHistogram) record(duration int64) { + if duration < 0 { + throw("timeHistogram encountered negative duration") + } + // The index of the exponential bucket is just the index + // of the highest set bit adjusted for how many bits we + // use for the subbucket. Note that it's timeHistSubBucketsBits-1 + // because we use the 0th bucket to hold values < timeHistNumSubBuckets. + var superBucket, subBucket uint + if duration >= timeHistNumSubBuckets { + // At this point, we know the duration value will always be + // at least timeHistSubBucketsBits long. + superBucket = uint(sys.Len64(uint64(duration))) - timeHistSubBucketBits + if superBucket*timeHistNumSubBuckets >= uint(len(h.counts)) { + // The bucket index we got is larger than what we support, so + // add into the special overflow bucket. + atomic.Xadd64(&h.overflow, 1) + return + } + // The linear subbucket index is just the timeHistSubBucketsBits + // bits after the top bit. To extract that value, shift down + // the duration such that we leave the top bit and the next bits + // intact, then extract the index. + subBucket = uint((duration >> (superBucket - 1)) % timeHistNumSubBuckets) + } else { + subBucket = uint(duration) + } + atomic.Xadd64(&h.counts[superBucket*timeHistNumSubBuckets+subBucket], 1) +} + +// timeHistogramMetricsBuckets generates a slice of boundaries for +// the timeHistogram. These boundaries are represented in seconds, +// not nanoseconds like the timeHistogram represents durations. +func timeHistogramMetricsBuckets() []float64 { + b := make([]float64, timeHistTotalBuckets-1) + for i := 0; i < timeHistNumSuperBuckets; i++ { + superBucketMin := uint64(0) + // The (inclusive) minimum for the first bucket is 0. + if i > 0 { + // The minimum for the second bucket will be + // 1 << timeHistSubBucketBits, indicating that all + // sub-buckets are represented by the next timeHistSubBucketBits + // bits. + // Thereafter, we shift up by 1 each time, so we can represent + // this pattern as (i-1)+timeHistSubBucketBits. + superBucketMin = uint64(1) << uint(i-1+timeHistSubBucketBits) + } + // subBucketShift is the amount that we need to shift the sub-bucket + // index to combine it with the bucketMin. 
+ subBucketShift := uint(0) + if i > 1 { + // The first two buckets are exact with respect to integers, + // so we'll never have to shift the sub-bucket index. Thereafter, + // we shift up by 1 with each subsequent bucket. + subBucketShift = uint(i - 2) + } + for j := 0; j < timeHistNumSubBuckets; j++ { + // j is the sub-bucket index. By shifting the index into position to + // combine with the bucket minimum, we obtain the minimum value for that + // sub-bucket. + subBucketMin := superBucketMin + (uint64(j) << subBucketShift) + + // Convert the subBucketMin which is in nanoseconds to a float64 seconds value. + // These values will all be exactly representable by a float64. + b[i*timeHistNumSubBuckets+j] = float64(subBucketMin) / 1e9 + } + } + return b +} diff --git a/src/runtime/histogram_test.go b/src/runtime/histogram_test.go new file mode 100644 index 0000000000..5f5b28f784 --- /dev/null +++ b/src/runtime/histogram_test.go @@ -0,0 +1,58 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + . "runtime" + "testing" +) + +var dummyTimeHistogram TimeHistogram + +func TestTimeHistogram(t *testing.T) { + // We need to use a global dummy because this + // could get stack-allocated with a non-8-byte alignment. + // The result of this bad alignment is a segfault on + // 32-bit platforms when calling Record. + h := &dummyTimeHistogram + + // Record exactly one sample in each bucket. + for i := 0; i < TimeHistNumSuperBuckets; i++ { + var base int64 + if i > 0 { + base = int64(1) << (i + TimeHistSubBucketBits - 1) + } + for j := 0; j < TimeHistNumSubBuckets; j++ { + v := int64(j) + if i > 0 { + v <<= i - 1 + } + h.Record(base + v) + } + } + // Hit the overflow bucket. + h.Record(int64(^uint64(0) >> 1)) + + // Check to make sure there's exactly one count in each + // bucket. + for i := uint(0); i < TimeHistNumSuperBuckets; i++ { + for j := uint(0); j < TimeHistNumSubBuckets; j++ { + c, ok := h.Count(i, j) + if !ok { + t.Errorf("hit overflow bucket unexpectedly: (%d, %d)", i, j) + } else if c != 1 { + t.Errorf("bucket (%d, %d) has count that is not 1: %d", i, j, c) + } + } + } + c, ok := h.Count(TimeHistNumSuperBuckets, 0) + if ok { + t.Errorf("expected to hit overflow bucket: (%d, %d)", TimeHistNumSuperBuckets, 0) + } + if c != 1 { + t.Errorf("overflow bucket has count that is not 1: %d", c) + } + dummyTimeHistogram = TimeHistogram{} +} diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 32d8ab461c..2be38ccaaa 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -20,6 +20,7 @@ var ( metrics map[string]metricData sizeClassBuckets []float64 + timeHistBuckets []float64 ) type metricData struct { @@ -44,6 +45,7 @@ func initMetrics() { for i := range sizeClassBuckets { sizeClassBuckets[i] = float64(class_to_size[i]) } + timeHistBuckets = timeHistogramMetricsBuckets() metrics = map[string]metricData{ "/gc/cycles/automatic:gc-cycles": { deps: makeStatDepSet(sysStatsDep), -- cgit v1.2.1 From d39a89fd5843f535d634620d27110b320431f584 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 6 Aug 2020 21:59:13 +0000 Subject: runtime,runtime/metrics: add metric for distribution of GC pauses For #37112. 
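A consumer-side sketch of the new pause histogram (field names as exercised by the tests in this series); the consistency test added below expects at least two recorded pauses per completed GC cycle:

    package main

    import (
        "fmt"
        "runtime/metrics"
    )

    func main() {
        s := []metrics.Sample{{Name: "/gc/pauses:seconds"}}
        metrics.Read(s)
        h := s[0].Value.Float64Histogram()

        // Each stop-the-world pause is recorded individually, so the
        // total count grows by at least two per GC cycle.
        var pauses uint64
        for _, c := range h.Counts {
            pauses += c
        }
        fmt.Println("GC-related stop-the-world pauses:", pauses)
    }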
Change-Id: Ibb0425c9c582ae3da3b2662d5bbe830d7df9079c Reviewed-on: https://go-review.googlesource.com/c/go/+/247047 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/metrics.go | 9 +++++++++ src/runtime/metrics/description.go | 5 +++++ src/runtime/metrics/doc.go | 3 +++ src/runtime/metrics_test.go | 22 ++++++++++++++++++++++ src/runtime/mgc.go | 3 +++ src/runtime/mstats.go | 12 ++++++++++++ 6 files changed, 54 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 2be38ccaaa..0e391472b2 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -102,6 +102,15 @@ func initMetrics() { out.scalar = in.heapStats.numObjects }, }, + "/gc/pauses:seconds": { + compute: func(_ *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(timeHistBuckets) + hist.counts[len(hist.counts)-1] = atomic.Load64(&memstats.gcPauseDist.overflow) + for i := range hist.buckets { + hist.counts[i] = atomic.Load64(&memstats.gcPauseDist.counts[i]) + } + }, + }, "/memory/classes/heap/free:bytes": { deps: makeStatDepSet(heapStatsDep), compute: func(in *statAggregate, out *metricValue) { diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go index e43904fc7d..47959e467c 100644 --- a/src/runtime/metrics/description.go +++ b/src/runtime/metrics/description.go @@ -88,6 +88,11 @@ var allDesc = []Description{ Description: "Number of objects, live or unswept, occupying heap memory.", Kind: KindUint64, }, + { + Name: "/gc/pauses:seconds", + Description: "Distribution individual GC-related stop-the-world pause latencies.", + Kind: KindFloat64Histogram, + }, { Name: "/memory/classes/heap/free:bytes", Description: "Memory that is available for allocation, and may be returned to the underlying system.", diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go index 5045a5b4c1..1e12ade5a1 100644 --- a/src/runtime/metrics/doc.go +++ b/src/runtime/metrics/doc.go @@ -65,6 +65,9 @@ Supported metrics /gc/heap/objects:objects Number of objects, live or unswept, occupying heap memory. + /gc/pauses:seconds + Distribution individual GC-related stop-the-world pause latencies. + /memory/classes/heap/free:bytes Memory that is available for allocation, and may be returned to the underlying system. diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go index 1a30810544..7b3132bc30 100644 --- a/src/runtime/metrics_test.go +++ b/src/runtime/metrics_test.go @@ -90,6 +90,11 @@ func TestReadMetricsConsistency(t *testing.T) { // things (e.g. allocating) so what we read can't reasonably compared // to runtime values. + // Run a few GC cycles to get some of the stats to be non-zero. + runtime.GC() + runtime.GC() + runtime.GC() + // Read all the supported metrics through the metrics package. 
descs, samples := prepareAllMetricsSamples() metrics.Read(samples) @@ -102,6 +107,10 @@ func TestReadMetricsConsistency(t *testing.T) { alloc, free *metrics.Float64Histogram total uint64 } + var gc struct { + numGC uint64 + pauses uint64 + } for i := range samples { kind := samples[i].Value.Kind() if want := descs[samples[i].Name].Kind; kind != want { @@ -128,6 +137,14 @@ func TestReadMetricsConsistency(t *testing.T) { objects.alloc = samples[i].Value.Float64Histogram() case "/gc/heap/frees-by-size:objects": objects.free = samples[i].Value.Float64Histogram() + case "/gc/cycles:gc-cycles": + gc.numGC = samples[i].Value.Uint64() + case "/gc/pauses:seconds": + h := samples[i].Value.Float64Histogram() + gc.pauses = 0 + for i := range h.Counts { + gc.pauses += h.Counts[i] + } } } if totalVirtual.got != totalVirtual.want { @@ -159,6 +176,11 @@ func TestReadMetricsConsistency(t *testing.T) { } } } + // The current GC has at least 2 pauses per GC. + // Check to see if that value makes sense. + if gc.pauses < gc.numGC*2 { + t.Errorf("fewer pauses than expected: got %d, want at least %d", gc.pauses, gc.numGC*2) + } } func BenchmarkReadMetricsLatency(b *testing.B) { diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 540c376f1c..b0ab0ae6bb 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -1418,6 +1418,7 @@ func gcStart(trigger gcTrigger) { now = startTheWorldWithSema(trace.enabled) work.pauseNS += now - work.pauseStart work.tMark = now + memstats.gcPauseDist.record(now - work.pauseStart) }) // Release the world sema before Gosched() in STW mode @@ -1565,6 +1566,7 @@ top: systemstack(func() { now := startTheWorldWithSema(true) work.pauseNS += now - work.pauseStart + memstats.gcPauseDist.record(now - work.pauseStart) }) semrelease(&worldsema) goto top @@ -1677,6 +1679,7 @@ func gcMarkTermination(nextTriggerRatio float64) { unixNow := sec*1e9 + int64(nsec) work.pauseNS += now - work.pauseStart work.tEnd = now + memstats.gcPauseDist.record(now - work.pauseStart) atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS) diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 07f466ec49..e0a417d213 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -157,6 +157,14 @@ type mstats struct { // heapStats is a set of statistics heapStats consistentHeapStats + + _ uint32 // ensure gcPauseDist is aligned + + // gcPauseDist represents the distribution of all GC-related + // application pauses in the runtime. + // + // Each individual pause is counted separately, unlike pause_ns. + gcPauseDist timeHistogram } var memstats mstats @@ -443,6 +451,10 @@ func init() { println(offset) throw("memstats.heapStats not aligned to 8 bytes") } + if offset := unsafe.Offsetof(memstats.gcPauseDist); offset%8 != 0 { + println(offset) + throw("memstats.gcPauseDist not aligned to 8 bytes") + } // Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g. // [3]heapStatsDelta) to be 8-byte aligned. if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 { -- cgit v1.2.1 From 80c6b92ecb911409f57d06793a1213395b75ebe2 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Fri, 7 Aug 2020 16:37:29 +0000 Subject: runtime,runtime/metrics: export goroutine count as a metric For #37112. 
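A usage sketch for the new goroutine metric; since the value comes from gcount(), it should track runtime.NumGoroutine closely, though both race with ongoing scheduling:

    package main

    import (
        "fmt"
        "runtime"
        "runtime/metrics"
    )

    func main() {
        s := []metrics.Sample{{Name: "/sched/goroutines:goroutines"}}
        metrics.Read(s)
        fmt.Println("via metrics:", s[0].Value.Uint64())
        fmt.Println("via runtime:", runtime.NumGoroutine())
    }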
Change-Id: I994dfe848605b95ef6aec24f53869e929247e987 Reviewed-on: https://go-review.googlesource.com/c/go/+/247049 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/metrics.go | 6 ++++++ src/runtime/metrics/description.go | 5 +++++ src/runtime/metrics/doc.go | 3 +++ src/runtime/metrics_test.go | 4 ++++ 4 files changed, 18 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 0e391472b2..d3c0341aee 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -214,6 +214,12 @@ func initMetrics() { in.sysStats.gcMiscSys + in.sysStats.otherSys }, }, + "/sched/goroutines:goroutines": { + compute: func(_ *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(gcount()) + }, + }, } metricsInit = true } diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go index 47959e467c..bc2e0882db 100644 --- a/src/runtime/metrics/description.go +++ b/src/runtime/metrics/description.go @@ -163,6 +163,11 @@ var allDesc = []Description{ Description: "All memory mapped by the Go runtime into the current process as read-write. Note that this does not include memory mapped by code called via cgo or via the syscall package. Sum of all metrics in /memory/classes.", Kind: KindUint64, }, + { + Name: "/sched/goroutines:goroutines", + Description: "Count of live goroutines.", + Kind: KindUint64, + }, } // All returns a slice of containing metric descriptions for all supported metrics. diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go index 1e12ade5a1..e340f3d0dd 100644 --- a/src/runtime/metrics/doc.go +++ b/src/runtime/metrics/doc.go @@ -123,5 +123,8 @@ Supported metrics as read-write. Note that this does not include memory mapped by code called via cgo or via the syscall package. Sum of all metrics in /memory/classes. + + /sched/goroutines:goroutines + Count of live goroutines. */ package metrics diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go index 7b3132bc30..167edd57fd 100644 --- a/src/runtime/metrics_test.go +++ b/src/runtime/metrics_test.go @@ -145,6 +145,10 @@ func TestReadMetricsConsistency(t *testing.T) { for i := range h.Counts { gc.pauses += h.Counts[i] } + case "/sched/goroutines:goroutines": + if samples[i].Value.Uint64() < 1 { + t.Error("number of goroutines is less than one") + } } } if totalVirtual.got != totalVirtual.want { -- cgit v1.2.1 From 32d0eaa44e2d83cff6f0c1fa3d58af7627f3cd99 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 17 Sep 2020 21:19:28 +0000 Subject: runtime: implement dumpmemstats in terms of readmemstats_m Since MemStats is now populated directly and some values are derived, avoid duplicating the logic by instead populating the heap dump directly from MemStats (external version) instead of memstats (runtime internal version). 
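The function being reworked here backs runtime/debug.WriteHeapDump; a minimal caller, for context (the output path is chosen arbitrarily for this sketch):

    package main

    import (
        "os"
        "runtime/debug"
    )

    func main() {
        f, err := os.Create("heapdump.out")
        if err != nil {
            panic(err)
        }
        defer f.Close()
        // Stops the world, then (after this change) fills a MemStats on the
        // caller's goroutine stack before dumping it to the file descriptor.
        debug.WriteHeapDump(f.Fd())
    }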
Change-Id: I0bec96bfa02d2ffd1b56475779c124a760e64238 Reviewed-on: https://go-review.googlesource.com/c/go/+/255817 Trust: Michael Knyszek Reviewed-by: Michael Pratt --- src/runtime/heapdump.go | 76 ++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 6fcd9746af..33e224d587 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -20,8 +20,19 @@ import ( func runtime_debug_WriteHeapDump(fd uintptr) { stopTheWorld("write heap dump") + // Keep m on this G's stack instead of the system stack. + // Both readmemstats_m and writeheapdump_m have pretty large + // peak stack depths and we risk blowing the system stack. + // This is safe because the world is stopped, so we don't + // need to worry about anyone shrinking and therefore moving + // our stack. + var m MemStats systemstack(func() { - writeheapdump_m(fd) + // Call readmemstats_m here instead of deeper in + // writeheapdump_m because we might blow the system stack + // otherwise. + readmemstats_m(&m) + writeheapdump_m(fd, &m) }) startTheWorld() @@ -539,39 +550,40 @@ func dumpms() { } } -func dumpmemstats() { +//go:systemstack +func dumpmemstats(m *MemStats) { // These ints should be identical to the exported // MemStats structure and should be ordered the same // way too. dumpint(tagMemStats) - dumpint(memstats.alloc) - dumpint(memstats.total_alloc) - dumpint(memstats.sys) - dumpint(memstats.nlookup) - dumpint(memstats.nmalloc) - dumpint(memstats.nfree) - dumpint(memstats.alloc) - dumpint(memstats.heap_sys.load()) - dumpint(memstats.heap_sys.load() - memstats.heap_inuse) - dumpint(memstats.heap_inuse) - dumpint(memstats.heap_released) - dumpint(memstats.heap_objects) - dumpint(memstats.stacks_inuse) - dumpint(memstats.stacks_sys.load()) - dumpint(memstats.mspan_inuse) - dumpint(memstats.mspan_sys.load()) - dumpint(memstats.mcache_inuse) - dumpint(memstats.mcache_sys.load()) - dumpint(memstats.buckhash_sys.load()) - dumpint(memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse) - dumpint(memstats.other_sys.load()) - dumpint(memstats.next_gc) - dumpint(memstats.last_gc_unix) - dumpint(memstats.pause_total_ns) + dumpint(m.Alloc) + dumpint(m.TotalAlloc) + dumpint(m.Sys) + dumpint(m.Lookups) + dumpint(m.Mallocs) + dumpint(m.Frees) + dumpint(m.HeapAlloc) + dumpint(m.HeapSys) + dumpint(m.HeapIdle) + dumpint(m.HeapInuse) + dumpint(m.HeapReleased) + dumpint(m.HeapObjects) + dumpint(m.StackInuse) + dumpint(m.StackSys) + dumpint(m.MSpanInuse) + dumpint(m.MSpanSys) + dumpint(m.MCacheInuse) + dumpint(m.MCacheSys) + dumpint(m.BuckHashSys) + dumpint(m.GCSys) + dumpint(m.OtherSys) + dumpint(m.NextGC) + dumpint(m.LastGC) + dumpint(m.PauseTotalNs) for i := 0; i < 256; i++ { - dumpint(memstats.pause_ns[i]) + dumpint(m.PauseNs[i]) } - dumpint(uint64(memstats.numgc)) + dumpint(uint64(m.NumGC)) } func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, frees uintptr) { @@ -642,7 +654,7 @@ func dumpmemprof() { var dumphdr = []byte("go1.7 heap dump\n") -func mdump() { +func mdump(m *MemStats) { // make sure we're done sweeping for _, s := range mheap_.allspans { if s.state.get() == mSpanInUse { @@ -657,13 +669,13 @@ func mdump() { dumpgs() dumpms() dumproots() - dumpmemstats() + dumpmemstats(m) dumpmemprof() dumpint(tagEOF) flush() } -func writeheapdump_m(fd uintptr) { +func writeheapdump_m(fd uintptr, m *MemStats) { _g_ := getg() casgstatus(_g_.m.curg, 
_Grunning, _Gwaiting) _g_.waitreason = waitReasonDumpingHeap @@ -677,7 +689,7 @@ func writeheapdump_m(fd uintptr) { dumpfd = fd // Call dump routine. - mdump() + mdump(m) // Reset dump file. dumpfd = 0 -- cgit v1.2.1 From 76bce1dd52b0c2a06d48bf7db4e89e8dec47c507 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Tue, 14 Jul 2020 21:45:16 +0000 Subject: runtime: implement addrRanges.findSucc with a binary search This change modifies addrRanges.findSucc to more efficiently find the successor range in an addrRanges by using a binary search to narrow down large addrRanges and iterate over no more than 8 addrRanges. This change makes the runtime more robust against systems that may aggressively randomize the address space mappings it gives the runtime (e.g. Fuchsia). For #40191. Change-Id: If529df2abd2edb1b1496d8690ddd284ecd7138c2 Reviewed-on: https://go-review.googlesource.com/c/go/+/242679 Trust: Michael Knyszek Reviewed-by: Austin Clements Reviewed-by: Michael Pratt --- src/runtime/mranges.go | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go index 16acadcff1..84a2c06dbb 100644 --- a/src/runtime/mranges.go +++ b/src/runtime/mranges.go @@ -172,20 +172,46 @@ func (a *addrRanges) init(sysStat *sysMemStat) { a.totalBytes = 0 } -// findSucc returns the first index in a such that base is +// findSucc returns the first index in a such that addr is // less than the base of the addrRange at that index. func (a *addrRanges) findSucc(addr uintptr) int { - // TODO(mknyszek): Consider a binary search for large arrays. - // While iterating over these ranges is potentially expensive, - // the expected number of ranges is small, ideally just 1, - // since Go heaps are usually mostly contiguous. base := offAddr{addr} - for i := range a.ranges { + + // Narrow down the search space via a binary search + // for large addrRanges until we have at most iterMax + // candidates left. + const iterMax = 8 + bot, top := 0, len(a.ranges) + for top-bot > iterMax { + i := ((top - bot) / 2) + bot + if a.ranges[i].contains(base.addr()) { + // a.ranges[i] contains base, so + // its successor is the next index. + return i + 1 + } + if base.lessThan(a.ranges[i].base) { + // In this case i might actually be + // the successor, but we can't be sure + // until we check the ones before it. + top = i + } else { + // In this case we know base is + // greater than or equal to a.ranges[i].limit-1, + // so i is definitely not the successor. + // We already checked i, so pick the next + // one. + bot = i + 1 + } + } + // There are top-bot candidates left, so + // iterate over them and find the first that + // base is strictly less than. + for i := bot; i < top; i++ { if base.lessThan(a.ranges[i].base) { return i } } - return len(a.ranges) + return top } // findAddrGreaterEqual returns the smallest address represented by a -- cgit v1.2.1 From 8fdc79e18a9704185bd6471b592db1e8004bd993 Mon Sep 17 00:00:00 2001 From: Chris Hines Date: Fri, 1 May 2020 17:04:36 -0400 Subject: runtime: reduce timer latency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change the scheduler to treat expired timers with the same approach it uses to steal runnable G's. Previously the scheduler ignored timers on P's not marked for preemption. That had the downside that any G's waiting on those expired timers starved until the G running on their P completed or was preempted. 
That could take as long as 20ms if sysmon was in a 10ms wake up cycle. In addition, a spinning P that ignored an expired timer and found no other work would stop despite there being available work, missing the opportunity for greater parallelism. With this change the scheduler no longer ignores timers on non-preemptable P's or relies on sysmon as a backstop to start threads when timers expire. Instead it wakes an idle P, if needed, when creating a new timer because it cannot predict if the current P will have a scheduling opportunity before the new timer expires. The P it wakes will determine how long to sleep and block on the netpoller for the required time, potentially stealing the new timer when it wakes. This change also eliminates a race between a spinning P transitioning to idle concurrently with timer creation using the same pattern used for submission of new goroutines in the same window. Benchmark analysis: CL 232199, which was included in Go 1.15 improved timer latency over Go 1.14 by allowing P's to steal timers from P's not marked for preemption. The benchmarks added in this CL measure that improvement in the ParallelTimerLatency benchmark seen below. However, Go 1.15 still relies on sysmon to notice expired timers in some situations and sysmon can sleep for up to 10ms before waking to check timers. This CL fixes that shortcoming with modest regression on other benchmarks. name \ avg-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 17.3M ± 3% 7.9M ± 0% 0.2M ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 53.4k ±23% 50.7k ±31% 252.4k ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 204k ±14% 90k ±58% 188k ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 1.17M ± 0% 0.11M ± 5% 0.11M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 1.81M ±44% 0.10M ± 4% 0.10M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 2.28M ±66% 0.09M ±13% 0.08M ±21% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 2.84M ±85% 0.07M ±15% 0.07M ±18% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 2.13M ±27% 0.06M ± 4% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 2.63M ± 6% 0.06M ±11% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 3.32M ±17% 0.06M ±16% 0.07M ±14% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 8.46M ±20% 4.37M ±21% 5.03M ±23% StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 1.02M ± 1% 0.20M ± 2% 0.20M ± 2% name \ max-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 18.3M ± 1% 8.2M ± 0% 0.5M ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 141k ±19% 127k ±19% 1129k ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 2.78M ± 4% 1.23M ±15% 1.26M ± 5% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 6.05M ± 5% 0.67M ±56% 0.81M ±33% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 7.93M ±20% 0.71M ±46% 0.76M ±41% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 9.41M ±30% 0.92M ±23% 0.81M ±44% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 10.8M ±42% 0.8M ±41% 0.8M ±30% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 9.62M ±24% 0.77M ±38% 0.88M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 10.6M ±10% 0.8M ±32% 0.7M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 11.9M ±36% 0.6M ±46% 0.8M ±38% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 36.8M ±21% 24.7M ±21% 27.5M ±16% 
StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 2.12M ± 2% 1.02M ±11% 1.03M ± 7% Other time benchmarks: name \ time/op go14.time.bench go15.time.bench fix.time.bench AfterFunc-8 137µs ± 4% 123µs ± 4% 131µs ± 2% After-8 212µs ± 3% 195µs ± 4% 204µs ± 7% Stop-8 165µs ± 6% 156µs ± 2% 151µs ±12% SimultaneousAfterFunc-8 260µs ± 3% 248µs ± 3% 284µs ± 2% StartStop-8 65.8µs ± 9% 64.4µs ± 7% 67.3µs ±15% Reset-8 13.6µs ± 2% 9.6µs ± 2% 9.1µs ± 4% Sleep-8 307µs ± 4% 306µs ± 3% 320µs ± 2% Ticker-8 53.0µs ± 5% 54.5µs ± 5% 57.0µs ±11% TickerReset-8 9.24µs ± 2% 9.51µs ± 3% TickerResetNaive-8 149µs ± 5% 145µs ± 5% Fixes #38860 Updates #25471 Updates #27707 Change-Id: If52680509b0f3b66dbd1d0c13fa574bd2d0bbd57 Reviewed-on: https://go-review.googlesource.com/c/go/+/232298 Run-TryBot: Alberto Donizetti TryBot-Result: Go Bot Reviewed-by: Austin Clements Trust: Ian Lance Taylor --- src/runtime/lockrank.go | 22 +++---- src/runtime/proc.go | 168 +++++++++++++++++++++++++++--------------------- 2 files changed, 107 insertions(+), 83 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go index 0cbbfc4f45..3f9b087856 100644 --- a/src/runtime/lockrank.go +++ b/src/runtime/lockrank.go @@ -41,12 +41,12 @@ const ( lockRankCpuprof lockRankSweep + lockRankPollDesc lockRankSched lockRankDeadlock lockRankPanic lockRankAllg lockRankAllp - lockRankPollDesc lockRankTimers // Multiple timers locked simultaneously in destroy() lockRankItab @@ -120,12 +120,12 @@ var lockNames = []string{ lockRankCpuprof: "cpuprof", lockRankSweep: "sweep", + lockRankPollDesc: "pollDesc", lockRankSched: "sched", lockRankDeadlock: "deadlock", lockRankPanic: "panic", lockRankAllg: "allg", lockRankAllp: "allp", - lockRankPollDesc: "pollDesc", lockRankTimers: "timers", lockRankItab: "itab", @@ -182,14 +182,14 @@ func (rank lockRank) String() string { return lockNames[rank] } -// lockPartialOrder is a partial order among the various lock types, listing the immediate -// ordering that has actually been observed in the runtime. Each entry (which -// corresponds to a particular lock rank) specifies the list of locks that can be -// already be held immediately "above" it. +// lockPartialOrder is a partial order among the various lock types, listing the +// immediate ordering that has actually been observed in the runtime. Each entry +// (which corresponds to a particular lock rank) specifies the list of locks +// that can already be held immediately "above" it. // -// So, for example, the lockRankSched entry shows that all the locks preceding it in -// rank can actually be held. The fin lock shows that only the sched, timers, or -// hchan lock can be held immediately above it when it is acquired. +// So, for example, the lockRankSched entry shows that all the locks preceding +// it in rank can actually be held. The allp lock shows that only the sysmon or +// sched lock can be held immediately above it when it is acquired. 
var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankDummy: {}, lockRankSysmon: {}, @@ -199,12 +199,12 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankAssistQueue: {}, lockRankCpuprof: {}, lockRankSweep: {}, - lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep}, + lockRankPollDesc: {}, + lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc}, lockRankDeadlock: {lockRankDeadlock}, lockRankPanic: {lockRankDeadlock}, lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic}, lockRankAllp: {lockRankSysmon, lockRankSched}, - lockRankPollDesc: {}, lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers}, lockRankItab: {}, lockRankReflectOffs: {lockRankItab}, diff --git a/src/runtime/proc.go b/src/runtime/proc.go index ebecc92745..6feecef985 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -2264,11 +2264,16 @@ func handoffp(_p_ *p) { startm(_p_, false) return } - if when := nobarrierWakeTime(_p_); when != 0 { - wakeNetPoller(when) - } + + // The scheduler lock cannot be held when calling wakeNetPoller below + // because wakeNetPoller may call wakep which may call startm. + when := nobarrierWakeTime(_p_) pidleput(_p_) unlock(&sched.lock) + + if when != 0 { + wakeNetPoller(when) + } } // Tries to add one more P to execute G's. @@ -2477,40 +2482,33 @@ top: _g_.m.spinning = true atomic.Xadd(&sched.nmspinning, 1) } - for i := 0; i < 4; i++ { + const stealTries = 4 + for i := 0; i < stealTries; i++ { + stealTimersOrRunNextG := i == stealTries-1 + for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() { if sched.gcwaiting != 0 { goto top } - stealRunNextG := i > 2 // first look for ready queues with more than 1 g p2 := allp[enum.position()] if _p_ == p2 { continue } - // Don't bother to attempt to steal if p2 is idle. - if !idlepMask.read(enum.position()) { - if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil { - return gp, false - } - } - - // Consider stealing timers from p2. - // This call to checkTimers is the only place where - // we hold a lock on a different P's timers. - // Lock contention can be a problem here, so - // initially avoid grabbing the lock if p2 is running - // and is not marked for preemption. If p2 is running - // and not being preempted we assume it will handle its - // own timers. + // Steal timers from p2. This call to checkTimers is the only place + // where we might hold a lock on a different P's timers. We do this + // once on the last pass before checking runnext because stealing + // from the other P's runnext should be the last resort, so if there + // are timers to steal do that first. // - // If we're still looking for work after checking all - // the P's, then go ahead and steal from an active P. + // We only check timers on one of the stealing iterations because + // the time stored in now doesn't change in this loop and checking + // the timers for each P more than once with the same value of now + // is probably a waste of time. // - // TODO(prattmic): Maintain a global look-aside similar - // to idlepMask to avoid looking at p2 if it can't - // possibly have timers. - if i > 2 || (i > 1 && shouldStealTimers(p2)) { + // TODO(prattmic): Maintain a global look-aside similar to idlepMask + // to avoid looking at p2 if it can't possibly have timers. 
+ if stealTimersOrRunNextG { tnow, w, ran := checkTimers(p2, now) now = tnow if w != 0 && (pollUntil == 0 || w < pollUntil) { @@ -2531,6 +2529,13 @@ top: ranTimer = true } } + + // Don't bother to attempt to steal if p2 is idle. + if !idlepMask.read(enum.position()) { + if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil { + return gp, false + } + } } } if ranTimer { @@ -2606,7 +2611,7 @@ stop: // drop nmspinning first and then check all per-P queues again (with // #StoreLoad memory barrier in between). If we do it the other way around, // another thread can submit a goroutine after we've checked all run queues - // but before we drop nmspinning; as the result nobody will unpark a thread + // but before we drop nmspinning; as a result nobody will unpark a thread // to run the goroutine. // If we discover new work below, we need to restore m.spinning as a signal // for resetspinning to unpark a new worker thread (because there can be more @@ -2640,6 +2645,35 @@ stop: } } + // Similar to above, check for timer creation or expiry concurrently with + // transitioning from spinning to non-spinning. Note that we cannot use + // checkTimers here because it calls adjusttimers which may need to allocate + // memory, and that isn't allowed when we don't have an active P. + for _, _p_ := range allpSnapshot { + // This is similar to nobarrierWakeTime, but minimizes calls to + // nanotime. + if atomic.Load(&_p_.adjustTimers) > 0 { + if now == 0 { + now = nanotime() + } + pollUntil = now + } else { + w := int64(atomic.Load64(&_p_.timer0When)) + if w != 0 && (pollUntil == 0 || w < pollUntil) { + pollUntil = w + } + } + } + if pollUntil != 0 { + if now == 0 { + now = nanotime() + } + delta = pollUntil - now + if delta < 0 { + delta = 0 + } + } + // Check for idle-priority GC work again. if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) { lock(&sched.lock) @@ -2735,9 +2769,9 @@ func pollWork() bool { return false } -// wakeNetPoller wakes up the thread sleeping in the network poller, -// if there is one, and if it isn't going to wake up anyhow before -// the when argument. +// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't +// going to wake up before the when argument; or it wakes an idle P to service +// timers and the network poller if there isn't one already. func wakeNetPoller(when int64) { if atomic.Load64(&sched.lastpoll) == 0 { // In findrunnable we ensure that when polling the pollUntil @@ -2748,6 +2782,10 @@ func wakeNetPoller(when int64) { if pollerPollUntil == 0 || pollerPollUntil > when { netpollBreak() } + } else { + // There are no threads in the network poller, try to get + // one there so it can handle new timers. + wakep() } } @@ -3034,25 +3072,6 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { return rnow, pollUntil, ran } -// shouldStealTimers reports whether we should try stealing the timers from p2. -// We don't steal timers from a running P that is not marked for preemption, -// on the assumption that it will run its own timers. This reduces -// contention on the timers lock. 
-func shouldStealTimers(p2 *p) bool { - if p2.status != _Prunning { - return true - } - mp := p2.m.ptr() - if mp == nil || mp.locks > 0 { - return false - } - gp := mp.curg - if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt { - return false - } - return true -} - func parkunlock_c(gp *g, lock unsafe.Pointer) bool { unlock((*mutex)(lock)) return true @@ -4603,7 +4622,7 @@ func procresize(nprocs int32) *p { } sched.procresizetime = now - maskWords := (nprocs+31) / 32 + maskWords := (nprocs + 31) / 32 // Grow allp if necessary. if nprocs > int32(len(allp)) { @@ -4927,11 +4946,28 @@ func sysmon() { } usleep(delay) mDoFixup() + + // sysmon should not enter deep sleep if schedtrace is enabled so that + // it can print that information at the right time. + // + // It should also not enter deep sleep if there are any active P's so + // that it can retake P's from syscalls, preempt long running G's, and + // poll the network if all P's are busy for long stretches. + // + // It should wakeup from deep sleep if any P's become active either due + // to exiting a syscall or waking up due to a timer expiring so that it + // can resume performing those duties. If it wakes from a syscall it + // resets idle and delay as a bet that since it had retaken a P from a + // syscall before, it may need to do it again shortly after the + // application starts work again. It does not reset idle when waking + // from a timer to avoid adding system load to applications that spend + // most of their time sleeping. now := nanotime() - next, _ := timeSleepUntil() if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { lock(&sched.lock) if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) { + syscallWake := false + next, _ := timeSleepUntil() if next > now { atomic.Store(&sched.sysmonwait, 1) unlock(&sched.lock) @@ -4945,33 +4981,27 @@ func sysmon() { if shouldRelax { osRelax(true) } - notetsleep(&sched.sysmonnote, sleep) + syscallWake = notetsleep(&sched.sysmonnote, sleep) mDoFixup() if shouldRelax { osRelax(false) } - now = nanotime() - next, _ = timeSleepUntil() lock(&sched.lock) atomic.Store(&sched.sysmonwait, 0) noteclear(&sched.sysmonnote) } - idle = 0 - delay = 20 + if syscallWake { + idle = 0 + delay = 20 + } } unlock(&sched.lock) } + lock(&sched.sysmonlock) - { - // If we spent a long time blocked on sysmonlock - // then we want to update now and next since it's - // likely stale. - now1 := nanotime() - if now1-now > 50*1000 /* 50µs */ { - next, _ = timeSleepUntil() - } - now = now1 - } + // Update now in case we blocked on sysmonnote or spent a long time + // blocked on schedlock or sysmonlock above. + now = nanotime() // trigger libc interceptors if needed if *cgo_yield != nil { @@ -4996,12 +5026,6 @@ func sysmon() { } } mDoFixup() - if next < now { - // There are timers that should have already run, - // perhaps because there is an unpreemptible P. - // Try to start an M to run them. - startm(nil, false) - } if atomic.Load(&scavenge.sysmonWake) != 0 { // Kick the scavenger awake if someone requested it. 
wakeScavenger() -- cgit v1.2.1 From c515852732a490bab64f35d001ddc444b0f0f553 Mon Sep 17 00:00:00 2001 From: Heisenberg Date: Tue, 8 Sep 2020 14:31:39 +0800 Subject: runtime: add 2-byte and 8-byte sub-benchmarks for memmove load/store Change-Id: I6389d7efe90836b6ece44d2e75053d1ad9f35d08 Reviewed-on: https://go-review.googlesource.com/c/go/+/253417 Trust: Emmanuel Odeke Reviewed-by: Keith Randall --- src/runtime/memmove_test.go | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index 396c1304c5..b549433f71 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -538,21 +538,30 @@ func BenchmarkCopyFat1024(b *testing.B) { } } +// BenchmarkIssue18740 ensures that memmove uses 4 and 8 byte load/store to move 4 and 8 bytes. +// It used to do 2 2-byte load/stores, which leads to a pipeline stall +// when we try to read the result with one 4-byte load. func BenchmarkIssue18740(b *testing.B) { - // This tests that memmove uses one 4-byte load/store to move 4 bytes. - // It used to do 2 2-byte load/stores, which leads to a pipeline stall - // when we try to read the result with one 4-byte load. - var buf [4]byte - for j := 0; j < b.N; j++ { - s := uint32(0) - for i := 0; i < 4096; i += 4 { - copy(buf[:], g[i:]) - s += binary.LittleEndian.Uint32(buf[:]) - } - sink = uint64(s) + benchmarks := []struct { + name string + nbyte int + f func([]byte) uint64 + }{ + {"2byte", 2, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint16(buf)) }}, + {"4byte", 4, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint32(buf)) }}, + {"8byte", 8, func(buf []byte) uint64 { return binary.LittleEndian.Uint64(buf) }}, + } + + var g [4096]byte + for _, bm := range benchmarks { + buf := make([]byte, bm.nbyte) + b.Run(bm.name, func(b *testing.B) { + for j := 0; j < b.N; j++ { + for i := 0; i < 4096; i += bm.nbyte { + copy(buf[:], g[i:]) + sink += bm.f(buf[:]) + } + } + }) } } - -// TODO: 2 byte and 8 byte benchmarks also. - -var g [4096]byte -- cgit v1.2.1 From 009d71409821a6ac4f1b32aaae2c856c20a29f92 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Thu, 22 Oct 2020 16:37:19 -0700 Subject: cmd/compile, runtime: store pointers to go:notinheap types indirectly pointers to go:notinheap types should be treated as scalars. That means they shouldn't be stored directly in interfaces, or directly in reflect.Value.ptr. Also be sure to use uintpr to compare such pointers in reflect.DeepEqual. 
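As a rough user-level sketch of the indirection (not part of this CL and only an illustration: it assumes the gc toolchain's two-word empty-interface layout, and the type and helper names here are made up, since go:notinheap cannot be used outside the runtime), the difference between storing a pointer directly in an interface and storing it behind an extra field looks like this:

    package main

    import (
        "fmt"
        "unsafe"
    )

    // eface mirrors the assumed empty-interface layout: a type word
    // followed by a data word.
    type eface struct {
        typ, data unsafe.Pointer
    }

    type T struct{ x uint64 }

    // holder keeps the pointer in an ordinary field so that an interface
    // can carry &h.self (a pointer to the pointer) instead of the pointer
    // itself -- the same shape as the pollDesc.self/makeArg change below.
    type holder struct{ self *T }

    func main() {
        t := &T{x: 1}
        var direct interface{} = t // data word is t itself

        h := &holder{self: t}
        var indirect interface{} = &h.self // data word is &h.self

        d := (*eface)(unsafe.Pointer(&direct))
        i := (*eface)(unsafe.Pointer(&indirect))
        fmt.Println(d.data == unsafe.Pointer(t))       // true: pointer stored directly
        fmt.Println(i.data == unsafe.Pointer(&h.self)) // true: one extra hop
        fmt.Println(*(**T)(i.data) == t)               // the original pointer is still reachable
    }

In the change below, pollDesc keeps that extra hop in its own self field, so the conversion to interface{} needs no allocation.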
Fixes #42076 Change-Id: I53735f6d434e9c3108d4940bd1bae14c61ef2a74 Reviewed-on: https://go-review.googlesource.com/c/go/+/264480 Trust: Keith Randall Reviewed-by: Ian Lance Taylor --- src/runtime/netpoll.go | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go index 34ea82a7fa..77eb3aa4c6 100644 --- a/src/runtime/netpoll.go +++ b/src/runtime/netpoll.go @@ -79,16 +79,17 @@ type pollDesc struct { lock mutex // protects the following fields fd uintptr closing bool - everr bool // marks event scanning error happened - user uint32 // user settable cookie - rseq uintptr // protects from stale read timers - rg uintptr // pdReady, pdWait, G waiting for read or nil - rt timer // read deadline timer (set if rt.f != nil) - rd int64 // read deadline - wseq uintptr // protects from stale write timers - wg uintptr // pdReady, pdWait, G waiting for write or nil - wt timer // write deadline timer - wd int64 // write deadline + everr bool // marks event scanning error happened + user uint32 // user settable cookie + rseq uintptr // protects from stale read timers + rg uintptr // pdReady, pdWait, G waiting for read or nil + rt timer // read deadline timer (set if rt.f != nil) + rd int64 // read deadline + wseq uintptr // protects from stale write timers + wg uintptr // pdReady, pdWait, G waiting for write or nil + wt timer // write deadline timer + wd int64 // write deadline + self *pollDesc // storage for indirect interface. See (*pollDesc).makeArg. } type pollCache struct { @@ -157,6 +158,7 @@ func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) { pd.wseq++ pd.wg = 0 pd.wd = 0 + pd.self = pd unlock(&pd.lock) var errno int32 @@ -271,14 +273,14 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) { // Copy current seq into the timer arg. // Timer func will check the seq against current descriptor seq, // if they differ the descriptor was reused or timers were reset. - pd.rt.arg = pd + pd.rt.arg = pd.makeArg() pd.rt.seq = pd.rseq resettimer(&pd.rt, pd.rd) } } else if pd.rd != rd0 || combo != combo0 { pd.rseq++ // invalidate current timers if pd.rd > 0 { - modtimer(&pd.rt, pd.rd, 0, rtf, pd, pd.rseq) + modtimer(&pd.rt, pd.rd, 0, rtf, pd.makeArg(), pd.rseq) } else { deltimer(&pd.rt) pd.rt.f = nil @@ -287,14 +289,14 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) { if pd.wt.f == nil { if pd.wd > 0 && !combo { pd.wt.f = netpollWriteDeadline - pd.wt.arg = pd + pd.wt.arg = pd.makeArg() pd.wt.seq = pd.wseq resettimer(&pd.wt, pd.wd) } } else if pd.wd != wd0 || combo != combo0 { pd.wseq++ // invalidate current timers if pd.wd > 0 && !combo { - modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd, pd.wseq) + modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd.makeArg(), pd.wseq) } else { deltimer(&pd.wt) pd.wt.f = nil @@ -547,3 +549,20 @@ func (c *pollCache) alloc() *pollDesc { unlock(&c.lock) return pd } + +// makeArg converts pd to an interface{}. +// makeArg does not do any allocation. Normally, such +// a conversion requires an allocation because pointers to +// go:notinheap types (which pollDesc is) must be stored +// in interfaces indirectly. See issue 42076. 
+func (pd *pollDesc) makeArg() (i interface{}) { + x := (*eface)(unsafe.Pointer(&i)) + x._type = pdType + x.data = unsafe.Pointer(&pd.self) + return +} + +var ( + pdEface interface{} = (*pollDesc)(nil) + pdType *_type = efaceOf(&pdEface)._type +) -- cgit v1.2.1 From 091257def92b0280b07bde9536b7cdf5f3b02aec Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 27 Oct 2020 14:15:00 -0700 Subject: cmd/compile: print pointers to go:notinheap types without converting to unsafe.Pointer Pretty minor concern, but after auditing the compiler/runtime for conversions from pointers to go:notinheap types to unsafe.Pointer, this is the only remaining one I found. Update #42076 Change-Id: I81d5b893c9ada2fc19a51c2559262f2e9ff71c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265757 Trust: Keith Randall Run-TryBot: Keith Randall TryBot-Result: Go Bot Reviewed-by: Matthew Dempsky --- src/runtime/print.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/print.go b/src/runtime/print.go index e605eb34cb..64055a34cc 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -237,6 +237,9 @@ func printhex(v uint64) { func printpointer(p unsafe.Pointer) { printhex(uint64(uintptr(p))) } +func printuintptr(p uintptr) { + printhex(uint64(p)) +} func printstring(s string) { gwrite(bytes(s)) -- cgit v1.2.1 From b4b014465216790e01aa66f9120d03230e4aff46 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Tue, 29 Sep 2020 17:01:33 -0700 Subject: runtime: don't always adjust timers Some programs have a lot of timers that they adjust both forward and backward in time. This can cause a large number of timerModifiedEarlier timers. In practice these timers are used for I/O deadlines and are rarely reached. The effect is that the runtime spends a lot of time in adjusttimers making sure that there are no timerModifiedEarlier timers, but the effort is wasted because none of the adjusted timers are near the top of the timer heap anyhow. Avoid much of this extra work by keeping track of the earliest known timerModifiedEarlier timer. This lets us skip adjusttimers if we know that none of the timers will be ready to run anyhow. We will still eventually run it, when we reach the deadline of the earliest known timerModifiedEarlier, although in practice that timer has likely been removed. When we do run adjusttimers, we will reset all of the timerModifiedEarlier timers, and clear our notion of when we need to run adjusttimers again. This effect should be to significantly reduce the number of times we walk through the timer list in adjusttimers. Fixes #41699 Change-Id: I38eb2be611fb34e3017bb33d0a9ed40d75fb414f Reviewed-on: https://go-review.googlesource.com/c/go/+/258303 Trust: Ian Lance Taylor Trust: Emmanuel Odeke Run-TryBot: Ian Lance Taylor TryBot-Result: Go Bot Reviewed-by: Michael Knyszek --- src/runtime/proc.go | 52 +++++++++++++------------- src/runtime/runtime2.go | 7 ++++ src/runtime/time.go | 99 +++++++++++++++++++++++++++---------------------- 3 files changed, 88 insertions(+), 70 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 6feecef985..87d4b6e568 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -3017,40 +3017,40 @@ func dropg() { // We pass now in and out to avoid extra calls of nanotime. 
//go:yeswritebarrierrec func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { - // If there are no timers to adjust, and the first timer on - // the heap is not yet ready to run, then there is nothing to do. - if atomic.Load(&pp.adjustTimers) == 0 { - next := int64(atomic.Load64(&pp.timer0When)) - if next == 0 { - return now, 0, false - } - if now == 0 { - now = nanotime() - } - if now < next { - // Next timer is not ready to run. - // But keep going if we would clear deleted timers. - // This corresponds to the condition below where - // we decide whether to call clearDeletedTimers. - if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) { - return now, next, false - } + // If it's not yet time for the first timer, or the first adjusted + // timer, then there is nothing to do. + next := int64(atomic.Load64(&pp.timer0When)) + nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest)) + if next == 0 || (nextAdj != 0 && nextAdj < next) { + next = nextAdj + } + + if next == 0 { + // No timers to run or adjust. + return now, 0, false + } + + if now == 0 { + now = nanotime() + } + if now < next { + // Next timer is not ready to run, but keep going + // if we would clear deleted timers. + // This corresponds to the condition below where + // we decide whether to call clearDeletedTimers. + if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) { + return now, next, false } } lock(&pp.timersLock) - adjusttimers(pp) - - rnow = now if len(pp.timers) > 0 { - if rnow == 0 { - rnow = nanotime() - } + adjusttimers(pp, now) for len(pp.timers) > 0 { // Note that runtimer may temporarily unlock // pp.timersLock. - if tw := runtimer(pp, rnow); tw != 0 { + if tw := runtimer(pp, now); tw != 0 { if tw > 0 { pollUntil = tw } @@ -3069,7 +3069,7 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { unlock(&pp.timersLock) - return rnow, pollUntil, ran + return now, pollUntil, ran } func parkunlock_c(gp *g, lock unsafe.Pointer) bool { diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 7bac5fd38d..a2e4411c7d 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -646,6 +646,13 @@ type p struct { // This is 0 if the timer heap is empty. timer0When uint64 + // The earliest known nextwhen field of a timer with + // timerModifiedEarlier status. Because the timer may have been + // modified again, there need not be any timer with this value. + // This is updated using atomic functions. + // This is 0 if the value is unknown. + timerModifiedEarliest uint64 + // Per-P GC state gcAssistTime int64 // Nanoseconds in assistAlloc gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic) diff --git a/src/runtime/time.go b/src/runtime/time.go index f895bf8443..99290f66d0 100644 --- a/src/runtime/time.go +++ b/src/runtime/time.go @@ -491,6 +491,8 @@ loop: newStatus = timerModifiedEarlier } + tpp := t.pp.ptr() + // Update the adjustTimers field. Subtract one if we // are removing a timerModifiedEarlier, add one if we // are adding a timerModifiedEarlier. @@ -500,9 +502,10 @@ loop: } if newStatus == timerModifiedEarlier { adjust++ + updateTimerModifiedEarliest(tpp, when) } if adjust != 0 { - atomic.Xadd(&t.pp.ptr().adjustTimers, adjust) + atomic.Xadd(&tpp.adjustTimers, adjust) } // Set the new status of the timer. @@ -637,16 +640,36 @@ func moveTimers(pp *p, timers []*timer) { // the correct place in the heap. 
While looking for those timers, // it also moves timers that have been modified to run later, // and removes deleted timers. The caller must have locked the timers for pp. -func adjusttimers(pp *p) { - if len(pp.timers) == 0 { - return - } +func adjusttimers(pp *p, now int64) { if atomic.Load(&pp.adjustTimers) == 0 { if verifyTimers { verifyTimerHeap(pp) } + // There are no timers to adjust, so it is safe to clear + // timerModifiedEarliest. Do so in case it is stale. + // Everything will work if we don't do this, + // but clearing here may save future calls to adjusttimers. + atomic.Store64(&pp.timerModifiedEarliest, 0) return } + + // If we haven't yet reached the time of the first timerModifiedEarlier + // timer, don't do anything. This speeds up programs that adjust + // a lot of timers back and forth if the timers rarely expire. + // We'll postpone looking through all the adjusted timers until + // one would actually expire. + if first := atomic.Load64(&pp.timerModifiedEarliest); first != 0 { + if int64(first) > now { + if verifyTimers { + verifyTimerHeap(pp) + } + return + } + + // We are going to clear all timerModifiedEarlier timers. + atomic.Store64(&pp.timerModifiedEarliest, 0) + } + var moved []*timer loop: for i := 0; i < len(pp.timers); i++ { @@ -868,6 +891,10 @@ func runOneTimer(pp *p, t *timer, now int64) { // // The caller must have locked the timers for pp. func clearDeletedTimers(pp *p) { + // We are going to clear all timerModifiedEarlier timers. + // Do this now in case new ones show up while we are looping. + atomic.Store64(&pp.timerModifiedEarliest, 0) + cdel := int32(0) cearlier := int32(0) to := 0 @@ -977,6 +1004,21 @@ func updateTimer0When(pp *p) { } } +// updateTimerModifiedEarliest updates the recorded nextwhen field of the +// earlier timerModifiedEarier value. +// The timers for pp will not be locked. +func updateTimerModifiedEarliest(pp *p, nextwhen int64) { + for { + old := atomic.Load64(&pp.timerModifiedEarliest) + if old != 0 && int64(old) < nextwhen { + return + } + if atomic.Cas64(&pp.timerModifiedEarliest, old, uint64(nextwhen)) { + return + } + } +} + // timeSleepUntil returns the time when the next timer should fire, // and the P that holds the timer heap that that timer is on. // This is only called by sysmon and checkdead. @@ -993,48 +1035,17 @@ func timeSleepUntil() (int64, *p) { continue } - c := atomic.Load(&pp.adjustTimers) - if c == 0 { - w := int64(atomic.Load64(&pp.timer0When)) - if w != 0 && w < next { - next = w - pret = pp - } - continue + w := int64(atomic.Load64(&pp.timer0When)) + if w != 0 && w < next { + next = w + pret = pp } - lock(&pp.timersLock) - for _, t := range pp.timers { - switch s := atomic.Load(&t.status); s { - case timerWaiting: - if t.when < next { - next = t.when - } - case timerModifiedEarlier, timerModifiedLater: - if t.nextwhen < next { - next = t.nextwhen - } - if s == timerModifiedEarlier { - c-- - } - } - // The timers are sorted, so we only have to check - // the first timer for each P, unless there are - // some timerModifiedEarlier timers. The number - // of timerModifiedEarlier timers is in the adjustTimers - // field, used to initialize c, above. - // - // We don't worry about cases like timerModifying. - // New timers can show up at any time, - // so this function is necessarily imprecise. - // Do a signed check here since we aren't - // synchronizing the read of pp.adjustTimers - // with the check of a timer status. 
- if int32(c) <= 0 { - break - } + w = int64(atomic.Load64(&pp.timerModifiedEarliest)) + if w != 0 && w < next { + next = w + pret = pp } - unlock(&pp.timersLock) } unlock(&allpLock) -- cgit v1.2.1 From 49b017fe59bf628795f2c4fdbcb5db942e865fa9 Mon Sep 17 00:00:00 2001 From: George Tsilias Date: Thu, 4 Jun 2020 23:11:56 +0300 Subject: runtime: handle signal 34 for musl setgid It has been observed that setgid hangs when using cgo with musl. This fix ensures that signal 34 gets handled in an appropriate way, like signal 33 when using glibc. Fixes #39343 Change-Id: I89565663e2c361f62cbccfe80aaedf290bd58d57 Reviewed-on: https://go-review.googlesource.com/c/go/+/236518 Run-TryBot: Tobias Klauser TryBot-Result: Go Bot Trust: Tobias Klauser Reviewed-by: Ian Lance Taylor --- src/runtime/sigtab_linux_generic.go | 2 +- src/runtime/sigtab_linux_mipsx.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/sigtab_linux_generic.go b/src/runtime/sigtab_linux_generic.go index b26040b803..38d686544f 100644 --- a/src/runtime/sigtab_linux_generic.go +++ b/src/runtime/sigtab_linux_generic.go @@ -45,7 +45,7 @@ var sigtable = [...]sigTabT{ /* 31 */ {_SigThrow, "SIGSYS: bad system call"}, /* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */ /* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */ - /* 34 */ {_SigNotify, "signal 34"}, + /* 34 */ {_SigSetStack + _SigUnblock, "signal 34"}, /* musl SIGSYNCCALL; see issue 39343 */ /* 35 */ {_SigNotify, "signal 35"}, /* 36 */ {_SigNotify, "signal 36"}, /* 37 */ {_SigNotify, "signal 37"}, diff --git a/src/runtime/sigtab_linux_mipsx.go b/src/runtime/sigtab_linux_mipsx.go index 81dd2314c5..51ef470ce7 100644 --- a/src/runtime/sigtab_linux_mipsx.go +++ b/src/runtime/sigtab_linux_mipsx.go @@ -42,7 +42,7 @@ var sigtable = [...]sigTabT{ /* 31 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, /* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */ /* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */ - /* 34 */ {_SigNotify, "signal 34"}, + /* 34 */ {_SigSetStack + _SigUnblock, "signal 34"}, /* musl SIGSYNCCALL; see issue 39343 */ /* 35 */ {_SigNotify, "signal 35"}, /* 36 */ {_SigNotify, "signal 36"}, /* 37 */ {_SigNotify, "signal 37"}, -- cgit v1.2.1 From 368c40116434532dc0b53b72fa04788ca6742898 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Tue, 27 Oct 2020 16:09:40 -0700 Subject: runtime: block signals in needm before allocating M Otherwise, if a signal occurs just after we allocated the M, we can deadlock if the signal handler needs to allocate an M itself. Fixes #42207 Change-Id: I76f44547f419e8b1c14cbf49bf602c6e645d8c14 Reviewed-on: https://go-review.googlesource.com/c/go/+/265759 Trust: Ian Lance Taylor Run-TryBot: Ian Lance Taylor TryBot-Result: Go Bot Reviewed-by: Bryan C. 
Mills --- src/runtime/crash_unix_test.go | 9 +++ src/runtime/os_js.go | 2 +- src/runtime/os_plan9.go | 2 +- src/runtime/os_windows.go | 2 +- src/runtime/proc.go | 26 ++++--- src/runtime/signal_unix.go | 8 +- src/runtime/testdata/testprogcgo/needmdeadlock.go | 95 +++++++++++++++++++++++ 7 files changed, 127 insertions(+), 17 deletions(-) create mode 100644 src/runtime/testdata/testprogcgo/needmdeadlock.go (limited to 'src/runtime') diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go index fc87f37408..7aba3d4846 100644 --- a/src/runtime/crash_unix_test.go +++ b/src/runtime/crash_unix_test.go @@ -358,3 +358,12 @@ func TestSignalM(t *testing.T) { t.Fatalf("signal sent to M %d, but received on M %d", want, got) } } + +// Issue #42207. +func TestNeedmDeadlock(t *testing.T) { + output := runTestProg(t, "testprogcgo", "NeedmDeadlock") + want := "OK\n" + if output != want { + t.Fatalf("want %s, got %s\n", want, output) + } +} diff --git a/src/runtime/os_js.go b/src/runtime/os_js.go index ff0ee3aa6b..94983b358d 100644 --- a/src/runtime/os_js.go +++ b/src/runtime/os_js.go @@ -59,7 +59,7 @@ func mpreinit(mp *m) { } //go:nosplit -func msigsave(mp *m) { +func sigsave(p *sigset) { } //go:nosplit diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go index f3037a7508..62aecea060 100644 --- a/src/runtime/os_plan9.go +++ b/src/runtime/os_plan9.go @@ -184,7 +184,7 @@ func mpreinit(mp *m) { mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, true)) } -func msigsave(mp *m) { +func sigsave(p *sigset) { } func msigrestore(sigmask sigset) { diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go index 9dd140c952..ffb087f9db 100644 --- a/src/runtime/os_windows.go +++ b/src/runtime/os_windows.go @@ -873,7 +873,7 @@ func mpreinit(mp *m) { } //go:nosplit -func msigsave(mp *m) { +func sigsave(p *sigset) { } //go:nosplit diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 87d4b6e568..b335e1184d 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -598,7 +598,7 @@ func schedinit() { typelinksinit() // uses maps, activeModules itabsinit() // uses activeModules - msigsave(_g_.m) + sigsave(&_g_.m.sigmask) initSigmask = _g_.m.sigmask goargs() @@ -1707,6 +1707,18 @@ func needm() { exit(1) } + // Save and block signals before getting an M. + // The signal handler may call needm itself, + // and we must avoid a deadlock. Also, once g is installed, + // any incoming signals will try to execute, + // but we won't have the sigaltstack settings and other data + // set up appropriately until the end of minit, which will + // unblock the signals. This is the same dance as when + // starting a new m to run Go code via newosproc. + var sigmask sigset + sigsave(&sigmask) + sigblock() + // Lock extra list, take head, unlock popped list. // nilokay=false is safe here because of the invariant above, // that the extra list always contains or will soon contain @@ -1724,14 +1736,8 @@ func needm() { extraMCount-- unlockextra(mp.schedlink.ptr()) - // Save and block signals before installing g. - // Once g is installed, any incoming signals will try to execute, - // but we won't have the sigaltstack settings and other data - // set up appropriately until the end of minit, which will - // unblock the signals. This is the same dance as when - // starting a new m to run Go code via newosproc. - msigsave(mp) - sigblock() + // Store the original signal mask for use by minit. + mp.sigmask = sigmask // Install g (= m->g0) and set the stack bounds // to match the current stack. 
We don't actually know @@ -3676,7 +3682,7 @@ func beforefork() { // a signal handler before exec if a signal is sent to the process // group. See issue #18600. gp.m.locks++ - msigsave(gp.m) + sigsave(&gp.m.sigmask) sigblock() // This function is called before fork in syscall package. diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index 9318a9b8bc..bf4a319b37 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -1031,15 +1031,15 @@ func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool { return true } -// msigsave saves the current thread's signal mask into mp.sigmask. +// sigsave saves the current thread's signal mask into *p. // This is used to preserve the non-Go signal mask when a non-Go // thread calls a Go function. // This is nosplit and nowritebarrierrec because it is called by needm // which may be called on a non-Go thread with no g available. //go:nosplit //go:nowritebarrierrec -func msigsave(mp *m) { - sigprocmask(_SIG_SETMASK, nil, &mp.sigmask) +func sigsave(p *sigset) { + sigprocmask(_SIG_SETMASK, nil, p) } // msigrestore sets the current thread's signal mask to sigmask. @@ -1111,7 +1111,7 @@ func minitSignalStack() { // thread's signal mask. When this is called all signals have been // blocked for the thread. This starts with m.sigmask, which was set // either from initSigmask for a newly created thread or by calling -// msigsave if this is a non-Go thread calling a Go function. It +// sigsave if this is a non-Go thread calling a Go function. It // removes all essential signals from the mask, thus causing those // signals to not be blocked. Then it sets the thread's signal mask. // After this is called the thread can receive signals. diff --git a/src/runtime/testdata/testprogcgo/needmdeadlock.go b/src/runtime/testdata/testprogcgo/needmdeadlock.go new file mode 100644 index 0000000000..5a9c359006 --- /dev/null +++ b/src/runtime/testdata/testprogcgo/needmdeadlock.go @@ -0,0 +1,95 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +// This is for issue #42207. +// During a call to needm we could get a SIGCHLD signal +// which would itself call needm, causing a deadlock. + +/* +#include +#include +#include +#include + +extern void GoNeedM(); + +#define SIGNALERS 10 + +static void* needmSignalThread(void* p) { + pthread_t* pt = (pthread_t*)(p); + int i; + + for (i = 0; i < 100; i++) { + if (pthread_kill(*pt, SIGCHLD) < 0) { + return NULL; + } + usleep(1); + } + return NULL; +} + +// We don't need many calls, as the deadlock is only likely +// to occur the first couple of times that needm is called. +// After that there will likely be an extra M available. +#define CALLS 10 + +static void* needmCallbackThread(void* p) { + int i; + + for (i = 0; i < SIGNALERS; i++) { + sched_yield(); // Help the signal threads get started. 
+ } + for (i = 0; i < CALLS; i++) { + GoNeedM(); + } + return NULL; +} + +static void runNeedmSignalThread() { + int i; + pthread_t caller; + pthread_t s[SIGNALERS]; + + pthread_create(&caller, NULL, needmCallbackThread, NULL); + for (i = 0; i < SIGNALERS; i++) { + pthread_create(&s[i], NULL, needmSignalThread, &caller); + } + for (i = 0; i < SIGNALERS; i++) { + pthread_join(s[i], NULL); + } + pthread_join(caller, NULL); +} +*/ +import "C" + +import ( + "fmt" + "os" + "time" +) + +func init() { + register("NeedmDeadlock", NeedmDeadlock) +} + +//export GoNeedM +func GoNeedM() { +} + +func NeedmDeadlock() { + // The failure symptom is that the program hangs because of a + // deadlock in needm, so set an alarm. + go func() { + time.Sleep(5 * time.Second) + fmt.Println("Hung for 5 seconds") + os.Exit(1) + }() + + C.runNeedmSignalThread() + fmt.Println("OK") +} -- cgit v1.2.1 From 02335cf4131f4eb1869f50f906e993676f7f414a Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Tue, 27 Oct 2020 21:05:13 -0700 Subject: runtime: move TestNeedmDeadlock to crash_cgo_test.go It requires cgo. Also, skip the test on windows and plan9. For #42207 Change-Id: I8522773f93bc3f9826506a41a08b86a083262e31 Reviewed-on: https://go-review.googlesource.com/c/go/+/265778 Trust: Ian Lance Taylor Run-TryBot: Ian Lance Taylor Reviewed-by: Brad Fitzpatrick --- src/runtime/crash_cgo_test.go | 13 +++++++++++++ src/runtime/crash_unix_test.go | 9 --------- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go index b200984050..0680d07a32 100644 --- a/src/runtime/crash_cgo_test.go +++ b/src/runtime/crash_cgo_test.go @@ -600,3 +600,16 @@ func TestEINTR(t *testing.T) { t.Fatalf("want %s, got %s\n", want, output) } } + +// Issue #42207. +func TestNeedmDeadlock(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no signals on %s", runtime.GOOS) + } + output := runTestProg(t, "testprogcgo", "NeedmDeadlock") + want := "OK\n" + if output != want { + t.Fatalf("want %s, got %s\n", want, output) + } +} diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go index 7aba3d4846..fc87f37408 100644 --- a/src/runtime/crash_unix_test.go +++ b/src/runtime/crash_unix_test.go @@ -358,12 +358,3 @@ func TestSignalM(t *testing.T) { t.Fatalf("signal sent to M %d, but received on M %d", want, got) } } - -// Issue #42207. -func TestNeedmDeadlock(t *testing.T) { - output := runTestProg(t, "testprogcgo", "NeedmDeadlock") - want := "OK\n" - if output != want { - t.Fatalf("want %s, got %s\n", want, output) - } -} -- cgit v1.2.1 From 150d2448e5a213cd679396371c0a147918dc2125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Derkacz?= Date: Sun, 14 Jun 2020 00:06:24 +0200 Subject: cmd/compile,cmd/internal/obj/riscv,runtime: use Duff's devices on riscv64 Implement runtime.duffzero and runtime.duffcopy for riscv64. Use obj.ADUFFZERO/obj.ADUFFCOPY for medium size, word aligned zeroing/moving. 
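For readers unfamiliar with the pattern: duffzero and duffcopy are single long unrolled runs of store-and-advance (duffzero) or load/store-and-advance (duffcopy) steps ending in RET, and the compiler emits a call that enters the body partway through, so only as many steps as the object needs actually execute. A minimal Go sketch of that control-flow idea (a hypothetical helper using fallthrough, since Go has no computed goto; it is not the generated riscv64 code):

    package main

    import "fmt"

    // zeroTail zeroes the last n words of dst (0 <= n <= 4) by entering an
    // unrolled sequence of stores partway through and falling through to
    // the end -- the control-flow idea behind a Duff's device.
    func zeroTail(dst *[4]uint64, n int) {
        switch n {
        case 4:
            dst[0] = 0
            fallthrough
        case 3:
            dst[1] = 0
            fallthrough
        case 2:
            dst[2] = 0
            fallthrough
        case 1:
            dst[3] = 0
        }
    }

    func main() {
        a := [4]uint64{1, 2, 3, 4}
        zeroTail(&a, 3) // zero the last three words
        fmt.Println(a)  // [1 0 0 0]
    }

Entering one shared unrolled body at an offset keeps these medium-size, word-aligned zero/copy operations free of per-iteration loop overhead while keeping code size bounded.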
Change-Id: I42ec622055630c94cb77e286d8d33dbe7c9f846c Reviewed-on: https://go-review.googlesource.com/c/go/+/237797 Run-TryBot: Cherry Zhang Reviewed-by: Joel Sing Reviewed-by: Cherry Zhang --- src/runtime/duff_riscv64.s | 907 +++++++++++++++++++++++++++++++++++++++++++++ src/runtime/mkduff.go | 28 ++ 2 files changed, 935 insertions(+) create mode 100644 src/runtime/duff_riscv64.s (limited to 'src/runtime') diff --git a/src/runtime/duff_riscv64.s b/src/runtime/duff_riscv64.s new file mode 100644 index 0000000000..f7bd3f326e --- /dev/null +++ b/src/runtime/duff_riscv64.s @@ -0,0 +1,907 @@ +// Code generated by mkduff.go; DO NOT EDIT. +// Run go generate from src/runtime to update. +// See mkduff.go for comments. + +#include "textflag.h" + +TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV 
ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + RET + +TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0 + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, 
(X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
	[... the same four-instruction copy-and-advance block repeats for the remaining unrolled iterations of the generated duffcopy ...]
+	RET
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index 8859ed68cc..6ddf0256e9 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -38,6 +38,7 @@ func main() {
 	gen("arm64", notags, zeroARM64, copyARM64)
 	gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
 	gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)
+	gen("riscv64", notags, zeroRISCV64, copyRISCV64)
 }
 
 func gen(arch string, tags, zero, copy func(io.Writer)) {
@@ -227,3 +228,30 @@ func copyMIPS64x(w io.Writer) {
 	}
 	fmt.Fprintln(w, "\tRET")
 }
+
+func zeroRISCV64(w io.Writer) {
+	// ZERO: always zero
+	// X10: ptr to memory to be zeroed
+	// X10 is updated as a side effect.
+	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOV\tZERO, (X10)")
+		fmt.Fprintln(w, "\tADD\t$8, X10")
+	}
+	fmt.Fprintln(w, "\tRET")
+}
+
+func copyRISCV64(w io.Writer) {
+	// X10: ptr to source memory
+	// X11: ptr to destination memory
+	// X10 and X11 are updated as a side effect
+	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOV\t(X10), X31")
+		fmt.Fprintln(w, "\tADD\t$8, X10")
+		fmt.Fprintln(w, "\tMOV\tX31, (X11)")
+		fmt.Fprintln(w, "\tADD\t$8, X11")
+		fmt.Fprintln(w)
+	}
+	fmt.Fprintln(w, "\tRET")
+}
-- cgit v1.2.1
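
A note for readers following the generator change: the small standalone program below is not part of the CL; the helper name copySketch and the two-iteration count are illustrative only. It mirrors the shape of copyRISCV64 above and prints the kind of unrolled duffcopy body that mkduff.go writes into duff_riscv64.s. The compiler enters that unrolled body at a computed offset so that only as many iterations as needed run; each iteration loads one 8-byte word through X10 into the temporary register X31, stores it through X11, and advances both pointers.

// copysketch.go: a minimal sketch of a copyRISCV64-style generator.
// Not part of the patch; it only illustrates the output format.
package main

import (
	"fmt"
	"io"
	"os"
)

// copySketch writes an unrolled duffcopy-like body to w.
// The real generator in mkduff.go unrolls 128 iterations;
// the count here is a parameter so the output stays short.
func copySketch(w io.Writer, iterations int) {
	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
	for i := 0; i < iterations; i++ {
		fmt.Fprintln(w, "\tMOV\t(X10), X31") // load an 8-byte word from the source pointer
		fmt.Fprintln(w, "\tADD\t$8, X10")    // advance the source pointer
		fmt.Fprintln(w, "\tMOV\tX31, (X11)") // store the word at the destination pointer
		fmt.Fprintln(w, "\tADD\t$8, X11")    // advance the destination pointer
		fmt.Fprintln(w)
	}
	fmt.Fprintln(w, "\tRET")
}

func main() {
	// Print a two-iteration body to stdout for inspection.
	copySketch(os.Stdout, 2)
}

Swapping in the zeroRISCV64 pattern (MOV ZERO, (X10) followed by ADD $8, X10) yields the corresponding duffzero body.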