summaryrefslogtreecommitdiff
path: root/pkg/cgroups
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/cgroups')
-rw-r--r--pkg/cgroups/apply_nosystemd.go15
-rw-r--r--pkg/cgroups/apply_raw.go216
-rw-r--r--pkg/cgroups/apply_systemd.go158
-rw-r--r--pkg/cgroups/cgroups.go179
4 files changed, 405 insertions, 163 deletions
diff --git a/pkg/cgroups/apply_nosystemd.go b/pkg/cgroups/apply_nosystemd.go
new file mode 100644
index 0000000000..f94d475907
--- /dev/null
+++ b/pkg/cgroups/apply_nosystemd.go
@@ -0,0 +1,15 @@
+// +build !linux
+
+package cgroups
+
+import (
+ "fmt"
+)
+
+// useSystemd reports whether the systemd-based cgroup backend should be used.
+// This file is built only when the target is not linux (see the build
+// constraint at the top of the file), where the answer is always false.
+func useSystemd() bool {
+	return false
+}
+
+func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) {
+ return nil, fmt.Errorf("Systemd not supported")
+}
diff --git a/pkg/cgroups/apply_raw.go b/pkg/cgroups/apply_raw.go
new file mode 100644
index 0000000000..220f08f1dc
--- /dev/null
+++ b/pkg/cgroups/apply_raw.go
@@ -0,0 +1,216 @@
+package cgroups
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "strconv"
+)
+
+// rawCgroup is the ActiveCgroup returned by rawApply. It remembers where the
+// cgroup lives so that the per-subsystem directories can be located again.
+type rawCgroup struct {
+	// root is the directory that contains the per-subsystem hierarchies
+	// (rawApply sets it to the parent of the "cpu" mountpoint).
+	root string
+	// cgroup is the cgroup name, prefixed with the parent name when one
+	// is configured.
+	cgroup string
+}
+
+// rawApply puts pid into the cgroup described by c by operating on the cgroup
+// filesystem directly. It is one of two cgroup implementations; the other one
+// talks to systemd over dbus. This one follows the pre-single-writer model
+// documented at:
+// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
+//
+// The devices, memory, cpu and cpuset subsystems are configured in that order.
+// The first failing step aborts the call and its error is returned. On success
+// the returned ActiveCgroup is the *rawCgroup used for the setup.
+func rawApply(c *Cgroup, pid int) (ActiveCgroup, error) {
+	// Any subsystem can be used to find the root; the parent directory of
+	// the "cpu" mountpoint is taken as the root of all hierarchies.
+	cpuMount, err := FindCgroupMountpoint("cpu")
+	if err != nil {
+		return nil, err
+	}
+	root := filepath.Dir(cpuMount)
+	if _, statErr := os.Stat(root); statErr != nil {
+		return nil, fmt.Errorf("cgroups fs not found")
+	}
+
+	name := c.Name
+	if c.Parent != "" {
+		name = filepath.Join(c.Parent, name)
+	}
+	raw := &rawCgroup{root: root, cgroup: name}
+
+	steps := []func(*Cgroup, int) error{
+		raw.setupDevices,
+		raw.setupMemory,
+		raw.setupCpu,
+		raw.setupCpuset,
+	}
+	for _, step := range steps {
+		if stepErr := step(c, pid); stepErr != nil {
+			return nil, stepErr
+		}
+	}
+	return raw, nil
+}
+
+// path returns the directory of this cgroup inside the given subsystem:
+// <root>/<subsystem>/<init cgroup dir>/<cgroup>. The init cgroup directory
+// comes from GetInitCgroupDir; its error, if any, is returned unchanged
+// together with an empty path.
+func (raw *rawCgroup) path(subsystem string) (string, error) {
+	initDir, err := GetInitCgroupDir(subsystem)
+	if err == nil {
+		return filepath.Join(raw.root, subsystem, initDir, raw.cgroup), nil
+	}
+	return "", err
+}
+
+// join makes sure the cgroup directory for subsystem exists (mode 0755) and
+// writes pid into its "cgroup.procs" file. It returns the directory on
+// success and an empty string plus the error otherwise. An "already exists"
+// error from directory creation is not treated as a failure.
+func (raw *rawCgroup) join(subsystem string, pid int) (string, error) {
+	dir, err := raw.path(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	mkErr := os.MkdirAll(dir, 0755)
+	if mkErr != nil && !os.IsExist(mkErr) {
+		return "", mkErr
+	}
+
+	pidStr := strconv.Itoa(pid)
+	if err = writeFile(dir, "cgroup.procs", pidStr); err != nil {
+		return "", err
+	}
+	return dir, nil
+}
+
+func (raw *rawCgroup) setupDevices(c *Cgroup, pid int) (err error) {
+ if !c.DeviceAccess {
+ dir, err := raw.join("devices", pid)
+ if err != nil {
+ return err
+ }
+
+ defer func() {
+ if err != nil {
+ os.RemoveAll(dir)
+ }
+ }()
+
+ if err := writeFile(dir, "devices.deny", "a"); err != nil {
+ return err
+ }
+
+ allow := []string{
+ // allow mknod for any device
+ "c *:* m",
+ "b *:* m",
+
+ // /dev/null, zero, full
+ "c 1:3 rwm",
+ "c 1:5 rwm",
+ "c 1:7 rwm",
+
+ // consoles
+ "c 5:1 rwm",
+ "c 5:0 rwm",
+ "c 4:0 rwm",
+ "c 4:1 rwm",
+
+ // /dev/urandom,/dev/random
+ "c 1:9 rwm",
+ "c 1:8 rwm",
+
+ // /dev/pts/ - pts namespaces are "coming soon"
+ "c 136:* rwm",
+ "c 5:2 rwm",
+
+ // tuntap
+ "c 10:200 rwm",
+ }
+
+ for _, val := range allow {
+ if err := writeFile(dir, "devices.allow", val); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+func (raw *rawCgroup) setupMemory(c *Cgroup, pid int) (err error) {
+ if c.Memory != 0 || c.MemorySwap != 0 {
+ dir, err := raw.join("memory", pid)
+ if err != nil {
+ return err
+ }
+ defer func() {
+ if err != nil {
+ os.RemoveAll(dir)
+ }
+ }()
+
+ if c.Memory != 0 {
+ if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
+ return err
+ }
+ if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
+ return err
+ }
+ }
+ // By default, MemorySwap is set to twice the size of RAM.
+ // If you want to omit MemorySwap, set it to `-1'.
+ if c.MemorySwap != -1 {
+ if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// setupCpu joins pid to the "cpu" cgroup and, when c.CpuShares is non-zero,
+// writes that value to cpu.shares. The join is unconditional: every container
+// gets its own cpu group so that cpu time is scheduled fairly per container.
+func (raw *rawCgroup) setupCpu(c *Cgroup, pid int) (err error) {
+	dir, err := raw.join("cpu", pid)
+	if err != nil {
+		return err
+	}
+	if c.CpuShares == 0 {
+		return nil
+	}
+	shares := strconv.FormatInt(c.CpuShares, 10)
+	return writeFile(dir, "cpu.shares", shares)
+}
+
+func (raw *rawCgroup) setupCpuset(c *Cgroup, pid int) (err error) {
+ if c.CpusetCpus != "" {
+ dir, err := raw.join("cpuset", pid)
+ if err != nil {
+ return err
+ }
+ defer func() {
+ if err != nil {
+ os.RemoveAll(dir)
+ }
+ }()
+
+ if err := writeFile(dir, "cpuset.cpus", c.CpusetCpus); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Cleanup removes this cgroup's directories in the memory, devices, cpu and
+// cpuset subsystems. It is best effort: subsystems whose path cannot be
+// resolved are skipped, removal errors are ignored, and the result is
+// always nil.
+func (raw *rawCgroup) Cleanup() error {
+	subsystems := []string{"memory", "devices", "cpu", "cpuset"}
+
+	// Resolve every path first, then remove.
+	dirs := make([]string, 0, len(subsystems))
+	for _, subsystem := range subsystems {
+		dir, _ := raw.path(subsystem)
+		dirs = append(dirs, dir)
+	}
+
+	for _, dir := range dirs {
+		if dir == "" {
+			continue
+		}
+		os.RemoveAll(dir)
+	}
+	return nil
+}
diff --git a/pkg/cgroups/apply_systemd.go b/pkg/cgroups/apply_systemd.go
new file mode 100644
index 0000000000..c689d5753e
--- /dev/null
+++ b/pkg/cgroups/apply_systemd.go
@@ -0,0 +1,158 @@
+// +build linux
+
+package cgroups
+
+import (
+ "fmt"
+ systemd1 "github.com/coreos/go-systemd/dbus"
+ "github.com/dotcloud/docker/pkg/systemd"
+ "github.com/godbus/dbus"
+ "path/filepath"
+ "strings"
+ "sync"
+)
+
+// systemdCgroup is the ActiveCgroup returned by systemdApply. It carries no
+// state.
+type systemdCgroup struct {
+}
+
+// Package-level state shared by useSystemd and systemdApply.
+var (
+	// connLock is held by useSystemd while it creates theConn and probes
+	// for StartTransientUnit.
+	connLock sync.Mutex
+	// theConn is the systemd dbus connection, created on first use by
+	// useSystemd.
+	theConn *systemd1.Conn
+	// hasStartTransientUnit records the outcome of the StartTransientUnit
+	// probe done by useSystemd.
+	hasStartTransientUnit bool
+)
+
+func useSystemd() bool {
+ if !systemd.SdBooted() {
+ return false
+ }
+
+ connLock.Lock()
+ defer connLock.Unlock()
+
+ if theConn == nil {
+ var err error
+ theConn, err = systemd1.New()
+ if err != nil {
+ return false
+ }
+
+ // Assume we have StartTransientUnit
+ hasStartTransientUnit = true
+
+ // But if we get UnknownMethod error we don't
+ if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
+ hasStartTransientUnit = false
+ }
+ }
+ }
+ }
+
+ return hasStartTransientUnit
+}
+
+// DeviceAllow is one entry of the "DeviceAllow" unit property that
+// systemdApply sends to systemd.
+type DeviceAllow struct {
+	// Node is the device node path, e.g. "/dev/null".
+	Node string
+	// Permissions is the access string for the node, e.g. "rwm".
+	Permissions string
+}
+
+// getIfaceForUnit maps a unit name to the interface name passed to
+// GetUnitTypeProperties: "Scope" for names ending in ".scope", "Service" for
+// names ending in ".service", and "Unit" for everything else.
+func getIfaceForUnit(unitName string) string {
+	switch {
+	case strings.HasSuffix(unitName, ".scope"):
+		return "Scope"
+	case strings.HasSuffix(unitName, ".service"):
+		return "Service"
+	}
+	return "Unit"
+}
+
+// systemdApply places pid into a transient systemd scope unit configured
+// from c and returns a handle for it.
+//
+// The unit is named "<Parent>-<Name>.scope" and is put into "system.slice"
+// unless c.UnitProperties contains a "Slice" entry. Any other entry in
+// c.UnitProperties is rejected with an error. Device restrictions (unless
+// c.DeviceAccess is set), the memory limit and the cpu shares are passed as
+// unit properties. There is no systemd property for MemorySwap.
+//
+// After the unit has been started its control group is looked up so that the
+// device rules that cannot be expressed as unit properties (/dev/pts/* and
+// tuntap) can be written to the devices cgroup directly.
+//
+// The function uses the package-level connection theConn, which useSystemd
+// sets up.
+func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) {
+	unitName := c.Parent + "-" + c.Name + ".scope"
+	slice := "system.slice"
+
+	var properties []systemd1.Property
+
+	for _, v := range c.UnitProperties {
+		switch v[0] {
+		case "Slice":
+			slice = v[1]
+		default:
+			return nil, fmt.Errorf("Unknown unit property %s", v[0])
+		}
+	}
+
+	properties = append(properties,
+		systemd1.Property{"Slice", dbus.MakeVariant(slice)},
+		systemd1.Property{"Description", dbus.MakeVariant("docker container " + c.Name)},
+		systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})})
+
+	if !c.DeviceAccess {
+		properties = append(properties,
+			systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")},
+			systemd1.Property{"DeviceAllow", dbus.MakeVariant([]DeviceAllow{
+				{"/dev/null", "rwm"},
+				{"/dev/zero", "rwm"},
+				{"/dev/full", "rwm"},
+				{"/dev/random", "rwm"},
+				{"/dev/urandom", "rwm"},
+				{"/dev/tty", "rwm"},
+				{"/dev/console", "rwm"},
+				{"/dev/tty0", "rwm"},
+				{"/dev/tty1", "rwm"},
+				{"/dev/pts/ptmx", "rwm"},
+				// There is no way to add /dev/pts/* here atm, so we hack this manually below
+				// /dev/pts/* (how to add this?)
+				// Same with tuntap, which doesn't exist as a node most of the time
+			})})
+	}
+
+	if c.Memory != 0 {
+		properties = append(properties,
+			systemd1.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))})
+	}
+	// TODO: MemorySwap not available in systemd
+
+	if c.CpuShares != 0 {
+		properties = append(properties,
+			systemd1.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))})
+	}
+
+	if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil {
+		return nil, err
+	}
+
+	// To work around the lack of /dev/pts/* support above we need to manually add these
+	// so, ask systemd for the cgroup used
+	props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName))
+	if err != nil {
+		return nil, err
+	}
+
+	// Checked assertion: a missing or non-string property must surface as
+	// an error instead of a panic.
+	cgroup, ok := props["ControlGroup"].(string)
+	if !ok {
+		return nil, fmt.Errorf("systemd unit %s has no ControlGroup string property", unitName)
+	}
+
+	if !c.DeviceAccess {
+		mountpoint, err := FindCgroupMountpoint("devices")
+		if err != nil {
+			return nil, err
+		}
+
+		path := filepath.Join(mountpoint, cgroup)
+
+		// /dev/pts/*
+		if err := writeFile(path, "devices.allow", "c 136:* rwm"); err != nil {
+			return nil, err
+		}
+		// tuntap
+		if err := writeFile(path, "devices.allow", "c 10:200 rwm"); err != nil {
+			return nil, err
+		}
+	}
+
+	return &systemdCgroup{}, nil
+}
+
+func (c *systemdCgroup) Cleanup() error {
+ // systemd cleans up, we don't need to do anything
+ return nil
+}
diff --git a/pkg/cgroups/cgroups.go b/pkg/cgroups/cgroups.go
index b40e1a31fa..5fe10346df 100644
--- a/pkg/cgroups/cgroups.go
+++ b/pkg/cgroups/cgroups.go
@@ -8,7 +8,6 @@ import (
"io/ioutil"
"os"
"path/filepath"
- "strconv"
"strings"
)
@@ -16,10 +15,17 @@ type Cgroup struct {
Name string `json:"name,omitempty"`
Parent string `json:"parent,omitempty"`
- DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice
- Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes)
- MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap
- CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers)
+ DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice
+ Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes)
+ MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap
+ CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers)
+ CpusetCpus string `json:"cpuset_cpus,omitempty"` // CPU to use
+
+ UnitProperties [][2]string `json:"unit_properties,omitempty"` // systemd unit properties
+}
+
+type ActiveCgroup interface {
+ Cleanup() error
}
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
@@ -62,48 +68,6 @@ func GetInitCgroupDir(subsystem string) (string, error) {
return parseCgroupFile(subsystem, f)
}
-func (c *Cgroup) Path(root, subsystem string) (string, error) {
- cgroup := c.Name
- if c.Parent != "" {
- cgroup = filepath.Join(c.Parent, cgroup)
- }
- initPath, err := GetInitCgroupDir(subsystem)
- if err != nil {
- return "", err
- }
- return filepath.Join(root, subsystem, initPath, cgroup), nil
-}
-
-func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) {
- path, err := c.Path(root, subsystem)
- if err != nil {
- return "", err
- }
- if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
- return "", err
- }
- if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil {
- return "", err
- }
- return path, nil
-}
-
-func (c *Cgroup) Cleanup(root string) error {
- get := func(subsystem string) string {
- path, _ := c.Path(root, subsystem)
- return path
- }
-
- for _, path := range []string{
- get("memory"),
- get("devices"),
- get("cpu"),
- } {
- os.RemoveAll(path)
- }
- return nil
-}
-
func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
s := bufio.NewScanner(r)
for s.Scan() {
@@ -125,126 +89,15 @@ func writeFile(dir, file, data string) error {
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
-func (c *Cgroup) Apply(pid int) error {
+func (c *Cgroup) Apply(pid int) (ActiveCgroup, error) {
// We have two implementation of cgroups support, one is based on
// systemd and the dbus api, and one is based on raw cgroup fs operations
// following the pre-single-writer model docs at:
// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
- //
- // we can pick any subsystem to find the root
- cgroupRoot, err := FindCgroupMountpoint("cpu")
- if err != nil {
- return err
- }
- cgroupRoot = filepath.Dir(cgroupRoot)
-
- if _, err := os.Stat(cgroupRoot); err != nil {
- return fmt.Errorf("cgroups fs not found")
- }
- if err := c.setupDevices(cgroupRoot, pid); err != nil {
- return err
- }
- if err := c.setupMemory(cgroupRoot, pid); err != nil {
- return err
- }
- if err := c.setupCpu(cgroupRoot, pid); err != nil {
- return err
- }
- return nil
-}
-func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) {
- if !c.DeviceAccess {
- dir, err := c.Join(cgroupRoot, "devices", pid)
- if err != nil {
- return err
- }
-
- defer func() {
- if err != nil {
- os.RemoveAll(dir)
- }
- }()
-
- if err := writeFile(dir, "devices.deny", "a"); err != nil {
- return err
- }
-
- allow := []string{
- // /dev/null, zero, full
- "c 1:3 rwm",
- "c 1:5 rwm",
- "c 1:7 rwm",
-
- // consoles
- "c 5:1 rwm",
- "c 5:0 rwm",
- "c 4:0 rwm",
- "c 4:1 rwm",
-
- // /dev/urandom,/dev/random
- "c 1:9 rwm",
- "c 1:8 rwm",
-
- // /dev/pts/ - pts namespaces are "coming soon"
- "c 136:* rwm",
- "c 5:2 rwm",
-
- // tuntap
- "c 10:200 rwm",
- }
-
- for _, val := range allow {
- if err := writeFile(dir, "devices.allow", val); err != nil {
- return err
- }
- }
- }
- return nil
-}
-
-func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) {
- if c.Memory != 0 || c.MemorySwap != 0 {
- dir, err := c.Join(cgroupRoot, "memory", pid)
- if err != nil {
- return err
- }
- defer func() {
- if err != nil {
- os.RemoveAll(dir)
- }
- }()
-
- if c.Memory != 0 {
- if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
- return err
- }
- if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
- return err
- }
- }
- // By default, MemorySwap is set to twice the size of RAM.
- // If you want to omit MemorySwap, set it to `-1'.
- if c.MemorySwap != -1 {
- if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil {
- return err
- }
- }
- }
- return nil
-}
-
-func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) {
- // We always want to join the cpu group, to allow fair cpu scheduling
- // on a container basis
- dir, err := c.Join(cgroupRoot, "cpu", pid)
- if err != nil {
- return err
- }
- if c.CpuShares != 0 {
- if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil {
- return err
- }
+ if useSystemd() {
+ return systemdApply(c, pid)
+ } else {
+ return rawApply(c, pid)
}
- return nil
}