diff options
Diffstat (limited to 'pkg/cgroups')
-rw-r--r-- | pkg/cgroups/apply_nosystemd.go | 15 | ||||
-rw-r--r-- | pkg/cgroups/apply_raw.go | 216 | ||||
-rw-r--r-- | pkg/cgroups/apply_systemd.go | 158 | ||||
-rw-r--r-- | pkg/cgroups/cgroups.go | 179 |
4 files changed, 405 insertions, 163 deletions
diff --git a/pkg/cgroups/apply_nosystemd.go b/pkg/cgroups/apply_nosystemd.go new file mode 100644 index 0000000000..f94d475907 --- /dev/null +++ b/pkg/cgroups/apply_nosystemd.go @@ -0,0 +1,15 @@ +// +build !linux + +package cgroups + +import ( + "fmt" +) + +func useSystemd() bool { + return false +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + return nil, fmt.Errorf("Systemd not supported") +} diff --git a/pkg/cgroups/apply_raw.go b/pkg/cgroups/apply_raw.go new file mode 100644 index 0000000000..220f08f1dc --- /dev/null +++ b/pkg/cgroups/apply_raw.go @@ -0,0 +1,216 @@ +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + "strconv" +) + +type rawCgroup struct { + root string + cgroup string +} + +func rawApply(c *Cgroup, pid int) (ActiveCgroup, error) { + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + + cgroupRoot, err := FindCgroupMountpoint("cpu") + if err != nil { + return nil, err + } + cgroupRoot = filepath.Dir(cgroupRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return nil, fmt.Errorf("cgroups fs not found") + } + + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + + raw := &rawCgroup{ + root: cgroupRoot, + cgroup: cgroup, + } + + if err := raw.setupDevices(c, pid); err != nil { + return nil, err + } + if err := raw.setupMemory(c, pid); err != nil { + return nil, err + } + if err := raw.setupCpu(c, pid); err != nil { + return nil, err + } + if err := raw.setupCpuset(c, pid); err != nil { + return nil, err + } + return raw, nil +} + +func (raw *rawCgroup) path(subsystem string) (string, error) { + initPath, err := GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(raw.root, subsystem, initPath, raw.cgroup), nil +} + +func (raw *rawCgroup) join(subsystem string, pid int) (string, error) { + path, err := raw.path(subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil { + return "", err + } + return path, nil +} + +func (raw *rawCgroup) setupDevices(c *Cgroup, pid int) (err error) { + if !c.DeviceAccess { + dir, err := raw.join("devices", pid) + if err != nil { + return err + } + + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // allow mknod for any device + "c *:* m", + "b *:* m", + + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + return nil +} + +func (raw *rawCgroup) setupMemory(c *Cgroup, pid int) (err error) { + if c.Memory != 0 || c.MemorySwap != 0 { + dir, err := raw.join("memory", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + } + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { + return err + } + } + } + return nil +} + +func (raw *rawCgroup) setupCpu(c *Cgroup, pid int) (err error) { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := raw.join("cpu", pid) + if err != nil { + return err + } + if c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { + return err + } + } + return nil +} + +func (raw *rawCgroup) setupCpuset(c *Cgroup, pid int) (err error) { + if c.CpusetCpus != "" { + dir, err := raw.join("cpuset", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "cpuset.cpus", c.CpusetCpus); err != nil { + return err + } + } + return nil +} + +func (raw *rawCgroup) Cleanup() error { + get := func(subsystem string) string { + path, _ := raw.path(subsystem) + return path + } + + for _, path := range []string{ + get("memory"), + get("devices"), + get("cpu"), + get("cpuset"), + } { + if path != "" { + os.RemoveAll(path) + } + } + return nil +} diff --git a/pkg/cgroups/apply_systemd.go b/pkg/cgroups/apply_systemd.go new file mode 100644 index 0000000000..c689d5753e --- /dev/null +++ b/pkg/cgroups/apply_systemd.go @@ -0,0 +1,158 @@ +// +build linux + +package cgroups + +import ( + "fmt" + systemd1 "github.com/coreos/go-systemd/dbus" + "github.com/dotcloud/docker/pkg/systemd" + "github.com/godbus/dbus" + "path/filepath" + "strings" + "sync" +) + +type systemdCgroup struct { +} + +var ( + connLock sync.Mutex + theConn *systemd1.Conn + hasStartTransientUnit bool +) + +func useSystemd() bool { + if !systemd.SdBooted() { + return false + } + + connLock.Lock() + defer connLock.Unlock() + + if theConn == nil { + var err error + theConn, err = systemd1.New() + if err != nil { + return false + } + + // Assume we have StartTransientUnit + hasStartTransientUnit = true + + // But if we get UnknownMethod error we don't + if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { + hasStartTransientUnit = false + } + } + } + } + + return hasStartTransientUnit +} + +type DeviceAllow struct { + Node string + Permissions string +} + +func getIfaceForUnit(unitName string) string { + if strings.HasSuffix(unitName, ".scope") { + return "Scope" + } + if strings.HasSuffix(unitName, ".service") { + return "Service" + } + return "Unit" +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + unitName := c.Parent + "-" + c.Name + ".scope" + slice := "system.slice" + + var properties []systemd1.Property + + for _, v := range c.UnitProperties { + switch v[0] { + case "Slice": + slice = v[1] + default: + return nil, fmt.Errorf("Unknown unit propery %s", v[0]) + } + } + + properties = append(properties, + systemd1.Property{"Slice", dbus.MakeVariant(slice)}, + systemd1.Property{"Description", dbus.MakeVariant("docker container " + c.Name)}, + systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})}) + + if !c.DeviceAccess { + properties = append(properties, + systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")}, + systemd1.Property{"DeviceAllow", dbus.MakeVariant([]DeviceAllow{ + {"/dev/null", "rwm"}, + {"/dev/zero", "rwm"}, + {"/dev/full", "rwm"}, + {"/dev/random", "rwm"}, + {"/dev/urandom", "rwm"}, + {"/dev/tty", "rwm"}, + {"/dev/console", "rwm"}, + {"/dev/tty0", "rwm"}, + {"/dev/tty1", "rwm"}, + {"/dev/pts/ptmx", "rwm"}, + // There is no way to add /dev/pts/* here atm, so we hack this manually below + // /dev/pts/* (how to add this?) + // Same with tuntap, which doesn't exist as a node most of the time + })}) + } + + if c.Memory != 0 { + properties = append(properties, + systemd1.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))}) + } + // TODO: MemorySwap not available in systemd + + if c.CpuShares != 0 { + properties = append(properties, + systemd1.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))}) + } + + if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil { + return nil, err + } + + // To work around the lack of /dev/pts/* support above we need to manually add these + // so, ask systemd for the cgroup used + props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName)) + if err != nil { + return nil, err + } + + cgroup := props["ControlGroup"].(string) + + if !c.DeviceAccess { + mountpoint, err := FindCgroupMountpoint("devices") + if err != nil { + return nil, err + } + + path := filepath.Join(mountpoint, cgroup) + + // /dev/pts/* + if err := writeFile(path, "devices.allow", "c 136:* rwm"); err != nil { + return nil, err + } + // tuntap + if err := writeFile(path, "devices.allow", "c 10:200 rwm"); err != nil { + return nil, err + } + } + + return &systemdCgroup{}, nil +} + +func (c *systemdCgroup) Cleanup() error { + // systemd cleans up, we don't need to do anything + return nil +} diff --git a/pkg/cgroups/cgroups.go b/pkg/cgroups/cgroups.go index b40e1a31fa..5fe10346df 100644 --- a/pkg/cgroups/cgroups.go +++ b/pkg/cgroups/cgroups.go @@ -8,7 +8,6 @@ import ( "io/ioutil" "os" "path/filepath" - "strconv" "strings" ) @@ -16,10 +15,17 @@ type Cgroup struct { Name string `json:"name,omitempty"` Parent string `json:"parent,omitempty"` - DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice - Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) - MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap - CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) + DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) + CpusetCpus string `json:"cpuset_cpus,omitempty"` // CPU to use + + UnitProperties [][2]string `json:"unit_properties,omitempty"` // systemd unit properties +} + +type ActiveCgroup interface { + Cleanup() error } // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt @@ -62,48 +68,6 @@ func GetInitCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } -func (c *Cgroup) Path(root, subsystem string) (string, error) { - cgroup := c.Name - if c.Parent != "" { - cgroup = filepath.Join(c.Parent, cgroup) - } - initPath, err := GetInitCgroupDir(subsystem) - if err != nil { - return "", err - } - return filepath.Join(root, subsystem, initPath, cgroup), nil -} - -func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) { - path, err := c.Path(root, subsystem) - if err != nil { - return "", err - } - if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { - return "", err - } - if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { - return "", err - } - return path, nil -} - -func (c *Cgroup) Cleanup(root string) error { - get := func(subsystem string) string { - path, _ := c.Path(root, subsystem) - return path - } - - for _, path := range []string{ - get("memory"), - get("devices"), - get("cpu"), - } { - os.RemoveAll(path) - } - return nil -} - func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) for s.Scan() { @@ -125,126 +89,15 @@ func writeFile(dir, file, data string) error { return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } -func (c *Cgroup) Apply(pid int) error { +func (c *Cgroup) Apply(pid int) (ActiveCgroup, error) { // We have two implementation of cgroups support, one is based on // systemd and the dbus api, and one is based on raw cgroup fs operations // following the pre-single-writer model docs at: // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ - // - // we can pick any subsystem to find the root - cgroupRoot, err := FindCgroupMountpoint("cpu") - if err != nil { - return err - } - cgroupRoot = filepath.Dir(cgroupRoot) - - if _, err := os.Stat(cgroupRoot); err != nil { - return fmt.Errorf("cgroups fs not found") - } - if err := c.setupDevices(cgroupRoot, pid); err != nil { - return err - } - if err := c.setupMemory(cgroupRoot, pid); err != nil { - return err - } - if err := c.setupCpu(cgroupRoot, pid); err != nil { - return err - } - return nil -} -func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) { - if !c.DeviceAccess { - dir, err := c.Join(cgroupRoot, "devices", pid) - if err != nil { - return err - } - - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if err := writeFile(dir, "devices.deny", "a"); err != nil { - return err - } - - allow := []string{ - // /dev/null, zero, full - "c 1:3 rwm", - "c 1:5 rwm", - "c 1:7 rwm", - - // consoles - "c 5:1 rwm", - "c 5:0 rwm", - "c 4:0 rwm", - "c 4:1 rwm", - - // /dev/urandom,/dev/random - "c 1:9 rwm", - "c 1:8 rwm", - - // /dev/pts/ - pts namespaces are "coming soon" - "c 136:* rwm", - "c 5:2 rwm", - - // tuntap - "c 10:200 rwm", - } - - for _, val := range allow { - if err := writeFile(dir, "devices.allow", val); err != nil { - return err - } - } - } - return nil -} - -func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { - if c.Memory != 0 || c.MemorySwap != 0 { - dir, err := c.Join(cgroupRoot, "memory", pid) - if err != nil { - return err - } - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if c.Memory != 0 { - if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { - return err - } - if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { - return err - } - } - // By default, MemorySwap is set to twice the size of RAM. - // If you want to omit MemorySwap, set it to `-1'. - if c.MemorySwap != -1 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { - return err - } - } - } - return nil -} - -func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) { - // We always want to join the cpu group, to allow fair cpu scheduling - // on a container basis - dir, err := c.Join(cgroupRoot, "cpu", pid) - if err != nil { - return err - } - if c.CpuShares != 0 { - if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { - return err - } + if useSystemd() { + return systemdApply(c, pid) + } else { + return rawApply(c, pid) } - return nil } |