diff options
Diffstat (limited to 'runtime/execdriver')
20 files changed, 2192 insertions, 0 deletions
diff --git a/runtime/execdriver/MAINTAINERS b/runtime/execdriver/MAINTAINERS new file mode 100644 index 0000000000..1cb551364d --- /dev/null +++ b/runtime/execdriver/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby <michael@crosbymichael.com> (@crosbymichael) +Guillaume J. Charmes <guillaume@docker.com> (@creack) diff --git a/runtime/execdriver/driver.go b/runtime/execdriver/driver.go new file mode 100644 index 0000000000..27a575cb3a --- /dev/null +++ b/runtime/execdriver/driver.go @@ -0,0 +1,144 @@ +package execdriver + +import ( + "errors" + "io" + "os" + "os/exec" +) + +// Context is a generic key value pair that allows +// arbatrary data to be sent +type Context map[string]string + +var ( + ErrNotRunning = errors.New("Process could not be started") + ErrWaitTimeoutReached = errors.New("Wait timeout reached") + ErrDriverAlreadyRegistered = errors.New("A driver already registered this docker init function") + ErrDriverNotFound = errors.New("The requested docker init has not been found") +) + +var dockerInitFcts map[string]InitFunc + +type ( + StartCallback func(*Command) + InitFunc func(i *InitArgs) error +) + +func RegisterInitFunc(name string, fct InitFunc) error { + if dockerInitFcts == nil { + dockerInitFcts = make(map[string]InitFunc) + } + if _, ok := dockerInitFcts[name]; ok { + return ErrDriverAlreadyRegistered + } + dockerInitFcts[name] = fct + return nil +} + +func GetInitFunc(name string) (InitFunc, error) { + fct, ok := dockerInitFcts[name] + if !ok { + return nil, ErrDriverNotFound + } + return fct, nil +} + +// Args provided to the init function for a driver +type InitArgs struct { + User string + Gateway string + Ip string + WorkDir string + Privileged bool + Env []string + Args []string + Mtu int + Driver string + Console string + Pipe int + Root string +} + +// Driver specific information based on +// processes registered with the driver +type Info interface { + IsRunning() bool +} + +// Terminal in an interface for drivers to implement +// if they want to support Close and Resize calls from +// the core +type Terminal interface { + io.Closer + Resize(height, width int) error +} + +type TtyTerminal interface { + Master() *os.File +} + +type Driver interface { + Run(c *Command, pipes *Pipes, startCallback StartCallback) (int, error) // Run executes the process and blocks until the process exits and returns the exit code + Kill(c *Command, sig int) error + Name() string // Driver name + Info(id string) Info // "temporary" hack (until we move state from core to plugins) + GetPidsForContainer(id string) ([]int, error) // Returns a list of pids for the given container. + Terminate(c *Command) error // kill it with fire +} + +// Network settings of the container +type Network struct { + Interface *NetworkInterface `json:"interface"` // if interface is nil then networking is disabled + Mtu int `json:"mtu"` +} + +type NetworkInterface struct { + Gateway string `json:"gateway"` + IPAddress string `json:"ip"` + Bridge string `json:"bridge"` + IPPrefixLen int `json:"ip_prefix_len"` +} + +type Resources struct { + Memory int64 `json:"memory"` + MemorySwap int64 `json:"memory_swap"` + CpuShares int64 `json:"cpu_shares"` +} + +type Mount struct { + Source string `json:"source"` + Destination string `json:"destination"` + Writable bool `json:"writable"` + Private bool `json:"private"` +} + +// Process wrapps an os/exec.Cmd to add more metadata +type Command struct { + exec.Cmd `json:"-"` + + ID string `json:"id"` + Privileged bool `json:"privileged"` + User string `json:"user"` + Rootfs string `json:"rootfs"` // root fs of the container + InitPath string `json:"initpath"` // dockerinit + Entrypoint string `json:"entrypoint"` + Arguments []string `json:"arguments"` + WorkingDir string `json:"working_dir"` + ConfigPath string `json:"config_path"` // this should be able to be removed when the lxc template is moved into the driver + Tty bool `json:"tty"` + Network *Network `json:"network"` + Config map[string][]string `json:"config"` // generic values that specific drivers can consume + Resources *Resources `json:"resources"` + Mounts []Mount `json:"mounts"` + + Terminal Terminal `json:"-"` // standard or tty terminal + Console string `json:"-"` // dev/console path + ContainerPid int `json:"container_pid"` // the pid for the process inside a container +} + +// Return the pid of the process +// If the process is nil -1 will be returned +func (c *Command) Pid() int { + return c.ContainerPid +} diff --git a/runtime/execdriver/execdrivers/execdrivers.go b/runtime/execdriver/execdrivers/execdrivers.go new file mode 100644 index 0000000000..9e277c86df --- /dev/null +++ b/runtime/execdriver/execdrivers/execdrivers.go @@ -0,0 +1,23 @@ +package execdrivers + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/sysinfo" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/runtime/execdriver/lxc" + "github.com/dotcloud/docker/runtime/execdriver/native" + "path" +) + +func NewDriver(name, root, initPath string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, error) { + switch name { + case "lxc": + // we want to five the lxc driver the full docker root because it needs + // to access and write config and template files in /var/lib/docker/containers/* + // to be backwards compatible + return lxc.NewDriver(root, sysInfo.AppArmor) + case "native": + return native.NewDriver(path.Join(root, "execdriver", "native"), initPath) + } + return nil, fmt.Errorf("unknown exec driver %s", name) +} diff --git a/runtime/execdriver/lxc/driver.go b/runtime/execdriver/lxc/driver.go new file mode 100644 index 0000000000..ef16dcc380 --- /dev/null +++ b/runtime/execdriver/lxc/driver.go @@ -0,0 +1,418 @@ +package lxc + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/utils" + "io/ioutil" + "log" + "os" + "os/exec" + "path" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" +) + +const DriverName = "lxc" + +func init() { + execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error { + if err := setupEnv(args); err != nil { + return err + } + + if err := setupHostname(args); err != nil { + return err + } + + if err := setupNetworking(args); err != nil { + return err + } + + if err := setupCapabilities(args); err != nil { + return err + } + + if err := setupWorkingDirectory(args); err != nil { + return err + } + + if err := changeUser(args); err != nil { + return err + } + + path, err := exec.LookPath(args.Args[0]) + if err != nil { + log.Printf("Unable to locate %v", args.Args[0]) + os.Exit(127) + } + if err := syscall.Exec(path, args.Args, os.Environ()); err != nil { + return fmt.Errorf("dockerinit unable to execute %s - %s", path, err) + } + panic("Unreachable") + }) +} + +type driver struct { + root string // root path for the driver to use + apparmor bool + sharedRoot bool +} + +func NewDriver(root string, apparmor bool) (*driver, error) { + // setup unconfined symlink + if err := linkLxcStart(root); err != nil { + return nil, err + } + return &driver{ + apparmor: apparmor, + root: root, + sharedRoot: rootIsShared(), + }, nil +} + +func (d *driver) Name() string { + version := d.version() + return fmt.Sprintf("%s-%s", DriverName, version) +} + +func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { + if err := execdriver.SetTerminal(c, pipes); err != nil { + return -1, err + } + configPath, err := d.generateLXCConfig(c) + if err != nil { + return -1, err + } + params := []string{ + "lxc-start", + "-n", c.ID, + "-f", configPath, + "--", + c.InitPath, + "-driver", + DriverName, + } + + if c.Network.Interface != nil { + params = append(params, + "-g", c.Network.Interface.Gateway, + "-i", fmt.Sprintf("%s/%d", c.Network.Interface.IPAddress, c.Network.Interface.IPPrefixLen), + ) + } + params = append(params, + "-mtu", strconv.Itoa(c.Network.Mtu), + ) + + if c.User != "" { + params = append(params, "-u", c.User) + } + + if c.Privileged { + if d.apparmor { + params[0] = path.Join(d.root, "lxc-start-unconfined") + + } + params = append(params, "-privileged") + } + + if c.WorkingDir != "" { + params = append(params, "-w", c.WorkingDir) + } + + params = append(params, "--", c.Entrypoint) + params = append(params, c.Arguments...) + + if d.sharedRoot { + // lxc-start really needs / to be non-shared, or all kinds of stuff break + // when lxc-start unmount things and those unmounts propagate to the main + // mount namespace. + // What we really want is to clone into a new namespace and then + // mount / MS_REC|MS_SLAVE, but since we can't really clone or fork + // without exec in go we have to do this horrible shell hack... + shellString := + "mount --make-rslave /; exec " + + utils.ShellQuoteArguments(params) + + params = []string{ + "unshare", "-m", "--", "/bin/sh", "-c", shellString, + } + } + + var ( + name = params[0] + arg = params[1:] + ) + aname, err := exec.LookPath(name) + if err != nil { + aname = name + } + c.Path = aname + c.Args = append([]string{name}, arg...) + + if err := c.Start(); err != nil { + return -1, err + } + + var ( + waitErr error + waitLock = make(chan struct{}) + ) + go func() { + if err := c.Wait(); err != nil { + if _, ok := err.(*exec.ExitError); !ok { // Do not propagate the error if it's simply a status code != 0 + waitErr = err + } + } + close(waitLock) + }() + + // Poll lxc for RUNNING status + pid, err := d.waitForStart(c, waitLock) + if err != nil { + if c.Process != nil { + c.Process.Kill() + } + return -1, err + } + c.ContainerPid = pid + + if startCallback != nil { + startCallback(c) + } + + <-waitLock + + return getExitCode(c), waitErr +} + +/// Return the exit code of the process +// if the process has not exited -1 will be returned +func getExitCode(c *execdriver.Command) int { + if c.ProcessState == nil { + return -1 + } + return c.ProcessState.Sys().(syscall.WaitStatus).ExitStatus() +} + +func (d *driver) Kill(c *execdriver.Command, sig int) error { + return KillLxc(c.ID, sig) +} + +func (d *driver) Terminate(c *execdriver.Command) error { + return KillLxc(c.ID, 9) +} + +func (d *driver) version() string { + var ( + version string + output []byte + err error + ) + if _, errPath := exec.LookPath("lxc-version"); errPath == nil { + output, err = exec.Command("lxc-version").CombinedOutput() + } else { + output, err = exec.Command("lxc-start", "--version").CombinedOutput() + } + if err == nil { + version = strings.TrimSpace(string(output)) + if parts := strings.SplitN(version, ":", 2); len(parts) == 2 { + version = strings.TrimSpace(parts[1]) + } + } + return version +} + +func KillLxc(id string, sig int) error { + var ( + err error + output []byte + ) + _, err = exec.LookPath("lxc-kill") + if err == nil { + output, err = exec.Command("lxc-kill", "-n", id, strconv.Itoa(sig)).CombinedOutput() + } else { + output, err = exec.Command("lxc-stop", "-k", "-n", id, strconv.Itoa(sig)).CombinedOutput() + } + if err != nil { + return fmt.Errorf("Err: %s Output: %s", err, output) + } + return nil +} + +// wait for the process to start and return the pid for the process +func (d *driver) waitForStart(c *execdriver.Command, waitLock chan struct{}) (int, error) { + var ( + err error + output []byte + ) + // We wait for the container to be fully running. + // Timeout after 5 seconds. In case of broken pipe, just retry. + // Note: The container can run and finish correctly before + // the end of this loop + for now := time.Now(); time.Since(now) < 5*time.Second; { + select { + case <-waitLock: + // If the process dies while waiting for it, just return + return -1, nil + default: + } + + output, err = d.getInfo(c.ID) + if err != nil { + output, err = d.getInfo(c.ID) + if err != nil { + return -1, err + } + } + info, err := parseLxcInfo(string(output)) + if err != nil { + return -1, err + } + if info.Running { + return info.Pid, nil + } + time.Sleep(50 * time.Millisecond) + } + return -1, execdriver.ErrNotRunning +} + +func (d *driver) getInfo(id string) ([]byte, error) { + return exec.Command("lxc-info", "-n", id).CombinedOutput() +} + +type info struct { + ID string + driver *driver +} + +func (i *info) IsRunning() bool { + var running bool + + output, err := i.driver.getInfo(i.ID) + if err != nil { + utils.Errorf("Error getting info for lxc container %s: %s (%s)", i.ID, err, output) + return false + } + if strings.Contains(string(output), "RUNNING") { + running = true + } + return running +} + +func (d *driver) Info(id string) execdriver.Info { + return &info{ + ID: id, + driver: d, + } +} + +func (d *driver) GetPidsForContainer(id string) ([]int, error) { + pids := []int{} + + // cpu is chosen because it is the only non optional subsystem in cgroups + subsystem := "cpu" + cgroupRoot, err := cgroups.FindCgroupMountpoint(subsystem) + if err != nil { + return pids, err + } + + cgroupDir, err := cgroups.GetThisCgroupDir(subsystem) + if err != nil { + return pids, err + } + + filename := filepath.Join(cgroupRoot, cgroupDir, id, "tasks") + if _, err := os.Stat(filename); os.IsNotExist(err) { + // With more recent lxc versions use, cgroup will be in lxc/ + filename = filepath.Join(cgroupRoot, cgroupDir, "lxc", id, "tasks") + } + + output, err := ioutil.ReadFile(filename) + if err != nil { + return pids, err + } + for _, p := range strings.Split(string(output), "\n") { + if len(p) == 0 { + continue + } + pid, err := strconv.Atoi(p) + if err != nil { + return pids, fmt.Errorf("Invalid pid '%s': %s", p, err) + } + pids = append(pids, pid) + } + return pids, nil +} + +func linkLxcStart(root string) error { + sourcePath, err := exec.LookPath("lxc-start") + if err != nil { + return err + } + targetPath := path.Join(root, "lxc-start-unconfined") + + if _, err := os.Lstat(targetPath); err != nil && !os.IsNotExist(err) { + return err + } else if err == nil { + if err := os.Remove(targetPath); err != nil { + return err + } + } + return os.Symlink(sourcePath, targetPath) +} + +// TODO: This can be moved to the mountinfo reader in the mount pkg +func rootIsShared() bool { + if data, err := ioutil.ReadFile("/proc/self/mountinfo"); err == nil { + for _, line := range strings.Split(string(data), "\n") { + cols := strings.Split(line, " ") + if len(cols) >= 6 && cols[4] == "/" { + return strings.HasPrefix(cols[6], "shared") + } + } + } + + // No idea, probably safe to assume so + return true +} + +func (d *driver) generateLXCConfig(c *execdriver.Command) (string, error) { + var ( + process, mount string + root = path.Join(d.root, "containers", c.ID, "config.lxc") + labels = c.Config["label"] + ) + fo, err := os.Create(root) + if err != nil { + return "", err + } + defer fo.Close() + + if len(labels) > 0 { + process, mount, err = label.GenLabels(labels[0]) + if err != nil { + return "", err + } + } + + if err := LxcTemplateCompiled.Execute(fo, struct { + *execdriver.Command + AppArmor bool + ProcessLabel string + MountLabel string + }{ + Command: c, + AppArmor: d.apparmor, + ProcessLabel: process, + MountLabel: mount, + }); err != nil { + return "", err + } + return root, nil +} diff --git a/runtime/execdriver/lxc/info.go b/runtime/execdriver/lxc/info.go new file mode 100644 index 0000000000..27b4c58604 --- /dev/null +++ b/runtime/execdriver/lxc/info.go @@ -0,0 +1,50 @@ +package lxc + +import ( + "bufio" + "errors" + "strconv" + "strings" +) + +var ( + ErrCannotParse = errors.New("cannot parse raw input") +) + +type lxcInfo struct { + Running bool + Pid int +} + +func parseLxcInfo(raw string) (*lxcInfo, error) { + if raw == "" { + return nil, ErrCannotParse + } + var ( + err error + s = bufio.NewScanner(strings.NewReader(raw)) + info = &lxcInfo{} + ) + for s.Scan() { + text := s.Text() + + if s.Err() != nil { + return nil, s.Err() + } + + parts := strings.Split(text, ":") + if len(parts) < 2 { + continue + } + switch strings.ToLower(strings.TrimSpace(parts[0])) { + case "state": + info.Running = strings.TrimSpace(parts[1]) == "RUNNING" + case "pid": + info.Pid, err = strconv.Atoi(strings.TrimSpace(parts[1])) + if err != nil { + return nil, err + } + } + } + return info, nil +} diff --git a/runtime/execdriver/lxc/info_test.go b/runtime/execdriver/lxc/info_test.go new file mode 100644 index 0000000000..edafc02511 --- /dev/null +++ b/runtime/execdriver/lxc/info_test.go @@ -0,0 +1,36 @@ +package lxc + +import ( + "testing" +) + +func TestParseRunningInfo(t *testing.T) { + raw := ` + state: RUNNING + pid: 50` + + info, err := parseLxcInfo(raw) + if err != nil { + t.Fatal(err) + } + if !info.Running { + t.Fatal("info should return a running state") + } + if info.Pid != 50 { + t.Fatalf("info should have pid 50 got %d", info.Pid) + } +} + +func TestEmptyInfo(t *testing.T) { + _, err := parseLxcInfo("") + if err == nil { + t.Fatal("error should not be nil") + } +} + +func TestBadInfo(t *testing.T) { + _, err := parseLxcInfo("state") + if err != nil { + t.Fatal(err) + } +} diff --git a/runtime/execdriver/lxc/init.go b/runtime/execdriver/lxc/init.go new file mode 100644 index 0000000000..c1933a5e43 --- /dev/null +++ b/runtime/execdriver/lxc/init.go @@ -0,0 +1,175 @@ +package lxc + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/netlink" + "github.com/dotcloud/docker/pkg/user" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/syndtr/gocapability/capability" + "io/ioutil" + "net" + "os" + "strings" + "syscall" +) + +// Clear environment pollution introduced by lxc-start +func setupEnv(args *execdriver.InitArgs) error { + // Get env + var env []string + content, err := ioutil.ReadFile(".dockerenv") + if err != nil { + return fmt.Errorf("Unable to load environment variables: %v", err) + } + if err := json.Unmarshal(content, &env); err != nil { + return fmt.Errorf("Unable to unmarshal environment variables: %v", err) + } + // Propagate the plugin-specific container env variable + env = append(env, "container="+os.Getenv("container")) + + args.Env = env + + os.Clearenv() + for _, kv := range args.Env { + parts := strings.SplitN(kv, "=", 2) + if len(parts) == 1 { + parts = append(parts, "") + } + os.Setenv(parts[0], parts[1]) + } + + return nil +} + +func setupHostname(args *execdriver.InitArgs) error { + hostname := getEnv(args, "HOSTNAME") + if hostname == "" { + return nil + } + return setHostname(hostname) +} + +// Setup networking +func setupNetworking(args *execdriver.InitArgs) error { + if args.Ip != "" { + // eth0 + iface, err := net.InterfaceByName("eth0") + if err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + ip, ipNet, err := net.ParseCIDR(args.Ip) + if err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + if err := netlink.NetworkLinkAddIp(iface, ip, ipNet); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + if err := netlink.NetworkSetMTU(iface, args.Mtu); err != nil { + return fmt.Errorf("Unable to set MTU: %v", err) + } + if err := netlink.NetworkLinkUp(iface); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + + // loopback + iface, err = net.InterfaceByName("lo") + if err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + if err := netlink.NetworkLinkUp(iface); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + } + if args.Gateway != "" { + gw := net.ParseIP(args.Gateway) + if gw == nil { + return fmt.Errorf("Unable to set up networking, %s is not a valid gateway IP", args.Gateway) + } + + if err := netlink.AddDefaultGw(gw); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + } + + return nil +} + +// Setup working directory +func setupWorkingDirectory(args *execdriver.InitArgs) error { + if args.WorkDir == "" { + return nil + } + if err := syscall.Chdir(args.WorkDir); err != nil { + return fmt.Errorf("Unable to change dir to %v: %v", args.WorkDir, err) + } + return nil +} + +// Takes care of dropping privileges to the desired user +func changeUser(args *execdriver.InitArgs) error { + uid, gid, suppGids, err := user.GetUserGroupSupplementary( + args.User, + syscall.Getuid(), syscall.Getgid(), + ) + if err != nil { + return err + } + + if err := syscall.Setgroups(suppGids); err != nil { + return fmt.Errorf("Setgroups failed: %v", err) + } + if err := syscall.Setgid(gid); err != nil { + return fmt.Errorf("Setgid failed: %v", err) + } + if err := syscall.Setuid(uid); err != nil { + return fmt.Errorf("Setuid failed: %v", err) + } + + return nil +} + +func setupCapabilities(args *execdriver.InitArgs) error { + if args.Privileged { + return nil + } + + drop := []capability.Cap{ + capability.CAP_SETPCAP, + capability.CAP_SYS_MODULE, + capability.CAP_SYS_RAWIO, + capability.CAP_SYS_PACCT, + capability.CAP_SYS_ADMIN, + capability.CAP_SYS_NICE, + capability.CAP_SYS_RESOURCE, + capability.CAP_SYS_TIME, + capability.CAP_SYS_TTY_CONFIG, + capability.CAP_AUDIT_WRITE, + capability.CAP_AUDIT_CONTROL, + capability.CAP_MAC_OVERRIDE, + capability.CAP_MAC_ADMIN, + capability.CAP_NET_ADMIN, + } + + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + + c.Unset(capability.CAPS|capability.BOUNDS, drop...) + + if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil { + return err + } + return nil +} + +func getEnv(args *execdriver.InitArgs, key string) string { + for _, kv := range args.Env { + parts := strings.SplitN(kv, "=", 2) + if parts[0] == key && len(parts) == 2 { + return parts[1] + } + } + return "" +} diff --git a/runtime/execdriver/lxc/lxc_init_linux.go b/runtime/execdriver/lxc/lxc_init_linux.go new file mode 100644 index 0000000000..7288f5877b --- /dev/null +++ b/runtime/execdriver/lxc/lxc_init_linux.go @@ -0,0 +1,11 @@ +// +build amd64 + +package lxc + +import ( + "syscall" +) + +func setHostname(hostname string) error { + return syscall.Sethostname([]byte(hostname)) +} diff --git a/runtime/execdriver/lxc/lxc_init_unsupported.go b/runtime/execdriver/lxc/lxc_init_unsupported.go new file mode 100644 index 0000000000..d68cb91a1e --- /dev/null +++ b/runtime/execdriver/lxc/lxc_init_unsupported.go @@ -0,0 +1,7 @@ +// +build !linux !amd64 + +package lxc + +func setHostname(hostname string) error { + panic("Not supported on darwin") +} diff --git a/runtime/execdriver/lxc/lxc_template.go b/runtime/execdriver/lxc/lxc_template.go new file mode 100644 index 0000000000..c49753c6aa --- /dev/null +++ b/runtime/execdriver/lxc/lxc_template.go @@ -0,0 +1,176 @@ +package lxc + +import ( + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/runtime/execdriver" + "strings" + "text/template" +) + +const LxcTemplate = ` +{{if .Network.Interface}} +# network configuration +lxc.network.type = veth +lxc.network.link = {{.Network.Interface.Bridge}} +lxc.network.name = eth0 +{{else}} +# network is disabled (-n=false) +lxc.network.type = empty +lxc.network.flags = up +{{end}} +lxc.network.mtu = {{.Network.Mtu}} + +# root filesystem +{{$ROOTFS := .Rootfs}} +lxc.rootfs = {{$ROOTFS}} + +# use a dedicated pts for the container (and limit the number of pseudo terminal +# available) +lxc.pts = 1024 + +# disable the main console +lxc.console = none +{{if .ProcessLabel}} +lxc.se_context = {{ .ProcessLabel}} +{{end}} +{{$MOUNTLABEL := .MountLabel}} + +# no controlling tty at all +lxc.tty = 1 + +{{if .Privileged}} +lxc.cgroup.devices.allow = a +{{else}} +# no implicit access to devices +lxc.cgroup.devices.deny = a + +# but allow mknod for any device +lxc.cgroup.devices.allow = c *:* m +lxc.cgroup.devices.allow = b *:* m + +# /dev/null and zero +lxc.cgroup.devices.allow = c 1:3 rwm +lxc.cgroup.devices.allow = c 1:5 rwm + +# consoles +lxc.cgroup.devices.allow = c 5:1 rwm +lxc.cgroup.devices.allow = c 5:0 rwm +lxc.cgroup.devices.allow = c 4:0 rwm +lxc.cgroup.devices.allow = c 4:1 rwm + +# /dev/urandom,/dev/random +lxc.cgroup.devices.allow = c 1:9 rwm +lxc.cgroup.devices.allow = c 1:8 rwm + +# /dev/pts/ - pts namespaces are "coming soon" +lxc.cgroup.devices.allow = c 136:* rwm +lxc.cgroup.devices.allow = c 5:2 rwm + +# tuntap +lxc.cgroup.devices.allow = c 10:200 rwm + +# fuse +#lxc.cgroup.devices.allow = c 10:229 rwm + +# rtc +#lxc.cgroup.devices.allow = c 254:0 rwm +{{end}} + +# standard mount point +# Use mnt.putold as per https://bugs.launchpad.net/ubuntu/+source/lxc/+bug/986385 +lxc.pivotdir = lxc_putold + +# NOTICE: These mounts must be applied within the namespace + +# WARNING: procfs is a known attack vector and should probably be disabled +# if your userspace allows it. eg. see http://blog.zx2c4.com/749 +lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noexec 0 0 + +# WARNING: sysfs is a known attack vector and should probably be disabled +# if your userspace allows it. eg. see http://bit.ly/T9CkqJ +lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0 + +{{if .Tty}} +lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0 +{{end}} + +lxc.mount.entry = devpts {{escapeFstabSpaces $ROOTFS}}/dev/pts devpts {{formatMountLabel "newinstance,ptmxmode=0666,nosuid,noexec" $MOUNTLABEL}} 0 0 +lxc.mount.entry = shm {{escapeFstabSpaces $ROOTFS}}/dev/shm tmpfs {{formatMountLabel "size=65536k,nosuid,nodev,noexec" $MOUNTLABEL}} 0 0 + +{{range $value := .Mounts}} +{{if $value.Writable}} +lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,rw 0 0 +{{else}} +lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,ro 0 0 +{{end}} +{{end}} + +{{if .Privileged}} +{{if .AppArmor}} +lxc.aa_profile = unconfined +{{else}} +#lxc.aa_profile = unconfined +{{end}} +{{end}} + +# limits +{{if .Resources}} +{{if .Resources.Memory}} +lxc.cgroup.memory.limit_in_bytes = {{.Resources.Memory}} +lxc.cgroup.memory.soft_limit_in_bytes = {{.Resources.Memory}} +{{with $memSwap := getMemorySwap .Resources}} +lxc.cgroup.memory.memsw.limit_in_bytes = {{$memSwap}} +{{end}} +{{end}} +{{if .Resources.CpuShares}} +lxc.cgroup.cpu.shares = {{.Resources.CpuShares}} +{{end}} +{{end}} + +{{if .Config.lxc}} +{{range $value := .Config.lxc}} +lxc.{{$value}} +{{end}} +{{end}} +` + +var LxcTemplateCompiled *template.Template + +// Escape spaces in strings according to the fstab documentation, which is the +// format for "lxc.mount.entry" lines in lxc.conf. See also "man 5 fstab". +func escapeFstabSpaces(field string) string { + return strings.Replace(field, " ", "\\040", -1) +} + +func getMemorySwap(v *execdriver.Resources) int64 { + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if v.MemorySwap < 0 { + return 0 + } + return v.Memory * 2 +} + +func getLabel(c map[string][]string, name string) string { + label := c["label"] + for _, l := range label { + parts := strings.SplitN(l, "=", 2) + if strings.TrimSpace(parts[0]) == name { + return strings.TrimSpace(parts[1]) + } + } + return "" +} + +func init() { + var err error + funcMap := template.FuncMap{ + "getMemorySwap": getMemorySwap, + "escapeFstabSpaces": escapeFstabSpaces, + "formatMountLabel": label.FormatMountLabel, + } + LxcTemplateCompiled, err = template.New("lxc").Funcs(funcMap).Parse(LxcTemplate) + if err != nil { + panic(err) + } +} diff --git a/runtime/execdriver/lxc/lxc_template_unit_test.go b/runtime/execdriver/lxc/lxc_template_unit_test.go new file mode 100644 index 0000000000..7f473a0502 --- /dev/null +++ b/runtime/execdriver/lxc/lxc_template_unit_test.go @@ -0,0 +1,135 @@ +package lxc + +import ( + "bufio" + "fmt" + "github.com/dotcloud/docker/runtime/execdriver" + "io/ioutil" + "math/rand" + "os" + "path" + "strings" + "testing" + "time" +) + +func TestLXCConfig(t *testing.T) { + root, err := ioutil.TempDir("", "TestLXCConfig") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(root) + + os.MkdirAll(path.Join(root, "containers", "1"), 0777) + + // Memory is allocated randomly for testing + rand.Seed(time.Now().UTC().UnixNano()) + var ( + memMin = 33554432 + memMax = 536870912 + mem = memMin + rand.Intn(memMax-memMin) + cpuMin = 100 + cpuMax = 10000 + cpu = cpuMin + rand.Intn(cpuMax-cpuMin) + ) + + driver, err := NewDriver(root, false) + if err != nil { + t.Fatal(err) + } + command := &execdriver.Command{ + ID: "1", + Resources: &execdriver.Resources{ + Memory: int64(mem), + CpuShares: int64(cpu), + }, + Network: &execdriver.Network{ + Mtu: 1500, + Interface: nil, + }, + } + p, err := driver.generateLXCConfig(command) + if err != nil { + t.Fatal(err) + } + grepFile(t, p, + fmt.Sprintf("lxc.cgroup.memory.limit_in_bytes = %d", mem)) + + grepFile(t, p, + fmt.Sprintf("lxc.cgroup.memory.memsw.limit_in_bytes = %d", mem*2)) +} + +func TestCustomLxcConfig(t *testing.T) { + root, err := ioutil.TempDir("", "TestCustomLxcConfig") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(root) + + os.MkdirAll(path.Join(root, "containers", "1"), 0777) + + driver, err := NewDriver(root, false) + if err != nil { + t.Fatal(err) + } + command := &execdriver.Command{ + ID: "1", + Privileged: false, + Config: map[string][]string{ + "lxc": { + "lxc.utsname = docker", + "lxc.cgroup.cpuset.cpus = 0,1", + }, + }, + Network: &execdriver.Network{ + Mtu: 1500, + Interface: nil, + }, + } + + p, err := driver.generateLXCConfig(command) + if err != nil { + t.Fatal(err) + } + + grepFile(t, p, "lxc.utsname = docker") + grepFile(t, p, "lxc.cgroup.cpuset.cpus = 0,1") +} + +func grepFile(t *testing.T, path string, pattern string) { + f, err := os.Open(path) + if err != nil { + t.Fatal(err) + } + defer f.Close() + r := bufio.NewReader(f) + var ( + line string + ) + err = nil + for err == nil { + line, err = r.ReadString('\n') + if strings.Contains(line, pattern) == true { + return + } + } + t.Fatalf("grepFile: pattern \"%s\" not found in \"%s\"", pattern, path) +} + +func TestEscapeFstabSpaces(t *testing.T) { + var testInputs = map[string]string{ + " ": "\\040", + "": "", + "/double space": "/double\\040\\040space", + "/some long test string": "/some\\040long\\040test\\040string", + "/var/lib/docker": "/var/lib/docker", + " leading": "\\040leading", + "trailing ": "trailing\\040", + } + for in, exp := range testInputs { + if out := escapeFstabSpaces(in); exp != out { + t.Logf("Expected %s got %s", exp, out) + t.Fail() + } + } +} diff --git a/runtime/execdriver/native/configuration/parse.go b/runtime/execdriver/native/configuration/parse.go new file mode 100644 index 0000000000..6d6c643919 --- /dev/null +++ b/runtime/execdriver/native/configuration/parse.go @@ -0,0 +1,186 @@ +package configuration + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/utils" + "os/exec" + "path/filepath" + "strconv" + "strings" +) + +type Action func(*libcontainer.Container, interface{}, string) error + +var actions = map[string]Action{ + "cap.add": addCap, // add a cap + "cap.drop": dropCap, // drop a cap + + "ns.add": addNamespace, // add a namespace + "ns.drop": dropNamespace, // drop a namespace when cloning + + "net.join": joinNetNamespace, // join another containers net namespace + + "cgroups.cpu_shares": cpuShares, // set the cpu shares + "cgroups.memory": memory, // set the memory limit + "cgroups.memory_swap": memorySwap, // set the memory swap limit + "cgroups.cpuset.cpus": cpusetCpus, // set the cpus used + + "apparmor_profile": apparmorProfile, // set the apparmor profile to apply + + "fs.readonly": readonlyFs, // make the rootfs of the container read only +} + +func cpusetCpus(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { + return fmt.Errorf("cannot set cgroups when they are disabled") + } + container.Cgroups.CpusetCpus = value + + return nil +} + +func apparmorProfile(container *libcontainer.Container, context interface{}, value string) error { + container.Context["apparmor_profile"] = value + return nil +} + +func cpuShares(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { + return fmt.Errorf("cannot set cgroups when they are disabled") + } + v, err := strconv.ParseInt(value, 10, 0) + if err != nil { + return err + } + container.Cgroups.CpuShares = v + return nil +} + +func memory(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { + return fmt.Errorf("cannot set cgroups when they are disabled") + } + + v, err := utils.RAMInBytes(value) + if err != nil { + return err + } + container.Cgroups.Memory = v + return nil +} + +func memorySwap(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { + return fmt.Errorf("cannot set cgroups when they are disabled") + } + v, err := strconv.ParseInt(value, 0, 64) + if err != nil { + return err + } + container.Cgroups.MemorySwap = v + return nil +} + +func addCap(container *libcontainer.Container, context interface{}, value string) error { + c := container.CapabilitiesMask.Get(value) + if c == nil { + return fmt.Errorf("%s is not a valid capability", value) + } + c.Enabled = true + return nil +} + +func dropCap(container *libcontainer.Container, context interface{}, value string) error { + c := container.CapabilitiesMask.Get(value) + if c == nil { + return fmt.Errorf("%s is not a valid capability", value) + } + c.Enabled = false + return nil +} + +func addNamespace(container *libcontainer.Container, context interface{}, value string) error { + ns := container.Namespaces.Get(value) + if ns == nil { + return fmt.Errorf("%s is not a valid namespace", value[1:]) + } + ns.Enabled = true + return nil +} + +func dropNamespace(container *libcontainer.Container, context interface{}, value string) error { + ns := container.Namespaces.Get(value) + if ns == nil { + return fmt.Errorf("%s is not a valid namespace", value[1:]) + } + ns.Enabled = false + return nil +} + +func readonlyFs(container *libcontainer.Container, context interface{}, value string) error { + switch value { + case "1", "true": + container.ReadonlyFs = true + default: + container.ReadonlyFs = false + } + return nil +} + +func joinNetNamespace(container *libcontainer.Container, context interface{}, value string) error { + var ( + running = context.(map[string]*exec.Cmd) + cmd = running[value] + ) + + if cmd == nil || cmd.Process == nil { + return fmt.Errorf("%s is not a valid running container to join", value) + } + nspath := filepath.Join("/proc", fmt.Sprint(cmd.Process.Pid), "ns", "net") + container.Networks = append(container.Networks, &libcontainer.Network{ + Type: "netns", + Context: libcontainer.Context{ + "nspath": nspath, + }, + }) + return nil +} + +func vethMacAddress(container *libcontainer.Container, context interface{}, value string) error { + var veth *libcontainer.Network + for _, network := range container.Networks { + if network.Type == "veth" { + veth = network + break + } + } + if veth == nil { + return fmt.Errorf("not veth configured for container") + } + veth.Context["mac"] = value + return nil +} + +// configureCustomOptions takes string commands from the user and allows modification of the +// container's default configuration. +// +// TODO: this can be moved to a general utils or parser in pkg +func ParseConfiguration(container *libcontainer.Container, running map[string]*exec.Cmd, opts []string) error { + for _, opt := range opts { + kv := strings.SplitN(opt, "=", 2) + if len(kv) < 2 { + return fmt.Errorf("invalid format for %s", opt) + } + + action, exists := actions[kv[0]] + if !exists { + return fmt.Errorf("%s is not a valid option for the native driver", kv[0]) + } + + if err := action(container, running, kv[1]); err != nil { + return err + } + } + return nil +} diff --git a/runtime/execdriver/native/configuration/parse_test.go b/runtime/execdriver/native/configuration/parse_test.go new file mode 100644 index 0000000000..8001358766 --- /dev/null +++ b/runtime/execdriver/native/configuration/parse_test.go @@ -0,0 +1,166 @@ +package configuration + +import ( + "github.com/dotcloud/docker/runtime/execdriver/native/template" + "testing" +) + +func TestSetReadonlyRootFs(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "fs.readonly=true", + } + ) + + if container.ReadonlyFs { + t.Fatal("container should not have a readonly rootfs by default") + } + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if !container.ReadonlyFs { + t.Fatal("container should have a readonly rootfs") + } +} + +func TestConfigurationsDoNotConflict(t *testing.T) { + var ( + container1 = template.New() + container2 = template.New() + opts = []string{ + "cap.add=NET_ADMIN", + } + ) + + if err := ParseConfiguration(container1, nil, opts); err != nil { + t.Fatal(err) + } + + if !container1.CapabilitiesMask.Get("NET_ADMIN").Enabled { + t.Fatal("container one should have NET_ADMIN enabled") + } + if container2.CapabilitiesMask.Get("NET_ADMIN").Enabled { + t.Fatal("container two should not have NET_ADMIN enabled") + } +} + +func TestCpusetCpus(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cgroups.cpuset.cpus=1,2", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if expected := "1,2"; container.Cgroups.CpusetCpus != expected { + t.Fatalf("expected %s got %s for cpuset.cpus", expected, container.Cgroups.CpusetCpus) + } +} + +func TestAppArmorProfile(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "apparmor_profile=koye-the-protector", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + if expected := "koye-the-protector"; container.Context["apparmor_profile"] != expected { + t.Fatalf("expected profile %s got %s", expected, container.Context["apparmor_profile"]) + } +} + +func TestCpuShares(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cgroups.cpu_shares=1048", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if expected := int64(1048); container.Cgroups.CpuShares != expected { + t.Fatalf("expected cpu shares %d got %d", expected, container.Cgroups.CpuShares) + } +} + +func TestCgroupMemory(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cgroups.memory=500m", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if expected := int64(500 * 1024 * 1024); container.Cgroups.Memory != expected { + t.Fatalf("expected memory %d got %d", expected, container.Cgroups.Memory) + } +} + +func TestAddCap(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cap.add=MKNOD", + "cap.add=SYS_ADMIN", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if !container.CapabilitiesMask.Get("MKNOD").Enabled { + t.Fatal("container should have MKNOD enabled") + } + if !container.CapabilitiesMask.Get("SYS_ADMIN").Enabled { + t.Fatal("container should have SYS_ADMIN enabled") + } +} + +func TestDropCap(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cap.drop=MKNOD", + } + ) + // enabled all caps like in privileged mode + for _, c := range container.CapabilitiesMask { + c.Enabled = true + } + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if container.CapabilitiesMask.Get("MKNOD").Enabled { + t.Fatal("container should not have MKNOD enabled") + } +} + +func TestDropNamespace(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "ns.drop=NEWNET", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if container.Namespaces.Get("NEWNET").Enabled { + t.Fatal("container should not have NEWNET enabled") + } +} diff --git a/runtime/execdriver/native/create.go b/runtime/execdriver/native/create.go new file mode 100644 index 0000000000..71fab3e064 --- /dev/null +++ b/runtime/execdriver/native/create.go @@ -0,0 +1,114 @@ +package native + +import ( + "fmt" + "os" + + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/runtime/execdriver/native/configuration" + "github.com/dotcloud/docker/runtime/execdriver/native/template" +) + +// createContainer populates and configures the container type with the +// data provided by the execdriver.Command +func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container, error) { + container := template.New() + + container.Hostname = getEnv("HOSTNAME", c.Env) + container.Tty = c.Tty + container.User = c.User + container.WorkingDir = c.WorkingDir + container.Env = c.Env + container.Cgroups.Name = c.ID + // check to see if we are running in ramdisk to disable pivot root + container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != "" + + if err := d.createNetwork(container, c); err != nil { + return nil, err + } + if c.Privileged { + if err := d.setPrivileged(container); err != nil { + return nil, err + } + } + if err := d.setupCgroups(container, c); err != nil { + return nil, err + } + if err := d.setupMounts(container, c); err != nil { + return nil, err + } + if err := d.setupLabels(container, c); err != nil { + return nil, err + } + if err := configuration.ParseConfiguration(container, d.activeContainers, c.Config["native"]); err != nil { + return nil, err + } + return container, nil +} + +func (d *driver) createNetwork(container *libcontainer.Container, c *execdriver.Command) error { + container.Networks = []*libcontainer.Network{ + { + Mtu: c.Network.Mtu, + Address: fmt.Sprintf("%s/%d", "127.0.0.1", 0), + Gateway: "localhost", + Type: "loopback", + Context: libcontainer.Context{}, + }, + } + + if c.Network.Interface != nil { + vethNetwork := libcontainer.Network{ + Mtu: c.Network.Mtu, + Address: fmt.Sprintf("%s/%d", c.Network.Interface.IPAddress, c.Network.Interface.IPPrefixLen), + Gateway: c.Network.Interface.Gateway, + Type: "veth", + Context: libcontainer.Context{ + "prefix": "veth", + "bridge": c.Network.Interface.Bridge, + }, + } + container.Networks = append(container.Networks, &vethNetwork) + } + return nil +} + +func (d *driver) setPrivileged(container *libcontainer.Container) error { + for _, c := range container.CapabilitiesMask { + c.Enabled = true + } + container.Cgroups.DeviceAccess = true + container.Context["apparmor_profile"] = "unconfined" + return nil +} + +func (d *driver) setupCgroups(container *libcontainer.Container, c *execdriver.Command) error { + if c.Resources != nil { + container.Cgroups.CpuShares = c.Resources.CpuShares + container.Cgroups.Memory = c.Resources.Memory + container.Cgroups.MemorySwap = c.Resources.MemorySwap + } + return nil +} + +func (d *driver) setupMounts(container *libcontainer.Container, c *execdriver.Command) error { + for _, m := range c.Mounts { + container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private}) + } + return nil +} + +func (d *driver) setupLabels(container *libcontainer.Container, c *execdriver.Command) error { + labels := c.Config["label"] + if len(labels) > 0 { + process, mount, err := label.GenLabels(labels[0]) + if err != nil { + return err + } + container.Context["mount_label"] = mount + container.Context["process_label"] = process + } + return nil +} diff --git a/runtime/execdriver/native/driver.go b/runtime/execdriver/native/driver.go new file mode 100644 index 0000000000..d18865e508 --- /dev/null +++ b/runtime/execdriver/native/driver.go @@ -0,0 +1,292 @@ +package native + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/apparmor" + "github.com/dotcloud/docker/pkg/libcontainer/nsinit" + "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/runtime/execdriver" + "io" + "io/ioutil" + "log" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" +) + +const ( + DriverName = "native" + Version = "0.1" + BackupApparmorProfilePath = "apparmor/docker.back" // relative to docker root +) + +func init() { + execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error { + var ( + container *libcontainer.Container + ns = nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{args.Root}, createLogger("")) + ) + f, err := os.Open(filepath.Join(args.Root, "container.json")) + if err != nil { + return err + } + if err := json.NewDecoder(f).Decode(&container); err != nil { + f.Close() + return err + } + f.Close() + + cwd, err := os.Getwd() + if err != nil { + return err + } + syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(args.Pipe)) + if err != nil { + return err + } + if err := ns.Init(container, cwd, args.Console, syncPipe, args.Args); err != nil { + return err + } + return nil + }) +} + +type driver struct { + root string + initPath string + activeContainers map[string]*exec.Cmd +} + +func NewDriver(root, initPath string) (*driver, error) { + if err := os.MkdirAll(root, 0700); err != nil { + return nil, err + } + // native driver root is at docker_root/execdriver/native. Put apparmor at docker_root + if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil { + return nil, err + } + return &driver{ + root: root, + initPath: initPath, + activeContainers: make(map[string]*exec.Cmd), + }, nil +} + +func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { + // take the Command and populate the libcontainer.Container from it + container, err := d.createContainer(c) + if err != nil { + return -1, err + } + d.activeContainers[c.ID] = &c.Cmd + + var ( + term nsinit.Terminal + factory = &dockerCommandFactory{c: c, driver: d} + stateWriter = &dockerStateWriter{ + callback: startCallback, + c: c, + dsw: &nsinit.DefaultStateWriter{filepath.Join(d.root, c.ID)}, + } + ns = nsinit.NewNsInit(factory, stateWriter, createLogger(os.Getenv("DEBUG"))) + args = append([]string{c.Entrypoint}, c.Arguments...) + ) + if err := d.createContainerRoot(c.ID); err != nil { + return -1, err + } + defer d.removeContainerRoot(c.ID) + + if c.Tty { + term = &dockerTtyTerm{ + pipes: pipes, + } + } else { + term = &dockerStdTerm{ + pipes: pipes, + } + } + c.Terminal = term + if err := d.writeContainerFile(container, c.ID); err != nil { + return -1, err + } + return ns.Exec(container, term, args) +} + +func (d *driver) Kill(p *execdriver.Command, sig int) error { + return syscall.Kill(p.Process.Pid, syscall.Signal(sig)) +} + +func (d *driver) Terminate(p *execdriver.Command) error { + // lets check the start time for the process + started, err := d.readStartTime(p) + if err != nil { + // if we don't have the data on disk then we can assume the process is gone + // because this is only removed after we know the process has stopped + if os.IsNotExist(err) { + return nil + } + return err + } + + currentStartTime, err := system.GetProcessStartTime(p.Process.Pid) + if err != nil { + return err + } + if started == currentStartTime { + err = syscall.Kill(p.Process.Pid, 9) + } + d.removeContainerRoot(p.ID) + return err + +} + +func (d *driver) readStartTime(p *execdriver.Command) (string, error) { + data, err := ioutil.ReadFile(filepath.Join(d.root, p.ID, "start")) + if err != nil { + return "", err + } + return string(data), nil +} + +func (d *driver) Info(id string) execdriver.Info { + return &info{ + ID: id, + driver: d, + } +} + +func (d *driver) Name() string { + return fmt.Sprintf("%s-%s", DriverName, Version) +} + +// TODO: this can be improved with our driver +// there has to be a better way to do this +func (d *driver) GetPidsForContainer(id string) ([]int, error) { + pids := []int{} + + subsystem := "devices" + cgroupRoot, err := cgroups.FindCgroupMountpoint(subsystem) + if err != nil { + return pids, err + } + cgroupDir, err := cgroups.GetThisCgroupDir(subsystem) + if err != nil { + return pids, err + } + + filename := filepath.Join(cgroupRoot, cgroupDir, id, "tasks") + if _, err := os.Stat(filename); os.IsNotExist(err) { + filename = filepath.Join(cgroupRoot, cgroupDir, "docker", id, "tasks") + } + + output, err := ioutil.ReadFile(filename) + if err != nil { + return pids, err + } + for _, p := range strings.Split(string(output), "\n") { + if len(p) == 0 { + continue + } + pid, err := strconv.Atoi(p) + if err != nil { + return pids, fmt.Errorf("Invalid pid '%s': %s", p, err) + } + pids = append(pids, pid) + } + return pids, nil +} + +func (d *driver) writeContainerFile(container *libcontainer.Container, id string) error { + data, err := json.Marshal(container) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(d.root, id, "container.json"), data, 0655) +} + +func (d *driver) createContainerRoot(id string) error { + return os.MkdirAll(filepath.Join(d.root, id), 0655) +} + +func (d *driver) removeContainerRoot(id string) error { + return os.RemoveAll(filepath.Join(d.root, id)) +} + +func getEnv(key string, env []string) string { + for _, pair := range env { + parts := strings.Split(pair, "=") + if parts[0] == key { + return parts[1] + } + } + return "" +} + +type dockerCommandFactory struct { + c *execdriver.Command + driver *driver +} + +// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +func (d *dockerCommandFactory) Create(container *libcontainer.Container, console string, syncFile *os.File, args []string) *exec.Cmd { + // we need to join the rootfs because nsinit will setup the rootfs and chroot + initPath := filepath.Join(d.c.Rootfs, d.c.InitPath) + + d.c.Path = d.driver.initPath + d.c.Args = append([]string{ + initPath, + "-driver", DriverName, + "-console", console, + "-pipe", "3", + "-root", filepath.Join(d.driver.root, d.c.ID), + "--", + }, args...) + + // set this to nil so that when we set the clone flags anything else is reset + d.c.SysProcAttr = nil + system.SetCloneFlags(&d.c.Cmd, uintptr(nsinit.GetNamespaceFlags(container.Namespaces))) + d.c.ExtraFiles = []*os.File{syncFile} + + d.c.Env = container.Env + d.c.Dir = d.c.Rootfs + + return &d.c.Cmd +} + +type dockerStateWriter struct { + dsw nsinit.StateWriter + c *execdriver.Command + callback execdriver.StartCallback +} + +func (d *dockerStateWriter) WritePid(pid int, started string) error { + d.c.ContainerPid = pid + err := d.dsw.WritePid(pid, started) + if d.callback != nil { + d.callback(d.c) + } + return err +} + +func (d *dockerStateWriter) DeletePid() error { + return d.dsw.DeletePid() +} + +func createLogger(debug string) *log.Logger { + var w io.Writer + // if we are in debug mode set the logger to stderr + if debug != "" { + w = os.Stderr + } else { + w = ioutil.Discard + } + return log.New(w, "[libcontainer] ", log.LstdFlags) +} diff --git a/runtime/execdriver/native/info.go b/runtime/execdriver/native/info.go new file mode 100644 index 0000000000..aef2f85c6b --- /dev/null +++ b/runtime/execdriver/native/info.go @@ -0,0 +1,21 @@ +package native + +import ( + "os" + "path/filepath" +) + +type info struct { + ID string + driver *driver +} + +// IsRunning is determined by looking for the +// pid file for a container. If the file exists then the +// container is currently running +func (i *info) IsRunning() bool { + if _, err := os.Stat(filepath.Join(i.driver.root, i.ID, "pid")); err == nil { + return true + } + return false +} diff --git a/runtime/execdriver/native/template/default_template.go b/runtime/execdriver/native/template/default_template.go new file mode 100644 index 0000000000..a1ecb04d76 --- /dev/null +++ b/runtime/execdriver/native/template/default_template.go @@ -0,0 +1,45 @@ +package template + +import ( + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" +) + +// New returns the docker default configuration for libcontainer +func New() *libcontainer.Container { + container := &libcontainer.Container{ + CapabilitiesMask: libcontainer.Capabilities{ + libcontainer.GetCapability("SETPCAP"), + libcontainer.GetCapability("SYS_MODULE"), + libcontainer.GetCapability("SYS_RAWIO"), + libcontainer.GetCapability("SYS_PACCT"), + libcontainer.GetCapability("SYS_ADMIN"), + libcontainer.GetCapability("SYS_NICE"), + libcontainer.GetCapability("SYS_RESOURCE"), + libcontainer.GetCapability("SYS_TIME"), + libcontainer.GetCapability("SYS_TTY_CONFIG"), + libcontainer.GetCapability("AUDIT_WRITE"), + libcontainer.GetCapability("AUDIT_CONTROL"), + libcontainer.GetCapability("MAC_OVERRIDE"), + libcontainer.GetCapability("MAC_ADMIN"), + libcontainer.GetCapability("NET_ADMIN"), + libcontainer.GetCapability("MKNOD"), + }, + Namespaces: libcontainer.Namespaces{ + libcontainer.GetNamespace("NEWNS"), + libcontainer.GetNamespace("NEWUTS"), + libcontainer.GetNamespace("NEWIPC"), + libcontainer.GetNamespace("NEWPID"), + libcontainer.GetNamespace("NEWNET"), + }, + Cgroups: &cgroups.Cgroup{ + Parent: "docker", + DeviceAccess: false, + }, + Context: libcontainer.Context{ + "apparmor_profile": "docker-default", + }, + } + container.CapabilitiesMask.Get("MKNOD").Enabled = true + return container +} diff --git a/runtime/execdriver/native/term.go b/runtime/execdriver/native/term.go new file mode 100644 index 0000000000..0d5298d388 --- /dev/null +++ b/runtime/execdriver/native/term.go @@ -0,0 +1,42 @@ +/* + These types are wrappers around the libcontainer Terminal interface so that + we can resuse the docker implementations where possible. +*/ +package native + +import ( + "github.com/dotcloud/docker/runtime/execdriver" + "io" + "os" + "os/exec" +) + +type dockerStdTerm struct { + execdriver.StdConsole + pipes *execdriver.Pipes +} + +func (d *dockerStdTerm) Attach(cmd *exec.Cmd) error { + return d.AttachPipes(cmd, d.pipes) +} + +func (d *dockerStdTerm) SetMaster(master *os.File) { + // do nothing +} + +type dockerTtyTerm struct { + execdriver.TtyConsole + pipes *execdriver.Pipes +} + +func (t *dockerTtyTerm) Attach(cmd *exec.Cmd) error { + go io.Copy(t.pipes.Stdout, t.MasterPty) + if t.pipes.Stdin != nil { + go io.Copy(t.MasterPty, t.pipes.Stdin) + } + return nil +} + +func (t *dockerTtyTerm) SetMaster(master *os.File) { + t.MasterPty = master +} diff --git a/runtime/execdriver/pipes.go b/runtime/execdriver/pipes.go new file mode 100644 index 0000000000..158219f0c5 --- /dev/null +++ b/runtime/execdriver/pipes.go @@ -0,0 +1,23 @@ +package execdriver + +import ( + "io" +) + +// Pipes is a wrapper around a containers output for +// stdin, stdout, stderr +type Pipes struct { + Stdin io.ReadCloser + Stdout, Stderr io.Writer +} + +func NewPipes(stdin io.ReadCloser, stdout, stderr io.Writer, useStdin bool) *Pipes { + p := &Pipes{ + Stdout: stdout, + Stderr: stderr, + } + if useStdin { + p.Stdin = stdin + } + return p +} diff --git a/runtime/execdriver/termconsole.go b/runtime/execdriver/termconsole.go new file mode 100644 index 0000000000..af6b88d3d1 --- /dev/null +++ b/runtime/execdriver/termconsole.go @@ -0,0 +1,126 @@ +package execdriver + +import ( + "github.com/dotcloud/docker/pkg/term" + "github.com/kr/pty" + "io" + "os" + "os/exec" +) + +func SetTerminal(command *Command, pipes *Pipes) error { + var ( + term Terminal + err error + ) + if command.Tty { + term, err = NewTtyConsole(command, pipes) + } else { + term, err = NewStdConsole(command, pipes) + } + if err != nil { + return err + } + command.Terminal = term + return nil +} + +type TtyConsole struct { + MasterPty *os.File + SlavePty *os.File +} + +func NewTtyConsole(command *Command, pipes *Pipes) (*TtyConsole, error) { + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, err + } + tty := &TtyConsole{ + MasterPty: ptyMaster, + SlavePty: ptySlave, + } + if err := tty.AttachPipes(&command.Cmd, pipes); err != nil { + tty.Close() + return nil, err + } + command.Console = tty.SlavePty.Name() + return tty, nil +} + +func (t *TtyConsole) Master() *os.File { + return t.MasterPty +} + +func (t *TtyConsole) Resize(h, w int) error { + return term.SetWinsize(t.MasterPty.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) +} + +func (t *TtyConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error { + command.Stdout = t.SlavePty + command.Stderr = t.SlavePty + + go func() { + if wb, ok := pipes.Stdout.(interface { + CloseWriters() error + }); ok { + defer wb.CloseWriters() + } + io.Copy(pipes.Stdout, t.MasterPty) + }() + + if pipes.Stdin != nil { + command.Stdin = t.SlavePty + command.SysProcAttr.Setctty = true + + go func() { + defer pipes.Stdin.Close() + io.Copy(t.MasterPty, pipes.Stdin) + }() + } + return nil +} + +func (t *TtyConsole) Close() error { + t.SlavePty.Close() + return t.MasterPty.Close() +} + +type StdConsole struct { +} + +func NewStdConsole(command *Command, pipes *Pipes) (*StdConsole, error) { + std := &StdConsole{} + + if err := std.AttachPipes(&command.Cmd, pipes); err != nil { + return nil, err + } + return std, nil +} + +func (s *StdConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error { + command.Stdout = pipes.Stdout + command.Stderr = pipes.Stderr + + if pipes.Stdin != nil { + stdin, err := command.StdinPipe() + if err != nil { + return err + } + + go func() { + defer stdin.Close() + io.Copy(stdin, pipes.Stdin) + }() + } + return nil +} + +func (s *StdConsole) Resize(h, w int) error { + // we do not need to reside a non tty + return nil +} + +func (s *StdConsole) Close() error { + // nothing to close here + return nil +} |
