diff options
| author | unclejack <unclejack@users.noreply.github.com> | 2014-04-09 01:56:01 +0300 |
|---|---|---|
| committer | unclejack <unclejack@users.noreply.github.com> | 2014-04-09 01:56:01 +0300 |
| commit | e128a606e39fa63c6b4fd6e53a1d88cf00aad868 (patch) | |
| tree | 199ee7eb6678ffecd2ddad95fce794c795ad5183 /runtime | |
| parent | 143c9707a9fafc39e1d9747f528db97b2564f01e (diff) | |
| parent | dc9c28f51d669d6b09e81c2381f800f1a33bb659 (diff) | |
| download | docker-release-0.10.tar.gz | |
Merge pull request #5079 from unclejack/bump_v0.10.0 (refs: release-0.10, 0.10.1-hotfixes)
Bump version to v0.10.0
Diffstat (limited to 'runtime')
67 files changed, 12405 insertions, 0 deletions
diff --git a/runtime/container.go b/runtime/container.go new file mode 100644 index 0000000000..c8053b146c --- /dev/null +++ b/runtime/container.go @@ -0,0 +1,1229 @@ +package runtime + +import ( + "encoding/json" + "errors" + "fmt" + "github.com/dotcloud/docker/archive" + "github.com/dotcloud/docker/engine" + "github.com/dotcloud/docker/image" + "github.com/dotcloud/docker/links" + "github.com/dotcloud/docker/nat" + "github.com/dotcloud/docker/runconfig" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/runtime/graphdriver" + "github.com/dotcloud/docker/utils" + "io" + "io/ioutil" + "log" + "os" + "path" + "strings" + "sync" + "syscall" + "time" +) + +const DefaultPathEnv = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + +var ( + ErrNotATTY = errors.New("The PTY is not a file") + ErrNoTTY = errors.New("No PTY found") + ErrContainerStart = errors.New("The container failed to start. Unknown error") + ErrContainerStartTimeout = errors.New("The container failed to start due to timed out.") +) + +type Container struct { + sync.Mutex + root string // Path to the "home" of the container, including metadata. + basefs string // Path to the graphdriver mountpoint + + ID string + + Created time.Time + + Path string + Args []string + + Config *runconfig.Config + State State + Image string + + NetworkSettings *NetworkSettings + + ResolvConfPath string + HostnamePath string + HostsPath string + Name string + Driver string + ExecDriver string + + command *execdriver.Command + stdout *utils.WriteBroadcaster + stderr *utils.WriteBroadcaster + stdin io.ReadCloser + stdinPipe io.WriteCloser + + runtime *Runtime + + waitLock chan struct{} + Volumes map[string]string + // Store rw/ro in a separate structure to preserve reverse-compatibility on-disk. 
+ // Easier than migrating older container configs :) + VolumesRW map[string]bool + hostConfig *runconfig.HostConfig + + activeLinks map[string]*links.Link +} + +// FIXME: move deprecated port stuff to nat to clean up the core. +type PortMapping map[string]string // Deprecated + +type NetworkSettings struct { + IPAddress string + IPPrefixLen int + Gateway string + Bridge string + PortMapping map[string]PortMapping // Deprecated + Ports nat.PortMap +} + +func (settings *NetworkSettings) PortMappingAPI() *engine.Table { + var outs = engine.NewTable("", 0) + for port, bindings := range settings.Ports { + p, _ := nat.ParsePort(port.Port()) + if len(bindings) == 0 { + out := &engine.Env{} + out.SetInt("PublicPort", p) + out.Set("Type", port.Proto()) + outs.Add(out) + continue + } + for _, binding := range bindings { + out := &engine.Env{} + h, _ := nat.ParsePort(binding.HostPort) + out.SetInt("PrivatePort", p) + out.SetInt("PublicPort", h) + out.Set("Type", port.Proto()) + out.Set("IP", binding.HostIp) + outs.Add(out) + } + } + return outs +} + +// Inject the io.Reader at the given path. Note: do not close the reader +func (container *Container) Inject(file io.Reader, pth string) error { + if err := container.Mount(); err != nil { + return fmt.Errorf("inject: error mounting container %s: %s", container.ID, err) + } + defer container.Unmount() + + // Return error if path exists + destPath := path.Join(container.basefs, pth) + if _, err := os.Stat(destPath); err == nil { + // Since err is nil, the path could be stat'd and it exists + return fmt.Errorf("%s exists", pth) + } else if !os.IsNotExist(err) { + // Expect err might be that the file doesn't exist, so + // if it's some other error, return that. 
+ + return err + } + + // Make sure the directory exists + if err := os.MkdirAll(path.Join(container.basefs, path.Dir(pth)), 0755); err != nil { + return err + } + + dest, err := os.Create(destPath) + if err != nil { + return err + } + defer dest.Close() + + if _, err := io.Copy(dest, file); err != nil { + return err + } + return nil +} + +func (container *Container) When() time.Time { + return container.Created +} + +func (container *Container) FromDisk() error { + data, err := ioutil.ReadFile(container.jsonPath()) + if err != nil { + return err + } + // Load container settings + // udp broke compat of docker.PortMapping, but it's not used when loading a container, we can skip it + if err := json.Unmarshal(data, container); err != nil && !strings.Contains(err.Error(), "docker.PortMapping") { + return err + } + return container.readHostConfig() +} + +func (container *Container) ToDisk() (err error) { + data, err := json.Marshal(container) + if err != nil { + return + } + err = ioutil.WriteFile(container.jsonPath(), data, 0666) + if err != nil { + return + } + return container.WriteHostConfig() +} + +func (container *Container) readHostConfig() error { + container.hostConfig = &runconfig.HostConfig{} + // If the hostconfig file does not exist, do not read it. + // (We still have to initialize container.hostConfig, + // but that's OK, since we just did that above.) 
+ _, err := os.Stat(container.hostConfigPath()) + if os.IsNotExist(err) { + return nil + } + data, err := ioutil.ReadFile(container.hostConfigPath()) + if err != nil { + return err + } + return json.Unmarshal(data, container.hostConfig) +} + +func (container *Container) WriteHostConfig() (err error) { + data, err := json.Marshal(container.hostConfig) + if err != nil { + return + } + return ioutil.WriteFile(container.hostConfigPath(), data, 0666) +} + +func (container *Container) generateEnvConfig(env []string) error { + data, err := json.Marshal(env) + if err != nil { + return err + } + p, err := container.EnvConfigPath() + if err != nil { + return err + } + ioutil.WriteFile(p, data, 0600) + return nil +} + +func (container *Container) Attach(stdin io.ReadCloser, stdinCloser io.Closer, stdout io.Writer, stderr io.Writer) chan error { + var cStdout, cStderr io.ReadCloser + + var nJobs int + errors := make(chan error, 3) + if stdin != nil && container.Config.OpenStdin { + nJobs += 1 + if cStdin, err := container.StdinPipe(); err != nil { + errors <- err + } else { + go func() { + utils.Debugf("attach: stdin: begin") + defer utils.Debugf("attach: stdin: end") + // No matter what, when stdin is closed (io.Copy unblock), close stdout and stderr + if container.Config.StdinOnce && !container.Config.Tty { + defer cStdin.Close() + } else { + defer func() { + if cStdout != nil { + cStdout.Close() + } + if cStderr != nil { + cStderr.Close() + } + }() + } + if container.Config.Tty { + _, err = utils.CopyEscapable(cStdin, stdin) + } else { + _, err = io.Copy(cStdin, stdin) + } + if err == io.ErrClosedPipe { + err = nil + } + if err != nil { + utils.Errorf("attach: stdin: %s", err) + } + errors <- err + }() + } + } + if stdout != nil { + nJobs += 1 + if p, err := container.StdoutPipe(); err != nil { + errors <- err + } else { + cStdout = p + go func() { + utils.Debugf("attach: stdout: begin") + defer utils.Debugf("attach: stdout: end") + // If we are in StdinOnce mode, then 
close stdin + if container.Config.StdinOnce && stdin != nil { + defer stdin.Close() + } + if stdinCloser != nil { + defer stdinCloser.Close() + } + _, err := io.Copy(stdout, cStdout) + if err == io.ErrClosedPipe { + err = nil + } + if err != nil { + utils.Errorf("attach: stdout: %s", err) + } + errors <- err + }() + } + } else { + go func() { + if stdinCloser != nil { + defer stdinCloser.Close() + } + if cStdout, err := container.StdoutPipe(); err != nil { + utils.Errorf("attach: stdout pipe: %s", err) + } else { + io.Copy(&utils.NopWriter{}, cStdout) + } + }() + } + if stderr != nil { + nJobs += 1 + if p, err := container.StderrPipe(); err != nil { + errors <- err + } else { + cStderr = p + go func() { + utils.Debugf("attach: stderr: begin") + defer utils.Debugf("attach: stderr: end") + // If we are in StdinOnce mode, then close stdin + if container.Config.StdinOnce && stdin != nil { + defer stdin.Close() + } + if stdinCloser != nil { + defer stdinCloser.Close() + } + _, err := io.Copy(stderr, cStderr) + if err == io.ErrClosedPipe { + err = nil + } + if err != nil { + utils.Errorf("attach: stderr: %s", err) + } + errors <- err + }() + } + } else { + go func() { + if stdinCloser != nil { + defer stdinCloser.Close() + } + + if cStderr, err := container.StderrPipe(); err != nil { + utils.Errorf("attach: stdout pipe: %s", err) + } else { + io.Copy(&utils.NopWriter{}, cStderr) + } + }() + } + + return utils.Go(func() error { + defer func() { + if cStdout != nil { + cStdout.Close() + } + if cStderr != nil { + cStderr.Close() + } + }() + + // FIXME: how to clean up the stdin goroutine without the unwanted side effect + // of closing the passed stdin? Add an intermediary io.Pipe? 
+ for i := 0; i < nJobs; i += 1 { + utils.Debugf("attach: waiting for job %d/%d", i+1, nJobs) + if err := <-errors; err != nil { + utils.Errorf("attach: job %d returned error %s, aborting all jobs", i+1, err) + return err + } + utils.Debugf("attach: job %d completed successfully", i+1) + } + utils.Debugf("attach: all jobs completed successfully") + return nil + }) +} + +func populateCommand(c *Container) { + var ( + en *execdriver.Network + driverConfig = make(map[string][]string) + ) + + en = &execdriver.Network{ + Mtu: c.runtime.config.Mtu, + Interface: nil, + } + + if !c.Config.NetworkDisabled { + network := c.NetworkSettings + en.Interface = &execdriver.NetworkInterface{ + Gateway: network.Gateway, + Bridge: network.Bridge, + IPAddress: network.IPAddress, + IPPrefixLen: network.IPPrefixLen, + } + } + + // TODO: this can be removed after lxc-conf is fully deprecated + mergeLxcConfIntoOptions(c.hostConfig, driverConfig) + + resources := &execdriver.Resources{ + Memory: c.Config.Memory, + MemorySwap: c.Config.MemorySwap, + CpuShares: c.Config.CpuShares, + } + c.command = &execdriver.Command{ + ID: c.ID, + Privileged: c.hostConfig.Privileged, + Rootfs: c.RootfsPath(), + InitPath: "/.dockerinit", + Entrypoint: c.Path, + Arguments: c.Args, + WorkingDir: c.Config.WorkingDir, + Network: en, + Tty: c.Config.Tty, + User: c.Config.User, + Config: driverConfig, + Resources: resources, + } + c.command.SysProcAttr = &syscall.SysProcAttr{Setsid: true} +} + +func (container *Container) ArgsAsString() string { + var args []string + for _, arg := range container.Args { + if strings.Contains(arg, " ") { + args = append(args, fmt.Sprintf("'%s'", arg)) + } else { + args = append(args, arg) + } + } + return strings.Join(args, " ") +} + +func (container *Container) Start() (err error) { + container.Lock() + defer container.Unlock() + + if container.State.IsRunning() { + return nil + } + + defer func() { + if err != nil { + container.cleanup() + } + }() + + if container.ResolvConfPath 
== "" { + if err := container.setupContainerDns(); err != nil { + return err + } + } + + if err := container.Mount(); err != nil { + return err + } + + if container.runtime.config.DisableNetwork { + container.Config.NetworkDisabled = true + container.buildHostnameAndHostsFiles("127.0.1.1") + } else { + if err := container.allocateNetwork(); err != nil { + return err + } + container.buildHostnameAndHostsFiles(container.NetworkSettings.IPAddress) + } + + // Make sure the config is compatible with the current kernel + if container.Config.Memory > 0 && !container.runtime.sysInfo.MemoryLimit { + log.Printf("WARNING: Your kernel does not support memory limit capabilities. Limitation discarded.\n") + container.Config.Memory = 0 + } + if container.Config.Memory > 0 && !container.runtime.sysInfo.SwapLimit { + log.Printf("WARNING: Your kernel does not support swap limit capabilities. Limitation discarded.\n") + container.Config.MemorySwap = -1 + } + + if container.runtime.sysInfo.IPv4ForwardingDisabled { + log.Printf("WARNING: IPv4 forwarding is disabled. 
Networking will not work") + } + + if err := prepareVolumesForContainer(container); err != nil { + return err + } + + // Setup environment + env := []string{ + "HOME=/", + "PATH=" + DefaultPathEnv, + "HOSTNAME=" + container.Config.Hostname, + } + + if container.Config.Tty { + env = append(env, "TERM=xterm") + } + + // Init any links between the parent and children + runtime := container.runtime + + children, err := runtime.Children(container.Name) + if err != nil { + return err + } + + if len(children) > 0 { + container.activeLinks = make(map[string]*links.Link, len(children)) + + // If we encounter an error make sure that we rollback any network + // config and ip table changes + rollback := func() { + for _, link := range container.activeLinks { + link.Disable() + } + container.activeLinks = nil + } + + for linkAlias, child := range children { + if !child.State.IsRunning() { + return fmt.Errorf("Cannot link to a non running container: %s AS %s", child.Name, linkAlias) + } + + link, err := links.NewLink( + container.NetworkSettings.IPAddress, + child.NetworkSettings.IPAddress, + linkAlias, + child.Config.Env, + child.Config.ExposedPorts, + runtime.eng) + + if err != nil { + rollback() + return err + } + + container.activeLinks[link.Alias()] = link + if err := link.Enable(); err != nil { + rollback() + return err + } + + for _, envVar := range link.ToEnv() { + env = append(env, envVar) + } + } + } + + // because the env on the container can override certain default values + // we need to replace the 'env' keys where they match and append anything + // else. 
+ env = utils.ReplaceOrAppendEnvValues(env, container.Config.Env) + if err := container.generateEnvConfig(env); err != nil { + return err + } + + if container.Config.WorkingDir != "" { + container.Config.WorkingDir = path.Clean(container.Config.WorkingDir) + + pthInfo, err := os.Stat(path.Join(container.basefs, container.Config.WorkingDir)) + if err != nil { + if !os.IsNotExist(err) { + return err + } + if err := os.MkdirAll(path.Join(container.basefs, container.Config.WorkingDir), 0755); err != nil { + return err + } + } + if pthInfo != nil && !pthInfo.IsDir() { + return fmt.Errorf("Cannot mkdir: %s is not a directory", container.Config.WorkingDir) + } + } + + envPath, err := container.EnvConfigPath() + if err != nil { + return err + } + + populateCommand(container) + container.command.Env = env + + if err := setupMountsForContainer(container, envPath); err != nil { + return err + } + + // Setup logging of stdout and stderr to disk + if err := container.runtime.LogToDisk(container.stdout, container.logPath("json"), "stdout"); err != nil { + return err + } + if err := container.runtime.LogToDisk(container.stderr, container.logPath("json"), "stderr"); err != nil { + return err + } + container.waitLock = make(chan struct{}) + + callbackLock := make(chan struct{}) + callback := func(command *execdriver.Command) { + container.State.SetRunning(command.Pid()) + if command.Tty { + // The callback is called after the process Start() + // so we are in the parent process. In TTY mode, stdin/out/err is the PtySlace + // which we close here. 
+ if c, ok := command.Stdout.(io.Closer); ok { + c.Close() + } + } + if err := container.ToDisk(); err != nil { + utils.Debugf("%s", err) + } + close(callbackLock) + } + + // We use a callback here instead of a goroutine and an chan for + // syncronization purposes + cErr := utils.Go(func() error { return container.monitor(callback) }) + + // Start should not return until the process is actually running + select { + case <-callbackLock: + case err := <-cErr: + return err + } + return nil +} + +func (container *Container) Run() error { + if err := container.Start(); err != nil { + return err + } + container.Wait() + return nil +} + +func (container *Container) Output() (output []byte, err error) { + pipe, err := container.StdoutPipe() + if err != nil { + return nil, err + } + defer pipe.Close() + if err := container.Start(); err != nil { + return nil, err + } + output, err = ioutil.ReadAll(pipe) + container.Wait() + return output, err +} + +// Container.StdinPipe returns a WriteCloser which can be used to feed data +// to the standard input of the container's active process. +// Container.StdoutPipe and Container.StderrPipe each return a ReadCloser +// which can be used to retrieve the standard output (and error) generated +// by the container's active process. The output (and error) are actually +// copied and delivered to all StdoutPipe and StderrPipe consumers, using +// a kind of "broadcaster". 
+ +func (container *Container) StdinPipe() (io.WriteCloser, error) { + return container.stdinPipe, nil +} + +func (container *Container) StdoutPipe() (io.ReadCloser, error) { + reader, writer := io.Pipe() + container.stdout.AddWriter(writer, "") + return utils.NewBufReader(reader), nil +} + +func (container *Container) StderrPipe() (io.ReadCloser, error) { + reader, writer := io.Pipe() + container.stderr.AddWriter(writer, "") + return utils.NewBufReader(reader), nil +} + +func (container *Container) buildHostnameAndHostsFiles(IP string) { + container.HostnamePath = path.Join(container.root, "hostname") + ioutil.WriteFile(container.HostnamePath, []byte(container.Config.Hostname+"\n"), 0644) + + hostsContent := []byte(` +127.0.0.1 localhost +::1 localhost ip6-localhost ip6-loopback +fe00::0 ip6-localnet +ff00::0 ip6-mcastprefix +ff02::1 ip6-allnodes +ff02::2 ip6-allrouters +`) + + container.HostsPath = path.Join(container.root, "hosts") + + if container.Config.Domainname != "" { + hostsContent = append([]byte(fmt.Sprintf("%s\t%s.%s %s\n", IP, container.Config.Hostname, container.Config.Domainname, container.Config.Hostname)), hostsContent...) + } else if !container.Config.NetworkDisabled { + hostsContent = append([]byte(fmt.Sprintf("%s\t%s\n", IP, container.Config.Hostname)), hostsContent...) 
+ } + + ioutil.WriteFile(container.HostsPath, hostsContent, 0644) +} + +func (container *Container) allocateNetwork() error { + if container.Config.NetworkDisabled { + return nil + } + + var ( + env *engine.Env + err error + eng = container.runtime.eng + ) + + if container.State.IsGhost() { + if container.runtime.config.DisableNetwork { + env = &engine.Env{} + } else { + currentIP := container.NetworkSettings.IPAddress + + job := eng.Job("allocate_interface", container.ID) + if currentIP != "" { + job.Setenv("RequestIP", currentIP) + } + + env, err = job.Stdout.AddEnv() + if err != nil { + return err + } + + if err := job.Run(); err != nil { + return err + } + } + } else { + job := eng.Job("allocate_interface", container.ID) + env, err = job.Stdout.AddEnv() + if err != nil { + return err + } + if err := job.Run(); err != nil { + return err + } + } + + if container.Config.PortSpecs != nil { + utils.Debugf("Migrating port mappings for container: %s", strings.Join(container.Config.PortSpecs, ", ")) + if err := migratePortMappings(container.Config, container.hostConfig); err != nil { + return err + } + container.Config.PortSpecs = nil + if err := container.WriteHostConfig(); err != nil { + return err + } + } + + var ( + portSpecs = make(nat.PortSet) + bindings = make(nat.PortMap) + ) + + if !container.State.IsGhost() { + if container.Config.ExposedPorts != nil { + portSpecs = container.Config.ExposedPorts + } + if container.hostConfig.PortBindings != nil { + bindings = container.hostConfig.PortBindings + } + } else { + if container.NetworkSettings.Ports != nil { + for port, binding := range container.NetworkSettings.Ports { + portSpecs[port] = struct{}{} + bindings[port] = binding + } + } + } + + container.NetworkSettings.PortMapping = nil + + for port := range portSpecs { + binding := bindings[port] + if container.hostConfig.PublishAllPorts && len(binding) == 0 { + binding = append(binding, nat.PortBinding{}) + } + + for i := 0; i < len(binding); i++ { + b := 
binding[i] + + portJob := eng.Job("allocate_port", container.ID) + portJob.Setenv("HostIP", b.HostIp) + portJob.Setenv("HostPort", b.HostPort) + portJob.Setenv("Proto", port.Proto()) + portJob.Setenv("ContainerPort", port.Port()) + + portEnv, err := portJob.Stdout.AddEnv() + if err != nil { + return err + } + if err := portJob.Run(); err != nil { + eng.Job("release_interface", container.ID).Run() + return err + } + b.HostIp = portEnv.Get("HostIP") + b.HostPort = portEnv.Get("HostPort") + + binding[i] = b + } + bindings[port] = binding + } + container.WriteHostConfig() + + container.NetworkSettings.Ports = bindings + + container.NetworkSettings.Bridge = env.Get("Bridge") + container.NetworkSettings.IPAddress = env.Get("IP") + container.NetworkSettings.IPPrefixLen = env.GetInt("IPPrefixLen") + container.NetworkSettings.Gateway = env.Get("Gateway") + + return nil +} + +func (container *Container) releaseNetwork() { + if container.Config.NetworkDisabled { + return + } + eng := container.runtime.eng + + eng.Job("release_interface", container.ID).Run() + container.NetworkSettings = &NetworkSettings{} +} + +func (container *Container) monitor(callback execdriver.StartCallback) error { + var ( + err error + exitCode int + ) + + pipes := execdriver.NewPipes(container.stdin, container.stdout, container.stderr, container.Config.OpenStdin) + exitCode, err = container.runtime.Run(container, pipes, callback) + if err != nil { + utils.Errorf("Error running container: %s", err) + } + + if container.runtime != nil && container.runtime.srv != nil && container.runtime.srv.IsRunning() { + container.State.SetStopped(exitCode) + + // FIXME: there is a race condition here which causes this to fail during the unit tests. + // If another goroutine was waiting for Wait() to return before removing the container's root + // from the filesystem... At this point it may already have done so. + // This is because State.setStopped() has already been called, and has caused Wait() + // to return. 
+ // FIXME: why are we serializing running state to disk in the first place? + //log.Printf("%s: Failed to dump configuration to the disk: %s", container.ID, err) + if err := container.ToDisk(); err != nil { + utils.Errorf("Error dumping container state to disk: %s\n", err) + } + } + + // Cleanup + container.cleanup() + + // Re-create a brand new stdin pipe once the container exited + if container.Config.OpenStdin { + container.stdin, container.stdinPipe = io.Pipe() + } + + if container.runtime != nil && container.runtime.srv != nil { + container.runtime.srv.LogEvent("die", container.ID, container.runtime.repositories.ImageName(container.Image)) + } + + close(container.waitLock) + + return err +} + +func (container *Container) cleanup() { + container.releaseNetwork() + + // Disable all active links + if container.activeLinks != nil { + for _, link := range container.activeLinks { + link.Disable() + } + } + if container.Config.OpenStdin { + if err := container.stdin.Close(); err != nil { + utils.Errorf("%s: Error close stdin: %s", container.ID, err) + } + } + if err := container.stdout.CloseWriters(); err != nil { + utils.Errorf("%s: Error close stdout: %s", container.ID, err) + } + if err := container.stderr.CloseWriters(); err != nil { + utils.Errorf("%s: Error close stderr: %s", container.ID, err) + } + if container.command != nil && container.command.Terminal != nil { + if err := container.command.Terminal.Close(); err != nil { + utils.Errorf("%s: Error closing terminal: %s", container.ID, err) + } + } + + if err := container.Unmount(); err != nil { + log.Printf("%v: Failed to umount filesystem: %v", container.ID, err) + } +} + +func (container *Container) KillSig(sig int) error { + container.Lock() + defer container.Unlock() + + if !container.State.IsRunning() { + return nil + } + return container.runtime.Kill(container, sig) +} + +func (container *Container) Kill() error { + if !container.State.IsRunning() { + return nil + } + + // 1. 
Send SIGKILL + if err := container.KillSig(9); err != nil { + return err + } + + // 2. Wait for the process to die, in last resort, try to kill the process directly + if err := container.WaitTimeout(10 * time.Second); err != nil { + log.Printf("Container %s failed to exit within 10 seconds of kill - trying direct SIGKILL", utils.TruncateID(container.ID)) + if err := syscall.Kill(container.State.Pid, 9); err != nil { + return err + } + } + + container.Wait() + return nil +} + +func (container *Container) Stop(seconds int) error { + if !container.State.IsRunning() { + return nil + } + + // 1. Send a SIGTERM + if err := container.KillSig(15); err != nil { + log.Print("Failed to send SIGTERM to the process, force killing") + if err := container.KillSig(9); err != nil { + return err + } + } + + // 2. Wait for the process to exit on its own + if err := container.WaitTimeout(time.Duration(seconds) * time.Second); err != nil { + log.Printf("Container %v failed to exit within %d seconds of SIGTERM - using the force", container.ID, seconds) + // 3. If it doesn't, then send SIGKILL + if err := container.Kill(); err != nil { + return err + } + } + return nil +} + +func (container *Container) Restart(seconds int) error { + // Avoid unnecessarily unmounting and then directly mounting + // the container when the container stops and then starts + // again + if err := container.Mount(); err == nil { + defer container.Unmount() + } + + if err := container.Stop(seconds); err != nil { + return err + } + return container.Start() +} + +// Wait blocks until the container stops running, then returns its exit code. 
+func (container *Container) Wait() int { + <-container.waitLock + return container.State.GetExitCode() +} + +func (container *Container) Resize(h, w int) error { + return container.command.Terminal.Resize(h, w) +} + +func (container *Container) ExportRw() (archive.Archive, error) { + if err := container.Mount(); err != nil { + return nil, err + } + if container.runtime == nil { + return nil, fmt.Errorf("Can't load storage driver for unregistered container %s", container.ID) + } + archive, err := container.runtime.Diff(container) + if err != nil { + container.Unmount() + return nil, err + } + return utils.NewReadCloserWrapper(archive, func() error { + err := archive.Close() + container.Unmount() + return err + }), + nil +} + +func (container *Container) Export() (archive.Archive, error) { + if err := container.Mount(); err != nil { + return nil, err + } + + archive, err := archive.Tar(container.basefs, archive.Uncompressed) + if err != nil { + container.Unmount() + return nil, err + } + return utils.NewReadCloserWrapper(archive, func() error { + err := archive.Close() + container.Unmount() + return err + }), + nil +} + +func (container *Container) WaitTimeout(timeout time.Duration) error { + done := make(chan bool) + go func() { + container.Wait() + done <- true + }() + + select { + case <-time.After(timeout): + return fmt.Errorf("Timed Out") + case <-done: + return nil + } +} + +func (container *Container) Mount() error { + return container.runtime.Mount(container) +} + +func (container *Container) Changes() ([]archive.Change, error) { + return container.runtime.Changes(container) +} + +func (container *Container) GetImage() (*image.Image, error) { + if container.runtime == nil { + return nil, fmt.Errorf("Can't get image of unregistered container") + } + return container.runtime.graph.Get(container.Image) +} + +func (container *Container) Unmount() error { + return container.runtime.Unmount(container) +} + +func (container *Container) logPath(name string) string { 
+ return path.Join(container.root, fmt.Sprintf("%s-%s.log", container.ID, name)) +} + +func (container *Container) ReadLog(name string) (io.Reader, error) { + return os.Open(container.logPath(name)) +} + +func (container *Container) hostConfigPath() string { + return path.Join(container.root, "hostconfig.json") +} + +func (container *Container) jsonPath() string { + return path.Join(container.root, "config.json") +} + +func (container *Container) EnvConfigPath() (string, error) { + p := path.Join(container.root, "config.env") + if _, err := os.Stat(p); err != nil { + if os.IsNotExist(err) { + f, err := os.Create(p) + if err != nil { + return "", err + } + f.Close() + } else { + return "", err + } + } + return p, nil +} + +// This method must be exported to be used from the lxc template +// This directory is only usable when the container is running +func (container *Container) RootfsPath() string { + return container.basefs +} + +func validateID(id string) error { + if id == "" { + return fmt.Errorf("Invalid empty id") + } + return nil +} + +// GetSize, return real size, virtual size +func (container *Container) GetSize() (int64, int64) { + var ( + sizeRw, sizeRootfs int64 + err error + driver = container.runtime.driver + ) + + if err := container.Mount(); err != nil { + utils.Errorf("Warning: failed to compute size of container rootfs %s: %s", container.ID, err) + return sizeRw, sizeRootfs + } + defer container.Unmount() + + if differ, ok := container.runtime.driver.(graphdriver.Differ); ok { + sizeRw, err = differ.DiffSize(container.ID) + if err != nil { + utils.Errorf("Warning: driver %s couldn't return diff size of container %s: %s", driver, container.ID, err) + // FIXME: GetSize should return an error. Not changing it now in case + // there is a side-effect. 
+ sizeRw = -1 + } + } else { + changes, _ := container.Changes() + if changes != nil { + sizeRw = archive.ChangesSize(container.basefs, changes) + } else { + sizeRw = -1 + } + } + + if _, err = os.Stat(container.basefs); err != nil { + if sizeRootfs, err = utils.TreeSize(container.basefs); err != nil { + sizeRootfs = -1 + } + } + return sizeRw, sizeRootfs +} + +func (container *Container) Copy(resource string) (io.ReadCloser, error) { + if err := container.Mount(); err != nil { + return nil, err + } + var filter []string + basePath := path.Join(container.basefs, resource) + stat, err := os.Stat(basePath) + if err != nil { + container.Unmount() + return nil, err + } + if !stat.IsDir() { + d, f := path.Split(basePath) + basePath = d + filter = []string{f} + } else { + filter = []string{path.Base(basePath)} + basePath = path.Dir(basePath) + } + + archive, err := archive.TarFilter(basePath, &archive.TarOptions{ + Compression: archive.Uncompressed, + Includes: filter, + }) + if err != nil { + return nil, err + } + return utils.NewReadCloserWrapper(archive, func() error { + err := archive.Close() + container.Unmount() + return err + }), + nil +} + +// Returns true if the container exposes a certain port +func (container *Container) Exposes(p nat.Port) bool { + _, exists := container.Config.ExposedPorts[p] + return exists +} + +func (container *Container) GetPtyMaster() (*os.File, error) { + ttyConsole, ok := container.command.Terminal.(execdriver.TtyTerminal) + if !ok { + return nil, ErrNoTTY + } + return ttyConsole.Master(), nil +} + +func (container *Container) HostConfig() *runconfig.HostConfig { + return container.hostConfig +} + +func (container *Container) SetHostConfig(hostConfig *runconfig.HostConfig) { + container.hostConfig = hostConfig +} + +func (container *Container) DisableLink(name string) { + if container.activeLinks != nil { + if link, exists := container.activeLinks[name]; exists { + link.Disable() + } else { + utils.Debugf("Could not find active link 
for %s", name) + } + } +} + +func (container *Container) setupContainerDns() error { + var ( + config = container.hostConfig + runtime = container.runtime + ) + resolvConf, err := utils.GetResolvConf() + if err != nil { + return err + } + // If custom dns exists, then create a resolv.conf for the container + if len(config.Dns) > 0 || len(runtime.config.Dns) > 0 || len(config.DnsSearch) > 0 || len(runtime.config.DnsSearch) > 0 { + var ( + dns = utils.GetNameservers(resolvConf) + dnsSearch = utils.GetSearchDomains(resolvConf) + ) + if len(config.Dns) > 0 { + dns = config.Dns + } else if len(runtime.config.Dns) > 0 { + dns = runtime.config.Dns + } + if len(config.DnsSearch) > 0 { + dnsSearch = config.DnsSearch + } else if len(runtime.config.DnsSearch) > 0 { + dnsSearch = runtime.config.DnsSearch + } + container.ResolvConfPath = path.Join(container.root, "resolv.conf") + f, err := os.Create(container.ResolvConfPath) + if err != nil { + return err + } + defer f.Close() + for _, dns := range dns { + if _, err := f.Write([]byte("nameserver " + dns + "\n")); err != nil { + return err + } + } + if len(dnsSearch) > 0 { + if _, err := f.Write([]byte("search " + strings.Join(dnsSearch, " ") + "\n")); err != nil { + return err + } + } + } else { + container.ResolvConfPath = "/etc/resolv.conf" + } + return nil +} diff --git a/runtime/container_unit_test.go b/runtime/container_unit_test.go new file mode 100644 index 0000000000..fba036ca50 --- /dev/null +++ b/runtime/container_unit_test.go @@ -0,0 +1,145 @@ +package runtime + +import ( + "github.com/dotcloud/docker/nat" + "testing" +) + +func TestParseNetworkOptsPrivateOnly(t *testing.T) { + ports, bindings, err := nat.ParsePortSpecs([]string{"192.168.1.100::80"}) + if err != nil { + t.Fatal(err) + } + if len(ports) != 1 { + t.Logf("Expected 1 got %d", len(ports)) + t.FailNow() + } + if len(bindings) != 1 { + t.Logf("Expected 1 got %d", len(bindings)) + t.FailNow() + } + for k := range ports { + if k.Proto() != "tcp" { + 
t.Logf("Expected tcp got %s", k.Proto()) + t.Fail() + } + if k.Port() != "80" { + t.Logf("Expected 80 got %s", k.Port()) + t.Fail() + } + b, exists := bindings[k] + if !exists { + t.Log("Binding does not exist") + t.FailNow() + } + if len(b) != 1 { + t.Logf("Expected 1 got %d", len(b)) + t.FailNow() + } + s := b[0] + if s.HostPort != "" { + t.Logf("Expected \"\" got %s", s.HostPort) + t.Fail() + } + if s.HostIp != "192.168.1.100" { + t.Fail() + } + } +} + +func TestParseNetworkOptsPublic(t *testing.T) { + ports, bindings, err := nat.ParsePortSpecs([]string{"192.168.1.100:8080:80"}) + if err != nil { + t.Fatal(err) + } + if len(ports) != 1 { + t.Logf("Expected 1 got %d", len(ports)) + t.FailNow() + } + if len(bindings) != 1 { + t.Logf("Expected 1 got %d", len(bindings)) + t.FailNow() + } + for k := range ports { + if k.Proto() != "tcp" { + t.Logf("Expected tcp got %s", k.Proto()) + t.Fail() + } + if k.Port() != "80" { + t.Logf("Expected 80 got %s", k.Port()) + t.Fail() + } + b, exists := bindings[k] + if !exists { + t.Log("Binding does not exist") + t.FailNow() + } + if len(b) != 1 { + t.Logf("Expected 1 got %d", len(b)) + t.FailNow() + } + s := b[0] + if s.HostPort != "8080" { + t.Logf("Expected 8080 got %s", s.HostPort) + t.Fail() + } + if s.HostIp != "192.168.1.100" { + t.Fail() + } + } +} + +func TestParseNetworkOptsUdp(t *testing.T) { + ports, bindings, err := nat.ParsePortSpecs([]string{"192.168.1.100::6000/udp"}) + if err != nil { + t.Fatal(err) + } + if len(ports) != 1 { + t.Logf("Expected 1 got %d", len(ports)) + t.FailNow() + } + if len(bindings) != 1 { + t.Logf("Expected 1 got %d", len(bindings)) + t.FailNow() + } + for k := range ports { + if k.Proto() != "udp" { + t.Logf("Expected udp got %s", k.Proto()) + t.Fail() + } + if k.Port() != "6000" { + t.Logf("Expected 6000 got %s", k.Port()) + t.Fail() + } + b, exists := bindings[k] + if !exists { + t.Log("Binding does not exist") + t.FailNow() + } + if len(b) != 1 { + t.Logf("Expected 1 got %d", len(b)) + 
t.FailNow() + } + s := b[0] + if s.HostPort != "" { + t.Logf("Expected \"\" got %s", s.HostPort) + t.Fail() + } + if s.HostIp != "192.168.1.100" { + t.Fail() + } + } +} + +func TestGetFullName(t *testing.T) { + name, err := GetFullContainerName("testing") + if err != nil { + t.Fatal(err) + } + if name != "/testing" { + t.Fatalf("Expected /testing got %s", name) + } + if _, err := GetFullContainerName(""); err == nil { + t.Fatal("Error should not be nil") + } +} diff --git a/runtime/execdriver/MAINTAINERS b/runtime/execdriver/MAINTAINERS new file mode 100644 index 0000000000..1cb551364d --- /dev/null +++ b/runtime/execdriver/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby <michael@crosbymichael.com> (@crosbymichael) +Guillaume J. Charmes <guillaume@docker.com> (@creack) diff --git a/runtime/execdriver/driver.go b/runtime/execdriver/driver.go new file mode 100644 index 0000000000..27a575cb3a --- /dev/null +++ b/runtime/execdriver/driver.go @@ -0,0 +1,144 @@ +package execdriver + +import ( + "errors" + "io" + "os" + "os/exec" +) + +// Context is a generic key value pair that allows +// arbatrary data to be sent +type Context map[string]string + +var ( + ErrNotRunning = errors.New("Process could not be started") + ErrWaitTimeoutReached = errors.New("Wait timeout reached") + ErrDriverAlreadyRegistered = errors.New("A driver already registered this docker init function") + ErrDriverNotFound = errors.New("The requested docker init has not been found") +) + +var dockerInitFcts map[string]InitFunc + +type ( + StartCallback func(*Command) + InitFunc func(i *InitArgs) error +) + +func RegisterInitFunc(name string, fct InitFunc) error { + if dockerInitFcts == nil { + dockerInitFcts = make(map[string]InitFunc) + } + if _, ok := dockerInitFcts[name]; ok { + return ErrDriverAlreadyRegistered + } + dockerInitFcts[name] = fct + return nil +} + +func GetInitFunc(name string) (InitFunc, error) { + fct, ok := dockerInitFcts[name] + if !ok { + return nil, ErrDriverNotFound + } + return 
// Terminal is an interface for drivers to implement
// if they want to support Close and Resize calls from
// the core
type Terminal interface {
	// Close comes from the embedded io.Closer.
	io.Closer
	// Resize changes the terminal dimensions to the given height and width.
	Resize(height, width int) error
}
// Pid returns the value stored in ContainerPid, the pid for the process
// inside a container. The field is returned as is; no nil or liveness check
// is performed here.
func (c *Command) Pid() int {
	return c.ContainerPid
}
0000000000..9e277c86df --- /dev/null +++ b/runtime/execdriver/execdrivers/execdrivers.go @@ -0,0 +1,23 @@ +package execdrivers + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/sysinfo" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/runtime/execdriver/lxc" + "github.com/dotcloud/docker/runtime/execdriver/native" + "path" +) + +func NewDriver(name, root, initPath string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, error) { + switch name { + case "lxc": + // we want to five the lxc driver the full docker root because it needs + // to access and write config and template files in /var/lib/docker/containers/* + // to be backwards compatible + return lxc.NewDriver(root, sysInfo.AppArmor) + case "native": + return native.NewDriver(path.Join(root, "execdriver", "native"), initPath) + } + return nil, fmt.Errorf("unknown exec driver %s", name) +} diff --git a/runtime/execdriver/lxc/driver.go b/runtime/execdriver/lxc/driver.go new file mode 100644 index 0000000000..ef16dcc380 --- /dev/null +++ b/runtime/execdriver/lxc/driver.go @@ -0,0 +1,418 @@ +package lxc + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/utils" + "io/ioutil" + "log" + "os" + "os/exec" + "path" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" +) + +const DriverName = "lxc" + +func init() { + execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error { + if err := setupEnv(args); err != nil { + return err + } + + if err := setupHostname(args); err != nil { + return err + } + + if err := setupNetworking(args); err != nil { + return err + } + + if err := setupCapabilities(args); err != nil { + return err + } + + if err := setupWorkingDirectory(args); err != nil { + return err + } + + if err := changeUser(args); err != nil { + return err + } + + path, err := exec.LookPath(args.Args[0]) + if err != nil { + 
log.Printf("Unable to locate %v", args.Args[0]) + os.Exit(127) + } + if err := syscall.Exec(path, args.Args, os.Environ()); err != nil { + return fmt.Errorf("dockerinit unable to execute %s - %s", path, err) + } + panic("Unreachable") + }) +} + +type driver struct { + root string // root path for the driver to use + apparmor bool + sharedRoot bool +} + +func NewDriver(root string, apparmor bool) (*driver, error) { + // setup unconfined symlink + if err := linkLxcStart(root); err != nil { + return nil, err + } + return &driver{ + apparmor: apparmor, + root: root, + sharedRoot: rootIsShared(), + }, nil +} + +func (d *driver) Name() string { + version := d.version() + return fmt.Sprintf("%s-%s", DriverName, version) +} + +func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { + if err := execdriver.SetTerminal(c, pipes); err != nil { + return -1, err + } + configPath, err := d.generateLXCConfig(c) + if err != nil { + return -1, err + } + params := []string{ + "lxc-start", + "-n", c.ID, + "-f", configPath, + "--", + c.InitPath, + "-driver", + DriverName, + } + + if c.Network.Interface != nil { + params = append(params, + "-g", c.Network.Interface.Gateway, + "-i", fmt.Sprintf("%s/%d", c.Network.Interface.IPAddress, c.Network.Interface.IPPrefixLen), + ) + } + params = append(params, + "-mtu", strconv.Itoa(c.Network.Mtu), + ) + + if c.User != "" { + params = append(params, "-u", c.User) + } + + if c.Privileged { + if d.apparmor { + params[0] = path.Join(d.root, "lxc-start-unconfined") + + } + params = append(params, "-privileged") + } + + if c.WorkingDir != "" { + params = append(params, "-w", c.WorkingDir) + } + + params = append(params, "--", c.Entrypoint) + params = append(params, c.Arguments...) + + if d.sharedRoot { + // lxc-start really needs / to be non-shared, or all kinds of stuff break + // when lxc-start unmount things and those unmounts propagate to the main + // mount namespace. 
+ // What we really want is to clone into a new namespace and then + // mount / MS_REC|MS_SLAVE, but since we can't really clone or fork + // without exec in go we have to do this horrible shell hack... + shellString := + "mount --make-rslave /; exec " + + utils.ShellQuoteArguments(params) + + params = []string{ + "unshare", "-m", "--", "/bin/sh", "-c", shellString, + } + } + + var ( + name = params[0] + arg = params[1:] + ) + aname, err := exec.LookPath(name) + if err != nil { + aname = name + } + c.Path = aname + c.Args = append([]string{name}, arg...) + + if err := c.Start(); err != nil { + return -1, err + } + + var ( + waitErr error + waitLock = make(chan struct{}) + ) + go func() { + if err := c.Wait(); err != nil { + if _, ok := err.(*exec.ExitError); !ok { // Do not propagate the error if it's simply a status code != 0 + waitErr = err + } + } + close(waitLock) + }() + + // Poll lxc for RUNNING status + pid, err := d.waitForStart(c, waitLock) + if err != nil { + if c.Process != nil { + c.Process.Kill() + } + return -1, err + } + c.ContainerPid = pid + + if startCallback != nil { + startCallback(c) + } + + <-waitLock + + return getExitCode(c), waitErr +} + +/// Return the exit code of the process +// if the process has not exited -1 will be returned +func getExitCode(c *execdriver.Command) int { + if c.ProcessState == nil { + return -1 + } + return c.ProcessState.Sys().(syscall.WaitStatus).ExitStatus() +} + +func (d *driver) Kill(c *execdriver.Command, sig int) error { + return KillLxc(c.ID, sig) +} + +func (d *driver) Terminate(c *execdriver.Command) error { + return KillLxc(c.ID, 9) +} + +func (d *driver) version() string { + var ( + version string + output []byte + err error + ) + if _, errPath := exec.LookPath("lxc-version"); errPath == nil { + output, err = exec.Command("lxc-version").CombinedOutput() + } else { + output, err = exec.Command("lxc-start", "--version").CombinedOutput() + } + if err == nil { + version = strings.TrimSpace(string(output)) + if 
// KillLxc passes signal number sig to the lxc container named id.
//
// It runs "lxc-kill -n id sig" when lxc-kill is found in PATH and otherwise
// runs "lxc-stop -k -n id sig". If the command fails, the returned error
// contains both the failure and the command's combined output.
func KillLxc(id string, sig int) error {
	signal := strconv.Itoa(sig)

	// Default to lxc-stop; switch to lxc-kill when it is available.
	tool, toolArgs := "lxc-stop", []string{"-k", "-n", id, signal}
	if _, lookErr := exec.LookPath("lxc-kill"); lookErr == nil {
		tool, toolArgs = "lxc-kill", []string{"-n", id, signal}
	}

	out, err := exec.Command(tool, toolArgs...).CombinedOutput()
	if err != nil {
		return fmt.Errorf("Err: %s Output: %s", err, out)
	}
	return nil
}
// rootIsShared reports whether the entry for "/" in /proc/self/mountinfo has
// a seventh field beginning with "shared".
//
// It returns true when the file cannot be read or contains no usable entry
// for "/", on the assumption that treating the root as shared is the safe
// default.
//
// TODO: This can be moved to the mountinfo reader in the mount pkg
func rootIsShared() bool {
	data, err := ioutil.ReadFile("/proc/self/mountinfo")
	if err != nil {
		// No idea, probably safe to assume so
		return true
	}

	for _, line := range strings.Split(string(data), "\n") {
		cols := strings.Split(line, " ")
		// cols[6] is read below, so a line needs at least seven fields;
		// shorter lines are skipped instead of indexing out of range.
		if len(cols) >= 7 && cols[4] == "/" {
			return strings.HasPrefix(cols[6], "shared")
		}
	}

	// No idea, probably safe to assume so
	return true
}
var (
	// ErrCannotParse is returned by parseLxcInfo for empty input.
	ErrCannotParse = errors.New("cannot parse raw input")
)

// lxcInfo holds the fields parseLxcInfo extracts from lxc-info output.
type lxcInfo struct {
	Running bool // the "state" value was exactly RUNNING
	Pid     int  // the "pid" value
}

// parseLxcInfo extracts the state and pid from raw, the textual output of
// lxc-info.
//
// Each line is split on ":". Lines without a ":" and lines with an unknown
// key are ignored; keys are compared case-insensitively after trimming
// whitespace. It returns ErrCannotParse for empty input, the strconv error
// when the pid value is not an integer, and the scanner error when the input
// cannot be tokenized (for example a line longer than the scanner's buffer).
func parseLxcInfo(raw string) (*lxcInfo, error) {
	if raw == "" {
		return nil, ErrCannotParse
	}
	var (
		err  error
		s    = bufio.NewScanner(strings.NewReader(raw))
		info = &lxcInfo{}
	)
	for s.Scan() {
		parts := strings.Split(s.Text(), ":")
		if len(parts) < 2 {
			continue
		}
		switch strings.ToLower(strings.TrimSpace(parts[0])) {
		case "state":
			info.Running = strings.TrimSpace(parts[1]) == "RUNNING"
		case "pid":
			info.Pid, err = strconv.Atoi(strings.TrimSpace(parts[1]))
			if err != nil {
				return nil, err
			}
		}
	}
	// Scan returns false on failure as well as at end of input, so the
	// scanner error has to be checked once the loop is done.
	if err := s.Err(); err != nil {
		return nil, err
	}
	return info, nil
}
t.Fatal(err) + } + if !info.Running { + t.Fatal("info should return a running state") + } + if info.Pid != 50 { + t.Fatalf("info should have pid 50 got %d", info.Pid) + } +} + +func TestEmptyInfo(t *testing.T) { + _, err := parseLxcInfo("") + if err == nil { + t.Fatal("error should not be nil") + } +} + +func TestBadInfo(t *testing.T) { + _, err := parseLxcInfo("state") + if err != nil { + t.Fatal(err) + } +} diff --git a/runtime/execdriver/lxc/init.go b/runtime/execdriver/lxc/init.go new file mode 100644 index 0000000000..c1933a5e43 --- /dev/null +++ b/runtime/execdriver/lxc/init.go @@ -0,0 +1,175 @@ +package lxc + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/netlink" + "github.com/dotcloud/docker/pkg/user" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/syndtr/gocapability/capability" + "io/ioutil" + "net" + "os" + "strings" + "syscall" +) + +// Clear environment pollution introduced by lxc-start +func setupEnv(args *execdriver.InitArgs) error { + // Get env + var env []string + content, err := ioutil.ReadFile(".dockerenv") + if err != nil { + return fmt.Errorf("Unable to load environment variables: %v", err) + } + if err := json.Unmarshal(content, &env); err != nil { + return fmt.Errorf("Unable to unmarshal environment variables: %v", err) + } + // Propagate the plugin-specific container env variable + env = append(env, "container="+os.Getenv("container")) + + args.Env = env + + os.Clearenv() + for _, kv := range args.Env { + parts := strings.SplitN(kv, "=", 2) + if len(parts) == 1 { + parts = append(parts, "") + } + os.Setenv(parts[0], parts[1]) + } + + return nil +} + +func setupHostname(args *execdriver.InitArgs) error { + hostname := getEnv(args, "HOSTNAME") + if hostname == "" { + return nil + } + return setHostname(hostname) +} + +// Setup networking +func setupNetworking(args *execdriver.InitArgs) error { + if args.Ip != "" { + // eth0 + iface, err := net.InterfaceByName("eth0") + if err != nil { + return 
fmt.Errorf("Unable to set up networking: %v", err) + } + ip, ipNet, err := net.ParseCIDR(args.Ip) + if err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + if err := netlink.NetworkLinkAddIp(iface, ip, ipNet); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + if err := netlink.NetworkSetMTU(iface, args.Mtu); err != nil { + return fmt.Errorf("Unable to set MTU: %v", err) + } + if err := netlink.NetworkLinkUp(iface); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + + // loopback + iface, err = net.InterfaceByName("lo") + if err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + if err := netlink.NetworkLinkUp(iface); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + } + if args.Gateway != "" { + gw := net.ParseIP(args.Gateway) + if gw == nil { + return fmt.Errorf("Unable to set up networking, %s is not a valid gateway IP", args.Gateway) + } + + if err := netlink.AddDefaultGw(gw); err != nil { + return fmt.Errorf("Unable to set up networking: %v", err) + } + } + + return nil +} + +// Setup working directory +func setupWorkingDirectory(args *execdriver.InitArgs) error { + if args.WorkDir == "" { + return nil + } + if err := syscall.Chdir(args.WorkDir); err != nil { + return fmt.Errorf("Unable to change dir to %v: %v", args.WorkDir, err) + } + return nil +} + +// Takes care of dropping privileges to the desired user +func changeUser(args *execdriver.InitArgs) error { + uid, gid, suppGids, err := user.GetUserGroupSupplementary( + args.User, + syscall.Getuid(), syscall.Getgid(), + ) + if err != nil { + return err + } + + if err := syscall.Setgroups(suppGids); err != nil { + return fmt.Errorf("Setgroups failed: %v", err) + } + if err := syscall.Setgid(gid); err != nil { + return fmt.Errorf("Setgid failed: %v", err) + } + if err := syscall.Setuid(uid); err != nil { + return fmt.Errorf("Setuid failed: %v", err) + } + + return nil +} + 
+func setupCapabilities(args *execdriver.InitArgs) error { + if args.Privileged { + return nil + } + + drop := []capability.Cap{ + capability.CAP_SETPCAP, + capability.CAP_SYS_MODULE, + capability.CAP_SYS_RAWIO, + capability.CAP_SYS_PACCT, + capability.CAP_SYS_ADMIN, + capability.CAP_SYS_NICE, + capability.CAP_SYS_RESOURCE, + capability.CAP_SYS_TIME, + capability.CAP_SYS_TTY_CONFIG, + capability.CAP_AUDIT_WRITE, + capability.CAP_AUDIT_CONTROL, + capability.CAP_MAC_OVERRIDE, + capability.CAP_MAC_ADMIN, + capability.CAP_NET_ADMIN, + } + + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + + c.Unset(capability.CAPS|capability.BOUNDS, drop...) + + if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil { + return err + } + return nil +} + +func getEnv(args *execdriver.InitArgs, key string) string { + for _, kv := range args.Env { + parts := strings.SplitN(kv, "=", 2) + if parts[0] == key && len(parts) == 2 { + return parts[1] + } + } + return "" +} diff --git a/runtime/execdriver/lxc/lxc_init_linux.go b/runtime/execdriver/lxc/lxc_init_linux.go new file mode 100644 index 0000000000..7288f5877b --- /dev/null +++ b/runtime/execdriver/lxc/lxc_init_linux.go @@ -0,0 +1,11 @@ +// +build amd64 + +package lxc + +import ( + "syscall" +) + +func setHostname(hostname string) error { + return syscall.Sethostname([]byte(hostname)) +} diff --git a/runtime/execdriver/lxc/lxc_init_unsupported.go b/runtime/execdriver/lxc/lxc_init_unsupported.go new file mode 100644 index 0000000000..d68cb91a1e --- /dev/null +++ b/runtime/execdriver/lxc/lxc_init_unsupported.go @@ -0,0 +1,7 @@ +// +build !linux !amd64 + +package lxc + +func setHostname(hostname string) error { + panic("Not supported on darwin") +} diff --git a/runtime/execdriver/lxc/lxc_template.go b/runtime/execdriver/lxc/lxc_template.go new file mode 100644 index 0000000000..c49753c6aa --- /dev/null +++ b/runtime/execdriver/lxc/lxc_template.go @@ -0,0 +1,176 @@ +package lxc + +import ( + 
"github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/runtime/execdriver" + "strings" + "text/template" +) + +const LxcTemplate = ` +{{if .Network.Interface}} +# network configuration +lxc.network.type = veth +lxc.network.link = {{.Network.Interface.Bridge}} +lxc.network.name = eth0 +{{else}} +# network is disabled (-n=false) +lxc.network.type = empty +lxc.network.flags = up +{{end}} +lxc.network.mtu = {{.Network.Mtu}} + +# root filesystem +{{$ROOTFS := .Rootfs}} +lxc.rootfs = {{$ROOTFS}} + +# use a dedicated pts for the container (and limit the number of pseudo terminal +# available) +lxc.pts = 1024 + +# disable the main console +lxc.console = none +{{if .ProcessLabel}} +lxc.se_context = {{ .ProcessLabel}} +{{end}} +{{$MOUNTLABEL := .MountLabel}} + +# no controlling tty at all +lxc.tty = 1 + +{{if .Privileged}} +lxc.cgroup.devices.allow = a +{{else}} +# no implicit access to devices +lxc.cgroup.devices.deny = a + +# but allow mknod for any device +lxc.cgroup.devices.allow = c *:* m +lxc.cgroup.devices.allow = b *:* m + +# /dev/null and zero +lxc.cgroup.devices.allow = c 1:3 rwm +lxc.cgroup.devices.allow = c 1:5 rwm + +# consoles +lxc.cgroup.devices.allow = c 5:1 rwm +lxc.cgroup.devices.allow = c 5:0 rwm +lxc.cgroup.devices.allow = c 4:0 rwm +lxc.cgroup.devices.allow = c 4:1 rwm + +# /dev/urandom,/dev/random +lxc.cgroup.devices.allow = c 1:9 rwm +lxc.cgroup.devices.allow = c 1:8 rwm + +# /dev/pts/ - pts namespaces are "coming soon" +lxc.cgroup.devices.allow = c 136:* rwm +lxc.cgroup.devices.allow = c 5:2 rwm + +# tuntap +lxc.cgroup.devices.allow = c 10:200 rwm + +# fuse +#lxc.cgroup.devices.allow = c 10:229 rwm + +# rtc +#lxc.cgroup.devices.allow = c 254:0 rwm +{{end}} + +# standard mount point +# Use mnt.putold as per https://bugs.launchpad.net/ubuntu/+source/lxc/+bug/986385 +lxc.pivotdir = lxc_putold + +# NOTICE: These mounts must be applied within the namespace + +# WARNING: procfs is a known attack vector and should probably be disabled +# if 
// LxcTemplateCompiled is the parsed form of LxcTemplate. It is assigned by
// this package's init function, which panics if the template does not parse.
var LxcTemplateCompiled *template.Template
// getLabel looks up name among the "key=value" strings stored under the
// "label" key of c and returns the matching value with surrounding
// whitespace removed.
//
// Keys are compared after trimming whitespace and the first match wins.
// Entries that contain no "=" carry no value and are skipped. It returns ""
// when c has no "label" entries or none of them matches.
func getLabel(c map[string][]string, name string) string {
	for _, l := range c["label"] {
		parts := strings.SplitN(l, "=", 2)
		if len(parts) != 2 {
			// No "=": there is no parts[1] to return.
			continue
		}
		if strings.TrimSpace(parts[0]) == name {
			return strings.TrimSpace(parts[1])
		}
	}
	return ""
}
// grepFile fails the test immediately unless some line of the file at path
// contains pattern as a plain substring (no regular expressions).
//
// The file is read line by line. A final line without a trailing newline is
// still searched. Any read error, including end of file, ends the search.
func grepFile(t *testing.T, path string, pattern string) {
	f, err := os.Open(path)
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()

	r := bufio.NewReader(f)
	for {
		line, err := r.ReadString('\n')
		// ReadString may return data together with an error, so look at
		// the line before deciding whether to stop.
		if strings.Contains(line, pattern) {
			return
		}
		if err != nil {
			break
		}
	}
	t.Fatalf("grepFile: pattern \"%s\" not found in \"%s\"", pattern, path)
}
b/runtime/execdriver/native/configuration/parse.go new file mode 100644 index 0000000000..6d6c643919 --- /dev/null +++ b/runtime/execdriver/native/configuration/parse.go @@ -0,0 +1,186 @@ +package configuration + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/utils" + "os/exec" + "path/filepath" + "strconv" + "strings" +) + +type Action func(*libcontainer.Container, interface{}, string) error + +var actions = map[string]Action{ + "cap.add": addCap, // add a cap + "cap.drop": dropCap, // drop a cap + + "ns.add": addNamespace, // add a namespace + "ns.drop": dropNamespace, // drop a namespace when cloning + + "net.join": joinNetNamespace, // join another containers net namespace + + "cgroups.cpu_shares": cpuShares, // set the cpu shares + "cgroups.memory": memory, // set the memory limit + "cgroups.memory_swap": memorySwap, // set the memory swap limit + "cgroups.cpuset.cpus": cpusetCpus, // set the cpus used + + "apparmor_profile": apparmorProfile, // set the apparmor profile to apply + + "fs.readonly": readonlyFs, // make the rootfs of the container read only +} + +func cpusetCpus(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { + return fmt.Errorf("cannot set cgroups when they are disabled") + } + container.Cgroups.CpusetCpus = value + + return nil +} + +func apparmorProfile(container *libcontainer.Container, context interface{}, value string) error { + container.Context["apparmor_profile"] = value + return nil +} + +func cpuShares(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { + return fmt.Errorf("cannot set cgroups when they are disabled") + } + v, err := strconv.ParseInt(value, 10, 0) + if err != nil { + return err + } + container.Cgroups.CpuShares = v + return nil +} + +func memory(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { 
+ return fmt.Errorf("cannot set cgroups when they are disabled") + } + + v, err := utils.RAMInBytes(value) + if err != nil { + return err + } + container.Cgroups.Memory = v + return nil +} + +func memorySwap(container *libcontainer.Container, context interface{}, value string) error { + if container.Cgroups == nil { + return fmt.Errorf("cannot set cgroups when they are disabled") + } + v, err := strconv.ParseInt(value, 0, 64) + if err != nil { + return err + } + container.Cgroups.MemorySwap = v + return nil +} + +func addCap(container *libcontainer.Container, context interface{}, value string) error { + c := container.CapabilitiesMask.Get(value) + if c == nil { + return fmt.Errorf("%s is not a valid capability", value) + } + c.Enabled = true + return nil +} + +func dropCap(container *libcontainer.Container, context interface{}, value string) error { + c := container.CapabilitiesMask.Get(value) + if c == nil { + return fmt.Errorf("%s is not a valid capability", value) + } + c.Enabled = false + return nil +} + +func addNamespace(container *libcontainer.Container, context interface{}, value string) error { + ns := container.Namespaces.Get(value) + if ns == nil { + return fmt.Errorf("%s is not a valid namespace", value[1:]) + } + ns.Enabled = true + return nil +} + +func dropNamespace(container *libcontainer.Container, context interface{}, value string) error { + ns := container.Namespaces.Get(value) + if ns == nil { + return fmt.Errorf("%s is not a valid namespace", value[1:]) + } + ns.Enabled = false + return nil +} + +func readonlyFs(container *libcontainer.Container, context interface{}, value string) error { + switch value { + case "1", "true": + container.ReadonlyFs = true + default: + container.ReadonlyFs = false + } + return nil +} + +func joinNetNamespace(container *libcontainer.Container, context interface{}, value string) error { + var ( + running = context.(map[string]*exec.Cmd) + cmd = running[value] + ) + + if cmd == nil || cmd.Process == nil { + return 
fmt.Errorf("%s is not a valid running container to join", value) + } + nspath := filepath.Join("/proc", fmt.Sprint(cmd.Process.Pid), "ns", "net") + container.Networks = append(container.Networks, &libcontainer.Network{ + Type: "netns", + Context: libcontainer.Context{ + "nspath": nspath, + }, + }) + return nil +} + +func vethMacAddress(container *libcontainer.Container, context interface{}, value string) error { + var veth *libcontainer.Network + for _, network := range container.Networks { + if network.Type == "veth" { + veth = network + break + } + } + if veth == nil { + return fmt.Errorf("not veth configured for container") + } + veth.Context["mac"] = value + return nil +} + +// configureCustomOptions takes string commands from the user and allows modification of the +// container's default configuration. +// +// TODO: this can be moved to a general utils or parser in pkg +func ParseConfiguration(container *libcontainer.Container, running map[string]*exec.Cmd, opts []string) error { + for _, opt := range opts { + kv := strings.SplitN(opt, "=", 2) + if len(kv) < 2 { + return fmt.Errorf("invalid format for %s", opt) + } + + action, exists := actions[kv[0]] + if !exists { + return fmt.Errorf("%s is not a valid option for the native driver", kv[0]) + } + + if err := action(container, running, kv[1]); err != nil { + return err + } + } + return nil +} diff --git a/runtime/execdriver/native/configuration/parse_test.go b/runtime/execdriver/native/configuration/parse_test.go new file mode 100644 index 0000000000..8001358766 --- /dev/null +++ b/runtime/execdriver/native/configuration/parse_test.go @@ -0,0 +1,166 @@ +package configuration + +import ( + "github.com/dotcloud/docker/runtime/execdriver/native/template" + "testing" +) + +func TestSetReadonlyRootFs(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "fs.readonly=true", + } + ) + + if container.ReadonlyFs { + t.Fatal("container should not have a readonly rootfs by default") + } + if err := 
ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if !container.ReadonlyFs { + t.Fatal("container should have a readonly rootfs") + } +} + +func TestConfigurationsDoNotConflict(t *testing.T) { + var ( + container1 = template.New() + container2 = template.New() + opts = []string{ + "cap.add=NET_ADMIN", + } + ) + + if err := ParseConfiguration(container1, nil, opts); err != nil { + t.Fatal(err) + } + + if !container1.CapabilitiesMask.Get("NET_ADMIN").Enabled { + t.Fatal("container one should have NET_ADMIN enabled") + } + if container2.CapabilitiesMask.Get("NET_ADMIN").Enabled { + t.Fatal("container two should not have NET_ADMIN enabled") + } +} + +func TestCpusetCpus(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cgroups.cpuset.cpus=1,2", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if expected := "1,2"; container.Cgroups.CpusetCpus != expected { + t.Fatalf("expected %s got %s for cpuset.cpus", expected, container.Cgroups.CpusetCpus) + } +} + +func TestAppArmorProfile(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "apparmor_profile=koye-the-protector", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + if expected := "koye-the-protector"; container.Context["apparmor_profile"] != expected { + t.Fatalf("expected profile %s got %s", expected, container.Context["apparmor_profile"]) + } +} + +func TestCpuShares(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cgroups.cpu_shares=1048", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if expected := int64(1048); container.Cgroups.CpuShares != expected { + t.Fatalf("expected cpu shares %d got %d", expected, container.Cgroups.CpuShares) + } +} + +func TestCgroupMemory(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cgroups.memory=500m", + } + ) + 
if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if expected := int64(500 * 1024 * 1024); container.Cgroups.Memory != expected { + t.Fatalf("expected memory %d got %d", expected, container.Cgroups.Memory) + } +} + +func TestAddCap(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cap.add=MKNOD", + "cap.add=SYS_ADMIN", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if !container.CapabilitiesMask.Get("MKNOD").Enabled { + t.Fatal("container should have MKNOD enabled") + } + if !container.CapabilitiesMask.Get("SYS_ADMIN").Enabled { + t.Fatal("container should have SYS_ADMIN enabled") + } +} + +func TestDropCap(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "cap.drop=MKNOD", + } + ) + // enabled all caps like in privileged mode + for _, c := range container.CapabilitiesMask { + c.Enabled = true + } + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if container.CapabilitiesMask.Get("MKNOD").Enabled { + t.Fatal("container should not have MKNOD enabled") + } +} + +func TestDropNamespace(t *testing.T) { + var ( + container = template.New() + opts = []string{ + "ns.drop=NEWNET", + } + ) + if err := ParseConfiguration(container, nil, opts); err != nil { + t.Fatal(err) + } + + if container.Namespaces.Get("NEWNET").Enabled { + t.Fatal("container should not have NEWNET enabled") + } +} diff --git a/runtime/execdriver/native/create.go b/runtime/execdriver/native/create.go new file mode 100644 index 0000000000..71fab3e064 --- /dev/null +++ b/runtime/execdriver/native/create.go @@ -0,0 +1,114 @@ +package native + +import ( + "fmt" + "os" + + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/runtime/execdriver/native/configuration" + 
"github.com/dotcloud/docker/runtime/execdriver/native/template" +) + +// createContainer populates and configures the container type with the +// data provided by the execdriver.Command +func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container, error) { + container := template.New() + + container.Hostname = getEnv("HOSTNAME", c.Env) + container.Tty = c.Tty + container.User = c.User + container.WorkingDir = c.WorkingDir + container.Env = c.Env + container.Cgroups.Name = c.ID + // check to see if we are running in ramdisk to disable pivot root + container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != "" + + if err := d.createNetwork(container, c); err != nil { + return nil, err + } + if c.Privileged { + if err := d.setPrivileged(container); err != nil { + return nil, err + } + } + if err := d.setupCgroups(container, c); err != nil { + return nil, err + } + if err := d.setupMounts(container, c); err != nil { + return nil, err + } + if err := d.setupLabels(container, c); err != nil { + return nil, err + } + if err := configuration.ParseConfiguration(container, d.activeContainers, c.Config["native"]); err != nil { + return nil, err + } + return container, nil +} + +func (d *driver) createNetwork(container *libcontainer.Container, c *execdriver.Command) error { + container.Networks = []*libcontainer.Network{ + { + Mtu: c.Network.Mtu, + Address: fmt.Sprintf("%s/%d", "127.0.0.1", 0), + Gateway: "localhost", + Type: "loopback", + Context: libcontainer.Context{}, + }, + } + + if c.Network.Interface != nil { + vethNetwork := libcontainer.Network{ + Mtu: c.Network.Mtu, + Address: fmt.Sprintf("%s/%d", c.Network.Interface.IPAddress, c.Network.Interface.IPPrefixLen), + Gateway: c.Network.Interface.Gateway, + Type: "veth", + Context: libcontainer.Context{ + "prefix": "veth", + "bridge": c.Network.Interface.Bridge, + }, + } + container.Networks = append(container.Networks, &vethNetwork) + } + return nil +} + +func (d *driver) setPrivileged(container 
*libcontainer.Container) error { + for _, c := range container.CapabilitiesMask { + c.Enabled = true + } + container.Cgroups.DeviceAccess = true + container.Context["apparmor_profile"] = "unconfined" + return nil +} + +func (d *driver) setupCgroups(container *libcontainer.Container, c *execdriver.Command) error { + if c.Resources != nil { + container.Cgroups.CpuShares = c.Resources.CpuShares + container.Cgroups.Memory = c.Resources.Memory + container.Cgroups.MemorySwap = c.Resources.MemorySwap + } + return nil +} + +func (d *driver) setupMounts(container *libcontainer.Container, c *execdriver.Command) error { + for _, m := range c.Mounts { + container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private}) + } + return nil +} + +func (d *driver) setupLabels(container *libcontainer.Container, c *execdriver.Command) error { + labels := c.Config["label"] + if len(labels) > 0 { + process, mount, err := label.GenLabels(labels[0]) + if err != nil { + return err + } + container.Context["mount_label"] = mount + container.Context["process_label"] = process + } + return nil +} diff --git a/runtime/execdriver/native/driver.go b/runtime/execdriver/native/driver.go new file mode 100644 index 0000000000..d18865e508 --- /dev/null +++ b/runtime/execdriver/native/driver.go @@ -0,0 +1,292 @@ +package native + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/apparmor" + "github.com/dotcloud/docker/pkg/libcontainer/nsinit" + "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/runtime/execdriver" + "io" + "io/ioutil" + "log" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" +) + +const ( + DriverName = "native" + Version = "0.1" + BackupApparmorProfilePath = "apparmor/docker.back" // relative to docker root +) + +func init() { + execdriver.RegisterInitFunc(DriverName, func(args 
*execdriver.InitArgs) error { + var ( + container *libcontainer.Container + ns = nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{args.Root}, createLogger("")) + ) + f, err := os.Open(filepath.Join(args.Root, "container.json")) + if err != nil { + return err + } + if err := json.NewDecoder(f).Decode(&container); err != nil { + f.Close() + return err + } + f.Close() + + cwd, err := os.Getwd() + if err != nil { + return err + } + syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(args.Pipe)) + if err != nil { + return err + } + if err := ns.Init(container, cwd, args.Console, syncPipe, args.Args); err != nil { + return err + } + return nil + }) +} + +type driver struct { + root string + initPath string + activeContainers map[string]*exec.Cmd +} + +func NewDriver(root, initPath string) (*driver, error) { + if err := os.MkdirAll(root, 0700); err != nil { + return nil, err + } + // native driver root is at docker_root/execdriver/native. Put apparmor at docker_root + if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil { + return nil, err + } + return &driver{ + root: root, + initPath: initPath, + activeContainers: make(map[string]*exec.Cmd), + }, nil +} + +func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { + // take the Command and populate the libcontainer.Container from it + container, err := d.createContainer(c) + if err != nil { + return -1, err + } + d.activeContainers[c.ID] = &c.Cmd + + var ( + term nsinit.Terminal + factory = &dockerCommandFactory{c: c, driver: d} + stateWriter = &dockerStateWriter{ + callback: startCallback, + c: c, + dsw: &nsinit.DefaultStateWriter{filepath.Join(d.root, c.ID)}, + } + ns = nsinit.NewNsInit(factory, stateWriter, createLogger(os.Getenv("DEBUG"))) + args = append([]string{c.Entrypoint}, c.Arguments...) 
+ ) + if err := d.createContainerRoot(c.ID); err != nil { + return -1, err + } + defer d.removeContainerRoot(c.ID) + + if c.Tty { + term = &dockerTtyTerm{ + pipes: pipes, + } + } else { + term = &dockerStdTerm{ + pipes: pipes, + } + } + c.Terminal = term + if err := d.writeContainerFile(container, c.ID); err != nil { + return -1, err + } + return ns.Exec(container, term, args) +} + +func (d *driver) Kill(p *execdriver.Command, sig int) error { + return syscall.Kill(p.Process.Pid, syscall.Signal(sig)) +} + +func (d *driver) Terminate(p *execdriver.Command) error { + // lets check the start time for the process + started, err := d.readStartTime(p) + if err != nil { + // if we don't have the data on disk then we can assume the process is gone + // because this is only removed after we know the process has stopped + if os.IsNotExist(err) { + return nil + } + return err + } + + currentStartTime, err := system.GetProcessStartTime(p.Process.Pid) + if err != nil { + return err + } + if started == currentStartTime { + err = syscall.Kill(p.Process.Pid, 9) + } + d.removeContainerRoot(p.ID) + return err + +} + +func (d *driver) readStartTime(p *execdriver.Command) (string, error) { + data, err := ioutil.ReadFile(filepath.Join(d.root, p.ID, "start")) + if err != nil { + return "", err + } + return string(data), nil +} + +func (d *driver) Info(id string) execdriver.Info { + return &info{ + ID: id, + driver: d, + } +} + +func (d *driver) Name() string { + return fmt.Sprintf("%s-%s", DriverName, Version) +} + +// TODO: this can be improved with our driver +// there has to be a better way to do this +func (d *driver) GetPidsForContainer(id string) ([]int, error) { + pids := []int{} + + subsystem := "devices" + cgroupRoot, err := cgroups.FindCgroupMountpoint(subsystem) + if err != nil { + return pids, err + } + cgroupDir, err := cgroups.GetThisCgroupDir(subsystem) + if err != nil { + return pids, err + } + + filename := filepath.Join(cgroupRoot, cgroupDir, id, "tasks") + if _, err 
:= os.Stat(filename); os.IsNotExist(err) { + filename = filepath.Join(cgroupRoot, cgroupDir, "docker", id, "tasks") + } + + output, err := ioutil.ReadFile(filename) + if err != nil { + return pids, err + } + for _, p := range strings.Split(string(output), "\n") { + if len(p) == 0 { + continue + } + pid, err := strconv.Atoi(p) + if err != nil { + return pids, fmt.Errorf("Invalid pid '%s': %s", p, err) + } + pids = append(pids, pid) + } + return pids, nil +} + +func (d *driver) writeContainerFile(container *libcontainer.Container, id string) error { + data, err := json.Marshal(container) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(d.root, id, "container.json"), data, 0655) +} + +func (d *driver) createContainerRoot(id string) error { + return os.MkdirAll(filepath.Join(d.root, id), 0655) +} + +func (d *driver) removeContainerRoot(id string) error { + return os.RemoveAll(filepath.Join(d.root, id)) +} + +func getEnv(key string, env []string) string { + for _, pair := range env { + parts := strings.Split(pair, "=") + if parts[0] == key { + return parts[1] + } + } + return "" +} + +type dockerCommandFactory struct { + c *execdriver.Command + driver *driver +} + +// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +func (d *dockerCommandFactory) Create(container *libcontainer.Container, console string, syncFile *os.File, args []string) *exec.Cmd { + // we need to join the rootfs because nsinit will setup the rootfs and chroot + initPath := filepath.Join(d.c.Rootfs, d.c.InitPath) + + d.c.Path = d.driver.initPath + d.c.Args = append([]string{ + initPath, + "-driver", DriverName, + "-console", console, + "-pipe", "3", + "-root", filepath.Join(d.driver.root, d.c.ID), + "--", + }, args...) 
+ + // set this to nil so that when we set the clone flags anything else is reset + d.c.SysProcAttr = nil + system.SetCloneFlags(&d.c.Cmd, uintptr(nsinit.GetNamespaceFlags(container.Namespaces))) + d.c.ExtraFiles = []*os.File{syncFile} + + d.c.Env = container.Env + d.c.Dir = d.c.Rootfs + + return &d.c.Cmd +} + +type dockerStateWriter struct { + dsw nsinit.StateWriter + c *execdriver.Command + callback execdriver.StartCallback +} + +func (d *dockerStateWriter) WritePid(pid int, started string) error { + d.c.ContainerPid = pid + err := d.dsw.WritePid(pid, started) + if d.callback != nil { + d.callback(d.c) + } + return err +} + +func (d *dockerStateWriter) DeletePid() error { + return d.dsw.DeletePid() +} + +func createLogger(debug string) *log.Logger { + var w io.Writer + // if we are in debug mode set the logger to stderr + if debug != "" { + w = os.Stderr + } else { + w = ioutil.Discard + } + return log.New(w, "[libcontainer] ", log.LstdFlags) +} diff --git a/runtime/execdriver/native/info.go b/runtime/execdriver/native/info.go new file mode 100644 index 0000000000..aef2f85c6b --- /dev/null +++ b/runtime/execdriver/native/info.go @@ -0,0 +1,21 @@ +package native + +import ( + "os" + "path/filepath" +) + +type info struct { + ID string + driver *driver +} + +// IsRunning is determined by looking for the +// pid file for a container. 
If the file exists then the +// container is currently running +func (i *info) IsRunning() bool { + if _, err := os.Stat(filepath.Join(i.driver.root, i.ID, "pid")); err == nil { + return true + } + return false +} diff --git a/runtime/execdriver/native/template/default_template.go b/runtime/execdriver/native/template/default_template.go new file mode 100644 index 0000000000..a1ecb04d76 --- /dev/null +++ b/runtime/execdriver/native/template/default_template.go @@ -0,0 +1,45 @@ +package template + +import ( + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" +) + +// New returns the docker default configuration for libcontainer +func New() *libcontainer.Container { + container := &libcontainer.Container{ + CapabilitiesMask: libcontainer.Capabilities{ + libcontainer.GetCapability("SETPCAP"), + libcontainer.GetCapability("SYS_MODULE"), + libcontainer.GetCapability("SYS_RAWIO"), + libcontainer.GetCapability("SYS_PACCT"), + libcontainer.GetCapability("SYS_ADMIN"), + libcontainer.GetCapability("SYS_NICE"), + libcontainer.GetCapability("SYS_RESOURCE"), + libcontainer.GetCapability("SYS_TIME"), + libcontainer.GetCapability("SYS_TTY_CONFIG"), + libcontainer.GetCapability("AUDIT_WRITE"), + libcontainer.GetCapability("AUDIT_CONTROL"), + libcontainer.GetCapability("MAC_OVERRIDE"), + libcontainer.GetCapability("MAC_ADMIN"), + libcontainer.GetCapability("NET_ADMIN"), + libcontainer.GetCapability("MKNOD"), + }, + Namespaces: libcontainer.Namespaces{ + libcontainer.GetNamespace("NEWNS"), + libcontainer.GetNamespace("NEWUTS"), + libcontainer.GetNamespace("NEWIPC"), + libcontainer.GetNamespace("NEWPID"), + libcontainer.GetNamespace("NEWNET"), + }, + Cgroups: &cgroups.Cgroup{ + Parent: "docker", + DeviceAccess: false, + }, + Context: libcontainer.Context{ + "apparmor_profile": "docker-default", + }, + } + container.CapabilitiesMask.Get("MKNOD").Enabled = true + return container +} diff --git a/runtime/execdriver/native/term.go 
b/runtime/execdriver/native/term.go new file mode 100644 index 0000000000..0d5298d388 --- /dev/null +++ b/runtime/execdriver/native/term.go @@ -0,0 +1,42 @@ +/* + These types are wrappers around the libcontainer Terminal interface so that + we can resuse the docker implementations where possible. +*/ +package native + +import ( + "github.com/dotcloud/docker/runtime/execdriver" + "io" + "os" + "os/exec" +) + +type dockerStdTerm struct { + execdriver.StdConsole + pipes *execdriver.Pipes +} + +func (d *dockerStdTerm) Attach(cmd *exec.Cmd) error { + return d.AttachPipes(cmd, d.pipes) +} + +func (d *dockerStdTerm) SetMaster(master *os.File) { + // do nothing +} + +type dockerTtyTerm struct { + execdriver.TtyConsole + pipes *execdriver.Pipes +} + +func (t *dockerTtyTerm) Attach(cmd *exec.Cmd) error { + go io.Copy(t.pipes.Stdout, t.MasterPty) + if t.pipes.Stdin != nil { + go io.Copy(t.MasterPty, t.pipes.Stdin) + } + return nil +} + +func (t *dockerTtyTerm) SetMaster(master *os.File) { + t.MasterPty = master +} diff --git a/runtime/execdriver/pipes.go b/runtime/execdriver/pipes.go new file mode 100644 index 0000000000..158219f0c5 --- /dev/null +++ b/runtime/execdriver/pipes.go @@ -0,0 +1,23 @@ +package execdriver + +import ( + "io" +) + +// Pipes is a wrapper around a containers output for +// stdin, stdout, stderr +type Pipes struct { + Stdin io.ReadCloser + Stdout, Stderr io.Writer +} + +func NewPipes(stdin io.ReadCloser, stdout, stderr io.Writer, useStdin bool) *Pipes { + p := &Pipes{ + Stdout: stdout, + Stderr: stderr, + } + if useStdin { + p.Stdin = stdin + } + return p +} diff --git a/runtime/execdriver/termconsole.go b/runtime/execdriver/termconsole.go new file mode 100644 index 0000000000..af6b88d3d1 --- /dev/null +++ b/runtime/execdriver/termconsole.go @@ -0,0 +1,126 @@ +package execdriver + +import ( + "github.com/dotcloud/docker/pkg/term" + "github.com/kr/pty" + "io" + "os" + "os/exec" +) + +func SetTerminal(command *Command, pipes *Pipes) error { + var ( + term 
Terminal + err error + ) + if command.Tty { + term, err = NewTtyConsole(command, pipes) + } else { + term, err = NewStdConsole(command, pipes) + } + if err != nil { + return err + } + command.Terminal = term + return nil +} + +type TtyConsole struct { + MasterPty *os.File + SlavePty *os.File +} + +func NewTtyConsole(command *Command, pipes *Pipes) (*TtyConsole, error) { + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, err + } + tty := &TtyConsole{ + MasterPty: ptyMaster, + SlavePty: ptySlave, + } + if err := tty.AttachPipes(&command.Cmd, pipes); err != nil { + tty.Close() + return nil, err + } + command.Console = tty.SlavePty.Name() + return tty, nil +} + +func (t *TtyConsole) Master() *os.File { + return t.MasterPty +} + +func (t *TtyConsole) Resize(h, w int) error { + return term.SetWinsize(t.MasterPty.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) +} + +func (t *TtyConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error { + command.Stdout = t.SlavePty + command.Stderr = t.SlavePty + + go func() { + if wb, ok := pipes.Stdout.(interface { + CloseWriters() error + }); ok { + defer wb.CloseWriters() + } + io.Copy(pipes.Stdout, t.MasterPty) + }() + + if pipes.Stdin != nil { + command.Stdin = t.SlavePty + command.SysProcAttr.Setctty = true + + go func() { + defer pipes.Stdin.Close() + io.Copy(t.MasterPty, pipes.Stdin) + }() + } + return nil +} + +func (t *TtyConsole) Close() error { + t.SlavePty.Close() + return t.MasterPty.Close() +} + +type StdConsole struct { +} + +func NewStdConsole(command *Command, pipes *Pipes) (*StdConsole, error) { + std := &StdConsole{} + + if err := std.AttachPipes(&command.Cmd, pipes); err != nil { + return nil, err + } + return std, nil +} + +func (s *StdConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error { + command.Stdout = pipes.Stdout + command.Stderr = pipes.Stderr + + if pipes.Stdin != nil { + stdin, err := command.StdinPipe() + if err != nil { + return err + } + + go func() { + defer 
stdin.Close() + io.Copy(stdin, pipes.Stdin) + }() + } + return nil +} + +func (s *StdConsole) Resize(h, w int) error { + // we do not need to reside a non tty + return nil +} + +func (s *StdConsole) Close() error { + // nothing to close here + return nil +} diff --git a/runtime/graphdriver/aufs/aufs.go b/runtime/graphdriver/aufs/aufs.go new file mode 100644 index 0000000000..401bbd8c86 --- /dev/null +++ b/runtime/graphdriver/aufs/aufs.go @@ -0,0 +1,401 @@ +/* + +aufs driver directory structure + +. +├── layers // Metadata of layers +│  ├── 1 +│  ├── 2 +│  └── 3 +├── diffs // Content of the layer +│  ├── 1 // Contains layers that need to be mounted for the id +│  ├── 2 +│  └── 3 +└── mnt // Mount points for the rw layers to be mounted + ├── 1 + ├── 2 + └── 3 + +*/ + +package aufs + +import ( + "bufio" + "fmt" + "github.com/dotcloud/docker/archive" + mountpk "github.com/dotcloud/docker/pkg/mount" + "github.com/dotcloud/docker/runtime/graphdriver" + "github.com/dotcloud/docker/utils" + "os" + "os/exec" + "path" + "strings" + "sync" +) + +var ( + ErrAufsNotSupported = fmt.Errorf("AUFS was not found in /proc/filesystems") +) + +func init() { + graphdriver.Register("aufs", Init) +} + +type Driver struct { + root string + sync.Mutex // Protects concurrent modification to active + active map[string]int +} + +// New returns a new AUFS driver. +// An error is returned if AUFS is not supported. 
+func Init(root string) (graphdriver.Driver, error) { + // Try to load the aufs kernel module + if err := supportsAufs(); err != nil { + return nil, err + } + paths := []string{ + "mnt", + "diff", + "layers", + } + + a := &Driver{ + root: root, + active: make(map[string]int), + } + + // Create the root aufs driver dir and return + // if it already exists + // If not populate the dir structure + if err := os.MkdirAll(root, 0755); err != nil { + if os.IsExist(err) { + return a, nil + } + return nil, err + } + + for _, p := range paths { + if err := os.MkdirAll(path.Join(root, p), 0755); err != nil { + return nil, err + } + } + return a, nil +} + +// Return a nil error if the kernel supports aufs +// We cannot modprobe because inside dind modprobe fails +// to run +func supportsAufs() error { + // We can try to modprobe aufs first before looking at + // proc/filesystems for when aufs is supported + exec.Command("modprobe", "aufs").Run() + + f, err := os.Open("/proc/filesystems") + if err != nil { + return err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if strings.Contains(s.Text(), "aufs") { + return nil + } + } + return ErrAufsNotSupported +} + +func (a Driver) rootPath() string { + return a.root +} + +func (Driver) String() string { + return "aufs" +} + +func (a Driver) Status() [][2]string { + ids, _ := loadIds(path.Join(a.rootPath(), "layers")) + return [][2]string{ + {"Root Dir", a.rootPath()}, + {"Dirs", fmt.Sprintf("%d", len(ids))}, + } +} + +// Exists returns true if the given id is registered with +// this driver +func (a Driver) Exists(id string) bool { + if _, err := os.Lstat(path.Join(a.rootPath(), "layers", id)); err != nil { + return false + } + return true +} + +// Three folders are created for each id +// mnt, layers, and diff +func (a *Driver) Create(id, parent string, mountLabel string) error { + if err := a.createDirsFor(id); err != nil { + return err + } + // Write the layers metadata + f, err := 
os.Create(path.Join(a.rootPath(), "layers", id)) + if err != nil { + return err + } + defer f.Close() + + if parent != "" { + ids, err := getParentIds(a.rootPath(), parent) + if err != nil { + return err + } + + if _, err := fmt.Fprintln(f, parent); err != nil { + return err + } + for _, i := range ids { + if _, err := fmt.Fprintln(f, i); err != nil { + return err + } + } + } + return nil +} + +func (a *Driver) createDirsFor(id string) error { + paths := []string{ + "mnt", + "diff", + } + + for _, p := range paths { + if err := os.MkdirAll(path.Join(a.rootPath(), p, id), 0755); err != nil { + return err + } + } + return nil +} + +// Unmount and remove the dir information +func (a *Driver) Remove(id string) error { + // Protect the a.active from concurrent access + a.Lock() + defer a.Unlock() + + if a.active[id] != 0 { + utils.Errorf("Warning: removing active id %s\n", id) + } + + // Make sure the dir is umounted first + if err := a.unmount(id); err != nil { + return err + } + tmpDirs := []string{ + "mnt", + "diff", + } + + // Atomically remove each directory in turn by first moving it out of the + // way (so that docker doesn't find it anymore) before doing removal of + // the whole tree. 
+ for _, p := range tmpDirs { + + realPath := path.Join(a.rootPath(), p, id) + tmpPath := path.Join(a.rootPath(), p, fmt.Sprintf("%s-removing", id)) + if err := os.Rename(realPath, tmpPath); err != nil && !os.IsNotExist(err) { + return err + } + defer os.RemoveAll(tmpPath) + } + + // Remove the layers file for the id + if err := os.Remove(path.Join(a.rootPath(), "layers", id)); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +// Return the rootfs path for the id +// This will mount the dir at it's given path +func (a *Driver) Get(id string) (string, error) { + ids, err := getParentIds(a.rootPath(), id) + if err != nil { + if !os.IsNotExist(err) { + return "", err + } + ids = []string{} + } + + // Protect the a.active from concurrent access + a.Lock() + defer a.Unlock() + + count := a.active[id] + + // If a dir does not have a parent ( no layers )do not try to mount + // just return the diff path to the data + out := path.Join(a.rootPath(), "diff", id) + if len(ids) > 0 { + out = path.Join(a.rootPath(), "mnt", id) + + if count == 0 { + if err := a.mount(id); err != nil { + return "", err + } + } + } + + a.active[id] = count + 1 + + return out, nil +} + +func (a *Driver) Put(id string) { + // Protect the a.active from concurrent access + a.Lock() + defer a.Unlock() + + if count := a.active[id]; count > 1 { + a.active[id] = count - 1 + } else { + ids, _ := getParentIds(a.rootPath(), id) + // We only mounted if there are any parents + if ids != nil && len(ids) > 0 { + a.unmount(id) + } + delete(a.active, id) + } +} + +// Returns an archive of the contents for the id +func (a *Driver) Diff(id string) (archive.Archive, error) { + return archive.TarFilter(path.Join(a.rootPath(), "diff", id), &archive.TarOptions{ + Compression: archive.Uncompressed, + }) +} + +func (a *Driver) ApplyDiff(id string, diff archive.ArchiveReader) error { + return archive.Untar(diff, path.Join(a.rootPath(), "diff", id), nil) +} + +// Returns the size of the contents for 
the id +func (a *Driver) DiffSize(id string) (int64, error) { + return utils.TreeSize(path.Join(a.rootPath(), "diff", id)) +} + +func (a *Driver) Changes(id string) ([]archive.Change, error) { + layers, err := a.getParentLayerPaths(id) + if err != nil { + return nil, err + } + return archive.Changes(layers, path.Join(a.rootPath(), "diff", id)) +} + +func (a *Driver) getParentLayerPaths(id string) ([]string, error) { + parentIds, err := getParentIds(a.rootPath(), id) + if err != nil { + return nil, err + } + if len(parentIds) == 0 { + return nil, fmt.Errorf("Dir %s does not have any parent layers", id) + } + layers := make([]string, len(parentIds)) + + // Get the diff paths for all the parent ids + for i, p := range parentIds { + layers[i] = path.Join(a.rootPath(), "diff", p) + } + return layers, nil +} + +func (a *Driver) mount(id string) error { + // If the id is mounted or we get an error return + if mounted, err := a.mounted(id); err != nil || mounted { + return err + } + + var ( + target = path.Join(a.rootPath(), "mnt", id) + rw = path.Join(a.rootPath(), "diff", id) + ) + + layers, err := a.getParentLayerPaths(id) + if err != nil { + return err + } + + if err := a.aufsMount(layers, rw, target); err != nil { + return err + } + return nil +} + +func (a *Driver) unmount(id string) error { + if mounted, err := a.mounted(id); err != nil || !mounted { + return err + } + target := path.Join(a.rootPath(), "mnt", id) + return Unmount(target) +} + +func (a *Driver) mounted(id string) (bool, error) { + target := path.Join(a.rootPath(), "mnt", id) + return mountpk.Mounted(target) +} + +// During cleanup aufs needs to unmount all mountpoints +func (a *Driver) Cleanup() error { + ids, err := loadIds(path.Join(a.rootPath(), "layers")) + if err != nil { + return err + } + for _, id := range ids { + if err := a.unmount(id); err != nil { + utils.Errorf("Unmounting %s: %s", utils.TruncateID(id), err) + } + } + return nil +} + +func (a *Driver) aufsMount(ro []string, rw, target 
string) (err error) { + defer func() { + if err != nil { + Unmount(target) + } + }() + + if err = a.tryMount(ro, rw, target); err != nil { + if err = a.mountRw(rw, target); err != nil { + return + } + + for _, layer := range ro { + branch := fmt.Sprintf("append:%s=ro+wh", layer) + if err = mount("none", target, "aufs", MsRemount, branch); err != nil { + return + } + } + } + return +} + +// Try to mount using the aufs fast path, if this fails then +// append ro layers. +func (a *Driver) tryMount(ro []string, rw, target string) (err error) { + var ( + rwBranch = fmt.Sprintf("%s=rw", rw) + roBranches = fmt.Sprintf("%s=ro+wh:", strings.Join(ro, "=ro+wh:")) + ) + return mount("none", target, "aufs", 0, fmt.Sprintf("br:%v:%v,xino=/dev/shm/aufs.xino", rwBranch, roBranches)) +} + +func (a *Driver) mountRw(rw, target string) error { + return mount("none", target, "aufs", 0, fmt.Sprintf("br:%s,xino=/dev/shm/aufs.xino", rw)) +} + +func rollbackMount(target string, err error) { + if err != nil { + Unmount(target) + } +} diff --git a/runtime/graphdriver/aufs/aufs_test.go b/runtime/graphdriver/aufs/aufs_test.go new file mode 100644 index 0000000000..9cfdebd160 --- /dev/null +++ b/runtime/graphdriver/aufs/aufs_test.go @@ -0,0 +1,697 @@ +package aufs + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "github.com/dotcloud/docker/archive" + "github.com/dotcloud/docker/runtime/graphdriver" + "io/ioutil" + "os" + "path" + "testing" +) + +var ( + tmp = path.Join(os.TempDir(), "aufs-tests", "aufs") +) + +func testInit(dir string, t *testing.T) graphdriver.Driver { + d, err := Init(dir) + if err != nil { + if err == ErrAufsNotSupported { + t.Skip(err) + } else { + t.Fatal(err) + } + } + return d +} + +func newDriver(t *testing.T) *Driver { + if err := os.MkdirAll(tmp, 0755); err != nil { + t.Fatal(err) + } + + d := testInit(tmp, t) + return d.(*Driver) +} + +func TestNewDriver(t *testing.T) { + if err := os.MkdirAll(tmp, 0755); err != nil { + t.Fatal(err) + } + + d := testInit(tmp, 
t) + defer os.RemoveAll(tmp) + if d == nil { + t.Fatalf("Driver should not be nil") + } +} + +func TestAufsString(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if d.String() != "aufs" { + t.Fatalf("Expected aufs got %s", d.String()) + } +} + +func TestCreateDirStructure(t *testing.T) { + newDriver(t) + defer os.RemoveAll(tmp) + + paths := []string{ + "mnt", + "layers", + "diff", + } + + for _, p := range paths { + if _, err := os.Stat(path.Join(tmp, p)); err != nil { + t.Fatal(err) + } + } +} + +// We should be able to create two drivers with the same dir structure +func TestNewDriverFromExistingDir(t *testing.T) { + if err := os.MkdirAll(tmp, 0755); err != nil { + t.Fatal(err) + } + + testInit(tmp, t) + testInit(tmp, t) + os.RemoveAll(tmp) +} + +func TestCreateNewDir(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } +} + +func TestCreateNewDirStructure(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + paths := []string{ + "mnt", + "diff", + "layers", + } + + for _, p := range paths { + if _, err := os.Stat(path.Join(tmp, p, "1")); err != nil { + t.Fatal(err) + } + } +} + +func TestRemoveImage(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + if err := d.Remove("1"); err != nil { + t.Fatal(err) + } + + paths := []string{ + "mnt", + "diff", + "layers", + } + + for _, p := range paths { + if _, err := os.Stat(path.Join(tmp, p, "1")); err == nil { + t.Fatalf("Error should not be nil because dirs with id 1 should be delted: %s", p) + } + } +} + +func TestGetWithoutParent(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + diffPath, err := d.Get("1") + if err != nil { + t.Fatal(err) + } + expected := path.Join(tmp, "diff", 
"1") + if diffPath != expected { + t.Fatalf("Expected path %s got %s", expected, diffPath) + } +} + +func TestCleanupWithNoDirs(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Cleanup(); err != nil { + t.Fatal(err) + } +} + +func TestCleanupWithDir(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + if err := d.Cleanup(); err != nil { + t.Fatal(err) + } +} + +func TestMountedFalseResponse(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + response, err := d.mounted("1") + if err != nil { + t.Fatal(err) + } + + if response != false { + t.Fatalf("Response if dir id 1 is mounted should be false") + } +} + +func TestMountedTrueReponse(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + defer d.Cleanup() + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + if err := d.Create("2", "1", ""); err != nil { + t.Fatal(err) + } + + _, err := d.Get("2") + if err != nil { + t.Fatal(err) + } + + response, err := d.mounted("2") + if err != nil { + t.Fatal(err) + } + + if response != true { + t.Fatalf("Response if dir id 2 is mounted should be true") + } +} + +func TestMountWithParent(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + if err := d.Create("2", "1", ""); err != nil { + t.Fatal(err) + } + + defer func() { + if err := d.Cleanup(); err != nil { + t.Fatal(err) + } + }() + + mntPath, err := d.Get("2") + if err != nil { + t.Fatal(err) + } + if mntPath == "" { + t.Fatal("mntPath should not be empty string") + } + + expected := path.Join(tmp, "mnt", "2") + if mntPath != expected { + t.Fatalf("Expected %s got %s", expected, mntPath) + } +} + +func TestRemoveMountedDir(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != 
nil { + t.Fatal(err) + } + if err := d.Create("2", "1", ""); err != nil { + t.Fatal(err) + } + + defer func() { + if err := d.Cleanup(); err != nil { + t.Fatal(err) + } + }() + + mntPath, err := d.Get("2") + if err != nil { + t.Fatal(err) + } + if mntPath == "" { + t.Fatal("mntPath should not be empty string") + } + + mounted, err := d.mounted("2") + if err != nil { + t.Fatal(err) + } + + if !mounted { + t.Fatalf("Dir id 2 should be mounted") + } + + if err := d.Remove("2"); err != nil { + t.Fatal(err) + } +} + +func TestCreateWithInvalidParent(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "docker", ""); err == nil { + t.Fatalf("Error should not be nil with parent does not exist") + } +} + +func TestGetDiff(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + diffPath, err := d.Get("1") + if err != nil { + t.Fatal(err) + } + + // Add a file to the diff path with a fixed size + size := int64(1024) + + f, err := os.Create(path.Join(diffPath, "test_file")) + if err != nil { + t.Fatal(err) + } + if err := f.Truncate(size); err != nil { + t.Fatal(err) + } + f.Close() + + a, err := d.Diff("1") + if err != nil { + t.Fatal(err) + } + if a == nil { + t.Fatalf("Archive should not be nil") + } +} + +func TestChanges(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + if err := d.Create("2", "1", ""); err != nil { + t.Fatal(err) + } + + defer func() { + if err := d.Cleanup(); err != nil { + t.Fatal(err) + } + }() + + mntPoint, err := d.Get("2") + if err != nil { + t.Fatal(err) + } + + // Create a file to save in the mountpoint + f, err := os.Create(path.Join(mntPoint, "test.txt")) + if err != nil { + t.Fatal(err) + } + + if _, err := f.WriteString("testline"); err != nil { + t.Fatal(err) + } + if err := f.Close(); err != nil { + t.Fatal(err) + } + + changes, err := 
d.Changes("2") + if err != nil { + t.Fatal(err) + } + if len(changes) != 1 { + t.Fatalf("Dir 2 should have one change from parent got %d", len(changes)) + } + change := changes[0] + + expectedPath := "/test.txt" + if change.Path != expectedPath { + t.Fatalf("Expected path %s got %s", expectedPath, change.Path) + } + + if change.Kind != archive.ChangeAdd { + t.Fatalf("Change kind should be ChangeAdd got %s", change.Kind) + } + + if err := d.Create("3", "2", ""); err != nil { + t.Fatal(err) + } + mntPoint, err = d.Get("3") + if err != nil { + t.Fatal(err) + } + + // Create a file to save in the mountpoint + f, err = os.Create(path.Join(mntPoint, "test2.txt")) + if err != nil { + t.Fatal(err) + } + + if _, err := f.WriteString("testline"); err != nil { + t.Fatal(err) + } + if err := f.Close(); err != nil { + t.Fatal(err) + } + + changes, err = d.Changes("3") + if err != nil { + t.Fatal(err) + } + + if len(changes) != 1 { + t.Fatalf("Dir 2 should have one change from parent got %d", len(changes)) + } + change = changes[0] + + expectedPath = "/test2.txt" + if change.Path != expectedPath { + t.Fatalf("Expected path %s got %s", expectedPath, change.Path) + } + + if change.Kind != archive.ChangeAdd { + t.Fatalf("Change kind should be ChangeAdd got %s", change.Kind) + } +} + +func TestDiffSize(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + diffPath, err := d.Get("1") + if err != nil { + t.Fatal(err) + } + + // Add a file to the diff path with a fixed size + size := int64(1024) + + f, err := os.Create(path.Join(diffPath, "test_file")) + if err != nil { + t.Fatal(err) + } + if err := f.Truncate(size); err != nil { + t.Fatal(err) + } + s, err := f.Stat() + if err != nil { + t.Fatal(err) + } + size = s.Size() + if err := f.Close(); err != nil { + t.Fatal(err) + } + + diffSize, err := d.DiffSize("1") + if err != nil { + t.Fatal(err) + } + if diffSize != size { + t.Fatalf("Expected size to be 
%d got %d", size, diffSize) + } +} + +func TestChildDiffSize(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + defer d.Cleanup() + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + diffPath, err := d.Get("1") + if err != nil { + t.Fatal(err) + } + + // Add a file to the diff path with a fixed size + size := int64(1024) + + f, err := os.Create(path.Join(diffPath, "test_file")) + if err != nil { + t.Fatal(err) + } + if err := f.Truncate(size); err != nil { + t.Fatal(err) + } + s, err := f.Stat() + if err != nil { + t.Fatal(err) + } + size = s.Size() + if err := f.Close(); err != nil { + t.Fatal(err) + } + + diffSize, err := d.DiffSize("1") + if err != nil { + t.Fatal(err) + } + if diffSize != size { + t.Fatalf("Expected size to be %d got %d", size, diffSize) + } + + if err := d.Create("2", "1", ""); err != nil { + t.Fatal(err) + } + + diffSize, err = d.DiffSize("2") + if err != nil { + t.Fatal(err) + } + // The diff size for the child should be zero + if diffSize != 0 { + t.Fatalf("Expected size to be %d got %d", 0, diffSize) + } +} + +func TestExists(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + defer d.Cleanup() + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + if d.Exists("none") { + t.Fatal("id name should not exist in the driver") + } + + if !d.Exists("1") { + t.Fatal("id 1 should exist in the driver") + } +} + +func TestStatus(t *testing.T) { + d := newDriver(t) + defer os.RemoveAll(tmp) + defer d.Cleanup() + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + status := d.Status() + if status == nil || len(status) == 0 { + t.Fatal("Status should not be nil or empty") + } + rootDir := status[0] + dirs := status[1] + if rootDir[0] != "Root Dir" { + t.Fatalf("Expected Root Dir got %s", rootDir[0]) + } + if rootDir[1] != d.rootPath() { + t.Fatalf("Expected %s got %s", d.rootPath(), rootDir[1]) + } + if dirs[0] != "Dirs" { + t.Fatalf("Expected Dirs got %s", dirs[0]) + } + 
// hash returns the lowercase hexadecimal SHA-256 digest of c.
func hash(c string) string {
	digest := sha256.Sum256([]byte(c))
	return hex.EncodeToString(digest[:])
}
// loadIds lists root and returns the name of every entry that is NOT a
// directory, in the order ioutil.ReadDir reports them (sorted by filename).
//
// The result is an empty, non-nil slice when no entry qualifies. An error
// from reading root is returned unchanged together with a nil slice.
func loadIds(root string) ([]string, error) {
	entries, err := ioutil.ReadDir(root)
	if err != nil {
		return nil, err
	}

	ids := make([]string, 0)
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		ids = append(ids, entry.Name())
	}
	return ids, nil
}
// getParentIds reads <root>/layers/<id> and returns its non-empty lines,
// one parent layer id per line, in file order.
//
// A file without any non-empty line yields an empty, non-nil slice. Failing
// to open the file returns a nil slice and the open error; a scanner error
// is returned alongside whatever lines were collected before it.
func getParentIds(root, id string) ([]string, error) {
	layers, err := os.Open(path.Join(root, "layers", id))
	if err != nil {
		return nil, err
	}
	defer layers.Close()

	parents := make([]string, 0)
	scanner := bufio.NewScanner(layers)
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			continue
		}
		parents = append(parents, line)
	}
	return parents, scanner.Err()
}
+func (a *Driver) Migrate(pth string, setupInit func(p string) error) error { + if pathExists(path.Join(pth, "graph")) { + if err := a.migrateRepositories(pth); err != nil { + return err + } + if err := a.migrateImages(path.Join(pth, "graph")); err != nil { + return err + } + return a.migrateContainers(path.Join(pth, "containers"), setupInit) + } + return nil +} + +func (a *Driver) migrateRepositories(pth string) error { + name := path.Join(pth, "repositories") + if err := os.Rename(name, name+"-aufs"); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +func (a *Driver) migrateContainers(pth string, setupInit func(p string) error) error { + fis, err := ioutil.ReadDir(pth) + if err != nil { + return err + } + + for _, fi := range fis { + if id := fi.Name(); fi.IsDir() && pathExists(path.Join(pth, id, "rw")) { + if err := tryRelocate(path.Join(pth, id, "rw"), path.Join(a.rootPath(), "diff", id)); err != nil { + return err + } + + if !a.Exists(id) { + + metadata, err := loadMetadata(path.Join(pth, id, "config.json")) + if err != nil { + return err + } + + initID := fmt.Sprintf("%s-init", id) + if err := a.Create(initID, metadata.Image, ""); err != nil { + return err + } + + initPath, err := a.Get(initID) + if err != nil { + return err + } + // setup init layer + if err := setupInit(initPath); err != nil { + return err + } + + if err := a.Create(id, initID, ""); err != nil { + return err + } + } + } + } + return nil +} + +func (a *Driver) migrateImages(pth string) error { + fis, err := ioutil.ReadDir(pth) + if err != nil { + return err + } + var ( + m = make(map[string]*metadata) + current *metadata + exists bool + ) + + for _, fi := range fis { + if id := fi.Name(); fi.IsDir() && pathExists(path.Join(pth, id, "layer")) { + if current, exists = m[id]; !exists { + current, err = loadMetadata(path.Join(pth, id, "json")) + if err != nil { + return err + } + m[id] = current + } + } + } + + for _, v := range m { + v.parent = m[v.ParentID] + } + + 
migrated := make(map[string]bool) + for _, v := range m { + if err := a.migrateImage(v, pth, migrated); err != nil { + return err + } + } + return nil +} + +func (a *Driver) migrateImage(m *metadata, pth string, migrated map[string]bool) error { + if !migrated[m.ID] { + if m.parent != nil { + a.migrateImage(m.parent, pth, migrated) + } + if err := tryRelocate(path.Join(pth, m.ID, "layer"), path.Join(a.rootPath(), "diff", m.ID)); err != nil { + return err + } + if !a.Exists(m.ID) { + if err := a.Create(m.ID, m.ParentID, ""); err != nil { + return err + } + } + migrated[m.ID] = true + } + return nil +} + +// tryRelocate will try to rename the old path to the new pack and if +// the operation fails, it will fallback to a symlink +func tryRelocate(oldPath, newPath string) error { + s, err := os.Lstat(newPath) + if err != nil && !os.IsNotExist(err) { + return err + } + // If the destination is a symlink then we already tried to relocate once before + // and it failed so we delete it and try to remove + if s != nil && s.Mode()&os.ModeSymlink == os.ModeSymlink { + if err := os.RemoveAll(newPath); err != nil { + return err + } + } + if err := os.Rename(oldPath, newPath); err != nil { + if sErr := os.Symlink(oldPath, newPath); sErr != nil { + return fmt.Errorf("Unable to relocate %s to %s: Rename err %s Symlink err %s", oldPath, newPath, err, sErr) + } + } + return nil +} + +func loadMetadata(pth string) (*metadata, error) { + f, err := os.Open(pth) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + out = &metadata{} + dec = json.NewDecoder(f) + ) + + if err := dec.Decode(out); err != nil { + return nil, err + } + return out, nil +} diff --git a/runtime/graphdriver/aufs/mount.go b/runtime/graphdriver/aufs/mount.go new file mode 100644 index 0000000000..1f1d98f809 --- /dev/null +++ b/runtime/graphdriver/aufs/mount.go @@ -0,0 +1,17 @@ +package aufs + +import ( + "github.com/dotcloud/docker/utils" + "os/exec" + "syscall" +) + +func Unmount(target string) 
// mount is the stub compiled on platforms without aufs support (everything
// that is not linux/amd64, per this file's build constraint). It never
// mounts anything and always returns an error.
//
// The message no longer names darwin: the build constraint also selects this
// stub on other operating systems and on non-amd64 Linux.
func mount(source string, target string, fstype string, flags uintptr, data string) (err error) {
	return errors.New("mount is not implemented on this platform")
}
rootdir) + } + + return &Driver{ + home: home, + }, nil +} + +type Driver struct { + home string +} + +func (d *Driver) String() string { + return "btrfs" +} + +func (d *Driver) Status() [][2]string { + return nil +} + +func (d *Driver) Cleanup() error { + return nil +} + +func free(p *C.char) { + C.free(unsafe.Pointer(p)) +} + +func openDir(path string) (*C.DIR, error) { + Cpath := C.CString(path) + defer free(Cpath) + + dir := C.opendir(Cpath) + if dir == nil { + return nil, fmt.Errorf("Can't open dir") + } + return dir, nil +} + +func closeDir(dir *C.DIR) { + if dir != nil { + C.closedir(dir) + } +} + +func getDirFd(dir *C.DIR) uintptr { + return uintptr(C.dirfd(dir)) +} + +func subvolCreate(path, name string, mountLabel string) error { + dir, err := openDir(path) + if err != nil { + return err + } + defer closeDir(dir) + + var args C.struct_btrfs_ioctl_vol_args + for i, c := range []byte(name) { + args.name[i] = C.char(c) + } + + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SUBVOL_CREATE, + uintptr(unsafe.Pointer(&args))) + if errno != 0 { + return fmt.Errorf("Failed to create btrfs subvolume: %v", errno.Error()) + } + return nil +} + +func subvolSnapshot(src, dest, name string) error { + srcDir, err := openDir(src) + if err != nil { + return err + } + defer closeDir(srcDir) + + destDir, err := openDir(dest) + if err != nil { + return err + } + defer closeDir(destDir) + + var args C.struct_btrfs_ioctl_vol_args_v2 + args.fd = C.__s64(getDirFd(srcDir)) + for i, c := range []byte(name) { + args.name[i] = C.char(c) + } + + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(destDir), C.BTRFS_IOC_SNAP_CREATE_V2, + uintptr(unsafe.Pointer(&args))) + if errno != 0 { + return fmt.Errorf("Failed to create btrfs snapshot: %v", errno.Error()) + } + return nil +} + +func subvolDelete(path, name string) error { + dir, err := openDir(path) + if err != nil { + return err + } + defer closeDir(dir) + + var args C.struct_btrfs_ioctl_vol_args 
+ for i, c := range []byte(name) { + args.name[i] = C.char(c) + } + + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SNAP_DESTROY, + uintptr(unsafe.Pointer(&args))) + if errno != 0 { + return fmt.Errorf("Failed to destroy btrfs snapshot: %v", errno.Error()) + } + return nil +} + +func (d *Driver) subvolumesDir() string { + return path.Join(d.home, "subvolumes") +} + +func (d *Driver) subvolumesDirId(id string) string { + return path.Join(d.subvolumesDir(), id) +} + +func (d *Driver) Create(id string, parent string, mountLabel string) error { + subvolumes := path.Join(d.home, "subvolumes") + if err := os.MkdirAll(subvolumes, 0700); err != nil { + return err + } + if parent == "" { + if err := subvolCreate(subvolumes, id, mountLabel); err != nil { + return err + } + } else { + parentDir, err := d.Get(parent) + if err != nil { + return err + } + if err := subvolSnapshot(parentDir, subvolumes, id); err != nil { + return err + } + } + return nil +} + +func (d *Driver) Remove(id string) error { + dir := d.subvolumesDirId(id) + if _, err := os.Stat(dir); err != nil { + return err + } + if err := subvolDelete(d.subvolumesDir(), id); err != nil { + return err + } + return os.RemoveAll(dir) +} + +func (d *Driver) Get(id string) (string, error) { + dir := d.subvolumesDirId(id) + st, err := os.Stat(dir) + if err != nil { + return "", err + } + + if !st.IsDir() { + return "", fmt.Errorf("%s: not a directory", dir) + } + + return dir, nil +} + +func (d *Driver) Put(id string) { + // Get() creates no runtime resources (like e.g. mounts) + // so this doesn't need to do anything. 
+} + +func (d *Driver) Exists(id string) bool { + dir := d.subvolumesDirId(id) + _, err := os.Stat(dir) + return err == nil +} diff --git a/runtime/graphdriver/btrfs/dummy_unsupported.go b/runtime/graphdriver/btrfs/dummy_unsupported.go new file mode 100644 index 0000000000..6c44615763 --- /dev/null +++ b/runtime/graphdriver/btrfs/dummy_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux !amd64 + +package btrfs diff --git a/runtime/graphdriver/devmapper/attach_loopback.go b/runtime/graphdriver/devmapper/attach_loopback.go new file mode 100644 index 0000000000..23339076e8 --- /dev/null +++ b/runtime/graphdriver/devmapper/attach_loopback.go @@ -0,0 +1,126 @@ +// +build linux,amd64 + +package devmapper + +import ( + "fmt" + "github.com/dotcloud/docker/utils" +) + +func stringToLoopName(src string) [LoNameSize]uint8 { + var dst [LoNameSize]uint8 + copy(dst[:], src[:]) + return dst +} + +func getNextFreeLoopbackIndex() (int, error) { + f, err := osOpenFile("/dev/loop-control", osORdOnly, 0644) + if err != nil { + return 0, err + } + defer f.Close() + + index, err := ioctlLoopCtlGetFree(f.Fd()) + if index < 0 { + index = 0 + } + return index, err +} + +func openNextAvailableLoopback(index int, sparseFile *osFile) (loopFile *osFile, err error) { + // Start looking for a free /dev/loop + for { + target := fmt.Sprintf("/dev/loop%d", index) + index++ + + fi, err := osStat(target) + if err != nil { + if osIsNotExist(err) { + utils.Errorf("There are no more loopback device available.") + } + return nil, ErrAttachLoopbackDevice + } + + if fi.Mode()&osModeDevice != osModeDevice { + utils.Errorf("Loopback device %s is not a block device.", target) + continue + } + + // OpenFile adds O_CLOEXEC + loopFile, err = osOpenFile(target, osORdWr, 0644) + if err != nil { + utils.Errorf("Error openning loopback device: %s", err) + return nil, ErrAttachLoopbackDevice + } + + // Try to attach to the loop file + if err := ioctlLoopSetFd(loopFile.Fd(), sparseFile.Fd()); err != nil { + 
// attachLoopDevice attaches the given sparse file to the next
// available loopback device. It returns an opened *osFile.
//
// sparseName is the path of the sparse file to attach. On failure to open
// the sparse file or to set the loop device status the returned error is
// ErrAttachLoopbackDevice (details are logged); an error from
// openNextAvailableLoopback is returned unchanged. The named result loop is
// never assigned; every return statement supplies its values explicitly.
func attachLoopDevice(sparseName string) (loop *osFile, err error) {

	// Try to retrieve the next available loopback device via syscall.
	// If it fails, we discard the error (it is only logged at debug level)
	// and start looking for a loopback from whatever index was returned.
	startIndex, err := getNextFreeLoopbackIndex()
	if err != nil {
		utils.Debugf("Error retrieving the next available loopback: %s", err)
	}

	// OpenFile adds O_CLOEXEC
	sparseFile, err := osOpenFile(sparseName, osORdWr, 0644)
	if err != nil {
		utils.Errorf("Error openning sparse file %s: %s", sparseName, err)
		return nil, ErrAttachLoopbackDevice
	}
	// The sparse file handle is only needed while attaching; it is closed on
	// every return path.
	defer sparseFile.Close()

	loopFile, err := openNextAvailableLoopback(startIndex, sparseFile)
	if err != nil {
		return nil, err
	}

	// Set the status of the loopback device
	// NOTE(review): loFileName is filled from the loop device's own name, not
	// from sparseName — confirm this is intended.
	loopInfo := &LoopInfo64{
		loFileName: stringToLoopName(loopFile.Name()),
		loOffset:   0,
		loFlags:    LoFlagsAutoClear,
	}

	if err := ioctlLoopSetStatus64(loopFile.Fd(), loopInfo); err != nil {
		utils.Errorf("Cannot set up loopback device info: %s", err)

		// If the call failed, then free the loopback device
		// (a failure of the cleanup itself is logged but not returned).
		if err := ioctlLoopClrFd(loopFile.Fd()); err != nil {
			utils.Errorf("Error while cleaning up the loopback device")
		}
		loopFile.Close()
		return nil, ErrAttachLoopbackDevice
	}

	return loopFile, nil
}
+ lock sync.Mutex `json:"-"` +} + +type MetaData struct { + Devices map[string]*DevInfo `json:devices` + devicesLock sync.Mutex `json:"-"` // Protects all read/writes to Devices map +} + +type DeviceSet struct { + MetaData + sync.Mutex // Protects Devices map and serializes calls into libdevmapper + root string + devicePrefix string + TransactionId uint64 + NewTransactionId uint64 + nextFreeDevice int + sawBusy bool +} + +type DiskUsage struct { + Used uint64 + Total uint64 +} + +type Status struct { + PoolName string + DataLoopback string + MetadataLoopback string + Data DiskUsage + Metadata DiskUsage + SectorSize uint64 +} + +type DevStatus struct { + DeviceId int + Size uint64 + TransactionId uint64 + SizeInSectors uint64 + MappedSectors uint64 + HighestMappedSector uint64 +} + +type UnmountMode int + +const ( + UnmountRegular UnmountMode = iota + UnmountFloat + UnmountSink +) + +func getDevName(name string) string { + return "/dev/mapper/" + name +} + +func (info *DevInfo) Name() string { + hash := info.Hash + if hash == "" { + hash = "base" + } + return fmt.Sprintf("%s-%s", info.devices.devicePrefix, hash) +} + +func (info *DevInfo) DevName() string { + return getDevName(info.Name()) +} + +func (devices *DeviceSet) loopbackDir() string { + return path.Join(devices.root, "devicemapper") +} + +func (devices *DeviceSet) jsonFile() string { + return path.Join(devices.loopbackDir(), "json") +} + +func (devices *DeviceSet) getPoolName() string { + return devices.devicePrefix + "-pool" +} + +func (devices *DeviceSet) getPoolDevName() string { + return getDevName(devices.getPoolName()) +} + +func (devices *DeviceSet) hasImage(name string) bool { + dirname := devices.loopbackDir() + filename := path.Join(dirname, name) + + _, err := osStat(filename) + return err == nil +} + +// ensureImage creates a sparse file of <size> bytes at the path +// <root>/devicemapper/<name>. +// If the file already exists, it does nothing. +// Either way it returns the full path. 
+func (devices *DeviceSet) ensureImage(name string, size int64) (string, error) { + dirname := devices.loopbackDir() + filename := path.Join(dirname, name) + + if err := osMkdirAll(dirname, 0700); err != nil && !osIsExist(err) { + return "", err + } + + if _, err := osStat(filename); err != nil { + if !osIsNotExist(err) { + return "", err + } + utils.Debugf("Creating loopback file %s for device-manage use", filename) + file, err := osOpenFile(filename, osORdWr|osOCreate, 0600) + if err != nil { + return "", err + } + defer file.Close() + + if err = file.Truncate(size); err != nil { + return "", err + } + } + return filename, nil +} + +func (devices *DeviceSet) allocateDeviceId() int { + // TODO: Add smarter reuse of deleted devices + id := devices.nextFreeDevice + devices.nextFreeDevice = devices.nextFreeDevice + 1 + return id +} + +func (devices *DeviceSet) allocateTransactionId() uint64 { + devices.NewTransactionId = devices.NewTransactionId + 1 + return devices.NewTransactionId +} + +func (devices *DeviceSet) saveMetadata() error { + devices.devicesLock.Lock() + jsonData, err := json.Marshal(devices.MetaData) + devices.devicesLock.Unlock() + if err != nil { + return fmt.Errorf("Error encoding metadata to json: %s", err) + } + tmpFile, err := ioutil.TempFile(filepath.Dir(devices.jsonFile()), ".json") + if err != nil { + return fmt.Errorf("Error creating metadata file: %s", err) + } + + n, err := tmpFile.Write(jsonData) + if err != nil { + return fmt.Errorf("Error writing metadata to %s: %s", tmpFile.Name(), err) + } + if n < len(jsonData) { + return io.ErrShortWrite + } + if err := tmpFile.Sync(); err != nil { + return fmt.Errorf("Error syncing metadata file %s: %s", tmpFile.Name(), err) + } + if err := tmpFile.Close(); err != nil { + return fmt.Errorf("Error closing metadata file %s: %s", tmpFile.Name(), err) + } + if err := osRename(tmpFile.Name(), devices.jsonFile()); err != nil { + return fmt.Errorf("Error committing metadata file %s: %s", tmpFile.Name(), 
err) + } + + if devices.NewTransactionId != devices.TransactionId { + if err = setTransactionId(devices.getPoolDevName(), devices.TransactionId, devices.NewTransactionId); err != nil { + return fmt.Errorf("Error setting devmapper transition ID: %s", err) + } + devices.TransactionId = devices.NewTransactionId + } + return nil +} + +func (devices *DeviceSet) lookupDevice(hash string) (*DevInfo, error) { + devices.devicesLock.Lock() + defer devices.devicesLock.Unlock() + info := devices.Devices[hash] + if info == nil { + return nil, fmt.Errorf("Unknown device %s", hash) + } + return info, nil +} + +func (devices *DeviceSet) registerDevice(id int, hash string, size uint64) (*DevInfo, error) { + utils.Debugf("registerDevice(%v, %v)", id, hash) + info := &DevInfo{ + Hash: hash, + DeviceId: id, + Size: size, + TransactionId: devices.allocateTransactionId(), + Initialized: false, + devices: devices, + } + + devices.devicesLock.Lock() + devices.Devices[hash] = info + devices.devicesLock.Unlock() + + if err := devices.saveMetadata(); err != nil { + // Try to remove unused device + devices.devicesLock.Lock() + delete(devices.Devices, hash) + devices.devicesLock.Unlock() + return nil, err + } + + return info, nil +} + +func (devices *DeviceSet) activateDeviceIfNeeded(info *DevInfo) error { + utils.Debugf("activateDeviceIfNeeded(%v)", info.Hash) + + if devinfo, _ := getInfo(info.Name()); devinfo != nil && devinfo.Exists != 0 { + return nil + } + + return activateDevice(devices.getPoolDevName(), info.Name(), info.DeviceId, info.Size) +} + +func (devices *DeviceSet) createFilesystem(info *DevInfo) error { + devname := info.DevName() + + err := execRun("mkfs.ext4", "-E", "discard,lazy_itable_init=0,lazy_journal_init=0", devname) + if err != nil { + err = execRun("mkfs.ext4", "-E", "discard,lazy_itable_init=0", devname) + } + if err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + return nil +} + +func (devices *DeviceSet) loadMetaData() error { + 
utils.Debugf("loadMetadata()") + defer utils.Debugf("loadMetadata END") + _, _, _, params, err := getStatus(devices.getPoolName()) + if err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + if _, err := fmt.Sscanf(params, "%d", &devices.TransactionId); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + devices.NewTransactionId = devices.TransactionId + + jsonData, err := ioutil.ReadFile(devices.jsonFile()) + if err != nil && !osIsNotExist(err) { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + devices.MetaData.Devices = make(map[string]*DevInfo) + if jsonData != nil { + if err := json.Unmarshal(jsonData, &devices.MetaData); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + } + + for hash, d := range devices.Devices { + d.Hash = hash + d.devices = devices + + if d.DeviceId >= devices.nextFreeDevice { + devices.nextFreeDevice = d.DeviceId + 1 + } + + // If the transaction id is larger than the actual one we lost the device due to some crash + if d.TransactionId > devices.TransactionId { + utils.Debugf("Removing lost device %s with id %d", hash, d.TransactionId) + delete(devices.Devices, hash) + } + } + return nil +} + +func (devices *DeviceSet) setupBaseImage() error { + oldInfo, _ := devices.lookupDevice("") + if oldInfo != nil && oldInfo.Initialized { + return nil + } + + if oldInfo != nil && !oldInfo.Initialized { + utils.Debugf("Removing uninitialized base image") + if err := devices.deleteDevice(oldInfo); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + } + + utils.Debugf("Initializing base device-manager snapshot") + + id := devices.allocateDeviceId() + + // Create initial device + if err := createDevice(devices.getPoolDevName(), id); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + utils.Debugf("Registering base device (id %v) with FS size %v", id, DefaultBaseFsSize) + info, err := devices.registerDevice(id, "", DefaultBaseFsSize) + if err != 
nil { + _ = deleteDevice(devices.getPoolDevName(), id) + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + utils.Debugf("Creating filesystem on base device-manager snapshot") + + if err = devices.activateDeviceIfNeeded(info); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + if err := devices.createFilesystem(info); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + info.Initialized = true + if err = devices.saveMetadata(); err != nil { + info.Initialized = false + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + return nil +} + +func setCloseOnExec(name string) { + if fileInfos, _ := ioutil.ReadDir("/proc/self/fd"); fileInfos != nil { + for _, i := range fileInfos { + link, _ := osReadlink(filepath.Join("/proc/self/fd", i.Name())) + if link == name { + fd, err := strconv.Atoi(i.Name()) + if err == nil { + sysCloseOnExec(fd) + } + } + } + } +} + +func (devices *DeviceSet) log(level int, file string, line int, dmError int, message string) { + if level >= 7 { + return // Ignore _LOG_DEBUG + } + + if strings.Contains(message, "busy") { + devices.sawBusy = true + } + + utils.Debugf("libdevmapper(%d): %s:%d (%d) %s", level, file, line, dmError, message) +} + +func major(device uint64) uint64 { + return (device >> 8) & 0xfff +} + +func minor(device uint64) uint64 { + return (device & 0xff) | ((device >> 12) & 0xfff00) +} + +func (devices *DeviceSet) ResizePool(size int64) error { + dirname := devices.loopbackDir() + datafilename := path.Join(dirname, "data") + metadatafilename := path.Join(dirname, "metadata") + + datafile, err := osOpenFile(datafilename, osORdWr, 0) + if datafile == nil { + return err + } + defer datafile.Close() + + fi, err := datafile.Stat() + if fi == nil { + return err + } + + if fi.Size() > size { + return fmt.Errorf("Can't shrink file") + } + + dataloopback := FindLoopDeviceFor(datafile) + if dataloopback == nil { + return fmt.Errorf("Unable to find loopback mount for: %s", 
datafilename) + } + defer dataloopback.Close() + + metadatafile, err := osOpenFile(metadatafilename, osORdWr, 0) + if metadatafile == nil { + return err + } + defer metadatafile.Close() + + metadataloopback := FindLoopDeviceFor(metadatafile) + if metadataloopback == nil { + return fmt.Errorf("Unable to find loopback mount for: %s", metadatafilename) + } + defer metadataloopback.Close() + + // Grow loopback file + if err := datafile.Truncate(size); err != nil { + return fmt.Errorf("Unable to grow loopback file: %s", err) + } + + // Reload size for loopback device + if err := LoopbackSetCapacity(dataloopback); err != nil { + return fmt.Errorf("Unable to update loopback capacity: %s", err) + } + + // Suspend the pool + if err := suspendDevice(devices.getPoolName()); err != nil { + return fmt.Errorf("Unable to suspend pool: %s", err) + } + + // Reload with the new block sizes + if err := reloadPool(devices.getPoolName(), dataloopback, metadataloopback); err != nil { + return fmt.Errorf("Unable to reload pool: %s", err) + } + + // Resume the pool + if err := resumeDevice(devices.getPoolName()); err != nil { + return fmt.Errorf("Unable to resume pool: %s", err) + } + + return nil +} + +func (devices *DeviceSet) initDevmapper(doInit bool) error { + logInit(devices) + + // Make sure the sparse images exist in <root>/devicemapper/data and + // <root>/devicemapper/metadata + + hasData := devices.hasImage("data") + hasMetadata := devices.hasImage("metadata") + + if !doInit && !hasData { + return errors.New("Loopback data file not found") + } + + if !doInit && !hasMetadata { + return errors.New("Loopback metadata file not found") + } + + createdLoopback := !hasData || !hasMetadata + data, err := devices.ensureImage("data", DefaultDataLoopbackSize) + if err != nil { + utils.Debugf("Error device ensureImage (data): %s\n", err) + return err + } + metadata, err := devices.ensureImage("metadata", DefaultMetaDataLoopbackSize) + if err != nil { + utils.Debugf("Error device 
ensureImage (metadata): %s\n", err) + return err + } + + // Set the device prefix from the device id and inode of the docker root dir + + st, err := osStat(devices.root) + if err != nil { + return fmt.Errorf("Error looking up dir %s: %s", devices.root, err) + } + sysSt := toSysStatT(st.Sys()) + // "reg-" stands for "regular file". + // In the future we might use "dev-" for "device file", etc. + // docker-maj,min[-inode] stands for: + // - Managed by docker + // - The target of this device is at major <maj> and minor <min> + // - If <inode> is defined, use that file inside the device as a loopback image. Otherwise use the device itself. + devices.devicePrefix = fmt.Sprintf("docker-%d:%d-%d", major(sysSt.Dev), minor(sysSt.Dev), sysSt.Ino) + utils.Debugf("Generated prefix: %s", devices.devicePrefix) + + // Check for the existence of the device <prefix>-pool + utils.Debugf("Checking for existence of the pool '%s'", devices.getPoolName()) + info, err := getInfo(devices.getPoolName()) + if info == nil { + utils.Debugf("Error device getInfo: %s", err) + return err + } + + // It seems libdevmapper opens this without O_CLOEXEC, and go exec will not close files + // that are not Close-on-exec, and lxc-start will die if it inherits any unexpected files, + // so we add this badhack to make sure it closes itself + setCloseOnExec("/dev/mapper/control") + + // If the pool doesn't exist, create it + if info.Exists == 0 { + utils.Debugf("Pool doesn't exist. 
Creating it.") + + dataFile, err := attachLoopDevice(data) + if err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + defer dataFile.Close() + + metadataFile, err := attachLoopDevice(metadata) + if err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + defer metadataFile.Close() + + if err := createPool(devices.getPoolName(), dataFile, metadataFile); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + } + + // If we didn't just create the data or metadata image, we need to + // load the metadata from the existing file. + if !createdLoopback { + if err = devices.loadMetaData(); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + } + + // Setup the base image + if doInit { + if err := devices.setupBaseImage(); err != nil { + utils.Debugf("Error device setupBaseImage: %s\n", err) + return err + } + } + + return nil +} + +func (devices *DeviceSet) AddDevice(hash, baseHash string) error { + baseInfo, err := devices.lookupDevice(baseHash) + if err != nil { + return err + } + + baseInfo.lock.Lock() + defer baseInfo.lock.Unlock() + + devices.Lock() + defer devices.Unlock() + + if info, _ := devices.lookupDevice(hash); info != nil { + return fmt.Errorf("device %s already exists", hash) + } + + deviceId := devices.allocateDeviceId() + + if err := devices.createSnapDevice(devices.getPoolDevName(), deviceId, baseInfo.Name(), baseInfo.DeviceId); err != nil { + utils.Debugf("Error creating snap device: %s\n", err) + return err + } + + if _, err := devices.registerDevice(deviceId, hash, baseInfo.Size); err != nil { + deleteDevice(devices.getPoolDevName(), deviceId) + utils.Debugf("Error registering device: %s\n", err) + return err + } + return nil +} + +func (devices *DeviceSet) deleteDevice(info *DevInfo) error { + // This is a workaround for the kernel not discarding block so + // on the thin pool when we remove a thinp device, so we do it + // manually + if err := devices.activateDeviceIfNeeded(info); err 
== nil { + if err := BlockDeviceDiscard(info.DevName()); err != nil { + utils.Debugf("Error discarding block on device: %s (ignoring)\n", err) + } + } + + devinfo, _ := getInfo(info.Name()) + if devinfo != nil && devinfo.Exists != 0 { + if err := devices.removeDeviceAndWait(info.Name()); err != nil { + utils.Debugf("Error removing device: %s\n", err) + return err + } + } + + if info.Initialized { + info.Initialized = false + if err := devices.saveMetadata(); err != nil { + utils.Debugf("Error saving meta data: %s\n", err) + return err + } + } + + if err := deleteDevice(devices.getPoolDevName(), info.DeviceId); err != nil { + utils.Debugf("Error deleting device: %s\n", err) + return err + } + + devices.allocateTransactionId() + devices.devicesLock.Lock() + delete(devices.Devices, info.Hash) + devices.devicesLock.Unlock() + + if err := devices.saveMetadata(); err != nil { + devices.devicesLock.Lock() + devices.Devices[info.Hash] = info + devices.devicesLock.Unlock() + utils.Debugf("Error saving meta data: %s\n", err) + return err + } + + return nil +} + +func (devices *DeviceSet) DeleteDevice(hash string) error { + info, err := devices.lookupDevice(hash) + if err != nil { + return err + } + + info.lock.Lock() + defer info.lock.Unlock() + + devices.Lock() + defer devices.Unlock() + + return devices.deleteDevice(info) +} + +func (devices *DeviceSet) deactivatePool() error { + utils.Debugf("[devmapper] deactivatePool()") + defer utils.Debugf("[devmapper] deactivatePool END") + devname := devices.getPoolDevName() + devinfo, err := getInfo(devname) + if err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + if devinfo.Exists != 0 { + return removeDevice(devname) + } + + return nil +} + +func (devices *DeviceSet) deactivateDevice(info *DevInfo) error { + utils.Debugf("[devmapper] deactivateDevice(%s)", info.Hash) + defer utils.Debugf("[devmapper] deactivateDevice END") + + // Wait for the unmount to be effective, + // by watching the value of Info.OpenCount 
for the device + if err := devices.waitClose(info); err != nil { + utils.Errorf("Warning: error waiting for device %s to close: %s\n", info.Hash, err) + } + + devinfo, err := getInfo(info.Name()) + if err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + if devinfo.Exists != 0 { + if err := devices.removeDeviceAndWait(info.Name()); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + } + + return nil +} + +// Issues the underlying dm remove operation and then waits +// for it to finish. +func (devices *DeviceSet) removeDeviceAndWait(devname string) error { + var err error + + for i := 0; i < 1000; i++ { + devices.sawBusy = false + err = removeDevice(devname) + if err == nil { + break + } + if !devices.sawBusy { + return err + } + + // If we see EBUSY it may be a transient error, + // sleep a bit a retry a few times. + devices.Unlock() + time.Sleep(10 * time.Millisecond) + devices.Lock() + } + if err != nil { + return err + } + + if err := devices.waitRemove(devname); err != nil { + return err + } + return nil +} + +// waitRemove blocks until either: +// a) the device registered at <device_set_prefix>-<hash> is removed, +// or b) the 10 second timeout expires. +func (devices *DeviceSet) waitRemove(devname string) error { + utils.Debugf("[deviceset %s] waitRemove(%s)", devices.devicePrefix, devname) + defer utils.Debugf("[deviceset %s] waitRemove(%s) END", devices.devicePrefix, devname) + i := 0 + for ; i < 1000; i += 1 { + devinfo, err := getInfo(devname) + if err != nil { + // If there is an error we assume the device doesn't exist. + // The error might actually be something else, but we can't differentiate. 
+ return nil + } + if i%100 == 0 { + utils.Debugf("Waiting for removal of %s: exists=%d", devname, devinfo.Exists) + } + if devinfo.Exists == 0 { + break + } + + devices.Unlock() + time.Sleep(10 * time.Millisecond) + devices.Lock() + } + if i == 1000 { + return fmt.Errorf("Timeout while waiting for device %s to be removed", devname) + } + return nil +} + +// waitClose blocks until either: +// a) the device registered at <device_set_prefix>-<hash> is closed, +// or b) the 10 second timeout expires. +func (devices *DeviceSet) waitClose(info *DevInfo) error { + i := 0 + for ; i < 1000; i += 1 { + devinfo, err := getInfo(info.Name()) + if err != nil { + return err + } + if i%100 == 0 { + utils.Debugf("Waiting for unmount of %s: opencount=%d", info.Hash, devinfo.OpenCount) + } + if devinfo.OpenCount == 0 { + break + } + devices.Unlock() + time.Sleep(10 * time.Millisecond) + devices.Lock() + } + if i == 1000 { + return fmt.Errorf("Timeout while waiting for device %s to close", info.Hash) + } + return nil +} + +func (devices *DeviceSet) Shutdown() error { + + utils.Debugf("[deviceset %s] shutdown()", devices.devicePrefix) + utils.Debugf("[devmapper] Shutting down DeviceSet: %s", devices.root) + defer utils.Debugf("[deviceset %s] shutdown END", devices.devicePrefix) + + var devs []*DevInfo + + devices.devicesLock.Lock() + for _, info := range devices.Devices { + devs = append(devs, info) + } + devices.devicesLock.Unlock() + + for _, info := range devs { + info.lock.Lock() + if info.mountCount > 0 { + // We use MNT_DETACH here in case it is still busy in some running + // container. This means it'll go away from the global scope directly, + // and the device will be released when that container dies. 
+ if err := sysUnmount(info.mountPath, syscall.MNT_DETACH); err != nil { + utils.Debugf("Shutdown unmounting %s, error: %s\n", info.mountPath, err) + } + + devices.Lock() + if err := devices.deactivateDevice(info); err != nil { + utils.Debugf("Shutdown deactivate %s , error: %s\n", info.Hash, err) + } + devices.Unlock() + } + info.lock.Unlock() + } + + info, _ := devices.lookupDevice("") + if info != nil { + info.lock.Lock() + devices.Lock() + if err := devices.deactivateDevice(info); err != nil { + utils.Debugf("Shutdown deactivate base , error: %s\n", err) + } + devices.Unlock() + info.lock.Unlock() + } + + devices.Lock() + if err := devices.deactivatePool(); err != nil { + utils.Debugf("Shutdown deactivate pool , error: %s\n", err) + } + devices.Unlock() + + return nil +} + +func (devices *DeviceSet) MountDevice(hash, path string, mountLabel string) error { + info, err := devices.lookupDevice(hash) + if err != nil { + return err + } + + info.lock.Lock() + defer info.lock.Unlock() + + devices.Lock() + defer devices.Unlock() + + if info.mountCount > 0 { + if path != info.mountPath { + return fmt.Errorf("Trying to mount devmapper device in multple places (%s, %s)", info.mountPath, path) + } + + if info.floating { + // Steal floating ref + info.floating = false + } else { + info.mountCount++ + } + return nil + } + + if err := devices.activateDeviceIfNeeded(info); err != nil { + return fmt.Errorf("Error activating devmapper device for '%s': %s", hash, err) + } + + var flags uintptr = sysMsMgcVal + + mountOptions := label.FormatMountLabel("discard", mountLabel) + err = sysMount(info.DevName(), path, "ext4", flags, mountOptions) + if err != nil && err == sysEInval { + mountOptions = label.FormatMountLabel(mountLabel, "") + err = sysMount(info.DevName(), path, "ext4", flags, mountOptions) + } + if err != nil { + return fmt.Errorf("Error mounting '%s' on '%s': %s", info.DevName(), path, err) + } + + info.mountCount = 1 + info.mountPath = path + info.floating = false + + 
return devices.setInitialized(info) +} + +func (devices *DeviceSet) UnmountDevice(hash string, mode UnmountMode) error { + utils.Debugf("[devmapper] UnmountDevice(hash=%s, mode=%d)", hash, mode) + defer utils.Debugf("[devmapper] UnmountDevice END") + + info, err := devices.lookupDevice(hash) + if err != nil { + return err + } + + info.lock.Lock() + defer info.lock.Unlock() + + devices.Lock() + defer devices.Unlock() + + if mode == UnmountFloat { + if info.floating { + return fmt.Errorf("UnmountDevice: can't float floating reference %s\n", hash) + } + + // Leave this reference floating + info.floating = true + return nil + } + + if mode == UnmountSink { + if !info.floating { + // Someone already sunk this + return nil + } + // Otherwise, treat this as a regular unmount + } + + if info.mountCount == 0 { + return fmt.Errorf("UnmountDevice: device not-mounted id %s\n", hash) + } + + info.mountCount-- + if info.mountCount > 0 { + return nil + } + + utils.Debugf("[devmapper] Unmount(%s)", info.mountPath) + if err := sysUnmount(info.mountPath, 0); err != nil { + utils.Debugf("\n--->Err: %s\n", err) + return err + } + utils.Debugf("[devmapper] Unmount done") + + if err := devices.deactivateDevice(info); err != nil { + return err + } + + info.mountPath = "" + + return nil +} + +func (devices *DeviceSet) HasDevice(hash string) bool { + devices.Lock() + defer devices.Unlock() + + info, _ := devices.lookupDevice(hash) + return info != nil +} + +func (devices *DeviceSet) HasInitializedDevice(hash string) bool { + devices.Lock() + defer devices.Unlock() + + info, _ := devices.lookupDevice(hash) + return info != nil && info.Initialized +} + +func (devices *DeviceSet) HasActivatedDevice(hash string) bool { + info, _ := devices.lookupDevice(hash) + if info == nil { + return false + } + + info.lock.Lock() + defer info.lock.Unlock() + + devices.Lock() + defer devices.Unlock() + + devinfo, _ := getInfo(info.Name()) + return devinfo != nil && devinfo.Exists != 0 +} + +func (devices 
*DeviceSet) setInitialized(info *DevInfo) error { + info.Initialized = true + if err := devices.saveMetadata(); err != nil { + info.Initialized = false + utils.Debugf("\n--->Err: %s\n", err) + return err + } + + return nil +} + +func (devices *DeviceSet) List() []string { + devices.Lock() + defer devices.Unlock() + + devices.devicesLock.Lock() + ids := make([]string, len(devices.Devices)) + i := 0 + for k := range devices.Devices { + ids[i] = k + i++ + } + devices.devicesLock.Unlock() + + return ids +} + +func (devices *DeviceSet) deviceStatus(devName string) (sizeInSectors, mappedSectors, highestMappedSector uint64, err error) { + var params string + _, sizeInSectors, _, params, err = getStatus(devName) + if err != nil { + return + } + if _, err = fmt.Sscanf(params, "%d %d", &mappedSectors, &highestMappedSector); err == nil { + return + } + return +} + +func (devices *DeviceSet) GetDeviceStatus(hash string) (*DevStatus, error) { + info, err := devices.lookupDevice(hash) + if err != nil { + return nil, err + } + + info.lock.Lock() + defer info.lock.Unlock() + + devices.Lock() + defer devices.Unlock() + + status := &DevStatus{ + DeviceId: info.DeviceId, + Size: info.Size, + TransactionId: info.TransactionId, + } + + if err := devices.activateDeviceIfNeeded(info); err != nil { + return nil, fmt.Errorf("Error activating devmapper device for '%s': %s", hash, err) + } + + if sizeInSectors, mappedSectors, highestMappedSector, err := devices.deviceStatus(info.DevName()); err != nil { + return nil, err + } else { + status.SizeInSectors = sizeInSectors + status.MappedSectors = mappedSectors + status.HighestMappedSector = highestMappedSector + } + + return status, nil +} + +func (devices *DeviceSet) poolStatus() (totalSizeInSectors, transactionId, dataUsed, dataTotal, metadataUsed, metadataTotal uint64, err error) { + var params string + if _, totalSizeInSectors, _, params, err = getStatus(devices.getPoolName()); err == nil { + _, err = fmt.Sscanf(params, "%d %d/%d %d/%d", 
&transactionId, &metadataUsed, &metadataTotal, &dataUsed, &dataTotal) + } + return +} + +func (devices *DeviceSet) Status() *Status { + devices.Lock() + defer devices.Unlock() + + status := &Status{} + + status.PoolName = devices.getPoolName() + status.DataLoopback = path.Join(devices.loopbackDir(), "data") + status.MetadataLoopback = path.Join(devices.loopbackDir(), "metadata") + + totalSizeInSectors, _, dataUsed, dataTotal, metadataUsed, metadataTotal, err := devices.poolStatus() + if err == nil { + // Convert from blocks to bytes + blockSizeInSectors := totalSizeInSectors / dataTotal + + status.Data.Used = dataUsed * blockSizeInSectors * 512 + status.Data.Total = dataTotal * blockSizeInSectors * 512 + + // metadata blocks are always 4k + status.Metadata.Used = metadataUsed * 4096 + status.Metadata.Total = metadataTotal * 4096 + + status.SectorSize = blockSizeInSectors * 512 + } + + return status +} + +func NewDeviceSet(root string, doInit bool) (*DeviceSet, error) { + SetDevDir("/dev") + + devices := &DeviceSet{ + root: root, + MetaData: MetaData{Devices: make(map[string]*DevInfo)}, + } + + if err := devices.initDevmapper(doInit); err != nil { + return nil, err + } + + return devices, nil +} diff --git a/runtime/graphdriver/devmapper/devmapper.go b/runtime/graphdriver/devmapper/devmapper.go new file mode 100644 index 0000000000..7317118dcf --- /dev/null +++ b/runtime/graphdriver/devmapper/devmapper.go @@ -0,0 +1,595 @@ +// +build linux,amd64 + +package devmapper + +import ( + "errors" + "fmt" + "github.com/dotcloud/docker/utils" + "runtime" + "syscall" +) + +type DevmapperLogger interface { + log(level int, file string, line int, dmError int, message string) +} + +const ( + DeviceCreate TaskType = iota + DeviceReload + DeviceRemove + DeviceRemoveAll + DeviceSuspend + DeviceResume + DeviceInfo + DeviceDeps + DeviceRename + DeviceVersion + DeviceStatus + DeviceTable + DeviceWaitevent + DeviceList + DeviceClear + DeviceMknodes + DeviceListVersions + DeviceTargetMsg 
+ DeviceSetGeometry +) + +const ( + AddNodeOnResume AddNodeType = iota + AddNodeOnCreate +) + +var ( + ErrTaskRun = errors.New("dm_task_run failed") + ErrTaskSetName = errors.New("dm_task_set_name failed") + ErrTaskSetMessage = errors.New("dm_task_set_message failed") + ErrTaskSetAddNode = errors.New("dm_task_set_add_node failed") + ErrTaskSetRo = errors.New("dm_task_set_ro failed") + ErrTaskAddTarget = errors.New("dm_task_add_target failed") + ErrTaskSetSector = errors.New("dm_task_set_sector failed") + ErrTaskGetInfo = errors.New("dm_task_get_info failed") + ErrTaskSetCookie = errors.New("dm_task_set_cookie failed") + ErrNilCookie = errors.New("cookie ptr can't be nil") + ErrAttachLoopbackDevice = errors.New("loopback mounting failed") + ErrGetBlockSize = errors.New("Can't get block size") + ErrUdevWait = errors.New("wait on udev cookie failed") + ErrSetDevDir = errors.New("dm_set_dev_dir failed") + ErrGetLibraryVersion = errors.New("dm_get_library_version failed") + ErrCreateRemoveTask = errors.New("Can't create task of type DeviceRemove") + ErrRunRemoveDevice = errors.New("running removeDevice failed") + ErrInvalidAddNode = errors.New("Invalide AddNoce type") + ErrGetLoopbackBackingFile = errors.New("Unable to get loopback backing file") + ErrLoopbackSetCapacity = errors.New("Unable set loopback capacity") +) + +type ( + Task struct { + unmanaged *CDmTask + } + Info struct { + Exists int + Suspended int + LiveTable int + InactiveTable int + OpenCount int32 + EventNr uint32 + Major uint32 + Minor uint32 + ReadOnly int + TargetCount int32 + } + TaskType int + AddNodeType int +) + +func (t *Task) destroy() { + if t != nil { + DmTaskDestroy(t.unmanaged) + runtime.SetFinalizer(t, nil) + } +} + +func TaskCreate(tasktype TaskType) *Task { + Ctask := DmTaskCreate(int(tasktype)) + if Ctask == nil { + return nil + } + task := &Task{unmanaged: Ctask} + runtime.SetFinalizer(task, (*Task).destroy) + return task +} + +func (t *Task) Run() error { + if res := 
DmTaskRun(t.unmanaged); res != 1 { + return ErrTaskRun + } + return nil +} + +func (t *Task) SetName(name string) error { + if res := DmTaskSetName(t.unmanaged, name); res != 1 { + return ErrTaskSetName + } + return nil +} + +func (t *Task) SetMessage(message string) error { + if res := DmTaskSetMessage(t.unmanaged, message); res != 1 { + return ErrTaskSetMessage + } + return nil +} + +func (t *Task) SetSector(sector uint64) error { + if res := DmTaskSetSector(t.unmanaged, sector); res != 1 { + return ErrTaskSetSector + } + return nil +} + +func (t *Task) SetCookie(cookie *uint, flags uint16) error { + if cookie == nil { + return ErrNilCookie + } + if res := DmTaskSetCookie(t.unmanaged, cookie, flags); res != 1 { + return ErrTaskSetCookie + } + return nil +} + +func (t *Task) SetAddNode(addNode AddNodeType) error { + if addNode != AddNodeOnResume && addNode != AddNodeOnCreate { + return ErrInvalidAddNode + } + if res := DmTaskSetAddNode(t.unmanaged, addNode); res != 1 { + return ErrTaskSetAddNode + } + return nil +} + +func (t *Task) SetRo() error { + if res := DmTaskSetRo(t.unmanaged); res != 1 { + return ErrTaskSetRo + } + return nil +} + +func (t *Task) AddTarget(start, size uint64, ttype, params string) error { + if res := DmTaskAddTarget(t.unmanaged, start, size, + ttype, params); res != 1 { + return ErrTaskAddTarget + } + return nil +} + +func (t *Task) GetInfo() (*Info, error) { + info := &Info{} + if res := DmTaskGetInfo(t.unmanaged, info); res != 1 { + return nil, ErrTaskGetInfo + } + return info, nil +} + +func (t *Task) GetNextTarget(next uintptr) (nextPtr uintptr, start uint64, + length uint64, targetType string, params string) { + + return DmGetNextTarget(t.unmanaged, next, &start, &length, + &targetType, &params), + start, length, targetType, params +} + +func getLoopbackBackingFile(file *osFile) (uint64, uint64, error) { + loopInfo, err := ioctlLoopGetStatus64(file.Fd()) + if err != nil { + utils.Errorf("Error get loopback backing file: %s\n", err) + 
return 0, 0, ErrGetLoopbackBackingFile + } + return loopInfo.loDevice, loopInfo.loInode, nil +} + +func LoopbackSetCapacity(file *osFile) error { + if err := ioctlLoopSetCapacity(file.Fd(), 0); err != nil { + utils.Errorf("Error loopbackSetCapacity: %s", err) + return ErrLoopbackSetCapacity + } + return nil +} + +func FindLoopDeviceFor(file *osFile) *osFile { + stat, err := file.Stat() + if err != nil { + return nil + } + targetInode := stat.Sys().(*sysStatT).Ino + targetDevice := stat.Sys().(*sysStatT).Dev + + for i := 0; true; i++ { + path := fmt.Sprintf("/dev/loop%d", i) + + file, err := osOpenFile(path, osORdWr, 0) + if err != nil { + if osIsNotExist(err) { + return nil + } + + // Ignore all errors until the first not-exist + // we want to continue looking for the file + continue + } + + dev, inode, err := getLoopbackBackingFile(file) + if err == nil && dev == targetDevice && inode == targetInode { + return file + } + file.Close() + } + + return nil +} + +func UdevWait(cookie uint) error { + if res := DmUdevWait(cookie); res != 1 { + utils.Debugf("Failed to wait on udev cookie %d", cookie) + return ErrUdevWait + } + return nil +} + +func LogInitVerbose(level int) { + DmLogInitVerbose(level) +} + +var dmLogger DevmapperLogger = nil + +func logInit(logger DevmapperLogger) { + dmLogger = logger + LogWithErrnoInit() +} + +func SetDevDir(dir string) error { + if res := DmSetDevDir(dir); res != 1 { + utils.Debugf("Error dm_set_dev_dir") + return ErrSetDevDir + } + return nil +} + +func GetLibraryVersion() (string, error) { + var version string + if res := DmGetLibraryVersion(&version); res != 1 { + return "", ErrGetLibraryVersion + } + return version, nil +} + +// Useful helper for cleanup +func RemoveDevice(name string) error { + task := TaskCreate(DeviceRemove) + if task == nil { + return ErrCreateRemoveTask + } + if err := task.SetName(name); err != nil { + utils.Debugf("Can't set task name %s", name) + return err + } + if err := task.Run(); err != nil { + return 
ErrRunRemoveDevice + } + return nil +} + +func GetBlockDeviceSize(file *osFile) (uint64, error) { + size, err := ioctlBlkGetSize64(file.Fd()) + if err != nil { + utils.Errorf("Error getblockdevicesize: %s", err) + return 0, ErrGetBlockSize + } + return uint64(size), nil +} + +func BlockDeviceDiscard(path string) error { + file, err := osOpenFile(path, osORdWr, 0) + if err != nil { + return err + } + defer file.Close() + + size, err := GetBlockDeviceSize(file) + if err != nil { + return err + } + + if err := ioctlBlkDiscard(file.Fd(), 0, size); err != nil { + return err + } + + // Without this sometimes the remove of the device that happens after + // discard fails with EBUSY. + syscall.Sync() + + return nil +} + +// This is the programmatic example of "dmsetup create" +func createPool(poolName string, dataFile, metadataFile *osFile) error { + task, err := createTask(DeviceCreate, poolName) + if task == nil { + return err + } + + size, err := GetBlockDeviceSize(dataFile) + if err != nil { + return fmt.Errorf("Can't get data size") + } + + params := metadataFile.Name() + " " + dataFile.Name() + " 128 32768 1 skip_block_zeroing" + if err := task.AddTarget(0, size/512, "thin-pool", params); err != nil { + return fmt.Errorf("Can't add target") + } + + var cookie uint = 0 + if err := task.SetCookie(&cookie, 0); err != nil { + return fmt.Errorf("Can't set cookie") + } + + if err := task.Run(); err != nil { + return fmt.Errorf("Error running DeviceCreate (createPool)") + } + + UdevWait(cookie) + + return nil +} + +func reloadPool(poolName string, dataFile, metadataFile *osFile) error { + task, err := createTask(DeviceReload, poolName) + if task == nil { + return err + } + + size, err := GetBlockDeviceSize(dataFile) + if err != nil { + return fmt.Errorf("Can't get data size") + } + + params := metadataFile.Name() + " " + dataFile.Name() + " 128 32768" + if err := task.AddTarget(0, size/512, "thin-pool", params); err != nil { + return fmt.Errorf("Can't add target") + } + + 
if err := task.Run(); err != nil { + return fmt.Errorf("Error running DeviceCreate") + } + + return nil +} + +func createTask(t TaskType, name string) (*Task, error) { + task := TaskCreate(t) + if task == nil { + return nil, fmt.Errorf("Can't create task of type %d", int(t)) + } + if err := task.SetName(name); err != nil { + return nil, fmt.Errorf("Can't set task name %s", name) + } + return task, nil +} + +func getInfo(name string) (*Info, error) { + task, err := createTask(DeviceInfo, name) + if task == nil { + return nil, err + } + if err := task.Run(); err != nil { + return nil, err + } + return task.GetInfo() +} + +func getStatus(name string) (uint64, uint64, string, string, error) { + task, err := createTask(DeviceStatus, name) + if task == nil { + utils.Debugf("getStatus: Error createTask: %s", err) + return 0, 0, "", "", err + } + if err := task.Run(); err != nil { + utils.Debugf("getStatus: Error Run: %s", err) + return 0, 0, "", "", err + } + + devinfo, err := task.GetInfo() + if err != nil { + utils.Debugf("getStatus: Error GetInfo: %s", err) + return 0, 0, "", "", err + } + if devinfo.Exists == 0 { + utils.Debugf("getStatus: Non existing device %s", name) + return 0, 0, "", "", fmt.Errorf("Non existing device %s", name) + } + + _, start, length, targetType, params := task.GetNextTarget(0) + return start, length, targetType, params, nil +} + +func setTransactionId(poolName string, oldId uint64, newId uint64) error { + task, err := createTask(DeviceTargetMsg, poolName) + if task == nil { + return err + } + + if err := task.SetSector(0); err != nil { + return fmt.Errorf("Can't set sector") + } + + if err := task.SetMessage(fmt.Sprintf("set_transaction_id %d %d", oldId, newId)); err != nil { + return fmt.Errorf("Can't set message") + } + + if err := task.Run(); err != nil { + return fmt.Errorf("Error running setTransactionId") + } + return nil +} + +func suspendDevice(name string) error { + task, err := createTask(DeviceSuspend, name) + if task == nil { + 
return err + } + if err := task.Run(); err != nil { + return fmt.Errorf("Error running DeviceSuspend: %s", err) + } + return nil +} + +func resumeDevice(name string) error { + task, err := createTask(DeviceResume, name) + if task == nil { + return err + } + + var cookie uint = 0 + if err := task.SetCookie(&cookie, 0); err != nil { + return fmt.Errorf("Can't set cookie") + } + + if err := task.Run(); err != nil { + return fmt.Errorf("Error running DeviceResume") + } + + UdevWait(cookie) + + return nil +} + +func createDevice(poolName string, deviceId int) error { + utils.Debugf("[devmapper] createDevice(poolName=%v, deviceId=%v)", poolName, deviceId) + task, err := createTask(DeviceTargetMsg, poolName) + if task == nil { + return err + } + + if err := task.SetSector(0); err != nil { + return fmt.Errorf("Can't set sector") + } + + if err := task.SetMessage(fmt.Sprintf("create_thin %d", deviceId)); err != nil { + return fmt.Errorf("Can't set message") + } + + if err := task.Run(); err != nil { + return fmt.Errorf("Error running createDevice") + } + return nil +} + +func deleteDevice(poolName string, deviceId int) error { + task, err := createTask(DeviceTargetMsg, poolName) + if task == nil { + return err + } + + if err := task.SetSector(0); err != nil { + return fmt.Errorf("Can't set sector") + } + + if err := task.SetMessage(fmt.Sprintf("delete %d", deviceId)); err != nil { + return fmt.Errorf("Can't set message") + } + + if err := task.Run(); err != nil { + return fmt.Errorf("Error running deleteDevice") + } + return nil +} + +func removeDevice(name string) error { + utils.Debugf("[devmapper] removeDevice START") + defer utils.Debugf("[devmapper] removeDevice END") + task, err := createTask(DeviceRemove, name) + if task == nil { + return err + } + if err = task.Run(); err != nil { + return fmt.Errorf("Error running removeDevice") + } + return nil +} + +func activateDevice(poolName string, name string, deviceId int, size uint64) error { + task, err := 
createTask(DeviceCreate, name) + if task == nil { + return err + } + + params := fmt.Sprintf("%s %d", poolName, deviceId) + if err := task.AddTarget(0, size/512, "thin", params); err != nil { + return fmt.Errorf("Can't add target") + } + if err := task.SetAddNode(AddNodeOnCreate); err != nil { + return fmt.Errorf("Can't add node") + } + + var cookie uint = 0 + if err := task.SetCookie(&cookie, 0); err != nil { + return fmt.Errorf("Can't set cookie") + } + + if err := task.Run(); err != nil { + return fmt.Errorf("Error running DeviceCreate (activateDevice)") + } + + UdevWait(cookie) + + return nil +} + +func (devices *DeviceSet) createSnapDevice(poolName string, deviceId int, baseName string, baseDeviceId int) error { + devinfo, _ := getInfo(baseName) + doSuspend := devinfo != nil && devinfo.Exists != 0 + + if doSuspend { + if err := suspendDevice(baseName); err != nil { + return err + } + } + + task, err := createTask(DeviceTargetMsg, poolName) + if task == nil { + if doSuspend { + resumeDevice(baseName) + } + return err + } + + if err := task.SetSector(0); err != nil { + if doSuspend { + resumeDevice(baseName) + } + return fmt.Errorf("Can't set sector") + } + + if err := task.SetMessage(fmt.Sprintf("create_snap %d %d", deviceId, baseDeviceId)); err != nil { + if doSuspend { + resumeDevice(baseName) + } + return fmt.Errorf("Can't set message") + } + + if err := task.Run(); err != nil { + if doSuspend { + resumeDevice(baseName) + } + return fmt.Errorf("Error running DeviceCreate (createSnapDevice)") + } + + if doSuspend { + if err := resumeDevice(baseName); err != nil { + return err + } + } + + return nil +} diff --git a/runtime/graphdriver/devmapper/devmapper_doc.go b/runtime/graphdriver/devmapper/devmapper_doc.go new file mode 100644 index 0000000000..c1c3e3891b --- /dev/null +++ b/runtime/graphdriver/devmapper/devmapper_doc.go @@ -0,0 +1,106 @@ +package devmapper + +// Definition of struct dm_task and sub structures (from lvm2) +// +// struct dm_ioctl { +// /* 
+// * The version number is made up of three parts: +// * major - no backward or forward compatibility, +// * minor - only backwards compatible, +// * patch - both backwards and forwards compatible. +// * +// * All clients of the ioctl interface should fill in the +// * version number of the interface that they were +// * compiled with. +// * +// * All recognised ioctl commands (ie. those that don't +// * return -ENOTTY) fill out this field, even if the +// * command failed. +// */ +// uint32_t version[3]; /* in/out */ +// uint32_t data_size; /* total size of data passed in +// * including this struct */ + +// uint32_t data_start; /* offset to start of data +// * relative to start of this struct */ + +// uint32_t target_count; /* in/out */ +// int32_t open_count; /* out */ +// uint32_t flags; /* in/out */ + +// /* +// * event_nr holds either the event number (input and output) or the +// * udev cookie value (input only). +// * The DM_DEV_WAIT ioctl takes an event number as input. +// * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls +// * use the field as a cookie to return in the DM_COOKIE +// * variable with the uevents they issue. +// * For output, the ioctls return the event number, not the cookie. 
+// */ +// uint32_t event_nr; /* in/out */ +// uint32_t padding; + +// uint64_t dev; /* in/out */ + +// char name[DM_NAME_LEN]; /* device name */ +// char uuid[DM_UUID_LEN]; /* unique identifier for +// * the block device */ +// char data[7]; /* padding or data */ +// }; + +// struct target { +// uint64_t start; +// uint64_t length; +// char *type; +// char *params; + +// struct target *next; +// }; + +// typedef enum { +// DM_ADD_NODE_ON_RESUME, /* add /dev/mapper node with dmsetup resume */ +// DM_ADD_NODE_ON_CREATE /* add /dev/mapper node with dmsetup create */ +// } dm_add_node_t; + +// struct dm_task { +// int type; +// char *dev_name; +// char *mangled_dev_name; + +// struct target *head, *tail; + +// int read_only; +// uint32_t event_nr; +// int major; +// int minor; +// int allow_default_major_fallback; +// uid_t uid; +// gid_t gid; +// mode_t mode; +// uint32_t read_ahead; +// uint32_t read_ahead_flags; +// union { +// struct dm_ioctl *v4; +// } dmi; +// char *newname; +// char *message; +// char *geometry; +// uint64_t sector; +// int no_flush; +// int no_open_count; +// int skip_lockfs; +// int query_inactive_table; +// int suppress_identical_reload; +// dm_add_node_t add_node; +// uint64_t existing_table_size; +// int cookie_set; +// int new_uuid; +// int secure_data; +// int retry_remove; +// int enable_checks; +// int expected_errno; + +// char *uuid; +// char *mangled_uuid; +// }; +// diff --git a/runtime/graphdriver/devmapper/devmapper_log.go b/runtime/graphdriver/devmapper/devmapper_log.go new file mode 100644 index 0000000000..18dde7cca5 --- /dev/null +++ b/runtime/graphdriver/devmapper/devmapper_log.go @@ -0,0 +1,15 @@ +// +build linux,amd64 + +package devmapper + +import "C" + +// Due to the way cgo works this has to be in a separate file, as devmapper.go has +// definitions in the cgo block, which is incompatible with using "//export" + +//export DevmapperLogCallback +func DevmapperLogCallback(level C.int, file *C.char, line C.int, 
dm_errno_or_class C.int, message *C.char) { + if dmLogger != nil { + dmLogger.log(int(level), C.GoString(file), int(line), int(dm_errno_or_class), C.GoString(message)) + } +} diff --git a/runtime/graphdriver/devmapper/devmapper_test.go b/runtime/graphdriver/devmapper/devmapper_test.go new file mode 100644 index 0000000000..3ffa163ceb --- /dev/null +++ b/runtime/graphdriver/devmapper/devmapper_test.go @@ -0,0 +1,287 @@ +// +build linux,amd64 + +package devmapper + +import ( + "testing" +) + +func TestTaskCreate(t *testing.T) { + t.Skip("FIXME: not a unit test") + // Test success + taskCreate(t, DeviceInfo) + + // Test Failure + DmTaskCreate = dmTaskCreateFail + defer func() { DmTaskCreate = dmTaskCreateFct }() + if task := TaskCreate(-1); task != nil { + t.Fatalf("An error should have occured while creating an invalid task.") + } +} + +func TestTaskRun(t *testing.T) { + t.Skip("FIXME: not a unit test") + task := taskCreate(t, DeviceInfo) + + // Test success + // Perform the RUN + if err := task.Run(); err != nil { + t.Fatal(err) + } + // Make sure we don't have error with GetInfo + if _, err := task.GetInfo(); err != nil { + t.Fatal(err) + } + + // Test failure + DmTaskRun = dmTaskRunFail + defer func() { DmTaskRun = dmTaskRunFct }() + + task = taskCreate(t, DeviceInfo) + // Perform the RUN + if err := task.Run(); err != ErrTaskRun { + t.Fatalf("An error should have occured while running task.") + } + // Make sure GetInfo also fails + if _, err := task.GetInfo(); err != ErrTaskGetInfo { + t.Fatalf("GetInfo should fail if task.Run() failed.") + } +} + +func TestTaskSetName(t *testing.T) { + t.Skip("FIXME: not a unit test") + task := taskCreate(t, DeviceInfo) + + // Test success + if err := task.SetName("test"); err != nil { + t.Fatal(err) + } + + // Test failure + DmTaskSetName = dmTaskSetNameFail + defer func() { DmTaskSetName = dmTaskSetNameFct }() + + if err := task.SetName("test"); err != ErrTaskSetName { + t.Fatalf("An error should have occured while runnign 
SetName.") + } +} + +func TestTaskSetMessage(t *testing.T) { + t.Skip("FIXME: not a unit test") + task := taskCreate(t, DeviceInfo) + + // Test success + if err := task.SetMessage("test"); err != nil { + t.Fatal(err) + } + + // Test failure + DmTaskSetMessage = dmTaskSetMessageFail + defer func() { DmTaskSetMessage = dmTaskSetMessageFct }() + + if err := task.SetMessage("test"); err != ErrTaskSetMessage { + t.Fatalf("An error should have occured while runnign SetMessage.") + } +} + +func TestTaskSetSector(t *testing.T) { + t.Skip("FIXME: not a unit test") + task := taskCreate(t, DeviceInfo) + + // Test success + if err := task.SetSector(128); err != nil { + t.Fatal(err) + } + + DmTaskSetSector = dmTaskSetSectorFail + defer func() { DmTaskSetSector = dmTaskSetSectorFct }() + + // Test failure + if err := task.SetSector(0); err != ErrTaskSetSector { + t.Fatalf("An error should have occured while running SetSector.") + } +} + +func TestTaskSetCookie(t *testing.T) { + t.Skip("FIXME: not a unit test") + var ( + cookie uint = 0 + task = taskCreate(t, DeviceInfo) + ) + + // Test success + if err := task.SetCookie(&cookie, 0); err != nil { + t.Fatal(err) + } + + // Test failure + if err := task.SetCookie(nil, 0); err != ErrNilCookie { + t.Fatalf("An error should have occured while running SetCookie with nil cookie.") + } + + DmTaskSetCookie = dmTaskSetCookieFail + defer func() { DmTaskSetCookie = dmTaskSetCookieFct }() + + if err := task.SetCookie(&cookie, 0); err != ErrTaskSetCookie { + t.Fatalf("An error should have occured while running SetCookie.") + } +} + +func TestTaskSetAddNode(t *testing.T) { + t.Skip("FIXME: not a unit test") + task := taskCreate(t, DeviceInfo) + + // Test success + if err := task.SetAddNode(0); err != nil { + t.Fatal(err) + } + + // Test failure + if err := task.SetAddNode(-1); err != ErrInvalidAddNode { + t.Fatalf("An error should have occured running SetAddNode with wrong node.") + } + + DmTaskSetAddNode = dmTaskSetAddNodeFail + defer func() { 
DmTaskSetAddNode = dmTaskSetAddNodeFct }() + + if err := task.SetAddNode(0); err != ErrTaskSetAddNode { + t.Fatalf("An error should have occured running SetAddNode.") + } +} + +func TestTaskSetRo(t *testing.T) { + t.Skip("FIXME: not a unit test") + task := taskCreate(t, DeviceInfo) + + // Test success + if err := task.SetRo(); err != nil { + t.Fatal(err) + } + + // Test failure + DmTaskSetRo = dmTaskSetRoFail + defer func() { DmTaskSetRo = dmTaskSetRoFct }() + + if err := task.SetRo(); err != ErrTaskSetRo { + t.Fatalf("An error should have occured running SetRo.") + } +} + +func TestTaskAddTarget(t *testing.T) { + t.Skip("FIXME: not a unit test") + task := taskCreate(t, DeviceInfo) + + // Test success + if err := task.AddTarget(0, 128, "thinp", ""); err != nil { + t.Fatal(err) + } + + // Test failure + DmTaskAddTarget = dmTaskAddTargetFail + defer func() { DmTaskAddTarget = dmTaskAddTargetFct }() + + if err := task.AddTarget(0, 128, "thinp", ""); err != ErrTaskAddTarget { + t.Fatalf("An error should have occured running AddTarget.") + } +} + +// func TestTaskGetInfo(t *testing.T) { +// task := taskCreate(t, DeviceInfo) + +// // Test success +// if _, err := task.GetInfo(); err != nil { +// t.Fatal(err) +// } + +// // Test failure +// DmTaskGetInfo = dmTaskGetInfoFail +// defer func() { DmTaskGetInfo = dmTaskGetInfoFct }() + +// if _, err := task.GetInfo(); err != ErrTaskGetInfo { +// t.Fatalf("An error should have occured running GetInfo.") +// } +// } + +// func TestTaskGetNextTarget(t *testing.T) { +// task := taskCreate(t, DeviceInfo) + +// if next, _, _, _, _ := task.GetNextTarget(0); next == 0 { +// t.Fatalf("The next target should not be 0.") +// } +// } + +/// Utils +func taskCreate(t *testing.T, taskType TaskType) *Task { + task := TaskCreate(taskType) + if task == nil { + t.Fatalf("Error creating task") + } + return task +} + +/// Failure function replacement +func dmTaskCreateFail(t int) *CDmTask { + return nil +} + +func dmTaskRunFail(task *CDmTask) int 
{ + return -1 +} + +func dmTaskSetNameFail(task *CDmTask, name string) int { + return -1 +} + +func dmTaskSetMessageFail(task *CDmTask, message string) int { + return -1 +} + +func dmTaskSetSectorFail(task *CDmTask, sector uint64) int { + return -1 +} + +func dmTaskSetCookieFail(task *CDmTask, cookie *uint, flags uint16) int { + return -1 +} + +func dmTaskSetAddNodeFail(task *CDmTask, addNode AddNodeType) int { + return -1 +} + +func dmTaskSetRoFail(task *CDmTask) int { + return -1 +} + +func dmTaskAddTargetFail(task *CDmTask, + start, size uint64, ttype, params string) int { + return -1 +} + +func dmTaskGetInfoFail(task *CDmTask, info *Info) int { + return -1 +} + +func dmGetNextTargetFail(task *CDmTask, next uintptr, start, length *uint64, + target, params *string) uintptr { + return 0 +} + +func dmAttachLoopDeviceFail(filename string, fd *int) string { + return "" +} + +func sysGetBlockSizeFail(fd uintptr, size *uint64) sysErrno { + return 1 +} + +func dmUdevWaitFail(cookie uint) int { + return -1 +} + +func dmSetDevDirFail(dir string) int { + return -1 +} + +func dmGetLibraryVersionFail(version *string) int { + return -1 +} diff --git a/runtime/graphdriver/devmapper/devmapper_wrapper.go b/runtime/graphdriver/devmapper/devmapper_wrapper.go new file mode 100644 index 0000000000..bf558affc8 --- /dev/null +++ b/runtime/graphdriver/devmapper/devmapper_wrapper.go @@ -0,0 +1,229 @@ +// +build linux,amd64 + +package devmapper + +/* +#cgo LDFLAGS: -L. -ldevmapper +#include <libdevmapper.h> +#include <linux/loop.h> // FIXME: present only for defines, maybe we can remove it? +#include <linux/fs.h> // FIXME: present only for BLKGETSIZE64, maybe we can remove it? + +#ifndef LOOP_CTL_GET_FREE + #define LOOP_CTL_GET_FREE 0x4C82 +#endif + +#ifndef LO_FLAGS_PARTSCAN + #define LO_FLAGS_PARTSCAN 8 +#endif + +// FIXME: Can't we find a way to do the logging in pure Go? 
+extern void DevmapperLogCallback(int level, char *file, int line, int dm_errno_or_class, char *str); + +static void log_cb(int level, const char *file, int line, int dm_errno_or_class, const char *f, ...) +{ + char buffer[256]; + va_list ap; + + va_start(ap, f); + vsnprintf(buffer, 256, f, ap); + va_end(ap); + + DevmapperLogCallback(level, (char *)file, line, dm_errno_or_class, buffer); +} + +static void log_with_errno_init() +{ + dm_log_with_errno_init(log_cb); +} +*/ +import "C" + +import ( + "unsafe" +) + +type ( + CDmTask C.struct_dm_task + + CLoopInfo64 C.struct_loop_info64 + LoopInfo64 struct { + loDevice uint64 /* ioctl r/o */ + loInode uint64 /* ioctl r/o */ + loRdevice uint64 /* ioctl r/o */ + loOffset uint64 + loSizelimit uint64 /* bytes, 0 == max available */ + loNumber uint32 /* ioctl r/o */ + loEncrypt_type uint32 + loEncrypt_key_size uint32 /* ioctl w/o */ + loFlags uint32 /* ioctl r/o */ + loFileName [LoNameSize]uint8 + loCryptName [LoNameSize]uint8 + loEncryptKey [LoKeySize]uint8 /* ioctl w/o */ + loInit [2]uint64 + } +) + +// IOCTL consts +const ( + BlkGetSize64 = C.BLKGETSIZE64 + BlkDiscard = C.BLKDISCARD + + LoopSetFd = C.LOOP_SET_FD + LoopCtlGetFree = C.LOOP_CTL_GET_FREE + LoopGetStatus64 = C.LOOP_GET_STATUS64 + LoopSetStatus64 = C.LOOP_SET_STATUS64 + LoopClrFd = C.LOOP_CLR_FD + LoopSetCapacity = C.LOOP_SET_CAPACITY +) + +const ( + LoFlagsAutoClear = C.LO_FLAGS_AUTOCLEAR + LoFlagsReadOnly = C.LO_FLAGS_READ_ONLY + LoFlagsPartScan = C.LO_FLAGS_PARTSCAN + LoKeySize = C.LO_KEY_SIZE + LoNameSize = C.LO_NAME_SIZE +) + +var ( + DmGetLibraryVersion = dmGetLibraryVersionFct + DmGetNextTarget = dmGetNextTargetFct + DmLogInitVerbose = dmLogInitVerboseFct + DmSetDevDir = dmSetDevDirFct + DmTaskAddTarget = dmTaskAddTargetFct + DmTaskCreate = dmTaskCreateFct + DmTaskDestroy = dmTaskDestroyFct + DmTaskGetInfo = dmTaskGetInfoFct + DmTaskRun = dmTaskRunFct + DmTaskSetAddNode = dmTaskSetAddNodeFct + DmTaskSetCookie = dmTaskSetCookieFct + DmTaskSetMessage = 
dmTaskSetMessageFct + DmTaskSetName = dmTaskSetNameFct + DmTaskSetRo = dmTaskSetRoFct + DmTaskSetSector = dmTaskSetSectorFct + DmUdevWait = dmUdevWaitFct + LogWithErrnoInit = logWithErrnoInitFct +) + +func free(p *C.char) { + C.free(unsafe.Pointer(p)) +} + +func dmTaskDestroyFct(task *CDmTask) { + C.dm_task_destroy((*C.struct_dm_task)(task)) +} + +func dmTaskCreateFct(taskType int) *CDmTask { + return (*CDmTask)(C.dm_task_create(C.int(taskType))) +} + +func dmTaskRunFct(task *CDmTask) int { + ret, _ := C.dm_task_run((*C.struct_dm_task)(task)) + return int(ret) +} + +func dmTaskSetNameFct(task *CDmTask, name string) int { + Cname := C.CString(name) + defer free(Cname) + + return int(C.dm_task_set_name((*C.struct_dm_task)(task), Cname)) +} + +func dmTaskSetMessageFct(task *CDmTask, message string) int { + Cmessage := C.CString(message) + defer free(Cmessage) + + return int(C.dm_task_set_message((*C.struct_dm_task)(task), Cmessage)) +} + +func dmTaskSetSectorFct(task *CDmTask, sector uint64) int { + return int(C.dm_task_set_sector((*C.struct_dm_task)(task), C.uint64_t(sector))) +} + +func dmTaskSetCookieFct(task *CDmTask, cookie *uint, flags uint16) int { + cCookie := C.uint32_t(*cookie) + defer func() { + *cookie = uint(cCookie) + }() + return int(C.dm_task_set_cookie((*C.struct_dm_task)(task), &cCookie, C.uint16_t(flags))) +} + +func dmTaskSetAddNodeFct(task *CDmTask, addNode AddNodeType) int { + return int(C.dm_task_set_add_node((*C.struct_dm_task)(task), C.dm_add_node_t(addNode))) +} + +func dmTaskSetRoFct(task *CDmTask) int { + return int(C.dm_task_set_ro((*C.struct_dm_task)(task))) +} + +func dmTaskAddTargetFct(task *CDmTask, + start, size uint64, ttype, params string) int { + + Cttype := C.CString(ttype) + defer free(Cttype) + + Cparams := C.CString(params) + defer free(Cparams) + + return int(C.dm_task_add_target((*C.struct_dm_task)(task), C.uint64_t(start), C.uint64_t(size), Cttype, Cparams)) +} + +func dmTaskGetInfoFct(task *CDmTask, info *Info) int { + 
Cinfo := C.struct_dm_info{} + defer func() { + info.Exists = int(Cinfo.exists) + info.Suspended = int(Cinfo.suspended) + info.LiveTable = int(Cinfo.live_table) + info.InactiveTable = int(Cinfo.inactive_table) + info.OpenCount = int32(Cinfo.open_count) + info.EventNr = uint32(Cinfo.event_nr) + info.Major = uint32(Cinfo.major) + info.Minor = uint32(Cinfo.minor) + info.ReadOnly = int(Cinfo.read_only) + info.TargetCount = int32(Cinfo.target_count) + }() + return int(C.dm_task_get_info((*C.struct_dm_task)(task), &Cinfo)) +} + +func dmGetNextTargetFct(task *CDmTask, next uintptr, start, length *uint64, target, params *string) uintptr { + var ( + Cstart, Clength C.uint64_t + CtargetType, Cparams *C.char + ) + defer func() { + *start = uint64(Cstart) + *length = uint64(Clength) + *target = C.GoString(CtargetType) + *params = C.GoString(Cparams) + }() + + nextp := C.dm_get_next_target((*C.struct_dm_task)(task), unsafe.Pointer(next), &Cstart, &Clength, &CtargetType, &Cparams) + return uintptr(nextp) +} + +func dmUdevWaitFct(cookie uint) int { + return int(C.dm_udev_wait(C.uint32_t(cookie))) +} + +func dmLogInitVerboseFct(level int) { + C.dm_log_init_verbose(C.int(level)) +} + +func logWithErrnoInitFct() { + C.log_with_errno_init() +} + +func dmSetDevDirFct(dir string) int { + Cdir := C.CString(dir) + defer free(Cdir) + + return int(C.dm_set_dev_dir(Cdir)) +} + +func dmGetLibraryVersionFct(version *string) int { + buffer := C.CString(string(make([]byte, 128))) + defer free(buffer) + defer func() { + *version = C.GoString(buffer) + }() + return int(C.dm_get_library_version(buffer, 128)) +} diff --git a/runtime/graphdriver/devmapper/driver.go b/runtime/graphdriver/devmapper/driver.go new file mode 100644 index 0000000000..35fe883f26 --- /dev/null +++ b/runtime/graphdriver/devmapper/driver.go @@ -0,0 +1,142 @@ +// +build linux,amd64 + +package devmapper + +import ( + "fmt" + "github.com/dotcloud/docker/runtime/graphdriver" + "github.com/dotcloud/docker/utils" + "io/ioutil" + 
"os" + "path" +) + +func init() { + graphdriver.Register("devicemapper", Init) +} + +// Placeholder interfaces, to be replaced +// at integration. + +// End of placeholder interfaces. + +type Driver struct { + *DeviceSet + home string +} + +var Init = func(home string) (graphdriver.Driver, error) { + deviceSet, err := NewDeviceSet(home, true) + if err != nil { + return nil, err + } + d := &Driver{ + DeviceSet: deviceSet, + home: home, + } + return d, nil +} + +func (d *Driver) String() string { + return "devicemapper" +} + +func (d *Driver) Status() [][2]string { + s := d.DeviceSet.Status() + + status := [][2]string{ + {"Pool Name", s.PoolName}, + {"Data file", s.DataLoopback}, + {"Metadata file", s.MetadataLoopback}, + {"Data Space Used", fmt.Sprintf("%.1f Mb", float64(s.Data.Used)/(1024*1024))}, + {"Data Space Total", fmt.Sprintf("%.1f Mb", float64(s.Data.Total)/(1024*1024))}, + {"Metadata Space Used", fmt.Sprintf("%.1f Mb", float64(s.Metadata.Used)/(1024*1024))}, + {"Metadata Space Total", fmt.Sprintf("%.1f Mb", float64(s.Metadata.Total)/(1024*1024))}, + } + return status +} + +func (d *Driver) Cleanup() error { + return d.DeviceSet.Shutdown() +} + +func (d *Driver) Create(id, parent string, mountLabel string) error { + if err := d.DeviceSet.AddDevice(id, parent); err != nil { + return err + } + mp := path.Join(d.home, "mnt", id) + if err := d.mount(id, mp); err != nil { + return err + } + + if err := osMkdirAll(path.Join(mp, "rootfs"), 0755); err != nil && !osIsExist(err) { + return err + } + + // Create an "id" file with the container/image id in it to help reconstruct this in case + // of later problems + if err := ioutil.WriteFile(path.Join(mp, "id"), []byte(id), 0600); err != nil { + return err + } + + // We float this reference so that the next Get call can + // steal it, so we don't have to unmount + if err := d.DeviceSet.UnmountDevice(id, UnmountFloat); err != nil { + return err + } + + return nil +} + +func (d *Driver) Remove(id string) error { + if 
!d.DeviceSet.HasDevice(id) { + // Consider removing a non-existing device a no-op + // This is useful to be able to progress on container removal + // if the underlying device has gone away due to earlier errors + return nil + } + + // Sink the float from create in case no Get() call was made + if err := d.DeviceSet.UnmountDevice(id, UnmountSink); err != nil { + return err + } + // This assumes the device has been properly Get/Put:ed and thus is unmounted + if err := d.DeviceSet.DeleteDevice(id); err != nil { + return err + } + + mp := path.Join(d.home, "mnt", id) + if err := os.RemoveAll(mp); err != nil && !os.IsNotExist(err) { + return err + } + + return nil +} + +func (d *Driver) Get(id string) (string, error) { + mp := path.Join(d.home, "mnt", id) + if err := d.mount(id, mp); err != nil { + return "", err + } + + return path.Join(mp, "rootfs"), nil +} + +func (d *Driver) Put(id string) { + if err := d.DeviceSet.UnmountDevice(id, UnmountRegular); err != nil { + utils.Errorf("Warning: error unmounting device %s: %s\n", id, err) + } +} + +func (d *Driver) mount(id, mountPoint string) error { + // Create the target directories if they don't exist + if err := osMkdirAll(mountPoint, 0755); err != nil && !osIsExist(err) { + return err + } + // Mount the device + return d.DeviceSet.MountDevice(id, mountPoint, "") +} + +func (d *Driver) Exists(id string) bool { + return d.Devices[id] != nil +} diff --git a/runtime/graphdriver/devmapper/driver_test.go b/runtime/graphdriver/devmapper/driver_test.go new file mode 100644 index 0000000000..4ca72db0ca --- /dev/null +++ b/runtime/graphdriver/devmapper/driver_test.go @@ -0,0 +1,886 @@ +// +build linux,amd64 + +package devmapper + +import ( + "fmt" + "github.com/dotcloud/docker/runtime/graphdriver" + "io/ioutil" + "path" + "runtime" + "strings" + "syscall" + "testing" +) + +func init() { + // Reduce the size of the base fs and loopback for the tests + DefaultDataLoopbackSize = 300 * 1024 * 1024 + DefaultMetaDataLoopbackSize = 
200 * 1024 * 1024 + DefaultBaseFsSize = 300 * 1024 * 1024 +} + +// denyAllDevmapper mocks all calls to libdevmapper in the unit tests, and denies them by default +func denyAllDevmapper() { + // Hijack all calls to libdevmapper with default panics. + // Authorized calls are selectively hijacked in each tests. + DmTaskCreate = func(t int) *CDmTask { + panic("DmTaskCreate: this method should not be called here") + } + DmTaskRun = func(task *CDmTask) int { + panic("DmTaskRun: this method should not be called here") + } + DmTaskSetName = func(task *CDmTask, name string) int { + panic("DmTaskSetName: this method should not be called here") + } + DmTaskSetMessage = func(task *CDmTask, message string) int { + panic("DmTaskSetMessage: this method should not be called here") + } + DmTaskSetSector = func(task *CDmTask, sector uint64) int { + panic("DmTaskSetSector: this method should not be called here") + } + DmTaskSetCookie = func(task *CDmTask, cookie *uint, flags uint16) int { + panic("DmTaskSetCookie: this method should not be called here") + } + DmTaskSetAddNode = func(task *CDmTask, addNode AddNodeType) int { + panic("DmTaskSetAddNode: this method should not be called here") + } + DmTaskSetRo = func(task *CDmTask) int { + panic("DmTaskSetRo: this method should not be called here") + } + DmTaskAddTarget = func(task *CDmTask, start, size uint64, ttype, params string) int { + panic("DmTaskAddTarget: this method should not be called here") + } + DmTaskGetInfo = func(task *CDmTask, info *Info) int { + panic("DmTaskGetInfo: this method should not be called here") + } + DmGetNextTarget = func(task *CDmTask, next uintptr, start, length *uint64, target, params *string) uintptr { + panic("DmGetNextTarget: this method should not be called here") + } + DmUdevWait = func(cookie uint) int { + panic("DmUdevWait: this method should not be called here") + } + DmSetDevDir = func(dir string) int { + panic("DmSetDevDir: this method should not be called here") + } + DmGetLibraryVersion = 
func(version *string) int { + panic("DmGetLibraryVersion: this method should not be called here") + } + DmLogInitVerbose = func(level int) { + panic("DmLogInitVerbose: this method should not be called here") + } + DmTaskDestroy = func(task *CDmTask) { + panic("DmTaskDestroy: this method should not be called here") + } + LogWithErrnoInit = func() { + panic("LogWithErrnoInit: this method should not be called here") + } +} + +func denyAllSyscall() { + sysMount = func(source, target, fstype string, flags uintptr, data string) (err error) { + panic("sysMount: this method should not be called here") + } + sysUnmount = func(target string, flags int) (err error) { + panic("sysUnmount: this method should not be called here") + } + sysCloseOnExec = func(fd int) { + panic("sysCloseOnExec: this method should not be called here") + } + sysSyscall = func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + panic("sysSyscall: this method should not be called here") + } + // Not a syscall, but forbidding it here anyway + Mounted = func(mnt string) (bool, error) { + panic("devmapper.Mounted: this method should not be called here") + } + // osOpenFile = os.OpenFile + // osNewFile = os.NewFile + // osCreate = os.Create + // osStat = os.Stat + // osIsNotExist = os.IsNotExist + // osIsExist = os.IsExist + // osMkdirAll = os.MkdirAll + // osRemoveAll = os.RemoveAll + // osRename = os.Rename + // osReadlink = os.Readlink + + // execRun = func(name string, args ...string) error { + // return exec.Command(name, args...).Run() + // } +} + +func mkTestDirectory(t *testing.T) string { + dir, err := ioutil.TempDir("", "docker-test-devmapper-") + if err != nil { + t.Fatal(err) + } + return dir +} + +func newDriver(t *testing.T) *Driver { + home := mkTestDirectory(t) + d, err := Init(home) + if err != nil { + t.Fatal(err) + } + return d.(*Driver) +} + +func cleanup(d *Driver) { + d.Cleanup() + osRemoveAll(d.home) +} + +type Set map[string]bool + +func (r Set) Assert(t *testing.T, 
names ...string) { + for _, key := range names { + required := true + if strings.HasPrefix(key, "?") { + key = key[1:] + required = false + } + if _, exists := r[key]; !exists && required { + t.Fatalf("Key not set: %s", key) + } + delete(r, key) + } + if len(r) != 0 { + t.Fatalf("Unexpected keys: %v", r) + } +} + +func TestInit(t *testing.T) { + var ( + calls = make(Set) + taskMessages = make(Set) + taskTypes = make(Set) + home = mkTestDirectory(t) + ) + defer osRemoveAll(home) + + func() { + denyAllDevmapper() + DmSetDevDir = func(dir string) int { + calls["DmSetDevDir"] = true + expectedDir := "/dev" + if dir != expectedDir { + t.Fatalf("Wrong libdevmapper call\nExpected: DmSetDevDir(%v)\nReceived: DmSetDevDir(%v)\n", expectedDir, dir) + } + return 0 + } + LogWithErrnoInit = func() { + calls["DmLogWithErrnoInit"] = true + } + var task1 CDmTask + DmTaskCreate = func(taskType int) *CDmTask { + calls["DmTaskCreate"] = true + taskTypes[fmt.Sprintf("%d", taskType)] = true + return &task1 + } + DmTaskSetName = func(task *CDmTask, name string) int { + calls["DmTaskSetName"] = true + expectedTask := &task1 + if task != expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetName(%v)\nReceived: DmTaskSetName(%v)\n", expectedTask, task) + } + // FIXME: use Set.AssertRegexp() + if !strings.HasPrefix(name, "docker-") && !strings.HasPrefix(name, "/dev/mapper/docker-") || + !strings.HasSuffix(name, "-pool") && !strings.HasSuffix(name, "-base") { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetName(%v)\nReceived: DmTaskSetName(%v)\n", "docker-...-pool", name) + } + return 1 + } + DmTaskRun = func(task *CDmTask) int { + calls["DmTaskRun"] = true + expectedTask := &task1 + if task != expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskRun(%v)\nReceived: DmTaskRun(%v)\n", expectedTask, task) + } + return 1 + } + DmTaskGetInfo = func(task *CDmTask, info *Info) int { + calls["DmTaskGetInfo"] = true + expectedTask := &task1 + if task != 
expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskGetInfo(%v)\nReceived: DmTaskGetInfo(%v)\n", expectedTask, task) + } + // This will crash if info is not dereferenceable + info.Exists = 0 + return 1 + } + DmTaskSetSector = func(task *CDmTask, sector uint64) int { + calls["DmTaskSetSector"] = true + expectedTask := &task1 + if task != expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetSector(%v)\nReceived: DmTaskSetSector(%v)\n", expectedTask, task) + } + if expectedSector := uint64(0); sector != expectedSector { + t.Fatalf("Wrong libdevmapper call to DmTaskSetSector\nExpected: %v\nReceived: %v\n", expectedSector, sector) + } + return 1 + } + DmTaskSetMessage = func(task *CDmTask, message string) int { + calls["DmTaskSetMessage"] = true + expectedTask := &task1 + if task != expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetSector(%v)\nReceived: DmTaskSetSector(%v)\n", expectedTask, task) + } + taskMessages[message] = true + return 1 + } + DmTaskDestroy = func(task *CDmTask) { + calls["DmTaskDestroy"] = true + expectedTask := &task1 + if task != expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskDestroy(%v)\nReceived: DmTaskDestroy(%v)\n", expectedTask, task) + } + } + DmTaskAddTarget = func(task *CDmTask, start, size uint64, ttype, params string) int { + calls["DmTaskSetTarget"] = true + expectedTask := &task1 + if task != expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskDestroy(%v)\nReceived: DmTaskDestroy(%v)\n", expectedTask, task) + } + if start != 0 { + t.Fatalf("Wrong start: %d != %d", start, 0) + } + if ttype != "thin" && ttype != "thin-pool" { + t.Fatalf("Wrong ttype: %s", ttype) + } + // Quick smoke test + if params == "" { + t.Fatalf("Params should not be empty") + } + return 1 + } + fakeCookie := uint(4321) + DmTaskSetCookie = func(task *CDmTask, cookie *uint, flags uint16) int { + calls["DmTaskSetCookie"] = true + expectedTask := &task1 + if task != 
expectedTask { + t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskDestroy(%v)\nReceived: DmTaskDestroy(%v)\n", expectedTask, task) + } + if flags != 0 { + t.Fatalf("Cookie flags should be 0 (not %x)", flags) + } + *cookie = fakeCookie + return 1 + } + DmUdevWait = func(cookie uint) int { + calls["DmUdevWait"] = true + if cookie != fakeCookie { + t.Fatalf("Wrong cookie: %d != %d", cookie, fakeCookie) + } + return 1 + } + DmTaskSetAddNode = func(task *CDmTask, addNode AddNodeType) int { + if addNode != AddNodeOnCreate { + t.Fatalf("Wrong AddNoteType: %v (expected %v)", addNode, AddNodeOnCreate) + } + calls["DmTaskSetAddNode"] = true + return 1 + } + execRun = func(name string, args ...string) error { + calls["execRun"] = true + if name != "mkfs.ext4" { + t.Fatalf("Expected %s to be executed, not %s", "mkfs.ext4", name) + } + return nil + } + driver, err := Init(home) + if err != nil { + t.Fatal(err) + } + defer func() { + if err := driver.Cleanup(); err != nil { + t.Fatal(err) + } + }() + }() + // Put all tests in a function to make sure the garbage collection will + // occur. 
+ + // Call GC to cleanup runtime.Finalizers + runtime.GC() + + calls.Assert(t, + "DmSetDevDir", + "DmLogWithErrnoInit", + "DmTaskSetName", + "DmTaskRun", + "DmTaskGetInfo", + "DmTaskDestroy", + "execRun", + "DmTaskCreate", + "DmTaskSetTarget", + "DmTaskSetCookie", + "DmUdevWait", + "DmTaskSetSector", + "DmTaskSetMessage", + "DmTaskSetAddNode", + ) + taskTypes.Assert(t, "0", "6", "17") + taskMessages.Assert(t, "create_thin 0", "set_transaction_id 0 1") +} + +func fakeInit() func(home string) (graphdriver.Driver, error) { + oldInit := Init + Init = func(home string) (graphdriver.Driver, error) { + return &Driver{ + home: home, + }, nil + } + return oldInit +} + +func restoreInit(init func(home string) (graphdriver.Driver, error)) { + Init = init +} + +func mockAllDevmapper(calls Set) { + DmSetDevDir = func(dir string) int { + calls["DmSetDevDir"] = true + return 0 + } + LogWithErrnoInit = func() { + calls["DmLogWithErrnoInit"] = true + } + DmTaskCreate = func(taskType int) *CDmTask { + calls["DmTaskCreate"] = true + return &CDmTask{} + } + DmTaskSetName = func(task *CDmTask, name string) int { + calls["DmTaskSetName"] = true + return 1 + } + DmTaskRun = func(task *CDmTask) int { + calls["DmTaskRun"] = true + return 1 + } + DmTaskGetInfo = func(task *CDmTask, info *Info) int { + calls["DmTaskGetInfo"] = true + return 1 + } + DmTaskSetSector = func(task *CDmTask, sector uint64) int { + calls["DmTaskSetSector"] = true + return 1 + } + DmTaskSetMessage = func(task *CDmTask, message string) int { + calls["DmTaskSetMessage"] = true + return 1 + } + DmTaskDestroy = func(task *CDmTask) { + calls["DmTaskDestroy"] = true + } + DmTaskAddTarget = func(task *CDmTask, start, size uint64, ttype, params string) int { + calls["DmTaskSetTarget"] = true + return 1 + } + DmTaskSetCookie = func(task *CDmTask, cookie *uint, flags uint16) int { + calls["DmTaskSetCookie"] = true + return 1 + } + DmUdevWait = func(cookie uint) int { + calls["DmUdevWait"] = true + return 1 + } + 
DmTaskSetAddNode = func(task *CDmTask, addNode AddNodeType) int { + calls["DmTaskSetAddNode"] = true + return 1 + } + execRun = func(name string, args ...string) error { + calls["execRun"] = true + return nil + } +} + +func TestDriverName(t *testing.T) { + denyAllDevmapper() + defer denyAllDevmapper() + + oldInit := fakeInit() + defer restoreInit(oldInit) + + d := newDriver(t) + if d.String() != "devicemapper" { + t.Fatalf("Expected driver name to be devicemapper got %s", d.String()) + } +} + +func TestDriverCreate(t *testing.T) { + denyAllDevmapper() + denyAllSyscall() + defer denyAllSyscall() + defer denyAllDevmapper() + + calls := make(Set) + mockAllDevmapper(calls) + + sysMount = func(source, target, fstype string, flags uintptr, data string) (err error) { + calls["sysMount"] = true + // FIXME: compare the exact source and target strings (inodes + devname) + if expectedSource := "/dev/mapper/docker-"; !strings.HasPrefix(source, expectedSource) { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedSource, source) + } + if expectedTarget := "/tmp/docker-test-devmapper-"; !strings.HasPrefix(target, expectedTarget) { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedTarget, target) + } + if expectedFstype := "ext4"; fstype != expectedFstype { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFstype, fstype) + } + if expectedFlags := uintptr(3236757504); flags != expectedFlags { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFlags, flags) + } + return nil + } + + Mounted = func(mnt string) (bool, error) { + calls["Mounted"] = true + if !strings.HasPrefix(mnt, "/tmp/docker-test-devmapper-") || !strings.HasSuffix(mnt, "/mnt/1") { + t.Fatalf("Wrong mounted call\nExpected: Mounted(%v)\nReceived: Mounted(%v)\n", "/tmp/docker-test-devmapper-.../mnt/1", mnt) + } + return false, nil + } + + sysSyscall = func(trap, a1, a2, a3 uintptr) 
(r1, r2 uintptr, err syscall.Errno) { + calls["sysSyscall"] = true + if trap != sysSysIoctl { + t.Fatalf("Unexpected syscall. Expecting SYS_IOCTL, received: %d", trap) + } + switch a2 { + case LoopSetFd: + calls["ioctl.loopsetfd"] = true + case LoopCtlGetFree: + calls["ioctl.loopctlgetfree"] = true + case LoopGetStatus64: + calls["ioctl.loopgetstatus"] = true + case LoopSetStatus64: + calls["ioctl.loopsetstatus"] = true + case LoopClrFd: + calls["ioctl.loopclrfd"] = true + case LoopSetCapacity: + calls["ioctl.loopsetcapacity"] = true + case BlkGetSize64: + calls["ioctl.blkgetsize"] = true + default: + t.Fatalf("Unexpected IOCTL. Received %d", a2) + } + return 0, 0, 0 + } + + func() { + d := newDriver(t) + + calls.Assert(t, + "DmSetDevDir", + "DmLogWithErrnoInit", + "DmTaskSetName", + "DmTaskRun", + "DmTaskGetInfo", + "execRun", + "DmTaskCreate", + "DmTaskSetTarget", + "DmTaskSetCookie", + "DmUdevWait", + "DmTaskSetSector", + "DmTaskSetMessage", + "DmTaskSetAddNode", + "sysSyscall", + "ioctl.blkgetsize", + "ioctl.loopsetfd", + "ioctl.loopsetstatus", + "?ioctl.loopctlgetfree", + ) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + calls.Assert(t, + "DmTaskCreate", + "DmTaskGetInfo", + "sysMount", + "DmTaskRun", + "DmTaskSetTarget", + "DmTaskSetSector", + "DmTaskSetCookie", + "DmUdevWait", + "DmTaskSetName", + "DmTaskSetMessage", + "DmTaskSetAddNode", + ) + + }() + + runtime.GC() + + calls.Assert(t, + "DmTaskDestroy", + ) +} + +func TestDriverRemove(t *testing.T) { + denyAllDevmapper() + denyAllSyscall() + defer denyAllSyscall() + defer denyAllDevmapper() + + calls := make(Set) + mockAllDevmapper(calls) + + sysMount = func(source, target, fstype string, flags uintptr, data string) (err error) { + calls["sysMount"] = true + // FIXME: compare the exact source and target strings (inodes + devname) + if expectedSource := "/dev/mapper/docker-"; !strings.HasPrefix(source, expectedSource) { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: 
Mount(%v)\n", expectedSource, source) + } + if expectedTarget := "/tmp/docker-test-devmapper-"; !strings.HasPrefix(target, expectedTarget) { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedTarget, target) + } + if expectedFstype := "ext4"; fstype != expectedFstype { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFstype, fstype) + } + if expectedFlags := uintptr(3236757504); flags != expectedFlags { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFlags, flags) + } + return nil + } + sysUnmount = func(target string, flags int) (err error) { + calls["sysUnmount"] = true + // FIXME: compare the exact source and target strings (inodes + devname) + if expectedTarget := "/tmp/docker-test-devmapper-"; !strings.HasPrefix(target, expectedTarget) { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedTarget, target) + } + if expectedFlags := 0; flags != expectedFlags { + t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFlags, flags) + } + return nil + } + Mounted = func(mnt string) (bool, error) { + calls["Mounted"] = true + return false, nil + } + + sysSyscall = func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + calls["sysSyscall"] = true + if trap != sysSysIoctl { + t.Fatalf("Unexpected syscall. Expecting SYS_IOCTL, received: %d", trap) + } + switch a2 { + case LoopSetFd: + calls["ioctl.loopsetfd"] = true + case LoopCtlGetFree: + calls["ioctl.loopctlgetfree"] = true + case LoopGetStatus64: + calls["ioctl.loopgetstatus"] = true + case LoopSetStatus64: + calls["ioctl.loopsetstatus"] = true + case LoopClrFd: + calls["ioctl.loopclrfd"] = true + case LoopSetCapacity: + calls["ioctl.loopsetcapacity"] = true + case BlkGetSize64: + calls["ioctl.blkgetsize"] = true + default: + t.Fatalf("Unexpected IOCTL. 
Received %d", a2) + } + return 0, 0, 0 + } + + func() { + d := newDriver(t) + + calls.Assert(t, + "DmSetDevDir", + "DmLogWithErrnoInit", + "DmTaskSetName", + "DmTaskRun", + "DmTaskGetInfo", + "execRun", + "DmTaskCreate", + "DmTaskSetTarget", + "DmTaskSetCookie", + "DmUdevWait", + "DmTaskSetSector", + "DmTaskSetMessage", + "DmTaskSetAddNode", + "sysSyscall", + "ioctl.blkgetsize", + "ioctl.loopsetfd", + "ioctl.loopsetstatus", + "?ioctl.loopctlgetfree", + ) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + calls.Assert(t, + "DmTaskCreate", + "DmTaskGetInfo", + "sysMount", + "DmTaskRun", + "DmTaskSetTarget", + "DmTaskSetSector", + "DmTaskSetCookie", + "DmUdevWait", + "DmTaskSetName", + "DmTaskSetMessage", + "DmTaskSetAddNode", + ) + + Mounted = func(mnt string) (bool, error) { + calls["Mounted"] = true + return true, nil + } + + if err := d.Remove("1"); err != nil { + t.Fatal(err) + } + + calls.Assert(t, + "DmTaskRun", + "DmTaskSetSector", + "DmTaskSetName", + "DmTaskSetMessage", + "DmTaskCreate", + "DmTaskGetInfo", + "DmTaskSetCookie", + "DmTaskSetTarget", + "DmTaskSetAddNode", + "DmUdevWait", + "sysUnmount", + ) + }() + runtime.GC() + + calls.Assert(t, + "DmTaskDestroy", + ) +} + +func TestCleanup(t *testing.T) { + t.Skip("FIXME: not a unit test") + t.Skip("Unimplemented") + d := newDriver(t) + defer osRemoveAll(d.home) + + mountPoints := make([]string, 2) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + // Mount the id + p, err := d.Get("1") + if err != nil { + t.Fatal(err) + } + mountPoints[0] = p + + if err := d.Create("2", "1", ""); err != nil { + t.Fatal(err) + } + + p, err = d.Get("2") + if err != nil { + t.Fatal(err) + } + mountPoints[1] = p + + // Ensure that all the mount points are currently mounted + for _, p := range mountPoints { + if mounted, err := Mounted(p); err != nil { + t.Fatal(err) + } else if !mounted { + t.Fatalf("Expected %s to be mounted", p) + } + } + + // Ensure that devices are active + for _, p 
:= range []string{"1", "2"} { + if !d.HasActivatedDevice(p) { + t.Fatalf("Expected %s to have an active device", p) + } + } + + if err := d.Cleanup(); err != nil { + t.Fatal(err) + } + + // Ensure that all the mount points are no longer mounted + for _, p := range mountPoints { + if mounted, err := Mounted(p); err != nil { + t.Fatal(err) + } else if mounted { + t.Fatalf("Expected %s to not be mounted", p) + } + } + + // Ensure that devices are no longer activated + for _, p := range []string{"1", "2"} { + if d.HasActivatedDevice(p) { + t.Fatalf("Expected %s not be an active device", p) + } + } +} + +func TestNotMounted(t *testing.T) { + t.Skip("FIXME: not a unit test") + t.Skip("Not implemented") + d := newDriver(t) + defer cleanup(d) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + + mounted, err := Mounted(path.Join(d.home, "mnt", "1")) + if err != nil { + t.Fatal(err) + } + if mounted { + t.Fatal("Id 1 should not be mounted") + } +} + +func TestMounted(t *testing.T) { + t.Skip("FIXME: not a unit test") + d := newDriver(t) + defer cleanup(d) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + if _, err := d.Get("1"); err != nil { + t.Fatal(err) + } + + mounted, err := Mounted(path.Join(d.home, "mnt", "1")) + if err != nil { + t.Fatal(err) + } + if !mounted { + t.Fatal("Id 1 should be mounted") + } +} + +func TestInitCleanedDriver(t *testing.T) { + t.Skip("FIXME: not a unit test") + d := newDriver(t) + + if err := d.Create("1", "", ""); err != nil { + t.Fatal(err) + } + if _, err := d.Get("1"); err != nil { + t.Fatal(err) + } + + if err := d.Cleanup(); err != nil { + t.Fatal(err) + } + + driver, err := Init(d.home) + if err != nil { + t.Fatal(err) + } + d = driver.(*Driver) + defer cleanup(d) + + if _, err := d.Get("1"); err != nil { + t.Fatal(err) + } +} + +func TestMountMountedDriver(t *testing.T) { + t.Skip("FIXME: not a unit test") + d := newDriver(t) + defer cleanup(d) + + if err := d.Create("1", "", ""); err != nil { 
// assertMap fails the test unless m contains exactly the given keys.
//
// Each listed key must be present in m (its boolean value is not inspected)
// and is deleted once seen; anything still left in m afterwards is reported
// as unexpected. The map is therefore emptied as a side effect of a
// successful call.
func assertMap(t *testing.T, m map[string]bool, keys ...string) {
	for i := range keys {
		name := keys[i]

		_, present := m[name]
		if !present {
			t.Fatalf("Key not set: %s", name)
		}
		delete(m, name)
	}

	if len(m) > 0 {
		t.Fatalf("Unexpected keys: %v", m)
	}
}
// The helpers below are thin wrappers that issue one ioctl each through
// sysSyscall(sysSysIoctl, fd, request, arg). In every wrapper the third
// result of sysSyscall is compared against 0 and, when non-zero, returned to
// the caller as the error.
//
// NOTE(review): pointer arguments are converted with
// uintptr(unsafe.Pointer(..)) directly inside the call expression, but the
// call goes through the sysSyscall variable rather than a direct call to
// syscall.Syscall — confirm that the keep-alive guarantee documented for
// unsafe.Pointer still applies to this indirect call.

// ioctlLoopSetFd issues the LoopSetFd request on loopFd with sparseFd as its
// argument (presumably attaching the sparse file to the loop device —
// confirm against the LoopSetFd definition).
func ioctlLoopSetFd(loopFd, sparseFd uintptr) error {
	if _, _, err := sysSyscall(sysSysIoctl, loopFd, LoopSetFd, sparseFd); err != 0 {
		return err
	}
	return nil
}

// ioctlLoopSetStatus64 issues the LoopSetStatus64 request on loopFd, passing
// the address of loopInfo as the argument.
func ioctlLoopSetStatus64(loopFd uintptr, loopInfo *LoopInfo64) error {
	if _, _, err := sysSyscall(sysSysIoctl, loopFd, LoopSetStatus64, uintptr(unsafe.Pointer(loopInfo))); err != 0 {
		return err
	}
	return nil
}

// ioctlLoopClrFd issues the LoopClrFd request on loopFd with a zero argument.
func ioctlLoopClrFd(loopFd uintptr) error {
	if _, _, err := sysSyscall(sysSysIoctl, loopFd, LoopClrFd, 0); err != 0 {
		return err
	}
	return nil
}

// ioctlLoopGetStatus64 allocates a zeroed LoopInfo64, passes its address to
// the LoopGetStatus64 request on loopFd and returns it. On failure the
// structure is discarded and nil is returned with the error.
func ioctlLoopGetStatus64(loopFd uintptr) (*LoopInfo64, error) {
	loopInfo := &LoopInfo64{}

	if _, _, err := sysSyscall(sysSysIoctl, loopFd, LoopGetStatus64, uintptr(unsafe.Pointer(loopInfo))); err != 0 {
		return nil, err
	}
	return loopInfo, nil
}

// ioctlLoopSetCapacity issues the LoopSetCapacity request on loopFd. value is
// passed by value (converted to uintptr), not as a pointer.
func ioctlLoopSetCapacity(loopFd uintptr, value int) error {
	if _, _, err := sysSyscall(sysSysIoctl, loopFd, LoopSetCapacity, uintptr(value)); err != 0 {
		return err
	}
	return nil
}

// ioctlBlkGetSize64 issues the BlkGetSize64 request on fd with the address of
// a local int64 as the argument and returns that int64 (presumably the
// device size in bytes — confirm). On failure it returns 0 and the error.
func ioctlBlkGetSize64(fd uintptr) (int64, error) {
	var size int64
	if _, _, err := sysSyscall(sysSysIoctl, fd, BlkGetSize64, uintptr(unsafe.Pointer(&size))); err != 0 {
		return 0, err
	}
	return size, nil
}

// ioctlBlkDiscard issues the BlkDiscard request on fd. The argument is the
// address of a two-element uint64 array holding offset followed by length.
func ioctlBlkDiscard(fd uintptr, offset, length uint64) error {
	var r [2]uint64
	r[0] = offset
	r[1] = length

	if _, _, err := sysSyscall(sysSysIoctl, fd, BlkDiscard, uintptr(unsafe.Pointer(&r[0]))); err != 0 {
		return err
	}
	return nil
}
+// It should be moved into the core. + +var Mounted = func(mountpoint string) (bool, error) { + mntpoint, err := osStat(mountpoint) + if err != nil { + if osIsNotExist(err) { + return false, nil + } + return false, err + } + parent, err := osStat(filepath.Join(mountpoint, "..")) + if err != nil { + return false, err + } + mntpointSt := toSysStatT(mntpoint.Sys()) + parentSt := toSysStatT(parent.Sys()) + return mntpointSt.Dev != parentSt.Dev, nil +} diff --git a/runtime/graphdriver/devmapper/sys.go b/runtime/graphdriver/devmapper/sys.go new file mode 100644 index 0000000000..5a9ab4d74b --- /dev/null +++ b/runtime/graphdriver/devmapper/sys.go @@ -0,0 +1,57 @@ +// +build linux,amd64 + +package devmapper + +import ( + "os" + "os/exec" + "syscall" +) + +type ( + sysStatT syscall.Stat_t + sysErrno syscall.Errno + + osFile struct{ *os.File } +) + +var ( + sysMount = syscall.Mount + sysUnmount = syscall.Unmount + sysCloseOnExec = syscall.CloseOnExec + sysSyscall = syscall.Syscall + + osOpenFile = func(name string, flag int, perm os.FileMode) (*osFile, error) { + f, err := os.OpenFile(name, flag, perm) + return &osFile{File: f}, err + } + osOpen = func(name string) (*osFile, error) { f, err := os.Open(name); return &osFile{File: f}, err } + osNewFile = os.NewFile + osCreate = os.Create + osStat = os.Stat + osIsNotExist = os.IsNotExist + osIsExist = os.IsExist + osMkdirAll = os.MkdirAll + osRemoveAll = os.RemoveAll + osRename = os.Rename + osReadlink = os.Readlink + + execRun = func(name string, args ...string) error { return exec.Command(name, args...).Run() } +) + +const ( + sysMsMgcVal = syscall.MS_MGC_VAL + sysMsRdOnly = syscall.MS_RDONLY + sysEInval = syscall.EINVAL + sysSysIoctl = syscall.SYS_IOCTL + sysEBusy = syscall.EBUSY + + osORdOnly = os.O_RDONLY + osORdWr = os.O_RDWR + osOCreate = os.O_CREATE + osModeDevice = os.ModeDevice +) + +func toSysStatT(i interface{}) *sysStatT { + return (*sysStatT)(i.(*syscall.Stat_t)) +} diff --git a/runtime/graphdriver/driver.go 
b/runtime/graphdriver/driver.go new file mode 100644 index 0000000000..bd4c2faaca --- /dev/null +++ b/runtime/graphdriver/driver.go @@ -0,0 +1,92 @@ +package graphdriver + +import ( + "fmt" + "github.com/dotcloud/docker/archive" + "github.com/dotcloud/docker/utils" + "os" + "path" +) + +type InitFunc func(root string) (Driver, error) + +type Driver interface { + String() string + + Create(id, parent string, mountLabel string) error + Remove(id string) error + + Get(id string) (dir string, err error) + Put(id string) + Exists(id string) bool + + Status() [][2]string + + Cleanup() error +} + +type Differ interface { + Diff(id string) (archive.Archive, error) + Changes(id string) ([]archive.Change, error) + ApplyDiff(id string, diff archive.ArchiveReader) error + DiffSize(id string) (bytes int64, err error) +} + +var ( + DefaultDriver string + // All registred drivers + drivers map[string]InitFunc + // Slice of drivers that should be used in an order + priority = []string{ + "aufs", + "btrfs", + "devicemapper", + "vfs", + } +) + +func init() { + drivers = make(map[string]InitFunc) +} + +func Register(name string, initFunc InitFunc) error { + if _, exists := drivers[name]; exists { + return fmt.Errorf("Name already registered %s", name) + } + drivers[name] = initFunc + + return nil +} + +func GetDriver(name, home string) (Driver, error) { + if initFunc, exists := drivers[name]; exists { + return initFunc(path.Join(home, name)) + } + return nil, fmt.Errorf("No such driver: %s", name) +} + +func New(root string) (driver Driver, err error) { + for _, name := range []string{os.Getenv("DOCKER_DRIVER"), DefaultDriver} { + if name != "" { + return GetDriver(name, root) + } + } + + // Check for priority drivers first + for _, name := range priority { + if driver, err = GetDriver(name, root); err != nil { + utils.Debugf("Error loading driver %s: %s", name, err) + continue + } + return driver, nil + } + + // Check all registered drivers if no priority driver is found + for _, 
initFunc := range drivers { + if driver, err = initFunc(root); err != nil { + continue + } + return driver, nil + } + return nil, err +} diff --git a/runtime/graphdriver/vfs/driver.go b/runtime/graphdriver/vfs/driver.go new file mode 100644 index 0000000000..fe09560f24 --- /dev/null +++ b/runtime/graphdriver/vfs/driver.go @@ -0,0 +1,95 @@ +package vfs + +import ( + "fmt" + "github.com/dotcloud/docker/runtime/graphdriver" + "os" + "os/exec" + "path" +) + +func init() { + graphdriver.Register("vfs", Init) +} + +func Init(home string) (graphdriver.Driver, error) { + d := &Driver{ + home: home, + } + return d, nil +} + +type Driver struct { + home string +} + +func (d *Driver) String() string { + return "vfs" +} + +func (d *Driver) Status() [][2]string { + return nil +} + +func (d *Driver) Cleanup() error { + return nil +} + +func copyDir(src, dst string) error { + if output, err := exec.Command("cp", "-aT", "--reflink=auto", src, dst).CombinedOutput(); err != nil { + return fmt.Errorf("Error VFS copying directory: %s (%s)", err, output) + } + return nil +} + +func (d *Driver) Create(id string, parent string, mountLabel string) error { + dir := d.dir(id) + if err := os.MkdirAll(path.Dir(dir), 0700); err != nil { + return err + } + if err := os.Mkdir(dir, 0700); err != nil { + return err + } + if parent == "" { + return nil + } + parentDir, err := d.Get(parent) + if err != nil { + return fmt.Errorf("%s: %s", parent, err) + } + if err := copyDir(parentDir, dir); err != nil { + return err + } + return nil +} + +func (d *Driver) dir(id string) string { + return path.Join(d.home, "dir", path.Base(id)) +} + +func (d *Driver) Remove(id string) error { + if _, err := os.Stat(d.dir(id)); err != nil { + return err + } + return os.RemoveAll(d.dir(id)) +} + +func (d *Driver) Get(id string) (string, error) { + dir := d.dir(id) + if st, err := os.Stat(dir); err != nil { + return "", err + } else if !st.IsDir() { + return "", fmt.Errorf("%s: not a directory", dir) + } + return dir, 
nil +} + +func (d *Driver) Put(id string) { + // The vfs driver has no runtime resources (e.g. mounts) + // to clean up, so we don't need anything here +} + +func (d *Driver) Exists(id string) bool { + _, err := os.Stat(d.dir(id)) + return err == nil +} diff --git a/runtime/history.go b/runtime/history.go new file mode 100644 index 0000000000..835ac9c11e --- /dev/null +++ b/runtime/history.go @@ -0,0 +1,30 @@ +package runtime + +import ( + "sort" +) + +// History is a convenience type for storing a list of containers, +// ordered by creation date. +type History []*Container + +func (history *History) Len() int { + return len(*history) +} + +func (history *History) Less(i, j int) bool { + containers := *history + return containers[j].When().Before(containers[i].When()) +} + +func (history *History) Swap(i, j int) { + containers := *history + tmp := containers[i] + containers[i] = containers[j] + containers[j] = tmp +} + +func (history *History) Add(container *Container) { + *history = append(*history, container) + sort.Sort(history) +} diff --git a/runtime/networkdriver/bridge/driver.go b/runtime/networkdriver/bridge/driver.go new file mode 100644 index 0000000000..f7c3bc6b01 --- /dev/null +++ b/runtime/networkdriver/bridge/driver.go @@ -0,0 +1,470 @@ +package bridge + +import ( + "fmt" + "github.com/dotcloud/docker/engine" + "github.com/dotcloud/docker/pkg/iptables" + "github.com/dotcloud/docker/pkg/netlink" + "github.com/dotcloud/docker/runtime/networkdriver" + "github.com/dotcloud/docker/runtime/networkdriver/ipallocator" + "github.com/dotcloud/docker/runtime/networkdriver/portallocator" + "github.com/dotcloud/docker/runtime/networkdriver/portmapper" + "github.com/dotcloud/docker/utils" + "io/ioutil" + "log" + "net" + "strings" +) + +const ( + DefaultNetworkBridge = "docker0" +) + +// Network interface represents the networking stack of a container +type networkInterface struct { + IP net.IP + PortMappings []net.Addr // there are mappings to the host interfaces 
+} + +var ( + addrs = []string{ + // Here we don't follow the convention of using the 1st IP of the range for the gateway. + // This is to use the same gateway IPs as the /24 ranges, which predate the /16 ranges. + // In theory this shouldn't matter - in practice there's bound to be a few scripts relying + // on the internal addressing or other stupid things like that. + // The shouldn't, but hey, let's not break them unless we really have to. + "172.17.42.1/16", // Don't use 172.16.0.0/16, it conflicts with EC2 DNS 172.16.0.23 + "10.0.42.1/16", // Don't even try using the entire /8, that's too intrusive + "10.1.42.1/16", + "10.42.42.1/16", + "172.16.42.1/24", + "172.16.43.1/24", + "172.16.44.1/24", + "10.0.42.1/24", + "10.0.43.1/24", + "192.168.42.1/24", + "192.168.43.1/24", + "192.168.44.1/24", + } + + bridgeIface string + bridgeNetwork *net.IPNet + + defaultBindingIP = net.ParseIP("0.0.0.0") + currentInterfaces = make(map[string]*networkInterface) +) + +func InitDriver(job *engine.Job) engine.Status { + var ( + network *net.IPNet + enableIPTables = job.GetenvBool("EnableIptables") + icc = job.GetenvBool("InterContainerCommunication") + ipForward = job.GetenvBool("EnableIpForward") + bridgeIP = job.Getenv("BridgeIP") + ) + + if defaultIP := job.Getenv("DefaultBindingIP"); defaultIP != "" { + defaultBindingIP = net.ParseIP(defaultIP) + } + + bridgeIface = job.Getenv("BridgeIface") + if bridgeIface == "" { + bridgeIface = DefaultNetworkBridge + } + + addr, err := networkdriver.GetIfaceAddr(bridgeIface) + if err != nil { + // If the iface is not found, try to create it + job.Logf("creating new bridge for %s", bridgeIface) + if err := createBridge(bridgeIP); err != nil { + job.Error(err) + return engine.StatusErr + } + + job.Logf("getting iface addr") + addr, err = networkdriver.GetIfaceAddr(bridgeIface) + if err != nil { + job.Error(err) + return engine.StatusErr + } + network = addr.(*net.IPNet) + } else { + network = addr.(*net.IPNet) + // validate that the bridge 
ip matches the ip specified by BridgeIP + if bridgeIP != "" { + if !network.IP.Equal(net.ParseIP(bridgeIP)) { + return job.Errorf("bridge ip (%s) does not match existing bridge configuration %s", network.IP, bridgeIP) + } + } + } + + // Configure iptables for link support + if enableIPTables { + if err := setupIPTables(addr, icc); err != nil { + job.Error(err) + return engine.StatusErr + } + } + + if ipForward { + // Enable IPv4 forwarding + if err := ioutil.WriteFile("/proc/sys/net/ipv4/ip_forward", []byte{'1', '\n'}, 0644); err != nil { + job.Logf("WARNING: unable to enable IPv4 forwarding: %s\n", err) + } + } + + // We can always try removing the iptables + if err := iptables.RemoveExistingChain("DOCKER"); err != nil { + job.Error(err) + return engine.StatusErr + } + + if enableIPTables { + chain, err := iptables.NewChain("DOCKER", bridgeIface) + if err != nil { + job.Error(err) + return engine.StatusErr + } + portmapper.SetIptablesChain(chain) + } + + bridgeNetwork = network + + // https://github.com/dotcloud/docker/issues/2768 + job.Eng.Hack_SetGlobalVar("httpapi.bridgeIP", bridgeNetwork.IP) + + for name, f := range map[string]engine.Handler{ + "allocate_interface": Allocate, + "release_interface": Release, + "allocate_port": AllocatePort, + "link": LinkContainers, + } { + if err := job.Eng.Register(name, f); err != nil { + job.Error(err) + return engine.StatusErr + } + } + return engine.StatusOK +} + +func setupIPTables(addr net.Addr, icc bool) error { + // Enable NAT + natArgs := []string{"POSTROUTING", "-t", "nat", "-s", addr.String(), "!", "-d", addr.String(), "-j", "MASQUERADE"} + + if !iptables.Exists(natArgs...) 
{ + if output, err := iptables.Raw(append([]string{"-I"}, natArgs...)...); err != nil { + return fmt.Errorf("Unable to enable network bridge NAT: %s", err) + } else if len(output) != 0 { + return fmt.Errorf("Error iptables postrouting: %s", output) + } + } + + var ( + args = []string{"FORWARD", "-i", bridgeIface, "-o", bridgeIface, "-j"} + acceptArgs = append(args, "ACCEPT") + dropArgs = append(args, "DROP") + ) + + if !icc { + iptables.Raw(append([]string{"-D"}, acceptArgs...)...) + + if !iptables.Exists(dropArgs...) { + utils.Debugf("Disable inter-container communication") + if output, err := iptables.Raw(append([]string{"-I"}, dropArgs...)...); err != nil { + return fmt.Errorf("Unable to prevent intercontainer communication: %s", err) + } else if len(output) != 0 { + return fmt.Errorf("Error disabling intercontainer communication: %s", output) + } + } + } else { + iptables.Raw(append([]string{"-D"}, dropArgs...)...) + + if !iptables.Exists(acceptArgs...) { + utils.Debugf("Enable inter-container communication") + if output, err := iptables.Raw(append([]string{"-I"}, acceptArgs...)...); err != nil { + return fmt.Errorf("Unable to allow intercontainer communication: %s", err) + } else if len(output) != 0 { + return fmt.Errorf("Error enabling intercontainer communication: %s", output) + } + } + } + + // Accept all non-intercontainer outgoing packets + outgoingArgs := []string{"FORWARD", "-i", bridgeIface, "!", "-o", bridgeIface, "-j", "ACCEPT"} + if !iptables.Exists(outgoingArgs...) { + if output, err := iptables.Raw(append([]string{"-I"}, outgoingArgs...)...); err != nil { + return fmt.Errorf("Unable to allow outgoing packets: %s", err) + } else if len(output) != 0 { + return fmt.Errorf("Error iptables allow outgoing: %s", output) + } + } + + // Accept incoming packets for existing connections + existingArgs := []string{"FORWARD", "-o", bridgeIface, "-m", "conntrack", "--ctstate", "RELATED,ESTABLISHED", "-j", "ACCEPT"} + + if !iptables.Exists(existingArgs...) 
{ + if output, err := iptables.Raw(append([]string{"-I"}, existingArgs...)...); err != nil { + return fmt.Errorf("Unable to allow incoming packets: %s", err) + } else if len(output) != 0 { + return fmt.Errorf("Error iptables allow incoming: %s", output) + } + } + return nil +} + +// CreateBridgeIface creates a network bridge interface on the host system with the name `ifaceName`, +// and attempts to configure it with an address which doesn't conflict with any other interface on the host. +// If it can't find an address which doesn't conflict, it will return an error. +func createBridge(bridgeIP string) error { + nameservers := []string{} + resolvConf, _ := utils.GetResolvConf() + // we don't check for an error here, because we don't really care + // if we can't read /etc/resolv.conf. So instead we skip the append + // if resolvConf is nil. It either doesn't exist, or we can't read it + // for some reason. + if resolvConf != nil { + nameservers = append(nameservers, utils.GetNameserversAsCIDR(resolvConf)...) + } + + var ifaceAddr string + if len(bridgeIP) != 0 { + _, _, err := net.ParseCIDR(bridgeIP) + if err != nil { + return err + } + ifaceAddr = bridgeIP + } else { + for _, addr := range addrs { + _, dockerNetwork, err := net.ParseCIDR(addr) + if err != nil { + return err + } + if err := networkdriver.CheckNameserverOverlaps(nameservers, dockerNetwork); err == nil { + if err := networkdriver.CheckRouteOverlaps(dockerNetwork); err == nil { + ifaceAddr = addr + break + } else { + utils.Debugf("%s %s", addr, err) + } + } + } + } + + if ifaceAddr == "" { + return fmt.Errorf("Could not find a free IP address range for interface '%s'. 
Please configure its address manually and run 'docker -b %s'", bridgeIface, bridgeIface) + } + utils.Debugf("Creating bridge %s with network %s", bridgeIface, ifaceAddr) + + if err := createBridgeIface(bridgeIface); err != nil { + return err + } + + iface, err := net.InterfaceByName(bridgeIface) + if err != nil { + return err + } + + ipAddr, ipNet, err := net.ParseCIDR(ifaceAddr) + if err != nil { + return err + } + + if netlink.NetworkLinkAddIp(iface, ipAddr, ipNet); err != nil { + return fmt.Errorf("Unable to add private network: %s", err) + } + if err := netlink.NetworkLinkUp(iface); err != nil { + return fmt.Errorf("Unable to start network bridge: %s", err) + } + return nil +} + +func createBridgeIface(name string) error { + kv, err := utils.GetKernelVersion() + // only set the bridge's mac address if the kernel version is > 3.3 + // before that it was not supported + setBridgeMacAddr := err == nil && (kv.Kernel >= 3 && kv.Major >= 3) + utils.Debugf("setting bridge mac address = %v", setBridgeMacAddr) + return netlink.CreateBridge(name, setBridgeMacAddr) +} + +// Allocate a network interface +func Allocate(job *engine.Job) engine.Status { + var ( + ip *net.IP + err error + id = job.Args[0] + requestedIP = net.ParseIP(job.Getenv("RequestedIP")) + ) + + if requestedIP != nil { + ip, err = ipallocator.RequestIP(bridgeNetwork, &requestedIP) + } else { + ip, err = ipallocator.RequestIP(bridgeNetwork, nil) + } + if err != nil { + job.Error(err) + return engine.StatusErr + } + + out := engine.Env{} + out.Set("IP", ip.String()) + out.Set("Mask", bridgeNetwork.Mask.String()) + out.Set("Gateway", bridgeNetwork.IP.String()) + out.Set("Bridge", bridgeIface) + + size, _ := bridgeNetwork.Mask.Size() + out.SetInt("IPPrefixLen", size) + + currentInterfaces[id] = &networkInterface{ + IP: *ip, + } + + out.WriteTo(job.Stdout) + + return engine.StatusOK +} + +// release an interface for a select ip +func Release(job *engine.Job) engine.Status { + var ( + id = job.Args[0] + 
containerInterface = currentInterfaces[id] + ip net.IP + port int + proto string + ) + + if containerInterface == nil { + return job.Errorf("No network information to release for %s", id) + } + + for _, nat := range containerInterface.PortMappings { + if err := portmapper.Unmap(nat); err != nil { + log.Printf("Unable to unmap port %s: %s", nat, err) + } + + // this is host mappings + switch a := nat.(type) { + case *net.TCPAddr: + proto = "tcp" + ip = a.IP + port = a.Port + case *net.UDPAddr: + proto = "udp" + ip = a.IP + port = a.Port + } + + if err := portallocator.ReleasePort(ip, proto, port); err != nil { + log.Printf("Unable to release port %s", nat) + } + } + + if err := ipallocator.ReleaseIP(bridgeNetwork, &containerInterface.IP); err != nil { + log.Printf("Unable to release ip %s\n", err) + } + return engine.StatusOK +} + +// Allocate an external port and map it to the interface +func AllocatePort(job *engine.Job) engine.Status { + var ( + err error + + ip = defaultBindingIP + id = job.Args[0] + hostIP = job.Getenv("HostIP") + hostPort = job.GetenvInt("HostPort") + containerPort = job.GetenvInt("ContainerPort") + proto = job.Getenv("Proto") + network = currentInterfaces[id] + ) + + if hostIP != "" { + ip = net.ParseIP(hostIP) + } + + // host ip, proto, and host port + hostPort, err = portallocator.RequestPort(ip, proto, hostPort) + if err != nil { + job.Error(err) + return engine.StatusErr + } + + var ( + container net.Addr + host net.Addr + ) + + if proto == "tcp" { + host = &net.TCPAddr{IP: ip, Port: hostPort} + container = &net.TCPAddr{IP: network.IP, Port: containerPort} + } else { + host = &net.UDPAddr{IP: ip, Port: hostPort} + container = &net.UDPAddr{IP: network.IP, Port: containerPort} + } + + if err := portmapper.Map(container, ip, hostPort); err != nil { + portallocator.ReleasePort(ip, proto, hostPort) + + job.Error(err) + return engine.StatusErr + } + network.PortMappings = append(network.PortMappings, host) + + out := engine.Env{} + 
out.Set("HostIP", ip.String()) + out.SetInt("HostPort", hostPort) + + if _, err := out.WriteTo(job.Stdout); err != nil { + job.Error(err) + return engine.StatusErr + } + return engine.StatusOK +} + +func LinkContainers(job *engine.Job) engine.Status { + var ( + action = job.Args[0] + childIP = job.Getenv("ChildIP") + parentIP = job.Getenv("ParentIP") + ignoreErrors = job.GetenvBool("IgnoreErrors") + ports = job.GetenvList("Ports") + ) + split := func(p string) (string, string) { + parts := strings.Split(p, "/") + return parts[0], parts[1] + } + + for _, p := range ports { + port, proto := split(p) + if output, err := iptables.Raw(action, "FORWARD", + "-i", bridgeIface, "-o", bridgeIface, + "-p", proto, + "-s", parentIP, + "--dport", port, + "-d", childIP, + "-j", "ACCEPT"); !ignoreErrors && err != nil { + job.Error(err) + return engine.StatusErr + } else if len(output) != 0 { + job.Errorf("Error toggle iptables forward: %s", output) + return engine.StatusErr + } + + if output, err := iptables.Raw(action, "FORWARD", + "-i", bridgeIface, "-o", bridgeIface, + "-p", proto, + "-s", childIP, + "--sport", port, + "-d", parentIP, + "-j", "ACCEPT"); !ignoreErrors && err != nil { + job.Error(err) + return engine.StatusErr + } else if len(output) != 0 { + job.Errorf("Error toggle iptables forward: %s", output) + return engine.StatusErr + } + } + return engine.StatusOK +} diff --git a/runtime/networkdriver/ipallocator/allocator.go b/runtime/networkdriver/ipallocator/allocator.go new file mode 100644 index 0000000000..70a7028bbe --- /dev/null +++ b/runtime/networkdriver/ipallocator/allocator.go @@ -0,0 +1,159 @@ +package ipallocator + +import ( + "encoding/binary" + "errors" + "github.com/dotcloud/docker/pkg/collections" + "github.com/dotcloud/docker/runtime/networkdriver" + "net" + "sync" +) + +type networkSet map[string]*collections.OrderedIntSet + +var ( + ErrNoAvailableIPs = errors.New("no available ip addresses on network") + ErrIPAlreadyAllocated = errors.New("ip already 
allocated") +) + +var ( + lock = sync.Mutex{} + allocatedIPs = networkSet{} + availableIPS = networkSet{} +) + +// RequestIP requests an available ip from the given network. It +// will return the next available ip if the ip provided is nil. If the +// ip provided is not nil it will validate that the provided ip is available +// for use or return an error +func RequestIP(address *net.IPNet, ip *net.IP) (*net.IP, error) { + lock.Lock() + defer lock.Unlock() + + checkAddress(address) + + if ip == nil { + next, err := getNextIp(address) + if err != nil { + return nil, err + } + return next, nil + } + + if err := registerIP(address, ip); err != nil { + return nil, err + } + return ip, nil +} + +// ReleaseIP adds the provided ip back into the pool of +// available ips to be returned for use. +func ReleaseIP(address *net.IPNet, ip *net.IP) error { + lock.Lock() + defer lock.Unlock() + + checkAddress(address) + + var ( + existing = allocatedIPs[address.String()] + available = availableIPS[address.String()] + pos = getPosition(address, ip) + ) + + existing.Remove(int(pos)) + available.Push(int(pos)) + + return nil +} + +// convert the ip into the position in the subnet. Only +// position are saved in the set +func getPosition(address *net.IPNet, ip *net.IP) int32 { + var ( + first, _ = networkdriver.NetworkRange(address) + base = ipToInt(&first) + i = ipToInt(ip) + ) + return i - base +} + +// return an available ip if one is currently available. 
If not, +// return the next available ip for the nextwork +func getNextIp(address *net.IPNet) (*net.IP, error) { + var ( + ownIP = ipToInt(&address.IP) + available = availableIPS[address.String()] + allocated = allocatedIPs[address.String()] + first, _ = networkdriver.NetworkRange(address) + base = ipToInt(&first) + size = int(networkdriver.NetworkSize(address.Mask)) + max = int32(size - 2) // size -1 for the broadcast address, -1 for the gateway address + pos = int32(available.Pop()) + ) + + // We pop and push the position not the ip + if pos != 0 { + ip := intToIP(int32(base + pos)) + allocated.Push(int(pos)) + + return ip, nil + } + + var ( + firstNetIP = address.IP.To4().Mask(address.Mask) + firstAsInt = ipToInt(&firstNetIP) + 1 + ) + + pos = int32(allocated.PullBack()) + for i := int32(0); i < max; i++ { + pos = pos%max + 1 + next := int32(base + pos) + + if next == ownIP || next == firstAsInt { + continue + } + + if !allocated.Exists(int(pos)) { + ip := intToIP(next) + allocated.Push(int(pos)) + return ip, nil + } + } + return nil, ErrNoAvailableIPs +} + +func registerIP(address *net.IPNet, ip *net.IP) error { + var ( + existing = allocatedIPs[address.String()] + available = availableIPS[address.String()] + pos = getPosition(address, ip) + ) + + if existing.Exists(int(pos)) { + return ErrIPAlreadyAllocated + } + available.Remove(int(pos)) + + return nil +} + +// Converts a 4 bytes IP into a 32 bit integer +func ipToInt(ip *net.IP) int32 { + return int32(binary.BigEndian.Uint32(ip.To4())) +} + +// Converts 32 bit integer into a 4 bytes IP address +func intToIP(n int32) *net.IP { + b := make([]byte, 4) + binary.BigEndian.PutUint32(b, uint32(n)) + ip := net.IP(b) + return &ip +} + +func checkAddress(address *net.IPNet) { + key := address.String() + if _, exists := allocatedIPs[key]; !exists { + allocatedIPs[key] = collections.NewOrderedIntSet() + availableIPS[key] = collections.NewOrderedIntSet() + } +} diff --git 
a/runtime/networkdriver/ipallocator/allocator_test.go b/runtime/networkdriver/ipallocator/allocator_test.go new file mode 100644 index 0000000000..5e9fcfc983 --- /dev/null +++ b/runtime/networkdriver/ipallocator/allocator_test.go @@ -0,0 +1,241 @@ +package ipallocator + +import ( + "fmt" + "net" + "testing" +) + +func reset() { + allocatedIPs = networkSet{} + availableIPS = networkSet{} +} + +func TestRequestNewIps(t *testing.T) { + defer reset() + network := &net.IPNet{ + IP: []byte{192, 168, 0, 1}, + Mask: []byte{255, 255, 255, 0}, + } + + for i := 2; i < 10; i++ { + ip, err := RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + + if expected := fmt.Sprintf("192.168.0.%d", i); ip.String() != expected { + t.Fatalf("Expected ip %s got %s", expected, ip.String()) + } + } +} + +func TestReleaseIp(t *testing.T) { + defer reset() + network := &net.IPNet{ + IP: []byte{192, 168, 0, 1}, + Mask: []byte{255, 255, 255, 0}, + } + + ip, err := RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + + if err := ReleaseIP(network, ip); err != nil { + t.Fatal(err) + } +} + +func TestGetReleasedIp(t *testing.T) { + defer reset() + network := &net.IPNet{ + IP: []byte{192, 168, 0, 1}, + Mask: []byte{255, 255, 255, 0}, + } + + ip, err := RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + + value := ip.String() + if err := ReleaseIP(network, ip); err != nil { + t.Fatal(err) + } + + ip, err = RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + + if ip.String() != value { + t.Fatalf("Expected to receive same ip %s got %s", value, ip.String()) + } +} + +func TestRequesetSpecificIp(t *testing.T) { + defer reset() + network := &net.IPNet{ + IP: []byte{192, 168, 0, 1}, + Mask: []byte{255, 255, 255, 0}, + } + + ip := net.ParseIP("192.168.1.5") + + if _, err := RequestIP(network, &ip); err != nil { + t.Fatal(err) + } +} + +func TestConversion(t *testing.T) { + ip := net.ParseIP("127.0.0.1") + i := ipToInt(&ip) + if i == 0 { + t.Fatal("converted to 
zero") + } + conv := intToIP(i) + if !ip.Equal(*conv) { + t.Error(conv.String()) + } +} + +func TestIPAllocator(t *testing.T) { + expectedIPs := []net.IP{ + 0: net.IPv4(127, 0, 0, 2), + 1: net.IPv4(127, 0, 0, 3), + 2: net.IPv4(127, 0, 0, 4), + 3: net.IPv4(127, 0, 0, 5), + 4: net.IPv4(127, 0, 0, 6), + } + + gwIP, n, _ := net.ParseCIDR("127.0.0.1/29") + network := &net.IPNet{IP: gwIP, Mask: n.Mask} + // Pool after initialisation (f = free, u = used) + // 2(f) - 3(f) - 4(f) - 5(f) - 6(f) + // ↑ + + // Check that we get 5 IPs, from 127.0.0.2–127.0.0.6, in that + // order. + for i := 0; i < 5; i++ { + ip, err := RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + + assertIPEquals(t, &expectedIPs[i], ip) + } + // Before loop begin + // 2(f) - 3(f) - 4(f) - 5(f) - 6(f) + // ↑ + + // After i = 0 + // 2(u) - 3(f) - 4(f) - 5(f) - 6(f) + // ↑ + + // After i = 1 + // 2(u) - 3(u) - 4(f) - 5(f) - 6(f) + // ↑ + + // After i = 2 + // 2(u) - 3(u) - 4(u) - 5(f) - 6(f) + // ↑ + + // After i = 3 + // 2(u) - 3(u) - 4(u) - 5(u) - 6(f) + // ↑ + + // After i = 4 + // 2(u) - 3(u) - 4(u) - 5(u) - 6(u) + // ↑ + + // Check that there are no more IPs + ip, err := RequestIP(network, nil) + if err == nil { + t.Fatalf("There shouldn't be any IP addresses at this point, got %s\n", ip) + } + + // Release some IPs in non-sequential order + if err := ReleaseIP(network, &expectedIPs[3]); err != nil { + t.Fatal(err) + } + // 2(u) - 3(u) - 4(u) - 5(f) - 6(u) + // ↑ + + if err := ReleaseIP(network, &expectedIPs[2]); err != nil { + t.Fatal(err) + } + // 2(u) - 3(u) - 4(f) - 5(f) - 6(u) + // ↑ + + if err := ReleaseIP(network, &expectedIPs[4]); err != nil { + t.Fatal(err) + } + // 2(u) - 3(u) - 4(f) - 5(f) - 6(f) + // ↑ + + // Make sure that IPs are reused in sequential order, starting + // with the first released IP + newIPs := make([]*net.IP, 3) + for i := 0; i < 3; i++ { + ip, err := RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + + newIPs[i] = ip + } + // Before loop begin + // 
2(u) - 3(u) - 4(f) - 5(f) - 6(f) + // ↑ + + // After i = 0 + // 2(u) - 3(u) - 4(f) - 5(u) - 6(f) + // ↑ + + // After i = 1 + // 2(u) - 3(u) - 4(f) - 5(u) - 6(u) + // ↑ + + // After i = 2 + // 2(u) - 3(u) - 4(u) - 5(u) - 6(u) + // ↑ + + // Reordered these because the new set will always return the + // lowest ips first and not in the order that they were released + assertIPEquals(t, &expectedIPs[2], newIPs[0]) + assertIPEquals(t, &expectedIPs[3], newIPs[1]) + assertIPEquals(t, &expectedIPs[4], newIPs[2]) + + _, err = RequestIP(network, nil) + if err == nil { + t.Fatal("There shouldn't be any IP addresses at this point") + } +} + +func TestAllocateFirstIP(t *testing.T) { + defer reset() + network := &net.IPNet{ + IP: []byte{192, 168, 0, 0}, + Mask: []byte{255, 255, 255, 0}, + } + + firstIP := network.IP.To4().Mask(network.Mask) + first := ipToInt(&firstIP) + 1 + + ip, err := RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + allocated := ipToInt(ip) + + if allocated == first { + t.Fatalf("allocated ip should not equal first ip: %d == %d", first, allocated) + } +} + +func assertIPEquals(t *testing.T, ip1, ip2 *net.IP) { + if !ip1.Equal(*ip2) { + t.Fatalf("Expected IP %s, got %s", ip1, ip2) + } +} diff --git a/runtime/networkdriver/network.go b/runtime/networkdriver/network.go new file mode 100644 index 0000000000..8dda789d2f --- /dev/null +++ b/runtime/networkdriver/network.go @@ -0,0 +1,10 @@ +package networkdriver + +import ( + "errors" +) + +var ( + ErrNetworkOverlapsWithNameservers = errors.New("requested network overlaps with nameserver") + ErrNetworkOverlaps = errors.New("requested network overlaps with existing network") +) diff --git a/runtime/networkdriver/network_test.go b/runtime/networkdriver/network_test.go new file mode 100644 index 0000000000..6224c2dffb --- /dev/null +++ b/runtime/networkdriver/network_test.go @@ -0,0 +1,190 @@ +package networkdriver + +import ( + "github.com/dotcloud/docker/pkg/netlink" + "net" + "testing" +) + +func 
TestNonOverlapingNameservers(t *testing.T) { + network := &net.IPNet{ + IP: []byte{192, 168, 0, 1}, + Mask: []byte{255, 255, 255, 0}, + } + nameservers := []string{ + "127.0.0.1/32", + } + + if err := CheckNameserverOverlaps(nameservers, network); err != nil { + t.Fatal(err) + } +} + +func TestOverlapingNameservers(t *testing.T) { + network := &net.IPNet{ + IP: []byte{192, 168, 0, 1}, + Mask: []byte{255, 255, 255, 0}, + } + nameservers := []string{ + "192.168.0.1/32", + } + + if err := CheckNameserverOverlaps(nameservers, network); err == nil { + t.Fatalf("Expected error %s got %s", ErrNetworkOverlapsWithNameservers, err) + } +} + +func TestCheckRouteOverlaps(t *testing.T) { + orig := networkGetRoutesFct + defer func() { + networkGetRoutesFct = orig + }() + networkGetRoutesFct = func() ([]netlink.Route, error) { + routesData := []string{"10.0.2.0/32", "10.0.3.0/24", "10.0.42.0/24", "172.16.42.0/24", "192.168.142.0/24"} + + routes := []netlink.Route{} + for _, addr := range routesData { + _, netX, _ := net.ParseCIDR(addr) + routes = append(routes, netlink.Route{IPNet: netX}) + } + return routes, nil + } + + _, netX, _ := net.ParseCIDR("172.16.0.1/24") + if err := CheckRouteOverlaps(netX); err != nil { + t.Fatal(err) + } + + _, netX, _ = net.ParseCIDR("10.0.2.0/24") + if err := CheckRouteOverlaps(netX); err == nil { + t.Fatalf("10.0.2.0/24 and 10.0.2.0 should overlap but it doesn't") + } +} + +func TestCheckNameserverOverlaps(t *testing.T) { + nameservers := []string{"10.0.2.3/32", "192.168.102.1/32"} + + _, netX, _ := net.ParseCIDR("10.0.2.3/32") + + if err := CheckNameserverOverlaps(nameservers, netX); err == nil { + t.Fatalf("%s should overlap 10.0.2.3/32 but doesn't", netX) + } + + _, netX, _ = net.ParseCIDR("192.168.102.2/32") + + if err := CheckNameserverOverlaps(nameservers, netX); err != nil { + t.Fatalf("%s should not overlap %v but it does", netX, nameservers) + } +} + +func AssertOverlap(CIDRx string, CIDRy string, t *testing.T) { + _, netX, _ := 
net.ParseCIDR(CIDRx) + _, netY, _ := net.ParseCIDR(CIDRy) + if !NetworkOverlaps(netX, netY) { + t.Errorf("%v and %v should overlap", netX, netY) + } +} + +func AssertNoOverlap(CIDRx string, CIDRy string, t *testing.T) { + _, netX, _ := net.ParseCIDR(CIDRx) + _, netY, _ := net.ParseCIDR(CIDRy) + if NetworkOverlaps(netX, netY) { + t.Errorf("%v and %v should not overlap", netX, netY) + } +} + +func TestNetworkOverlaps(t *testing.T) { + //netY starts at same IP and ends within netX + AssertOverlap("172.16.0.1/24", "172.16.0.1/25", t) + //netY starts within netX and ends at same IP + AssertOverlap("172.16.0.1/24", "172.16.0.128/25", t) + //netY starts and ends within netX + AssertOverlap("172.16.0.1/24", "172.16.0.64/25", t) + //netY starts at same IP and ends outside of netX + AssertOverlap("172.16.0.1/24", "172.16.0.1/23", t) + //netY starts before and ends at same IP of netX + AssertOverlap("172.16.1.1/24", "172.16.0.1/23", t) + //netY starts before and ends outside of netX + AssertOverlap("172.16.1.1/24", "172.16.0.1/22", t) + //netY starts and ends before netX + AssertNoOverlap("172.16.1.1/25", "172.16.0.1/24", t) + //netX starts and ends before netY + AssertNoOverlap("172.16.1.1/25", "172.16.2.1/24", t) +} + +func TestNetworkRange(t *testing.T) { + // Simple class C test + _, network, _ := net.ParseCIDR("192.168.0.1/24") + first, last := NetworkRange(network) + if !first.Equal(net.ParseIP("192.168.0.0")) { + t.Error(first.String()) + } + if !last.Equal(net.ParseIP("192.168.0.255")) { + t.Error(last.String()) + } + if size := NetworkSize(network.Mask); size != 256 { + t.Error(size) + } + + // Class A test + _, network, _ = net.ParseCIDR("10.0.0.1/8") + first, last = NetworkRange(network) + if !first.Equal(net.ParseIP("10.0.0.0")) { + t.Error(first.String()) + } + if !last.Equal(net.ParseIP("10.255.255.255")) { + t.Error(last.String()) + } + if size := NetworkSize(network.Mask); size != 16777216 { + t.Error(size) + } + + // Class A, random IP address + _, network, _ 
= net.ParseCIDR("10.1.2.3/8") + first, last = NetworkRange(network) + if !first.Equal(net.ParseIP("10.0.0.0")) { + t.Error(first.String()) + } + if !last.Equal(net.ParseIP("10.255.255.255")) { + t.Error(last.String()) + } + + // 32bit mask + _, network, _ = net.ParseCIDR("10.1.2.3/32") + first, last = NetworkRange(network) + if !first.Equal(net.ParseIP("10.1.2.3")) { + t.Error(first.String()) + } + if !last.Equal(net.ParseIP("10.1.2.3")) { + t.Error(last.String()) + } + if size := NetworkSize(network.Mask); size != 1 { + t.Error(size) + } + + // 31bit mask + _, network, _ = net.ParseCIDR("10.1.2.3/31") + first, last = NetworkRange(network) + if !first.Equal(net.ParseIP("10.1.2.2")) { + t.Error(first.String()) + } + if !last.Equal(net.ParseIP("10.1.2.3")) { + t.Error(last.String()) + } + if size := NetworkSize(network.Mask); size != 2 { + t.Error(size) + } + + // 26bit mask + _, network, _ = net.ParseCIDR("10.1.2.3/26") + first, last = NetworkRange(network) + if !first.Equal(net.ParseIP("10.1.2.0")) { + t.Error(first.String()) + } + if !last.Equal(net.ParseIP("10.1.2.63")) { + t.Error(last.String()) + } + if size := NetworkSize(network.Mask); size != 64 { + t.Error(size) + } +} diff --git a/runtime/networkdriver/portallocator/portallocator.go b/runtime/networkdriver/portallocator/portallocator.go new file mode 100644 index 0000000000..9ecd447116 --- /dev/null +++ b/runtime/networkdriver/portallocator/portallocator.go @@ -0,0 +1,188 @@ +package portallocator + +import ( + "errors" + "github.com/dotcloud/docker/pkg/collections" + "net" + "sync" +) + +const ( + BeginPortRange = 49153 + EndPortRange = 65535 +) + +type ( + portMappings map[string]*collections.OrderedIntSet + ipMapping map[string]portMappings +) + +var ( + ErrAllPortsAllocated = errors.New("all ports are allocated") + ErrPortAlreadyAllocated = errors.New("port has already been allocated") + ErrUnknownProtocol = errors.New("unknown protocol") +) + +var ( + currentDynamicPort = map[string]int{ + "tcp": 
BeginPortRange - 1, + "udp": BeginPortRange - 1, + } + defaultIP = net.ParseIP("0.0.0.0") + defaultAllocatedPorts = portMappings{} + otherAllocatedPorts = ipMapping{} + lock = sync.Mutex{} +) + +func init() { + defaultAllocatedPorts["tcp"] = collections.NewOrderedIntSet() + defaultAllocatedPorts["udp"] = collections.NewOrderedIntSet() +} + +// RequestPort returns an available port if the port is 0 +// If the provided port is not 0 then it will be checked if +// it is available for allocation +func RequestPort(ip net.IP, proto string, port int) (int, error) { + lock.Lock() + defer lock.Unlock() + + if err := validateProtocol(proto); err != nil { + return 0, err + } + + // If the user requested a specific port to be allocated + if port > 0 { + if err := registerSetPort(ip, proto, port); err != nil { + return 0, err + } + return port, nil + } + return registerDynamicPort(ip, proto) +} + +// ReleasePort will return the provided port back into the +// pool for reuse +func ReleasePort(ip net.IP, proto string, port int) error { + lock.Lock() + defer lock.Unlock() + + if err := validateProtocol(proto); err != nil { + return err + } + + allocated := defaultAllocatedPorts[proto] + allocated.Remove(port) + + if !equalsDefault(ip) { + registerIP(ip) + + // Remove the port for the specific ip address + allocated = otherAllocatedPorts[ip.String()][proto] + allocated.Remove(port) + } + return nil +} + +func ReleaseAll() error { + lock.Lock() + defer lock.Unlock() + + currentDynamicPort["tcp"] = BeginPortRange - 1 + currentDynamicPort["udp"] = BeginPortRange - 1 + + defaultAllocatedPorts = portMappings{} + defaultAllocatedPorts["tcp"] = collections.NewOrderedIntSet() + defaultAllocatedPorts["udp"] = collections.NewOrderedIntSet() + + otherAllocatedPorts = ipMapping{} + + return nil +} + +func registerDynamicPort(ip net.IP, proto string) (int, error) { + + if !equalsDefault(ip) { + registerIP(ip) + + ipAllocated := otherAllocatedPorts[ip.String()][proto] + + port, err := 
findNextPort(proto, ipAllocated) + if err != nil { + return 0, err + } + ipAllocated.Push(port) + return port, nil + + } else { + + allocated := defaultAllocatedPorts[proto] + + port, err := findNextPort(proto, allocated) + if err != nil { + return 0, err + } + allocated.Push(port) + return port, nil + } +} + +func registerSetPort(ip net.IP, proto string, port int) error { + allocated := defaultAllocatedPorts[proto] + if allocated.Exists(port) { + return ErrPortAlreadyAllocated + } + + if !equalsDefault(ip) { + registerIP(ip) + + ipAllocated := otherAllocatedPorts[ip.String()][proto] + if ipAllocated.Exists(port) { + return ErrPortAlreadyAllocated + } + ipAllocated.Push(port) + } else { + allocated.Push(port) + } + return nil +} + +func equalsDefault(ip net.IP) bool { + return ip == nil || ip.Equal(defaultIP) +} + +func findNextPort(proto string, allocated *collections.OrderedIntSet) (int, error) { + port := nextPort(proto) + startSearchPort := port + for allocated.Exists(port) { + port = nextPort(proto) + if startSearchPort == port { + return 0, ErrAllPortsAllocated + } + } + return port, nil +} + +func nextPort(proto string) int { + c := currentDynamicPort[proto] + 1 + if c > EndPortRange { + c = BeginPortRange + } + currentDynamicPort[proto] = c + return c +} + +func registerIP(ip net.IP) { + if _, exists := otherAllocatedPorts[ip.String()]; !exists { + otherAllocatedPorts[ip.String()] = portMappings{ + "tcp": collections.NewOrderedIntSet(), + "udp": collections.NewOrderedIntSet(), + } + } +} + +func validateProtocol(proto string) error { + if _, exists := defaultAllocatedPorts[proto]; !exists { + return ErrUnknownProtocol + } + return nil +} diff --git a/runtime/networkdriver/portallocator/portallocator_test.go b/runtime/networkdriver/portallocator/portallocator_test.go new file mode 100644 index 0000000000..5a4765ddd4 --- /dev/null +++ b/runtime/networkdriver/portallocator/portallocator_test.go @@ -0,0 +1,213 @@ +package portallocator + +import ( + "net" + 
"testing" +) + +func reset() { + ReleaseAll() +} + +func TestRequestNewPort(t *testing.T) { + defer reset() + + port, err := RequestPort(defaultIP, "tcp", 0) + if err != nil { + t.Fatal(err) + } + + if expected := BeginPortRange; port != expected { + t.Fatalf("Expected port %d got %d", expected, port) + } +} + +func TestRequestSpecificPort(t *testing.T) { + defer reset() + + port, err := RequestPort(defaultIP, "tcp", 5000) + if err != nil { + t.Fatal(err) + } + if port != 5000 { + t.Fatalf("Expected port 5000 got %d", port) + } +} + +func TestReleasePort(t *testing.T) { + defer reset() + + port, err := RequestPort(defaultIP, "tcp", 5000) + if err != nil { + t.Fatal(err) + } + if port != 5000 { + t.Fatalf("Expected port 5000 got %d", port) + } + + if err := ReleasePort(defaultIP, "tcp", 5000); err != nil { + t.Fatal(err) + } +} + +func TestReuseReleasedPort(t *testing.T) { + defer reset() + + port, err := RequestPort(defaultIP, "tcp", 5000) + if err != nil { + t.Fatal(err) + } + if port != 5000 { + t.Fatalf("Expected port 5000 got %d", port) + } + + if err := ReleasePort(defaultIP, "tcp", 5000); err != nil { + t.Fatal(err) + } + + port, err = RequestPort(defaultIP, "tcp", 5000) + if err != nil { + t.Fatal(err) + } +} + +func TestReleaseUnreadledPort(t *testing.T) { + defer reset() + + port, err := RequestPort(defaultIP, "tcp", 5000) + if err != nil { + t.Fatal(err) + } + if port != 5000 { + t.Fatalf("Expected port 5000 got %d", port) + } + + port, err = RequestPort(defaultIP, "tcp", 5000) + if err != ErrPortAlreadyAllocated { + t.Fatalf("Expected error %s got %s", ErrPortAlreadyAllocated, err) + } +} + +func TestUnknowProtocol(t *testing.T) { + defer reset() + + if _, err := RequestPort(defaultIP, "tcpp", 0); err != ErrUnknownProtocol { + t.Fatalf("Expected error %s got %s", ErrUnknownProtocol, err) + } +} + +func TestAllocateAllPorts(t *testing.T) { + defer reset() + + for i := 0; i <= EndPortRange-BeginPortRange; i++ { + port, err := RequestPort(defaultIP, "tcp", 
0) + if err != nil { + t.Fatal(err) + } + + if expected := BeginPortRange + i; port != expected { + t.Fatalf("Expected port %d got %d", expected, port) + } + } + + if _, err := RequestPort(defaultIP, "tcp", 0); err != ErrAllPortsAllocated { + t.Fatalf("Expected error %s got %s", ErrAllPortsAllocated, err) + } + + _, err := RequestPort(defaultIP, "udp", 0) + if err != nil { + t.Fatal(err) + } + + // release a port in the middle and ensure we get another tcp port + port := BeginPortRange + 5 + if err := ReleasePort(defaultIP, "tcp", port); err != nil { + t.Fatal(err) + } + newPort, err := RequestPort(defaultIP, "tcp", 0) + if err != nil { + t.Fatal(err) + } + if newPort != port { + t.Fatalf("Expected port %d got %d", port, newPort) + } +} + +func BenchmarkAllocatePorts(b *testing.B) { + defer reset() + + for i := 0; i < b.N; i++ { + for i := 0; i <= EndPortRange-BeginPortRange; i++ { + port, err := RequestPort(defaultIP, "tcp", 0) + if err != nil { + b.Fatal(err) + } + + if expected := BeginPortRange + i; port != expected { + b.Fatalf("Expected port %d got %d", expected, port) + } + } + reset() + } +} + +func TestPortAllocation(t *testing.T) { + defer reset() + + ip := net.ParseIP("192.168.0.1") + ip2 := net.ParseIP("192.168.0.2") + if port, err := RequestPort(ip, "tcp", 80); err != nil { + t.Fatal(err) + } else if port != 80 { + t.Fatalf("Acquire(80) should return 80, not %d", port) + } + port, err := RequestPort(ip, "tcp", 0) + if err != nil { + t.Fatal(err) + } + if port <= 0 { + t.Fatalf("Acquire(0) should return a non-zero port") + } + + if _, err := RequestPort(ip, "tcp", port); err == nil { + t.Fatalf("Acquiring a port already in use should return an error") + } + + if newPort, err := RequestPort(ip, "tcp", 0); err != nil { + t.Fatal(err) + } else if newPort == port { + t.Fatalf("Acquire(0) allocated the same port twice: %d", port) + } + + if _, err := RequestPort(ip, "tcp", 80); err == nil { + t.Fatalf("Acquiring a port already in use should return an error") 
+ } + if _, err := RequestPort(ip2, "tcp", 80); err != nil { + t.Fatalf("It should be possible to allocate the same port on a different interface") + } + if _, err := RequestPort(ip2, "tcp", 80); err == nil { + t.Fatalf("Acquiring a port already in use should return an error") + } + if err := ReleasePort(ip, "tcp", 80); err != nil { + t.Fatal(err) + } + if _, err := RequestPort(ip, "tcp", 80); err != nil { + t.Fatal(err) + } + + port, err = RequestPort(ip, "tcp", 0) + if err != nil { + t.Fatal(err) + } + port2, err := RequestPort(ip, "tcp", port+1) + if err != nil { + t.Fatal(err) + } + port3, err := RequestPort(ip, "tcp", 0) + if err != nil { + t.Fatal(err) + } + if port3 == port2 { + t.Fatal("Requesting a dynamic port should never allocate a used port") + } +} diff --git a/runtime/networkdriver/portmapper/mapper.go b/runtime/networkdriver/portmapper/mapper.go new file mode 100644 index 0000000000..e29959a245 --- /dev/null +++ b/runtime/networkdriver/portmapper/mapper.go @@ -0,0 +1,131 @@ +package portmapper + +import ( + "errors" + "fmt" + "github.com/dotcloud/docker/pkg/iptables" + "github.com/dotcloud/docker/pkg/proxy" + "net" + "sync" +) + +type mapping struct { + proto string + userlandProxy proxy.Proxy + host net.Addr + container net.Addr +} + +var ( + chain *iptables.Chain + lock sync.Mutex + + // udp:ip:port + currentMappings = make(map[string]*mapping) + newProxy = proxy.NewProxy +) + +var ( + ErrUnknownBackendAddressType = errors.New("unknown container address type not supported") + ErrPortMappedForIP = errors.New("port is already mapped to ip") + ErrPortNotMapped = errors.New("port is not mapped") +) + +func SetIptablesChain(c *iptables.Chain) { + chain = c +} + +func Map(container net.Addr, hostIP net.IP, hostPort int) error { + lock.Lock() + defer lock.Unlock() + + var m *mapping + switch container.(type) { + case *net.TCPAddr: + m = &mapping{ + proto: "tcp", + host: &net.TCPAddr{IP: hostIP, Port: hostPort}, + container: container, + } + case 
*net.UDPAddr: + m = &mapping{ + proto: "udp", + host: &net.UDPAddr{IP: hostIP, Port: hostPort}, + container: container, + } + default: + return ErrUnknownBackendAddressType + } + + key := getKey(m.host) + if _, exists := currentMappings[key]; exists { + return ErrPortMappedForIP + } + + containerIP, containerPort := getIPAndPort(m.container) + if err := forward(iptables.Add, m.proto, hostIP, hostPort, containerIP.String(), containerPort); err != nil { + return err + } + + p, err := newProxy(m.host, m.container) + if err != nil { + // need to undo the iptables rules before we reutrn + forward(iptables.Delete, m.proto, hostIP, hostPort, containerIP.String(), containerPort) + return err + } + + m.userlandProxy = p + currentMappings[key] = m + + go p.Run() + + return nil +} + +func Unmap(host net.Addr) error { + lock.Lock() + defer lock.Unlock() + + key := getKey(host) + data, exists := currentMappings[key] + if !exists { + return ErrPortNotMapped + } + + data.userlandProxy.Close() + delete(currentMappings, key) + + containerIP, containerPort := getIPAndPort(data.container) + hostIP, hostPort := getIPAndPort(data.host) + if err := forward(iptables.Delete, data.proto, hostIP, hostPort, containerIP.String(), containerPort); err != nil { + return err + } + return nil +} + +func getKey(a net.Addr) string { + switch t := a.(type) { + case *net.TCPAddr: + return fmt.Sprintf("%s:%d/%s", t.IP.String(), t.Port, "tcp") + case *net.UDPAddr: + return fmt.Sprintf("%s:%d/%s", t.IP.String(), t.Port, "udp") + } + return "" +} + +func getIPAndPort(a net.Addr) (net.IP, int) { + switch t := a.(type) { + case *net.TCPAddr: + return t.IP, t.Port + case *net.UDPAddr: + return t.IP, t.Port + } + return nil, 0 +} + +func forward(action iptables.Action, proto string, sourceIP net.IP, sourcePort int, containerIP string, containerPort int) error { + if chain == nil { + return nil + } + return chain.Forward(action, sourceIP, sourcePort, proto, containerIP, containerPort) +} diff --git 
a/runtime/networkdriver/portmapper/mapper_test.go b/runtime/networkdriver/portmapper/mapper_test.go new file mode 100644 index 0000000000..4c09f3c651 --- /dev/null +++ b/runtime/networkdriver/portmapper/mapper_test.go @@ -0,0 +1,107 @@ +package portmapper + +import ( + "github.com/dotcloud/docker/pkg/iptables" + "github.com/dotcloud/docker/pkg/proxy" + "net" + "testing" +) + +func init() { + // override this func to mock out the proxy server + newProxy = proxy.NewStubProxy +} + +func reset() { + chain = nil + currentMappings = make(map[string]*mapping) +} + +func TestSetIptablesChain(t *testing.T) { + defer reset() + + c := &iptables.Chain{ + Name: "TEST", + Bridge: "192.168.1.1", + } + + if chain != nil { + t.Fatal("chain should be nil at init") + } + + SetIptablesChain(c) + if chain == nil { + t.Fatal("chain should not be nil after set") + } +} + +func TestMapPorts(t *testing.T) { + dstIp1 := net.ParseIP("192.168.0.1") + dstIp2 := net.ParseIP("192.168.0.2") + dstAddr1 := &net.TCPAddr{IP: dstIp1, Port: 80} + dstAddr2 := &net.TCPAddr{IP: dstIp2, Port: 80} + + srcAddr1 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.1")} + srcAddr2 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.2")} + + if err := Map(srcAddr1, dstIp1, 80); err != nil { + t.Fatalf("Failed to allocate port: %s", err) + } + + if Map(srcAddr1, dstIp1, 80) == nil { + t.Fatalf("Port is in use - mapping should have failed") + } + + if Map(srcAddr2, dstIp1, 80) == nil { + t.Fatalf("Port is in use - mapping should have failed") + } + + if err := Map(srcAddr2, dstIp2, 80); err != nil { + t.Fatalf("Failed to allocate port: %s", err) + } + + if Unmap(dstAddr1) != nil { + t.Fatalf("Failed to release port") + } + + if Unmap(dstAddr2) != nil { + t.Fatalf("Failed to release port") + } + + if Unmap(dstAddr2) == nil { + t.Fatalf("Port already released, but no error reported") + } +} + +func TestGetUDPKey(t *testing.T) { + addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53} + + key := 
getKey(addr) + + if expected := "192.168.1.5:53/udp"; key != expected { + t.Fatalf("expected key %s got %s", expected, key) + } +} + +func TestGetTCPKey(t *testing.T) { + addr := &net.TCPAddr{IP: net.ParseIP("192.168.1.5"), Port: 80} + + key := getKey(addr) + + if expected := "192.168.1.5:80/tcp"; key != expected { + t.Fatalf("expected key %s got %s", expected, key) + } +} + +func TestGetUDPIPAndPort(t *testing.T) { + addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53} + + ip, port := getIPAndPort(addr) + if expected := "192.168.1.5"; ip.String() != expected { + t.Fatalf("expected ip %s got %s", expected, ip) + } + + if ep := 53; port != ep { + t.Fatalf("expected port %d got %d", ep, port) + } +} diff --git a/runtime/networkdriver/utils.go b/runtime/networkdriver/utils.go new file mode 100644 index 0000000000..0a4ef70c95 --- /dev/null +++ b/runtime/networkdriver/utils.go @@ -0,0 +1,118 @@ +package networkdriver + +import ( + "encoding/binary" + "errors" + "fmt" + "net" + + "github.com/dotcloud/docker/pkg/netlink" +) + +var ( + networkGetRoutesFct = netlink.NetworkGetRoutes + ErrNoDefaultRoute = errors.New("no default route") +) + +func CheckNameserverOverlaps(nameservers []string, toCheck *net.IPNet) error { + if len(nameservers) > 0 { + for _, ns := range nameservers { + _, nsNetwork, err := net.ParseCIDR(ns) + if err != nil { + return err + } + if NetworkOverlaps(toCheck, nsNetwork) { + return ErrNetworkOverlapsWithNameservers + } + } + } + return nil +} + +func CheckRouteOverlaps(toCheck *net.IPNet) error { + networks, err := networkGetRoutesFct() + if err != nil { + return err + } + + for _, network := range networks { + if network.IPNet != nil && NetworkOverlaps(toCheck, network.IPNet) { + return ErrNetworkOverlaps + } + } + return nil +} + +// Detects overlap between one IPNet and another +func NetworkOverlaps(netX *net.IPNet, netY *net.IPNet) bool { + if firstIP, _ := NetworkRange(netX); netY.Contains(firstIP) { + return true + } + if firstIP, _ 
:= NetworkRange(netY); netX.Contains(firstIP) { + return true + } + return false +} + +// Calculates the first and last IP addresses in an IPNet +func NetworkRange(network *net.IPNet) (net.IP, net.IP) { + var ( + netIP = network.IP.To4() + firstIP = netIP.Mask(network.Mask) + lastIP = net.IPv4(0, 0, 0, 0).To4() + ) + + for i := 0; i < len(lastIP); i++ { + lastIP[i] = netIP[i] | ^network.Mask[i] + } + return firstIP, lastIP +} + +// Given a netmask, calculates the number of available hosts +func NetworkSize(mask net.IPMask) int32 { + m := net.IPv4Mask(0, 0, 0, 0) + for i := 0; i < net.IPv4len; i++ { + m[i] = ^mask[i] + } + return int32(binary.BigEndian.Uint32(m)) + 1 +} + +// Return the IPv4 address of a network interface +func GetIfaceAddr(name string) (net.Addr, error) { + iface, err := net.InterfaceByName(name) + if err != nil { + return nil, err + } + addrs, err := iface.Addrs() + if err != nil { + return nil, err + } + var addrs4 []net.Addr + for _, addr := range addrs { + ip := (addr.(*net.IPNet)).IP + if ip4 := ip.To4(); len(ip4) == net.IPv4len { + addrs4 = append(addrs4, addr) + } + } + switch { + case len(addrs4) == 0: + return nil, fmt.Errorf("Interface %v has no IP addresses", name) + case len(addrs4) > 1: + fmt.Printf("Interface %v has more than 1 IPv4 address. 
Defaulting to using %v\n", + name, (addrs4[0].(*net.IPNet)).IP) + } + return addrs4[0], nil +} + +func GetDefaultRouteIface() (*net.Interface, error) { + rs, err := networkGetRoutesFct() + if err != nil { + return nil, fmt.Errorf("unable to get routes: %v", err) + } + for _, r := range rs { + if r.Default { + return r.Iface, nil + } + } + return nil, ErrNoDefaultRoute +} diff --git a/runtime/runtime.go b/runtime/runtime.go new file mode 100644 index 0000000000..98903cfa08 --- /dev/null +++ b/runtime/runtime.go @@ -0,0 +1,993 @@ +package runtime + +import ( + "container/list" + "fmt" + "github.com/dotcloud/docker/archive" + "github.com/dotcloud/docker/daemonconfig" + "github.com/dotcloud/docker/dockerversion" + "github.com/dotcloud/docker/engine" + "github.com/dotcloud/docker/graph" + "github.com/dotcloud/docker/image" + "github.com/dotcloud/docker/pkg/graphdb" + "github.com/dotcloud/docker/pkg/mount" + "github.com/dotcloud/docker/pkg/selinux" + "github.com/dotcloud/docker/pkg/sysinfo" + "github.com/dotcloud/docker/runconfig" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/runtime/execdriver/execdrivers" + "github.com/dotcloud/docker/runtime/execdriver/lxc" + "github.com/dotcloud/docker/runtime/graphdriver" + _ "github.com/dotcloud/docker/runtime/graphdriver/vfs" + _ "github.com/dotcloud/docker/runtime/networkdriver/bridge" + "github.com/dotcloud/docker/runtime/networkdriver/portallocator" + "github.com/dotcloud/docker/utils" + "io" + "io/ioutil" + "log" + "os" + "path" + "regexp" + "strings" + "sync" + "time" +) + +// Set the max depth to the aufs default that most +// kernels are compiled with +// For more information see: http://sourceforge.net/p/aufs/aufs3-standalone/ci/aufs3.12/tree/config.mk +const MaxImageDepth = 127 + +var ( + DefaultDns = []string{"8.8.8.8", "8.8.4.4"} + validContainerNameChars = `[a-zA-Z0-9_.-]` + validContainerNamePattern = regexp.MustCompile(`^/?` + validContainerNameChars + `+$`) +) + +type Runtime struct 
{ + repository string + sysInitPath string + containers *list.List + graph *graph.Graph + repositories *graph.TagStore + idIndex *utils.TruncIndex + sysInfo *sysinfo.SysInfo + volumes *graph.Graph + srv Server + eng *engine.Engine + config *daemonconfig.Config + containerGraph *graphdb.Database + driver graphdriver.Driver + execDriver execdriver.Driver +} + +// Mountpoints should be private to the container +func remountPrivate(mountPoint string) error { + mounted, err := mount.Mounted(mountPoint) + if err != nil { + return err + } + + if !mounted { + if err := mount.Mount(mountPoint, mountPoint, "none", "bind,rw"); err != nil { + return err + } + } + return mount.ForceMount("", mountPoint, "none", "private") +} + +// List returns an array of all containers registered in the runtime. +func (runtime *Runtime) List() []*Container { + containers := new(History) + for e := runtime.containers.Front(); e != nil; e = e.Next() { + containers.Add(e.Value.(*Container)) + } + return *containers +} + +func (runtime *Runtime) getContainerElement(id string) *list.Element { + for e := runtime.containers.Front(); e != nil; e = e.Next() { + container := e.Value.(*Container) + if container.ID == id { + return e + } + } + return nil +} + +// Get looks for a container by the specified ID or name, and returns it. +// If the container is not found, or if an error occurs, nil is returned. +func (runtime *Runtime) Get(name string) *Container { + if c, _ := runtime.GetByName(name); c != nil { + return c + } + + id, err := runtime.idIndex.Get(name) + if err != nil { + return nil + } + + e := runtime.getContainerElement(id) + if e == nil { + return nil + } + return e.Value.(*Container) +} + +// Exists returns a true if a container of the specified ID or name exists, +// false otherwise. 
+func (runtime *Runtime) Exists(id string) bool { + return runtime.Get(id) != nil +} + +func (runtime *Runtime) containerRoot(id string) string { + return path.Join(runtime.repository, id) +} + +// Load reads the contents of a container from disk +// This is typically done at startup. +func (runtime *Runtime) load(id string) (*Container, error) { + container := &Container{root: runtime.containerRoot(id)} + if err := container.FromDisk(); err != nil { + return nil, err + } + if container.ID != id { + return container, fmt.Errorf("Container %s is stored at %s", container.ID, id) + } + if container.State.IsRunning() { + container.State.SetGhost(true) + } + return container, nil +} + +// Register makes a container object usable by the runtime as <container.ID> +func (runtime *Runtime) Register(container *Container) error { + if container.runtime != nil || runtime.Exists(container.ID) { + return fmt.Errorf("Container is already loaded") + } + if err := validateID(container.ID); err != nil { + return err + } + if err := runtime.ensureName(container); err != nil { + return err + } + + container.runtime = runtime + + // Attach to stdout and stderr + container.stderr = utils.NewWriteBroadcaster() + container.stdout = utils.NewWriteBroadcaster() + // Attach to stdin + if container.Config.OpenStdin { + container.stdin, container.stdinPipe = io.Pipe() + } else { + container.stdinPipe = utils.NopWriteCloser(ioutil.Discard) // Silently drop stdin + } + // done + runtime.containers.PushBack(container) + runtime.idIndex.Add(container.ID) + + // FIXME: if the container is supposed to be running but is not, auto restart it? 
+ // if so, then we need to restart monitor and init a new lock + // If the container is supposed to be running, make sure of it + if container.State.IsRunning() { + if container.State.IsGhost() { + utils.Debugf("killing ghost %s", container.ID) + + existingPid := container.State.Pid + container.State.SetGhost(false) + container.State.SetStopped(0) + + // We only have to handle this for lxc because the other drivers will ensure that + // no ghost processes are left when docker dies + if container.ExecDriver == "" || strings.Contains(container.ExecDriver, "lxc") { + lxc.KillLxc(container.ID, 9) + } else { + // use the current driver and ensure that the container is dead x.x + cmd := &execdriver.Command{ + ID: container.ID, + } + var err error + cmd.Process, err = os.FindProcess(existingPid) + if err != nil { + utils.Debugf("cannot find existing process for %d", existingPid) + } + runtime.execDriver.Terminate(cmd) + } + if err := container.Unmount(); err != nil { + utils.Debugf("ghost unmount error %s", err) + } + if err := container.ToDisk(); err != nil { + utils.Debugf("saving ghost state to disk %s", err) + } + } + + info := runtime.execDriver.Info(container.ID) + if !info.IsRunning() { + utils.Debugf("Container %s was supposed to be running but is not.", container.ID) + if runtime.config.AutoRestart { + utils.Debugf("Restarting") + if err := container.Unmount(); err != nil { + utils.Debugf("restart unmount error %s", err) + } + + container.State.SetGhost(false) + container.State.SetStopped(0) + if err := container.Start(); err != nil { + return err + } + } else { + utils.Debugf("Marking as stopped") + container.State.SetStopped(-127) + if err := container.ToDisk(); err != nil { + return err + } + } + } + } else { + // When the container is not running, we still initialize the waitLock + // chan and close it. Receiving on nil chan blocks whereas receiving on a + // closed chan does not. In this case we do not want to block. 
+ container.waitLock = make(chan struct{}) + close(container.waitLock) + } + return nil +} + +func (runtime *Runtime) ensureName(container *Container) error { + if container.Name == "" { + name, err := generateRandomName(runtime) + if err != nil { + name = utils.TruncateID(container.ID) + } + container.Name = name + + if err := container.ToDisk(); err != nil { + utils.Debugf("Error saving container name %s", err) + } + if !runtime.containerGraph.Exists(name) { + if _, err := runtime.containerGraph.Set(name, container.ID); err != nil { + utils.Debugf("Setting default id - %s", err) + } + } + } + return nil +} + +func (runtime *Runtime) LogToDisk(src *utils.WriteBroadcaster, dst, stream string) error { + log, err := os.OpenFile(dst, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0600) + if err != nil { + return err + } + src.AddWriter(log, stream) + return nil +} + +// Destroy unregisters a container from the runtime and cleanly removes its contents from the filesystem. +func (runtime *Runtime) Destroy(container *Container) error { + if container == nil { + return fmt.Errorf("The given container is <nil>") + } + + element := runtime.getContainerElement(container.ID) + if element == nil { + return fmt.Errorf("Container %v not found - maybe it was already destroyed?", container.ID) + } + + if err := container.Stop(3); err != nil { + return err + } + + if err := runtime.driver.Remove(container.ID); err != nil { + return fmt.Errorf("Driver %s failed to remove root filesystem %s: %s", runtime.driver, container.ID, err) + } + + initID := fmt.Sprintf("%s-init", container.ID) + if err := runtime.driver.Remove(initID); err != nil { + return fmt.Errorf("Driver %s failed to remove init filesystem %s: %s", runtime.driver, initID, err) + } + + if _, err := runtime.containerGraph.Purge(container.ID); err != nil { + utils.Debugf("Unable to remove container from link graph: %s", err) + } + + // Deregister the container before removing its directory, to avoid race conditions + 
runtime.idIndex.Delete(container.ID) + runtime.containers.Remove(element) + if err := os.RemoveAll(container.root); err != nil { + return fmt.Errorf("Unable to remove filesystem for %v: %v", container.ID, err) + } + return nil +} + +func (runtime *Runtime) restore() error { + if os.Getenv("DEBUG") == "" && os.Getenv("TEST") == "" { + fmt.Printf("Loading containers: ") + } + dir, err := ioutil.ReadDir(runtime.repository) + if err != nil { + return err + } + containers := make(map[string]*Container) + currentDriver := runtime.driver.String() + + for _, v := range dir { + id := v.Name() + container, err := runtime.load(id) + if os.Getenv("DEBUG") == "" && os.Getenv("TEST") == "" { + fmt.Print(".") + } + if err != nil { + utils.Errorf("Failed to load container %v: %v", id, err) + continue + } + + // Ignore the container if it does not support the current driver being used by the graph + if container.Driver == "" && currentDriver == "aufs" || container.Driver == currentDriver { + utils.Debugf("Loaded container %v", container.ID) + containers[container.ID] = container + } else { + utils.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID) + } + } + + register := func(container *Container) { + if err := runtime.Register(container); err != nil { + utils.Debugf("Failed to register container %s: %s", container.ID, err) + } + } + + if entities := runtime.containerGraph.List("/", -1); entities != nil { + for _, p := range entities.Paths() { + if os.Getenv("DEBUG") == "" && os.Getenv("TEST") == "" { + fmt.Print(".") + } + e := entities[p] + if container, ok := containers[e.ID()]; ok { + register(container) + delete(containers, e.ID()) + } + } + } + + // Any containers that are left over do not exist in the graph + for _, container := range containers { + // Try to set the default name for a container if it exists prior to links + container.Name, err = generateRandomName(runtime) + if err != nil { + container.Name = 
utils.TruncateID(container.ID) + } + + if _, err := runtime.containerGraph.Set(container.Name, container.ID); err != nil { + utils.Debugf("Setting default id - %s", err) + } + register(container) + } + + if os.Getenv("DEBUG") == "" && os.Getenv("TEST") == "" { + fmt.Printf(": done.\n") + } + + return nil +} + +// Create creates a new container from the given configuration with a given name. +func (runtime *Runtime) Create(config *runconfig.Config, name string) (*Container, []string, error) { + var ( + container *Container + warnings []string + ) + + img, err := runtime.repositories.LookupImage(config.Image) + if err != nil { + return nil, nil, err + } + if err := runtime.checkImageDepth(img); err != nil { + return nil, nil, err + } + if warnings, err = runtime.mergeAndVerifyConfig(config, img); err != nil { + return nil, nil, err + } + if container, err = runtime.newContainer(name, config, img); err != nil { + return nil, nil, err + } + if err := runtime.createRootfs(container, img); err != nil { + return nil, nil, err + } + if err := container.ToDisk(); err != nil { + return nil, nil, err + } + if err := runtime.Register(container); err != nil { + return nil, nil, err + } + return container, warnings, nil +} + +func (runtime *Runtime) checkImageDepth(img *image.Image) error { + // We add 2 layers to the depth because the container's rw and + // init layer add to the restriction + depth, err := img.Depth() + if err != nil { + return err + } + if depth+2 >= MaxImageDepth { + return fmt.Errorf("Cannot create container with more than %d parents", MaxImageDepth) + } + return nil +} + +func (runtime *Runtime) checkDeprecatedExpose(config *runconfig.Config) bool { + if config != nil { + if config.PortSpecs != nil { + for _, p := range config.PortSpecs { + if strings.Contains(p, ":") { + return true + } + } + } + } + return false +} + +func (runtime *Runtime) mergeAndVerifyConfig(config *runconfig.Config, img *image.Image) ([]string, error) { + warnings := []string{} + if 
runtime.checkDeprecatedExpose(img.Config) || runtime.checkDeprecatedExpose(config) { + warnings = append(warnings, "The mapping to public ports on your host via Dockerfile EXPOSE (host:port:port) has been deprecated. Use -p to publish the ports.") + } + if img.Config != nil { + if err := runconfig.Merge(config, img.Config); err != nil { + return nil, err + } + } + if len(config.Entrypoint) == 0 && len(config.Cmd) == 0 { + return nil, fmt.Errorf("No command specified") + } + return warnings, nil +} + +func (runtime *Runtime) generateIdAndName(name string) (string, string, error) { + var ( + err error + id = utils.GenerateRandomID() + ) + + if name == "" { + name, err = generateRandomName(runtime) + if err != nil { + name = utils.TruncateID(id) + } + } else { + if !validContainerNamePattern.MatchString(name) { + return "", "", fmt.Errorf("Invalid container name (%s), only %s are allowed", name, validContainerNameChars) + } + } + if name[0] != '/' { + name = "/" + name + } + // Set the enitity in the graph using the default name specified + if _, err := runtime.containerGraph.Set(name, id); err != nil { + if !graphdb.IsNonUniqueNameError(err) { + return "", "", err + } + + conflictingContainer, err := runtime.GetByName(name) + if err != nil { + if strings.Contains(err.Error(), "Could not find entity") { + return "", "", err + } + + // Remove name and continue starting the container + if err := runtime.containerGraph.Delete(name); err != nil { + return "", "", err + } + } else { + nameAsKnownByUser := strings.TrimPrefix(name, "/") + return "", "", fmt.Errorf( + "Conflict, The name %s is already assigned to %s. 
You have to delete (or rename) that container to be able to assign %s to a container again.", nameAsKnownByUser, + utils.TruncateID(conflictingContainer.ID), nameAsKnownByUser) + } + } + return id, name, nil +} + +func (runtime *Runtime) generateHostname(id string, config *runconfig.Config) { + // Generate default hostname + // FIXME: the lxc template no longer needs to set a default hostname + if config.Hostname == "" { + config.Hostname = id[:12] + } +} + +func (runtime *Runtime) getEntrypointAndArgs(config *runconfig.Config) (string, []string) { + var ( + entrypoint string + args []string + ) + if len(config.Entrypoint) != 0 { + entrypoint = config.Entrypoint[0] + args = append(config.Entrypoint[1:], config.Cmd...) + } else { + entrypoint = config.Cmd[0] + args = config.Cmd[1:] + } + return entrypoint, args +} + +func (runtime *Runtime) newContainer(name string, config *runconfig.Config, img *image.Image) (*Container, error) { + var ( + id string + err error + ) + id, name, err = runtime.generateIdAndName(name) + if err != nil { + return nil, err + } + + runtime.generateHostname(id, config) + entrypoint, args := runtime.getEntrypointAndArgs(config) + + container := &Container{ + // FIXME: we should generate the ID here instead of receiving it as an argument + ID: id, + Created: time.Now().UTC(), + Path: entrypoint, + Args: args, //FIXME: de-duplicate from config + Config: config, + hostConfig: &runconfig.HostConfig{}, + Image: img.ID, // Always use the resolved image id + NetworkSettings: &NetworkSettings{}, + Name: name, + Driver: runtime.driver.String(), + ExecDriver: runtime.execDriver.Name(), + } + container.root = runtime.containerRoot(container.ID) + return container, nil +} + +func (runtime *Runtime) createRootfs(container *Container, img *image.Image) error { + // Step 1: create the container directory. + // This doubles as a barrier to avoid race conditions. 
+ if err := os.Mkdir(container.root, 0700); err != nil { + return err + } + initID := fmt.Sprintf("%s-init", container.ID) + if err := runtime.driver.Create(initID, img.ID, ""); err != nil { + return err + } + initPath, err := runtime.driver.Get(initID) + if err != nil { + return err + } + defer runtime.driver.Put(initID) + + if err := graph.SetupInitLayer(initPath); err != nil { + return err + } + + if err := runtime.driver.Create(container.ID, initID, ""); err != nil { + return err + } + return nil +} + +// Commit creates a new filesystem image from the current state of a container. +// The image can optionally be tagged into a repository +func (runtime *Runtime) Commit(container *Container, repository, tag, comment, author string, config *runconfig.Config) (*image.Image, error) { + // FIXME: freeze the container before copying it to avoid data corruption? + if err := container.Mount(); err != nil { + return nil, err + } + defer container.Unmount() + + rwTar, err := container.ExportRw() + if err != nil { + return nil, err + } + defer rwTar.Close() + + // Create a new image from the container's base layers + a new layer from container changes + var ( + containerID, containerImage string + containerConfig *runconfig.Config + ) + if container != nil { + containerID = container.ID + containerImage = container.Image + containerConfig = container.Config + } + img, err := runtime.graph.Create(rwTar, containerID, containerImage, comment, author, containerConfig, config) + if err != nil { + return nil, err + } + // Register the image if needed + if repository != "" { + if err := runtime.repositories.Set(repository, tag, img.ID, true); err != nil { + return img, err + } + } + return img, nil +} + +func GetFullContainerName(name string) (string, error) { + if name == "" { + return "", fmt.Errorf("Container name cannot be empty") + } + if name[0] != '/' { + name = "/" + name + } + return name, nil +} + +func (runtime *Runtime) GetByName(name string) (*Container, error) { + 
fullName, err := GetFullContainerName(name) + if err != nil { + return nil, err + } + entity := runtime.containerGraph.Get(fullName) + if entity == nil { + return nil, fmt.Errorf("Could not find entity for %s", name) + } + e := runtime.getContainerElement(entity.ID()) + if e == nil { + return nil, fmt.Errorf("Could not find container for entity id %s", entity.ID()) + } + return e.Value.(*Container), nil +} + +func (runtime *Runtime) Children(name string) (map[string]*Container, error) { + name, err := GetFullContainerName(name) + if err != nil { + return nil, err + } + children := make(map[string]*Container) + + err = runtime.containerGraph.Walk(name, func(p string, e *graphdb.Entity) error { + c := runtime.Get(e.ID()) + if c == nil { + return fmt.Errorf("Could not get container for name %s and id %s", e.ID(), p) + } + children[p] = c + return nil + }, 0) + + if err != nil { + return nil, err + } + return children, nil +} + +func (runtime *Runtime) RegisterLink(parent, child *Container, alias string) error { + fullName := path.Join(parent.Name, alias) + if !runtime.containerGraph.Exists(fullName) { + _, err := runtime.containerGraph.Set(fullName, child.ID) + return err + } + return nil +} + +// FIXME: harmonize with NewGraph() +func NewRuntime(config *daemonconfig.Config, eng *engine.Engine) (*Runtime, error) { + runtime, err := NewRuntimeFromDirectory(config, eng) + if err != nil { + return nil, err + } + return runtime, nil +} + +func NewRuntimeFromDirectory(config *daemonconfig.Config, eng *engine.Engine) (*Runtime, error) { + if !config.EnableSelinuxSupport { + selinux.SetDisabled() + } + + // Set the default driver + graphdriver.DefaultDriver = config.GraphDriver + + // Load storage driver + driver, err := graphdriver.New(config.Root) + if err != nil { + return nil, err + } + utils.Debugf("Using graph driver %s", driver) + + if err := remountPrivate(config.Root); err != nil { + return nil, err + } + + runtimeRepo := path.Join(config.Root, "containers") + + if 
err := os.MkdirAll(runtimeRepo, 0700); err != nil && !os.IsExist(err) { + return nil, err + } + + // Migrate the container if it is aufs and aufs is enabled + if err = migrateIfAufs(driver, config.Root); err != nil { + return nil, err + } + + utils.Debugf("Creating images graph") + g, err := graph.NewGraph(path.Join(config.Root, "graph"), driver) + if err != nil { + return nil, err + } + + // We don't want to use a complex driver like aufs or devmapper + // for volumes, just a plain filesystem + volumesDriver, err := graphdriver.GetDriver("vfs", config.Root) + if err != nil { + return nil, err + } + utils.Debugf("Creating volumes graph") + volumes, err := graph.NewGraph(path.Join(config.Root, "volumes"), volumesDriver) + if err != nil { + return nil, err + } + utils.Debugf("Creating repository list") + repositories, err := graph.NewTagStore(path.Join(config.Root, "repositories-"+driver.String()), g) + if err != nil { + return nil, fmt.Errorf("Couldn't create Tag store: %s", err) + } + + if !config.DisableNetwork { + job := eng.Job("init_networkdriver") + + job.SetenvBool("EnableIptables", config.EnableIptables) + job.SetenvBool("InterContainerCommunication", config.InterContainerCommunication) + job.SetenvBool("EnableIpForward", config.EnableIpForward) + job.Setenv("BridgeIface", config.BridgeIface) + job.Setenv("BridgeIP", config.BridgeIP) + job.Setenv("DefaultBindingIP", config.DefaultIp.String()) + + if err := job.Run(); err != nil { + return nil, err + } + } + + graphdbPath := path.Join(config.Root, "linkgraph.db") + graph, err := graphdb.NewSqliteConn(graphdbPath) + if err != nil { + return nil, err + } + + localCopy := path.Join(config.Root, "init", fmt.Sprintf("dockerinit-%s", dockerversion.VERSION)) + sysInitPath := utils.DockerInitPath(localCopy) + if sysInitPath == "" { + return nil, fmt.Errorf("Could not locate dockerinit: This usually means docker was built incorrectly. 
See http://docs.docker.io/en/latest/contributing/devenvironment for official build instructions.") + } + + if sysInitPath != localCopy { + // When we find a suitable dockerinit binary (even if it's our local binary), we copy it into config.Root at localCopy for future use (so that the original can go away without that being a problem, for example during a package upgrade). + if err := os.Mkdir(path.Dir(localCopy), 0700); err != nil && !os.IsExist(err) { + return nil, err + } + if _, err := utils.CopyFile(sysInitPath, localCopy); err != nil { + return nil, err + } + if err := os.Chmod(localCopy, 0700); err != nil { + return nil, err + } + sysInitPath = localCopy + } + + sysInfo := sysinfo.New(false) + ed, err := execdrivers.NewDriver(config.ExecDriver, config.Root, sysInitPath, sysInfo) + if err != nil { + return nil, err + } + + runtime := &Runtime{ + repository: runtimeRepo, + containers: list.New(), + graph: g, + repositories: repositories, + idIndex: utils.NewTruncIndex(), + sysInfo: sysInfo, + volumes: volumes, + config: config, + containerGraph: graph, + driver: driver, + sysInitPath: sysInitPath, + execDriver: ed, + eng: eng, + } + + if err := runtime.checkLocaldns(); err != nil { + return nil, err + } + if err := runtime.restore(); err != nil { + return nil, err + } + return runtime, nil +} + +func (runtime *Runtime) shutdown() error { + group := sync.WaitGroup{} + utils.Debugf("starting clean shutdown of all containers...") + for _, container := range runtime.List() { + c := container + if c.State.IsRunning() { + utils.Debugf("stopping %s", c.ID) + group.Add(1) + + go func() { + defer group.Done() + if err := c.KillSig(15); err != nil { + utils.Debugf("kill 15 error for %s - %s", c.ID, err) + } + c.Wait() + utils.Debugf("container stopped %s", c.ID) + }() + } + } + group.Wait() + + return nil +} + +func (runtime *Runtime) Close() error { + errorsStrings := []string{} + if err := runtime.shutdown(); err != nil { + utils.Errorf("runtime.shutdown(): %s", err) 
+ errorsStrings = append(errorsStrings, err.Error()) + } + if err := portallocator.ReleaseAll(); err != nil { + utils.Errorf("portallocator.ReleaseAll(): %s", err) + errorsStrings = append(errorsStrings, err.Error()) + } + if err := runtime.driver.Cleanup(); err != nil { + utils.Errorf("runtime.driver.Cleanup(): %s", err.Error()) + errorsStrings = append(errorsStrings, err.Error()) + } + if err := runtime.containerGraph.Close(); err != nil { + utils.Errorf("runtime.containerGraph.Close(): %s", err.Error()) + errorsStrings = append(errorsStrings, err.Error()) + } + if len(errorsStrings) > 0 { + return fmt.Errorf("%s", strings.Join(errorsStrings, ", ")) + } + return nil +} + +func (runtime *Runtime) Mount(container *Container) error { + dir, err := runtime.driver.Get(container.ID) + if err != nil { + return fmt.Errorf("Error getting container %s from driver %s: %s", container.ID, runtime.driver, err) + } + if container.basefs == "" { + container.basefs = dir + } else if container.basefs != dir { + return fmt.Errorf("Error: driver %s is returning inconsistent paths for container %s ('%s' then '%s')", + runtime.driver, container.ID, container.basefs, dir) + } + return nil +} + +func (runtime *Runtime) Unmount(container *Container) error { + runtime.driver.Put(container.ID) + return nil +} + +func (runtime *Runtime) Changes(container *Container) ([]archive.Change, error) { + if differ, ok := runtime.driver.(graphdriver.Differ); ok { + return differ.Changes(container.ID) + } + cDir, err := runtime.driver.Get(container.ID) + if err != nil { + return nil, fmt.Errorf("Error getting container rootfs %s from driver %s: %s", container.ID, container.runtime.driver, err) + } + defer runtime.driver.Put(container.ID) + initDir, err := runtime.driver.Get(container.ID + "-init") + if err != nil { + return nil, fmt.Errorf("Error getting container init rootfs %s from driver %s: %s", container.ID, container.runtime.driver, err) + } + defer runtime.driver.Put(container.ID + "-init") + 
return archive.ChangesDirs(cDir, initDir) +} + +func (runtime *Runtime) Diff(container *Container) (archive.Archive, error) { + if differ, ok := runtime.driver.(graphdriver.Differ); ok { + return differ.Diff(container.ID) + } + + changes, err := runtime.Changes(container) + if err != nil { + return nil, err + } + + cDir, err := runtime.driver.Get(container.ID) + if err != nil { + return nil, fmt.Errorf("Error getting container rootfs %s from driver %s: %s", container.ID, container.runtime.driver, err) + } + + archive, err := archive.ExportChanges(cDir, changes) + if err != nil { + return nil, err + } + return utils.NewReadCloserWrapper(archive, func() error { + err := archive.Close() + runtime.driver.Put(container.ID) + return err + }), nil +} + +func (runtime *Runtime) Run(c *Container, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { + return runtime.execDriver.Run(c.command, pipes, startCallback) +} + +func (runtime *Runtime) Kill(c *Container, sig int) error { + return runtime.execDriver.Kill(c.command, sig) +} + +// Nuke kills all containers then removes all content +// from the content root, including images, volumes and +// container filesystems. +// Again: this will remove your entire docker runtime! +func (runtime *Runtime) Nuke() error { + var wg sync.WaitGroup + for _, container := range runtime.List() { + wg.Add(1) + go func(c *Container) { + c.Kill() + wg.Done() + }(container) + } + wg.Wait() + runtime.Close() + + return os.RemoveAll(runtime.config.Root) +} + +// FIXME: this is a convenience function for integration tests +// which need direct access to runtime.graph. +// Once the tests switch to using engine and jobs, this method +// can go away. 
+func (runtime *Runtime) Graph() *graph.Graph { + return runtime.graph +} + +func (runtime *Runtime) Repositories() *graph.TagStore { + return runtime.repositories +} + +func (runtime *Runtime) Config() *daemonconfig.Config { + return runtime.config +} + +func (runtime *Runtime) SystemConfig() *sysinfo.SysInfo { + return runtime.sysInfo +} + +func (runtime *Runtime) SystemInitPath() string { + return runtime.sysInitPath +} + +func (runtime *Runtime) GraphDriver() graphdriver.Driver { + return runtime.driver +} + +func (runtime *Runtime) ExecutionDriver() execdriver.Driver { + return runtime.execDriver +} + +func (runtime *Runtime) Volumes() *graph.Graph { + return runtime.volumes +} + +func (runtime *Runtime) ContainerGraph() *graphdb.Database { + return runtime.containerGraph +} + +func (runtime *Runtime) SetServer(server Server) { + runtime.srv = server +} + +func (runtime *Runtime) checkLocaldns() error { + resolvConf, err := utils.GetResolvConf() + if err != nil { + return err + } + if len(runtime.config.Dns) == 0 && utils.CheckLocalDns(resolvConf) { + log.Printf("Local (127.0.0.1) DNS resolver found in resolv.conf and containers can't use it. Using default external servers : %v\n", DefaultDns) + runtime.config.Dns = DefaultDns + } + return nil +} diff --git a/runtime/runtime_aufs.go b/runtime/runtime_aufs.go new file mode 100644 index 0000000000..5a32615df5 --- /dev/null +++ b/runtime/runtime_aufs.go @@ -0,0 +1,22 @@ +// +build !exclude_graphdriver_aufs + +package runtime + +import ( + "github.com/dotcloud/docker/graph" + "github.com/dotcloud/docker/runtime/graphdriver" + "github.com/dotcloud/docker/runtime/graphdriver/aufs" + "github.com/dotcloud/docker/utils" +) + +// Given the graphdriver ad, if it is aufs, then migrate it. +// If aufs driver is not built, this func is a noop. 
+func migrateIfAufs(driver graphdriver.Driver, root string) error { + if ad, ok := driver.(*aufs.Driver); ok { + utils.Debugf("Migrating existing containers") + if err := ad.Migrate(root, graph.SetupInitLayer); err != nil { + return err + } + } + return nil +} diff --git a/runtime/runtime_btrfs.go b/runtime/runtime_btrfs.go new file mode 100644 index 0000000000..c59b103ff9 --- /dev/null +++ b/runtime/runtime_btrfs.go @@ -0,0 +1,7 @@ +// +build !exclude_graphdriver_btrfs + +package runtime + +import ( + _ "github.com/dotcloud/docker/runtime/graphdriver/btrfs" +) diff --git a/runtime/runtime_devicemapper.go b/runtime/runtime_devicemapper.go new file mode 100644 index 0000000000..5b418b377a --- /dev/null +++ b/runtime/runtime_devicemapper.go @@ -0,0 +1,7 @@ +// +build !exclude_graphdriver_devicemapper + +package runtime + +import ( + _ "github.com/dotcloud/docker/runtime/graphdriver/devmapper" +) diff --git a/runtime/runtime_no_aufs.go b/runtime/runtime_no_aufs.go new file mode 100644 index 0000000000..05a01fe151 --- /dev/null +++ b/runtime/runtime_no_aufs.go @@ -0,0 +1,11 @@ +// +build exclude_graphdriver_aufs + +package runtime + +import ( + "github.com/dotcloud/docker/runtime/graphdriver" +) + +func migrateIfAufs(driver graphdriver.Driver, root string) error { + return nil +} diff --git a/runtime/server.go b/runtime/server.go new file mode 100644 index 0000000000..a74c4d1200 --- /dev/null +++ b/runtime/server.go @@ -0,0 +1,10 @@ +package runtime + +import ( + "github.com/dotcloud/docker/utils" +) + +type Server interface { + LogEvent(action, id, from string) *utils.JSONMessage + IsRunning() bool // returns true if the server is currently in operation +} diff --git a/runtime/sorter.go b/runtime/sorter.go new file mode 100644 index 0000000000..c5af772dae --- /dev/null +++ b/runtime/sorter.go @@ -0,0 +1,25 @@ +package runtime + +import "sort" + +type containerSorter struct { + containers []*Container + by func(i, j *Container) bool +} + +func (s *containerSorter) 
Len() int { + return len(s.containers) +} + +func (s *containerSorter) Swap(i, j int) { + s.containers[i], s.containers[j] = s.containers[j], s.containers[i] +} + +func (s *containerSorter) Less(i, j int) bool { + return s.by(s.containers[i], s.containers[j]) +} + +func sortContainers(containers []*Container, predicate func(i, j *Container) bool) { + s := &containerSorter{containers, predicate} + sort.Sort(s) +} diff --git a/runtime/state.go b/runtime/state.go new file mode 100644 index 0000000000..316b8a40f1 --- /dev/null +++ b/runtime/state.go @@ -0,0 +1,84 @@ +package runtime + +import ( + "fmt" + "github.com/dotcloud/docker/utils" + "sync" + "time" +) + +type State struct { + sync.RWMutex + Running bool + Pid int + ExitCode int + StartedAt time.Time + FinishedAt time.Time + Ghost bool +} + +// String returns a human-readable description of the state +func (s *State) String() string { + s.RLock() + defer s.RUnlock() + + if s.Running { + if s.Ghost { + return fmt.Sprintf("Ghost") + } + return fmt.Sprintf("Up %s", utils.HumanDuration(time.Now().UTC().Sub(s.StartedAt))) + } + if s.FinishedAt.IsZero() { + return "" + } + return fmt.Sprintf("Exited (%d) %s ago", s.ExitCode, utils.HumanDuration(time.Now().UTC().Sub(s.FinishedAt))) +} + +func (s *State) IsRunning() bool { + s.RLock() + defer s.RUnlock() + + return s.Running +} + +func (s *State) IsGhost() bool { + s.RLock() + defer s.RUnlock() + + return s.Ghost +} + +func (s *State) GetExitCode() int { + s.RLock() + defer s.RUnlock() + + return s.ExitCode +} + +func (s *State) SetGhost(val bool) { + s.Lock() + defer s.Unlock() + + s.Ghost = val +} + +func (s *State) SetRunning(pid int) { + s.Lock() + defer s.Unlock() + + s.Running = true + s.Ghost = false + s.ExitCode = 0 + s.Pid = pid + s.StartedAt = time.Now().UTC() +} + +func (s *State) SetStopped(exitCode int) { + s.Lock() + defer s.Unlock() + + s.Running = false + s.Pid = 0 + s.FinishedAt = time.Now().UTC() + s.ExitCode = exitCode +} diff --git a/runtime/utils.go 
b/runtime/utils.go new file mode 100644 index 0000000000..b983e67d41 --- /dev/null +++ b/runtime/utils.go @@ -0,0 +1,64 @@ +package runtime + +import ( + "fmt" + "github.com/dotcloud/docker/nat" + "github.com/dotcloud/docker/pkg/namesgenerator" + "github.com/dotcloud/docker/runconfig" + "strings" +) + +func migratePortMappings(config *runconfig.Config, hostConfig *runconfig.HostConfig) error { + if config.PortSpecs != nil { + ports, bindings, err := nat.ParsePortSpecs(config.PortSpecs) + if err != nil { + return err + } + config.PortSpecs = nil + if len(bindings) > 0 { + if hostConfig == nil { + hostConfig = &runconfig.HostConfig{} + } + hostConfig.PortBindings = bindings + } + + if config.ExposedPorts == nil { + config.ExposedPorts = make(nat.PortSet, len(ports)) + } + for k, v := range ports { + config.ExposedPorts[k] = v + } + } + return nil +} + +func mergeLxcConfIntoOptions(hostConfig *runconfig.HostConfig, driverConfig map[string][]string) { + if hostConfig == nil { + return + } + + // merge in the lxc conf options into the generic config map + if lxcConf := hostConfig.LxcConf; lxcConf != nil { + lxc := driverConfig["lxc"] + for _, pair := range lxcConf { + // because lxc conf gets the driver name lxc.XXXX we need to trim it off + // and let the lxc driver add it back later if needed + parts := strings.SplitN(pair.Key, ".", 2) + lxc = append(lxc, fmt.Sprintf("%s=%s", parts[1], pair.Value)) + } + driverConfig["lxc"] = lxc + } +} + +type checker struct { + runtime *Runtime +} + +func (c *checker) Exists(name string) bool { + return c.runtime.containerGraph.Exists("/" + name) +} + +// Generate a random and unique name +func generateRandomName(runtime *Runtime) (string, error) { + return namesgenerator.GenerateRandomName(&checker{runtime}) +} diff --git a/runtime/utils_test.go b/runtime/utils_test.go new file mode 100644 index 0000000000..bdf3543a49 --- /dev/null +++ b/runtime/utils_test.go @@ -0,0 +1,29 @@ +package runtime + +import ( + "testing" + + 
"github.com/dotcloud/docker/runconfig" + "github.com/dotcloud/docker/utils" +) + +func TestMergeLxcConfig(t *testing.T) { + var ( + hostConfig = &runconfig.HostConfig{ + LxcConf: []utils.KeyValuePair{ + {Key: "lxc.cgroups.cpuset", Value: "1,2"}, + }, + } + driverConfig = make(map[string][]string) + ) + + mergeLxcConfIntoOptions(hostConfig, driverConfig) + if l := len(driverConfig["lxc"]); l > 1 { + t.Fatalf("expected lxc options len of 1 got %d", l) + } + + cpuset := driverConfig["lxc"][0] + if expected := "cgroups.cpuset=1,2"; cpuset != expected { + t.Fatalf("expected %s got %s", expected, cpuset) + } +} diff --git a/runtime/volumes.go b/runtime/volumes.go new file mode 100644 index 0000000000..004f1bb024 --- /dev/null +++ b/runtime/volumes.go @@ -0,0 +1,287 @@ +package runtime + +import ( + "fmt" + "github.com/dotcloud/docker/archive" + "github.com/dotcloud/docker/runtime/execdriver" + "github.com/dotcloud/docker/utils" + "io/ioutil" + "os" + "path/filepath" + "strings" + "syscall" +) + +type BindMap struct { + SrcPath string + DstPath string + Mode string +} + +func prepareVolumesForContainer(container *Container) error { + if container.Volumes == nil || len(container.Volumes) == 0 { + container.Volumes = make(map[string]string) + container.VolumesRW = make(map[string]bool) + if err := applyVolumesFrom(container); err != nil { + return err + } + } + + if err := createVolumes(container); err != nil { + return err + } + return nil +} + +func setupMountsForContainer(container *Container, envPath string) error { + mounts := []execdriver.Mount{ + {container.runtime.sysInitPath, "/.dockerinit", false, true}, + {envPath, "/.dockerenv", false, true}, + {container.ResolvConfPath, "/etc/resolv.conf", false, true}, + } + + if container.HostnamePath != "" && container.HostsPath != "" { + mounts = append(mounts, execdriver.Mount{container.HostnamePath, "/etc/hostname", false, true}) + mounts = append(mounts, execdriver.Mount{container.HostsPath, "/etc/hosts", false, true}) + 
} + + // Mount user specified volumes + // Note, these are not private because you may want propagation of (un)mounts from host + // volumes. For instance if you use -v /usr:/usr and the host later mounts /usr/share you + // want this new mount in the container + for r, v := range container.Volumes { + mounts = append(mounts, execdriver.Mount{v, r, container.VolumesRW[r], false}) + } + + container.command.Mounts = mounts + + return nil +} + +func applyVolumesFrom(container *Container) error { + volumesFrom := container.hostConfig.VolumesFrom + if len(volumesFrom) > 0 { + for _, containerSpec := range volumesFrom { + var ( + mountRW = true + specParts = strings.SplitN(containerSpec, ":", 2) + ) + + switch len(specParts) { + case 0: + return fmt.Errorf("Malformed volumes-from specification: %s", containerSpec) + case 2: + switch specParts[1] { + case "ro": + mountRW = false + case "rw": // mountRW is already true + default: + return fmt.Errorf("Malformed volumes-from specification: %s", containerSpec) + } + } + + c := container.runtime.Get(specParts[0]) + if c == nil { + return fmt.Errorf("Container %s not found. Impossible to mount its volumes", specParts[0]) + } + + if err := c.Mount(); err != nil { + return fmt.Errorf("Container %s failed to mount. 
Impossible to mount its volumes", specParts[0]) + } + defer c.Unmount() + + for volPath, id := range c.Volumes { + if _, exists := container.Volumes[volPath]; exists { + continue + } + stat, err := os.Stat(filepath.Join(c.basefs, volPath)) + if err != nil { + return err + } + if err := createIfNotExists(filepath.Join(container.basefs, volPath), stat.IsDir()); err != nil { + return err + } + container.Volumes[volPath] = id + if isRW, exists := c.VolumesRW[volPath]; exists { + container.VolumesRW[volPath] = isRW && mountRW + } + } + + } + } + return nil +} + +func getBindMap(container *Container) (map[string]BindMap, error) { + var ( + // Create the requested bind mounts + binds = make(map[string]BindMap) + // Define illegal container destinations + illegalDsts = []string{"/", "."} + ) + + for _, bind := range container.hostConfig.Binds { + // FIXME: factorize bind parsing in parseBind + var ( + src, dst, mode string + arr = strings.Split(bind, ":") + ) + + if len(arr) == 2 { + src = arr[0] + dst = arr[1] + mode = "rw" + } else if len(arr) == 3 { + src = arr[0] + dst = arr[1] + mode = arr[2] + } else { + return nil, fmt.Errorf("Invalid bind specification: %s", bind) + } + + // Bail if trying to mount to an illegal destination + for _, illegal := range illegalDsts { + if dst == illegal { + return nil, fmt.Errorf("Illegal bind destination: %s", dst) + } + } + + bindMap := BindMap{ + SrcPath: src, + DstPath: dst, + Mode: mode, + } + binds[filepath.Clean(dst)] = bindMap + } + return binds, nil +} + +func createVolumes(container *Container) error { + binds, err := getBindMap(container) + if err != nil { + return err + } + + volumesDriver := container.runtime.volumes.Driver() + // Create the requested volumes if they don't exist + for volPath := range container.Config.Volumes { + volPath = filepath.Clean(volPath) + volIsDir := true + // Skip existing volumes + if _, exists := container.Volumes[volPath]; exists { + continue + } + var srcPath string + var isBindMount bool + 
srcRW := false + // If an external bind is defined for this volume, use that as a source + if bindMap, exists := binds[volPath]; exists { + isBindMount = true + srcPath = bindMap.SrcPath + if !filepath.IsAbs(srcPath) { + return fmt.Errorf("%s must be an absolute path", srcPath) + } + if strings.ToLower(bindMap.Mode) == "rw" { + srcRW = true + } + if stat, err := os.Stat(bindMap.SrcPath); err != nil { + return err + } else { + volIsDir = stat.IsDir() + } + // Otherwise create an directory in $ROOT/volumes/ and use that + } else { + + // Do not pass a container as the parameter for the volume creation. + // The graph driver using the container's information ( Image ) to + // create the parent. + c, err := container.runtime.volumes.Create(nil, "", "", "", "", nil, nil) + if err != nil { + return err + } + srcPath, err = volumesDriver.Get(c.ID) + if err != nil { + return fmt.Errorf("Driver %s failed to get volume rootfs %s: %s", volumesDriver, c.ID, err) + } + srcRW = true // RW by default + } + + if p, err := filepath.EvalSymlinks(srcPath); err != nil { + return err + } else { + srcPath = p + } + + container.Volumes[volPath] = srcPath + container.VolumesRW[volPath] = srcRW + + // Create the mountpoint + volPath = filepath.Join(container.basefs, volPath) + rootVolPath, err := utils.FollowSymlinkInScope(volPath, container.basefs) + if err != nil { + return err + } + if err := createIfNotExists(rootVolPath, volIsDir); err != nil { + return err + } + + // Do not copy or change permissions if we are mounting from the host + if srcRW && !isBindMount { + volList, err := ioutil.ReadDir(rootVolPath) + if err != nil { + return err + } + if len(volList) > 0 { + srcList, err := ioutil.ReadDir(srcPath) + if err != nil { + return err + } + if len(srcList) == 0 { + // If the source volume is empty copy files from the root into the volume + if err := archive.CopyWithTar(rootVolPath, srcPath); err != nil { + return err + } + + var stat syscall.Stat_t + if err := 
syscall.Stat(rootVolPath, &stat); err != nil { + return err + } + var srcStat syscall.Stat_t + if err := syscall.Stat(srcPath, &srcStat); err != nil { + return err + } + // Change the source volume's ownership if it differs from the root + // files that were just copied + if stat.Uid != srcStat.Uid || stat.Gid != srcStat.Gid { + if err := os.Chown(srcPath, int(stat.Uid), int(stat.Gid)); err != nil { + return err + } + } + } + } + } + } + return nil +} + +func createIfNotExists(path string, isDir bool) error { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + if isDir { + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + } else { + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + f, err := os.OpenFile(path, os.O_CREATE, 0755) + if err != nil { + return err + } + defer f.Close() + } + } + } + return nil +} |
