summaryrefslogtreecommitdiff
path: root/runtime
diff options
context:
space:
mode:
author: unclejack <unclejack@users.noreply.github.com> 2014-04-09 01:56:01 +0300
committer: unclejack <unclejack@users.noreply.github.com> 2014-04-09 01:56:01 +0300
commit: e128a606e39fa63c6b4fd6e53a1d88cf00aad868 (patch)
tree: 199ee7eb6678ffecd2ddad95fce794c795ad5183 /runtime
parent: 143c9707a9fafc39e1d9747f528db97b2564f01e (diff)
parent: dc9c28f51d669d6b09e81c2381f800f1a33bb659 (diff)
download: docker-release-0.10.tar.gz
Merge pull request #5079 from unclejack/bump_v0.10.0 [branches: release-0.10, 0.10.1-hotfixes]
Bump version to v0.10.0
Diffstat (limited to 'runtime')
-rw-r--r--runtime/container.go1229
-rw-r--r--runtime/container_unit_test.go145
-rw-r--r--runtime/execdriver/MAINTAINERS2
-rw-r--r--runtime/execdriver/driver.go144
-rw-r--r--runtime/execdriver/execdrivers/execdrivers.go23
-rw-r--r--runtime/execdriver/lxc/driver.go418
-rw-r--r--runtime/execdriver/lxc/info.go50
-rw-r--r--runtime/execdriver/lxc/info_test.go36
-rw-r--r--runtime/execdriver/lxc/init.go175
-rw-r--r--runtime/execdriver/lxc/lxc_init_linux.go11
-rw-r--r--runtime/execdriver/lxc/lxc_init_unsupported.go7
-rw-r--r--runtime/execdriver/lxc/lxc_template.go176
-rw-r--r--runtime/execdriver/lxc/lxc_template_unit_test.go135
-rw-r--r--runtime/execdriver/native/configuration/parse.go186
-rw-r--r--runtime/execdriver/native/configuration/parse_test.go166
-rw-r--r--runtime/execdriver/native/create.go114
-rw-r--r--runtime/execdriver/native/driver.go292
-rw-r--r--runtime/execdriver/native/info.go21
-rw-r--r--runtime/execdriver/native/template/default_template.go45
-rw-r--r--runtime/execdriver/native/term.go42
-rw-r--r--runtime/execdriver/pipes.go23
-rw-r--r--runtime/execdriver/termconsole.go126
-rw-r--r--runtime/graphdriver/aufs/aufs.go401
-rw-r--r--runtime/graphdriver/aufs/aufs_test.go697
-rw-r--r--runtime/graphdriver/aufs/dirs.go46
-rw-r--r--runtime/graphdriver/aufs/migrate.go194
-rw-r--r--runtime/graphdriver/aufs/mount.go17
-rw-r--r--runtime/graphdriver/aufs/mount_linux.go11
-rw-r--r--runtime/graphdriver/aufs/mount_unsupported.go11
-rw-r--r--runtime/graphdriver/btrfs/btrfs.go213
-rw-r--r--runtime/graphdriver/btrfs/dummy_unsupported.go3
-rw-r--r--runtime/graphdriver/devmapper/attach_loopback.go126
-rw-r--r--runtime/graphdriver/devmapper/deviceset.go1122
-rw-r--r--runtime/graphdriver/devmapper/devmapper.go595
-rw-r--r--runtime/graphdriver/devmapper/devmapper_doc.go106
-rw-r--r--runtime/graphdriver/devmapper/devmapper_log.go15
-rw-r--r--runtime/graphdriver/devmapper/devmapper_test.go287
-rw-r--r--runtime/graphdriver/devmapper/devmapper_wrapper.go229
-rw-r--r--runtime/graphdriver/devmapper/driver.go142
-rw-r--r--runtime/graphdriver/devmapper/driver_test.go886
-rw-r--r--runtime/graphdriver/devmapper/ioctl.go71
-rw-r--r--runtime/graphdriver/devmapper/mount.go27
-rw-r--r--runtime/graphdriver/devmapper/sys.go57
-rw-r--r--runtime/graphdriver/driver.go92
-rw-r--r--runtime/graphdriver/vfs/driver.go95
-rw-r--r--runtime/history.go30
-rw-r--r--runtime/networkdriver/bridge/driver.go470
-rw-r--r--runtime/networkdriver/ipallocator/allocator.go159
-rw-r--r--runtime/networkdriver/ipallocator/allocator_test.go241
-rw-r--r--runtime/networkdriver/network.go10
-rw-r--r--runtime/networkdriver/network_test.go190
-rw-r--r--runtime/networkdriver/portallocator/portallocator.go188
-rw-r--r--runtime/networkdriver/portallocator/portallocator_test.go213
-rw-r--r--runtime/networkdriver/portmapper/mapper.go131
-rw-r--r--runtime/networkdriver/portmapper/mapper_test.go107
-rw-r--r--runtime/networkdriver/utils.go118
-rw-r--r--runtime/runtime.go993
-rw-r--r--runtime/runtime_aufs.go22
-rw-r--r--runtime/runtime_btrfs.go7
-rw-r--r--runtime/runtime_devicemapper.go7
-rw-r--r--runtime/runtime_no_aufs.go11
-rw-r--r--runtime/server.go10
-rw-r--r--runtime/sorter.go25
-rw-r--r--runtime/state.go84
-rw-r--r--runtime/utils.go64
-rw-r--r--runtime/utils_test.go29
-rw-r--r--runtime/volumes.go287
67 files changed, 12405 insertions, 0 deletions
diff --git a/runtime/container.go b/runtime/container.go
new file mode 100644
index 0000000000..c8053b146c
--- /dev/null
+++ b/runtime/container.go
@@ -0,0 +1,1229 @@
+package runtime
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "github.com/dotcloud/docker/archive"
+ "github.com/dotcloud/docker/engine"
+ "github.com/dotcloud/docker/image"
+ "github.com/dotcloud/docker/links"
+ "github.com/dotcloud/docker/nat"
+ "github.com/dotcloud/docker/runconfig"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "github.com/dotcloud/docker/utils"
+ "io"
+ "io/ioutil"
+ "log"
+ "os"
+ "path"
+ "strings"
+ "sync"
+ "syscall"
+ "time"
+)
+
+// DefaultPathEnv is the PATH value Start puts in a container's default
+// environment before merging in the container's own Config.Env.
+const DefaultPathEnv = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+// Sentinel errors exported by this package.
+var (
+ ErrNotATTY = errors.New("The PTY is not a file")
+ ErrNoTTY = errors.New("No PTY found")
+ ErrContainerStart = errors.New("The container failed to start. Unknown error")
+ ErrContainerStartTimeout = errors.New("The container failed to start due to timed out.")
+)
+
+// Container bundles the persisted description of a container with the
+// in-memory handles needed to run it. ToDisk passes the whole struct to
+// json.Marshal, so only the exported fields end up in config.json; the
+// unexported fields live in memory only.
+type Container struct {
+ sync.Mutex
+ root string // Path to the "home" of the container, including metadata.
+ basefs string // Path to the graphdriver mountpoint
+
+ ID string
+
+ Created time.Time
+
+ // Path and Args are handed to the exec driver as Entrypoint and Arguments
+ // (see populateCommand).
+ Path string
+ Args []string
+
+ Config *runconfig.Config
+ State State
+ Image string
+
+ NetworkSettings *NetworkSettings
+
+ ResolvConfPath string
+ HostnamePath string
+ HostsPath string
+ Name string
+ Driver string
+ ExecDriver string
+
+ command *execdriver.Command
+ stdout *utils.WriteBroadcaster
+ stderr *utils.WriteBroadcaster
+ stdin io.ReadCloser
+ stdinPipe io.WriteCloser
+
+ runtime *Runtime
+
+ // waitLock is created by Start and closed by monitor once the process has
+ // exited; Wait blocks on it.
+ waitLock chan struct{}
+ Volumes map[string]string
+ // Store rw/ro in a separate structure to preserve reverse-compatibility on-disk.
+ // Easier than migrating older container configs :)
+ VolumesRW map[string]bool
+ // hostConfig is stored separately in hostconfig.json (see WriteHostConfig).
+ hostConfig *runconfig.HostConfig
+
+ // activeLinks holds the links enabled by Start, keyed by link alias.
+ activeLinks map[string]*links.Link
+}
+
+// FIXME: move deprecated port stuff to nat to clean up the core.
+//
+// PortMapping is a deprecated string-to-string port table. allocateNetwork
+// clears NetworkSettings.PortMapping and fills NetworkSettings.Ports instead.
+type PortMapping map[string]string // Deprecated
+
+// NetworkSettings records the network configuration assigned to a container.
+// allocateNetwork fills IPAddress, IPPrefixLen, Gateway and Bridge from the
+// output of the "allocate_interface" engine job and Ports from the
+// "allocate_port" jobs; releaseNetwork replaces the whole value with an empty
+// one.
+type NetworkSettings struct {
+ IPAddress string
+ IPPrefixLen int
+ Gateway string
+ Bridge string
+ PortMapping map[string]PortMapping // Deprecated
+ Ports nat.PortMap
+}
+
+// PortMappingAPI converts settings.Ports into an engine.Table.
+//
+// A port without host bindings produces a single row carrying "PublicPort"
+// (the container port) and "Type". A port with bindings produces one row per
+// binding carrying "PrivatePort", "PublicPort", "Type" and "IP". Port parse
+// errors are ignored and whatever value nat.ParsePort returned is used.
+func (settings *NetworkSettings) PortMappingAPI() *engine.Table {
+	table := engine.NewTable("", 0)
+	for port, bindings := range settings.Ports {
+		containerPort, _ := nat.ParsePort(port.Port())
+
+		if len(bindings) > 0 {
+			for _, binding := range bindings {
+				hostPort, _ := nat.ParsePort(binding.HostPort)
+
+				row := &engine.Env{}
+				row.SetInt("PrivatePort", containerPort)
+				row.SetInt("PublicPort", hostPort)
+				row.Set("Type", port.Proto())
+				row.Set("IP", binding.HostIp)
+				table.Add(row)
+			}
+			continue
+		}
+
+		// Exposed but unbound port: a single row without host information.
+		row := &engine.Env{}
+		row.SetInt("PublicPort", containerPort)
+		row.Set("Type", port.Proto())
+		table.Add(row)
+	}
+	return table
+}
+
+// Inject copies the content of file to pth inside the container's mounted
+// filesystem. The container is mounted for the duration of the call and
+// unmounted again before returning. The reader is not closed.
+//
+// It returns an error if pth already exists, if the path cannot be stat'd for
+// any reason other than non-existence, or if creating, writing or closing the
+// destination file fails.
+func (container *Container) Inject(file io.Reader, pth string) error {
+	if err := container.Mount(); err != nil {
+		return fmt.Errorf("inject: error mounting container %s: %s", container.ID, err)
+	}
+	defer container.Unmount()
+
+	// Return error if path exists
+	destPath := path.Join(container.basefs, pth)
+	if _, err := os.Stat(destPath); err == nil {
+		// Since err is nil, the path could be stat'd and it exists
+		return fmt.Errorf("%s exists", pth)
+	} else if !os.IsNotExist(err) {
+		// Expect err might be that the file doesn't exist, so
+		// if it's some other error, return that.
+		return err
+	}
+
+	// Make sure the directory exists
+	if err := os.MkdirAll(path.Join(container.basefs, path.Dir(pth)), 0755); err != nil {
+		return err
+	}
+
+	dest, err := os.Create(destPath)
+	if err != nil {
+		return err
+	}
+
+	if _, err := io.Copy(dest, file); err != nil {
+		dest.Close()
+		return err
+	}
+	// Close can report a deferred write failure; surface it instead of
+	// claiming the injection succeeded.
+	return dest.Close()
+}
+
+// When returns the container's creation time (the Created field).
+func (container *Container) When() time.Time {
+	return container.Created
+}
+
+// FromDisk loads the container from its config.json file and then loads the
+// host configuration through readHostConfig.
+func (container *Container) FromDisk() error {
+	raw, err := ioutil.ReadFile(container.jsonPath())
+	if err != nil {
+		return err
+	}
+
+	// Load container settings.
+	// udp broke compat of docker.PortMapping, but it's not used when loading
+	// a container, so an unmarshal error mentioning it is tolerated.
+	if err := json.Unmarshal(raw, container); err != nil {
+		if !strings.Contains(err.Error(), "docker.PortMapping") {
+			return err
+		}
+	}
+	return container.readHostConfig()
+}
+
+// ToDisk serializes the container as JSON into config.json (mode 0666) and
+// then writes the host configuration through WriteHostConfig.
+func (container *Container) ToDisk() (err error) {
+	serialized, err := json.Marshal(container)
+	if err != nil {
+		return err
+	}
+	if err = ioutil.WriteFile(container.jsonPath(), serialized, 0666); err != nil {
+		return err
+	}
+	return container.WriteHostConfig()
+}
+
+// readHostConfig resets container.hostConfig to an empty HostConfig and, when
+// hostconfig.json exists, fills it from that file. A missing file is not an
+// error: the empty configuration is kept.
+func (container *Container) readHostConfig() error {
+	container.hostConfig = &runconfig.HostConfig{}
+
+	if _, statErr := os.Stat(container.hostConfigPath()); os.IsNotExist(statErr) {
+		// Nothing to read; the freshly initialized hostConfig stands.
+		return nil
+	}
+
+	raw, err := ioutil.ReadFile(container.hostConfigPath())
+	if err != nil {
+		return err
+	}
+	return json.Unmarshal(raw, container.hostConfig)
+}
+
+// WriteHostConfig serializes container.hostConfig as JSON into
+// hostconfig.json (mode 0666).
+func (container *Container) WriteHostConfig() (err error) {
+	serialized, err := json.Marshal(container.hostConfig)
+	if err != nil {
+		return err
+	}
+	return ioutil.WriteFile(container.hostConfigPath(), serialized, 0666)
+}
+
+// generateEnvConfig writes env, encoded as a JSON array, to the file returned
+// by EnvConfigPath (mode 0600).
+//
+// It returns any error from encoding, from resolving the path, or from
+// writing the file.
+func (container *Container) generateEnvConfig(env []string) error {
+	data, err := json.Marshal(env)
+	if err != nil {
+		return err
+	}
+	p, err := container.EnvConfigPath()
+	if err != nil {
+		return err
+	}
+	// The write error used to be dropped, which let Start continue with a
+	// stale or empty environment file.
+	return ioutil.WriteFile(p, data, 0600)
+}
+
+// Attach connects the given streams to the container's stdin, stdout and
+// stderr.
+//
+// For every non-nil stream (stdin only if Config.OpenStdin is set) a copy job
+// is started in its own goroutine and reports its result on an internal
+// channel. When stdout or stderr is nil, the corresponding container pipe is
+// still drained into a utils.NopWriter. stdinCloser, when non-nil, is closed
+// as each output goroutine (copy or drain) finishes.
+//
+// The returned channel comes from utils.Go; the function passed to it waits
+// for the jobs and returns the first non-nil error, or nil once all jobs are
+// done, closing the container-side stdout/stderr pipes on the way out.
+func (container *Container) Attach(stdin io.ReadCloser, stdinCloser io.Closer, stdout io.Writer, stderr io.Writer) chan error {
+	var cStdout, cStderr io.ReadCloser
+
+	var nJobs int
+	// Buffered for the three possible jobs so that a job can always report
+	// its result even if nobody is receiving any more.
+	errors := make(chan error, 3)
+	if stdin != nil && container.Config.OpenStdin {
+		nJobs++
+		if cStdin, err := container.StdinPipe(); err != nil {
+			errors <- err
+		} else {
+			go func() {
+				utils.Debugf("attach: stdin: begin")
+				defer utils.Debugf("attach: stdin: end")
+				// No matter what, when stdin is closed (io.Copy unblock), close stdout and stderr
+				if container.Config.StdinOnce && !container.Config.Tty {
+					defer cStdin.Close()
+				} else {
+					defer func() {
+						if cStdout != nil {
+							cStdout.Close()
+						}
+						if cStderr != nil {
+							cStderr.Close()
+						}
+					}()
+				}
+				if container.Config.Tty {
+					_, err = utils.CopyEscapable(cStdin, stdin)
+				} else {
+					_, err = io.Copy(cStdin, stdin)
+				}
+				// A closed pipe just means the other side went away.
+				if err == io.ErrClosedPipe {
+					err = nil
+				}
+				if err != nil {
+					utils.Errorf("attach: stdin: %s", err)
+				}
+				errors <- err
+			}()
+		}
+	}
+	if stdout != nil {
+		nJobs++
+		if p, err := container.StdoutPipe(); err != nil {
+			errors <- err
+		} else {
+			cStdout = p
+			go func() {
+				utils.Debugf("attach: stdout: begin")
+				defer utils.Debugf("attach: stdout: end")
+				// If we are in StdinOnce mode, then close stdin
+				if container.Config.StdinOnce && stdin != nil {
+					defer stdin.Close()
+				}
+				if stdinCloser != nil {
+					defer stdinCloser.Close()
+				}
+				_, err := io.Copy(stdout, cStdout)
+				if err == io.ErrClosedPipe {
+					err = nil
+				}
+				if err != nil {
+					utils.Errorf("attach: stdout: %s", err)
+				}
+				errors <- err
+			}()
+		}
+	} else {
+		// Nobody wants stdout: drain it into a no-op writer.
+		go func() {
+			if stdinCloser != nil {
+				defer stdinCloser.Close()
+			}
+			if drain, err := container.StdoutPipe(); err != nil {
+				utils.Errorf("attach: stdout pipe: %s", err)
+			} else {
+				io.Copy(&utils.NopWriter{}, drain)
+			}
+		}()
+	}
+	if stderr != nil {
+		nJobs++
+		if p, err := container.StderrPipe(); err != nil {
+			errors <- err
+		} else {
+			cStderr = p
+			go func() {
+				utils.Debugf("attach: stderr: begin")
+				defer utils.Debugf("attach: stderr: end")
+				// If we are in StdinOnce mode, then close stdin
+				if container.Config.StdinOnce && stdin != nil {
+					defer stdin.Close()
+				}
+				if stdinCloser != nil {
+					defer stdinCloser.Close()
+				}
+				_, err := io.Copy(stderr, cStderr)
+				if err == io.ErrClosedPipe {
+					err = nil
+				}
+				if err != nil {
+					utils.Errorf("attach: stderr: %s", err)
+				}
+				errors <- err
+			}()
+		}
+	} else {
+		// Nobody wants stderr: drain it into a no-op writer.
+		go func() {
+			if stdinCloser != nil {
+				defer stdinCloser.Close()
+			}
+
+			if drain, err := container.StderrPipe(); err != nil {
+				// This message used to say "stdout pipe", which pointed at
+				// the wrong stream.
+				utils.Errorf("attach: stderr pipe: %s", err)
+			} else {
+				io.Copy(&utils.NopWriter{}, drain)
+			}
+		}()
+	}
+
+	return utils.Go(func() error {
+		defer func() {
+			if cStdout != nil {
+				cStdout.Close()
+			}
+			if cStderr != nil {
+				cStderr.Close()
+			}
+		}()
+
+		// FIXME: how to clean up the stdin goroutine without the unwanted side effect
+		// of closing the passed stdin? Add an intermediary io.Pipe?
+		for i := 0; i < nJobs; i++ {
+			utils.Debugf("attach: waiting for job %d/%d", i+1, nJobs)
+			if err := <-errors; err != nil {
+				utils.Errorf("attach: job %d returned error %s, aborting all jobs", i+1, err)
+				return err
+			}
+			utils.Debugf("attach: job %d completed successfully", i+1)
+		}
+		utils.Debugf("attach: all jobs completed successfully")
+		return nil
+	})
+}
+
+// populateCommand builds c.command, the execdriver.Command describing how to
+// run the container, from the container's Config, hostConfig and
+// NetworkSettings. The network interface is left nil when networking is
+// disabled for the container.
+func populateCommand(c *Container) {
+	network := &execdriver.Network{
+		Mtu:       c.runtime.config.Mtu,
+		Interface: nil,
+	}
+	if !c.Config.NetworkDisabled {
+		settings := c.NetworkSettings
+		network.Interface = &execdriver.NetworkInterface{
+			Gateway:     settings.Gateway,
+			Bridge:      settings.Bridge,
+			IPAddress:   settings.IPAddress,
+			IPPrefixLen: settings.IPPrefixLen,
+		}
+	}
+
+	// TODO: this can be removed after lxc-conf is fully deprecated
+	driverConfig := make(map[string][]string)
+	mergeLxcConfIntoOptions(c.hostConfig, driverConfig)
+
+	c.command = &execdriver.Command{
+		ID:         c.ID,
+		Privileged: c.hostConfig.Privileged,
+		Rootfs:     c.RootfsPath(),
+		InitPath:   "/.dockerinit",
+		Entrypoint: c.Path,
+		Arguments:  c.Args,
+		WorkingDir: c.Config.WorkingDir,
+		Network:    network,
+		Tty:        c.Config.Tty,
+		User:       c.Config.User,
+		Config:     driverConfig,
+		Resources: &execdriver.Resources{
+			Memory:     c.Config.Memory,
+			MemorySwap: c.Config.MemorySwap,
+			CpuShares:  c.Config.CpuShares,
+		},
+	}
+	c.command.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
+}
+
+// ArgsAsString joins container.Args with single spaces, wrapping any argument
+// that contains a space in single quotes. Quotes inside an argument are not
+// escaped.
+func (container *Container) ArgsAsString() string {
+	quoted := make([]string, 0, len(container.Args))
+	for _, arg := range container.Args {
+		if strings.Contains(arg, " ") {
+			arg = "'" + arg + "'"
+		}
+		quoted = append(quoted, arg)
+	}
+	return strings.Join(quoted, " ")
+}
+
+// Start launches the container's process and returns once the exec driver
+// has invoked the start callback or the monitor has returned.
+//
+// The container lock is held for the whole call. Starting a container that
+// is already running is a no-op. In order, Start sets up DNS (if
+// ResolvConfPath is empty), mounts the filesystem, allocates the network and
+// writes the hostname/hosts files, prepares volumes, builds the environment
+// (defaults, link variables, then Config.Env overrides), makes sure the
+// working directory exists, builds the exec driver command, sets up mounts
+// and log files, and finally runs monitor in the background.
+//
+// If any step returns an error, the deferred function calls cleanup.
+func (container *Container) Start() (err error) {
+ container.Lock()
+ defer container.Unlock()
+
+ if container.State.IsRunning() {
+ return nil
+ }
+
+ // err is the named result, so this also sees errors returned through
+ // shadowed variables further down.
+ defer func() {
+ if err != nil {
+ container.cleanup()
+ }
+ }()
+
+ if container.ResolvConfPath == "" {
+ if err := container.setupContainerDns(); err != nil {
+ return err
+ }
+ }
+
+ if err := container.Mount(); err != nil {
+ return err
+ }
+
+ if container.runtime.config.DisableNetwork {
+ container.Config.NetworkDisabled = true
+ container.buildHostnameAndHostsFiles("127.0.1.1")
+ } else {
+ if err := container.allocateNetwork(); err != nil {
+ return err
+ }
+ container.buildHostnameAndHostsFiles(container.NetworkSettings.IPAddress)
+ }
+
+ // Make sure the config is compatible with the current kernel
+ if container.Config.Memory > 0 && !container.runtime.sysInfo.MemoryLimit {
+ log.Printf("WARNING: Your kernel does not support memory limit capabilities. Limitation discarded.\n")
+ container.Config.Memory = 0
+ }
+ if container.Config.Memory > 0 && !container.runtime.sysInfo.SwapLimit {
+ log.Printf("WARNING: Your kernel does not support swap limit capabilities. Limitation discarded.\n")
+ container.Config.MemorySwap = -1
+ }
+
+ if container.runtime.sysInfo.IPv4ForwardingDisabled {
+ log.Printf("WARNING: IPv4 forwarding is disabled. Networking will not work")
+ }
+
+ if err := prepareVolumesForContainer(container); err != nil {
+ return err
+ }
+
+ // Setup environment
+ env := []string{
+ "HOME=/",
+ "PATH=" + DefaultPathEnv,
+ "HOSTNAME=" + container.Config.Hostname,
+ }
+
+ if container.Config.Tty {
+ env = append(env, "TERM=xterm")
+ }
+
+ // Init any links between the parent and children
+ runtime := container.runtime
+
+ children, err := runtime.Children(container.Name)
+ if err != nil {
+ return err
+ }
+
+ if len(children) > 0 {
+ container.activeLinks = make(map[string]*links.Link, len(children))
+
+ // If we encounter an error make sure that we rollback any network
+ // config and ip table changes
+ rollback := func() {
+ for _, link := range container.activeLinks {
+ link.Disable()
+ }
+ container.activeLinks = nil
+ }
+
+ for linkAlias, child := range children {
+ if !child.State.IsRunning() {
+ return fmt.Errorf("Cannot link to a non running container: %s AS %s", child.Name, linkAlias)
+ }
+
+ link, err := links.NewLink(
+ container.NetworkSettings.IPAddress,
+ child.NetworkSettings.IPAddress,
+ linkAlias,
+ child.Config.Env,
+ child.Config.ExposedPorts,
+ runtime.eng)
+
+ if err != nil {
+ rollback()
+ return err
+ }
+
+ container.activeLinks[link.Alias()] = link
+ if err := link.Enable(); err != nil {
+ rollback()
+ return err
+ }
+
+ for _, envVar := range link.ToEnv() {
+ env = append(env, envVar)
+ }
+ }
+ }
+
+ // because the env on the container can override certain default values
+ // we need to replace the 'env' keys where they match and append anything
+ // else.
+ env = utils.ReplaceOrAppendEnvValues(env, container.Config.Env)
+ if err := container.generateEnvConfig(env); err != nil {
+ return err
+ }
+
+ if container.Config.WorkingDir != "" {
+ container.Config.WorkingDir = path.Clean(container.Config.WorkingDir)
+
+ // Create the working directory if it is missing; refuse to start if the
+ // path exists but is not a directory.
+ pthInfo, err := os.Stat(path.Join(container.basefs, container.Config.WorkingDir))
+ if err != nil {
+ if !os.IsNotExist(err) {
+ return err
+ }
+ if err := os.MkdirAll(path.Join(container.basefs, container.Config.WorkingDir), 0755); err != nil {
+ return err
+ }
+ }
+ if pthInfo != nil && !pthInfo.IsDir() {
+ return fmt.Errorf("Cannot mkdir: %s is not a directory", container.Config.WorkingDir)
+ }
+ }
+
+ envPath, err := container.EnvConfigPath()
+ if err != nil {
+ return err
+ }
+
+ populateCommand(container)
+ container.command.Env = env
+
+ if err := setupMountsForContainer(container, envPath); err != nil {
+ return err
+ }
+
+ // Setup logging of stdout and stderr to disk
+ if err := container.runtime.LogToDisk(container.stdout, container.logPath("json"), "stdout"); err != nil {
+ return err
+ }
+ if err := container.runtime.LogToDisk(container.stderr, container.logPath("json"), "stderr"); err != nil {
+ return err
+ }
+ // Closed by monitor when the process exits; Wait blocks on it.
+ container.waitLock = make(chan struct{})
+
+ // callbackLock is closed by the callback below to signal that the process
+ // has been started.
+ callbackLock := make(chan struct{})
+ callback := func(command *execdriver.Command) {
+ container.State.SetRunning(command.Pid())
+ if command.Tty {
+ // The callback is called after the process Start()
+ // so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave
+ // which we close here.
+ if c, ok := command.Stdout.(io.Closer); ok {
+ c.Close()
+ }
+ }
+ if err := container.ToDisk(); err != nil {
+ utils.Debugf("%s", err)
+ }
+ close(callbackLock)
+ }
+
+ // We use a callback here instead of a goroutine and a chan for
+ // synchronization purposes
+ cErr := utils.Go(func() error { return container.monitor(callback) })
+
+ // Start should not return until the process is actually running
+ select {
+ case <-callbackLock:
+ case err := <-cErr:
+ return err
+ }
+ return nil
+}
+
+// Run starts the container and blocks until it has stopped. Only a Start
+// failure is reported; the exit code returned by Wait is discarded.
+func (container *Container) Run() error {
+	err := container.Start()
+	if err == nil {
+		container.Wait()
+	}
+	return err
+}
+
+// Output starts the container, reads everything it writes to stdout and waits
+// for it to stop. The stdout pipe is attached before Start is called. It
+// returns the collected bytes together with any read error.
+func (container *Container) Output() (output []byte, err error) {
+	stdout, err := container.StdoutPipe()
+	if err != nil {
+		return nil, err
+	}
+	defer stdout.Close()
+
+	if startErr := container.Start(); startErr != nil {
+		return nil, startErr
+	}
+
+	output, err = ioutil.ReadAll(stdout)
+	container.Wait()
+	return output, err
+}
+
+// StdinPipe returns the WriteCloser used to feed data to the standard input
+// of the container's process. The error is always nil.
+//
+// StdoutPipe and StderrPipe each return a ReadCloser for the output (and
+// error) stream of the container's process. Every call registers a new
+// consumer with a "broadcaster", so all consumers receive a copy of the data.
+func (container *Container) StdinPipe() (io.WriteCloser, error) {
+	return container.stdinPipe, nil
+}
+
+// StdoutPipe registers the write end of a new io.Pipe with the container's
+// stdout broadcaster and returns the read end wrapped in a utils.BufReader.
+// The error is always nil.
+func (container *Container) StdoutPipe() (io.ReadCloser, error) {
+	r, w := io.Pipe()
+	container.stdout.AddWriter(w, "")
+	return utils.NewBufReader(r), nil
+}
+
+// StderrPipe registers the write end of a new io.Pipe with the container's
+// stderr broadcaster and returns the read end wrapped in a utils.BufReader.
+// The error is always nil.
+func (container *Container) StderrPipe() (io.ReadCloser, error) {
+	r, w := io.Pipe()
+	container.stderr.AddWriter(w, "")
+	return utils.NewBufReader(r), nil
+}
+
+// buildHostnameAndHostsFiles writes the container's "hostname" and "hosts"
+// files under container.root and records their locations in HostnamePath and
+// HostsPath.
+//
+// The hosts file always contains the static loopback/IPv6 entries. A line
+// mapping IP to the container's name is put in front of them when a domain
+// name is configured (fully qualified name plus short name) or, failing
+// that, when networking is enabled (short name only).
+//
+// The function has no error result, so write failures are logged rather than
+// silently dropped.
+func (container *Container) buildHostnameAndHostsFiles(IP string) {
+	container.HostnamePath = path.Join(container.root, "hostname")
+	if err := ioutil.WriteFile(container.HostnamePath, []byte(container.Config.Hostname+"\n"), 0644); err != nil {
+		utils.Errorf("%s: Error writing hostname file: %s", container.ID, err)
+	}
+
+	hostsContent := []byte(`
+127.0.0.1 localhost
+::1 localhost ip6-localhost ip6-loopback
+fe00::0 ip6-localnet
+ff00::0 ip6-mcastprefix
+ff02::1 ip6-allnodes
+ff02::2 ip6-allrouters
+`)
+
+	container.HostsPath = path.Join(container.root, "hosts")
+
+	if container.Config.Domainname != "" {
+		hostsContent = append([]byte(fmt.Sprintf("%s\t%s.%s %s\n", IP, container.Config.Hostname, container.Config.Domainname, container.Config.Hostname)), hostsContent...)
+	} else if !container.Config.NetworkDisabled {
+		hostsContent = append([]byte(fmt.Sprintf("%s\t%s\n", IP, container.Config.Hostname)), hostsContent...)
+	}
+
+	if err := ioutil.WriteFile(container.HostsPath, hostsContent, 0644); err != nil {
+		utils.Errorf("%s: Error writing hosts file: %s", container.ID, err)
+	}
+}
+
+// allocateNetwork obtains a network interface and host port bindings for the
+// container through the engine jobs "allocate_interface" and "allocate_port",
+// then stores the result in container.NetworkSettings. It does nothing when
+// networking is disabled for the container.
+//
+// For a ghost container the previously recorded IP address is requested
+// again and the previously recorded ports are re-allocated; otherwise the
+// exposed ports and bindings come from Config and hostConfig. Legacy
+// Config.PortSpecs are migrated first.
+//
+// Any failing job or host-config write is returned as an error. A failed
+// "allocate_port" job also triggers "release_interface".
+func (container *Container) allocateNetwork() error {
+	if container.Config.NetworkDisabled {
+		return nil
+	}
+
+	var (
+		env *engine.Env
+		err error
+		eng = container.runtime.eng
+	)
+
+	if container.State.IsGhost() {
+		if container.runtime.config.DisableNetwork {
+			env = &engine.Env{}
+		} else {
+			currentIP := container.NetworkSettings.IPAddress
+
+			job := eng.Job("allocate_interface", container.ID)
+			if currentIP != "" {
+				job.Setenv("RequestIP", currentIP)
+			}
+
+			env, err = job.Stdout.AddEnv()
+			if err != nil {
+				return err
+			}
+
+			if err := job.Run(); err != nil {
+				return err
+			}
+		}
+	} else {
+		job := eng.Job("allocate_interface", container.ID)
+		env, err = job.Stdout.AddEnv()
+		if err != nil {
+			return err
+		}
+		if err := job.Run(); err != nil {
+			return err
+		}
+	}
+
+	if container.Config.PortSpecs != nil {
+		utils.Debugf("Migrating port mappings for container: %s", strings.Join(container.Config.PortSpecs, ", "))
+		if err := migratePortMappings(container.Config, container.hostConfig); err != nil {
+			return err
+		}
+		container.Config.PortSpecs = nil
+		if err := container.WriteHostConfig(); err != nil {
+			return err
+		}
+	}
+
+	var (
+		portSpecs = make(nat.PortSet)
+		bindings  = make(nat.PortMap)
+	)
+
+	if !container.State.IsGhost() {
+		if container.Config.ExposedPorts != nil {
+			portSpecs = container.Config.ExposedPorts
+		}
+		if container.hostConfig.PortBindings != nil {
+			bindings = container.hostConfig.PortBindings
+		}
+	} else {
+		if container.NetworkSettings.Ports != nil {
+			for port, binding := range container.NetworkSettings.Ports {
+				portSpecs[port] = struct{}{}
+				bindings[port] = binding
+			}
+		}
+	}
+
+	container.NetworkSettings.PortMapping = nil
+
+	for port := range portSpecs {
+		binding := bindings[port]
+		// With PublishAllPorts, an exposed port without explicit bindings
+		// still gets one (empty) binding for the port job to fill in.
+		if container.hostConfig.PublishAllPorts && len(binding) == 0 {
+			binding = append(binding, nat.PortBinding{})
+		}
+
+		for i := 0; i < len(binding); i++ {
+			b := binding[i]
+
+			portJob := eng.Job("allocate_port", container.ID)
+			portJob.Setenv("HostIP", b.HostIp)
+			portJob.Setenv("HostPort", b.HostPort)
+			portJob.Setenv("Proto", port.Proto())
+			portJob.Setenv("ContainerPort", port.Port())
+
+			portEnv, err := portJob.Stdout.AddEnv()
+			if err != nil {
+				return err
+			}
+			if err := portJob.Run(); err != nil {
+				eng.Job("release_interface", container.ID).Run()
+				return err
+			}
+			b.HostIp = portEnv.Get("HostIP")
+			b.HostPort = portEnv.Get("HostPort")
+
+			binding[i] = b
+		}
+		bindings[port] = binding
+	}
+	// Checked like the earlier WriteHostConfig call in this function; the
+	// result used to be ignored here.
+	if err := container.WriteHostConfig(); err != nil {
+		return err
+	}
+
+	container.NetworkSettings.Ports = bindings
+
+	container.NetworkSettings.Bridge = env.Get("Bridge")
+	container.NetworkSettings.IPAddress = env.Get("IP")
+	container.NetworkSettings.IPPrefixLen = env.GetInt("IPPrefixLen")
+	container.NetworkSettings.Gateway = env.Get("Gateway")
+
+	return nil
+}
+
+// releaseNetwork runs the "release_interface" engine job for the container
+// and resets NetworkSettings to an empty value. It does nothing when
+// networking is disabled for the container. The job's result is ignored.
+func (container *Container) releaseNetwork() {
+	if !container.Config.NetworkDisabled {
+		container.runtime.eng.Job("release_interface", container.ID).Run()
+		container.NetworkSettings = &NetworkSettings{}
+	}
+}
+
+// monitor runs the container through the runtime and blocks until the run
+// returns. Afterwards it records the stopped state (only while the server is
+// running), cleans up, re-creates the stdin pipe if stdin is open, logs a
+// "die" event and closes waitLock so that Wait returns. The error from the
+// run is returned to the caller.
+func (container *Container) monitor(callback execdriver.StartCallback) error {
+	pipes := execdriver.NewPipes(container.stdin, container.stdout, container.stderr, container.Config.OpenStdin)
+
+	exitCode, err := container.runtime.Run(container, pipes, callback)
+	if err != nil {
+		utils.Errorf("Error running container: %s", err)
+	}
+
+	if container.runtime != nil && container.runtime.srv != nil && container.runtime.srv.IsRunning() {
+		container.State.SetStopped(exitCode)
+
+		// FIXME: there is a race condition here which causes this to fail during the unit tests.
+		// If another goroutine was waiting for Wait() to return before removing the container's root
+		// from the filesystem... At this point it may already have done so.
+		// This is because State.setStopped() has already been called, and has caused Wait()
+		// to return.
+		// FIXME: why are we serializing running state to disk in the first place?
+		if diskErr := container.ToDisk(); diskErr != nil {
+			utils.Errorf("Error dumping container state to disk: %s\n", diskErr)
+		}
+	}
+
+	// Cleanup
+	container.cleanup()
+
+	// Re-create a brand new stdin pipe once the container exited
+	if container.Config.OpenStdin {
+		container.stdin, container.stdinPipe = io.Pipe()
+	}
+
+	if container.runtime != nil && container.runtime.srv != nil {
+		imageName := container.runtime.repositories.ImageName(container.Image)
+		container.runtime.srv.LogEvent("die", container.ID, imageName)
+	}
+
+	close(container.waitLock)
+
+	return err
+}
+
+// cleanup releases what a started container holds: the network, the active
+// links, the stdin pipe (if stdin is open), the stdout/stderr broadcaster
+// writers, the terminal and finally the filesystem mount. Failures are logged
+// and do not stop the remaining steps.
+func (container *Container) cleanup() {
+	container.releaseNetwork()
+
+	// Disable all active links (ranging over a nil map is a no-op).
+	for _, link := range container.activeLinks {
+		link.Disable()
+	}
+
+	if container.Config.OpenStdin {
+		if err := container.stdin.Close(); err != nil {
+			utils.Errorf("%s: Error close stdin: %s", container.ID, err)
+		}
+	}
+	if err := container.stdout.CloseWriters(); err != nil {
+		utils.Errorf("%s: Error close stdout: %s", container.ID, err)
+	}
+	if err := container.stderr.CloseWriters(); err != nil {
+		utils.Errorf("%s: Error close stderr: %s", container.ID, err)
+	}
+
+	if cmd := container.command; cmd != nil && cmd.Terminal != nil {
+		if err := cmd.Terminal.Close(); err != nil {
+			utils.Errorf("%s: Error closing terminal: %s", container.ID, err)
+		}
+	}
+
+	if err := container.Unmount(); err != nil {
+		log.Printf("%v: Failed to umount filesystem: %v", container.ID, err)
+	}
+}
+
+// KillSig sends signal number sig to the container through the runtime while
+// holding the container lock. It is a no-op returning nil when the container
+// is not running.
+func (container *Container) KillSig(sig int) error {
+	container.Lock()
+	defer container.Unlock()
+
+	if container.State.IsRunning() {
+		return container.runtime.Kill(container, sig)
+	}
+	return nil
+}
+
+// Kill forcibly terminates the container and waits for it to stop. It is a
+// no-op when the container is not running.
+//
+// Signal 9 (SIGKILL) is first sent through KillSig. If the container has not
+// exited after 10 seconds, the signal is sent straight to the recorded
+// process ID with syscall.Kill.
+func (container *Container) Kill() error {
+	if !container.State.IsRunning() {
+		return nil
+	}
+
+	// 1. Send SIGKILL
+	if err := container.KillSig(9); err != nil {
+		return err
+	}
+
+	// 2. Wait for the process to die, in last resort, try to kill the process directly
+	if waitErr := container.WaitTimeout(10 * time.Second); waitErr != nil {
+		log.Printf("Container %s failed to exit within 10 seconds of kill - trying direct SIGKILL", utils.TruncateID(container.ID))
+		if killErr := syscall.Kill(container.State.Pid, 9); killErr != nil {
+			return killErr
+		}
+	}
+
+	container.Wait()
+	return nil
+}
+
+// Stop asks the container to exit and escalates if it does not. It is a no-op
+// when the container is not running.
+//
+// Signal 15 (SIGTERM) is sent first, falling back to signal 9 if that cannot
+// be delivered. If the container is still running after the given number of
+// seconds, Kill is called.
+func (container *Container) Stop(seconds int) error {
+	if !container.State.IsRunning() {
+		return nil
+	}
+
+	// 1. Send a SIGTERM
+	if termErr := container.KillSig(15); termErr != nil {
+		log.Print("Failed to send SIGTERM to the process, force killing")
+		if killErr := container.KillSig(9); killErr != nil {
+			return killErr
+		}
+	}
+
+	// 2. Wait for the process to exit on its own
+	grace := time.Duration(seconds) * time.Second
+	if waitErr := container.WaitTimeout(grace); waitErr != nil {
+		log.Printf("Container %v failed to exit within %d seconds of SIGTERM - using the force", container.ID, seconds)
+		// 3. If it doesn't, then send SIGKILL
+		if killErr := container.Kill(); killErr != nil {
+			return killErr
+		}
+	}
+	return nil
+}
+
+// Restart stops the container, allowing it the given number of seconds to
+// exit, and then starts it again.
+func (container *Container) Restart(seconds int) error {
+	// Hold an extra mount across the stop/start pair to avoid unnecessarily
+	// unmounting and then directly mounting the container again. A failed
+	// mount here is not an error.
+	if mountErr := container.Mount(); mountErr == nil {
+		defer container.Unmount()
+	}
+
+	if stopErr := container.Stop(seconds); stopErr != nil {
+		return stopErr
+	}
+	return container.Start()
+}
+
+// Wait blocks until waitLock is closed, which monitor does once the container
+// has stopped, and then returns the container's exit code.
+func (container *Container) Wait() int {
+	<-container.waitLock
+	return container.State.GetExitCode()
+}
+
+// Resize forwards the new height h and width w to the terminal of the
+// container's command.
+func (container *Container) Resize(h, w int) error {
+	return container.command.Terminal.Resize(h, w)
+}
+
+// ExportRw returns an archive of the container's changes as produced by the
+// runtime's Diff. The container stays mounted while the archive is open;
+// closing the returned archive closes the underlying stream and unmounts the
+// container.
+//
+// It returns an error when the container has no runtime, when mounting fails
+// or when the diff cannot be produced.
+func (container *Container) ExportRw() (archive.Archive, error) {
+	// Check the runtime before Mount, which itself goes through
+	// container.runtime. Checking afterwards, as the code used to, could
+	// never guard the Mount call and skipped the Unmount on this path.
+	if container.runtime == nil {
+		return nil, fmt.Errorf("Can't load storage driver for unregistered container %s", container.ID)
+	}
+	if err := container.Mount(); err != nil {
+		return nil, err
+	}
+	diff, err := container.runtime.Diff(container)
+	if err != nil {
+		container.Unmount()
+		return nil, err
+	}
+	return utils.NewReadCloserWrapper(diff, func() error {
+		err := diff.Close()
+		container.Unmount()
+		return err
+	}), nil
+}
+
+// Export returns an uncompressed tar archive of the container's mounted
+// filesystem. The container stays mounted while the archive is open; closing
+// the returned archive closes the tar stream and unmounts the container.
+func (container *Container) Export() (archive.Archive, error) {
+	if err := container.Mount(); err != nil {
+		return nil, err
+	}
+
+	tarball, err := archive.Tar(container.basefs, archive.Uncompressed)
+	if err != nil {
+		container.Unmount()
+		return nil, err
+	}
+
+	release := func() error {
+		closeErr := tarball.Close()
+		container.Unmount()
+		return closeErr
+	}
+	return utils.NewReadCloserWrapper(tarball, release), nil
+}
+
+// WaitTimeout waits for the container to stop for at most timeout. It returns
+// nil if the container stopped in time and a "Timed Out" error otherwise.
+func (container *Container) WaitTimeout(timeout time.Duration) error {
+	// Buffered so the helper goroutine can deliver its result and exit even
+	// after the timeout branch has returned. With an unbuffered channel it
+	// would block on the send forever once the container stopped.
+	done := make(chan bool, 1)
+	go func() {
+		container.Wait()
+		done <- true
+	}()
+
+	select {
+	case <-time.After(timeout):
+		return fmt.Errorf("Timed Out")
+	case <-done:
+		return nil
+	}
+}
+
+// Mount asks the runtime to mount the container's filesystem.
+func (container *Container) Mount() error {
+	return container.runtime.Mount(container)
+}
+
+// Changes returns the list of filesystem changes the runtime reports for the
+// container.
+func (container *Container) Changes() ([]archive.Change, error) {
+	return container.runtime.Changes(container)
+}
+
+// GetImage looks up the container's image in the runtime's graph. It returns
+// an error for a container that has no runtime.
+func (container *Container) GetImage() (*image.Image, error) {
+	if container.runtime != nil {
+		return container.runtime.graph.Get(container.Image)
+	}
+	return nil, fmt.Errorf("Can't get image of unregistered container")
+}
+
+// Unmount asks the runtime to unmount the container's filesystem.
+func (container *Container) Unmount() error {
+	return container.runtime.Unmount(container)
+}
+
+// logPath returns the path of the log file "<ID>-<name>.log" inside the
+// container's root directory.
+func (container *Container) logPath(name string) string {
+	fileName := fmt.Sprintf("%s-%s.log", container.ID, name)
+	return path.Join(container.root, fileName)
+}
+
+// ReadLog opens the container's log file with the given name (see logPath)
+// for reading.
+func (container *Container) ReadLog(name string) (io.Reader, error) {
+	return os.Open(container.logPath(name))
+}
+
+// hostConfigPath returns the location of hostconfig.json inside the
+// container's root directory.
+func (container *Container) hostConfigPath() string {
+	return path.Join(container.root, "hostconfig.json")
+}
+
+// jsonPath returns the location of config.json inside the container's root
+// directory.
+func (container *Container) jsonPath() string {
+	return path.Join(container.root, "config.json")
+}
+
+// EnvConfigPath returns the path of config.env inside the container's root
+// directory, creating an empty file there if none exists yet. Any stat error
+// other than "does not exist", and any creation error, is returned with an
+// empty path.
+func (container *Container) EnvConfigPath() (string, error) {
+	p := path.Join(container.root, "config.env")
+
+	_, statErr := os.Stat(p)
+	if statErr == nil {
+		return p, nil
+	}
+	if !os.IsNotExist(statErr) {
+		return "", statErr
+	}
+
+	f, err := os.Create(p)
+	if err != nil {
+		return "", err
+	}
+	f.Close()
+	return p, nil
+}
+
+// RootfsPath returns the container's basefs path. It has to be exported so
+// that the lxc template can call it.
+// The directory is only usable while the container is running.
+func (container *Container) RootfsPath() string {
+	return container.basefs
+}
+
+// validateID rejects the empty string as a container id; every other value
+// is accepted.
+func validateID(id string) error {
+	if len(id) > 0 {
+		return nil
+	}
+	return fmt.Errorf("Invalid empty id")
+}
+
+// GetSize returns (sizeRw, sizeRootfs): the size of the container's changes
+// and the total size of its root filesystem tree.
+//
+// The container is mounted for the duration of the call. If mounting fails
+// both values are returned as 0 after logging a warning. A value of -1 means
+// the corresponding size could not be computed.
+func (container *Container) GetSize() (int64, int64) {
+	var (
+		sizeRw, sizeRootfs int64
+		err                error
+		driver             = container.runtime.driver
+	)
+
+	if err := container.Mount(); err != nil {
+		utils.Errorf("Warning: failed to compute size of container rootfs %s: %s", container.ID, err)
+		return sizeRw, sizeRootfs
+	}
+	defer container.Unmount()
+
+	if differ, ok := driver.(graphdriver.Differ); ok {
+		// The graph driver can report the diff size directly.
+		sizeRw, err = differ.DiffSize(container.ID)
+		if err != nil {
+			utils.Errorf("Warning: driver %s couldn't return diff size of container %s: %s", driver, container.ID, err)
+			// FIXME: GetSize should return an error. Not changing it now in case
+			// there is a side-effect.
+			sizeRw = -1
+		}
+	} else {
+		// Fall back to summing the sizes of the changed files.
+		changes, _ := container.Changes()
+		if changes != nil {
+			sizeRw = archive.ChangesSize(container.basefs, changes)
+		} else {
+			sizeRw = -1
+		}
+	}
+
+	// Only walk the rootfs when it actually exists. The previous check was
+	// inverted (err != nil), so the tree size was never computed for a
+	// mounted container.
+	if _, err = os.Stat(container.basefs); err == nil {
+		if sizeRootfs, err = utils.TreeSize(container.basefs); err != nil {
+			sizeRootfs = -1
+		}
+	}
+	return sizeRw, sizeRootfs
+}
+
+// Copy returns an uncompressed tar stream of resource, a path relative to the
+// container's root filesystem. For a regular file the archive contains just
+// that file; for a directory it contains the directory itself.
+//
+// The container is mounted for the lifetime of the returned stream and is
+// unmounted when the stream is closed. On every error path the container is
+// unmounted before returning.
+func (container *Container) Copy(resource string) (io.ReadCloser, error) {
+	if err := container.Mount(); err != nil {
+		return nil, err
+	}
+	var filter []string
+	basePath := path.Join(container.basefs, resource)
+	stat, err := os.Stat(basePath)
+	if err != nil {
+		container.Unmount()
+		return nil, err
+	}
+	// Archive from the parent directory and include only the requested entry.
+	if !stat.IsDir() {
+		d, f := path.Split(basePath)
+		basePath = d
+		filter = []string{f}
+	} else {
+		filter = []string{path.Base(basePath)}
+		basePath = path.Dir(basePath)
+	}
+
+	tarStream, err := archive.TarFilter(basePath, &archive.TarOptions{
+		Compression: archive.Uncompressed,
+		Includes:    filter,
+	})
+	if err != nil {
+		// Previously this path returned with the container still mounted.
+		container.Unmount()
+		return nil, err
+	}
+	return utils.NewReadCloserWrapper(tarStream, func() error {
+		err := tarStream.Close()
+		container.Unmount()
+		return err
+	}), nil
+}
+
+// Exposes returns true if the container exposes a certain port, i.e. if p is
+// a key of container.Config.ExposedPorts.
+func (container *Container) Exposes(p nat.Port) bool {
+ _, exists := container.Config.ExposedPorts[p]
+ return exists
+}
+
+// GetPtyMaster returns the master file of the container's terminal.
+// It returns ErrNoTTY when container.command.Terminal does not implement
+// execdriver.TtyTerminal.
+func (container *Container) GetPtyMaster() (*os.File, error) {
+ ttyConsole, ok := container.command.Terminal.(execdriver.TtyTerminal)
+ if !ok {
+ return nil, ErrNoTTY
+ }
+ return ttyConsole.Master(), nil
+}
+
+// HostConfig returns the container's current host configuration pointer.
+func (container *Container) HostConfig() *runconfig.HostConfig {
+ return container.hostConfig
+}
+
+// SetHostConfig replaces the container's host configuration pointer with
+// hostConfig; no copy is made.
+func (container *Container) SetHostConfig(hostConfig *runconfig.HostConfig) {
+ container.hostConfig = hostConfig
+}
+
+// DisableLink disables the active link registered under name. It does
+// nothing when the container has no active-links map, and only emits a debug
+// message when the map exists but holds no entry for name.
+func (container *Container) DisableLink(name string) {
+	if container.activeLinks == nil {
+		return
+	}
+	active, found := container.activeLinks[name]
+	if !found {
+		utils.Debugf("Could not find active link for %s", name)
+		return
+	}
+	active.Disable()
+}
+
+// setupContainerDns decides which resolv.conf the container uses and stores
+// its path in container.ResolvConfPath.
+//
+// When neither the container's host config nor the runtime config defines dns
+// servers or search domains, the path is set to "/etc/resolv.conf". Otherwise
+// a "resolv.conf" file is written under container.root. For both the
+// nameserver list and the search list the precedence is: container host
+// config, then runtime config, then the values parsed from the resolv.conf
+// content returned by utils.GetResolvConf.
+func (container *Container) setupContainerDns() error {
+ var (
+ config = container.hostConfig
+ runtime = container.runtime
+ )
+ // Read up front; an error here aborts even if no custom file is needed.
+ resolvConf, err := utils.GetResolvConf()
+ if err != nil {
+ return err
+ }
+ // If custom dns exists, then create a resolv.conf for the container
+ if len(config.Dns) > 0 || len(runtime.config.Dns) > 0 || len(config.DnsSearch) > 0 || len(runtime.config.DnsSearch) > 0 {
+ var (
+ dns = utils.GetNameservers(resolvConf)
+ dnsSearch = utils.GetSearchDomains(resolvConf)
+ )
+ if len(config.Dns) > 0 {
+ dns = config.Dns
+ } else if len(runtime.config.Dns) > 0 {
+ dns = runtime.config.Dns
+ }
+ if len(config.DnsSearch) > 0 {
+ dnsSearch = config.DnsSearch
+ } else if len(runtime.config.DnsSearch) > 0 {
+ dnsSearch = runtime.config.DnsSearch
+ }
+ container.ResolvConfPath = path.Join(container.root, "resolv.conf")
+ f, err := os.Create(container.ResolvConfPath)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ // One "nameserver" line per server; the loop variable shadows the slice.
+ for _, dns := range dns {
+ if _, err := f.Write([]byte("nameserver " + dns + "\n")); err != nil {
+ return err
+ }
+ }
+ // All search domains go on a single space-separated "search" line.
+ if len(dnsSearch) > 0 {
+ if _, err := f.Write([]byte("search " + strings.Join(dnsSearch, " ") + "\n")); err != nil {
+ return err
+ }
+ }
+ } else {
+ container.ResolvConfPath = "/etc/resolv.conf"
+ }
+ return nil
+}
diff --git a/runtime/container_unit_test.go b/runtime/container_unit_test.go
new file mode 100644
index 0000000000..fba036ca50
--- /dev/null
+++ b/runtime/container_unit_test.go
@@ -0,0 +1,145 @@
+package runtime
+
+import (
+ "github.com/dotcloud/docker/nat"
+ "testing"
+)
+
+// TestParseNetworkOptsPrivateOnly checks that the spec "192.168.1.100::80"
+// parses to one tcp port 80 with a single binding on host ip 192.168.1.100
+// and an empty host port.
+func TestParseNetworkOptsPrivateOnly(t *testing.T) {
+	ports, bindings, err := nat.ParsePortSpecs([]string{"192.168.1.100::80"})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(ports) != 1 {
+		t.Fatalf("Expected 1 got %d", len(ports))
+	}
+	if len(bindings) != 1 {
+		t.Fatalf("Expected 1 got %d", len(bindings))
+	}
+	for port := range ports {
+		if port.Proto() != "tcp" {
+			t.Errorf("Expected tcp got %s", port.Proto())
+		}
+		if port.Port() != "80" {
+			t.Errorf("Expected 80 got %s", port.Port())
+		}
+		hostBindings, found := bindings[port]
+		if !found {
+			t.Fatal("Binding does not exist")
+		}
+		if len(hostBindings) != 1 {
+			t.Fatalf("Expected 1 got %d", len(hostBindings))
+		}
+		first := hostBindings[0]
+		if first.HostPort != "" {
+			t.Errorf("Expected \"\" got %s", first.HostPort)
+		}
+		if first.HostIp != "192.168.1.100" {
+			t.Fail()
+		}
+	}
+}
+
+// TestParseNetworkOptsPublic checks that the spec "192.168.1.100:8080:80"
+// parses to one tcp port 80 with a single binding on host ip 192.168.1.100
+// and host port 8080.
+func TestParseNetworkOptsPublic(t *testing.T) {
+	ports, bindings, err := nat.ParsePortSpecs([]string{"192.168.1.100:8080:80"})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(ports) != 1 {
+		t.Fatalf("Expected 1 got %d", len(ports))
+	}
+	if len(bindings) != 1 {
+		t.Fatalf("Expected 1 got %d", len(bindings))
+	}
+	for port := range ports {
+		if port.Proto() != "tcp" {
+			t.Errorf("Expected tcp got %s", port.Proto())
+		}
+		if port.Port() != "80" {
+			t.Errorf("Expected 80 got %s", port.Port())
+		}
+		hostBindings, found := bindings[port]
+		if !found {
+			t.Fatal("Binding does not exist")
+		}
+		if len(hostBindings) != 1 {
+			t.Fatalf("Expected 1 got %d", len(hostBindings))
+		}
+		first := hostBindings[0]
+		if first.HostPort != "8080" {
+			t.Errorf("Expected 8080 got %s", first.HostPort)
+		}
+		if first.HostIp != "192.168.1.100" {
+			t.Fail()
+		}
+	}
+}
+
+// TestParseNetworkOptsUdp checks that the spec "192.168.1.100::6000/udp"
+// parses to one udp port 6000 with a single binding on host ip 192.168.1.100
+// and an empty host port.
+func TestParseNetworkOptsUdp(t *testing.T) {
+	ports, bindings, err := nat.ParsePortSpecs([]string{"192.168.1.100::6000/udp"})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(ports) != 1 {
+		t.Fatalf("Expected 1 got %d", len(ports))
+	}
+	if len(bindings) != 1 {
+		t.Fatalf("Expected 1 got %d", len(bindings))
+	}
+	for port := range ports {
+		if port.Proto() != "udp" {
+			t.Errorf("Expected udp got %s", port.Proto())
+		}
+		if port.Port() != "6000" {
+			t.Errorf("Expected 6000 got %s", port.Port())
+		}
+		hostBindings, found := bindings[port]
+		if !found {
+			t.Fatal("Binding does not exist")
+		}
+		if len(hostBindings) != 1 {
+			t.Fatalf("Expected 1 got %d", len(hostBindings))
+		}
+		first := hostBindings[0]
+		if first.HostPort != "" {
+			t.Errorf("Expected \"\" got %s", first.HostPort)
+		}
+		if first.HostIp != "192.168.1.100" {
+			t.Fail()
+		}
+	}
+}
+
+// TestGetFullName checks that GetFullContainerName turns "testing" into
+// "/testing" and returns an error for the empty name.
+func TestGetFullName(t *testing.T) {
+ name, err := GetFullContainerName("testing")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if name != "/testing" {
+ t.Fatalf("Expected /testing got %s", name)
+ }
+ // The empty name must be rejected.
+ if _, err := GetFullContainerName(""); err == nil {
+ t.Fatal("Error should not be nil")
+ }
+}
diff --git a/runtime/execdriver/MAINTAINERS b/runtime/execdriver/MAINTAINERS
new file mode 100644
index 0000000000..1cb551364d
--- /dev/null
+++ b/runtime/execdriver/MAINTAINERS
@@ -0,0 +1,2 @@
+Michael Crosby <michael@crosbymichael.com> (@crosbymichael)
+Guillaume J. Charmes <guillaume@docker.com> (@creack)
diff --git a/runtime/execdriver/driver.go b/runtime/execdriver/driver.go
new file mode 100644
index 0000000000..27a575cb3a
--- /dev/null
+++ b/runtime/execdriver/driver.go
@@ -0,0 +1,144 @@
+package execdriver
+
+import (
+ "errors"
+ "io"
+ "os"
+ "os/exec"
+)
+
+// Context is a generic set of string key/value pairs that allows
+// arbitrary data to be sent
+type Context map[string]string
+
+// Sentinel errors shared by the exec drivers and the init-function registry.
+var (
+ ErrNotRunning = errors.New("Process could not be started")
+ ErrWaitTimeoutReached = errors.New("Wait timeout reached")
+ // Returned by RegisterInitFunc when the name is already taken.
+ ErrDriverAlreadyRegistered = errors.New("A driver already registered this docker init function")
+ // Returned by GetInitFunc when no function is registered under the name.
+ ErrDriverNotFound = errors.New("The requested docker init has not been found")
+)
+
+// dockerInitFcts maps a driver name to its registered init function. It is
+// nil until the first RegisterInitFunc call allocates it.
+var dockerInitFcts map[string]InitFunc
+
+type (
+ // StartCallback is the callback type accepted by Driver.Run.
+ StartCallback func(*Command)
+ // InitFunc is a driver's init entry point, stored by RegisterInitFunc.
+ InitFunc func(i *InitArgs) error
+)
+
+// RegisterInitFunc stores fct as the init function for the driver called
+// name. It returns ErrDriverAlreadyRegistered, leaving the registry
+// untouched, when a function is already stored under that name.
+func RegisterInitFunc(name string, fct InitFunc) error {
+	// Looking up a key in a nil map is safe and reports "absent".
+	if _, taken := dockerInitFcts[name]; taken {
+		return ErrDriverAlreadyRegistered
+	}
+	if dockerInitFcts == nil {
+		dockerInitFcts = make(map[string]InitFunc)
+	}
+	dockerInitFcts[name] = fct
+	return nil
+}
+
+// GetInitFunc returns the init function registered under name, or
+// ErrDriverNotFound when there is none.
+func GetInitFunc(name string) (InitFunc, error) {
+	if registered, found := dockerInitFcts[name]; found {
+		return registered, nil
+	}
+	return nil, ErrDriverNotFound
+}
+
+// InitArgs holds the arguments provided to the init function for a driver.
+//
+// As consumed by the lxc init function: Ip is parsed as a CIDR address and
+// Gateway as a plain IP, Mtu is applied to eth0, WorkDir is chdir'ed into,
+// Privileged skips the capability drop, Env is overwritten from the
+// ".dockerenv" file, and Args[0] is resolved and exec'ed with Args.
+type InitArgs struct {
+ User string
+ Gateway string
+ Ip string
+ WorkDir string
+ Privileged bool
+ Env []string
+ Args []string
+ Mtu int
+ Driver string
+ Console string
+ Pipe int
+ Root string
+}
+
+// Info exposes driver specific information based on
+// processes registered with the driver
+type Info interface {
+ // IsRunning reports whether the driver considers the process running.
+ IsRunning() bool
+}
+
+// Terminal is an interface for drivers to implement
+// if they want to support Close and Resize calls from
+// the core
+type Terminal interface {
+ io.Closer
+ Resize(height, width int) error
+}
+
+// TtyTerminal is implemented by terminals that can hand out their master
+// file.
+type TtyTerminal interface {
+ Master() *os.File
+}
+
+// Driver is the interface every exec driver implements to run and control
+// container processes.
+type Driver interface {
+ Run(c *Command, pipes *Pipes, startCallback StartCallback) (int, error) // Run executes the process and blocks until the process exits and returns the exit code
+ Kill(c *Command, sig int) error
+ Name() string // Driver name
+ Info(id string) Info // "temporary" hack (until we move state from core to plugins)
+ GetPidsForContainer(id string) ([]int, error) // Returns a list of pids for the given container.
+ Terminate(c *Command) error // kill it with fire
+}
+
+// Network holds the network settings of the container.
+type Network struct {
+ Interface *NetworkInterface `json:"interface"` // if interface is nil then networking is disabled
+ Mtu int `json:"mtu"`
+}
+
+// NetworkInterface describes the container's network interface: its
+// gateway, address with prefix length, and the bridge it attaches to.
+type NetworkInterface struct {
+ Gateway string `json:"gateway"`
+ IPAddress string `json:"ip"`
+ Bridge string `json:"bridge"`
+ IPPrefixLen int `json:"ip_prefix_len"`
+}
+
+// Resources holds the memory, memory+swap and cpu-shares settings of a
+// command.
+// NOTE(review): units and the meaning of zero are not visible here — confirm
+// against the drivers that consume these fields.
+type Resources struct {
+ Memory int64 `json:"memory"`
+ MemorySwap int64 `json:"memory_swap"`
+ CpuShares int64 `json:"cpu_shares"`
+}
+
+// Mount describes one mount of a command: a source, a destination, and
+// whether it is writable and private.
+type Mount struct {
+ Source string `json:"source"`
+ Destination string `json:"destination"`
+ Writable bool `json:"writable"`
+ Private bool `json:"private"`
+}
+
+// Command wraps an os/exec.Cmd to add more metadata.
+// The embedded exec.Cmd, Terminal and Console are excluded from the JSON
+// encoding via their `json:"-"` tags.
+type Command struct {
+ exec.Cmd `json:"-"`
+
+ ID string `json:"id"`
+ Privileged bool `json:"privileged"`
+ User string `json:"user"`
+ Rootfs string `json:"rootfs"` // root fs of the container
+ InitPath string `json:"initpath"` // dockerinit
+ Entrypoint string `json:"entrypoint"`
+ Arguments []string `json:"arguments"`
+ WorkingDir string `json:"working_dir"`
+ ConfigPath string `json:"config_path"` // this should be able to be removed when the lxc template is moved into the driver
+ Tty bool `json:"tty"`
+ Network *Network `json:"network"`
+ Config map[string][]string `json:"config"` // generic values that specific drivers can consume
+ Resources *Resources `json:"resources"`
+ Mounts []Mount `json:"mounts"`
+
+ Terminal Terminal `json:"-"` // standard or tty terminal
+ Console string `json:"-"` // dev/console path
+ ContainerPid int `json:"container_pid"` // the pid for the process inside a container
+}
+
+// Pid returns c.ContainerPid as recorded on the command.
+// NOTE(review): an earlier comment promised -1 when the process is nil; this
+// method does not implement that — it returns whatever ContainerPid holds.
+func (c *Command) Pid() int {
+ return c.ContainerPid
+}
diff --git a/runtime/execdriver/execdrivers/execdrivers.go b/runtime/execdriver/execdrivers/execdrivers.go
new file mode 100644
index 0000000000..9e277c86df
--- /dev/null
+++ b/runtime/execdriver/execdrivers/execdrivers.go
@@ -0,0 +1,23 @@
+package execdrivers
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/pkg/sysinfo"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "github.com/dotcloud/docker/runtime/execdriver/lxc"
+ "github.com/dotcloud/docker/runtime/execdriver/native"
+ "path"
+)
+
+// NewDriver builds the exec driver selected by name: "lxc" or "native".
+// Any other name yields an "unknown exec driver" error.
+func NewDriver(name, root, initPath string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, error) {
+ switch name {
+ case "lxc":
+ // we want to give the lxc driver the full docker root because it needs
+ // to access and write config and template files in /var/lib/docker/containers/*
+ // to be backwards compatible
+ return lxc.NewDriver(root, sysInfo.AppArmor)
+ case "native":
+ // the native driver gets its own directory under <root>/execdriver/native
+ return native.NewDriver(path.Join(root, "execdriver", "native"), initPath)
+ }
+ return nil, fmt.Errorf("unknown exec driver %s", name)
+}
diff --git a/runtime/execdriver/lxc/driver.go b/runtime/execdriver/lxc/driver.go
new file mode 100644
index 0000000000..ef16dcc380
--- /dev/null
+++ b/runtime/execdriver/lxc/driver.go
@@ -0,0 +1,418 @@
+package lxc
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/pkg/cgroups"
+ "github.com/dotcloud/docker/pkg/label"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "github.com/dotcloud/docker/utils"
+ "io/ioutil"
+ "log"
+ "os"
+ "os/exec"
+ "path"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+)
+
+// DriverName is the name under which the lxc driver registers its init
+// function; it is also the prefix of the string returned by the driver's Name.
+const DriverName = "lxc"
+
+// init registers the lxc init function under DriverName.
+//
+// The registered function runs the setup steps in a fixed order
+// (environment, hostname, networking, capabilities, working directory, user)
+// and finally replaces the current process with args.Args via syscall.Exec.
+// NOTE(review): the error returned by RegisterInitFunc is discarded, and
+// args.Args[0] is indexed without a length check.
+func init() {
+ execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error {
+ if err := setupEnv(args); err != nil {
+ return err
+ }
+
+ if err := setupHostname(args); err != nil {
+ return err
+ }
+
+ if err := setupNetworking(args); err != nil {
+ return err
+ }
+
+ if err := setupCapabilities(args); err != nil {
+ return err
+ }
+
+ if err := setupWorkingDirectory(args); err != nil {
+ return err
+ }
+
+ if err := changeUser(args); err != nil {
+ return err
+ }
+
+ path, err := exec.LookPath(args.Args[0])
+ if err != nil {
+ // the binary could not be located: exit with status 127
+ log.Printf("Unable to locate %v", args.Args[0])
+ os.Exit(127)
+ }
+ // syscall.Exec only returns on failure
+ if err := syscall.Exec(path, args.Args, os.Environ()); err != nil {
+ return fmt.Errorf("dockerinit unable to execute %s - %s", path, err)
+ }
+ panic("Unreachable")
+ })
+}
+
+// driver is the lxc implementation of the exec driver.
+type driver struct {
+ root string // root path for the driver to use
+ apparmor bool // privileged containers use the lxc-start-unconfined symlink when set
+ sharedRoot bool // result of rootIsShared() taken when the driver was created
+}
+
+// NewDriver creates the lxc driver rooted at root. It first (re)creates the
+// "lxc-start-unconfined" symlink under root and fails if that is not
+// possible.
+func NewDriver(root string, apparmor bool) (*driver, error) {
+ // setup unconfined symlink
+ if err := linkLxcStart(root); err != nil {
+ return nil, err
+ }
+ return &driver{
+ apparmor: apparmor,
+ root: root,
+ sharedRoot: rootIsShared(),
+ }, nil
+}
+
+// Name returns the driver name in the form "<DriverName>-<version>", where
+// the version is whatever d.version() reports (possibly empty).
+func (d *driver) Name() string {
+	return fmt.Sprintf("%s-%s", DriverName, d.version())
+}
+
+// Run starts the container described by c through lxc-start and blocks until
+// the process exits.
+//
+// It sets up the terminal, generates the lxc config file, assembles the
+// lxc-start command line (which runs c.InitPath with the lxc driver flags
+// followed by the entrypoint and its arguments), starts it, polls lxc until
+// the container is running, stores the pid in c.ContainerPid, calls
+// startCallback if non-nil, and then waits for exit.
+//
+// It returns the process exit code, or -1 with an error when setup or start
+// fails. A Wait error that is only a non-zero exit status is not reported as
+// an error.
+func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) {
+ if err := execdriver.SetTerminal(c, pipes); err != nil {
+ return -1, err
+ }
+ configPath, err := d.generateLXCConfig(c)
+ if err != nil {
+ return -1, err
+ }
+ params := []string{
+ "lxc-start",
+ "-n", c.ID,
+ "-f", configPath,
+ "--",
+ c.InitPath,
+ "-driver",
+ DriverName,
+ }
+
+ // NOTE(review): c.Network is dereferenced without a nil check.
+ if c.Network.Interface != nil {
+ params = append(params,
+ "-g", c.Network.Interface.Gateway,
+ "-i", fmt.Sprintf("%s/%d", c.Network.Interface.IPAddress, c.Network.Interface.IPPrefixLen),
+ )
+ }
+ params = append(params,
+ "-mtu", strconv.Itoa(c.Network.Mtu),
+ )
+
+ if c.User != "" {
+ params = append(params, "-u", c.User)
+ }
+
+ if c.Privileged {
+ if d.apparmor {
+ // use the symlink created by linkLxcStart instead of plain lxc-start
+ params[0] = path.Join(d.root, "lxc-start-unconfined")
+
+ }
+ params = append(params, "-privileged")
+ }
+
+ if c.WorkingDir != "" {
+ params = append(params, "-w", c.WorkingDir)
+ }
+
+ params = append(params, "--", c.Entrypoint)
+ params = append(params, c.Arguments...)
+
+ if d.sharedRoot {
+ // lxc-start really needs / to be non-shared, or all kinds of stuff break
+ // when lxc-start unmount things and those unmounts propagate to the main
+ // mount namespace.
+ // What we really want is to clone into a new namespace and then
+ // mount / MS_REC|MS_SLAVE, but since we can't really clone or fork
+ // without exec in go we have to do this horrible shell hack...
+ shellString :=
+ "mount --make-rslave /; exec " +
+ utils.ShellQuoteArguments(params)
+
+ params = []string{
+ "unshare", "-m", "--", "/bin/sh", "-c", shellString,
+ }
+ }
+
+ var (
+ name = params[0]
+ arg = params[1:]
+ )
+ // Fall back to the bare name when it cannot be resolved in PATH.
+ aname, err := exec.LookPath(name)
+ if err != nil {
+ aname = name
+ }
+ c.Path = aname
+ c.Args = append([]string{name}, arg...)
+
+ if err := c.Start(); err != nil {
+ return -1, err
+ }
+
+ var (
+ waitErr error
+ waitLock = make(chan struct{})
+ )
+ // waitLock is closed once c.Wait returns; waitErr is written before the
+ // close and only read after receiving from waitLock.
+ go func() {
+ if err := c.Wait(); err != nil {
+ if _, ok := err.(*exec.ExitError); !ok { // Do not propagate the error if it's simply a status code != 0
+ waitErr = err
+ }
+ }
+ close(waitLock)
+ }()
+
+ // Poll lxc for RUNNING status
+ pid, err := d.waitForStart(c, waitLock)
+ if err != nil {
+ if c.Process != nil {
+ c.Process.Kill()
+ }
+ return -1, err
+ }
+ c.ContainerPid = pid
+
+ if startCallback != nil {
+ startCallback(c)
+ }
+
+ <-waitLock
+
+ return getExitCode(c), waitErr
+}
+
+// getExitCode returns the exit code of the process.
+// If the process has not exited (c.ProcessState is nil) -1 will be returned.
+func getExitCode(c *execdriver.Command) int {
+ if c.ProcessState == nil {
+ return -1
+ }
+ return c.ProcessState.Sys().(syscall.WaitStatus).ExitStatus()
+}
+
+// Kill sends signal sig to the lxc container named c.ID via KillLxc.
+func (d *driver) Kill(c *execdriver.Command, sig int) error {
+ return KillLxc(c.ID, sig)
+}
+
+// Terminate sends signal 9 to the lxc container named c.ID via KillLxc.
+func (d *driver) Terminate(c *execdriver.Command) error {
+ return KillLxc(c.ID, 9)
+}
+
+// version returns the lxc version string, or "" when the version command
+// fails. It runs "lxc-version" when that binary is found in PATH and
+// "lxc-start --version" otherwise. When the output contains a colon, only
+// the trimmed text after the first colon is returned.
+func (d *driver) version() string {
+ var (
+ version string
+ output []byte
+ err error
+ )
+ if _, errPath := exec.LookPath("lxc-version"); errPath == nil {
+ output, err = exec.Command("lxc-version").CombinedOutput()
+ } else {
+ output, err = exec.Command("lxc-start", "--version").CombinedOutput()
+ }
+ if err == nil {
+ version = strings.TrimSpace(string(output))
+ if parts := strings.SplitN(version, ":", 2); len(parts) == 2 {
+ version = strings.TrimSpace(parts[1])
+ }
+ }
+ return version
+}
+
+// KillLxc sends signal sig to the lxc container named id. It uses "lxc-kill"
+// when that binary is found in PATH and "lxc-stop -k" otherwise. On failure
+// the returned error carries both the command error and its combined output.
+func KillLxc(id string, sig int) error {
+ var (
+ err error
+ output []byte
+ )
+ _, err = exec.LookPath("lxc-kill")
+ if err == nil {
+ output, err = exec.Command("lxc-kill", "-n", id, strconv.Itoa(sig)).CombinedOutput()
+ } else {
+ // NOTE(review): the signal number is also passed to lxc-stop as a
+ // trailing argument — confirm lxc-stop accepts it.
+ output, err = exec.Command("lxc-stop", "-k", "-n", id, strconv.Itoa(sig)).CombinedOutput()
+ }
+ if err != nil {
+ return fmt.Errorf("Err: %s Output: %s", err, output)
+ }
+ return nil
+}
+
+// waitForStart waits for the process to start and returns the pid for the
+// process.
+//
+// It polls lxc-info every 50ms for up to 5 seconds. It returns (-1, nil) if
+// waitLock is closed while polling (the process already exited), and
+// (-1, execdriver.ErrNotRunning) when the container never reports RUNNING.
+func (d *driver) waitForStart(c *execdriver.Command, waitLock chan struct{}) (int, error) {
+ var (
+ err error
+ output []byte
+ )
+ // We wait for the container to be fully running.
+ // Timeout after 5 seconds. In case of broken pipe, just retry.
+ // Note: The container can run and finish correctly before
+ // the end of this loop
+ for now := time.Now(); time.Since(now) < 5*time.Second; {
+ select {
+ case <-waitLock:
+ // If the process dies while waiting for it, just return
+ return -1, nil
+ default:
+ }
+
+ // A failed lxc-info call is retried exactly once before giving up.
+ output, err = d.getInfo(c.ID)
+ if err != nil {
+ output, err = d.getInfo(c.ID)
+ if err != nil {
+ return -1, err
+ }
+ }
+ info, err := parseLxcInfo(string(output))
+ if err != nil {
+ return -1, err
+ }
+ if info.Running {
+ return info.Pid, nil
+ }
+ time.Sleep(50 * time.Millisecond)
+ }
+ return -1, execdriver.ErrNotRunning
+}
+
+// getInfo runs "lxc-info -n <id>" and returns its combined stdout and stderr.
+func (d *driver) getInfo(id string) ([]byte, error) {
+ return exec.Command("lxc-info", "-n", id).CombinedOutput()
+}
+
+// info is the execdriver.Info implementation of the lxc driver; it queries
+// lxc-info for the container named ID through driver.
+type info struct {
+ ID string
+ driver *driver
+}
+
+func (i *info) IsRunning() bool {
+ var running bool
+
+ output, err := i.driver.getInfo(i.ID)
+ if err != nil {
+ utils.Errorf("Error getting info for lxc container %s: %s (%s)", i.ID, err, output)
+ return false
+ }
+ if strings.Contains(string(output), "RUNNING") {
+ running = true
+ }
+ return running
+}
+
+// Info returns an execdriver.Info bound to this driver and the container id.
+// No lxc command is run until IsRunning is called on the result.
+func (d *driver) Info(id string) execdriver.Info {
+ return &info{
+ ID: id,
+ driver: d,
+ }
+}
+
+// GetPidsForContainer returns the pids listed in the "tasks" file of the
+// container's cpu cgroup.
+//
+// The file is looked up at <cgroup root>/<this cgroup dir>/<id>/tasks and,
+// when that path does not exist, at .../lxc/<id>/tasks. On error the pids
+// collected so far are returned together with the error.
+func (d *driver) GetPidsForContainer(id string) ([]int, error) {
+ pids := []int{}
+
+ // cpu is chosen because it is the only non optional subsystem in cgroups
+ subsystem := "cpu"
+ cgroupRoot, err := cgroups.FindCgroupMountpoint(subsystem)
+ if err != nil {
+ return pids, err
+ }
+
+ cgroupDir, err := cgroups.GetThisCgroupDir(subsystem)
+ if err != nil {
+ return pids, err
+ }
+
+ filename := filepath.Join(cgroupRoot, cgroupDir, id, "tasks")
+ if _, err := os.Stat(filename); os.IsNotExist(err) {
+ // With more recent lxc versions use, cgroup will be in lxc/
+ filename = filepath.Join(cgroupRoot, cgroupDir, "lxc", id, "tasks")
+ }
+
+ output, err := ioutil.ReadFile(filename)
+ if err != nil {
+ return pids, err
+ }
+ // One pid per line; empty lines are skipped.
+ for _, p := range strings.Split(string(output), "\n") {
+ if len(p) == 0 {
+ continue
+ }
+ pid, err := strconv.Atoi(p)
+ if err != nil {
+ return pids, fmt.Errorf("Invalid pid '%s': %s", p, err)
+ }
+ pids = append(pids, pid)
+ }
+ return pids, nil
+}
+
+// linkLxcStart creates the symlink <root>/lxc-start-unconfined pointing at
+// the lxc-start binary found in PATH. An existing entry at that location is
+// removed first. It fails when lxc-start cannot be found.
+func linkLxcStart(root string) error {
+ sourcePath, err := exec.LookPath("lxc-start")
+ if err != nil {
+ return err
+ }
+ targetPath := path.Join(root, "lxc-start-unconfined")
+
+ // Lstat so that an existing (possibly dangling) symlink is detected.
+ if _, err := os.Lstat(targetPath); err != nil && !os.IsNotExist(err) {
+ return err
+ } else if err == nil {
+ if err := os.Remove(targetPath); err != nil {
+ return err
+ }
+ }
+ return os.Symlink(sourcePath, targetPath)
+}
+
+// rootIsShared reports whether the mount whose mount point is "/" has a
+// seventh /proc/self/mountinfo column starting with "shared".
+//
+// It returns true when the file cannot be read or when no line for "/" is
+// found, since assuming a shared root is the safe default for callers.
+//
+// TODO: This can be moved to the mountinfo reader in the mount pkg
+func rootIsShared() bool {
+	data, err := ioutil.ReadFile("/proc/self/mountinfo")
+	if err != nil {
+		// No idea, probably safe to assume so
+		return true
+	}
+	for _, line := range strings.Split(string(data), "\n") {
+		cols := strings.Split(line, " ")
+		// cols[6] is read below, so at least seven columns are required;
+		// the previous ">= 6" guard could index out of range.
+		if len(cols) >= 7 && cols[4] == "/" {
+			return strings.HasPrefix(cols[6], "shared")
+		}
+	}
+
+	// No idea, probably safe to assume so
+	return true
+}
+
+// generateLXCConfig renders LxcTemplateCompiled for c into
+// <d.root>/containers/<c.ID>/config.lxc and returns that path.
+//
+// When c.Config["label"] is non-empty its first entry is turned into process
+// and mount labels via label.GenLabels and passed to the template.
+// NOTE(review): the file is created before labels are generated, so a later
+// failure leaves an empty or partial config file behind.
+func (d *driver) generateLXCConfig(c *execdriver.Command) (string, error) {
+ var (
+ process, mount string
+ root = path.Join(d.root, "containers", c.ID, "config.lxc")
+ labels = c.Config["label"]
+ )
+ fo, err := os.Create(root)
+ if err != nil {
+ return "", err
+ }
+ defer fo.Close()
+
+ if len(labels) > 0 {
+ process, mount, err = label.GenLabels(labels[0])
+ if err != nil {
+ return "", err
+ }
+ }
+
+ // The template sees the command's fields plus the three extra values.
+ if err := LxcTemplateCompiled.Execute(fo, struct {
+ *execdriver.Command
+ AppArmor bool
+ ProcessLabel string
+ MountLabel string
+ }{
+ Command: c,
+ AppArmor: d.apparmor,
+ ProcessLabel: process,
+ MountLabel: mount,
+ }); err != nil {
+ return "", err
+ }
+ return root, nil
+}
diff --git a/runtime/execdriver/lxc/info.go b/runtime/execdriver/lxc/info.go
new file mode 100644
index 0000000000..27b4c58604
--- /dev/null
+++ b/runtime/execdriver/lxc/info.go
@@ -0,0 +1,50 @@
+package lxc
+
+import (
+ "bufio"
+ "errors"
+ "strconv"
+ "strings"
+)
+
+var (
+	// ErrCannotParse is returned by parseLxcInfo for empty input.
+	ErrCannotParse = errors.New("cannot parse raw input")
+)
+
+// lxcInfo is the subset of lxc-info output the driver cares about.
+type lxcInfo struct {
+	Running bool // true when the "state" line reads RUNNING
+	Pid     int  // value of the "pid" line
+}
+
+// parseLxcInfo parses the "key: value" lines of raw, the output of lxc-info.
+//
+// Keys are matched case-insensitively after trimming. The "state" key sets
+// Running (true only for the exact value "RUNNING") and the "pid" key sets
+// Pid. Lines without a colon and unknown keys are ignored.
+//
+// It returns ErrCannotParse for empty input, the strconv error for a
+// non-numeric pid, and the scanner error when the input cannot be scanned
+// (for example a line longer than the scanner's token limit).
+func parseLxcInfo(raw string) (*lxcInfo, error) {
+	if raw == "" {
+		return nil, ErrCannotParse
+	}
+	var (
+		err  error
+		s    = bufio.NewScanner(strings.NewReader(raw))
+		info = &lxcInfo{}
+	)
+	for s.Scan() {
+		parts := strings.Split(s.Text(), ":")
+		if len(parts) < 2 {
+			continue
+		}
+		switch strings.ToLower(strings.TrimSpace(parts[0])) {
+		case "state":
+			info.Running = strings.TrimSpace(parts[1]) == "RUNNING"
+		case "pid":
+			info.Pid, err = strconv.Atoi(strings.TrimSpace(parts[1]))
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+	// Scan reports failures only by returning false, so the error has to be
+	// checked after the loop; inside the loop it is always nil.
+	if err := s.Err(); err != nil {
+		return nil, err
+	}
+	return info, nil
+}
diff --git a/runtime/execdriver/lxc/info_test.go b/runtime/execdriver/lxc/info_test.go
new file mode 100644
index 0000000000..edafc02511
--- /dev/null
+++ b/runtime/execdriver/lxc/info_test.go
@@ -0,0 +1,36 @@
+package lxc
+
+import (
+ "testing"
+)
+
+// TestParseRunningInfo checks that parseLxcInfo reads a RUNNING state and
+// pid 50 from indented "key: value" lines preceded by an empty line.
+func TestParseRunningInfo(t *testing.T) {
+ raw := `
+ state: RUNNING
+ pid: 50`
+
+ info, err := parseLxcInfo(raw)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !info.Running {
+ t.Fatal("info should return a running state")
+ }
+ if info.Pid != 50 {
+ t.Fatalf("info should have pid 50 got %d", info.Pid)
+ }
+}
+
+// TestEmptyInfo checks that parseLxcInfo rejects empty input with an error.
+func TestEmptyInfo(t *testing.T) {
+ _, err := parseLxcInfo("")
+ if err == nil {
+ t.Fatal("error should not be nil")
+ }
+}
+
+// TestBadInfo checks that a line without a colon ("state") is ignored rather
+// than reported as an error.
+func TestBadInfo(t *testing.T) {
+ _, err := parseLxcInfo("state")
+ if err != nil {
+ t.Fatal(err)
+ }
+}
diff --git a/runtime/execdriver/lxc/init.go b/runtime/execdriver/lxc/init.go
new file mode 100644
index 0000000000..c1933a5e43
--- /dev/null
+++ b/runtime/execdriver/lxc/init.go
@@ -0,0 +1,175 @@
+package lxc
+
+import (
+ "encoding/json"
+ "fmt"
+ "github.com/dotcloud/docker/pkg/netlink"
+ "github.com/dotcloud/docker/pkg/user"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "github.com/syndtr/gocapability/capability"
+ "io/ioutil"
+ "net"
+ "os"
+ "strings"
+ "syscall"
+)
+
+// setupEnv clears environment pollution introduced by lxc-start.
+//
+// It reads a JSON array of "KEY=VALUE" strings from the file ".dockerenv"
+// (relative to the current working directory), appends the current value of
+// the "container" variable, stores the result in args.Env, then clears the
+// process environment and re-populates it from args.Env.
+func setupEnv(args *execdriver.InitArgs) error {
+ // Get env
+ var env []string
+ content, err := ioutil.ReadFile(".dockerenv")
+ if err != nil {
+ return fmt.Errorf("Unable to load environment variables: %v", err)
+ }
+ if err := json.Unmarshal(content, &env); err != nil {
+ return fmt.Errorf("Unable to unmarshal environment variables: %v", err)
+ }
+ // Propagate the plugin-specific container env variable
+ env = append(env, "container="+os.Getenv("container"))
+
+ args.Env = env
+
+ os.Clearenv()
+ for _, kv := range args.Env {
+ parts := strings.SplitN(kv, "=", 2)
+ // an entry without "=" is set with an empty value
+ if len(parts) == 1 {
+ parts = append(parts, "")
+ }
+ os.Setenv(parts[0], parts[1])
+ }
+
+ return nil
+}
+
+// setupHostname applies the HOSTNAME entry of args.Env as the hostname.
+// It is a no-op when that entry is absent or empty.
+func setupHostname(args *execdriver.InitArgs) error {
+	if hostname := getEnv(args, "HOSTNAME"); hostname != "" {
+		return setHostname(hostname)
+	}
+	return nil
+}
+
+// setupNetworking configures the network from args.
+//
+// When args.Ip is set (CIDR notation) it assigns that address to eth0, sets
+// the MTU from args.Mtu, brings eth0 up and then brings the loopback
+// interface up. When args.Gateway is set it is installed as the default
+// gateway. Both steps are skipped when their argument is empty.
+func setupNetworking(args *execdriver.InitArgs) error {
+ if args.Ip != "" {
+ // eth0
+ iface, err := net.InterfaceByName("eth0")
+ if err != nil {
+ return fmt.Errorf("Unable to set up networking: %v", err)
+ }
+ ip, ipNet, err := net.ParseCIDR(args.Ip)
+ if err != nil {
+ return fmt.Errorf("Unable to set up networking: %v", err)
+ }
+ if err := netlink.NetworkLinkAddIp(iface, ip, ipNet); err != nil {
+ return fmt.Errorf("Unable to set up networking: %v", err)
+ }
+ if err := netlink.NetworkSetMTU(iface, args.Mtu); err != nil {
+ return fmt.Errorf("Unable to set MTU: %v", err)
+ }
+ if err := netlink.NetworkLinkUp(iface); err != nil {
+ return fmt.Errorf("Unable to set up networking: %v", err)
+ }
+
+ // loopback
+ iface, err = net.InterfaceByName("lo")
+ if err != nil {
+ return fmt.Errorf("Unable to set up networking: %v", err)
+ }
+ if err := netlink.NetworkLinkUp(iface); err != nil {
+ return fmt.Errorf("Unable to set up networking: %v", err)
+ }
+ }
+ if args.Gateway != "" {
+ // net.ParseIP returns nil for text that is not a valid IP address
+ gw := net.ParseIP(args.Gateway)
+ if gw == nil {
+ return fmt.Errorf("Unable to set up networking, %s is not a valid gateway IP", args.Gateway)
+ }
+
+ if err := netlink.AddDefaultGw(gw); err != nil {
+ return fmt.Errorf("Unable to set up networking: %v", err)
+ }
+ }
+
+ return nil
+}
+
+// setupWorkingDirectory changes the current directory to args.WorkDir.
+// An empty WorkDir leaves the current directory untouched.
+func setupWorkingDirectory(args *execdriver.InitArgs) error {
+	if dir := args.WorkDir; dir != "" {
+		if chdirErr := syscall.Chdir(dir); chdirErr != nil {
+			return fmt.Errorf("Unable to change dir to %v: %v", dir, chdirErr)
+		}
+	}
+	return nil
+}
+
+// changeUser takes care of dropping privileges to the desired user.
+//
+// It resolves args.User to a uid, gid and supplementary groups (using the
+// current uid/gid as defaults passed to the lookup) and then applies them in
+// the order Setgroups, Setgid, Setuid.
+func changeUser(args *execdriver.InitArgs) error {
+ uid, gid, suppGids, err := user.GetUserGroupSupplementary(
+ args.User,
+ syscall.Getuid(), syscall.Getgid(),
+ )
+ if err != nil {
+ return err
+ }
+
+ // Groups and gid are changed before the uid is dropped.
+ if err := syscall.Setgroups(suppGids); err != nil {
+ return fmt.Errorf("Setgroups failed: %v", err)
+ }
+ if err := syscall.Setgid(gid); err != nil {
+ return fmt.Errorf("Setgid failed: %v", err)
+ }
+ if err := syscall.Setuid(uid); err != nil {
+ return fmt.Errorf("Setuid failed: %v", err)
+ }
+
+ return nil
+}
+
+// setupCapabilities drops a fixed list of capabilities from the current
+// process, in both the capability sets and the bounding set. It does nothing
+// when args.Privileged is true.
+func setupCapabilities(args *execdriver.InitArgs) error {
+ if args.Privileged {
+ return nil
+ }
+
+ // capabilities removed from non-privileged containers
+ drop := []capability.Cap{
+ capability.CAP_SETPCAP,
+ capability.CAP_SYS_MODULE,
+ capability.CAP_SYS_RAWIO,
+ capability.CAP_SYS_PACCT,
+ capability.CAP_SYS_ADMIN,
+ capability.CAP_SYS_NICE,
+ capability.CAP_SYS_RESOURCE,
+ capability.CAP_SYS_TIME,
+ capability.CAP_SYS_TTY_CONFIG,
+ capability.CAP_AUDIT_WRITE,
+ capability.CAP_AUDIT_CONTROL,
+ capability.CAP_MAC_OVERRIDE,
+ capability.CAP_MAC_ADMIN,
+ capability.CAP_NET_ADMIN,
+ }
+
+ c, err := capability.NewPid(os.Getpid())
+ if err != nil {
+ return err
+ }
+
+ c.Unset(capability.CAPS|capability.BOUNDS, drop...)
+
+ if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil {
+ return err
+ }
+ return nil
+}
+
+// getEnv returns the value of the first "key=value" entry in args.Env whose
+// key equals key. Entries without "=" never match. It returns "" when no
+// entry matches.
+func getEnv(args *execdriver.InitArgs, key string) string {
+	for _, entry := range args.Env {
+		pair := strings.SplitN(entry, "=", 2)
+		if len(pair) != 2 || pair[0] != key {
+			continue
+		}
+		return pair[1]
+	}
+	return ""
+}
diff --git a/runtime/execdriver/lxc/lxc_init_linux.go b/runtime/execdriver/lxc/lxc_init_linux.go
new file mode 100644
index 0000000000..7288f5877b
--- /dev/null
+++ b/runtime/execdriver/lxc/lxc_init_linux.go
@@ -0,0 +1,11 @@
+// +build amd64
+
+package lxc
+
+import (
+ "syscall"
+)
+
+// setHostname sets the hostname through the sethostname system call.
+func setHostname(hostname string) error {
+ return syscall.Sethostname([]byte(hostname))
+}
diff --git a/runtime/execdriver/lxc/lxc_init_unsupported.go b/runtime/execdriver/lxc/lxc_init_unsupported.go
new file mode 100644
index 0000000000..d68cb91a1e
--- /dev/null
+++ b/runtime/execdriver/lxc/lxc_init_unsupported.go
@@ -0,0 +1,7 @@
+// +build !linux !amd64
+
+package lxc
+
+// setHostname is the stub for unsupported platforms; it always panics.
+// NOTE(review): the message names darwin, but this file's build constraint
+// is "!linux !amd64", which covers more than darwin.
+func setHostname(hostname string) error {
+ panic("Not supported on darwin")
+}
diff --git a/runtime/execdriver/lxc/lxc_template.go b/runtime/execdriver/lxc/lxc_template.go
new file mode 100644
index 0000000000..c49753c6aa
--- /dev/null
+++ b/runtime/execdriver/lxc/lxc_template.go
@@ -0,0 +1,176 @@
+package lxc
+
+import (
+ "github.com/dotcloud/docker/pkg/label"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "strings"
+ "text/template"
+)
+
+// LxcTemplate is the text/template source for a container's lxc configuration.
+// It reads .Network, .Rootfs, .ProcessLabel, .MountLabel, .Privileged, .Tty,
+// .Console, .Mounts, .AppArmor, .Resources and .Config from the value it is
+// executed with, and calls the template functions escapeFstabSpaces,
+// formatMountLabel and getMemorySwap.
+const LxcTemplate = `
+{{if .Network.Interface}}
+# network configuration
+lxc.network.type = veth
+lxc.network.link = {{.Network.Interface.Bridge}}
+lxc.network.name = eth0
+{{else}}
+# network is disabled (-n=false)
+lxc.network.type = empty
+lxc.network.flags = up
+{{end}}
+lxc.network.mtu = {{.Network.Mtu}}
+
+# root filesystem
+{{$ROOTFS := .Rootfs}}
+lxc.rootfs = {{$ROOTFS}}
+
+# use a dedicated pts for the container (and limit the number of pseudo terminal
+# available)
+lxc.pts = 1024
+
+# disable the main console
+lxc.console = none
+{{if .ProcessLabel}}
+lxc.se_context = {{ .ProcessLabel}}
+{{end}}
+{{$MOUNTLABEL := .MountLabel}}
+
+# no controlling tty at all
+lxc.tty = 1
+
+{{if .Privileged}}
+lxc.cgroup.devices.allow = a
+{{else}}
+# no implicit access to devices
+lxc.cgroup.devices.deny = a
+
+# but allow mknod for any device
+lxc.cgroup.devices.allow = c *:* m
+lxc.cgroup.devices.allow = b *:* m
+
+# /dev/null and zero
+lxc.cgroup.devices.allow = c 1:3 rwm
+lxc.cgroup.devices.allow = c 1:5 rwm
+
+# consoles
+lxc.cgroup.devices.allow = c 5:1 rwm
+lxc.cgroup.devices.allow = c 5:0 rwm
+lxc.cgroup.devices.allow = c 4:0 rwm
+lxc.cgroup.devices.allow = c 4:1 rwm
+
+# /dev/urandom,/dev/random
+lxc.cgroup.devices.allow = c 1:9 rwm
+lxc.cgroup.devices.allow = c 1:8 rwm
+
+# /dev/pts/ - pts namespaces are "coming soon"
+lxc.cgroup.devices.allow = c 136:* rwm
+lxc.cgroup.devices.allow = c 5:2 rwm
+
+# tuntap
+lxc.cgroup.devices.allow = c 10:200 rwm
+
+# fuse
+#lxc.cgroup.devices.allow = c 10:229 rwm
+
+# rtc
+#lxc.cgroup.devices.allow = c 254:0 rwm
+{{end}}
+
+# standard mount point
+# Use mnt.putold as per https://bugs.launchpad.net/ubuntu/+source/lxc/+bug/986385
+lxc.pivotdir = lxc_putold
+
+# NOTICE: These mounts must be applied within the namespace
+
+# WARNING: procfs is a known attack vector and should probably be disabled
+# if your userspace allows it. eg. see http://blog.zx2c4.com/749
+lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noexec 0 0
+
+# WARNING: sysfs is a known attack vector and should probably be disabled
+# if your userspace allows it. eg. see http://bit.ly/T9CkqJ
+lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0
+
+{{if .Tty}}
+lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0
+{{end}}
+
+lxc.mount.entry = devpts {{escapeFstabSpaces $ROOTFS}}/dev/pts devpts {{formatMountLabel "newinstance,ptmxmode=0666,nosuid,noexec" $MOUNTLABEL}} 0 0
+lxc.mount.entry = shm {{escapeFstabSpaces $ROOTFS}}/dev/shm tmpfs {{formatMountLabel "size=65536k,nosuid,nodev,noexec" $MOUNTLABEL}} 0 0
+
+{{range $value := .Mounts}}
+{{if $value.Writable}}
+lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,rw 0 0
+{{else}}
+lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,ro 0 0
+{{end}}
+{{end}}
+
+{{if .Privileged}}
+{{if .AppArmor}}
+lxc.aa_profile = unconfined
+{{else}}
+#lxc.aa_profile = unconfined
+{{end}}
+{{end}}
+
+# limits
+{{if .Resources}}
+{{if .Resources.Memory}}
+lxc.cgroup.memory.limit_in_bytes = {{.Resources.Memory}}
+lxc.cgroup.memory.soft_limit_in_bytes = {{.Resources.Memory}}
+{{with $memSwap := getMemorySwap .Resources}}
+lxc.cgroup.memory.memsw.limit_in_bytes = {{$memSwap}}
+{{end}}
+{{end}}
+{{if .Resources.CpuShares}}
+lxc.cgroup.cpu.shares = {{.Resources.CpuShares}}
+{{end}}
+{{end}}
+
+{{if .Config.lxc}}
+{{range $value := .Config.lxc}}
+lxc.{{$value}}
+{{end}}
+{{end}}
+`
+
+// LxcTemplateCompiled holds the parsed form of LxcTemplate; it is assigned
+// once by this package's init function.
+var LxcTemplateCompiled *template.Template
+
+// Escape spaces in strings according to the fstab documentation, which is the
+// format for "lxc.mount.entry" lines in lxc.conf. See also "man 5 fstab".
+func escapeFstabSpaces(field string) string {
+ return strings.Replace(field, " ", "\\040", -1)
+}
+
+// getMemorySwap computes the value rendered for the memory+swap limit.
+// A negative v.MemorySwap yields 0; any other value yields twice v.Memory.
+func getMemorySwap(v *execdriver.Resources) int64 {
+	if v.MemorySwap >= 0 {
+		// Default: swap limit is double the memory limit.
+		return 2 * v.Memory
+	}
+	// A negative MemorySwap (e.g. -1) asks for the setting to be omitted.
+	return 0
+}
+
+// getLabel searches the "label" entries of c for one of the form "name=value"
+// and returns the value of the first entry whose key equals name. Keys and
+// values are compared and returned with surrounding whitespace trimmed.
+// Entries without an "=" separator are ignored. The empty string is returned
+// when no entry matches.
+func getLabel(c map[string][]string, name string) string {
+	// Named "labels" so the imported label package is not shadowed.
+	labels := c["label"]
+	for _, l := range labels {
+		parts := strings.SplitN(l, "=", 2)
+		if len(parts) != 2 {
+			// Malformed entry: there is no value to return.
+			continue
+		}
+		if strings.TrimSpace(parts[0]) == name {
+			return strings.TrimSpace(parts[1])
+		}
+	}
+	return ""
+}
+
+func init() {
+ var err error
+ funcMap := template.FuncMap{
+ "getMemorySwap": getMemorySwap,
+ "escapeFstabSpaces": escapeFstabSpaces,
+ "formatMountLabel": label.FormatMountLabel,
+ }
+ LxcTemplateCompiled, err = template.New("lxc").Funcs(funcMap).Parse(LxcTemplate)
+ if err != nil {
+ panic(err)
+ }
+}
diff --git a/runtime/execdriver/lxc/lxc_template_unit_test.go b/runtime/execdriver/lxc/lxc_template_unit_test.go
new file mode 100644
index 0000000000..7f473a0502
--- /dev/null
+++ b/runtime/execdriver/lxc/lxc_template_unit_test.go
@@ -0,0 +1,135 @@
+package lxc
+
+import (
+ "bufio"
+ "fmt"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "io/ioutil"
+ "math/rand"
+ "os"
+ "path"
+ "strings"
+ "testing"
+ "time"
+)
+
+// TestLXCConfig generates an lxc config for a command with randomly chosen
+// memory and cpu share values and checks that the memory limit and the
+// memory+swap limit (twice the memory) appear in the generated file.
+func TestLXCConfig(t *testing.T) {
+ root, err := ioutil.TempDir("", "TestLXCConfig")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(root)
+
+ os.MkdirAll(path.Join(root, "containers", "1"), 0777)
+
+ // Memory is allocated randomly for testing
+ rand.Seed(time.Now().UTC().UnixNano())
+ var (
+ memMin = 33554432
+ memMax = 536870912
+ mem = memMin + rand.Intn(memMax-memMin)
+ cpuMin = 100
+ cpuMax = 10000
+ cpu = cpuMin + rand.Intn(cpuMax-cpuMin)
+ )
+
+ driver, err := NewDriver(root, false)
+ if err != nil {
+ t.Fatal(err)
+ }
+ command := &execdriver.Command{
+ ID: "1",
+ Resources: &execdriver.Resources{
+ Memory: int64(mem),
+ CpuShares: int64(cpu),
+ },
+ Network: &execdriver.Network{
+ Mtu: 1500,
+ Interface: nil,
+ },
+ }
+ p, err := driver.generateLXCConfig(command)
+ if err != nil {
+ t.Fatal(err)
+ }
+ grepFile(t, p,
+ fmt.Sprintf("lxc.cgroup.memory.limit_in_bytes = %d", mem))
+
+ grepFile(t, p,
+ fmt.Sprintf("lxc.cgroup.memory.memsw.limit_in_bytes = %d", mem*2))
+}
+
+// TestCustomLxcConfig checks that the entries supplied under the "lxc" key of
+// Command.Config show up in the generated lxc config file.
+func TestCustomLxcConfig(t *testing.T) {
+ root, err := ioutil.TempDir("", "TestCustomLxcConfig")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(root)
+
+ os.MkdirAll(path.Join(root, "containers", "1"), 0777)
+
+ driver, err := NewDriver(root, false)
+ if err != nil {
+ t.Fatal(err)
+ }
+ command := &execdriver.Command{
+ ID: "1",
+ Privileged: false,
+ Config: map[string][]string{
+ "lxc": {
+ "lxc.utsname = docker",
+ "lxc.cgroup.cpuset.cpus = 0,1",
+ },
+ },
+ Network: &execdriver.Network{
+ Mtu: 1500,
+ Interface: nil,
+ },
+ }
+
+ p, err := driver.generateLXCConfig(command)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ grepFile(t, p, "lxc.utsname = docker")
+ grepFile(t, p, "lxc.cgroup.cpuset.cpus = 0,1")
+}
+
+func grepFile(t *testing.T, path string, pattern string) {
+ f, err := os.Open(path)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer f.Close()
+ r := bufio.NewReader(f)
+ var (
+ line string
+ )
+ err = nil
+ for err == nil {
+ line, err = r.ReadString('\n')
+ if strings.Contains(line, pattern) == true {
+ return
+ }
+ }
+ t.Fatalf("grepFile: pattern \"%s\" not found in \"%s\"", pattern, path)
+}
+
+func TestEscapeFstabSpaces(t *testing.T) {
+ var testInputs = map[string]string{
+ " ": "\\040",
+ "": "",
+ "/double space": "/double\\040\\040space",
+ "/some long test string": "/some\\040long\\040test\\040string",
+ "/var/lib/docker": "/var/lib/docker",
+ " leading": "\\040leading",
+ "trailing ": "trailing\\040",
+ }
+ for in, exp := range testInputs {
+ if out := escapeFstabSpaces(in); exp != out {
+ t.Logf("Expected %s got %s", exp, out)
+ t.Fail()
+ }
+ }
+}
diff --git a/runtime/execdriver/native/configuration/parse.go b/runtime/execdriver/native/configuration/parse.go
new file mode 100644
index 0000000000..6d6c643919
--- /dev/null
+++ b/runtime/execdriver/native/configuration/parse.go
@@ -0,0 +1,186 @@
+package configuration
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/pkg/libcontainer"
+ "github.com/dotcloud/docker/utils"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+ "strings"
+)
+
+// Action applies a single user supplied option value to a container
+// configuration. It receives the container, an opaque context value and the
+// option's string value, and returns an error if the value cannot be applied.
+type Action func(*libcontainer.Container, interface{}, string) error
+
+// actions maps each option key accepted by ParseConfiguration to the Action
+// that handles it.
+var actions = map[string]Action{
+ "cap.add": addCap, // add a cap
+ "cap.drop": dropCap, // drop a cap
+
+ "ns.add": addNamespace, // add a namespace
+ "ns.drop": dropNamespace, // drop a namespace when cloning
+
+ "net.join": joinNetNamespace, // join another containers net namespace
+
+ "cgroups.cpu_shares": cpuShares, // set the cpu shares
+ "cgroups.memory": memory, // set the memory limit
+ "cgroups.memory_swap": memorySwap, // set the memory swap limit
+ "cgroups.cpuset.cpus": cpusetCpus, // set the cpus used
+
+ "apparmor_profile": apparmorProfile, // set the apparmor profile to apply
+
+ "fs.readonly": readonlyFs, // make the rootfs of the container read only
+}
+
+// cpusetCpus stores value as the container's cpuset cpus setting. It returns
+// an error when the container has no cgroups configuration.
+func cpusetCpus(container *libcontainer.Container, context interface{}, value string) error {
+	if cg := container.Cgroups; cg != nil {
+		cg.CpusetCpus = value
+		return nil
+	}
+	return fmt.Errorf("cannot set cgroups when they are disabled")
+}
+
+// apparmorProfile records value under the "apparmor_profile" key of the
+// container's Context. It always returns nil.
+func apparmorProfile(container *libcontainer.Container, context interface{}, value string) error {
+ container.Context["apparmor_profile"] = value
+ return nil
+}
+
+// cpuShares parses value as a base 10 integer and stores it as the
+// container's cpu shares. It returns an error when the container has no
+// cgroups configuration or when value is not a valid integer.
+func cpuShares(container *libcontainer.Container, context interface{}, value string) error {
+	if container.Cgroups == nil {
+		return fmt.Errorf("cannot set cgroups when they are disabled")
+	}
+	shares, err := strconv.ParseInt(value, 10, 0)
+	if err == nil {
+		container.Cgroups.CpuShares = shares
+	}
+	return err
+}
+
+// memory converts value with utils.RAMInBytes and stores the result as the
+// container's memory limit. It returns an error when the container has no
+// cgroups configuration or when the conversion fails.
+func memory(container *libcontainer.Container, context interface{}, value string) error {
+	if container.Cgroups == nil {
+		return fmt.Errorf("cannot set cgroups when they are disabled")
+	}
+
+	limit, err := utils.RAMInBytes(value)
+	if err == nil {
+		container.Cgroups.Memory = limit
+	}
+	return err
+}
+
+// memorySwap parses value as a 64 bit integer (base taken from the value's
+// prefix) and stores it as the container's memory swap limit. It returns an
+// error when the container has no cgroups configuration or when value does
+// not parse.
+func memorySwap(container *libcontainer.Container, context interface{}, value string) error {
+	if container.Cgroups == nil {
+		return fmt.Errorf("cannot set cgroups when they are disabled")
+	}
+	swap, err := strconv.ParseInt(value, 0, 64)
+	if err == nil {
+		container.Cgroups.MemorySwap = swap
+	}
+	return err
+}
+
+// addCap marks the capability named value as enabled in the container's
+// capabilities mask. It returns an error when the mask has no such entry.
+func addCap(container *libcontainer.Container, context interface{}, value string) error {
+	if entry := container.CapabilitiesMask.Get(value); entry != nil {
+		entry.Enabled = true
+		return nil
+	}
+	return fmt.Errorf("%s is not a valid capability", value)
+}
+
+// dropCap marks the capability named value as disabled in the container's
+// capabilities mask. It returns an error when the mask has no such entry.
+func dropCap(container *libcontainer.Container, context interface{}, value string) error {
+	if entry := container.CapabilitiesMask.Get(value); entry != nil {
+		entry.Enabled = false
+		return nil
+	}
+	return fmt.Errorf("%s is not a valid capability", value)
+}
+
+// addNamespace marks the namespace named value as enabled in the container's
+// namespace list. It returns an error naming the full value when the list has
+// no such entry; an empty value is reported the same way instead of panicking.
+func addNamespace(container *libcontainer.Container, context interface{}, value string) error {
+	ns := container.Namespaces.Get(value)
+	if ns == nil {
+		// Report the value exactly as given: slicing it would drop its first
+		// character and panic on an empty string.
+		return fmt.Errorf("%s is not a valid namespace", value)
+	}
+	ns.Enabled = true
+	return nil
+}
+
+// dropNamespace marks the namespace named value as disabled in the
+// container's namespace list. It returns an error naming the full value when
+// the list has no such entry; an empty value is reported the same way instead
+// of panicking.
+func dropNamespace(container *libcontainer.Container, context interface{}, value string) error {
+	ns := container.Namespaces.Get(value)
+	if ns == nil {
+		// Report the value exactly as given: slicing it would drop its first
+		// character and panic on an empty string.
+		return fmt.Errorf("%s is not a valid namespace", value)
+	}
+	ns.Enabled = false
+	return nil
+}
+
+// readonlyFs sets the container's ReadonlyFs flag: true when value is "1" or
+// "true", false for anything else. It always returns nil.
+func readonlyFs(container *libcontainer.Container, context interface{}, value string) error {
+	container.ReadonlyFs = value == "1" || value == "true"
+	return nil
+}
+
+func joinNetNamespace(container *libcontainer.Container, context interface{}, value string) error {
+ var (
+ running = context.(map[string]*exec.Cmd)
+ cmd = running[value]
+ )
+
+ if cmd == nil || cmd.Process == nil {
+ return fmt.Errorf("%s is not a valid running container to join", value)
+ }
+ nspath := filepath.Join("/proc", fmt.Sprint(cmd.Process.Pid), "ns", "net")
+ container.Networks = append(container.Networks, &libcontainer.Network{
+ Type: "netns",
+ Context: libcontainer.Context{
+ "nspath": nspath,
+ },
+ })
+ return nil
+}
+
+// vethMacAddress stores value under the "mac" key of the Context of the first
+// network of type "veth" configured on the container. It returns an error
+// when the container has no veth network.
+func vethMacAddress(container *libcontainer.Container, context interface{}, value string) error {
+	var veth *libcontainer.Network
+	for _, network := range container.Networks {
+		if network.Type == "veth" {
+			veth = network
+			break
+		}
+	}
+	if veth == nil {
+		return fmt.Errorf("no veth configured for container")
+	}
+	// Writing to a nil map panics, so make sure the Context exists first.
+	if veth.Context == nil {
+		veth.Context = libcontainer.Context{}
+	}
+	veth.Context["mac"] = value
+	return nil
+}
+
+// ParseConfiguration applies user supplied "key=value" options to the
+// container's default configuration. Each key is looked up in the actions
+// table and the matching Action is called with the container, the running
+// map as its context, and the option's value. Processing stops at the first
+// malformed option, unknown key or failing action, and that error is
+// returned.
+//
+// TODO: this can be moved to a general utils or parser in pkg
+func ParseConfiguration(container *libcontainer.Container, running map[string]*exec.Cmd, opts []string) error {
+	for _, opt := range opts {
+		sep := strings.Index(opt, "=")
+		if sep < 0 {
+			return fmt.Errorf("invalid format for %s", opt)
+		}
+		key, val := opt[:sep], opt[sep+1:]
+
+		apply, known := actions[key]
+		if !known {
+			return fmt.Errorf("%s is not a valid option for the native driver", key)
+		}
+
+		if err := apply(container, running, val); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/runtime/execdriver/native/configuration/parse_test.go b/runtime/execdriver/native/configuration/parse_test.go
new file mode 100644
index 0000000000..8001358766
--- /dev/null
+++ b/runtime/execdriver/native/configuration/parse_test.go
@@ -0,0 +1,166 @@
+package configuration
+
+import (
+ "github.com/dotcloud/docker/runtime/execdriver/native/template"
+ "testing"
+)
+
+// TestSetReadonlyRootFs checks that the default template does not have a
+// read only rootfs and that "fs.readonly=true" turns it on.
+func TestSetReadonlyRootFs(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "fs.readonly=true",
+ }
+ )
+
+ if container.ReadonlyFs {
+ t.Fatal("container should not have a readonly rootfs by default")
+ }
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if !container.ReadonlyFs {
+ t.Fatal("container should have a readonly rootfs")
+ }
+}
+
+// TestConfigurationsDoNotConflict checks that options applied to one
+// container created from the template do not leak into a second container
+// created from the same template.
+func TestConfigurationsDoNotConflict(t *testing.T) {
+ var (
+ container1 = template.New()
+ container2 = template.New()
+ opts = []string{
+ "cap.add=NET_ADMIN",
+ }
+ )
+
+ if err := ParseConfiguration(container1, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if !container1.CapabilitiesMask.Get("NET_ADMIN").Enabled {
+ t.Fatal("container one should have NET_ADMIN enabled")
+ }
+ if container2.CapabilitiesMask.Get("NET_ADMIN").Enabled {
+ t.Fatal("container two should not have NET_ADMIN enabled")
+ }
+}
+
+// TestCpusetCpus checks that "cgroups.cpuset.cpus" is stored verbatim in the
+// container's cgroup configuration.
+func TestCpusetCpus(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "cgroups.cpuset.cpus=1,2",
+ }
+ )
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if expected := "1,2"; container.Cgroups.CpusetCpus != expected {
+ t.Fatalf("expected %s got %s for cpuset.cpus", expected, container.Cgroups.CpusetCpus)
+ }
+}
+
+// TestAppArmorProfile checks that "apparmor_profile" replaces the profile
+// name stored in the container's Context.
+func TestAppArmorProfile(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "apparmor_profile=koye-the-protector",
+ }
+ )
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+ if expected := "koye-the-protector"; container.Context["apparmor_profile"] != expected {
+ t.Fatalf("expected profile %s got %s", expected, container.Context["apparmor_profile"])
+ }
+}
+
+// TestCpuShares checks that "cgroups.cpu_shares" is parsed into the
+// container's cgroup cpu shares.
+func TestCpuShares(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "cgroups.cpu_shares=1048",
+ }
+ )
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if expected := int64(1048); container.Cgroups.CpuShares != expected {
+ t.Fatalf("expected cpu shares %d got %d", expected, container.Cgroups.CpuShares)
+ }
+}
+
+// TestCgroupMemory checks that a human readable size ("500m") given to
+// "cgroups.memory" is stored as 500 * 1024 * 1024 bytes.
+func TestCgroupMemory(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "cgroups.memory=500m",
+ }
+ )
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if expected := int64(500 * 1024 * 1024); container.Cgroups.Memory != expected {
+ t.Fatalf("expected memory %d got %d", expected, container.Cgroups.Memory)
+ }
+}
+
+// TestAddCap checks that several "cap.add" options each enable their
+// capability in the container's mask.
+func TestAddCap(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "cap.add=MKNOD",
+ "cap.add=SYS_ADMIN",
+ }
+ )
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if !container.CapabilitiesMask.Get("MKNOD").Enabled {
+ t.Fatal("container should have MKNOD enabled")
+ }
+ if !container.CapabilitiesMask.Get("SYS_ADMIN").Enabled {
+ t.Fatal("container should have SYS_ADMIN enabled")
+ }
+}
+
+// TestDropCap enables every capability in the mask and then checks that
+// "cap.drop=MKNOD" disables MKNOD again.
+func TestDropCap(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "cap.drop=MKNOD",
+ }
+ )
+ // enabled all caps like in privileged mode
+ for _, c := range container.CapabilitiesMask {
+ c.Enabled = true
+ }
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if container.CapabilitiesMask.Get("MKNOD").Enabled {
+ t.Fatal("container should not have MKNOD enabled")
+ }
+}
+
+// TestDropNamespace checks that "ns.drop=NEWNET" leaves the NEWNET namespace
+// disabled on the container.
+func TestDropNamespace(t *testing.T) {
+ var (
+ container = template.New()
+ opts = []string{
+ "ns.drop=NEWNET",
+ }
+ )
+ if err := ParseConfiguration(container, nil, opts); err != nil {
+ t.Fatal(err)
+ }
+
+ if container.Namespaces.Get("NEWNET").Enabled {
+ t.Fatal("container should not have NEWNET enabled")
+ }
+}
diff --git a/runtime/execdriver/native/create.go b/runtime/execdriver/native/create.go
new file mode 100644
index 0000000000..71fab3e064
--- /dev/null
+++ b/runtime/execdriver/native/create.go
@@ -0,0 +1,114 @@
+package native
+
+import (
+ "fmt"
+ "os"
+
+ "github.com/dotcloud/docker/pkg/label"
+ "github.com/dotcloud/docker/pkg/libcontainer"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "github.com/dotcloud/docker/runtime/execdriver/native/configuration"
+ "github.com/dotcloud/docker/runtime/execdriver/native/template"
+)
+
+// createContainer populates and configures the container type with the
+// data provided by the execdriver.Command.
+//
+// It starts from template.New(), copies the basic process settings from c,
+// then applies network, privileged, cgroup, mount and label setup in that
+// order. The user supplied "native" options are parsed last, so they are
+// applied on top of everything set before them. The first failing step
+// aborts the build and its error is returned with a nil container.
+func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container, error) {
+ container := template.New()
+
+ container.Hostname = getEnv("HOSTNAME", c.Env)
+ container.Tty = c.Tty
+ container.User = c.User
+ container.WorkingDir = c.WorkingDir
+ container.Env = c.Env
+ container.Cgroups.Name = c.ID
+ // check to see if we are running in ramdisk to disable pivot root
+ container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
+
+ if err := d.createNetwork(container, c); err != nil {
+ return nil, err
+ }
+ if c.Privileged {
+ if err := d.setPrivileged(container); err != nil {
+ return nil, err
+ }
+ }
+ if err := d.setupCgroups(container, c); err != nil {
+ return nil, err
+ }
+ if err := d.setupMounts(container, c); err != nil {
+ return nil, err
+ }
+ if err := d.setupLabels(container, c); err != nil {
+ return nil, err
+ }
+ if err := configuration.ParseConfiguration(container, d.activeContainers, c.Config["native"]); err != nil {
+ return nil, err
+ }
+ return container, nil
+}
+
+// createNetwork replaces the container's network list with a loopback
+// network and, when the command has a network interface configured, appends
+// a veth network built from that interface. It always returns nil.
+func (d *driver) createNetwork(container *libcontainer.Container, c *execdriver.Command) error {
+	loopback := &libcontainer.Network{
+		Mtu:     c.Network.Mtu,
+		Address: fmt.Sprintf("%s/%d", "127.0.0.1", 0),
+		Gateway: "localhost",
+		Type:    "loopback",
+		Context: libcontainer.Context{},
+	}
+	container.Networks = []*libcontainer.Network{loopback}
+
+	iface := c.Network.Interface
+	if iface == nil {
+		// Networking is disabled: loopback only.
+		return nil
+	}
+
+	container.Networks = append(container.Networks, &libcontainer.Network{
+		Mtu:     c.Network.Mtu,
+		Address: fmt.Sprintf("%s/%d", iface.IPAddress, iface.IPPrefixLen),
+		Gateway: iface.Gateway,
+		Type:    "veth",
+		Context: libcontainer.Context{
+			"prefix": "veth",
+			"bridge": iface.Bridge,
+		},
+	})
+	return nil
+}
+
+// setPrivileged configures the container for privileged mode: every entry of
+// the capabilities mask is enabled, cgroup device access is switched on and
+// the apparmor profile is set to "unconfined". It always returns nil.
+func (d *driver) setPrivileged(container *libcontainer.Container) error {
+ for _, c := range container.CapabilitiesMask {
+ c.Enabled = true
+ }
+ container.Cgroups.DeviceAccess = true
+ container.Context["apparmor_profile"] = "unconfined"
+ return nil
+}
+
+// setupCgroups copies the cpu shares, memory and memory swap limits from the
+// command's Resources onto the container's cgroup configuration. Nothing is
+// changed when the command has no Resources. It always returns nil.
+func (d *driver) setupCgroups(container *libcontainer.Container, c *execdriver.Command) error {
+	limits := c.Resources
+	if limits == nil {
+		return nil
+	}
+	cg := container.Cgroups
+	cg.CpuShares = limits.CpuShares
+	cg.Memory = limits.Memory
+	cg.MemorySwap = limits.MemorySwap
+	return nil
+}
+
+// setupMounts appends one libcontainer.Mount to the container for every mount
+// listed on the command, carrying over source, destination and the writable
+// and private flags. It always returns nil.
+func (d *driver) setupMounts(container *libcontainer.Container, c *execdriver.Command) error {
+ for _, m := range c.Mounts {
+ // The literal is positional, so the argument order must match the field
+ // order of libcontainer.Mount.
+ container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private})
+ }
+ return nil
+}
+
+// setupLabels generates process and mount labels from the first "label"
+// entry of the command's Config and stores them in the container's Context
+// under "process_label" and "mount_label". Without any label entry the
+// container is left unchanged. An error from label generation is returned.
+func (d *driver) setupLabels(container *libcontainer.Container, c *execdriver.Command) error {
+	entries := c.Config["label"]
+	if len(entries) == 0 {
+		return nil
+	}
+
+	// Only the first entry is used.
+	processLabel, mountLabel, err := label.GenLabels(entries[0])
+	if err != nil {
+		return err
+	}
+	container.Context["mount_label"] = mountLabel
+	container.Context["process_label"] = processLabel
+	return nil
+}
diff --git a/runtime/execdriver/native/driver.go b/runtime/execdriver/native/driver.go
new file mode 100644
index 0000000000..d18865e508
--- /dev/null
+++ b/runtime/execdriver/native/driver.go
@@ -0,0 +1,292 @@
+package native
+
+import (
+ "encoding/json"
+ "fmt"
+ "github.com/dotcloud/docker/pkg/cgroups"
+ "github.com/dotcloud/docker/pkg/libcontainer"
+ "github.com/dotcloud/docker/pkg/libcontainer/apparmor"
+ "github.com/dotcloud/docker/pkg/libcontainer/nsinit"
+ "github.com/dotcloud/docker/pkg/system"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "io"
+ "io/ioutil"
+ "log"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+)
+
+const (
+ DriverName = "native"
+ Version = "0.1"
+ BackupApparmorProfilePath = "apparmor/docker.back" // relative to docker root
+)
+
+// init registers the native driver's init function with execdriver. The
+// registered function loads "container.json" from args.Root, builds a sync
+// pipe from the descriptor given in args.Pipe and hands over to nsinit's
+// Init with the current working directory, console and arguments.
+func init() {
+ execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error {
+ var (
+ container *libcontainer.Container
+ ns = nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{args.Root}, createLogger(""))
+ )
+ f, err := os.Open(filepath.Join(args.Root, "container.json"))
+ if err != nil {
+ return err
+ }
+ // The file is closed on both the error and the success path below.
+ if err := json.NewDecoder(f).Decode(&container); err != nil {
+ f.Close()
+ return err
+ }
+ f.Close()
+
+ cwd, err := os.Getwd()
+ if err != nil {
+ return err
+ }
+ syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(args.Pipe))
+ if err != nil {
+ return err
+ }
+ if err := ns.Init(container, cwd, args.Console, syncPipe, args.Args); err != nil {
+ return err
+ }
+ return nil
+ })
+}
+
+// driver is the native execution driver.
+type driver struct {
+ // root is the directory under which a sub-directory per container id is
+ // created.
+ root string
+ // initPath is used as the path of the command started for a container.
+ initPath string
+ // activeContainers maps a container id to the command started for it.
+ activeContainers map[string]*exec.Cmd
+}
+
+// NewDriver creates the driver's root directory (mode 0700), installs the
+// default apparmor profile with a backup path two levels above root, and
+// returns a driver with an empty set of active containers. Any failure is
+// returned with a nil driver.
+func NewDriver(root, initPath string) (*driver, error) {
+ if err := os.MkdirAll(root, 0700); err != nil {
+ return nil, err
+ }
+ // native driver root is at docker_root/execdriver/native. Put apparmor at docker_root
+ if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil {
+ return nil, err
+ }
+ return &driver{
+ root: root,
+ initPath: initPath,
+ activeContainers: make(map[string]*exec.Cmd),
+ }, nil
+}
+
+// Run builds the libcontainer configuration for c, records the command as
+// active, creates the container's state directory, writes container.json
+// into it and executes the container through nsinit. It returns the result
+// of nsinit's Exec, or -1 and an error when a preparation step fails. The
+// state directory is removed again when Run returns.
+//
+// NOTE(review): the entry stored in d.activeContainers is not removed in this
+// function, and the map is written without any locking visible here — confirm
+// that cleanup and synchronisation are handled elsewhere.
+func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) {
+ // take the Command and populate the libcontainer.Container from it
+ container, err := d.createContainer(c)
+ if err != nil {
+ return -1, err
+ }
+ d.activeContainers[c.ID] = &c.Cmd
+
+ var (
+ term nsinit.Terminal
+ factory = &dockerCommandFactory{c: c, driver: d}
+ stateWriter = &dockerStateWriter{
+ callback: startCallback,
+ c: c,
+ dsw: &nsinit.DefaultStateWriter{filepath.Join(d.root, c.ID)},
+ }
+ ns = nsinit.NewNsInit(factory, stateWriter, createLogger(os.Getenv("DEBUG")))
+ args = append([]string{c.Entrypoint}, c.Arguments...)
+ )
+ if err := d.createContainerRoot(c.ID); err != nil {
+ return -1, err
+ }
+ defer d.removeContainerRoot(c.ID)
+
+ // Pick the terminal wrapper matching the command's tty setting.
+ if c.Tty {
+ term = &dockerTtyTerm{
+ pipes: pipes,
+ }
+ } else {
+ term = &dockerStdTerm{
+ pipes: pipes,
+ }
+ }
+ c.Terminal = term
+ if err := d.writeContainerFile(container, c.ID); err != nil {
+ return -1, err
+ }
+ return ns.Exec(container, term, args)
+}
+
+// Kill sends signal sig to the pid of the command's process.
+func (d *driver) Kill(p *execdriver.Command, sig int) error {
+ return syscall.Kill(p.Process.Pid, syscall.Signal(sig))
+}
+
+// Terminate kills the command's process with signal 9, but only when the
+// start time recorded on disk equals the current start time of that pid, so
+// an unrelated process that reused the pid is not killed. The container's
+// state directory is removed afterwards. A missing start file is treated as
+// "already gone" and returns nil.
+func (d *driver) Terminate(p *execdriver.Command) error {
+ // lets check the start time for the process
+ started, err := d.readStartTime(p)
+ if err != nil {
+ // if we don't have the data on disk then we can assume the process is gone
+ // because this is only removed after we know the process has stopped
+ if os.IsNotExist(err) {
+ return nil
+ }
+ return err
+ }
+
+ currentStartTime, err := system.GetProcessStartTime(p.Process.Pid)
+ if err != nil {
+ return err
+ }
+ if started == currentStartTime {
+ err = syscall.Kill(p.Process.Pid, 9)
+ }
+ // The error of the removal is ignored; the kill result is what is returned.
+ d.removeContainerRoot(p.ID)
+ return err
+
+}
+
+// readStartTime returns the contents of the "start" file in the container's
+// state directory as a string, or the read error.
+func (d *driver) readStartTime(p *execdriver.Command) (string, error) {
+ data, err := ioutil.ReadFile(filepath.Join(d.root, p.ID, "start"))
+ if err != nil {
+ return "", err
+ }
+ return string(data), nil
+}
+
+// Info returns an execdriver.Info for the container with the given id, backed
+// by this driver.
+func (d *driver) Info(id string) execdriver.Info {
+ return &info{
+ ID: id,
+ driver: d,
+ }
+}
+
+// Name returns the driver name and version joined by a dash.
+func (d *driver) Name() string {
+ return fmt.Sprintf("%s-%s", DriverName, Version)
+}
+
+// GetPidsForContainer returns the pids listed in the "tasks" file of the
+// container's cgroup in the "devices" subsystem. The file is looked up
+// directly under this process's cgroup directory first and, when that path
+// does not exist, under its "docker" sub-directory. On error the pids
+// collected so far are returned together with the error.
+//
+// TODO: this can be improved with our driver
+// there has to be a better way to do this
+func (d *driver) GetPidsForContainer(id string) ([]int, error) {
+ pids := []int{}
+
+ subsystem := "devices"
+ cgroupRoot, err := cgroups.FindCgroupMountpoint(subsystem)
+ if err != nil {
+ return pids, err
+ }
+ cgroupDir, err := cgroups.GetThisCgroupDir(subsystem)
+ if err != nil {
+ return pids, err
+ }
+
+ filename := filepath.Join(cgroupRoot, cgroupDir, id, "tasks")
+ if _, err := os.Stat(filename); os.IsNotExist(err) {
+ filename = filepath.Join(cgroupRoot, cgroupDir, "docker", id, "tasks")
+ }
+
+ output, err := ioutil.ReadFile(filename)
+ if err != nil {
+ return pids, err
+ }
+ // One pid per line; empty lines are skipped.
+ for _, p := range strings.Split(string(output), "\n") {
+ if len(p) == 0 {
+ continue
+ }
+ pid, err := strconv.Atoi(p)
+ if err != nil {
+ return pids, fmt.Errorf("Invalid pid '%s': %s", p, err)
+ }
+ pids = append(pids, pid)
+ }
+ return pids, nil
+}
+
+// writeContainerFile serialises the container configuration as JSON and
+// writes it to "container.json" inside the state directory of the container
+// with the given id. The marshalling or write error is returned.
+func (d *driver) writeContainerFile(container *libcontainer.Container, id string) error {
+	data, err := json.Marshal(container)
+	if err != nil {
+		return err
+	}
+	// A data file needs no execute bits: owner read/write, others read only.
+	return ioutil.WriteFile(filepath.Join(d.root, id, "container.json"), data, 0644)
+}
+
+// createContainerRoot creates the state directory of the container with the
+// given id below the driver root, including missing parents.
+func (d *driver) createContainerRoot(id string) error {
+	// Directories need the execute (search) bit for the owner as well,
+	// otherwise only a privileged process can enter them.
+	return os.MkdirAll(filepath.Join(d.root, id), 0755)
+}
+
+// removeContainerRoot deletes the state directory of the container with the
+// given id together with everything inside it.
+func (d *driver) removeContainerRoot(id string) error {
+ return os.RemoveAll(filepath.Join(d.root, id))
+}
+
+// getEnv returns the value of the first "KEY=value" entry in env whose key
+// equals key. The value is everything after the first "=", so values that
+// contain "=" themselves are returned whole. Entries without an "=" are
+// ignored. The empty string is returned when no entry matches.
+func getEnv(key string, env []string) string {
+	for _, pair := range env {
+		// Split once only: anything after the first "=" belongs to the value.
+		parts := strings.SplitN(pair, "=", 2)
+		if len(parts) == 2 && parts[0] == key {
+			return parts[1]
+		}
+	}
+	return ""
+}
+
+// dockerCommandFactory builds the command nsinit starts for a container by
+// reusing the exec.Cmd embedded in the execdriver.Command.
+type dockerCommandFactory struct {
+ // c is the command being run.
+ c *execdriver.Command
+ // driver is the native driver that owns the command.
+ driver *driver
+}
+
+// Create returns the exec.Cmd of the wrapped command, configured to start the
+// driver's init binary with the clone flags of the namespaces defined on the
+// container's configuration. The init path inside the rootfs, the driver
+// name, console, pipe descriptor and state directory are passed as arguments,
+// followed by "--" and args. syncFile is handed to the child as its only
+// extra file.
+func (d *dockerCommandFactory) Create(container *libcontainer.Container, console string, syncFile *os.File, args []string) *exec.Cmd {
+ // we need to join the rootfs because nsinit will setup the rootfs and chroot
+ initPath := filepath.Join(d.c.Rootfs, d.c.InitPath)
+
+ d.c.Path = d.driver.initPath
+ d.c.Args = append([]string{
+ initPath,
+ "-driver", DriverName,
+ "-console", console,
+ "-pipe", "3",
+ "-root", filepath.Join(d.driver.root, d.c.ID),
+ "--",
+ }, args...)
+
+ // set this to nil so that when we set the clone flags anything else is reset
+ d.c.SysProcAttr = nil
+ system.SetCloneFlags(&d.c.Cmd, uintptr(nsinit.GetNamespaceFlags(container.Namespaces)))
+ d.c.ExtraFiles = []*os.File{syncFile}
+
+ d.c.Env = container.Env
+ d.c.Dir = d.c.Rootfs
+
+ return &d.c.Cmd
+}
+
+// dockerStateWriter wraps another nsinit.StateWriter so that the container's
+// pid is recorded on the command and the start callback is invoked when the
+// pid is written.
+type dockerStateWriter struct {
+ // dsw is the wrapped state writer that does the actual writing.
+ dsw nsinit.StateWriter
+ // c is the command whose ContainerPid is updated.
+ c *execdriver.Command
+ // callback, when non-nil, is called with c after the pid was written.
+ callback execdriver.StartCallback
+}
+
+// WritePid stores pid on the command, forwards the call to the wrapped state
+// writer and then invokes the start callback if one is set. The callback runs
+// even when the wrapped writer failed; that writer's error is returned.
+func (d *dockerStateWriter) WritePid(pid int, started string) error {
+ d.c.ContainerPid = pid
+ err := d.dsw.WritePid(pid, started)
+ if d.callback != nil {
+ d.callback(d.c)
+ }
+ return err
+}
+
+// DeletePid forwards to the wrapped state writer.
+func (d *dockerStateWriter) DeletePid() error {
+ return d.dsw.DeletePid()
+}
+
+// createLogger returns a logger prefixed with "[libcontainer] " that uses the
+// standard log flags. Output is discarded unless debug is non-empty, in which
+// case it goes to stderr.
+func createLogger(debug string) *log.Logger {
+	var out io.Writer = ioutil.Discard
+	if debug != "" {
+		// debug mode: make the log visible
+		out = os.Stderr
+	}
+	return log.New(out, "[libcontainer] ", log.LstdFlags)
+}
diff --git a/runtime/execdriver/native/info.go b/runtime/execdriver/native/info.go
new file mode 100644
index 0000000000..aef2f85c6b
--- /dev/null
+++ b/runtime/execdriver/native/info.go
@@ -0,0 +1,21 @@
+package native
+
+import (
+ "os"
+ "path/filepath"
+)
+
+// info answers state queries for a single container of the native driver.
+type info struct {
+ // ID is the id of the container the queries are about.
+ ID string
+ // driver is the driver whose root directory holds the container's state.
+ driver *driver
+}
+
+// IsRunning reports whether the container's "pid" file can be stat'ed in its
+// state directory. The file's presence is taken to mean that the container
+// is currently running; any stat error counts as not running.
+func (i *info) IsRunning() bool {
+	pidFile := filepath.Join(i.driver.root, i.ID, "pid")
+	_, err := os.Stat(pidFile)
+	return err == nil
+}
diff --git a/runtime/execdriver/native/template/default_template.go b/runtime/execdriver/native/template/default_template.go
new file mode 100644
index 0000000000..a1ecb04d76
--- /dev/null
+++ b/runtime/execdriver/native/template/default_template.go
@@ -0,0 +1,45 @@
+package template
+
+import (
+ "github.com/dotcloud/docker/pkg/cgroups"
+ "github.com/dotcloud/docker/pkg/libcontainer"
+)
+
+// New returns the docker default configuration for libcontainer.
+//
+// The returned container lists the capabilities and namespaces docker knows
+// about, uses "docker" as cgroup parent with device access disabled and the
+// "docker-default" apparmor profile. MKNOD is explicitly enabled in the
+// capabilities mask before the container is returned. Every call builds a
+// fresh value.
+func New() *libcontainer.Container {
+ container := &libcontainer.Container{
+ CapabilitiesMask: libcontainer.Capabilities{
+ libcontainer.GetCapability("SETPCAP"),
+ libcontainer.GetCapability("SYS_MODULE"),
+ libcontainer.GetCapability("SYS_RAWIO"),
+ libcontainer.GetCapability("SYS_PACCT"),
+ libcontainer.GetCapability("SYS_ADMIN"),
+ libcontainer.GetCapability("SYS_NICE"),
+ libcontainer.GetCapability("SYS_RESOURCE"),
+ libcontainer.GetCapability("SYS_TIME"),
+ libcontainer.GetCapability("SYS_TTY_CONFIG"),
+ libcontainer.GetCapability("AUDIT_WRITE"),
+ libcontainer.GetCapability("AUDIT_CONTROL"),
+ libcontainer.GetCapability("MAC_OVERRIDE"),
+ libcontainer.GetCapability("MAC_ADMIN"),
+ libcontainer.GetCapability("NET_ADMIN"),
+ libcontainer.GetCapability("MKNOD"),
+ },
+ Namespaces: libcontainer.Namespaces{
+ libcontainer.GetNamespace("NEWNS"),
+ libcontainer.GetNamespace("NEWUTS"),
+ libcontainer.GetNamespace("NEWIPC"),
+ libcontainer.GetNamespace("NEWPID"),
+ libcontainer.GetNamespace("NEWNET"),
+ },
+ Cgroups: &cgroups.Cgroup{
+ Parent: "docker",
+ DeviceAccess: false,
+ },
+ Context: libcontainer.Context{
+ "apparmor_profile": "docker-default",
+ },
+ }
+ container.CapabilitiesMask.Get("MKNOD").Enabled = true
+ return container
+}
diff --git a/runtime/execdriver/native/term.go b/runtime/execdriver/native/term.go
new file mode 100644
index 0000000000..0d5298d388
--- /dev/null
+++ b/runtime/execdriver/native/term.go
@@ -0,0 +1,42 @@
+/*
+ These types are wrappers around the libcontainer Terminal interface so that
+ we can reuse the docker implementations where possible.
+*/
+package native
+
+import (
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "io"
+ "os"
+ "os/exec"
+)
+
+// dockerStdTerm is the terminal wrapper used for commands without a tty. It
+// embeds execdriver.StdConsole and keeps the pipes to attach.
+type dockerStdTerm struct {
+ execdriver.StdConsole
+ pipes *execdriver.Pipes
+}
+
+// Attach connects the stored pipes to cmd through the embedded console's
+// AttachPipes.
+func (d *dockerStdTerm) Attach(cmd *exec.Cmd) error {
+ return d.AttachPipes(cmd, d.pipes)
+}
+
+// SetMaster ignores the given file; this wrapper keeps no master pty.
+func (d *dockerStdTerm) SetMaster(master *os.File) {
+ // do nothing
+}
+
+// dockerTtyTerm is the terminal wrapper used for commands with a tty. It
+// embeds execdriver.TtyConsole and keeps the pipes to copy to and from the
+// master pty.
+type dockerTtyTerm struct {
+ execdriver.TtyConsole
+ pipes *execdriver.Pipes
+}
+
+// Attach starts a goroutine copying the master pty to the stdout pipe and,
+// when a stdin pipe is set, another one copying stdin to the master pty. It
+// does not wait for the copies and always returns nil; cmd is not used.
+func (t *dockerTtyTerm) Attach(cmd *exec.Cmd) error {
+ go io.Copy(t.pipes.Stdout, t.MasterPty)
+ if t.pipes.Stdin != nil {
+ go io.Copy(t.MasterPty, t.pipes.Stdin)
+ }
+ return nil
+}
+
+// SetMaster stores master as the console's master pty.
+func (t *dockerTtyTerm) SetMaster(master *os.File) {
+ t.MasterPty = master
+}
diff --git a/runtime/execdriver/pipes.go b/runtime/execdriver/pipes.go
new file mode 100644
index 0000000000..158219f0c5
--- /dev/null
+++ b/runtime/execdriver/pipes.go
@@ -0,0 +1,23 @@
+package execdriver
+
+import (
+ "io"
+)
+
+// Pipes groups the streams attached to a container: a readable, closable
+// Stdin and writers for Stdout and Stderr.
+type Pipes struct {
+ Stdin io.ReadCloser
+ Stdout, Stderr io.Writer
+}
+
+// NewPipes builds a Pipes from the given streams. stdout and stderr are always
+// stored; stdin is stored only when useStdin is true, otherwise Stdin is nil.
+func NewPipes(stdin io.ReadCloser, stdout, stderr io.Writer, useStdin bool) *Pipes {
+	if !useStdin {
+		stdin = nil
+	}
+	return &Pipes{
+		Stdin:  stdin,
+		Stdout: stdout,
+		Stderr: stderr,
+	}
+}
diff --git a/runtime/execdriver/termconsole.go b/runtime/execdriver/termconsole.go
new file mode 100644
index 0000000000..af6b88d3d1
--- /dev/null
+++ b/runtime/execdriver/termconsole.go
@@ -0,0 +1,126 @@
+package execdriver
+
+import (
+ "github.com/dotcloud/docker/pkg/term"
+ "github.com/kr/pty"
+ "io"
+ "os"
+ "os/exec"
+)
+
+// SetTerminal creates the console matching command.Tty (a TtyConsole when
+// true, a StdConsole otherwise), attaches the pipes through the constructor,
+// and stores the result in command.Terminal. A constructor error is returned
+// and command.Terminal is left untouched.
+func SetTerminal(command *Command, pipes *Pipes) error {
+	// Named console rather than term so the imported term package stays visible.
+	var (
+		console Terminal
+		err     error
+	)
+	switch {
+	case command.Tty:
+		console, err = NewTtyConsole(command, pipes)
+	default:
+		console, err = NewStdConsole(command, pipes)
+	}
+	if err != nil {
+		return err
+	}
+	command.Terminal = console
+	return nil
+}
+
+// TtyConsole holds both ends of a pseudo-terminal pair.
+type TtyConsole struct {
+ MasterPty *os.File
+ SlavePty *os.File
+}
+
+// NewTtyConsole opens a new pty pair, attaches pipes to command.Cmd through
+// it, and records the slave's name in command.Console. If attaching fails,
+// both pty ends are closed and the error is returned.
+func NewTtyConsole(command *Command, pipes *Pipes) (*TtyConsole, error) {
+	master, slave, err := pty.Open()
+	if err != nil {
+		return nil, err
+	}
+
+	console := &TtyConsole{MasterPty: master, SlavePty: slave}
+	if attachErr := console.AttachPipes(&command.Cmd, pipes); attachErr != nil {
+		console.Close()
+		return nil, attachErr
+	}
+
+	command.Console = console.SlavePty.Name()
+	return console, nil
+}
+
+// Master returns the master end of the pty.
+func (t *TtyConsole) Master() *os.File {
+ return t.MasterPty
+}
+
+// Resize sets the window size of the master pty to h rows by w columns.
+// Both values are converted to uint16 before being applied.
+func (t *TtyConsole) Resize(h, w int) error {
+	size := &term.Winsize{
+		Height: uint16(h),
+		Width:  uint16(w),
+	}
+	return term.SetWinsize(t.MasterPty.Fd(), size)
+}
+
+// AttachPipes points the command's stdout and stderr at the slave pty and
+// starts a goroutine copying the master pty into pipes.Stdout. When
+// pipes.Stdin is set, the slave also becomes the command's stdin with Setctty
+// enabled, and a second goroutine copies pipes.Stdin into the master, closing
+// pipes.Stdin when the copy ends. It always returns nil.
+func (t *TtyConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error {
+ command.Stdout = t.SlavePty
+ command.Stderr = t.SlavePty
+
+ go func() {
+ // A stdout writer that exposes CloseWriters is closed once the copy ends.
+ if wb, ok := pipes.Stdout.(interface {
+ CloseWriters() error
+ }); ok {
+ defer wb.CloseWriters()
+ }
+ io.Copy(pipes.Stdout, t.MasterPty)
+ }()
+
+ if pipes.Stdin != nil {
+ command.Stdin = t.SlavePty
+ // NOTE(review): dereferences command.SysProcAttr; assumes the caller set it — confirm.
+ command.SysProcAttr.Setctty = true
+
+ go func() {
+ defer pipes.Stdin.Close()
+ io.Copy(t.MasterPty, pipes.Stdin)
+ }()
+ }
+ return nil
+}
+
+// Close closes the slave and then the master pty. Only the master's close
+// error is returned; the slave's is discarded.
+func (t *TtyConsole) Close() error {
+	t.SlavePty.Close()
+	masterErr := t.MasterPty.Close()
+	return masterErr
+}
+
+// StdConsole is the console used when no tty is requested. It has no state.
+type StdConsole struct {
+}
+
+// NewStdConsole creates a StdConsole and attaches pipes to command.Cmd.
+// It returns nil and the error when attaching fails.
+func NewStdConsole(command *Command, pipes *Pipes) (*StdConsole, error) {
+	console := new(StdConsole)
+
+	err := console.AttachPipes(&command.Cmd, pipes)
+	if err != nil {
+		return nil, err
+	}
+	return console, nil
+}
+
+// AttachPipes sets the command's stdout and stderr to the pipes' writers.
+// When pipes.Stdin is set, it opens the command's stdin pipe and starts a
+// goroutine copying pipes.Stdin into it, closing the command's stdin when the
+// copy ends. It returns the error from StdinPipe, if any.
+func (s *StdConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error {
+ command.Stdout = pipes.Stdout
+ command.Stderr = pipes.Stderr
+
+ if pipes.Stdin != nil {
+ stdin, err := command.StdinPipe()
+ if err != nil {
+ return err
+ }
+
+ go func() {
+ defer stdin.Close()
+ io.Copy(stdin, pipes.Stdin)
+ }()
+ }
+ return nil
+}
+
+// Resize is a no-op and always returns nil.
+func (s *StdConsole) Resize(h, w int) error {
+ // we do not need to resize a non tty
+ return nil
+}
+
+// Close is a no-op and always returns nil.
+func (s *StdConsole) Close() error {
+ // nothing to close here
+ return nil
+}
diff --git a/runtime/graphdriver/aufs/aufs.go b/runtime/graphdriver/aufs/aufs.go
new file mode 100644
index 0000000000..401bbd8c86
--- /dev/null
+++ b/runtime/graphdriver/aufs/aufs.go
@@ -0,0 +1,401 @@
+/*
+
+aufs driver directory structure
+
+.
+├── layers // Metadata of layers
+│   ├── 1
+│   ├── 2
+│   └── 3
+├── diff // Content of the layer
+│   ├── 1 // Contains layers that need to be mounted for the id
+│   ├── 2
+│   └── 3
+└── mnt // Mount points for the rw layers to be mounted
+ ├── 1
+ ├── 2
+ └── 3
+
+*/
+
+package aufs
+
+import (
+ "bufio"
+ "fmt"
+ "github.com/dotcloud/docker/archive"
+ mountpk "github.com/dotcloud/docker/pkg/mount"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "github.com/dotcloud/docker/utils"
+ "os"
+ "os/exec"
+ "path"
+ "strings"
+ "sync"
+)
+
+var (
+ // ErrAufsNotSupported is returned by supportsAufs when no line of
+ // /proc/filesystems mentions aufs.
+ ErrAufsNotSupported = fmt.Errorf("AUFS was not found in /proc/filesystems")
+)
+
+// init registers Init with graphdriver under the name "aufs".
+func init() {
+ graphdriver.Register("aufs", Init)
+}
+
+// Driver is the aufs graph driver. root is the directory holding the mnt,
+// diff and layers subdirectories; active counts outstanding Get calls per id.
+type Driver struct {
+ root string
+ sync.Mutex // Protects concurrent modification to active
+ active map[string]int
+}
+
+// Init returns a new AUFS driver rooted at root.
+// An error is returned if AUFS is not supported or if the root directory or
+// any of its mnt, diff and layers subdirectories cannot be created.
+func Init(root string) (graphdriver.Driver, error) {
+	// supportsAufs also tries to load the aufs kernel module.
+	if err := supportsAufs(); err != nil {
+		return nil, err
+	}
+
+	driver := &Driver{
+		root:   root,
+		active: make(map[string]int),
+	}
+
+	// Create the root dir; an "already exists" error short-circuits with the
+	// driver as-is, any other error aborts.
+	if err := os.MkdirAll(root, 0755); err != nil {
+		if os.IsExist(err) {
+			return driver, nil
+		}
+		return nil, err
+	}
+
+	// Populate the directory structure.
+	for _, sub := range []string{"mnt", "diff", "layers"} {
+		if err := os.MkdirAll(path.Join(root, sub), 0755); err != nil {
+			return nil, err
+		}
+	}
+	return driver, nil
+}
+
+// supportsAufs returns nil when /proc/filesystems lists aufs and
+// ErrAufsNotSupported when it does not.
+//
+// A best-effort "modprobe aufs" runs first. Its result is ignored: modprobe
+// fails in some environments (the original note cites dind), so the
+// /proc/filesystems check is what decides. Errors opening or reading
+// /proc/filesystems are returned unchanged.
+func supportsAufs() error {
+	exec.Command("modprobe", "aufs").Run()
+
+	f, err := os.Open("/proc/filesystems")
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		if strings.Contains(s.Text(), "aufs") {
+			return nil
+		}
+	}
+	// A read failure is not evidence that aufs is missing; report it as-is.
+	if err := s.Err(); err != nil {
+		return err
+	}
+	return ErrAufsNotSupported
+}
+
+// rootPath returns the driver's root directory.
+// The receiver is a pointer so the embedded sync.Mutex is not copied per call.
+func (a *Driver) rootPath() string {
+	return a.root
+}
+
+// String returns the driver name, "aufs".
+// The receiver is a pointer so the embedded sync.Mutex is not copied per call.
+func (*Driver) String() string {
+	return "aufs"
+}
+
+// Status reports the root directory and the number of ids found in the layers
+// directory, as ("Root Dir", path) and ("Dirs", count) pairs. An error while
+// listing layers is ignored and yields a count of 0.
+// The receiver is a pointer so the embedded sync.Mutex is not copied per call.
+func (a *Driver) Status() [][2]string {
+	ids, _ := loadIds(path.Join(a.rootPath(), "layers"))
+	return [][2]string{
+		{"Root Dir", a.rootPath()},
+		{"Dirs", fmt.Sprintf("%d", len(ids))},
+	}
+}
+
+// Exists returns true if the given id is registered with this driver, i.e.
+// if its entry under the layers directory can be lstat'ed.
+// The receiver is a pointer so the embedded sync.Mutex is not copied per call.
+func (a *Driver) Exists(id string) bool {
+	_, err := os.Lstat(path.Join(a.rootPath(), "layers", id))
+	return err == nil
+}
+
+// Create registers id with the driver: it creates the mnt and diff
+// directories for id and writes its layers file, which lists parent followed
+// by the parent's own ancestors, one per line. With an empty parent the
+// layers file is empty. mountLabel is not used.
+//
+// The parent's ancestry is resolved before anything is written, so an unknown
+// parent fails without leaving a half-registered id behind. The error from
+// closing the layers file is returned so a failed write is not silently lost.
+func (a *Driver) Create(id, parent string, mountLabel string) error {
+	var ancestry []string
+	if parent != "" {
+		ids, err := getParentIds(a.rootPath(), parent)
+		if err != nil {
+			return err
+		}
+		ancestry = append([]string{parent}, ids...)
+	}
+
+	if err := a.createDirsFor(id); err != nil {
+		return err
+	}
+
+	// Write the layers metadata.
+	f, err := os.Create(path.Join(a.rootPath(), "layers", id))
+	if err != nil {
+		return err
+	}
+	for _, layer := range ancestry {
+		if _, err := fmt.Fprintln(f, layer); err != nil {
+			f.Close()
+			return err
+		}
+	}
+	return f.Close()
+}
+
+// createDirsFor creates the mnt/<id> and diff/<id> directories under the
+// driver root, stopping at the first error.
+func (a *Driver) createDirsFor(id string) error {
+	for _, sub := range [...]string{"mnt", "diff"} {
+		dir := path.Join(a.rootPath(), sub, id)
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Remove unmounts id if it is mounted, deletes its mnt and diff directories
+// and removes its layers file. A warning is logged when id still has a
+// non-zero active count. Directories or a layers file that do not exist are
+// not treated as errors.
+func (a *Driver) Remove(id string) error {
+ // Protect the a.active from concurrent access
+ a.Lock()
+ defer a.Unlock()
+
+ if a.active[id] != 0 {
+ utils.Errorf("Warning: removing active id %s\n", id)
+ }
+
+ // Make sure the dir is umounted first
+ if err := a.unmount(id); err != nil {
+ return err
+ }
+ tmpDirs := []string{
+ "mnt",
+ "diff",
+ }
+
+ // Atomically remove each directory in turn by first moving it out of the
+ // way (so that docker doesn't find it anymore) before doing removal of
+ // the whole tree.
+ for _, p := range tmpDirs {
+
+ realPath := path.Join(a.rootPath(), p, id)
+ tmpPath := path.Join(a.rootPath(), p, fmt.Sprintf("%s-removing", id))
+ if err := os.Rename(realPath, tmpPath); err != nil && !os.IsNotExist(err) {
+ return err
+ }
+ // Deferred: the renamed trees are deleted when Remove returns.
+ defer os.RemoveAll(tmpPath)
+ }
+
+ // Remove the layers file for the id
+ if err := os.Remove(path.Join(a.rootPath(), "layers", id)); err != nil && !os.IsNotExist(err) {
+ return err
+ }
+ return nil
+}
+
+// Get returns the rootfs path for the id and increments its active count.
+// An id with parent layers is mounted at its mnt path on the first Get; an id
+// without parents (or without a layers file) is served from its diff path
+// with no mount.
+func (a *Driver) Get(id string) (string, error) {
+ ids, err := getParentIds(a.rootPath(), id)
+ if err != nil {
+ // A missing layers file is treated the same as "no parents".
+ if !os.IsNotExist(err) {
+ return "", err
+ }
+ ids = []string{}
+ }
+
+ // Protect the a.active from concurrent access
+ a.Lock()
+ defer a.Unlock()
+
+ count := a.active[id]
+
+ // If a dir does not have a parent ( no layers )do not try to mount
+ // just return the diff path to the data
+ out := path.Join(a.rootPath(), "diff", id)
+ if len(ids) > 0 {
+ out = path.Join(a.rootPath(), "mnt", id)
+
+ if count == 0 {
+ if err := a.mount(id); err != nil {
+ return "", err
+ }
+ }
+ }
+
+ a.active[id] = count + 1
+
+ return out, nil
+}
+
+// Put releases one reference to id. While more than one reference remains the
+// count is just decremented; otherwise the id is unmounted (only if it has
+// parent layers) and dropped from the active map. Errors from reading the
+// layers file and from unmounting are ignored.
+func (a *Driver) Put(id string) {
+	// Protect the a.active from concurrent access
+	a.Lock()
+	defer a.Unlock()
+
+	count := a.active[id]
+	if count > 1 {
+		a.active[id] = count - 1
+		return
+	}
+
+	// We only mounted if there are any parents
+	if ids, _ := getParentIds(a.rootPath(), id); len(ids) > 0 {
+		a.unmount(id)
+	}
+	delete(a.active, id)
+}
+
+// Diff returns an uncompressed archive of the contents of id's diff directory.
+func (a *Driver) Diff(id string) (archive.Archive, error) {
+	options := &archive.TarOptions{
+		Compression: archive.Uncompressed,
+	}
+	return archive.TarFilter(path.Join(a.rootPath(), "diff", id), options)
+}
+
+// ApplyDiff untars diff into id's diff directory.
+func (a *Driver) ApplyDiff(id string, diff archive.ArchiveReader) error {
+ return archive.Untar(diff, path.Join(a.rootPath(), "diff", id), nil)
+}
+
+// DiffSize returns the size of the contents of id's diff directory, as
+// computed by utils.TreeSize.
+func (a *Driver) DiffSize(id string) (int64, error) {
+ return utils.TreeSize(path.Join(a.rootPath(), "diff", id))
+}
+
+// Changes returns the changes between id's diff directory and the diff
+// directories of its parent layers. It fails when id has no parent layers.
+func (a *Driver) Changes(id string) ([]archive.Change, error) {
+	parents, err := a.getParentLayerPaths(id)
+	if err != nil {
+		return nil, err
+	}
+	diffDir := path.Join(a.rootPath(), "diff", id)
+	return archive.Changes(parents, diffDir)
+}
+
+// getParentLayerPaths returns the diff directory of every parent listed in
+// id's layers file, in file order. It returns an error when the layers file
+// cannot be read or lists no parents.
+func (a *Driver) getParentLayerPaths(id string) ([]string, error) {
+	parentIds, err := getParentIds(a.rootPath(), id)
+	if err != nil {
+		return nil, err
+	}
+	if len(parentIds) == 0 {
+		return nil, fmt.Errorf("Dir %s does not have any parent layers", id)
+	}
+
+	// Get the diff paths for all the parent ids
+	layers := make([]string, 0, len(parentIds))
+	for _, parent := range parentIds {
+		layers = append(layers, path.Join(a.rootPath(), "diff", parent))
+	}
+	return layers, nil
+}
+
+// mount mounts id's parent layers (read-only) plus its diff directory
+// (read-write) at mnt/<id>. It is a no-op when the target is already mounted.
+func (a *Driver) mount(id string) error {
+	// If the id is mounted or we get an error return
+	mounted, err := a.mounted(id)
+	if err != nil {
+		return err
+	}
+	if mounted {
+		return nil
+	}
+
+	layers, err := a.getParentLayerPaths(id)
+	if err != nil {
+		return err
+	}
+
+	rw := path.Join(a.rootPath(), "diff", id)
+	target := path.Join(a.rootPath(), "mnt", id)
+	return a.aufsMount(layers, rw, target)
+}
+
+// unmount unmounts mnt/<id> when it is currently mounted and does nothing
+// otherwise. An error from the mounted check is returned as-is.
+func (a *Driver) unmount(id string) error {
+	mounted, err := a.mounted(id)
+	if err != nil {
+		return err
+	}
+	if !mounted {
+		return nil
+	}
+	return Unmount(path.Join(a.rootPath(), "mnt", id))
+}
+
+// mounted reports whether mnt/<id> is a mount point, as determined by
+// mountpk.Mounted.
+func (a *Driver) mounted(id string) (bool, error) {
+ target := path.Join(a.rootPath(), "mnt", id)
+ return mountpk.Mounted(target)
+}
+
+// Cleanup unmounts the mount point of every id found in the layers directory.
+// Unmount failures are logged and skipped; only a failure to list the layers
+// directory is returned.
+func (a *Driver) Cleanup() error {
+	layersDir := path.Join(a.rootPath(), "layers")
+	ids, err := loadIds(layersDir)
+	if err != nil {
+		return err
+	}
+	for _, id := range ids {
+		unmountErr := a.unmount(id)
+		if unmountErr == nil {
+			continue
+		}
+		utils.Errorf("Unmounting %s: %s", utils.TruncateID(id), unmountErr)
+	}
+	return nil
+}
+
+// aufsMount mounts rw plus the ro layers at target. It first tries a single
+// mount call with every branch; if that fails it mounts rw alone and appends
+// each ro layer with a remount. If the function ends with an error, target is
+// unmounted.
+func (a *Driver) aufsMount(ro []string, rw, target string) (err error) {
+ // Reads the named result, so it sees whichever error the function returns.
+ defer func() {
+ if err != nil {
+ Unmount(target)
+ }
+ }()
+
+ if err = a.tryMount(ro, rw, target); err != nil {
+ if err = a.mountRw(rw, target); err != nil {
+ return
+ }
+
+ for _, layer := range ro {
+ branch := fmt.Sprintf("append:%s=ro+wh", layer)
+ if err = mount("none", target, "aufs", MsRemount, branch); err != nil {
+ return
+ }
+ }
+ }
+ return
+}
+
+// tryMount attempts the aufs fast path: one mount call that lists the rw
+// branch followed by every ro branch. The caller falls back to appending the
+// ro layers one at a time when this fails.
+func (a *Driver) tryMount(ro []string, rw, target string) (err error) {
+	roBranches := strings.Join(ro, "=ro+wh:") + "=ro+wh:"
+	data := fmt.Sprintf("br:%s=rw:%s,xino=/dev/shm/aufs.xino", rw, roBranches)
+	return mount("none", target, "aufs", 0, data)
+}
+
+// mountRw mounts only the rw branch at target.
+func (a *Driver) mountRw(rw, target string) error {
+	data := fmt.Sprintf("br:%s,xino=/dev/shm/aufs.xino", rw)
+	return mount("none", target, "aufs", 0, data)
+}
+
+// rollbackMount unmounts target when err is non-nil and does nothing otherwise.
+// NOTE(review): no caller is visible in this part of the file — confirm it is still used.
+func rollbackMount(target string, err error) {
+ if err != nil {
+ Unmount(target)
+ }
+}
diff --git a/runtime/graphdriver/aufs/aufs_test.go b/runtime/graphdriver/aufs/aufs_test.go
new file mode 100644
index 0000000000..9cfdebd160
--- /dev/null
+++ b/runtime/graphdriver/aufs/aufs_test.go
@@ -0,0 +1,697 @@
+package aufs
+
+import (
+ "crypto/sha256"
+ "encoding/hex"
+ "fmt"
+ "github.com/dotcloud/docker/archive"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "io/ioutil"
+ "os"
+ "path"
+ "testing"
+)
+
+var (
+ // tmp is the driver root shared by every test in this file.
+ tmp = path.Join(os.TempDir(), "aufs-tests", "aufs")
+)
+
+// testInit calls Init on dir and returns the driver. The test is skipped when
+// aufs is not supported and fails on any other error.
+func testInit(dir string, t *testing.T) graphdriver.Driver {
+	d, err := Init(dir)
+	switch {
+	case err == nil:
+		// driver is ready
+	case err == ErrAufsNotSupported:
+		t.Skip(err)
+	default:
+		t.Fatal(err)
+	}
+	return d
+}
+
+// newDriver creates tmp and returns an initialized *Driver rooted there.
+func newDriver(t *testing.T) *Driver {
+ if err := os.MkdirAll(tmp, 0755); err != nil {
+ t.Fatal(err)
+ }
+
+ d := testInit(tmp, t)
+ return d.(*Driver)
+}
+
+// TestNewDriver checks that Init returns a non-nil driver.
+func TestNewDriver(t *testing.T) {
+ if err := os.MkdirAll(tmp, 0755); err != nil {
+ t.Fatal(err)
+ }
+
+ d := testInit(tmp, t)
+ defer os.RemoveAll(tmp)
+ if d == nil {
+ t.Fatalf("Driver should not be nil")
+ }
+}
+
+// TestAufsString checks that the driver's String method returns "aufs".
+func TestAufsString(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if d.String() != "aufs" {
+ t.Fatalf("Expected aufs got %s", d.String())
+ }
+}
+
+// TestCreateDirStructure checks that Init creates the mnt, layers and diff
+// directories under the root.
+func TestCreateDirStructure(t *testing.T) {
+ newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ paths := []string{
+ "mnt",
+ "layers",
+ "diff",
+ }
+
+ for _, p := range paths {
+ if _, err := os.Stat(path.Join(tmp, p)); err != nil {
+ t.Fatal(err)
+ }
+ }
+}
+
+// TestNewDriverFromExistingDir checks that two drivers can be initialized on
+// the same directory structure.
+func TestNewDriverFromExistingDir(t *testing.T) {
+	if err := os.MkdirAll(tmp, 0755); err != nil {
+		t.Fatal(err)
+	}
+	// Deferred so tmp is also removed when testInit skips or fails the test.
+	defer os.RemoveAll(tmp)
+
+	testInit(tmp, t)
+	testInit(tmp, t)
+}
+
+// TestCreateNewDir checks that Create succeeds for an id without a parent.
+func TestCreateNewDir(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestCreateNewDirStructure checks that Create makes an entry for the id under
+// mnt, diff and layers.
+func TestCreateNewDirStructure(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ paths := []string{
+ "mnt",
+ "diff",
+ "layers",
+ }
+
+ for _, p := range paths {
+ if _, err := os.Stat(path.Join(tmp, p, "1")); err != nil {
+ t.Fatal(err)
+ }
+ }
+}
+
+// TestRemoveImage checks that Remove deletes the id's entries under mnt, diff
+// and layers.
+func TestRemoveImage(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := d.Remove("1"); err != nil {
+ t.Fatal(err)
+ }
+
+ paths := []string{
+ "mnt",
+ "diff",
+ "layers",
+ }
+
+ for _, p := range paths {
+ if _, err := os.Stat(path.Join(tmp, p, "1")); err == nil {
+ t.Fatalf("Error should not be nil because dirs with id 1 should be delted: %s", p)
+ }
+ }
+}
+
+// TestGetWithoutParent checks that Get returns the diff path for an id that
+// has no parent.
+func TestGetWithoutParent(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ diffPath, err := d.Get("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+ expected := path.Join(tmp, "diff", "1")
+ if diffPath != expected {
+ t.Fatalf("Expected path %s got %s", expected, diffPath)
+ }
+}
+
+// TestCleanupWithNoDirs checks that Cleanup succeeds when no ids exist.
+func TestCleanupWithNoDirs(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Cleanup(); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestCleanupWithDir checks that Cleanup succeeds with one unmounted id.
+func TestCleanupWithDir(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := d.Cleanup(); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestMountedFalseResponse checks that mounted reports false for an id that
+// was created but never fetched with Get.
+func TestMountedFalseResponse(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ response, err := d.mounted("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if response != false {
+ t.Fatalf("Response if dir id 1 is mounted should be false")
+ }
+}
+
+// TestMountedTrueReponse checks that mounted reports true for an id with a
+// parent after Get has been called on it.
+func TestMountedTrueReponse(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+ defer d.Cleanup()
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ if err := d.Create("2", "1", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ _, err := d.Get("2")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ response, err := d.mounted("2")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if response != true {
+ t.Fatalf("Response if dir id 2 is mounted should be true")
+ }
+}
+
+// TestMountWithParent checks that Get on an id with a parent returns its mnt
+// path.
+func TestMountWithParent(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ if err := d.Create("2", "1", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if err := d.Cleanup(); err != nil {
+ t.Fatal(err)
+ }
+ }()
+
+ mntPath, err := d.Get("2")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if mntPath == "" {
+ t.Fatal("mntPath should not be empty string")
+ }
+
+ expected := path.Join(tmp, "mnt", "2")
+ if mntPath != expected {
+ t.Fatalf("Expected %s got %s", expected, mntPath)
+ }
+}
+
+// TestRemoveMountedDir checks that Remove succeeds on an id that is currently
+// mounted.
+func TestRemoveMountedDir(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ if err := d.Create("2", "1", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if err := d.Cleanup(); err != nil {
+ t.Fatal(err)
+ }
+ }()
+
+ mntPath, err := d.Get("2")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if mntPath == "" {
+ t.Fatal("mntPath should not be empty string")
+ }
+
+ mounted, err := d.mounted("2")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if !mounted {
+ t.Fatalf("Dir id 2 should be mounted")
+ }
+
+ if err := d.Remove("2"); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestCreateWithInvalidParent checks that Create fails when the parent id
+// does not exist.
+func TestCreateWithInvalidParent(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "docker", ""); err == nil {
+ t.Fatalf("Error should not be nil with parent does not exist")
+ }
+}
+
+// TestGetDiff checks that Diff returns a non-nil archive for an id whose diff
+// directory contains a file.
+func TestGetDiff(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ diffPath, err := d.Get("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Add a file to the diff path with a fixed size
+ size := int64(1024)
+
+ f, err := os.Create(path.Join(diffPath, "test_file"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ if err := f.Truncate(size); err != nil {
+ t.Fatal(err)
+ }
+ f.Close()
+
+ a, err := d.Diff("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if a == nil {
+ t.Fatalf("Archive should not be nil")
+ }
+}
+
+// TestChanges checks that a file written through an id's mount point is
+// reported by Changes as the single added path, first for id 2 (child of 1)
+// and then for id 3 (child of 2).
+func TestChanges(t *testing.T) {
+	d := newDriver(t)
+	defer os.RemoveAll(tmp)
+
+	if err := d.Create("1", "", ""); err != nil {
+		t.Fatal(err)
+	}
+	if err := d.Create("2", "1", ""); err != nil {
+		t.Fatal(err)
+	}
+
+	defer func() {
+		if err := d.Cleanup(); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	mntPoint, err := d.Get("2")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Create a file to save in the mountpoint
+	f, err := os.Create(path.Join(mntPoint, "test.txt"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if _, err := f.WriteString("testline"); err != nil {
+		t.Fatal(err)
+	}
+	if err := f.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	changes, err := d.Changes("2")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(changes) != 1 {
+		t.Fatalf("Dir 2 should have one change from parent got %d", len(changes))
+	}
+	change := changes[0]
+
+	expectedPath := "/test.txt"
+	if change.Path != expectedPath {
+		t.Fatalf("Expected path %s got %s", expectedPath, change.Path)
+	}
+
+	if change.Kind != archive.ChangeAdd {
+		t.Fatalf("Change kind should be ChangeAdd got %s", change.Kind)
+	}
+
+	if err := d.Create("3", "2", ""); err != nil {
+		t.Fatal(err)
+	}
+	mntPoint, err = d.Get("3")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Create a file to save in the mountpoint
+	f, err = os.Create(path.Join(mntPoint, "test2.txt"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if _, err := f.WriteString("testline"); err != nil {
+		t.Fatal(err)
+	}
+	if err := f.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	changes, err = d.Changes("3")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// This check is about id 3, so the failure message names dir 3.
+	if len(changes) != 1 {
+		t.Fatalf("Dir 3 should have one change from parent got %d", len(changes))
+	}
+	change = changes[0]
+
+	expectedPath = "/test2.txt"
+	if change.Path != expectedPath {
+		t.Fatalf("Expected path %s got %s", expectedPath, change.Path)
+	}
+
+	if change.Kind != archive.ChangeAdd {
+		t.Fatalf("Change kind should be ChangeAdd got %s", change.Kind)
+	}
+}
+
+// TestDiffSize checks that DiffSize equals the size of the single file
+// written into the id's diff directory.
+func TestDiffSize(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ diffPath, err := d.Get("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Add a file to the diff path with a fixed size
+ size := int64(1024)
+
+ f, err := os.Create(path.Join(diffPath, "test_file"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ if err := f.Truncate(size); err != nil {
+ t.Fatal(err)
+ }
+ s, err := f.Stat()
+ if err != nil {
+ t.Fatal(err)
+ }
+ size = s.Size()
+ if err := f.Close(); err != nil {
+ t.Fatal(err)
+ }
+
+ diffSize, err := d.DiffSize("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if diffSize != size {
+ t.Fatalf("Expected size to be %d got %d", size, diffSize)
+ }
+}
+
+// TestChildDiffSize checks that a freshly created child id has a DiffSize of
+// zero even though its parent's diff directory contains a file.
+func TestChildDiffSize(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+ defer d.Cleanup()
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ diffPath, err := d.Get("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Add a file to the diff path with a fixed size
+ size := int64(1024)
+
+ f, err := os.Create(path.Join(diffPath, "test_file"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ if err := f.Truncate(size); err != nil {
+ t.Fatal(err)
+ }
+ s, err := f.Stat()
+ if err != nil {
+ t.Fatal(err)
+ }
+ size = s.Size()
+ if err := f.Close(); err != nil {
+ t.Fatal(err)
+ }
+
+ diffSize, err := d.DiffSize("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if diffSize != size {
+ t.Fatalf("Expected size to be %d got %d", size, diffSize)
+ }
+
+ if err := d.Create("2", "1", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ diffSize, err = d.DiffSize("2")
+ if err != nil {
+ t.Fatal(err)
+ }
+ // The diff size for the child should be zero
+ if diffSize != 0 {
+ t.Fatalf("Expected size to be %d got %d", 0, diffSize)
+ }
+}
+
+// TestExists checks that Exists is true for a created id and false for an
+// unknown one.
+func TestExists(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+ defer d.Cleanup()
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ if d.Exists("none") {
+ t.Fatal("id name should not exist in the driver")
+ }
+
+ if !d.Exists("1") {
+ t.Fatal("id 1 should exist in the driver")
+ }
+}
+
+// TestStatus checks that Status reports the root directory and a dir count of
+// "1" after one id has been created.
+func TestStatus(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+ defer d.Cleanup()
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ status := d.Status()
+ if status == nil || len(status) == 0 {
+ t.Fatal("Status should not be nil or empty")
+ }
+ rootDir := status[0]
+ dirs := status[1]
+ if rootDir[0] != "Root Dir" {
+ t.Fatalf("Expected Root Dir got %s", rootDir[0])
+ }
+ if rootDir[1] != d.rootPath() {
+ t.Fatalf("Expected %s got %s", d.rootPath(), rootDir[1])
+ }
+ if dirs[0] != "Dirs" {
+ t.Fatalf("Expected Dirs got %s", dirs[0])
+ }
+ if dirs[1] != "1" {
+ t.Fatalf("Expected 1 got %s", dirs[1])
+ }
+}
+
+// TestApplyDiff checks that a diff taken from id 1 and applied to id 3 makes
+// the file visible at id 3's mount point.
+func TestApplyDiff(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+ defer d.Cleanup()
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ diffPath, err := d.Get("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Add a file to the diff path with a fixed size
+ size := int64(1024)
+
+ f, err := os.Create(path.Join(diffPath, "test_file"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ if err := f.Truncate(size); err != nil {
+ t.Fatal(err)
+ }
+ f.Close()
+
+ diff, err := d.Diff("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if err := d.Create("2", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ if err := d.Create("3", "2", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := d.ApplyDiff("3", diff); err != nil {
+ t.Fatal(err)
+ }
+
+ // Ensure that the file is in the mount point for id 3
+
+ mountPoint, err := d.Get("3")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if _, err := os.Stat(path.Join(mountPoint, "test_file")); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// hash returns the hex-encoded SHA-256 digest of c.
+func hash(c string) string {
+	digest := sha256.New()
+	fmt.Fprint(digest, c)
+	sum := digest.Sum(nil)
+	return hex.EncodeToString(sum)
+}
+
+// TestMountMoreThan42Layers builds a chain of 126 layers, each child of the
+// previous one, writes one file per layer and removes the parent's file on
+// every tenth layer. It then checks that the top layer's mount point lists
+// the expected number of files.
+func TestMountMoreThan42Layers(t *testing.T) {
+ d := newDriver(t)
+ defer os.RemoveAll(tmp)
+ defer d.Cleanup()
+ var last string
+ var expected int
+
+ for i := 1; i < 127; i++ {
+ expected++
+ var (
+ parent = fmt.Sprintf("%d", i-1)
+ current = fmt.Sprintf("%d", i)
+ )
+
+ if parent == "0" {
+ parent = ""
+ } else {
+ parent = hash(parent)
+ }
+ current = hash(current)
+
+ if err := d.Create(current, parent, ""); err != nil {
+ t.Logf("Current layer %d", i)
+ t.Fatal(err)
+ }
+ point, err := d.Get(current)
+ if err != nil {
+ t.Logf("Current layer %d", i)
+ t.Fatal(err)
+ }
+ f, err := os.Create(path.Join(point, current))
+ if err != nil {
+ t.Logf("Current layer %d", i)
+ t.Fatal(err)
+ }
+ f.Close()
+
+ if i%10 == 0 {
+ if err := os.Remove(path.Join(point, parent)); err != nil {
+ t.Logf("Current layer %d", i)
+ t.Fatal(err)
+ }
+ expected--
+ }
+ last = current
+ }
+
+ // Perform the actual mount for the top most image
+ point, err := d.Get(last)
+ if err != nil {
+ t.Fatal(err)
+ }
+ files, err := ioutil.ReadDir(point)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(files) != expected {
+ t.Fatalf("Expected %d got %d", expected, len(files))
+ }
+}
diff --git a/runtime/graphdriver/aufs/dirs.go b/runtime/graphdriver/aufs/dirs.go
new file mode 100644
index 0000000000..fb9b81edd2
--- /dev/null
+++ b/runtime/graphdriver/aufs/dirs.go
@@ -0,0 +1,46 @@
+package aufs
+
+import (
+ "bufio"
+ "io/ioutil"
+ "os"
+ "path"
+)
+
+// loadIds returns the names of all entries in root that are not directories.
+// The result is an empty, non-nil slice when there are none.
+func loadIds(root string) ([]string, error) {
+	entries, err := ioutil.ReadDir(root)
+	if err != nil {
+		return nil, err
+	}
+
+	ids := []string{}
+	for _, entry := range entries {
+		if entry.IsDir() {
+			continue
+		}
+		ids = append(ids, entry.Name())
+	}
+	return ids, nil
+}
+
+// getParentIds reads root/layers/<id> and returns its non-empty lines, one
+// parent id per line, in file order.
+//
+// A file with no non-empty lines means the id has no parent; an empty,
+// non-nil slice is returned. Errors opening or scanning the file are returned.
+func getParentIds(root, id string) ([]string, error) {
+	f, err := os.Open(path.Join(root, "layers", id))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	parents := []string{}
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if line == "" {
+			continue
+		}
+		parents = append(parents, line)
+	}
+	return parents, scanner.Err()
+}
diff --git a/runtime/graphdriver/aufs/migrate.go b/runtime/graphdriver/aufs/migrate.go
new file mode 100644
index 0000000000..400e260797
--- /dev/null
+++ b/runtime/graphdriver/aufs/migrate.go
@@ -0,0 +1,194 @@
+package aufs
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path"
+)
+
+// metadata is the subset of a pre-migration JSON config read by loadMetadata.
+// parent is not decoded from JSON; migrateImages sets it from ParentID.
+type metadata struct {
+ ID string `json:"id"`
+ ParentID string `json:"parent,omitempty"`
+ Image string `json:"Image,omitempty"`
+
+ parent *metadata
+}
+
+// pathExists reports whether os.Stat succeeds on pth. Any Stat error, not
+// only "does not exist", yields false.
+func pathExists(pth string) bool {
+	_, err := os.Stat(pth)
+	return err == nil
+}
+
+// Migrate moves existing images and containers from docker < 0.7.x into the
+// driver's layout.
+//
+// Before 0.7 docker stored metadata and filesystem content in the same
+// directory. Image layer data is moved from <pth>/graph/<id>/layer to the diff
+// of the registered id, each container's rw layer is moved to the diff of the
+// driver, and the image and container ids are then registered with the driver.
+//
+// Each move is tried as a rename first; when that fails (for instance because
+// the data is currently mounted) a symlink is created instead.
+//
+// Nothing is done when <pth>/graph does not exist.
+func (a *Driver) Migrate(pth string, setupInit func(p string) error) error {
+	graphDir := path.Join(pth, "graph")
+	if !pathExists(graphDir) {
+		return nil
+	}
+
+	if err := a.migrateRepositories(pth); err != nil {
+		return err
+	}
+	if err := a.migrateImages(graphDir); err != nil {
+		return err
+	}
+	return a.migrateContainers(path.Join(pth, "containers"), setupInit)
+}
+
+// migrateRepositories renames <pth>/repositories to <pth>/repositories-aufs.
+// A missing source file is not an error.
+func (a *Driver) migrateRepositories(pth string) error {
+	source := path.Join(pth, "repositories")
+	err := os.Rename(source, source+"-aufs")
+	if err == nil || os.IsNotExist(err) {
+		return nil
+	}
+	return err
+}
+
+// migrateContainers walks the container directories under pth. For each one
+// that has an rw directory, it relocates rw to the driver's diff/<id> and, if
+// the id is not yet registered, creates an "<id>-init" layer on top of the
+// container's image, runs setupInit on that layer's path, and registers the
+// container id on top of the init layer.
+func (a *Driver) migrateContainers(pth string, setupInit func(p string) error) error {
+ fis, err := ioutil.ReadDir(pth)
+ if err != nil {
+ return err
+ }
+
+ for _, fi := range fis {
+ if id := fi.Name(); fi.IsDir() && pathExists(path.Join(pth, id, "rw")) {
+ if err := tryRelocate(path.Join(pth, id, "rw"), path.Join(a.rootPath(), "diff", id)); err != nil {
+ return err
+ }
+
+ if !a.Exists(id) {
+
+ // The local name shadows the metadata type within this block.
+ metadata, err := loadMetadata(path.Join(pth, id, "config.json"))
+ if err != nil {
+ return err
+ }
+
+ initID := fmt.Sprintf("%s-init", id)
+ if err := a.Create(initID, metadata.Image, ""); err != nil {
+ return err
+ }
+
+ // NOTE(review): this Get has no matching Put here — confirm that is intended.
+ initPath, err := a.Get(initID)
+ if err != nil {
+ return err
+ }
+ // setup init layer
+ if err := setupInit(initPath); err != nil {
+ return err
+ }
+
+ if err := a.Create(id, initID, ""); err != nil {
+ return err
+ }
+ }
+ }
+ }
+ return nil
+}
+
+// migrateImages loads the metadata of every image directory under pth that
+// has a layer directory, links each image to its parent's metadata, and
+// migrates them one by one with migrateImage.
+func (a *Driver) migrateImages(pth string) error {
+ fis, err := ioutil.ReadDir(pth)
+ if err != nil {
+ return err
+ }
+ var (
+ m = make(map[string]*metadata)
+ current *metadata
+ exists bool
+ )
+
+ for _, fi := range fis {
+ if id := fi.Name(); fi.IsDir() && pathExists(path.Join(pth, id, "layer")) {
+ if current, exists = m[id]; !exists {
+ current, err = loadMetadata(path.Join(pth, id, "json"))
+ if err != nil {
+ return err
+ }
+ m[id] = current
+ }
+ }
+ }
+
+ // A ParentID that was not loaded leaves parent nil.
+ for _, v := range m {
+ v.parent = m[v.ParentID]
+ }
+
+ // migrated is shared across calls so each image is handled only once.
+ migrated := make(map[string]bool)
+ for _, v := range m {
+ if err := a.migrateImage(v, pth, migrated); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// migrateImage migrates image m unless migrated already marks it done. The
+// parent (when known) is migrated first, then m's layer directory is
+// relocated to the driver's diff/<id> and the id is registered if it is not
+// already. Ids are recorded in migrated once they succeed.
+//
+// A failure while migrating the parent is returned instead of being ignored,
+// so a child is never registered on top of a parent that failed to migrate.
+func (a *Driver) migrateImage(m *metadata, pth string, migrated map[string]bool) error {
+	if migrated[m.ID] {
+		return nil
+	}
+	if m.parent != nil {
+		if err := a.migrateImage(m.parent, pth, migrated); err != nil {
+			return err
+		}
+	}
+	if err := tryRelocate(path.Join(pth, m.ID, "layer"), path.Join(a.rootPath(), "diff", m.ID)); err != nil {
+		return err
+	}
+	if !a.Exists(m.ID) {
+		if err := a.Create(m.ID, m.ParentID, ""); err != nil {
+			return err
+		}
+	}
+	migrated[m.ID] = true
+	return nil
+}
+
+// tryRelocate tries to rename the old path to the new path and, if the rename
+// fails, falls back to creating a symlink at newPath that points to oldPath.
+// An error is returned only when both the rename and the symlink fail.
+func tryRelocate(oldPath, newPath string) error {
+	info, err := os.Lstat(newPath)
+	if err != nil && !os.IsNotExist(err) {
+		return err
+	}
+
+	// If the destination is a symlink then an earlier relocation fell back to
+	// symlinking, so delete it and try the rename again.
+	isSymlink := info != nil && info.Mode()&os.ModeSymlink == os.ModeSymlink
+	if isSymlink {
+		if err := os.RemoveAll(newPath); err != nil {
+			return err
+		}
+	}
+
+	renameErr := os.Rename(oldPath, newPath)
+	if renameErr == nil {
+		return nil
+	}
+	if linkErr := os.Symlink(oldPath, newPath); linkErr != nil {
+		return fmt.Errorf("Unable to relocate %s to %s: Rename err %s Symlink err %s", oldPath, newPath, renameErr, linkErr)
+	}
+	return nil
+}
+
+// loadMetadata opens the JSON file at pth and decodes it into a metadata
+// value. It returns nil and the error when the file cannot be opened or
+// decoded.
+func loadMetadata(pth string) (*metadata, error) {
+	f, err := os.Open(pth)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	out := new(metadata)
+	if err := json.NewDecoder(f).Decode(out); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
diff --git a/runtime/graphdriver/aufs/mount.go b/runtime/graphdriver/aufs/mount.go
new file mode 100644
index 0000000000..1f1d98f809
--- /dev/null
+++ b/runtime/graphdriver/aufs/mount.go
@@ -0,0 +1,17 @@
+package aufs
+
+import (
+ "github.com/dotcloud/docker/utils"
+ "os/exec"
+ "syscall"
+)
+
+func Unmount(target string) error {
+ if err := exec.Command("auplink", target, "flush").Run(); err != nil {
+ utils.Errorf("[warning]: couldn't run auplink before unmount: %s", err)
+ }
+ if err := syscall.Unmount(target, 0); err != nil {
+ return err
+ }
+ return nil
+}
diff --git a/runtime/graphdriver/aufs/mount_linux.go b/runtime/graphdriver/aufs/mount_linux.go
new file mode 100644
index 0000000000..6082d9f240
--- /dev/null
+++ b/runtime/graphdriver/aufs/mount_linux.go
@@ -0,0 +1,11 @@
+// +build amd64
+
+package aufs
+
+import "syscall"
+
+const MsRemount = syscall.MS_REMOUNT
+
// mount forwards its arguments unchanged to syscall.Mount and returns that
// call's result.
func mount(source, target, fstype string, flags uintptr, data string) error {
	err := syscall.Mount(source, target, fstype, flags, data)
	return err
}
diff --git a/runtime/graphdriver/aufs/mount_unsupported.go b/runtime/graphdriver/aufs/mount_unsupported.go
new file mode 100644
index 0000000000..2735624112
--- /dev/null
+++ b/runtime/graphdriver/aufs/mount_unsupported.go
@@ -0,0 +1,11 @@
+// +build !linux !amd64
+
+package aufs
+
+import "errors"
+
+const MsRemount = 0
+
// mount is the stub used on targets that the build constraint of this file
// selects (anything that is not linux, or not amd64). It never mounts
// anything and always returns an error.
func mount(source string, target string, fstype string, flags uintptr, data string) (err error) {
	// The constraint is not limited to darwin, so the message must not name it.
	return errors.New("mount is not implemented on this platform")
}
diff --git a/runtime/graphdriver/btrfs/btrfs.go b/runtime/graphdriver/btrfs/btrfs.go
new file mode 100644
index 0000000000..2a94a4089f
--- /dev/null
+++ b/runtime/graphdriver/btrfs/btrfs.go
@@ -0,0 +1,213 @@
+// +build linux,amd64
+
+package btrfs
+
+/*
+#include <stdlib.h>
+#include <dirent.h>
+#include <btrfs/ioctl.h>
+*/
+import "C"
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "os"
+ "path"
+ "syscall"
+ "unsafe"
+)
+
+func init() {
+ graphdriver.Register("btrfs", Init)
+}
+
+func Init(home string) (graphdriver.Driver, error) {
+ rootdir := path.Dir(home)
+
+ var buf syscall.Statfs_t
+ if err := syscall.Statfs(rootdir, &buf); err != nil {
+ return nil, err
+ }
+
+ if buf.Type != 0x9123683E {
+ return nil, fmt.Errorf("%s is not a btrfs filesystem", rootdir)
+ }
+
+ return &Driver{
+ home: home,
+ }, nil
+}
+
// Driver is the btrfs graph driver. home is the directory under which the
// driver keeps its "subvolumes" directory.
type Driver struct {
	home string
}

// String returns the driver name, "btrfs".
func (d *Driver) String() string {
	const name = "btrfs"
	return name
}

// Status reports no status information: it always returns nil.
func (d *Driver) Status() [][2]string {
	var status [][2]string
	return status
}

// Cleanup has nothing to release and always returns nil.
func (d *Driver) Cleanup() error {
	var err error
	return err
}
+
+func free(p *C.char) {
+ C.free(unsafe.Pointer(p))
+}
+
+func openDir(path string) (*C.DIR, error) {
+ Cpath := C.CString(path)
+ defer free(Cpath)
+
+ dir := C.opendir(Cpath)
+ if dir == nil {
+ return nil, fmt.Errorf("Can't open dir")
+ }
+ return dir, nil
+}
+
+func closeDir(dir *C.DIR) {
+ if dir != nil {
+ C.closedir(dir)
+ }
+}
+
+func getDirFd(dir *C.DIR) uintptr {
+ return uintptr(C.dirfd(dir))
+}
+
+func subvolCreate(path, name string, mountLabel string) error {
+ dir, err := openDir(path)
+ if err != nil {
+ return err
+ }
+ defer closeDir(dir)
+
+ var args C.struct_btrfs_ioctl_vol_args
+ for i, c := range []byte(name) {
+ args.name[i] = C.char(c)
+ }
+
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SUBVOL_CREATE,
+ uintptr(unsafe.Pointer(&args)))
+ if errno != 0 {
+ return fmt.Errorf("Failed to create btrfs subvolume: %v", errno.Error())
+ }
+ return nil
+}
+
+func subvolSnapshot(src, dest, name string) error {
+ srcDir, err := openDir(src)
+ if err != nil {
+ return err
+ }
+ defer closeDir(srcDir)
+
+ destDir, err := openDir(dest)
+ if err != nil {
+ return err
+ }
+ defer closeDir(destDir)
+
+ var args C.struct_btrfs_ioctl_vol_args_v2
+ args.fd = C.__s64(getDirFd(srcDir))
+ for i, c := range []byte(name) {
+ args.name[i] = C.char(c)
+ }
+
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(destDir), C.BTRFS_IOC_SNAP_CREATE_V2,
+ uintptr(unsafe.Pointer(&args)))
+ if errno != 0 {
+ return fmt.Errorf("Failed to create btrfs snapshot: %v", errno.Error())
+ }
+ return nil
+}
+
+func subvolDelete(path, name string) error {
+ dir, err := openDir(path)
+ if err != nil {
+ return err
+ }
+ defer closeDir(dir)
+
+ var args C.struct_btrfs_ioctl_vol_args
+ for i, c := range []byte(name) {
+ args.name[i] = C.char(c)
+ }
+
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SNAP_DESTROY,
+ uintptr(unsafe.Pointer(&args)))
+ if errno != 0 {
+ return fmt.Errorf("Failed to destroy btrfs snapshot: %v", errno.Error())
+ }
+ return nil
+}
+
+func (d *Driver) subvolumesDir() string {
+ return path.Join(d.home, "subvolumes")
+}
+
+func (d *Driver) subvolumesDirId(id string) string {
+ return path.Join(d.subvolumesDir(), id)
+}
+
+func (d *Driver) Create(id string, parent string, mountLabel string) error {
+ subvolumes := path.Join(d.home, "subvolumes")
+ if err := os.MkdirAll(subvolumes, 0700); err != nil {
+ return err
+ }
+ if parent == "" {
+ if err := subvolCreate(subvolumes, id, mountLabel); err != nil {
+ return err
+ }
+ } else {
+ parentDir, err := d.Get(parent)
+ if err != nil {
+ return err
+ }
+ if err := subvolSnapshot(parentDir, subvolumes, id); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (d *Driver) Remove(id string) error {
+ dir := d.subvolumesDirId(id)
+ if _, err := os.Stat(dir); err != nil {
+ return err
+ }
+ if err := subvolDelete(d.subvolumesDir(), id); err != nil {
+ return err
+ }
+ return os.RemoveAll(dir)
+}
+
+func (d *Driver) Get(id string) (string, error) {
+ dir := d.subvolumesDirId(id)
+ st, err := os.Stat(dir)
+ if err != nil {
+ return "", err
+ }
+
+ if !st.IsDir() {
+ return "", fmt.Errorf("%s: not a directory", dir)
+ }
+
+ return dir, nil
+}
+
+func (d *Driver) Put(id string) {
+ // Get() creates no runtime resources (like e.g. mounts)
+ // so this doesn't need to do anything.
+}
+
+func (d *Driver) Exists(id string) bool {
+ dir := d.subvolumesDirId(id)
+ _, err := os.Stat(dir)
+ return err == nil
+}
diff --git a/runtime/graphdriver/btrfs/dummy_unsupported.go b/runtime/graphdriver/btrfs/dummy_unsupported.go
new file mode 100644
index 0000000000..6c44615763
--- /dev/null
+++ b/runtime/graphdriver/btrfs/dummy_unsupported.go
@@ -0,0 +1,3 @@
+// +build !linux !amd64
+
+package btrfs
diff --git a/runtime/graphdriver/devmapper/attach_loopback.go b/runtime/graphdriver/devmapper/attach_loopback.go
new file mode 100644
index 0000000000..23339076e8
--- /dev/null
+++ b/runtime/graphdriver/devmapper/attach_loopback.go
@@ -0,0 +1,126 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/utils"
+)
+
+func stringToLoopName(src string) [LoNameSize]uint8 {
+ var dst [LoNameSize]uint8
+ copy(dst[:], src[:])
+ return dst
+}
+
+func getNextFreeLoopbackIndex() (int, error) {
+ f, err := osOpenFile("/dev/loop-control", osORdOnly, 0644)
+ if err != nil {
+ return 0, err
+ }
+ defer f.Close()
+
+ index, err := ioctlLoopCtlGetFree(f.Fd())
+ if index < 0 {
+ index = 0
+ }
+ return index, err
+}
+
+func openNextAvailableLoopback(index int, sparseFile *osFile) (loopFile *osFile, err error) {
+ // Start looking for a free /dev/loop
+ for {
+ target := fmt.Sprintf("/dev/loop%d", index)
+ index++
+
+ fi, err := osStat(target)
+ if err != nil {
+ if osIsNotExist(err) {
+ utils.Errorf("There are no more loopback device available.")
+ }
+ return nil, ErrAttachLoopbackDevice
+ }
+
+ if fi.Mode()&osModeDevice != osModeDevice {
+ utils.Errorf("Loopback device %s is not a block device.", target)
+ continue
+ }
+
+ // OpenFile adds O_CLOEXEC
+ loopFile, err = osOpenFile(target, osORdWr, 0644)
+ if err != nil {
+ utils.Errorf("Error openning loopback device: %s", err)
+ return nil, ErrAttachLoopbackDevice
+ }
+
+ // Try to attach to the loop file
+ if err := ioctlLoopSetFd(loopFile.Fd(), sparseFile.Fd()); err != nil {
+ loopFile.Close()
+
+ // If the error is EBUSY, then try the next loopback
+ if err != sysEBusy {
+ utils.Errorf("Cannot set up loopback device %s: %s", target, err)
+ return nil, ErrAttachLoopbackDevice
+ }
+
+ // Otherwise, we keep going with the loop
+ continue
+ }
+ // In case of success, we finished. Break the loop.
+ break
+ }
+
+ // This can't happen, but let's be sure
+ if loopFile == nil {
+ utils.Errorf("Unreachable code reached! Error attaching %s to a loopback device.", sparseFile.Name())
+ return nil, ErrAttachLoopbackDevice
+ }
+
+ return loopFile, nil
+}
+
+// attachLoopDevice attaches the given sparse file to the next
+// available loopback device. It returns an opened *osFile.
+func attachLoopDevice(sparseName string) (loop *osFile, err error) {
+
+ // Try to retrieve the next available loopback device via syscall.
+ // If it fails, we discard error and start loopking for a
+ // loopback from index 0.
+ startIndex, err := getNextFreeLoopbackIndex()
+ if err != nil {
+ utils.Debugf("Error retrieving the next available loopback: %s", err)
+ }
+
+ // OpenFile adds O_CLOEXEC
+ sparseFile, err := osOpenFile(sparseName, osORdWr, 0644)
+ if err != nil {
+ utils.Errorf("Error openning sparse file %s: %s", sparseName, err)
+ return nil, ErrAttachLoopbackDevice
+ }
+ defer sparseFile.Close()
+
+ loopFile, err := openNextAvailableLoopback(startIndex, sparseFile)
+ if err != nil {
+ return nil, err
+ }
+
+ // Set the status of the loopback device
+ loopInfo := &LoopInfo64{
+ loFileName: stringToLoopName(loopFile.Name()),
+ loOffset: 0,
+ loFlags: LoFlagsAutoClear,
+ }
+
+ if err := ioctlLoopSetStatus64(loopFile.Fd(), loopInfo); err != nil {
+ utils.Errorf("Cannot set up loopback device info: %s", err)
+
+ // If the call failed, then free the loopback device
+ if err := ioctlLoopClrFd(loopFile.Fd()); err != nil {
+ utils.Errorf("Error while cleaning up the loopback device")
+ }
+ loopFile.Close()
+ return nil, ErrAttachLoopbackDevice
+ }
+
+ return loopFile, nil
+}
diff --git a/runtime/graphdriver/devmapper/deviceset.go b/runtime/graphdriver/devmapper/deviceset.go
new file mode 100644
index 0000000000..97d670a3d9
--- /dev/null
+++ b/runtime/graphdriver/devmapper/deviceset.go
@@ -0,0 +1,1122 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "github.com/dotcloud/docker/pkg/label"
+ "github.com/dotcloud/docker/utils"
+ "io"
+ "io/ioutil"
+ "path"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "sync"
+ "syscall"
+ "time"
+)
+
// Default sizes in bytes: 100 GiB for the data loopback file, 2 GiB for the
// metadata loopback file and 10 GiB for the base device filesystem.
var (
	DefaultDataLoopbackSize     int64  = 100 << 30
	DefaultMetaDataLoopbackSize int64  = 2 << 30
	DefaultBaseFsSize           uint64 = 10 << 30
)
+
// DevInfo describes one thin device of a DeviceSet. The exported fields other
// than Hash are persisted in the JSON metadata file; Hash is the key of the
// entry in MetaData.Devices and is filled in again when metadata is loaded.
type DevInfo struct {
	Hash          string `json:"-"`
	DeviceId      int    `json:"device_id"`
	Size          uint64 `json:"size"`
	TransactionId uint64 `json:"transaction_id"`
	Initialized   bool   `json:"initialized"`

	// Unexported fields are never encoded by encoding/json, so they carry
	// no struct tag.
	devices *DeviceSet

	mountCount int
	mountPath  string
	// A floating mount means one reference is not owned and
	// will be stolen by the next mount. This allows us to
	// avoid unmounting directly after creation before the
	// first get (since we need to mount to set up the device
	// a bit first).
	floating bool

	// The global DeviceSet lock guarantees that we serialize all
	// the calls to libdevmapper (which is not threadsafe), but we
	// sometimes release that lock while sleeping. In that case
	// this per-device lock is still held, protecting against
	// other accesses to the device that we're doing the wait on.
	//
	// WARNING: In order to avoid AB-BA deadlocks when releasing
	// the global lock while holding the per-device locks, all
	// device locks must be acquired *before* the global lock, and
	// multiple device locks should be acquired parent before child.
	lock sync.Mutex
}

// MetaData is the part of a DeviceSet that is written to the JSON metadata
// file. Devices is keyed by device hash; the base device uses the empty
// string.
type MetaData struct {
	// The tag spells out the key the encoder was already emitting: the
	// previous tag (json:devices, without quotes) was malformed and
	// therefore ignored, which made the field name the key.
	Devices     map[string]*DevInfo `json:"Devices"`
	devicesLock sync.Mutex          // Protects all read/writes to Devices map
}

// DeviceSet holds the state of one thin pool and the devices created in it.
type DeviceSet struct {
	MetaData
	sync.Mutex       // Protects Devices map and serializes calls into libdevmapper
	root             string // directory under which the "devicemapper" directory lives
	devicePrefix     string // common prefix of the pool name and all device names
	TransactionId    uint64 // transaction id last read from or written to the pool
	NewTransactionId uint64 // latest id handed out; pushed to the pool by saveMetadata
	nextFreeDevice   int    // next device id returned by allocateDeviceId
	sawBusy          bool   // set by log when a libdevmapper message contains "busy"
}
+
type (
	// DiskUsage is a used/total pair.
	DiskUsage struct {
		Used  uint64
		Total uint64
	}

	// Status describes the thin pool: its name, the two loopback paths,
	// data and metadata usage, and the sector size.
	Status struct {
		PoolName         string
		DataLoopback     string
		MetadataLoopback string
		Data             DiskUsage
		Metadata         DiskUsage
		SectorSize       uint64
	}

	// DevStatus describes a single device of the pool.
	DevStatus struct {
		DeviceId            int
		Size                uint64
		TransactionId       uint64
		SizeInSectors       uint64
		MappedSectors       uint64
		HighestMappedSector uint64
	}
)
+
// UnmountMode selects one of the unmount behaviours listed below.
type UnmountMode int

// Unmount modes, numbered 0, 1 and 2.
const (
	UnmountRegular UnmountMode = 0
	UnmountFloat   UnmountMode = 1
	UnmountSink    UnmountMode = 2
)
+
// getDevName returns name prefixed with "/dev/mapper/".
func getDevName(name string) string {
	return fmt.Sprintf("/dev/mapper/%s", name)
}
+
+func (info *DevInfo) Name() string {
+ hash := info.Hash
+ if hash == "" {
+ hash = "base"
+ }
+ return fmt.Sprintf("%s-%s", info.devices.devicePrefix, hash)
+}
+
+func (info *DevInfo) DevName() string {
+ return getDevName(info.Name())
+}
+
+func (devices *DeviceSet) loopbackDir() string {
+ return path.Join(devices.root, "devicemapper")
+}
+
+func (devices *DeviceSet) jsonFile() string {
+ return path.Join(devices.loopbackDir(), "json")
+}
+
+func (devices *DeviceSet) getPoolName() string {
+ return devices.devicePrefix + "-pool"
+}
+
+func (devices *DeviceSet) getPoolDevName() string {
+ return getDevName(devices.getPoolName())
+}
+
+func (devices *DeviceSet) hasImage(name string) bool {
+ dirname := devices.loopbackDir()
+ filename := path.Join(dirname, name)
+
+ _, err := osStat(filename)
+ return err == nil
+}
+
+// ensureImage creates a sparse file of <size> bytes at the path
+// <root>/devicemapper/<name>.
+// If the file already exists, it does nothing.
+// Either way it returns the full path.
+func (devices *DeviceSet) ensureImage(name string, size int64) (string, error) {
+ dirname := devices.loopbackDir()
+ filename := path.Join(dirname, name)
+
+ if err := osMkdirAll(dirname, 0700); err != nil && !osIsExist(err) {
+ return "", err
+ }
+
+ if _, err := osStat(filename); err != nil {
+ if !osIsNotExist(err) {
+ return "", err
+ }
+ utils.Debugf("Creating loopback file %s for device-manage use", filename)
+ file, err := osOpenFile(filename, osORdWr|osOCreate, 0600)
+ if err != nil {
+ return "", err
+ }
+ defer file.Close()
+
+ if err = file.Truncate(size); err != nil {
+ return "", err
+ }
+ }
+ return filename, nil
+}
+
+func (devices *DeviceSet) allocateDeviceId() int {
+ // TODO: Add smarter reuse of deleted devices
+ id := devices.nextFreeDevice
+ devices.nextFreeDevice = devices.nextFreeDevice + 1
+ return id
+}
+
+func (devices *DeviceSet) allocateTransactionId() uint64 {
+ devices.NewTransactionId = devices.NewTransactionId + 1
+ return devices.NewTransactionId
+}
+
+func (devices *DeviceSet) saveMetadata() error {
+ devices.devicesLock.Lock()
+ jsonData, err := json.Marshal(devices.MetaData)
+ devices.devicesLock.Unlock()
+ if err != nil {
+ return fmt.Errorf("Error encoding metadata to json: %s", err)
+ }
+ tmpFile, err := ioutil.TempFile(filepath.Dir(devices.jsonFile()), ".json")
+ if err != nil {
+ return fmt.Errorf("Error creating metadata file: %s", err)
+ }
+
+ n, err := tmpFile.Write(jsonData)
+ if err != nil {
+ return fmt.Errorf("Error writing metadata to %s: %s", tmpFile.Name(), err)
+ }
+ if n < len(jsonData) {
+ return io.ErrShortWrite
+ }
+ if err := tmpFile.Sync(); err != nil {
+ return fmt.Errorf("Error syncing metadata file %s: %s", tmpFile.Name(), err)
+ }
+ if err := tmpFile.Close(); err != nil {
+ return fmt.Errorf("Error closing metadata file %s: %s", tmpFile.Name(), err)
+ }
+ if err := osRename(tmpFile.Name(), devices.jsonFile()); err != nil {
+ return fmt.Errorf("Error committing metadata file %s: %s", tmpFile.Name(), err)
+ }
+
+ if devices.NewTransactionId != devices.TransactionId {
+ if err = setTransactionId(devices.getPoolDevName(), devices.TransactionId, devices.NewTransactionId); err != nil {
+ return fmt.Errorf("Error setting devmapper transition ID: %s", err)
+ }
+ devices.TransactionId = devices.NewTransactionId
+ }
+ return nil
+}
+
+func (devices *DeviceSet) lookupDevice(hash string) (*DevInfo, error) {
+ devices.devicesLock.Lock()
+ defer devices.devicesLock.Unlock()
+ info := devices.Devices[hash]
+ if info == nil {
+ return nil, fmt.Errorf("Unknown device %s", hash)
+ }
+ return info, nil
+}
+
+func (devices *DeviceSet) registerDevice(id int, hash string, size uint64) (*DevInfo, error) {
+ utils.Debugf("registerDevice(%v, %v)", id, hash)
+ info := &DevInfo{
+ Hash: hash,
+ DeviceId: id,
+ Size: size,
+ TransactionId: devices.allocateTransactionId(),
+ Initialized: false,
+ devices: devices,
+ }
+
+ devices.devicesLock.Lock()
+ devices.Devices[hash] = info
+ devices.devicesLock.Unlock()
+
+ if err := devices.saveMetadata(); err != nil {
+ // Try to remove unused device
+ devices.devicesLock.Lock()
+ delete(devices.Devices, hash)
+ devices.devicesLock.Unlock()
+ return nil, err
+ }
+
+ return info, nil
+}
+
+func (devices *DeviceSet) activateDeviceIfNeeded(info *DevInfo) error {
+ utils.Debugf("activateDeviceIfNeeded(%v)", info.Hash)
+
+ if devinfo, _ := getInfo(info.Name()); devinfo != nil && devinfo.Exists != 0 {
+ return nil
+ }
+
+ return activateDevice(devices.getPoolDevName(), info.Name(), info.DeviceId, info.Size)
+}
+
+func (devices *DeviceSet) createFilesystem(info *DevInfo) error {
+ devname := info.DevName()
+
+ err := execRun("mkfs.ext4", "-E", "discard,lazy_itable_init=0,lazy_journal_init=0", devname)
+ if err != nil {
+ err = execRun("mkfs.ext4", "-E", "discard,lazy_itable_init=0", devname)
+ }
+ if err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ return nil
+}
+
+func (devices *DeviceSet) loadMetaData() error {
+ utils.Debugf("loadMetadata()")
+ defer utils.Debugf("loadMetadata END")
+ _, _, _, params, err := getStatus(devices.getPoolName())
+ if err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+
+ if _, err := fmt.Sscanf(params, "%d", &devices.TransactionId); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ devices.NewTransactionId = devices.TransactionId
+
+ jsonData, err := ioutil.ReadFile(devices.jsonFile())
+ if err != nil && !osIsNotExist(err) {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+
+ devices.MetaData.Devices = make(map[string]*DevInfo)
+ if jsonData != nil {
+ if err := json.Unmarshal(jsonData, &devices.MetaData); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ }
+
+ for hash, d := range devices.Devices {
+ d.Hash = hash
+ d.devices = devices
+
+ if d.DeviceId >= devices.nextFreeDevice {
+ devices.nextFreeDevice = d.DeviceId + 1
+ }
+
+ // If the transaction id is larger than the actual one we lost the device due to some crash
+ if d.TransactionId > devices.TransactionId {
+ utils.Debugf("Removing lost device %s with id %d", hash, d.TransactionId)
+ delete(devices.Devices, hash)
+ }
+ }
+ return nil
+}
+
+func (devices *DeviceSet) setupBaseImage() error {
+ oldInfo, _ := devices.lookupDevice("")
+ if oldInfo != nil && oldInfo.Initialized {
+ return nil
+ }
+
+ if oldInfo != nil && !oldInfo.Initialized {
+ utils.Debugf("Removing uninitialized base image")
+ if err := devices.deleteDevice(oldInfo); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ }
+
+ utils.Debugf("Initializing base device-manager snapshot")
+
+ id := devices.allocateDeviceId()
+
+ // Create initial device
+ if err := createDevice(devices.getPoolDevName(), id); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+
+ utils.Debugf("Registering base device (id %v) with FS size %v", id, DefaultBaseFsSize)
+ info, err := devices.registerDevice(id, "", DefaultBaseFsSize)
+ if err != nil {
+ _ = deleteDevice(devices.getPoolDevName(), id)
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+
+ utils.Debugf("Creating filesystem on base device-manager snapshot")
+
+ if err = devices.activateDeviceIfNeeded(info); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+
+ if err := devices.createFilesystem(info); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+
+ info.Initialized = true
+ if err = devices.saveMetadata(); err != nil {
+ info.Initialized = false
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+
+ return nil
+}
+
+func setCloseOnExec(name string) {
+ if fileInfos, _ := ioutil.ReadDir("/proc/self/fd"); fileInfos != nil {
+ for _, i := range fileInfos {
+ link, _ := osReadlink(filepath.Join("/proc/self/fd", i.Name()))
+ if link == name {
+ fd, err := strconv.Atoi(i.Name())
+ if err == nil {
+ sysCloseOnExec(fd)
+ }
+ }
+ }
+ }
+}
+
+func (devices *DeviceSet) log(level int, file string, line int, dmError int, message string) {
+ if level >= 7 {
+ return // Ignore _LOG_DEBUG
+ }
+
+ if strings.Contains(message, "busy") {
+ devices.sawBusy = true
+ }
+
+ utils.Debugf("libdevmapper(%d): %s:%d (%d) %s", level, file, line, dmError, message)
+}
+
// major extracts bits 8-19 of device.
func major(device uint64) uint64 {
	shifted := device >> 8
	return shifted & 0xfff
}

// minor combines bits 0-7 of device with bits 20-31 moved down to bits 8-19.
func minor(device uint64) uint64 {
	low := device & 0xff
	high := (device >> 12) & 0xfff00
	return low | high
}
+
+func (devices *DeviceSet) ResizePool(size int64) error {
+ dirname := devices.loopbackDir()
+ datafilename := path.Join(dirname, "data")
+ metadatafilename := path.Join(dirname, "metadata")
+
+ datafile, err := osOpenFile(datafilename, osORdWr, 0)
+ if datafile == nil {
+ return err
+ }
+ defer datafile.Close()
+
+ fi, err := datafile.Stat()
+ if fi == nil {
+ return err
+ }
+
+ if fi.Size() > size {
+ return fmt.Errorf("Can't shrink file")
+ }
+
+ dataloopback := FindLoopDeviceFor(datafile)
+ if dataloopback == nil {
+ return fmt.Errorf("Unable to find loopback mount for: %s", datafilename)
+ }
+ defer dataloopback.Close()
+
+ metadatafile, err := osOpenFile(metadatafilename, osORdWr, 0)
+ if metadatafile == nil {
+ return err
+ }
+ defer metadatafile.Close()
+
+ metadataloopback := FindLoopDeviceFor(metadatafile)
+ if metadataloopback == nil {
+ return fmt.Errorf("Unable to find loopback mount for: %s", metadatafilename)
+ }
+ defer metadataloopback.Close()
+
+ // Grow loopback file
+ if err := datafile.Truncate(size); err != nil {
+ return fmt.Errorf("Unable to grow loopback file: %s", err)
+ }
+
+ // Reload size for loopback device
+ if err := LoopbackSetCapacity(dataloopback); err != nil {
+ return fmt.Errorf("Unable to update loopback capacity: %s", err)
+ }
+
+ // Suspend the pool
+ if err := suspendDevice(devices.getPoolName()); err != nil {
+ return fmt.Errorf("Unable to suspend pool: %s", err)
+ }
+
+ // Reload with the new block sizes
+ if err := reloadPool(devices.getPoolName(), dataloopback, metadataloopback); err != nil {
+ return fmt.Errorf("Unable to reload pool: %s", err)
+ }
+
+ // Resume the pool
+ if err := resumeDevice(devices.getPoolName()); err != nil {
+ return fmt.Errorf("Unable to resume pool: %s", err)
+ }
+
+ return nil
+}
+
+func (devices *DeviceSet) initDevmapper(doInit bool) error {
+ logInit(devices)
+
+ // Make sure the sparse images exist in <root>/devicemapper/data and
+ // <root>/devicemapper/metadata
+
+ hasData := devices.hasImage("data")
+ hasMetadata := devices.hasImage("metadata")
+
+ if !doInit && !hasData {
+ return errors.New("Loopback data file not found")
+ }
+
+ if !doInit && !hasMetadata {
+ return errors.New("Loopback metadata file not found")
+ }
+
+ createdLoopback := !hasData || !hasMetadata
+ data, err := devices.ensureImage("data", DefaultDataLoopbackSize)
+ if err != nil {
+ utils.Debugf("Error device ensureImage (data): %s\n", err)
+ return err
+ }
+ metadata, err := devices.ensureImage("metadata", DefaultMetaDataLoopbackSize)
+ if err != nil {
+ utils.Debugf("Error device ensureImage (metadata): %s\n", err)
+ return err
+ }
+
+ // Set the device prefix from the device id and inode of the docker root dir
+
+ st, err := osStat(devices.root)
+ if err != nil {
+ return fmt.Errorf("Error looking up dir %s: %s", devices.root, err)
+ }
+ sysSt := toSysStatT(st.Sys())
+ // "reg-" stands for "regular file".
+ // In the future we might use "dev-" for "device file", etc.
+ // docker-maj,min[-inode] stands for:
+ // - Managed by docker
+ // - The target of this device is at major <maj> and minor <min>
+ // - If <inode> is defined, use that file inside the device as a loopback image. Otherwise use the device itself.
+ devices.devicePrefix = fmt.Sprintf("docker-%d:%d-%d", major(sysSt.Dev), minor(sysSt.Dev), sysSt.Ino)
+ utils.Debugf("Generated prefix: %s", devices.devicePrefix)
+
+ // Check for the existence of the device <prefix>-pool
+ utils.Debugf("Checking for existence of the pool '%s'", devices.getPoolName())
+ info, err := getInfo(devices.getPoolName())
+ if info == nil {
+ utils.Debugf("Error device getInfo: %s", err)
+ return err
+ }
+
+ // It seems libdevmapper opens this without O_CLOEXEC, and go exec will not close files
+ // that are not Close-on-exec, and lxc-start will die if it inherits any unexpected files,
+ // so we add this badhack to make sure it closes itself
+ setCloseOnExec("/dev/mapper/control")
+
+ // If the pool doesn't exist, create it
+ if info.Exists == 0 {
+ utils.Debugf("Pool doesn't exist. Creating it.")
+
+ dataFile, err := attachLoopDevice(data)
+ if err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ defer dataFile.Close()
+
+ metadataFile, err := attachLoopDevice(metadata)
+ if err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ defer metadataFile.Close()
+
+ if err := createPool(devices.getPoolName(), dataFile, metadataFile); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ }
+
+ // If we didn't just create the data or metadata image, we need to
+ // load the metadata from the existing file.
+ if !createdLoopback {
+ if err = devices.loadMetaData(); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ }
+
+ // Setup the base image
+ if doInit {
+ if err := devices.setupBaseImage(); err != nil {
+ utils.Debugf("Error device setupBaseImage: %s\n", err)
+ return err
+ }
+ }
+
+ return nil
+}
+
+func (devices *DeviceSet) AddDevice(hash, baseHash string) error {
+ baseInfo, err := devices.lookupDevice(baseHash)
+ if err != nil {
+ return err
+ }
+
+ baseInfo.lock.Lock()
+ defer baseInfo.lock.Unlock()
+
+ devices.Lock()
+ defer devices.Unlock()
+
+ if info, _ := devices.lookupDevice(hash); info != nil {
+ return fmt.Errorf("device %s already exists", hash)
+ }
+
+ deviceId := devices.allocateDeviceId()
+
+ if err := devices.createSnapDevice(devices.getPoolDevName(), deviceId, baseInfo.Name(), baseInfo.DeviceId); err != nil {
+ utils.Debugf("Error creating snap device: %s\n", err)
+ return err
+ }
+
+ if _, err := devices.registerDevice(deviceId, hash, baseInfo.Size); err != nil {
+ deleteDevice(devices.getPoolDevName(), deviceId)
+ utils.Debugf("Error registering device: %s\n", err)
+ return err
+ }
+ return nil
+}
+
+func (devices *DeviceSet) deleteDevice(info *DevInfo) error {
+ // This is a workaround for the kernel not discarding block so
+ // on the thin pool when we remove a thinp device, so we do it
+ // manually
+ if err := devices.activateDeviceIfNeeded(info); err == nil {
+ if err := BlockDeviceDiscard(info.DevName()); err != nil {
+ utils.Debugf("Error discarding block on device: %s (ignoring)\n", err)
+ }
+ }
+
+ devinfo, _ := getInfo(info.Name())
+ if devinfo != nil && devinfo.Exists != 0 {
+ if err := devices.removeDeviceAndWait(info.Name()); err != nil {
+ utils.Debugf("Error removing device: %s\n", err)
+ return err
+ }
+ }
+
+ if info.Initialized {
+ info.Initialized = false
+ if err := devices.saveMetadata(); err != nil {
+ utils.Debugf("Error saving meta data: %s\n", err)
+ return err
+ }
+ }
+
+ if err := deleteDevice(devices.getPoolDevName(), info.DeviceId); err != nil {
+ utils.Debugf("Error deleting device: %s\n", err)
+ return err
+ }
+
+ devices.allocateTransactionId()
+ devices.devicesLock.Lock()
+ delete(devices.Devices, info.Hash)
+ devices.devicesLock.Unlock()
+
+ if err := devices.saveMetadata(); err != nil {
+ devices.devicesLock.Lock()
+ devices.Devices[info.Hash] = info
+ devices.devicesLock.Unlock()
+ utils.Debugf("Error saving meta data: %s\n", err)
+ return err
+ }
+
+ return nil
+}
+
+func (devices *DeviceSet) DeleteDevice(hash string) error {
+ info, err := devices.lookupDevice(hash)
+ if err != nil {
+ return err
+ }
+
+ info.lock.Lock()
+ defer info.lock.Unlock()
+
+ devices.Lock()
+ defer devices.Unlock()
+
+ return devices.deleteDevice(info)
+}
+
+func (devices *DeviceSet) deactivatePool() error {
+ utils.Debugf("[devmapper] deactivatePool()")
+ defer utils.Debugf("[devmapper] deactivatePool END")
+ devname := devices.getPoolDevName()
+ devinfo, err := getInfo(devname)
+ if err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ if devinfo.Exists != 0 {
+ return removeDevice(devname)
+ }
+
+ return nil
+}
+
+func (devices *DeviceSet) deactivateDevice(info *DevInfo) error {
+ utils.Debugf("[devmapper] deactivateDevice(%s)", info.Hash)
+ defer utils.Debugf("[devmapper] deactivateDevice END")
+
+ // Wait for the unmount to be effective,
+ // by watching the value of Info.OpenCount for the device
+ if err := devices.waitClose(info); err != nil {
+ utils.Errorf("Warning: error waiting for device %s to close: %s\n", info.Hash, err)
+ }
+
+ devinfo, err := getInfo(info.Name())
+ if err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ if devinfo.Exists != 0 {
+ if err := devices.removeDeviceAndWait(info.Name()); err != nil {
+ utils.Debugf("\n--->Err: %s\n", err)
+ return err
+ }
+ }
+
+ return nil
+}
+
+// Issues the underlying dm remove operation and then waits
+// for it to finish.
+func (devices *DeviceSet) removeDeviceAndWait(devname string) error {
+ var err error
+
+ for i := 0; i < 1000; i++ {
+ devices.sawBusy = false
+ err = removeDevice(devname)
+ if err == nil {
+ break
+ }
+ if !devices.sawBusy {
+ return err
+ }
+
+ // If we see EBUSY it may be a transient error,
+ // sleep a bit a retry a few times.
+ devices.Unlock()
+ time.Sleep(10 * time.Millisecond)
+ devices.Lock()
+ }
+ if err != nil {
+ return err
+ }
+
+ if err := devices.waitRemove(devname); err != nil {
+ return err
+ }
+ return nil
+}
+
+// waitRemove blocks until either:
+// a) the device registered at <device_set_prefix>-<hash> is removed,
+// or b) the 10 second timeout expires.
+func (devices *DeviceSet) waitRemove(devname string) error {
+ utils.Debugf("[deviceset %s] waitRemove(%s)", devices.devicePrefix, devname)
+ defer utils.Debugf("[deviceset %s] waitRemove(%s) END", devices.devicePrefix, devname)
+ i := 0
+ for ; i < 1000; i += 1 {
+ devinfo, err := getInfo(devname)
+ if err != nil {
+ // If there is an error we assume the device doesn't exist.
+ // The error might actually be something else, but we can't differentiate.
+ return nil
+ }
+ if i%100 == 0 {
+ utils.Debugf("Waiting for removal of %s: exists=%d", devname, devinfo.Exists)
+ }
+ if devinfo.Exists == 0 {
+ break
+ }
+
+ devices.Unlock()
+ time.Sleep(10 * time.Millisecond)
+ devices.Lock()
+ }
+ if i == 1000 {
+ return fmt.Errorf("Timeout while waiting for device %s to be removed", devname)
+ }
+ return nil
+}
+
+// waitClose blocks until either:
+// a) the device registered at <device_set_prefix>-<hash> is closed,
+// or b) the 10 second timeout expires.
+func (devices *DeviceSet) waitClose(info *DevInfo) error {
+ i := 0
+ for ; i < 1000; i += 1 {
+ devinfo, err := getInfo(info.Name())
+ if err != nil {
+ return err
+ }
+ if i%100 == 0 {
+ utils.Debugf("Waiting for unmount of %s: opencount=%d", info.Hash, devinfo.OpenCount)
+ }
+ if devinfo.OpenCount == 0 {
+ break
+ }
+ devices.Unlock()
+ time.Sleep(10 * time.Millisecond)
+ devices.Lock()
+ }
+ if i == 1000 {
+ return fmt.Errorf("Timeout while waiting for device %s to close", info.Hash)
+ }
+ return nil
+}
+
+func (devices *DeviceSet) Shutdown() error {
+
+ utils.Debugf("[deviceset %s] shutdown()", devices.devicePrefix)
+ utils.Debugf("[devmapper] Shutting down DeviceSet: %s", devices.root)
+ defer utils.Debugf("[deviceset %s] shutdown END", devices.devicePrefix)
+
+ var devs []*DevInfo
+
+ devices.devicesLock.Lock()
+ for _, info := range devices.Devices {
+ devs = append(devs, info)
+ }
+ devices.devicesLock.Unlock()
+
+ for _, info := range devs {
+ info.lock.Lock()
+ if info.mountCount > 0 {
+ // We use MNT_DETACH here in case it is still busy in some running
+ // container. This means it'll go away from the global scope directly,
+ // and the device will be released when that container dies.
+ if err := sysUnmount(info.mountPath, syscall.MNT_DETACH); err != nil {
+ utils.Debugf("Shutdown unmounting %s, error: %s\n", info.mountPath, err)
+ }
+
+ devices.Lock()
+ if err := devices.deactivateDevice(info); err != nil {
+ utils.Debugf("Shutdown deactivate %s , error: %s\n", info.Hash, err)
+ }
+ devices.Unlock()
+ }
+ info.lock.Unlock()
+ }
+
+ info, _ := devices.lookupDevice("")
+ if info != nil {
+ info.lock.Lock()
+ devices.Lock()
+ if err := devices.deactivateDevice(info); err != nil {
+ utils.Debugf("Shutdown deactivate base , error: %s\n", err)
+ }
+ devices.Unlock()
+ info.lock.Unlock()
+ }
+
+ devices.Lock()
+ if err := devices.deactivatePool(); err != nil {
+ utils.Debugf("Shutdown deactivate pool , error: %s\n", err)
+ }
+ devices.Unlock()
+
+ return nil
+}
+
+// MountDevice mounts the device identified by hash as ext4 on path, applying
+// mountLabel through label.FormatMountLabel.
+//
+// If the device is already mounted, path must equal the existing mount path;
+// the call then either takes over a floating reference or increments the
+// mount count, and nothing is mounted again. Otherwise the device is
+// activated if needed and mounted with the "discard" option, retrying
+// without "discard" when the first attempt fails with sysEInval. On success
+// the device is marked initialized and the metadata is saved.
+//
+// Lock order is per-device lock first, then the DeviceSet lock.
+func (devices *DeviceSet) MountDevice(hash, path string, mountLabel string) error {
+	info, err := devices.lookupDevice(hash)
+	if err != nil {
+		return err
+	}
+
+	info.lock.Lock()
+	defer info.lock.Unlock()
+
+	devices.Lock()
+	defer devices.Unlock()
+
+	if info.mountCount > 0 {
+		if path != info.mountPath {
+			return fmt.Errorf("Trying to mount devmapper device in multiple places (%s, %s)", info.mountPath, path)
+		}
+
+		if info.floating {
+			// Steal floating ref
+			info.floating = false
+		} else {
+			info.mountCount++
+		}
+		return nil
+	}
+
+	if err := devices.activateDeviceIfNeeded(info); err != nil {
+		return fmt.Errorf("Error activating devmapper device for '%s': %s", hash, err)
+	}
+
+	var flags uintptr = sysMsMgcVal
+
+	mountOptions := label.FormatMountLabel("discard", mountLabel)
+	err = sysMount(info.DevName(), path, "ext4", flags, mountOptions)
+	if err == sysEInval {
+		// Retry without "discard". The arguments keep the same order as the
+		// call above (options first, label second); the previous code passed
+		// them swapped, which handed the raw label to mount as its options.
+		mountOptions = label.FormatMountLabel("", mountLabel)
+		err = sysMount(info.DevName(), path, "ext4", flags, mountOptions)
+	}
+	if err != nil {
+		return fmt.Errorf("Error mounting '%s' on '%s': %s", info.DevName(), path, err)
+	}
+
+	info.mountCount = 1
+	info.mountPath = path
+	info.floating = false
+
+	return devices.setInitialized(info)
+}
+
+// UnmountDevice drops one mount reference on the device identified by hash.
+//
+// mode selects the behaviour:
+//   - UnmountFloat marks the current reference as floating and returns
+//     without unmounting; it is an error if the reference already floats.
+//   - UnmountSink is a no-op unless the reference is floating, in which case
+//     it proceeds like a regular unmount.
+//   - any other mode is a regular unmount.
+//
+// A regular unmount decrements the mount count; when it reaches zero the
+// filesystem is unmounted, the device deactivated and the mount path cleared.
+//
+// Lock order is per-device lock first, then the DeviceSet lock.
+func (devices *DeviceSet) UnmountDevice(hash string, mode UnmountMode) error {
+	utils.Debugf("[devmapper] UnmountDevice(hash=%s, mode=%d)", hash, mode)
+	defer utils.Debugf("[devmapper] UnmountDevice END")
+
+	info, err := devices.lookupDevice(hash)
+	if err != nil {
+		return err
+	}
+
+	info.lock.Lock()
+	defer info.lock.Unlock()
+
+	devices.Lock()
+	defer devices.Unlock()
+
+	switch mode {
+	case UnmountFloat:
+		if info.floating {
+			return fmt.Errorf("UnmountDevice: can't float floating reference %s\n", hash)
+		}
+		// Leave this reference floating
+		info.floating = true
+		return nil
+	case UnmountSink:
+		if !info.floating {
+			// Someone already sunk this
+			return nil
+		}
+		// Otherwise fall out of the switch and do a regular unmount.
+	}
+
+	if info.mountCount == 0 {
+		return fmt.Errorf("UnmountDevice: device not-mounted id %s\n", hash)
+	}
+
+	info.mountCount--
+	if info.mountCount > 0 {
+		// Other references remain; keep the filesystem mounted.
+		return nil
+	}
+
+	utils.Debugf("[devmapper] Unmount(%s)", info.mountPath)
+	unmountErr := sysUnmount(info.mountPath, 0)
+	if unmountErr != nil {
+		utils.Debugf("\n--->Err: %s\n", unmountErr)
+		return unmountErr
+	}
+	utils.Debugf("[devmapper] Unmount done")
+
+	if deactivateErr := devices.deactivateDevice(info); deactivateErr != nil {
+		return deactivateErr
+	}
+
+	info.mountPath = ""
+
+	return nil
+}
+
+// HasDevice reports whether lookupDevice finds a device for hash. The lookup
+// runs with the DeviceSet lock held and its error is ignored.
+func (devices *DeviceSet) HasDevice(hash string) bool {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if found, _ := devices.lookupDevice(hash); found == nil {
+		return false
+	}
+	return true
+}
+
+// HasInitializedDevice reports whether a device exists for hash and has its
+// Initialized flag set. The lookup runs with the DeviceSet lock held and its
+// error is ignored.
+func (devices *DeviceSet) HasInitializedDevice(hash string) bool {
+	devices.Lock()
+	defer devices.Unlock()
+
+	found, _ := devices.lookupDevice(hash)
+	if found == nil {
+		return false
+	}
+	return found.Initialized
+}
+
+// HasActivatedDevice reports whether the device identified by hash is known
+// and device-mapper currently reports it as existing. Errors from the lookup
+// and from getInfo are ignored and yield false.
+//
+// Lock order is per-device lock first, then the DeviceSet lock.
+func (devices *DeviceSet) HasActivatedDevice(hash string) bool {
+	info, _ := devices.lookupDevice(hash)
+	if info == nil {
+		return false
+	}
+
+	info.lock.Lock()
+	defer info.lock.Unlock()
+
+	devices.Lock()
+	defer devices.Unlock()
+
+	dmInfo, _ := getInfo(info.Name())
+	if dmInfo == nil {
+		return false
+	}
+	return dmInfo.Exists != 0
+}
+
+// setInitialized sets info.Initialized and saves the metadata. If saving
+// fails the flag is rolled back and the error is logged and returned.
+func (devices *DeviceSet) setInitialized(info *DevInfo) error {
+	info.Initialized = true
+
+	err := devices.saveMetadata()
+	if err == nil {
+		return nil
+	}
+
+	// Undo the in-memory change so it does not disagree with what was saved.
+	info.Initialized = false
+	utils.Debugf("\n--->Err: %s\n", err)
+	return err
+}
+
+// List returns the keys of the Devices map, in map iteration order. Both the
+// DeviceSet lock and devicesLock are held while the keys are copied.
+func (devices *DeviceSet) List() []string {
+	devices.Lock()
+	defer devices.Unlock()
+
+	devices.devicesLock.Lock()
+	hashes := make([]string, 0, len(devices.Devices))
+	for hash := range devices.Devices {
+		hashes = append(hashes, hash)
+	}
+	devices.devicesLock.Unlock()
+
+	return hashes
+}
+
+// deviceStatus queries getStatus for devName and returns the length it
+// reports as sizeInSectors, plus the two integers parsed from the status
+// params as mappedSectors and highestMappedSector. err is set when either
+// the query or the parse fails.
+func (devices *DeviceSet) deviceStatus(devName string) (sizeInSectors, mappedSectors, highestMappedSector uint64, err error) {
+	var params string
+	if _, sizeInSectors, _, params, err = getStatus(devName); err != nil {
+		return sizeInSectors, mappedSectors, highestMappedSector, err
+	}
+
+	_, err = fmt.Sscanf(params, "%d %d", &mappedSectors, &highestMappedSector)
+	return sizeInSectors, mappedSectors, highestMappedSector, err
+}
+
+// GetDeviceStatus returns a DevStatus for the device identified by hash. It
+// combines the stored DeviceId, Size and TransactionId with the sector
+// counters reported by deviceStatus. The device is activated first if
+// needed, since the counters are read from the live device.
+//
+// Lock order is per-device lock first, then the DeviceSet lock.
+func (devices *DeviceSet) GetDeviceStatus(hash string) (*DevStatus, error) {
+	info, err := devices.lookupDevice(hash)
+	if err != nil {
+		return nil, err
+	}
+
+	info.lock.Lock()
+	defer info.lock.Unlock()
+
+	devices.Lock()
+	defer devices.Unlock()
+
+	status := &DevStatus{
+		DeviceId:      info.DeviceId,
+		Size:          info.Size,
+		TransactionId: info.TransactionId,
+	}
+
+	if activateErr := devices.activateDeviceIfNeeded(info); activateErr != nil {
+		return nil, fmt.Errorf("Error activating devmapper device for '%s': %s", hash, activateErr)
+	}
+
+	size, mapped, highest, err := devices.deviceStatus(info.DevName())
+	if err != nil {
+		return nil, err
+	}
+	status.SizeInSectors = size
+	status.MappedSectors = mapped
+	status.HighestMappedSector = highest
+
+	return status, nil
+}
+
+// poolStatus queries getStatus for the pool device. The reported length is
+// returned as totalSizeInSectors and the status params are parsed as
+// "<transactionId> <metadataUsed>/<metadataTotal> <dataUsed>/<dataTotal>".
+// err is set when either the query or the parse fails.
+func (devices *DeviceSet) poolStatus() (totalSizeInSectors, transactionId, dataUsed, dataTotal, metadataUsed, metadataTotal uint64, err error) {
+	var params string
+	_, totalSizeInSectors, _, params, err = getStatus(devices.getPoolName())
+	if err != nil {
+		return
+	}
+	_, err = fmt.Sscanf(params, "%d %d/%d %d/%d", &transactionId, &metadataUsed, &metadataTotal, &dataUsed, &dataTotal)
+	return
+}
+
+// Status returns the pool name, the paths of the data and metadata loopback
+// files and, when the pool status can be read, usage figures in bytes.
+//
+// If poolStatus fails, only the name and path fields are filled in. If the
+// pool reports zero data blocks, the Data and SectorSize fields are left at
+// zero rather than dividing by zero.
+func (devices *DeviceSet) Status() *Status {
+	devices.Lock()
+	defer devices.Unlock()
+
+	status := &Status{}
+
+	status.PoolName = devices.getPoolName()
+	status.DataLoopback = path.Join(devices.loopbackDir(), "data")
+	status.MetadataLoopback = path.Join(devices.loopbackDir(), "metadata")
+
+	totalSizeInSectors, _, dataUsed, dataTotal, metadataUsed, metadataTotal, err := devices.poolStatus()
+	if err != nil {
+		return status
+	}
+
+	// metadata blocks are always 4k
+	status.Metadata.Used = metadataUsed * 4096
+	status.Metadata.Total = metadataTotal * 4096
+
+	// The block size is derived from the data block count, so it cannot be
+	// computed for a pool that reports no data blocks; the previous code
+	// panicked with an integer divide by zero in that case.
+	if dataTotal == 0 {
+		return status
+	}
+
+	// Convert from blocks to bytes (a sector is 512 bytes).
+	blockSizeInSectors := totalSizeInSectors / dataTotal
+
+	status.Data.Used = dataUsed * blockSizeInSectors * 512
+	status.Data.Total = dataTotal * blockSizeInSectors * 512
+
+	status.SectorSize = blockSizeInSectors * 512
+
+	return status
+}
+
+// NewDeviceSet points device-mapper at /dev, builds a DeviceSet rooted at
+// root with an empty device map and runs initDevmapper(doInit) on it. The
+// result of SetDevDir is not checked. It returns nil and the error if
+// initialization fails.
+func NewDeviceSet(root string, doInit bool) (*DeviceSet, error) {
+	SetDevDir("/dev")
+
+	set := &DeviceSet{
+		root:     root,
+		MetaData: MetaData{Devices: make(map[string]*DevInfo)},
+	}
+
+	err := set.initDevmapper(doInit)
+	if err != nil {
+		return nil, err
+	}
+	return set, nil
+}
diff --git a/runtime/graphdriver/devmapper/devmapper.go b/runtime/graphdriver/devmapper/devmapper.go
new file mode 100644
index 0000000000..7317118dcf
--- /dev/null
+++ b/runtime/graphdriver/devmapper/devmapper.go
@@ -0,0 +1,595 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "errors"
+ "fmt"
+ "github.com/dotcloud/docker/utils"
+ "runtime"
+ "syscall"
+)
+
+// DevmapperLogger is the sink for libdevmapper log messages. The logger
+// registered through logInit receives every message that
+// DevmapperLogCallback is called with: the log level, the source file and
+// line reported by the library, an errno-or-class value and the formatted
+// message text.
+type DevmapperLogger interface {
+ log(level int, file string, line int, dmError int, message string)
+}
+
+// Task types accepted by TaskCreate, which hands them to DmTaskCreate as a
+// plain int.
+// NOTE(review): the ordering presumably mirrors libdevmapper's DM_DEVICE_*
+// enum, in which case entries must not be reordered or inserted - confirm
+// against libdevmapper.h.
+const (
+ DeviceCreate TaskType = iota
+ DeviceReload
+ DeviceRemove
+ DeviceRemoveAll
+ DeviceSuspend
+ DeviceResume
+ DeviceInfo
+ DeviceDeps
+ DeviceRename
+ DeviceVersion
+ DeviceStatus
+ DeviceTable
+ DeviceWaitevent
+ DeviceList
+ DeviceClear
+ DeviceMknodes
+ DeviceListVersions
+ DeviceTargetMsg
+ DeviceSetGeometry
+)
+
+// Values accepted by Task.SetAddNode; any other value is rejected with
+// ErrInvalidAddNode. They follow the order of the dm_add_node_t enum
+// (DM_ADD_NODE_ON_RESUME, DM_ADD_NODE_ON_CREATE) quoted in devmapper_doc.go.
+const (
+ AddNodeOnResume AddNodeType = iota
+ AddNodeOnCreate
+)
+
+// Sentinel errors returned by the devmapper wrappers. Callers and tests
+// compare against these values by identity, so each failure site returns the
+// matching variable rather than a freshly built error.
+var (
+	ErrTaskRun                = errors.New("dm_task_run failed")
+	ErrTaskSetName            = errors.New("dm_task_set_name failed")
+	ErrTaskSetMessage         = errors.New("dm_task_set_message failed")
+	ErrTaskSetAddNode         = errors.New("dm_task_set_add_node failed")
+	ErrTaskSetRo              = errors.New("dm_task_set_ro failed")
+	ErrTaskAddTarget          = errors.New("dm_task_add_target failed")
+	ErrTaskSetSector          = errors.New("dm_task_set_sector failed")
+	ErrTaskGetInfo            = errors.New("dm_task_get_info failed")
+	ErrTaskSetCookie          = errors.New("dm_task_set_cookie failed")
+	ErrNilCookie              = errors.New("cookie ptr can't be nil")
+	ErrAttachLoopbackDevice   = errors.New("loopback mounting failed")
+	ErrGetBlockSize           = errors.New("Can't get block size")
+	ErrUdevWait               = errors.New("wait on udev cookie failed")
+	ErrSetDevDir              = errors.New("dm_set_dev_dir failed")
+	ErrGetLibraryVersion      = errors.New("dm_get_library_version failed")
+	ErrCreateRemoveTask       = errors.New("Can't create task of type DeviceRemove")
+	ErrRunRemoveDevice        = errors.New("running removeDevice failed")
+	ErrInvalidAddNode         = errors.New("Invalid AddNode type")
+	ErrGetLoopbackBackingFile = errors.New("Unable to get loopback backing file")
+	ErrLoopbackSetCapacity    = errors.New("Unable to set loopback capacity")
+)
+
+type (
+ // Task wraps a libdevmapper task handle. TaskCreate registers a
+ // finalizer that destroys the handle when the Task is collected.
+ Task struct {
+ unmanaged *CDmTask
+ }
+ // Info holds the device information filled in by DmTaskGetInfo.
+ // This file tests Exists and OpenCount against zero to decide whether
+ // a device is present and whether it is still open.
+ Info struct {
+ Exists int
+ Suspended int
+ LiveTable int
+ InactiveTable int
+ OpenCount int32
+ EventNr uint32
+ Major uint32
+ Minor uint32
+ ReadOnly int
+ TargetCount int32
+ }
+ // TaskType selects the operation a Task performs (see the Device*
+ // constants).
+ TaskType int
+ // AddNodeType is the argument type of Task.SetAddNode (see the
+ // AddNodeOn* constants).
+ AddNodeType int
+)
+
+// destroy releases the underlying libdevmapper task and clears the finalizer
+// set by TaskCreate so the handle is not destroyed a second time. It is a
+// no-op on a nil Task.
+func (t *Task) destroy() {
+	if t == nil {
+		return
+	}
+	DmTaskDestroy(t.unmanaged)
+	runtime.SetFinalizer(t, nil)
+}
+
+// TaskCreate creates a libdevmapper task of the given type. It returns nil
+// when DmTaskCreate fails. The returned Task has a finalizer that destroys
+// the underlying handle.
+func TaskCreate(tasktype TaskType) *Task {
+	handle := DmTaskCreate(int(tasktype))
+	if handle == nil {
+		return nil
+	}
+
+	created := &Task{unmanaged: handle}
+	runtime.SetFinalizer(created, (*Task).destroy)
+	return created
+}
+
+// Run executes the task through DmTaskRun. It returns ErrTaskRun unless the
+// call reports 1.
+func (t *Task) Run() error {
+	if DmTaskRun(t.unmanaged) == 1 {
+		return nil
+	}
+	return ErrTaskRun
+}
+
+// SetName sets the device name the task operates on. It returns
+// ErrTaskSetName unless DmTaskSetName reports 1.
+func (t *Task) SetName(name string) error {
+	if DmTaskSetName(t.unmanaged, name) == 1 {
+		return nil
+	}
+	return ErrTaskSetName
+}
+
+// SetMessage sets the target message carried by the task. It returns
+// ErrTaskSetMessage unless DmTaskSetMessage reports 1.
+func (t *Task) SetMessage(message string) error {
+	if DmTaskSetMessage(t.unmanaged, message) == 1 {
+		return nil
+	}
+	return ErrTaskSetMessage
+}
+
+// SetSector sets the sector the task refers to. It returns ErrTaskSetSector
+// unless DmTaskSetSector reports 1.
+func (t *Task) SetSector(sector uint64) error {
+	if DmTaskSetSector(t.unmanaged, sector) == 1 {
+		return nil
+	}
+	return ErrTaskSetSector
+}
+
+// SetCookie attaches a udev cookie to the task. cookie must not be nil
+// (ErrNilCookie); the value it points to is passed to DmTaskSetCookie
+// together with flags. It returns ErrTaskSetCookie unless that call
+// reports 1.
+func (t *Task) SetCookie(cookie *uint, flags uint16) error {
+	switch {
+	case cookie == nil:
+		return ErrNilCookie
+	case DmTaskSetCookie(t.unmanaged, cookie, flags) != 1:
+		return ErrTaskSetCookie
+	}
+	return nil
+}
+
+// SetAddNode sets the task's add-node mode. Values other than
+// AddNodeOnResume and AddNodeOnCreate are rejected with ErrInvalidAddNode
+// before libdevmapper is called. It returns ErrTaskSetAddNode unless
+// DmTaskSetAddNode reports 1.
+func (t *Task) SetAddNode(addNode AddNodeType) error {
+	switch addNode {
+	case AddNodeOnResume, AddNodeOnCreate:
+		// valid, fall through to the library call
+	default:
+		return ErrInvalidAddNode
+	}
+	if DmTaskSetAddNode(t.unmanaged, addNode) == 1 {
+		return nil
+	}
+	return ErrTaskSetAddNode
+}
+
+// SetRo marks the task read-only through DmTaskSetRo. It returns
+// ErrTaskSetRo unless the call reports 1.
+func (t *Task) SetRo() error {
+	if DmTaskSetRo(t.unmanaged) == 1 {
+		return nil
+	}
+	return ErrTaskSetRo
+}
+
+// AddTarget adds a table target of type ttype with the given start, size and
+// params to the task. It returns ErrTaskAddTarget unless DmTaskAddTarget
+// reports 1.
+func (t *Task) AddTarget(start, size uint64, ttype, params string) error {
+	if DmTaskAddTarget(t.unmanaged, start, size, ttype, params) == 1 {
+		return nil
+	}
+	return ErrTaskAddTarget
+}
+
+func (t *Task) GetInfo() (*Info, error) {
+ info := &Info{}
+ if res := DmTaskGetInfo(t.unmanaged, info); res != 1 {
+ return nil, ErrTaskGetInfo
+ }
+ return info, nil
+}
+
+// GetNextTarget fetches the table target that follows next (pass 0 for the
+// first one, as getStatus does). It returns the value DmGetNextTarget
+// returns as nextPtr, along with the target's start, length, type and
+// params.
+func (t *Task) GetNextTarget(next uintptr) (nextPtr uintptr, start uint64,
+	length uint64, targetType string, params string) {
+
+	// Make the call in its own statement. The previous code read start,
+	// length, targetType and params in the same return list as the call that
+	// fills them in through pointers, and Go does not specify whether those
+	// reads happen before or after the call.
+	nextPtr = DmGetNextTarget(t.unmanaged, next, &start, &length,
+		&targetType, &params)
+	return nextPtr, start, length, targetType, params
+}
+
+// getLoopbackBackingFile returns the loDevice and loInode fields reported by
+// the loop status ioctl on file. FindLoopDeviceFor compares them with a
+// file's stat device and inode. Any ioctl failure is logged and reported as
+// ErrGetLoopbackBackingFile.
+func getLoopbackBackingFile(file *osFile) (uint64, uint64, error) {
+	status, err := ioctlLoopGetStatus64(file.Fd())
+	if err == nil {
+		return status.loDevice, status.loInode, nil
+	}
+	utils.Errorf("Error get loopback backing file: %s\n", err)
+	return 0, 0, ErrGetLoopbackBackingFile
+}
+
+// LoopbackSetCapacity issues the loop set-capacity ioctl (with value 0) on
+// file. Any ioctl failure is logged and reported as ErrLoopbackSetCapacity.
+func LoopbackSetCapacity(file *osFile) error {
+	err := ioctlLoopSetCapacity(file.Fd(), 0)
+	if err == nil {
+		return nil
+	}
+	utils.Errorf("Error loopbackSetCapacity: %s", err)
+	return ErrLoopbackSetCapacity
+}
+
+// FindLoopDeviceFor scans /dev/loop0, /dev/loop1, ... for a loop device
+// whose backing file has the same device and inode as file. It returns the
+// opened loop device, or nil if file cannot be stat'ed or the scan reaches a
+// loop device path that does not exist.
+//
+// The caller owns the returned file; non-matching loop devices are closed
+// before moving on.
+func FindLoopDeviceFor(file *osFile) *osFile {
+	stat, err := file.Stat()
+	if err != nil {
+		return nil
+	}
+	sysStat := stat.Sys().(*sysStatT)
+	wantInode, wantDevice := sysStat.Ino, sysStat.Dev
+
+	for index := 0; ; index++ {
+		loopFile, openErr := osOpenFile(fmt.Sprintf("/dev/loop%d", index), osORdWr, 0)
+		if openErr != nil {
+			if osIsNotExist(openErr) {
+				// Ran past the last loop device without a match.
+				return nil
+			}
+			// Any other open error is skipped so the scan keeps going until
+			// the first path that does not exist.
+			continue
+		}
+
+		device, inode, statusErr := getLoopbackBackingFile(loopFile)
+		if statusErr == nil && device == wantDevice && inode == wantInode {
+			return loopFile
+		}
+		loopFile.Close()
+	}
+}
+
+// UdevWait waits on the given udev cookie through DmUdevWait. Unless the
+// call reports 1, the failure is logged and ErrUdevWait is returned.
+func UdevWait(cookie uint) error {
+	if DmUdevWait(cookie) == 1 {
+		return nil
+	}
+	utils.Debugf("Failed to wait on udev cookie %d", cookie)
+	return ErrUdevWait
+}
+
+// LogInitVerbose forwards level to DmLogInitVerbose.
+func LogInitVerbose(level int) {
+ DmLogInitVerbose(level)
+}
+
+// dmLogger is the logger DevmapperLogCallback forwards messages to; while it
+// is nil, messages are dropped.
+var dmLogger DevmapperLogger
+
+// logInit registers logger as the destination for libdevmapper messages and
+// then calls LogWithErrnoInit.
+func logInit(logger DevmapperLogger) {
+	dmLogger = logger
+	LogWithErrnoInit()
+}
+
+// SetDevDir passes dir to DmSetDevDir. Unless the call reports 1, the
+// failure is logged and ErrSetDevDir is returned.
+func SetDevDir(dir string) error {
+	if DmSetDevDir(dir) == 1 {
+		return nil
+	}
+	utils.Debugf("Error dm_set_dev_dir")
+	return ErrSetDevDir
+}
+
+// GetLibraryVersion returns the version string filled in by
+// DmGetLibraryVersion. It returns an empty string and ErrGetLibraryVersion
+// unless that call reports 1.
+func GetLibraryVersion() (string, error) {
+	var libVersion string
+	if DmGetLibraryVersion(&libVersion) == 1 {
+		return libVersion, nil
+	}
+	return "", ErrGetLibraryVersion
+}
+
+// RemoveDevice removes the device-mapper device called name; it is a
+// convenience helper for cleanup. It returns ErrCreateRemoveTask when the
+// task cannot be created, the SetName error when the name cannot be set, and
+// ErrRunRemoveDevice when running the task fails.
+func RemoveDevice(name string) error {
+	removeTask := TaskCreate(DeviceRemove)
+	if removeTask == nil {
+		return ErrCreateRemoveTask
+	}
+
+	nameErr := removeTask.SetName(name)
+	if nameErr != nil {
+		utils.Debugf("Can't set task name %s", name)
+		return nameErr
+	}
+
+	if removeTask.Run() != nil {
+		return ErrRunRemoveDevice
+	}
+	return nil
+}
+
+// GetBlockDeviceSize returns the size reported by the BLKGETSIZE64-style
+// ioctl helper for file; callers in this file divide it by 512 to get
+// sectors. Any ioctl failure is logged and reported as ErrGetBlockSize.
+func GetBlockDeviceSize(file *osFile) (uint64, error) {
+	rawSize, err := ioctlBlkGetSize64(file.Fd())
+	if err == nil {
+		return uint64(rawSize), nil
+	}
+	utils.Errorf("Error getblockdevicesize: %s", err)
+	return 0, ErrGetBlockSize
+}
+
+// BlockDeviceDiscard opens the block device at path read-write, issues a
+// discard ioctl covering the range from 0 to the device size, and then
+// syncs. Errors from opening the device, reading its size or the discard
+// itself are returned as-is.
+func BlockDeviceDiscard(path string) error {
+	dev, err := osOpenFile(path, osORdWr, 0)
+	if err != nil {
+		return err
+	}
+	defer dev.Close()
+
+	devSize, err := GetBlockDeviceSize(dev)
+	if err != nil {
+		return err
+	}
+
+	err = ioctlBlkDiscard(dev.Fd(), 0, devSize)
+	if err != nil {
+		return err
+	}
+
+	// Without this sync, removing the device right after the discard
+	// sometimes fails with EBUSY.
+	syscall.Sync()
+
+	return nil
+}
+
+// createPool is the programmatic equivalent of "dmsetup create": it creates
+// the device poolName with a single thin-pool target spanning the whole data
+// device, backed by metadataFile and dataFile. After the task runs it waits
+// on the udev cookie; the result of that wait is not checked.
+func createPool(poolName string, dataFile, metadataFile *osFile) error {
+	task, err := createTask(DeviceCreate, poolName)
+	if task == nil {
+		return err
+	}
+
+	dataSize, err := GetBlockDeviceSize(dataFile)
+	if err != nil {
+		return fmt.Errorf("Can't get data size")
+	}
+
+	// The target length is given in 512-byte sectors.
+	table := fmt.Sprintf("%s %s 128 32768 1 skip_block_zeroing", metadataFile.Name(), dataFile.Name())
+	if task.AddTarget(0, dataSize/512, "thin-pool", table) != nil {
+		return fmt.Errorf("Can't add target")
+	}
+
+	var cookie uint
+	if task.SetCookie(&cookie, 0) != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if task.Run() != nil {
+		return fmt.Errorf("Error running DeviceCreate (createPool)")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+// reloadPool reloads the table of the device poolName with a single
+// thin-pool target spanning the whole data device, backed by metadataFile
+// and dataFile.
+//
+// NOTE(review): unlike createPool, the table built here carries no
+// "1 skip_block_zeroing" feature argument - confirm whether that difference
+// is intended.
+func reloadPool(poolName string, dataFile, metadataFile *osFile) error {
+	task, err := createTask(DeviceReload, poolName)
+	if task == nil {
+		return err
+	}
+
+	size, err := GetBlockDeviceSize(dataFile)
+	if err != nil {
+		return fmt.Errorf("Can't get data size")
+	}
+
+	// The target length is given in 512-byte sectors.
+	params := metadataFile.Name() + " " + dataFile.Name() + " 128 32768"
+	if err := task.AddTarget(0, size/512, "thin-pool", params); err != nil {
+		return fmt.Errorf("Can't add target")
+	}
+
+	if err := task.Run(); err != nil {
+		// The message used to say DeviceCreate, which pointed at the wrong
+		// operation when a reload failed.
+		return fmt.Errorf("Error running DeviceReload (reloadPool)")
+	}
+
+	return nil
+}
+
+// createTask creates a task of type t and sets its device name. On failure
+// it returns a nil task together with an error saying which step failed;
+// callers in this file test the task for nil.
+func createTask(t TaskType, name string) (*Task, error) {
+	created := TaskCreate(t)
+	if created == nil {
+		return nil, fmt.Errorf("Can't create task of type %d", int(t))
+	}
+	if created.SetName(name) != nil {
+		return nil, fmt.Errorf("Can't set task name %s", name)
+	}
+	return created, nil
+}
+
+// getInfo runs a DeviceInfo task for the device called name and returns the
+// resulting Info. Errors from creating or running the task, or from reading
+// the info, are returned.
+func getInfo(name string) (*Info, error) {
+	infoTask, err := createTask(DeviceInfo, name)
+	if infoTask == nil {
+		return nil, err
+	}
+	if runErr := infoTask.Run(); runErr != nil {
+		return nil, runErr
+	}
+	return infoTask.GetInfo()
+}
+
+// getStatus runs a DeviceStatus task for the device called name and returns
+// the start, length, target type and params of its first target. Every
+// failure is logged and returned with zero values, including the case where
+// the device does not exist.
+func getStatus(name string) (uint64, uint64, string, string, error) {
+	statusTask, err := createTask(DeviceStatus, name)
+	if statusTask == nil {
+		utils.Debugf("getStatus: Error createTask: %s", err)
+		return 0, 0, "", "", err
+	}
+
+	if err = statusTask.Run(); err != nil {
+		utils.Debugf("getStatus: Error Run: %s", err)
+		return 0, 0, "", "", err
+	}
+
+	dmInfo, err := statusTask.GetInfo()
+	switch {
+	case err != nil:
+		utils.Debugf("getStatus: Error GetInfo: %s", err)
+		return 0, 0, "", "", err
+	case dmInfo.Exists == 0:
+		utils.Debugf("getStatus: Non existing device %s", name)
+		return 0, 0, "", "", fmt.Errorf("Non existing device %s", name)
+	}
+
+	// Only the first target is read; the pointer to the next one is dropped.
+	_, start, length, targetType, params := statusTask.GetNextTarget(0)
+	return start, length, targetType, params, nil
+}
+
+// setTransactionId sends the message "set_transaction_id <oldId> <newId>"
+// to the pool poolName at sector 0.
+func setTransactionId(poolName string, oldId uint64, newId uint64) error {
+	task, err := createTask(DeviceTargetMsg, poolName)
+	if task == nil {
+		return err
+	}
+
+	if task.SetSector(0) != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	message := fmt.Sprintf("set_transaction_id %d %d", oldId, newId)
+	if task.SetMessage(message) != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if task.Run() != nil {
+		return fmt.Errorf("Error running setTransactionId")
+	}
+	return nil
+}
+
+// suspendDevice runs a DeviceSuspend task on the device called name.
+func suspendDevice(name string) error {
+	task, err := createTask(DeviceSuspend, name)
+	if task == nil {
+		return err
+	}
+
+	runErr := task.Run()
+	if runErr == nil {
+		return nil
+	}
+	return fmt.Errorf("Error running DeviceSuspend: %s", runErr)
+}
+
+// resumeDevice runs a DeviceResume task on the device called name and then
+// waits on the udev cookie; the result of that wait is not checked.
+func resumeDevice(name string) error {
+	task, err := createTask(DeviceResume, name)
+	if task == nil {
+		return err
+	}
+
+	var cookie uint
+	if task.SetCookie(&cookie, 0) != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if task.Run() != nil {
+		return fmt.Errorf("Error running DeviceResume")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+// createDevice sends the message "create_thin <deviceId>" to the pool
+// poolName at sector 0.
+func createDevice(poolName string, deviceId int) error {
+	utils.Debugf("[devmapper] createDevice(poolName=%v, deviceId=%v)", poolName, deviceId)
+
+	task, err := createTask(DeviceTargetMsg, poolName)
+	if task == nil {
+		return err
+	}
+
+	if task.SetSector(0) != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	message := fmt.Sprintf("create_thin %d", deviceId)
+	if task.SetMessage(message) != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if task.Run() != nil {
+		return fmt.Errorf("Error running createDevice")
+	}
+	return nil
+}
+
+// deleteDevice sends the message "delete <deviceId>" to the pool poolName
+// at sector 0.
+func deleteDevice(poolName string, deviceId int) error {
+	task, err := createTask(DeviceTargetMsg, poolName)
+	if task == nil {
+		return err
+	}
+
+	if task.SetSector(0) != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	message := fmt.Sprintf("delete %d", deviceId)
+	if task.SetMessage(message) != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if task.Run() != nil {
+		return fmt.Errorf("Error running deleteDevice")
+	}
+	return nil
+}
+
+// removeDevice runs a DeviceRemove task on the device called name, logging
+// when it starts and ends.
+func removeDevice(name string) error {
+	utils.Debugf("[devmapper] removeDevice START")
+	defer utils.Debugf("[devmapper] removeDevice END")
+
+	task, err := createTask(DeviceRemove, name)
+	if task == nil {
+		return err
+	}
+	if task.Run() != nil {
+		return fmt.Errorf("Error running removeDevice")
+	}
+	return nil
+}
+
+// activateDevice creates the device called name with a single "thin" target
+// that maps deviceId from the pool poolName. size is divided by 512 to get
+// the target length in sectors. The node is added on create, and after the
+// task runs the udev cookie is waited on; the result of that wait is not
+// checked.
+func activateDevice(poolName string, name string, deviceId int, size uint64) error {
+	task, err := createTask(DeviceCreate, name)
+	if task == nil {
+		return err
+	}
+
+	table := fmt.Sprintf("%s %d", poolName, deviceId)
+	if task.AddTarget(0, size/512, "thin", table) != nil {
+		return fmt.Errorf("Can't add target")
+	}
+	if task.SetAddNode(AddNodeOnCreate) != nil {
+		return fmt.Errorf("Can't add node")
+	}
+
+	var cookie uint
+	if task.SetCookie(&cookie, 0) != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if task.Run() != nil {
+		return fmt.Errorf("Error running DeviceCreate (activateDevice)")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+// createSnapDevice sends "create_snap <deviceId> <baseDeviceId>" to the pool
+// poolName. If the base device baseName currently exists it is suspended
+// first and resumed afterwards. On a failure after the suspend, the base
+// device is resumed on a best-effort basis and the original failure is
+// returned; on success the result of the resume is returned.
+func (devices *DeviceSet) createSnapDevice(poolName string, deviceId int, baseName string, baseDeviceId int) error {
+	// A failing query is treated the same as "base device not active".
+	baseInfo, _ := getInfo(baseName)
+	doSuspend := baseInfo != nil && baseInfo.Exists != 0
+
+	if doSuspend {
+		if err := suspendDevice(baseName); err != nil {
+			return err
+		}
+	}
+
+	// sendCreateSnap holds every step that runs while the base device may be
+	// suspended, so the resume-on-failure handling lives in one place.
+	sendCreateSnap := func() error {
+		task, err := createTask(DeviceTargetMsg, poolName)
+		if task == nil {
+			return err
+		}
+		if task.SetSector(0) != nil {
+			return fmt.Errorf("Can't set sector")
+		}
+		if task.SetMessage(fmt.Sprintf("create_snap %d %d", deviceId, baseDeviceId)) != nil {
+			return fmt.Errorf("Can't set message")
+		}
+		if task.Run() != nil {
+			return fmt.Errorf("Error running DeviceCreate (createSnapDevice)")
+		}
+		return nil
+	}
+
+	if err := sendCreateSnap(); err != nil {
+		if doSuspend {
+			// Best effort: the snapshot failure is what gets reported.
+			resumeDevice(baseName)
+		}
+		return err
+	}
+
+	if doSuspend {
+		return resumeDevice(baseName)
+	}
+	return nil
+}
diff --git a/runtime/graphdriver/devmapper/devmapper_doc.go b/runtime/graphdriver/devmapper/devmapper_doc.go
new file mode 100644
index 0000000000..c1c3e3891b
--- /dev/null
+++ b/runtime/graphdriver/devmapper/devmapper_doc.go
@@ -0,0 +1,106 @@
+package devmapper
+
+// Definition of struct dm_task and sub structures (from lvm2)
+//
+// struct dm_ioctl {
+// /*
+// * The version number is made up of three parts:
+// * major - no backward or forward compatibility,
+// * minor - only backwards compatible,
+// * patch - both backwards and forwards compatible.
+// *
+// * All clients of the ioctl interface should fill in the
+// * version number of the interface that they were
+// * compiled with.
+// *
+// * All recognised ioctl commands (ie. those that don't
+// * return -ENOTTY) fill out this field, even if the
+// * command failed.
+// */
+// uint32_t version[3]; /* in/out */
+// uint32_t data_size; /* total size of data passed in
+// * including this struct */
+
+// uint32_t data_start; /* offset to start of data
+// * relative to start of this struct */
+
+// uint32_t target_count; /* in/out */
+// int32_t open_count; /* out */
+// uint32_t flags; /* in/out */
+
+// /*
+// * event_nr holds either the event number (input and output) or the
+// * udev cookie value (input only).
+// * The DM_DEV_WAIT ioctl takes an event number as input.
+// * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls
+// * use the field as a cookie to return in the DM_COOKIE
+// * variable with the uevents they issue.
+// * For output, the ioctls return the event number, not the cookie.
+// */
+// uint32_t event_nr; /* in/out */
+// uint32_t padding;
+
+// uint64_t dev; /* in/out */
+
+// char name[DM_NAME_LEN]; /* device name */
+// char uuid[DM_UUID_LEN]; /* unique identifier for
+// * the block device */
+// char data[7]; /* padding or data */
+// };
+
+// struct target {
+// uint64_t start;
+// uint64_t length;
+// char *type;
+// char *params;
+
+// struct target *next;
+// };
+
+// typedef enum {
+// DM_ADD_NODE_ON_RESUME, /* add /dev/mapper node with dmsetup resume */
+// DM_ADD_NODE_ON_CREATE /* add /dev/mapper node with dmsetup create */
+// } dm_add_node_t;
+
+// struct dm_task {
+// int type;
+// char *dev_name;
+// char *mangled_dev_name;
+
+// struct target *head, *tail;
+
+// int read_only;
+// uint32_t event_nr;
+// int major;
+// int minor;
+// int allow_default_major_fallback;
+// uid_t uid;
+// gid_t gid;
+// mode_t mode;
+// uint32_t read_ahead;
+// uint32_t read_ahead_flags;
+// union {
+// struct dm_ioctl *v4;
+// } dmi;
+// char *newname;
+// char *message;
+// char *geometry;
+// uint64_t sector;
+// int no_flush;
+// int no_open_count;
+// int skip_lockfs;
+// int query_inactive_table;
+// int suppress_identical_reload;
+// dm_add_node_t add_node;
+// uint64_t existing_table_size;
+// int cookie_set;
+// int new_uuid;
+// int secure_data;
+// int retry_remove;
+// int enable_checks;
+// int expected_errno;
+
+// char *uuid;
+// char *mangled_uuid;
+// };
+//
diff --git a/runtime/graphdriver/devmapper/devmapper_log.go b/runtime/graphdriver/devmapper/devmapper_log.go
new file mode 100644
index 0000000000..18dde7cca5
--- /dev/null
+++ b/runtime/graphdriver/devmapper/devmapper_log.go
@@ -0,0 +1,15 @@
+// +build linux,amd64
+
+package devmapper
+
+import "C"
+
+// Due to the way cgo works this has to be in a separate file, as devmapper.go has
+// definitions in the cgo block, which is incompatible with using "//export"
+
+// DevmapperLogCallback is the Go side of the C logging hook. It converts the
+// C arguments to Go values and hands them to the registered dmLogger;
+// messages are dropped while no logger is registered.
+//
+//export DevmapperLogCallback
+func DevmapperLogCallback(level C.int, file *C.char, line C.int, dm_errno_or_class C.int, message *C.char) {
+	if dmLogger == nil {
+		return
+	}
+	dmLogger.log(int(level), C.GoString(file), int(line), int(dm_errno_or_class), C.GoString(message))
+}
diff --git a/runtime/graphdriver/devmapper/devmapper_test.go b/runtime/graphdriver/devmapper/devmapper_test.go
new file mode 100644
index 0000000000..3ffa163ceb
--- /dev/null
+++ b/runtime/graphdriver/devmapper/devmapper_test.go
@@ -0,0 +1,287 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "testing"
+)
+
+// TestTaskCreate checks that TaskCreate succeeds for DeviceInfo and returns
+// nil once DmTaskCreate is replaced by a failing stub. Currently skipped.
+func TestTaskCreate(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+
+	// Success path
+	taskCreate(t, DeviceInfo)
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskCreate = dmTaskCreateFct }()
+	DmTaskCreate = dmTaskCreateFail
+	if TaskCreate(-1) != nil {
+		t.Fatalf("An error should have occured while creating an invalid task.")
+	}
+}
+
+// TestTaskRun checks that Run and GetInfo succeed on a DeviceInfo task, and
+// that they return ErrTaskRun and ErrTaskGetInfo once DmTaskRun is replaced
+// by a failing stub. Currently skipped.
+func TestTaskRun(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path: run the task, then make sure GetInfo works too.
+	err := task.Run()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err = task.GetInfo(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskRun = dmTaskRunFct }()
+	DmTaskRun = dmTaskRunFail
+
+	task = taskCreate(t, DeviceInfo)
+	if err = task.Run(); err != ErrTaskRun {
+		t.Fatalf("An error should have occured while running task.")
+	}
+	// GetInfo is expected to fail as well after a failed Run.
+	if _, err = task.GetInfo(); err != ErrTaskGetInfo {
+		t.Fatalf("GetInfo should fail if task.Run() failed.")
+	}
+}
+
+// TestTaskSetName checks that SetName succeeds normally and returns
+// ErrTaskSetName once DmTaskSetName is replaced by a failing stub.
+// Currently skipped.
+func TestTaskSetName(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path
+	err := task.SetName("test")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskSetName = dmTaskSetNameFct }()
+	DmTaskSetName = dmTaskSetNameFail
+
+	if err = task.SetName("test"); err != ErrTaskSetName {
+		t.Fatalf("An error should have occured while runnign SetName.")
+	}
+}
+
+// TestTaskSetMessage checks that SetMessage succeeds normally and returns
+// ErrTaskSetMessage once DmTaskSetMessage is replaced by a failing stub.
+// Currently skipped.
+func TestTaskSetMessage(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path
+	err := task.SetMessage("test")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskSetMessage = dmTaskSetMessageFct }()
+	DmTaskSetMessage = dmTaskSetMessageFail
+
+	if err = task.SetMessage("test"); err != ErrTaskSetMessage {
+		t.Fatalf("An error should have occured while runnign SetMessage.")
+	}
+}
+
+// TestTaskSetSector checks that SetSector succeeds normally and returns
+// ErrTaskSetSector once DmTaskSetSector is replaced by a failing stub.
+// Currently skipped.
+func TestTaskSetSector(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path
+	err := task.SetSector(128)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskSetSector = dmTaskSetSectorFct }()
+	DmTaskSetSector = dmTaskSetSectorFail
+
+	if err = task.SetSector(0); err != ErrTaskSetSector {
+		t.Fatalf("An error should have occured while running SetSector.")
+	}
+}
+
+// TestTaskSetCookie checks that SetCookie succeeds with a valid cookie
+// pointer, rejects a nil pointer with ErrNilCookie, and returns
+// ErrTaskSetCookie once DmTaskSetCookie is replaced by a failing stub.
+// Currently skipped.
+func TestTaskSetCookie(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+
+	var cookie uint
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path
+	err := task.SetCookie(&cookie, 0)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: a nil cookie is rejected before the library is called.
+	if err = task.SetCookie(nil, 0); err != ErrNilCookie {
+		t.Fatalf("An error should have occured while running SetCookie with nil cookie.")
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskSetCookie = dmTaskSetCookieFct }()
+	DmTaskSetCookie = dmTaskSetCookieFail
+
+	if err = task.SetCookie(&cookie, 0); err != ErrTaskSetCookie {
+		t.Fatalf("An error should have occured while running SetCookie.")
+	}
+}
+
+// TestTaskSetAddNode checks that SetAddNode accepts 0, rejects -1 with
+// ErrInvalidAddNode, and returns ErrTaskSetAddNode once DmTaskSetAddNode is
+// replaced by a failing stub. Currently skipped.
+func TestTaskSetAddNode(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path
+	err := task.SetAddNode(0)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: an out-of-range value is rejected before the library is
+	// called.
+	if err = task.SetAddNode(-1); err != ErrInvalidAddNode {
+		t.Fatalf("An error should have occured running SetAddNode with wrong node.")
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskSetAddNode = dmTaskSetAddNodeFct }()
+	DmTaskSetAddNode = dmTaskSetAddNodeFail
+
+	if err = task.SetAddNode(0); err != ErrTaskSetAddNode {
+		t.Fatalf("An error should have occured running SetAddNode.")
+	}
+}
+
+// TestTaskSetRo checks that SetRo succeeds normally and returns ErrTaskSetRo
+// once DmTaskSetRo is replaced by a failing stub. Currently skipped.
+func TestTaskSetRo(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path
+	err := task.SetRo()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskSetRo = dmTaskSetRoFct }()
+	DmTaskSetRo = dmTaskSetRoFail
+
+	if err = task.SetRo(); err != ErrTaskSetRo {
+		t.Fatalf("An error should have occured running SetRo.")
+	}
+}
+
+// TestTaskAddTarget checks that AddTarget succeeds normally and returns
+// ErrTaskAddTarget once DmTaskAddTarget is replaced by a failing stub.
+// Currently skipped.
+func TestTaskAddTarget(t *testing.T) {
+	t.Skip("FIXME: not a unit test")
+	task := taskCreate(t, DeviceInfo)
+
+	// Success path
+	err := task.AddTarget(0, 128, "thinp", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Failure path: restore the real implementation when the test ends.
+	defer func() { DmTaskAddTarget = dmTaskAddTargetFct }()
+	DmTaskAddTarget = dmTaskAddTargetFail
+
+	if err = task.AddTarget(0, 128, "thinp", ""); err != ErrTaskAddTarget {
+		t.Fatalf("An error should have occured running AddTarget.")
+	}
+}
+
+// func TestTaskGetInfo(t *testing.T) {
+// task := taskCreate(t, DeviceInfo)
+
+// // Test success
+// if _, err := task.GetInfo(); err != nil {
+// t.Fatal(err)
+// }
+
+// // Test failure
+// DmTaskGetInfo = dmTaskGetInfoFail
+// defer func() { DmTaskGetInfo = dmTaskGetInfoFct }()
+
+// if _, err := task.GetInfo(); err != ErrTaskGetInfo {
+// t.Fatalf("An error should have occured running GetInfo.")
+// }
+// }
+
+// func TestTaskGetNextTarget(t *testing.T) {
+// task := taskCreate(t, DeviceInfo)
+
+// if next, _, _, _, _ := task.GetNextTarget(0); next == 0 {
+// t.Fatalf("The next target should not be 0.")
+// }
+// }
+
+// taskCreate is a test helper that creates a task of the given type and
+// fails the test immediately if TaskCreate returns nil.
+func taskCreate(t *testing.T, taskType TaskType) *Task {
+	created := TaskCreate(taskType)
+	if created != nil {
+		return created
+	}
+	t.Fatalf("Error creating task")
+	return created
+}
+
+// Failure stubs. The tests in this file assign these to the package-level Dm*
+// function variables to force the error paths of the Task wrappers. The
+// wrappers treat any result other than 1 as a failure, so the int stubs
+// return -1; the remaining stubs return nil, zero or an empty string.
+
+// dmTaskCreateFail stands in for DmTaskCreate and always returns nil.
+func dmTaskCreateFail(t int) *CDmTask {
+ return nil
+}
+
+// dmTaskRunFail stands in for DmTaskRun and always fails.
+func dmTaskRunFail(task *CDmTask) int {
+ return -1
+}
+
+// dmTaskSetNameFail stands in for DmTaskSetName and always fails.
+func dmTaskSetNameFail(task *CDmTask, name string) int {
+ return -1
+}
+
+// dmTaskSetMessageFail stands in for DmTaskSetMessage and always fails.
+func dmTaskSetMessageFail(task *CDmTask, message string) int {
+ return -1
+}
+
+// dmTaskSetSectorFail stands in for DmTaskSetSector and always fails.
+func dmTaskSetSectorFail(task *CDmTask, sector uint64) int {
+ return -1
+}
+
+// dmTaskSetCookieFail stands in for DmTaskSetCookie and always fails.
+func dmTaskSetCookieFail(task *CDmTask, cookie *uint, flags uint16) int {
+ return -1
+}
+
+// dmTaskSetAddNodeFail stands in for DmTaskSetAddNode and always fails.
+func dmTaskSetAddNodeFail(task *CDmTask, addNode AddNodeType) int {
+ return -1
+}
+
+// dmTaskSetRoFail stands in for DmTaskSetRo and always fails.
+func dmTaskSetRoFail(task *CDmTask) int {
+ return -1
+}
+
+// dmTaskAddTargetFail stands in for DmTaskAddTarget and always fails.
+func dmTaskAddTargetFail(task *CDmTask,
+ start, size uint64, ttype, params string) int {
+ return -1
+}
+
+// dmTaskGetInfoFail has the signature of DmTaskGetInfo and always fails.
+func dmTaskGetInfoFail(task *CDmTask, info *Info) int {
+ return -1
+}
+
+// dmGetNextTargetFail has the signature of DmGetNextTarget and always
+// returns 0 without touching its output parameters.
+func dmGetNextTargetFail(task *CDmTask, next uintptr, start, length *uint64,
+ target, params *string) uintptr {
+ return 0
+}
+
+// dmAttachLoopDeviceFail always returns an empty string.
+// NOTE(review): no visible test assigns it - confirm it is still needed.
+func dmAttachLoopDeviceFail(filename string, fd *int) string {
+ return ""
+}
+
+// sysGetBlockSizeFail always returns the non-zero errno 1.
+// NOTE(review): no visible test assigns it - confirm it is still needed.
+func sysGetBlockSizeFail(fd uintptr, size *uint64) sysErrno {
+ return 1
+}
+
+// dmUdevWaitFail has the signature of DmUdevWait and always fails.
+func dmUdevWaitFail(cookie uint) int {
+ return -1
+}
+
+// dmSetDevDirFail has the signature of DmSetDevDir and always fails.
+func dmSetDevDirFail(dir string) int {
+ return -1
+}
+
+// dmGetLibraryVersionFail has the signature of DmGetLibraryVersion and
+// always fails without writing a version.
+func dmGetLibraryVersionFail(version *string) int {
+ return -1
+}
diff --git a/runtime/graphdriver/devmapper/devmapper_wrapper.go b/runtime/graphdriver/devmapper/devmapper_wrapper.go
new file mode 100644
index 0000000000..bf558affc8
--- /dev/null
+++ b/runtime/graphdriver/devmapper/devmapper_wrapper.go
@@ -0,0 +1,229 @@
+// +build linux,amd64
+
+package devmapper
+
+/*
+#cgo LDFLAGS: -L. -ldevmapper
+#include <libdevmapper.h>
+#include <linux/loop.h> // FIXME: present only for defines, maybe we can remove it?
+#include <linux/fs.h> // FIXME: present only for BLKGETSIZE64, maybe we can remove it?
+
+#ifndef LOOP_CTL_GET_FREE
+ #define LOOP_CTL_GET_FREE 0x4C82
+#endif
+
+#ifndef LO_FLAGS_PARTSCAN
+ #define LO_FLAGS_PARTSCAN 8
+#endif
+
+// FIXME: Can't we find a way to do the logging in pure Go?
+extern void DevmapperLogCallback(int level, char *file, int line, int dm_errno_or_class, char *str);
+
+static void log_cb(int level, const char *file, int line, int dm_errno_or_class, const char *f, ...)
+{
+ char buffer[256];
+ va_list ap;
+
+ va_start(ap, f);
+ vsnprintf(buffer, 256, f, ap);
+ va_end(ap);
+
+ DevmapperLogCallback(level, (char *)file, line, dm_errno_or_class, buffer);
+}
+
+static void log_with_errno_init()
+{
+ dm_log_with_errno_init(log_cb);
+}
+*/
+import "C"
+
+import (
+ "unsafe"
+)
+
+// Go-side names for the C structures this package exchanges with
+// libdevmapper and the loop driver.
+type (
+ // CDmTask is the Go name for libdevmapper's struct dm_task; the wrappers
+ // in this file convert *CDmTask back to *C.struct_dm_task before each call.
+ CDmTask C.struct_dm_task
+
+ // CLoopInfo64 is the Go name for the kernel's struct loop_info64.
+ CLoopInfo64 C.struct_loop_info64
+ // LoopInfo64 is a pure-Go structure whose address is handed to the
+ // LOOP_GET_STATUS64 / LOOP_SET_STATUS64 ioctls.
+ // NOTE(review): presumably the field order and sizes must stay in step
+ // with struct loop_info64 — confirm before reordering anything here.
+ LoopInfo64 struct {
+ loDevice uint64 /* ioctl r/o */
+ loInode uint64 /* ioctl r/o */
+ loRdevice uint64 /* ioctl r/o */
+ loOffset uint64
+ loSizelimit uint64 /* bytes, 0 == max available */
+ loNumber uint32 /* ioctl r/o */
+ loEncrypt_type uint32
+ loEncrypt_key_size uint32 /* ioctl w/o */
+ loFlags uint32 /* ioctl r/o */
+ loFileName [LoNameSize]uint8
+ loCryptName [LoNameSize]uint8
+ loEncryptKey [LoKeySize]uint8 /* ioctl w/o */
+ loInit [2]uint64
+ }
+)
+
+// IOCTL request numbers, taken from the C headers included in the cgo
+// preamble (LOOP_CTL_GET_FREE falls back to the preamble's own #define when
+// the header does not provide it).
+const (
+ BlkGetSize64 = C.BLKGETSIZE64
+ BlkDiscard = C.BLKDISCARD
+
+ LoopSetFd = C.LOOP_SET_FD
+ LoopCtlGetFree = C.LOOP_CTL_GET_FREE
+ LoopGetStatus64 = C.LOOP_GET_STATUS64
+ LoopSetStatus64 = C.LOOP_SET_STATUS64
+ LoopClrFd = C.LOOP_CLR_FD
+ LoopSetCapacity = C.LOOP_SET_CAPACITY
+)
+
+// Loop device flag bits and the array sizes used by LoopInfo64
+// (LO_FLAGS_PARTSCAN falls back to the preamble's own #define when missing).
+const (
+ LoFlagsAutoClear = C.LO_FLAGS_AUTOCLEAR
+ LoFlagsReadOnly = C.LO_FLAGS_READ_ONLY
+ LoFlagsPartScan = C.LO_FLAGS_PARTSCAN
+ LoKeySize = C.LO_KEY_SIZE
+ LoNameSize = C.LO_NAME_SIZE
+)
+
+// Hook variables through which the package reaches libdevmapper. Each one
+// starts out bound to the cgo-backed *Fct implementation in this file; being
+// plain variables, they can be reassigned, which the unit tests in this
+// package do to mock or forbid individual calls.
+var (
+ DmGetLibraryVersion = dmGetLibraryVersionFct
+ DmGetNextTarget = dmGetNextTargetFct
+ DmLogInitVerbose = dmLogInitVerboseFct
+ DmSetDevDir = dmSetDevDirFct
+ DmTaskAddTarget = dmTaskAddTargetFct
+ DmTaskCreate = dmTaskCreateFct
+ DmTaskDestroy = dmTaskDestroyFct
+ DmTaskGetInfo = dmTaskGetInfoFct
+ DmTaskRun = dmTaskRunFct
+ DmTaskSetAddNode = dmTaskSetAddNodeFct
+ DmTaskSetCookie = dmTaskSetCookieFct
+ DmTaskSetMessage = dmTaskSetMessageFct
+ DmTaskSetName = dmTaskSetNameFct
+ DmTaskSetRo = dmTaskSetRoFct
+ DmTaskSetSector = dmTaskSetSectorFct
+ DmUdevWait = dmUdevWaitFct
+ LogWithErrnoInit = logWithErrnoInitFct
+)
+
+// free releases a C string with C.free. The wrappers below use it to release
+// the buffers they obtain from C.CString.
+func free(p *C.char) {
+ C.free(unsafe.Pointer(p))
+}
+
+// dmTaskDestroyFct passes the task to libdevmapper's dm_task_destroy.
+func dmTaskDestroyFct(task *CDmTask) {
+	cTask := (*C.struct_dm_task)(task)
+	C.dm_task_destroy(cTask)
+}
+
+// dmTaskCreateFct calls dm_task_create with the given task type and returns
+// the resulting pointer as a *CDmTask.
+func dmTaskCreateFct(taskType int) *CDmTask {
+	cTask := C.dm_task_create(C.int(taskType))
+	return (*CDmTask)(cTask)
+}
+
+// dmTaskRunFct runs the task with dm_task_run and returns its result as an
+// int. The call uses cgo's two-value form; the errno half is deliberately
+// discarded.
+func dmTaskRunFct(task *CDmTask) int {
+	cTask := (*C.struct_dm_task)(task)
+	rc, _ := C.dm_task_run(cTask)
+	return int(rc)
+}
+
+// dmTaskSetNameFct copies name into a temporary C string, passes it to
+// dm_task_set_name and returns that call's result as an int. The C string is
+// released before returning.
+func dmTaskSetNameFct(task *CDmTask, name string) int {
+	cName := C.CString(name)
+	rc := C.dm_task_set_name((*C.struct_dm_task)(task), cName)
+	free(cName)
+	return int(rc)
+}
+
+// dmTaskSetMessageFct copies message into a temporary C string, passes it to
+// dm_task_set_message and returns that call's result as an int. The C string
+// is released before returning.
+func dmTaskSetMessageFct(task *CDmTask, message string) int {
+	cMessage := C.CString(message)
+	rc := C.dm_task_set_message((*C.struct_dm_task)(task), cMessage)
+	free(cMessage)
+	return int(rc)
+}
+
+// dmTaskSetSectorFct forwards sector to dm_task_set_sector and returns the
+// call's result as an int.
+func dmTaskSetSectorFct(task *CDmTask, sector uint64) int {
+	cTask := (*C.struct_dm_task)(task)
+	rc := C.dm_task_set_sector(cTask, C.uint64_t(sector))
+	return int(rc)
+}
+
+// dmTaskSetCookieFct calls dm_task_set_cookie with a C copy of *cookie and
+// the given flags. Whatever value the C copy holds after the call is written
+// back to *cookie, regardless of the call's result, which is returned as an
+// int.
+func dmTaskSetCookieFct(task *CDmTask, cookie *uint, flags uint16) int {
+	cCookie := C.uint32_t(*cookie)
+	rc := int(C.dm_task_set_cookie((*C.struct_dm_task)(task), &cCookie, C.uint16_t(flags)))
+	*cookie = uint(cCookie)
+	return rc
+}
+
+// dmTaskSetAddNodeFct forwards addNode to dm_task_set_add_node and returns
+// the call's result as an int.
+func dmTaskSetAddNodeFct(task *CDmTask, addNode AddNodeType) int {
+	cTask := (*C.struct_dm_task)(task)
+	rc := C.dm_task_set_add_node(cTask, C.dm_add_node_t(addNode))
+	return int(rc)
+}
+
+// dmTaskSetRoFct calls dm_task_set_ro on the task and returns the call's
+// result as an int.
+func dmTaskSetRoFct(task *CDmTask) int {
+	cTask := (*C.struct_dm_task)(task)
+	rc := C.dm_task_set_ro(cTask)
+	return int(rc)
+}
+
+// dmTaskAddTargetFct calls dm_task_add_target with the start/size pair and C
+// copies of the target type and parameter strings. Both temporary C strings
+// are released before the call's result is returned as an int.
+func dmTaskAddTargetFct(task *CDmTask,
+	start, size uint64, ttype, params string) int {
+
+	cType := C.CString(ttype)
+	cParams := C.CString(params)
+
+	rc := C.dm_task_add_target((*C.struct_dm_task)(task), C.uint64_t(start), C.uint64_t(size), cType, cParams)
+
+	free(cParams)
+	free(cType)
+	return int(rc)
+}
+
+func dmTaskGetInfoFct(task *CDmTask, info *Info) int {
+ Cinfo := C.struct_dm_info{}
+ defer func() {
+ info.Exists = int(Cinfo.exists)
+ info.Suspended = int(Cinfo.suspended)
+ info.LiveTable = int(Cinfo.live_table)
+ info.InactiveTable = int(Cinfo.inactive_table)
+ info.OpenCount = int32(Cinfo.open_count)
+ info.EventNr = uint32(Cinfo.event_nr)
+ info.Major = uint32(Cinfo.major)
+ info.Minor = uint32(Cinfo.minor)
+ info.ReadOnly = int(Cinfo.read_only)
+ info.TargetCount = int32(Cinfo.target_count)
+ }()
+ return int(C.dm_task_get_info((*C.struct_dm_task)(task), &Cinfo))
+}
+
+// dmGetNextTargetFct calls dm_get_next_target, passing next as the opaque
+// iteration pointer. After the call, the start, length, target type and
+// parameter values it produced are copied into the four out-parameters
+// (the strings via C.GoString). The pointer returned by the C call is handed
+// back as a uintptr.
+func dmGetNextTargetFct(task *CDmTask, next uintptr, start, length *uint64, target, params *string) uintptr {
+	var (
+		cStart, cLength  C.uint64_t
+		cTarget, cParams *C.char
+	)
+
+	following := C.dm_get_next_target((*C.struct_dm_task)(task), unsafe.Pointer(next), &cStart, &cLength, &cTarget, &cParams)
+	result := uintptr(following)
+
+	*start = uint64(cStart)
+	*length = uint64(cLength)
+	*target = C.GoString(cTarget)
+	*params = C.GoString(cParams)
+
+	return result
+}
+
+// dmUdevWaitFct passes cookie to dm_udev_wait and returns the call's result
+// as an int.
+func dmUdevWaitFct(cookie uint) int {
+	rc := C.dm_udev_wait(C.uint32_t(cookie))
+	return int(rc)
+}
+
+// dmLogInitVerboseFct passes level to libdevmapper's dm_log_init_verbose.
+func dmLogInitVerboseFct(level int) {
+	cLevel := C.int(level)
+	C.dm_log_init_verbose(cLevel)
+}
+
+// logWithErrnoInitFct calls the preamble's log_with_errno_init helper, which
+// registers log_cb with dm_log_with_errno_init so libdevmapper log lines are
+// routed to DevmapperLogCallback.
+func logWithErrnoInitFct() {
+ C.log_with_errno_init()
+}
+
+// dmSetDevDirFct copies dir into a temporary C string, passes it to
+// dm_set_dev_dir and returns the call's result as an int. The C string is
+// released before returning.
+func dmSetDevDirFct(dir string) int {
+	cDir := C.CString(dir)
+	rc := C.dm_set_dev_dir(cDir)
+	free(cDir)
+	return int(rc)
+}
+
+// dmGetLibraryVersionFct asks dm_get_library_version to write into a
+// zero-filled C buffer (128 is passed as the size), stores the buffer's
+// contents in *version as a Go string whatever the call returned, and
+// returns the call's result as an int.
+func dmGetLibraryVersionFct(version *string) int {
+	zeroed := string(make([]byte, 128))
+	buffer := C.CString(zeroed)
+	defer free(buffer)
+
+	rc := int(C.dm_get_library_version(buffer, 128))
+	*version = C.GoString(buffer)
+	return rc
+}
diff --git a/runtime/graphdriver/devmapper/driver.go b/runtime/graphdriver/devmapper/driver.go
new file mode 100644
index 0000000000..35fe883f26
--- /dev/null
+++ b/runtime/graphdriver/devmapper/driver.go
@@ -0,0 +1,142 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "github.com/dotcloud/docker/utils"
+ "io/ioutil"
+ "os"
+ "path"
+)
+
+// init registers this package's Init constructor with graphdriver under the
+// name "devicemapper".
+func init() {
+ graphdriver.Register("devicemapper", Init)
+}
+
+// Placeholder interfaces, to be replaced
+// at integration.
+
+// End of placeholder interfaces.
+
+// Driver is the devicemapper graph driver returned by Init. It embeds
+// *DeviceSet, so the DeviceSet's methods and fields are also reachable
+// directly on a Driver.
+type Driver struct {
+ *DeviceSet
+ home string // directory passed to Init; mount points are built as home/mnt/<id>
+}
+
+// Init builds a Driver rooted at home, backed by a DeviceSet obtained from
+// NewDeviceSet(home, true). Any error from NewDeviceSet is returned as is.
+// Init is a variable rather than a function so it can be reassigned; the
+// tests in this package replace it with a stub.
+var Init = func(home string) (graphdriver.Driver, error) {
+	devices, err := NewDeviceSet(home, true)
+	if err != nil {
+		return nil, err
+	}
+	return &Driver{DeviceSet: devices, home: home}, nil
+}
+
+// String returns the driver's name, "devicemapper" (the same name used when
+// registering with graphdriver in init).
+func (d *Driver) String() string {
+ return "devicemapper"
+}
+
+// Status reports the DeviceSet's status as label/value pairs: the pool name,
+// the data and metadata loopback files, and the used/total space of each,
+// formatted with one decimal after dividing the raw value by 1024*1024.
+func (d *Driver) Status() [][2]string {
+	s := d.DeviceSet.Status()
+
+	// inMb renders an already-converted byte count in the "N.N Mb" form.
+	inMb := func(bytes float64) string {
+		return fmt.Sprintf("%.1f Mb", bytes/(1024*1024))
+	}
+
+	return [][2]string{
+		{"Pool Name", s.PoolName},
+		{"Data file", s.DataLoopback},
+		{"Metadata file", s.MetadataLoopback},
+		{"Data Space Used", inMb(float64(s.Data.Used))},
+		{"Data Space Total", inMb(float64(s.Data.Total))},
+		{"Metadata Space Used", inMb(float64(s.Metadata.Used))},
+		{"Metadata Space Total", inMb(float64(s.Metadata.Total))},
+	}
+}
+
+// Cleanup shuts the driver down by delegating to DeviceSet.Shutdown and
+// returns whatever error that call reports.
+func (d *Driver) Cleanup() error {
+ return d.DeviceSet.Shutdown()
+}
+
+// Create adds a device for id (derived from parent) to the DeviceSet, mounts
+// it at home/mnt/<id>, makes sure a "rootfs" directory exists inside it,
+// writes the id into an "id" file next to it, and finally unmounts with
+// UnmountFloat. The first failing step's error is returned; nothing done by
+// earlier steps is rolled back here. mountLabel is not used.
+func (d *Driver) Create(id, parent string, mountLabel string) error {
+	err := d.DeviceSet.AddDevice(id, parent)
+	if err != nil {
+		return err
+	}
+
+	mountPoint := path.Join(d.home, "mnt", id)
+	err = d.mount(id, mountPoint)
+	if err != nil {
+		return err
+	}
+
+	rootfs := path.Join(mountPoint, "rootfs")
+	err = osMkdirAll(rootfs, 0755)
+	if err != nil && !osIsExist(err) {
+		return err
+	}
+
+	// Create an "id" file with the container/image id in it to help
+	// reconstruct this in case of later problems.
+	idFile := path.Join(mountPoint, "id")
+	err = ioutil.WriteFile(idFile, []byte(id), 0600)
+	if err != nil {
+		return err
+	}
+
+	// We float this reference so that the next Get call can steal it, so we
+	// don't have to unmount.
+	err = d.DeviceSet.UnmountDevice(id, UnmountFloat)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// Remove deletes the device for id and its mount point directory. A device
+// the DeviceSet does not know is treated as already removed and yields nil.
+// Otherwise the device is unmounted with UnmountSink, deleted, and
+// home/mnt/<id> is removed recursively (a missing directory is not an error).
+func (d *Driver) Remove(id string) error {
+	// Removing a non-existing device is a no-op. This lets container removal
+	// make progress when the underlying device has already gone away due to
+	// earlier errors.
+	if known := d.DeviceSet.HasDevice(id); !known {
+		return nil
+	}
+
+	// Sink the float from create in case no Get() call was made.
+	err := d.DeviceSet.UnmountDevice(id, UnmountSink)
+	if err != nil {
+		return err
+	}
+
+	// This assumes the device has been properly Get/Put:ed and thus is
+	// unmounted.
+	err = d.DeviceSet.DeleteDevice(id)
+	if err != nil {
+		return err
+	}
+
+	mountPoint := path.Join(d.home, "mnt", id)
+	err = os.RemoveAll(mountPoint)
+	if err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return nil
+}
+
+// Get mounts the device for id at home/mnt/<id> and returns the path of the
+// "rootfs" directory inside that mount point. On a mount error it returns
+// the empty string together with the error.
+func (d *Driver) Get(id string) (string, error) {
+	mountPoint := path.Join(d.home, "mnt", id)
+
+	err := d.mount(id, mountPoint)
+	if err != nil {
+		return "", err
+	}
+
+	rootfs := path.Join(mountPoint, "rootfs")
+	return rootfs, nil
+}
+
+// Put unmounts the device for id with UnmountRegular. It has no return
+// value, so an unmount failure is only logged as a warning.
+func (d *Driver) Put(id string) {
+	err := d.DeviceSet.UnmountDevice(id, UnmountRegular)
+	if err == nil {
+		return
+	}
+	utils.Errorf("Warning: error unmounting device %s: %s\n", id, err)
+}
+
+// mount makes sure mountPoint exists as a directory (mode 0755) and then
+// asks the DeviceSet to mount the device for id there, passing an empty
+// third argument. An "already exists" error from the mkdir step is ignored.
+func (d *Driver) mount(id, mountPoint string) error {
+	mkErr := osMkdirAll(mountPoint, 0755)
+	if mkErr != nil && !osIsExist(mkErr) {
+		return mkErr
+	}
+	return d.DeviceSet.MountDevice(id, mountPoint, "")
+}
+
+// Exists reports whether the Devices map (reached through the embedded
+// DeviceSet) holds a non-nil entry for id.
+// NOTE(review): the map is read directly with no locking visible here, while
+// Remove goes through DeviceSet.HasDevice — confirm this is safe against
+// concurrent mutation of Devices.
+func (d *Driver) Exists(id string) bool {
+ return d.Devices[id] != nil
+}
diff --git a/runtime/graphdriver/devmapper/driver_test.go b/runtime/graphdriver/devmapper/driver_test.go
new file mode 100644
index 0000000000..4ca72db0ca
--- /dev/null
+++ b/runtime/graphdriver/devmapper/driver_test.go
@@ -0,0 +1,886 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "io/ioutil"
+ "path"
+ "runtime"
+ "strings"
+ "syscall"
+ "testing"
+)
+
+// init overrides the package-level default sizes for the whole test binary.
+func init() {
+ // Reduce the size of the base fs and loopback files for the tests
+ DefaultDataLoopbackSize = 300 * 1024 * 1024
+ DefaultMetaDataLoopbackSize = 200 * 1024 * 1024
+ DefaultBaseFsSize = 300 * 1024 * 1024
+}
+
+// denyAllDevmapper mocks all calls to libdevmapper in the unit tests, and denies them by default.
+// Every Dm* hook (plus LogWithErrnoInit) is replaced by a function that
+// panics with the hook's name, so an unexpected call fails loudly.
+func denyAllDevmapper() {
+ // Hijack all calls to libdevmapper with default panics.
+ // Authorized calls are selectively hijacked in each test.
+ DmTaskCreate = func(t int) *CDmTask {
+ panic("DmTaskCreate: this method should not be called here")
+ }
+ DmTaskRun = func(task *CDmTask) int {
+ panic("DmTaskRun: this method should not be called here")
+ }
+ DmTaskSetName = func(task *CDmTask, name string) int {
+ panic("DmTaskSetName: this method should not be called here")
+ }
+ DmTaskSetMessage = func(task *CDmTask, message string) int {
+ panic("DmTaskSetMessage: this method should not be called here")
+ }
+ DmTaskSetSector = func(task *CDmTask, sector uint64) int {
+ panic("DmTaskSetSector: this method should not be called here")
+ }
+ DmTaskSetCookie = func(task *CDmTask, cookie *uint, flags uint16) int {
+ panic("DmTaskSetCookie: this method should not be called here")
+ }
+ DmTaskSetAddNode = func(task *CDmTask, addNode AddNodeType) int {
+ panic("DmTaskSetAddNode: this method should not be called here")
+ }
+ DmTaskSetRo = func(task *CDmTask) int {
+ panic("DmTaskSetRo: this method should not be called here")
+ }
+ DmTaskAddTarget = func(task *CDmTask, start, size uint64, ttype, params string) int {
+ panic("DmTaskAddTarget: this method should not be called here")
+ }
+ DmTaskGetInfo = func(task *CDmTask, info *Info) int {
+ panic("DmTaskGetInfo: this method should not be called here")
+ }
+ DmGetNextTarget = func(task *CDmTask, next uintptr, start, length *uint64, target, params *string) uintptr {
+ panic("DmGetNextTarget: this method should not be called here")
+ }
+ DmUdevWait = func(cookie uint) int {
+ panic("DmUdevWait: this method should not be called here")
+ }
+ DmSetDevDir = func(dir string) int {
+ panic("DmSetDevDir: this method should not be called here")
+ }
+ DmGetLibraryVersion = func(version *string) int {
+ panic("DmGetLibraryVersion: this method should not be called here")
+ }
+ DmLogInitVerbose = func(level int) {
+ panic("DmLogInitVerbose: this method should not be called here")
+ }
+ DmTaskDestroy = func(task *CDmTask) {
+ panic("DmTaskDestroy: this method should not be called here")
+ }
+ LogWithErrnoInit = func() {
+ panic("LogWithErrnoInit: this method should not be called here")
+ }
+}
+
+// denyAllSyscall replaces the sysMount, sysUnmount, sysCloseOnExec and
+// sysSyscall hooks, as well as Mounted, with functions that panic with the
+// hook's name. The os* and execRun hooks listed in the trailing comment are
+// left untouched.
+func denyAllSyscall() {
+ sysMount = func(source, target, fstype string, flags uintptr, data string) (err error) {
+ panic("sysMount: this method should not be called here")
+ }
+ sysUnmount = func(target string, flags int) (err error) {
+ panic("sysUnmount: this method should not be called here")
+ }
+ sysCloseOnExec = func(fd int) {
+ panic("sysCloseOnExec: this method should not be called here")
+ }
+ sysSyscall = func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
+ panic("sysSyscall: this method should not be called here")
+ }
+ // Not a syscall, but forbidding it here anyway
+ Mounted = func(mnt string) (bool, error) {
+ panic("devmapper.Mounted: this method should not be called here")
+ }
+ // osOpenFile = os.OpenFile
+ // osNewFile = os.NewFile
+ // osCreate = os.Create
+ // osStat = os.Stat
+ // osIsNotExist = os.IsNotExist
+ // osIsExist = os.IsExist
+ // osMkdirAll = os.MkdirAll
+ // osRemoveAll = os.RemoveAll
+ // osRename = os.Rename
+ // osReadlink = os.Readlink
+
+ // execRun = func(name string, args ...string) error {
+ // return exec.Command(name, args...).Run()
+ // }
+}
+
+// mkTestDirectory creates a fresh temporary directory whose name starts with
+// "docker-test-devmapper-" in the default temp location and returns its
+// path. The test is aborted if the directory cannot be created.
+func mkTestDirectory(t *testing.T) string {
+	tmp, err := ioutil.TempDir("", "docker-test-devmapper-")
+	if err == nil {
+		return tmp
+	}
+	t.Fatal(err)
+	return "" // not reached: Fatal stops the test goroutine
+}
+
+// newDriver creates a temporary home directory, runs Init on it and returns
+// the result asserted to *Driver. The test is aborted if Init fails.
+func newDriver(t *testing.T) *Driver {
+	created, err := Init(mkTestDirectory(t))
+	if err != nil {
+		t.Fatal(err)
+	}
+	drv := created.(*Driver)
+	return drv
+}
+
+// cleanup shuts the driver down and removes its home directory. Both steps
+// are best effort: the results of Cleanup and osRemoveAll are discarded.
+func cleanup(d *Driver) {
+ d.Cleanup()
+ osRemoveAll(d.home)
+}
+
+// Set records names (for example the hooks a test has seen called) as map
+// keys.
+type Set map[string]bool
+
+// Assert checks that the set holds exactly the given names and empties it as
+// it goes. A name written with a leading "?" is optional: it may be absent.
+// Every other name must be present, and any key left over once all names
+// have been consumed fails the test.
+func (r Set) Assert(t *testing.T, names ...string) {
+	for _, name := range names {
+		optional := strings.HasPrefix(name, "?")
+		if optional {
+			name = name[1:]
+		}
+		if _, found := r[name]; !found && !optional {
+			t.Fatalf("Key not set: %s", name)
+		}
+		delete(r, name)
+	}
+	if len(r) > 0 {
+		t.Fatalf("Unexpected keys: %v", r)
+	}
+}
+
+// TestInit runs Init against fully mocked libdevmapper hooks and checks
+// which hooks were called, which task types were created and which messages
+// were sent. Every mock validates its arguments; a mismatch aborts the test
+// with a message naming the hook that was actually called.
+func TestInit(t *testing.T) {
+	var (
+		calls        = make(Set)
+		taskMessages = make(Set)
+		taskTypes    = make(Set)
+		home         = mkTestDirectory(t)
+	)
+	defer osRemoveAll(home)
+
+	// Put all tests in a function to make sure the garbage collection will
+	// occur.
+	func() {
+		denyAllDevmapper()
+		DmSetDevDir = func(dir string) int {
+			calls["DmSetDevDir"] = true
+			expectedDir := "/dev"
+			if dir != expectedDir {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmSetDevDir(%v)\nReceived: DmSetDevDir(%v)\n", expectedDir, dir)
+			}
+			return 0
+		}
+		LogWithErrnoInit = func() {
+			calls["DmLogWithErrnoInit"] = true
+		}
+		// Every mocked DmTaskCreate hands out the address of this one task,
+		// so the other mocks can check they were given it.
+		var task1 CDmTask
+		DmTaskCreate = func(taskType int) *CDmTask {
+			calls["DmTaskCreate"] = true
+			taskTypes[fmt.Sprintf("%d", taskType)] = true
+			return &task1
+		}
+		DmTaskSetName = func(task *CDmTask, name string) int {
+			calls["DmTaskSetName"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetName(%v)\nReceived: DmTaskSetName(%v)\n", expectedTask, task)
+			}
+			// FIXME: use Set.AssertRegexp()
+			if !strings.HasPrefix(name, "docker-") && !strings.HasPrefix(name, "/dev/mapper/docker-") ||
+				!strings.HasSuffix(name, "-pool") && !strings.HasSuffix(name, "-base") {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetName(%v)\nReceived: DmTaskSetName(%v)\n", "docker-...-pool", name)
+			}
+			return 1
+		}
+		DmTaskRun = func(task *CDmTask) int {
+			calls["DmTaskRun"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskRun(%v)\nReceived: DmTaskRun(%v)\n", expectedTask, task)
+			}
+			return 1
+		}
+		DmTaskGetInfo = func(task *CDmTask, info *Info) int {
+			calls["DmTaskGetInfo"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskGetInfo(%v)\nReceived: DmTaskGetInfo(%v)\n", expectedTask, task)
+			}
+			// This will crash if info is not dereferenceable
+			info.Exists = 0
+			return 1
+		}
+		DmTaskSetSector = func(task *CDmTask, sector uint64) int {
+			calls["DmTaskSetSector"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetSector(%v)\nReceived: DmTaskSetSector(%v)\n", expectedTask, task)
+			}
+			if expectedSector := uint64(0); sector != expectedSector {
+				t.Fatalf("Wrong libdevmapper call to DmTaskSetSector\nExpected: %v\nReceived: %v\n", expectedSector, sector)
+			}
+			return 1
+		}
+		DmTaskSetMessage = func(task *CDmTask, message string) int {
+			calls["DmTaskSetMessage"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetMessage(%v)\nReceived: DmTaskSetMessage(%v)\n", expectedTask, task)
+			}
+			taskMessages[message] = true
+			return 1
+		}
+		DmTaskDestroy = func(task *CDmTask) {
+			calls["DmTaskDestroy"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskDestroy(%v)\nReceived: DmTaskDestroy(%v)\n", expectedTask, task)
+			}
+		}
+		DmTaskAddTarget = func(task *CDmTask, start, size uint64, ttype, params string) int {
+			// The call is recorded under the historical key "DmTaskSetTarget",
+			// which the Assert lists in this file also use.
+			calls["DmTaskSetTarget"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskAddTarget(%v)\nReceived: DmTaskAddTarget(%v)\n", expectedTask, task)
+			}
+			if start != 0 {
+				t.Fatalf("Wrong start: %d != %d", start, 0)
+			}
+			if ttype != "thin" && ttype != "thin-pool" {
+				t.Fatalf("Wrong ttype: %s", ttype)
+			}
+			// Quick smoke test
+			if params == "" {
+				t.Fatalf("Params should not be empty")
+			}
+			return 1
+		}
+		fakeCookie := uint(4321)
+		DmTaskSetCookie = func(task *CDmTask, cookie *uint, flags uint16) int {
+			calls["DmTaskSetCookie"] = true
+			expectedTask := &task1
+			if task != expectedTask {
+				t.Fatalf("Wrong libdevmapper call\nExpected: DmTaskSetCookie(%v)\nReceived: DmTaskSetCookie(%v)\n", expectedTask, task)
+			}
+			if flags != 0 {
+				t.Fatalf("Cookie flags should be 0 (not %x)", flags)
+			}
+			*cookie = fakeCookie
+			return 1
+		}
+		DmUdevWait = func(cookie uint) int {
+			calls["DmUdevWait"] = true
+			if cookie != fakeCookie {
+				t.Fatalf("Wrong cookie: %d != %d", cookie, fakeCookie)
+			}
+			return 1
+		}
+		DmTaskSetAddNode = func(task *CDmTask, addNode AddNodeType) int {
+			if addNode != AddNodeOnCreate {
+				t.Fatalf("Wrong AddNodeType: %v (expected %v)", addNode, AddNodeOnCreate)
+			}
+			calls["DmTaskSetAddNode"] = true
+			return 1
+		}
+		execRun = func(name string, args ...string) error {
+			calls["execRun"] = true
+			if name != "mkfs.ext4" {
+				t.Fatalf("Expected %s to be executed, not %s", "mkfs.ext4", name)
+			}
+			return nil
+		}
+		driver, err := Init(home)
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer func() {
+			if err := driver.Cleanup(); err != nil {
+				t.Fatal(err)
+			}
+		}()
+	}()
+
+	// Call GC to cleanup runtime.Finalizers
+	runtime.GC()
+
+	calls.Assert(t,
+		"DmSetDevDir",
+		"DmLogWithErrnoInit",
+		"DmTaskSetName",
+		"DmTaskRun",
+		"DmTaskGetInfo",
+		"DmTaskDestroy",
+		"execRun",
+		"DmTaskCreate",
+		"DmTaskSetTarget",
+		"DmTaskSetCookie",
+		"DmUdevWait",
+		"DmTaskSetSector",
+		"DmTaskSetMessage",
+		"DmTaskSetAddNode",
+	)
+	// NOTE(review): 0, 6 and 17 are presumably libdevmapper's
+	// DM_DEVICE_CREATE, DM_DEVICE_INFO and DM_DEVICE_TARGET_MSG — confirm
+	// against libdevmapper.h.
+	taskTypes.Assert(t, "0", "6", "17")
+	taskMessages.Assert(t, "create_thin 0", "set_transaction_id 0 1")
+}
+
+// fakeInit swaps Init for a stub that returns a Driver carrying only the
+// home directory (no DeviceSet), and returns the previous Init so the caller
+// can put it back with restoreInit.
+func fakeInit() func(home string) (graphdriver.Driver, error) {
+	previous := Init
+	Init = func(home string) (graphdriver.Driver, error) {
+		stub := &Driver{home: home}
+		return stub, nil
+	}
+	return previous
+}
+
+// restoreInit assigns init back to the package-level Init variable; it is the
+// counterpart of fakeInit.
+func restoreInit(init func(home string) (graphdriver.Driver, error)) {
+ Init = init
+}
+
+// mockAllDevmapper replaces the libdevmapper hooks (and execRun) with
+// permissive mocks that record their own invocation in calls and return a
+// fixed value without checking any argument. DmTaskAddTarget is recorded
+// under the key "DmTaskSetTarget". Hooks not assigned here
+// (e.g. DmTaskSetRo, DmGetNextTarget) keep whatever value they had.
+func mockAllDevmapper(calls Set) {
+ DmSetDevDir = func(dir string) int {
+ calls["DmSetDevDir"] = true
+ return 0
+ }
+ LogWithErrnoInit = func() {
+ calls["DmLogWithErrnoInit"] = true
+ }
+ DmTaskCreate = func(taskType int) *CDmTask {
+ calls["DmTaskCreate"] = true
+ return &CDmTask{}
+ }
+ DmTaskSetName = func(task *CDmTask, name string) int {
+ calls["DmTaskSetName"] = true
+ return 1
+ }
+ DmTaskRun = func(task *CDmTask) int {
+ calls["DmTaskRun"] = true
+ return 1
+ }
+ DmTaskGetInfo = func(task *CDmTask, info *Info) int {
+ calls["DmTaskGetInfo"] = true
+ return 1
+ }
+ DmTaskSetSector = func(task *CDmTask, sector uint64) int {
+ calls["DmTaskSetSector"] = true
+ return 1
+ }
+ DmTaskSetMessage = func(task *CDmTask, message string) int {
+ calls["DmTaskSetMessage"] = true
+ return 1
+ }
+ DmTaskDestroy = func(task *CDmTask) {
+ calls["DmTaskDestroy"] = true
+ }
+ DmTaskAddTarget = func(task *CDmTask, start, size uint64, ttype, params string) int {
+ calls["DmTaskSetTarget"] = true
+ return 1
+ }
+ DmTaskSetCookie = func(task *CDmTask, cookie *uint, flags uint16) int {
+ calls["DmTaskSetCookie"] = true
+ return 1
+ }
+ DmUdevWait = func(cookie uint) int {
+ calls["DmUdevWait"] = true
+ return 1
+ }
+ DmTaskSetAddNode = func(task *CDmTask, addNode AddNodeType) int {
+ calls["DmTaskSetAddNode"] = true
+ return 1
+ }
+ execRun = func(name string, args ...string) error {
+ calls["execRun"] = true
+ return nil
+ }
+}
+
+// TestDriverName checks that a driver built through the stubbed Init reports
+// "devicemapper" as its name. All libdevmapper hooks are denied for the
+// duration of the test, so the check must not reach the library.
+func TestDriverName(t *testing.T) {
+	denyAllDevmapper()
+	defer denyAllDevmapper()
+
+	savedInit := fakeInit()
+	defer restoreInit(savedInit)
+
+	d := newDriver(t)
+	if name := d.String(); name != "devicemapper" {
+		t.Fatalf("Expected driver name to be devicemapper got %s", name)
+	}
+}
+
+// TestDriverCreate initialises a driver and creates device "1" against
+// mocked libdevmapper and syscall hooks, asserting after each phase (init,
+// Create, garbage collection) exactly which hooks were recorded in calls.
+func TestDriverCreate(t *testing.T) {
+ denyAllDevmapper()
+ denyAllSyscall()
+ defer denyAllSyscall()
+ defer denyAllDevmapper()
+
+ calls := make(Set)
+ mockAllDevmapper(calls)
+
+ sysMount = func(source, target, fstype string, flags uintptr, data string) (err error) {
+ calls["sysMount"] = true
+ // FIXME: compare the exact source and target strings (inodes + devname)
+ if expectedSource := "/dev/mapper/docker-"; !strings.HasPrefix(source, expectedSource) {
+ t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedSource, source)
+ }
+ // NOTE(review): assumes the temp directory created by mkTestDirectory
+ // lives under /tmp — this will not hold if the default temp dir differs.
+ if expectedTarget := "/tmp/docker-test-devmapper-"; !strings.HasPrefix(target, expectedTarget) {
+ t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedTarget, target)
+ }
+ if expectedFstype := "ext4"; fstype != expectedFstype {
+ t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFstype, fstype)
+ }
+ // 3236757504 == 0xC0ED0000; presumably the Linux MS_MGC_VAL mount
+ // magic passed by the mount code — confirm against the caller.
+ if expectedFlags := uintptr(3236757504); flags != expectedFlags {
+ t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFlags, flags)
+ }
+ return nil
+ }
+
+ Mounted = func(mnt string) (bool, error) {
+ calls["Mounted"] = true
+ if !strings.HasPrefix(mnt, "/tmp/docker-test-devmapper-") || !strings.HasSuffix(mnt, "/mnt/1") {
+ t.Fatalf("Wrong mounted call\nExpected: Mounted(%v)\nReceived: Mounted(%v)\n", "/tmp/docker-test-devmapper-.../mnt/1", mnt)
+ }
+ return false, nil
+ }
+
+ // Only ioctl is allowed through sysSyscall; each known request number is
+ // recorded under its own "ioctl.*" key and reported as successful.
+ sysSyscall = func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
+ calls["sysSyscall"] = true
+ if trap != sysSysIoctl {
+ t.Fatalf("Unexpected syscall. Expecting SYS_IOCTL, received: %d", trap)
+ }
+ switch a2 {
+ case LoopSetFd:
+ calls["ioctl.loopsetfd"] = true
+ case LoopCtlGetFree:
+ calls["ioctl.loopctlgetfree"] = true
+ case LoopGetStatus64:
+ calls["ioctl.loopgetstatus"] = true
+ case LoopSetStatus64:
+ calls["ioctl.loopsetstatus"] = true
+ case LoopClrFd:
+ calls["ioctl.loopclrfd"] = true
+ case LoopSetCapacity:
+ calls["ioctl.loopsetcapacity"] = true
+ case BlkGetSize64:
+ calls["ioctl.blkgetsize"] = true
+ default:
+ t.Fatalf("Unexpected IOCTL. Received %d", a2)
+ }
+ return 0, 0, 0
+ }
+
+ // The driver lives only inside this closure so that it is unreachable by
+ // the time runtime.GC() runs below.
+ func() {
+ d := newDriver(t)
+
+ // Calls recorded while Init ran ("?" marks an optional entry).
+ calls.Assert(t,
+ "DmSetDevDir",
+ "DmLogWithErrnoInit",
+ "DmTaskSetName",
+ "DmTaskRun",
+ "DmTaskGetInfo",
+ "execRun",
+ "DmTaskCreate",
+ "DmTaskSetTarget",
+ "DmTaskSetCookie",
+ "DmUdevWait",
+ "DmTaskSetSector",
+ "DmTaskSetMessage",
+ "DmTaskSetAddNode",
+ "sysSyscall",
+ "ioctl.blkgetsize",
+ "ioctl.loopsetfd",
+ "ioctl.loopsetstatus",
+ "?ioctl.loopctlgetfree",
+ )
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ // Calls recorded by Create alone (Assert emptied the set above).
+ calls.Assert(t,
+ "DmTaskCreate",
+ "DmTaskGetInfo",
+ "sysMount",
+ "DmTaskRun",
+ "DmTaskSetTarget",
+ "DmTaskSetSector",
+ "DmTaskSetCookie",
+ "DmUdevWait",
+ "DmTaskSetName",
+ "DmTaskSetMessage",
+ "DmTaskSetAddNode",
+ )
+
+ }()
+
+ runtime.GC()
+
+ // NOTE(review): presumably DmTaskDestroy is invoked from task finalizers
+ // triggered by the GC call above — confirm in the Task implementation.
+ calls.Assert(t,
+ "DmTaskDestroy",
+ )
+}
+
+// TestDriverRemove initialises a driver, creates device "1" and removes it
+// again against mocked libdevmapper and syscall hooks, asserting after each
+// phase (init, Create, Remove, garbage collection) exactly which hooks were
+// recorded in calls.
+func TestDriverRemove(t *testing.T) {
+	denyAllDevmapper()
+	denyAllSyscall()
+	defer denyAllSyscall()
+	defer denyAllDevmapper()
+
+	calls := make(Set)
+	mockAllDevmapper(calls)
+
+	sysMount = func(source, target, fstype string, flags uintptr, data string) (err error) {
+		calls["sysMount"] = true
+		// FIXME: compare the exact source and target strings (inodes + devname)
+		if expectedSource := "/dev/mapper/docker-"; !strings.HasPrefix(source, expectedSource) {
+			t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedSource, source)
+		}
+		// NOTE(review): assumes the temp directory created by
+		// mkTestDirectory lives under /tmp.
+		if expectedTarget := "/tmp/docker-test-devmapper-"; !strings.HasPrefix(target, expectedTarget) {
+			t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedTarget, target)
+		}
+		if expectedFstype := "ext4"; fstype != expectedFstype {
+			t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFstype, fstype)
+		}
+		// 3236757504 == 0xC0ED0000; presumably the Linux MS_MGC_VAL mount
+		// magic passed by the mount code — confirm against the caller.
+		if expectedFlags := uintptr(3236757504); flags != expectedFlags {
+			t.Fatalf("Wrong syscall call\nExpected: Mount(%v)\nReceived: Mount(%v)\n", expectedFlags, flags)
+		}
+		return nil
+	}
+	sysUnmount = func(target string, flags int) (err error) {
+		calls["sysUnmount"] = true
+		// FIXME: compare the exact source and target strings (inodes + devname)
+		if expectedTarget := "/tmp/docker-test-devmapper-"; !strings.HasPrefix(target, expectedTarget) {
+			t.Fatalf("Wrong syscall call\nExpected: Unmount(%v)\nReceived: Unmount(%v)\n", expectedTarget, target)
+		}
+		if expectedFlags := 0; flags != expectedFlags {
+			t.Fatalf("Wrong syscall call\nExpected: Unmount(%v)\nReceived: Unmount(%v)\n", expectedFlags, flags)
+		}
+		return nil
+	}
+	Mounted = func(mnt string) (bool, error) {
+		calls["Mounted"] = true
+		return false, nil
+	}
+
+	// Only ioctl is allowed through sysSyscall; each known request number is
+	// recorded under its own "ioctl.*" key and reported as successful.
+	sysSyscall = func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
+		calls["sysSyscall"] = true
+		if trap != sysSysIoctl {
+			t.Fatalf("Unexpected syscall. Expecting SYS_IOCTL, received: %d", trap)
+		}
+		switch a2 {
+		case LoopSetFd:
+			calls["ioctl.loopsetfd"] = true
+		case LoopCtlGetFree:
+			calls["ioctl.loopctlgetfree"] = true
+		case LoopGetStatus64:
+			calls["ioctl.loopgetstatus"] = true
+		case LoopSetStatus64:
+			calls["ioctl.loopsetstatus"] = true
+		case LoopClrFd:
+			calls["ioctl.loopclrfd"] = true
+		case LoopSetCapacity:
+			calls["ioctl.loopsetcapacity"] = true
+		case BlkGetSize64:
+			calls["ioctl.blkgetsize"] = true
+		default:
+			t.Fatalf("Unexpected IOCTL. Received %d", a2)
+		}
+		return 0, 0, 0
+	}
+
+	// The driver lives only inside this closure so that it is unreachable by
+	// the time runtime.GC() runs below.
+	func() {
+		d := newDriver(t)
+
+		// Calls recorded while Init ran ("?" marks an optional entry).
+		calls.Assert(t,
+			"DmSetDevDir",
+			"DmLogWithErrnoInit",
+			"DmTaskSetName",
+			"DmTaskRun",
+			"DmTaskGetInfo",
+			"execRun",
+			"DmTaskCreate",
+			"DmTaskSetTarget",
+			"DmTaskSetCookie",
+			"DmUdevWait",
+			"DmTaskSetSector",
+			"DmTaskSetMessage",
+			"DmTaskSetAddNode",
+			"sysSyscall",
+			"ioctl.blkgetsize",
+			"ioctl.loopsetfd",
+			"ioctl.loopsetstatus",
+			"?ioctl.loopctlgetfree",
+		)
+
+		if err := d.Create("1", "", ""); err != nil {
+			t.Fatal(err)
+		}
+
+		// Calls recorded by Create alone (Assert emptied the set above).
+		calls.Assert(t,
+			"DmTaskCreate",
+			"DmTaskGetInfo",
+			"sysMount",
+			"DmTaskRun",
+			"DmTaskSetTarget",
+			"DmTaskSetSector",
+			"DmTaskSetCookie",
+			"DmUdevWait",
+			"DmTaskSetName",
+			"DmTaskSetMessage",
+			"DmTaskSetAddNode",
+		)
+
+		// From here on report every path as mounted.
+		Mounted = func(mnt string) (bool, error) {
+			calls["Mounted"] = true
+			return true, nil
+		}
+
+		if err := d.Remove("1"); err != nil {
+			t.Fatal(err)
+		}
+
+		// Calls recorded by Remove alone.
+		calls.Assert(t,
+			"DmTaskRun",
+			"DmTaskSetSector",
+			"DmTaskSetName",
+			"DmTaskSetMessage",
+			"DmTaskCreate",
+			"DmTaskGetInfo",
+			"DmTaskSetCookie",
+			"DmTaskSetTarget",
+			"DmTaskSetAddNode",
+			"DmUdevWait",
+			"sysUnmount",
+		)
+	}()
+	runtime.GC()
+
+	// NOTE(review): presumably DmTaskDestroy is invoked from task finalizers
+	// triggered by the GC call above — confirm in the Task implementation.
+	calls.Assert(t,
+		"DmTaskDestroy",
+	)
+}
+
+// TestCleanup is currently skipped. Its body creates two devices (the second
+// a child of the first), mounts both, and checks that Cleanup leaves neither
+// mounted nor activated.
+func TestCleanup(t *testing.T) {
+ t.Skip("FIXME: not a unit test")
+ // Unreachable: the Skip above already stops the test.
+ t.Skip("Unimplemented")
+ d := newDriver(t)
+ defer osRemoveAll(d.home)
+
+ mountPoints := make([]string, 2)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ // Mount the id
+ p, err := d.Get("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+ mountPoints[0] = p
+
+ if err := d.Create("2", "1", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ p, err = d.Get("2")
+ if err != nil {
+ t.Fatal(err)
+ }
+ mountPoints[1] = p
+
+ // Ensure that all the mount points are currently mounted
+ for _, p := range mountPoints {
+ if mounted, err := Mounted(p); err != nil {
+ t.Fatal(err)
+ } else if !mounted {
+ t.Fatalf("Expected %s to be mounted", p)
+ }
+ }
+
+ // Ensure that devices are active
+ for _, p := range []string{"1", "2"} {
+ if !d.HasActivatedDevice(p) {
+ t.Fatalf("Expected %s to have an active device", p)
+ }
+ }
+
+ if err := d.Cleanup(); err != nil {
+ t.Fatal(err)
+ }
+
+ // Ensure that all the mount points are no longer mounted
+ for _, p := range mountPoints {
+ if mounted, err := Mounted(p); err != nil {
+ t.Fatal(err)
+ } else if mounted {
+ t.Fatalf("Expected %s to not be mounted", p)
+ }
+ }
+
+ // Ensure that devices are no longer activated
+ for _, p := range []string{"1", "2"} {
+ if d.HasActivatedDevice(p) {
+ t.Fatalf("Expected %s not be an active device", p)
+ }
+ }
+}
+
+// TestNotMounted is currently skipped. Its body checks that home/mnt/1 is
+// not reported as mounted right after Create("1").
+func TestNotMounted(t *testing.T) {
+ t.Skip("FIXME: not a unit test")
+ // Unreachable: the Skip above already stops the test.
+ t.Skip("Not implemented")
+ d := newDriver(t)
+ defer cleanup(d)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ mounted, err := Mounted(path.Join(d.home, "mnt", "1"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ if mounted {
+ t.Fatal("Id 1 should not be mounted")
+ }
+}
+
+// TestMounted is currently skipped. Its body checks that home/mnt/1 is
+// reported as mounted after Create("1") followed by Get("1").
+func TestMounted(t *testing.T) {
+ t.Skip("FIXME: not a unit test")
+ d := newDriver(t)
+ defer cleanup(d)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ if _, err := d.Get("1"); err != nil {
+ t.Fatal(err)
+ }
+
+ mounted, err := Mounted(path.Join(d.home, "mnt", "1"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !mounted {
+ t.Fatal("Id 1 should be mounted")
+ }
+}
+
+// TestInitCleanedDriver is currently skipped. Its body creates and mounts
+// device "1", cleans the driver up, re-runs Init on the same home directory
+// and checks that Get("1") still succeeds on the new driver.
+func TestInitCleanedDriver(t *testing.T) {
+ t.Skip("FIXME: not a unit test")
+ d := newDriver(t)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+ if _, err := d.Get("1"); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := d.Cleanup(); err != nil {
+ t.Fatal(err)
+ }
+
+ // Re-initialise on the same home directory.
+ driver, err := Init(d.home)
+ if err != nil {
+ t.Fatal(err)
+ }
+ d = driver.(*Driver)
+ defer cleanup(d)
+
+ if _, err := d.Get("1"); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestMountMountedDriver is currently skipped. Its body calls Get("1") twice
+// in a row and only requires that both calls succeed.
+func TestMountMountedDriver(t *testing.T) {
+ t.Skip("FIXME: not a unit test")
+ d := newDriver(t)
+ defer cleanup(d)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ // Perform get on same id to ensure that it will
+ // not be mounted twice
+ if _, err := d.Get("1"); err != nil {
+ t.Fatal(err)
+ }
+ if _, err := d.Get("1"); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestGetReturnsValidDevice is currently skipped. Its body checks that device
+// "1" is known after Create, and activated and initialized after Get.
+func TestGetReturnsValidDevice(t *testing.T) {
+ t.Skip("FIXME: not a unit test")
+ d := newDriver(t)
+ defer cleanup(d)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ if !d.HasDevice("1") {
+ t.Fatalf("Expected id 1 to be in device set")
+ }
+
+ if _, err := d.Get("1"); err != nil {
+ t.Fatal(err)
+ }
+
+ if !d.HasActivatedDevice("1") {
+ t.Fatalf("Expected id 1 to be activated")
+ }
+
+ if !d.HasInitializedDevice("1") {
+ t.Fatalf("Expected id 1 to be initialized")
+ }
+}
+
+// TestDriverGetSize is currently skipped. Its body creates a 1024-byte file
+// inside the mounted device; the size comparison itself is commented out
+// below.
+func TestDriverGetSize(t *testing.T) {
+ t.Skip("FIXME: not a unit test")
+ // Unreachable: the Skip above already stops the test.
+ t.Skipf("Size is currently not implemented")
+
+ d := newDriver(t)
+ defer cleanup(d)
+
+ if err := d.Create("1", "", ""); err != nil {
+ t.Fatal(err)
+ }
+
+ mountPoint, err := d.Get("1")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ size := int64(1024)
+
+ f, err := osCreate(path.Join(mountPoint, "test_file"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ // Note: f is not closed if Truncate fails, because t.Fatal exits first.
+ if err := f.Truncate(size); err != nil {
+ t.Fatal(err)
+ }
+ f.Close()
+
+ // diffSize, err := d.DiffSize("1")
+ // if err != nil {
+ // t.Fatal(err)
+ // }
+ // if diffSize != size {
+ // t.Fatalf("Expected size %d got %d", size, diffSize)
+ // }
+}
+
+// assertMap checks that m holds exactly the given keys, deleting each one as
+// it is matched. A missing key, or any key left over afterwards, fails the
+// test. Unlike Set.Assert, a leading "?" has no special meaning here.
+func assertMap(t *testing.T, m map[string]bool, keys ...string) {
+	for _, wanted := range keys {
+		_, present := m[wanted]
+		if !present {
+			t.Fatalf("Key not set: %s", wanted)
+		}
+		delete(m, wanted)
+	}
+	if len(m) > 0 {
+		t.Fatalf("Unexpected keys: %v", m)
+	}
+}
diff --git a/runtime/graphdriver/devmapper/ioctl.go b/runtime/graphdriver/devmapper/ioctl.go
new file mode 100644
index 0000000000..30bafff943
--- /dev/null
+++ b/runtime/graphdriver/devmapper/ioctl.go
@@ -0,0 +1,71 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "unsafe"
+)
+
+// ioctlLoopCtlGetFree issues the LoopCtlGetFree ioctl on fd and returns
+// the call's first result value as an int. A non-zero errno is returned
+// as the error, with 0 as the value.
+func ioctlLoopCtlGetFree(fd uintptr) (int, error) {
+	r1, _, errno := sysSyscall(sysSysIoctl, fd, LoopCtlGetFree, 0)
+	if errno == 0 {
+		return int(r1), nil
+	}
+	return 0, errno
+}
+
+// ioctlLoopSetFd issues the LoopSetFd ioctl on loopFd with sparseFd as
+// its argument. It returns the errno as an error when it is non-zero.
+func ioctlLoopSetFd(loopFd, sparseFd uintptr) error {
+	_, _, errno := sysSyscall(sysSysIoctl, loopFd, LoopSetFd, sparseFd)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// ioctlLoopSetStatus64 issues the LoopSetStatus64 ioctl on loopFd,
+// passing the address of loopInfo as the argument. It returns the errno
+// as an error when it is non-zero.
+func ioctlLoopSetStatus64(loopFd uintptr, loopInfo *LoopInfo64) error {
+	arg := uintptr(unsafe.Pointer(loopInfo))
+	_, _, errno := sysSyscall(sysSysIoctl, loopFd, LoopSetStatus64, arg)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// ioctlLoopClrFd issues the LoopClrFd ioctl on loopFd with a zero
+// argument. It returns the errno as an error when it is non-zero.
+func ioctlLoopClrFd(loopFd uintptr) error {
+	_, _, errno := sysSyscall(sysSysIoctl, loopFd, LoopClrFd, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// ioctlLoopGetStatus64 issues the LoopGetStatus64 ioctl on loopFd with a
+// freshly allocated LoopInfo64 as the argument and returns that struct.
+// On a non-zero errno it returns nil and the errno as the error.
+func ioctlLoopGetStatus64(loopFd uintptr) (*LoopInfo64, error) {
+	info := new(LoopInfo64)
+	arg := uintptr(unsafe.Pointer(info))
+
+	_, _, errno := sysSyscall(sysSysIoctl, loopFd, LoopGetStatus64, arg)
+	if errno != 0 {
+		return nil, errno
+	}
+	return info, nil
+}
+
+// ioctlLoopSetCapacity issues the LoopSetCapacity ioctl on loopFd with
+// value as its argument. It returns the errno as an error when it is
+// non-zero.
+func ioctlLoopSetCapacity(loopFd uintptr, value int) error {
+	_, _, errno := sysSyscall(sysSysIoctl, loopFd, LoopSetCapacity, uintptr(value))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// ioctlBlkGetSize64 issues the BlkGetSize64 ioctl on fd, passing the
+// address of a local int64 for the call to fill in, and returns that
+// value. On a non-zero errno it returns 0 and the errno as the error.
+func ioctlBlkGetSize64(fd uintptr) (int64, error) {
+	var result int64
+	arg := uintptr(unsafe.Pointer(&result))
+
+	_, _, errno := sysSyscall(sysSysIoctl, fd, BlkGetSize64, arg)
+	if errno != 0 {
+		return 0, errno
+	}
+	return result, nil
+}
+
+// ioctlBlkDiscard issues the BlkDiscard ioctl on fd. The argument is a
+// pointer to a two-element uint64 array holding offset followed by
+// length. It returns the errno as an error when it is non-zero.
+func ioctlBlkDiscard(fd uintptr, offset, length uint64) error {
+	span := [2]uint64{offset, length}
+	arg := uintptr(unsafe.Pointer(&span[0]))
+
+	_, _, errno := sysSyscall(sysSysIoctl, fd, BlkDiscard, arg)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/runtime/graphdriver/devmapper/mount.go b/runtime/graphdriver/devmapper/mount.go
new file mode 100644
index 0000000000..4f19109bf8
--- /dev/null
+++ b/runtime/graphdriver/devmapper/mount.go
@@ -0,0 +1,27 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "path/filepath"
+)
+
+// FIXME: this is copy-pasted from the aufs driver.
+// It should be moved into the core.
+
+// Mounted reports whether mountpoint sits on a different device than its
+// parent directory ("mountpoint/.."), judged by comparing the Dev fields
+// of their stat results. A mountpoint that does not exist yields
+// (false, nil); any other stat failure is returned as the error.
+var Mounted = func(mountpoint string) (bool, error) {
+	self, err := osStat(mountpoint)
+	switch {
+	case err == nil:
+		// stat succeeded, carry on with the parent
+	case osIsNotExist(err):
+		return false, nil
+	default:
+		return false, err
+	}
+
+	above, err := osStat(filepath.Join(mountpoint, ".."))
+	if err != nil {
+		return false, err
+	}
+
+	selfDev := toSysStatT(self.Sys()).Dev
+	aboveDev := toSysStatT(above.Sys()).Dev
+	return selfDev != aboveDev, nil
+}
diff --git a/runtime/graphdriver/devmapper/sys.go b/runtime/graphdriver/devmapper/sys.go
new file mode 100644
index 0000000000..5a9ab4d74b
--- /dev/null
+++ b/runtime/graphdriver/devmapper/sys.go
@@ -0,0 +1,57 @@
+// +build linux,amd64
+
+package devmapper
+
+import (
+ "os"
+ "os/exec"
+ "syscall"
+)
+
+// sysStatT is a package-local name for syscall.Stat_t.
+type sysStatT syscall.Stat_t
+
+// sysErrno is a package-local name for syscall.Errno.
+type sysErrno syscall.Errno
+
+// osFile embeds *os.File, so the methods of *os.File are promoted to it.
+type osFile struct {
+	*os.File
+}
+
+// Package-level aliases for the syscall, os and os/exec entry points
+// used by this package. They are variables, so they can be reassigned.
+var (
+	// syscall wrappers
+	sysMount       = syscall.Mount
+	sysUnmount     = syscall.Unmount
+	sysCloseOnExec = syscall.CloseOnExec
+	sysSyscall     = syscall.Syscall
+
+	// osOpenFile wraps os.OpenFile and returns the result as an *osFile.
+	// The wrapper is non-nil even when err is non-nil.
+	osOpenFile = func(name string, flag int, perm os.FileMode) (*osFile, error) {
+		f, err := os.OpenFile(name, flag, perm)
+		return &osFile{File: f}, err
+	}
+	// osOpen wraps os.Open in the same way as osOpenFile.
+	osOpen = func(name string) (*osFile, error) {
+		f, err := os.Open(name)
+		return &osFile{File: f}, err
+	}
+
+	// direct os aliases
+	osNewFile    = os.NewFile
+	osCreate     = os.Create
+	osStat       = os.Stat
+	osIsNotExist = os.IsNotExist
+	osIsExist    = os.IsExist
+	osMkdirAll   = os.MkdirAll
+	osRemoveAll  = os.RemoveAll
+	osRename     = os.Rename
+	osReadlink   = os.Readlink
+
+	// execRun runs the named command with args and returns the error
+	// from exec.Cmd.Run.
+	execRun = func(name string, args ...string) error {
+		return exec.Command(name, args...).Run()
+	}
+)
+
+// Package-local names for the syscall constants used by this package.
+const (
+	sysMsMgcVal = syscall.MS_MGC_VAL
+	sysMsRdOnly = syscall.MS_RDONLY
+	sysEInval   = syscall.EINVAL
+	sysSysIoctl = syscall.SYS_IOCTL
+	sysEBusy    = syscall.EBUSY
+)
+
+// Package-local names for the os constants used by this package.
+const (
+	osORdOnly    = os.O_RDONLY
+	osORdWr      = os.O_RDWR
+	osOCreate    = os.O_CREATE
+	osModeDevice = os.ModeDevice
+)
+
+// toSysStatT converts i, which must hold a *syscall.Stat_t, into a
+// *sysStatT. It panics if i holds any other type.
+func toSysStatT(i interface{}) *sysStatT {
+	raw := i.(*syscall.Stat_t)
+	return (*sysStatT)(raw)
+}
diff --git a/runtime/graphdriver/driver.go b/runtime/graphdriver/driver.go
new file mode 100644
index 0000000000..bd4c2faaca
--- /dev/null
+++ b/runtime/graphdriver/driver.go
@@ -0,0 +1,92 @@
+package graphdriver
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/archive"
+ "github.com/dotcloud/docker/utils"
+ "os"
+ "path"
+)
+
+// InitFunc builds a Driver rooted at the given directory.
+type InitFunc func(root string) (Driver, error)
+
+// Driver is the interface every graph driver registered with this
+// package implements.
+type Driver interface {
+	// String returns a string describing the driver.
+	String() string
+
+	// Create sets up the entry id, given a parent id and a mount label.
+	Create(id, parent, mountLabel string) error
+	// Remove deletes the entry id.
+	Remove(id string) error
+
+	// Get returns a directory for id, or an error.
+	Get(id string) (dir string, err error)
+	// Put is the counterpart of Get for the given id.
+	Put(id string)
+	// Exists reports whether id is known to the driver.
+	Exists(id string) bool
+
+	// Status returns a list of two-string entries describing the driver.
+	Status() [][2]string
+
+	// Cleanup is called to let the driver release what it holds.
+	Cleanup() error
+}
+
+// Differ groups the diff-related operations a driver may offer in
+// addition to Driver.
+type Differ interface {
+	// Diff returns an archive for id.
+	Diff(id string) (archive.Archive, error)
+	// Changes returns the list of changes for id.
+	Changes(id string) ([]archive.Change, error)
+	// ApplyDiff applies the archive read from diff to id.
+	ApplyDiff(id string, diff archive.ArchiveReader) error
+	// DiffSize returns a size in bytes for id.
+	DiffSize(id string) (bytes int64, err error)
+}
+
+// DefaultDriver, when non-empty, names the driver New selects if the
+// DOCKER_DRIVER environment variable is empty.
+var DefaultDriver string
+
+// drivers holds all registered drivers, keyed by name.
+var drivers map[string]InitFunc
+
+// priority lists the driver names New tries, in this order, when no
+// driver was requested explicitly.
+var priority = []string{
+	"aufs",
+	"btrfs",
+	"devicemapper",
+	"vfs",
+}
+
+// init allocates the driver registry so Register can be called from the
+// init functions of driver packages.
+func init() {
+	drivers = map[string]InitFunc{}
+}
+
+// Register records initFunc under name in the driver registry. It
+// returns an error, and leaves the registry unchanged, if name is
+// already taken.
+func Register(name string, initFunc InitFunc) error {
+	_, taken := drivers[name]
+	if taken {
+		return fmt.Errorf("Name already registered %s", name)
+	}
+
+	drivers[name] = initFunc
+	return nil
+}
+
+// GetDriver initializes the driver registered under name, giving it
+// path.Join(home, name) as its root. It returns an error if no driver
+// with that name is registered.
+func GetDriver(name, home string) (Driver, error) {
+	initFunc, ok := drivers[name]
+	if !ok {
+		return nil, fmt.Errorf("No such driver: %s", name)
+	}
+	return initFunc(path.Join(home, name))
+}
+
+// New selects and initializes a graph driver rooted under root.
+//
+// Selection order:
+//  1. the driver named by the DOCKER_DRIVER environment variable, or
+//     failing that by DefaultDriver; its result is returned as is, with
+//     no fallback if it fails;
+//  2. the first driver in the priority list that initializes;
+//  3. the first remaining registered driver that initializes (map
+//     iteration order, so unspecified).
+//
+// Every driver is initialized through GetDriver, so each one gets
+// path.Join(root, name) as its home whichever step selected it. If no
+// driver could be initialized, a non-nil error is always returned.
+func New(root string) (driver Driver, err error) {
+	for _, name := range []string{os.Getenv("DOCKER_DRIVER"), DefaultDriver} {
+		if name != "" {
+			return GetDriver(name, root)
+		}
+	}
+
+	// Check for priority drivers first
+	for _, name := range priority {
+		if driver, err = GetDriver(name, root); err != nil {
+			utils.Debugf("Error loading driver %s: %s", name, err)
+			continue
+		}
+		return driver, nil
+	}
+
+	// Check all registered drivers if no priority driver is found.
+	// Go through GetDriver so the home directory matches the one used
+	// by the two paths above.
+	for name := range drivers {
+		if driver, err = GetDriver(name, root); err != nil {
+			utils.Debugf("Error loading driver %s: %s", name, err)
+			continue
+		}
+		return driver, nil
+	}
+
+	// Never hand back a nil driver together with a nil error.
+	if err == nil {
+		err = fmt.Errorf("No supported storage backend found")
+	}
+	return nil, err
+}
diff --git a/runtime/graphdriver/vfs/driver.go b/runtime/graphdriver/vfs/driver.go
new file mode 100644
index 0000000000..fe09560f24
--- /dev/null
+++ b/runtime/graphdriver/vfs/driver.go
@@ -0,0 +1,95 @@
+package vfs
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "os"
+ "os/exec"
+ "path"
+)
+
+// init registers this driver with the graphdriver package under the
+// name "vfs". The error returned by Register is not checked.
+func init() {
+	_ = graphdriver.Register("vfs", Init)
+}
+
+// Init returns a vfs Driver that uses home as its root directory. It
+// never fails and does not touch the filesystem.
+func Init(home string) (graphdriver.Driver, error) {
+	return &Driver{home: home}, nil
+}
+
+// Driver is the vfs graph driver. It keeps one directory per id under
+// its home directory.
+type Driver struct {
+ home string // root directory handed to Init
+}
+
+// String returns the name of this driver, "vfs".
+func (d *Driver) String() string {
+ return "vfs"
+}
+
+// Status returns nil: this driver reports no status entries.
+func (d *Driver) Status() [][2]string {
+ return nil
+}
+
+// Cleanup does nothing and always returns nil.
+func (d *Driver) Cleanup() error {
+ return nil
+}
+
+// copyDir copies src onto dst by running
+// "cp -aT --reflink=auto src dst". On failure the returned error
+// carries both the command error and its combined output.
+func copyDir(src, dst string) error {
+	cmd := exec.Command("cp", "-aT", "--reflink=auto", src, dst)
+	out, err := cmd.CombinedOutput()
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf("Error VFS copying directory: %s (%s)", err, out)
+}
+
+// Create makes the directory for id (mode 0700), creating missing
+// parent directories first. It fails if the directory already exists.
+// When parent is non-empty, the parent's directory is then copied into
+// the new one with copyDir. mountLabel is not used by this driver.
+func (d *Driver) Create(id string, parent string, mountLabel string) error {
+	target := d.dir(id)
+
+	if err := os.MkdirAll(path.Dir(target), 0700); err != nil {
+		return err
+	}
+	if err := os.Mkdir(target, 0700); err != nil {
+		return err
+	}
+
+	if parent != "" {
+		source, err := d.Get(parent)
+		if err != nil {
+			return fmt.Errorf("%s: %s", parent, err)
+		}
+		return copyDir(source, target)
+	}
+	return nil
+}
+
+// dir returns the directory used for id: home/dir/<base of id>. Only
+// the last path element of id is used.
+func (d *Driver) dir(id string) string {
+	leaf := path.Base(id)
+	return path.Join(d.home, "dir", leaf)
+}
+
+// Remove deletes the directory for id and everything below it. The stat
+// error is returned if the directory cannot be stat'ed, for example
+// because it does not exist.
+func (d *Driver) Remove(id string) error {
+	target := d.dir(id)
+	_, err := os.Stat(target)
+	if err != nil {
+		return err
+	}
+	return os.RemoveAll(target)
+}
+
+// Get returns the directory for id. It returns an error if the path
+// cannot be stat'ed or is not a directory.
+func (d *Driver) Get(id string) (string, error) {
+	target := d.dir(id)
+
+	info, err := os.Stat(target)
+	if err != nil {
+		return "", err
+	}
+	if info.IsDir() {
+		return target, nil
+	}
+	return "", fmt.Errorf("%s: not a directory", target)
+}
+
+// Put is a no-op for this driver.
+func (d *Driver) Put(id string) {
+ // The vfs driver has no runtime resources (e.g. mounts)
+ // to clean up, so we don't need anything here
+}
+
+// Exists reports whether the path for id can be stat'ed without error.
+func (d *Driver) Exists(id string) bool {
+	if _, err := os.Stat(d.dir(id)); err != nil {
+		return false
+	}
+	return true
+}
diff --git a/runtime/history.go b/runtime/history.go
new file mode 100644
index 0000000000..835ac9c11e
--- /dev/null
+++ b/runtime/history.go
@@ -0,0 +1,30 @@
+package runtime
+
+import (
+ "sort"
+)
+
+// History is a convenience type for storing a list of containers.
+// Add re-sorts the list after every insertion using Less, which places
+// the container with the later When() value first.
+type History []*Container
+
+// Len returns the number of containers in the history (sort.Interface).
+func (history *History) Len() int {
+	containers := *history
+	return len(containers)
+}
+
+// Less reports whether element i sorts before element j, which is the
+// case when j's When() is before i's When() (sort.Interface).
+func (history *History) Less(i, j int) bool {
+	left, right := (*history)[i], (*history)[j]
+	return right.When().Before(left.When())
+}
+
+// Swap exchanges the elements at positions i and j (sort.Interface).
+func (history *History) Swap(i, j int) {
+	list := *history
+	list[i], list[j] = list[j], list[i]
+}
+
+// Add appends container to the history and re-sorts the whole list.
+func (history *History) Add(container *Container) {
+	grown := append(*history, container)
+	*history = grown
+	sort.Sort(history)
+}
diff --git a/runtime/networkdriver/bridge/driver.go b/runtime/networkdriver/bridge/driver.go
new file mode 100644
index 0000000000..f7c3bc6b01
--- /dev/null
+++ b/runtime/networkdriver/bridge/driver.go
@@ -0,0 +1,470 @@
+package bridge
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/engine"
+ "github.com/dotcloud/docker/pkg/iptables"
+ "github.com/dotcloud/docker/pkg/netlink"
+ "github.com/dotcloud/docker/runtime/networkdriver"
+ "github.com/dotcloud/docker/runtime/networkdriver/ipallocator"
+ "github.com/dotcloud/docker/runtime/networkdriver/portallocator"
+ "github.com/dotcloud/docker/runtime/networkdriver/portmapper"
+ "github.com/dotcloud/docker/utils"
+ "io/ioutil"
+ "log"
+ "net"
+ "strings"
+)
+
+// DefaultNetworkBridge is the bridge interface name InitDriver uses
+// when the job does not set BridgeIface.
+const DefaultNetworkBridge = "docker0"
+
+// networkInterface represents the networking stack of a container:
+// the IP allocated to it and the host-side addresses of its port mappings.
+type networkInterface struct {
+ IP net.IP
+ PortMappings []net.Addr // these are mappings to the host interfaces
+}
+
+var (
+ // addrs lists the candidate bridge addresses (gateway IP plus prefix
+ // length) that createBridge tries, in order, when no BridgeIP is given.
+ addrs = []string{
+ // Here we don't follow the convention of using the 1st IP of the range for the gateway.
+ // This is to use the same gateway IPs as the /24 ranges, which predate the /16 ranges.
+ // In theory this shouldn't matter - in practice there's bound to be a few scripts relying
+ // on the internal addressing or other stupid things like that.
+ // They shouldn't, but hey, let's not break them unless we really have to.
+ "172.17.42.1/16", // Don't use 172.16.0.0/16, it conflicts with EC2 DNS 172.16.0.23
+ "10.0.42.1/16", // Don't even try using the entire /8, that's too intrusive
+ "10.1.42.1/16",
+ "10.42.42.1/16",
+ "172.16.42.1/24",
+ "172.16.43.1/24",
+ "172.16.44.1/24",
+ "10.0.42.1/24",
+ "10.0.43.1/24",
+ "192.168.42.1/24",
+ "192.168.43.1/24",
+ "192.168.44.1/24",
+ }
+
+ // bridgeIface and bridgeNetwork are set by InitDriver.
+ bridgeIface string
+ bridgeNetwork *net.IPNet
+
+ // defaultBindingIP is the host IP used by AllocatePort when the job
+ // sets no HostIP; InitDriver may override it from DefaultBindingIP.
+ defaultBindingIP = net.ParseIP("0.0.0.0")
+ // currentInterfaces maps the id passed to Allocate to its interface.
+ currentInterfaces = make(map[string]*networkInterface)
+)
+
+// InitDriver configures the bridge network driver from the job's
+// environment and registers the driver's job handlers on the engine.
+//
+// Environment read from the job:
+//   - EnableIptables, InterContainerCommunication, EnableIpForward (bools)
+//   - BridgeIface: bridge name, DefaultNetworkBridge when empty
+//   - BridgeIP: bridge address in CIDR notation (optional)
+//   - DefaultBindingIP: overrides defaultBindingIP when set
+//
+// If the bridge interface has no address yet it is created with
+// createBridge. If it already exists and BridgeIP is set, the IP part of
+// BridgeIP must equal the bridge's current IP. The function returns
+// engine.StatusErr on any failure, except that a failure to enable IPv4
+// forwarding is only logged.
+func InitDriver(job *engine.Job) engine.Status {
+	var (
+		network        *net.IPNet
+		enableIPTables = job.GetenvBool("EnableIptables")
+		icc            = job.GetenvBool("InterContainerCommunication")
+		ipForward      = job.GetenvBool("EnableIpForward")
+		bridgeIP       = job.Getenv("BridgeIP")
+	)
+
+	if defaultIP := job.Getenv("DefaultBindingIP"); defaultIP != "" {
+		defaultBindingIP = net.ParseIP(defaultIP)
+	}
+
+	bridgeIface = job.Getenv("BridgeIface")
+	if bridgeIface == "" {
+		bridgeIface = DefaultNetworkBridge
+	}
+
+	addr, err := networkdriver.GetIfaceAddr(bridgeIface)
+	if err != nil {
+		// If the iface is not found, try to create it
+		job.Logf("creating new bridge for %s", bridgeIface)
+		if err := createBridge(bridgeIP); err != nil {
+			job.Error(err)
+			return engine.StatusErr
+		}
+
+		job.Logf("getting iface addr")
+		addr, err = networkdriver.GetIfaceAddr(bridgeIface)
+		if err != nil {
+			job.Error(err)
+			return engine.StatusErr
+		}
+		network = addr.(*net.IPNet)
+	} else {
+		network = addr.(*net.IPNet)
+		// validate that the bridge ip matches the ip specified by BridgeIP
+		if bridgeIP != "" {
+			// BridgeIP is in CIDR notation (createBridge parses it with
+			// net.ParseCIDR). net.ParseIP would return nil for such a
+			// value and the comparison below could never succeed.
+			bip, _, err := net.ParseCIDR(bridgeIP)
+			if err != nil {
+				job.Error(err)
+				return engine.StatusErr
+			}
+			if !network.IP.Equal(bip) {
+				return job.Errorf("bridge ip (%s) does not match existing bridge configuration %s", network.IP, bridgeIP)
+			}
+		}
+	}
+
+	// Configure iptables for link support
+	if enableIPTables {
+		if err := setupIPTables(addr, icc); err != nil {
+			job.Error(err)
+			return engine.StatusErr
+		}
+	}
+
+	if ipForward {
+		// Enable IPv4 forwarding
+		if err := ioutil.WriteFile("/proc/sys/net/ipv4/ip_forward", []byte{'1', '\n'}, 0644); err != nil {
+			job.Logf("WARNING: unable to enable IPv4 forwarding: %s\n", err)
+		}
+	}
+
+	// We can always try removing the iptables
+	if err := iptables.RemoveExistingChain("DOCKER"); err != nil {
+		job.Error(err)
+		return engine.StatusErr
+	}
+
+	if enableIPTables {
+		chain, err := iptables.NewChain("DOCKER", bridgeIface)
+		if err != nil {
+			job.Error(err)
+			return engine.StatusErr
+		}
+		portmapper.SetIptablesChain(chain)
+	}
+
+	bridgeNetwork = network
+
+	// https://github.com/dotcloud/docker/issues/2768
+	job.Eng.Hack_SetGlobalVar("httpapi.bridgeIP", bridgeNetwork.IP)
+
+	for name, f := range map[string]engine.Handler{
+		"allocate_interface": Allocate,
+		"release_interface":  Release,
+		"allocate_port":      AllocatePort,
+		"link":               LinkContainers,
+	} {
+		if err := job.Eng.Register(name, f); err != nil {
+			job.Error(err)
+			return engine.StatusErr
+		}
+	}
+	return engine.StatusOK
+}
+
+// setupIPTables installs the iptables rules the bridge needs, inserting
+// each rule only if iptables.Exists does not already report it:
+//
+//  1. a nat POSTROUTING MASQUERADE rule for traffic from addr to
+//     anywhere but addr;
+//  2. a FORWARD rule for bridge-to-bridge traffic, ACCEPT when icc is
+//     true and DROP otherwise; the opposite rule is deleted first and
+//     the result of that deletion is ignored;
+//  3. a FORWARD ACCEPT rule for traffic leaving the bridge;
+//  4. a FORWARD ACCEPT rule for RELATED,ESTABLISHED traffic entering
+//     the bridge.
+//
+// A failed insertion, or one that produces any output, is returned as
+// an error.
+func setupIPTables(addr net.Addr, icc bool) error {
+	// Enable NAT
+	natRule := []string{"POSTROUTING", "-t", "nat", "-s", addr.String(), "!", "-d", addr.String(), "-j", "MASQUERADE"}
+	if !iptables.Exists(natRule...) {
+		output, err := iptables.Raw(append([]string{"-I"}, natRule...)...)
+		if err != nil {
+			return fmt.Errorf("Unable to enable network bridge NAT: %s", err)
+		}
+		if len(output) != 0 {
+			return fmt.Errorf("Error iptables postrouting: %s", output)
+		}
+	}
+
+	acceptRule := []string{"FORWARD", "-i", bridgeIface, "-o", bridgeIface, "-j", "ACCEPT"}
+	dropRule := []string{"FORWARD", "-i", bridgeIface, "-o", bridgeIface, "-j", "DROP"}
+
+	if icc {
+		iptables.Raw(append([]string{"-D"}, dropRule...)...)
+
+		if !iptables.Exists(acceptRule...) {
+			utils.Debugf("Enable inter-container communication")
+			output, err := iptables.Raw(append([]string{"-I"}, acceptRule...)...)
+			if err != nil {
+				return fmt.Errorf("Unable to allow intercontainer communication: %s", err)
+			}
+			if len(output) != 0 {
+				return fmt.Errorf("Error enabling intercontainer communication: %s", output)
+			}
+		}
+	} else {
+		iptables.Raw(append([]string{"-D"}, acceptRule...)...)
+
+		if !iptables.Exists(dropRule...) {
+			utils.Debugf("Disable inter-container communication")
+			output, err := iptables.Raw(append([]string{"-I"}, dropRule...)...)
+			if err != nil {
+				return fmt.Errorf("Unable to prevent intercontainer communication: %s", err)
+			}
+			if len(output) != 0 {
+				return fmt.Errorf("Error disabling intercontainer communication: %s", output)
+			}
+		}
+	}
+
+	// Accept all non-intercontainer outgoing packets
+	outgoingRule := []string{"FORWARD", "-i", bridgeIface, "!", "-o", bridgeIface, "-j", "ACCEPT"}
+	if !iptables.Exists(outgoingRule...) {
+		output, err := iptables.Raw(append([]string{"-I"}, outgoingRule...)...)
+		if err != nil {
+			return fmt.Errorf("Unable to allow outgoing packets: %s", err)
+		}
+		if len(output) != 0 {
+			return fmt.Errorf("Error iptables allow outgoing: %s", output)
+		}
+	}
+
+	// Accept incoming packets for existing connections
+	establishedRule := []string{"FORWARD", "-o", bridgeIface, "-m", "conntrack", "--ctstate", "RELATED,ESTABLISHED", "-j", "ACCEPT"}
+	if !iptables.Exists(establishedRule...) {
+		output, err := iptables.Raw(append([]string{"-I"}, establishedRule...)...)
+		if err != nil {
+			return fmt.Errorf("Unable to allow incoming packets: %s", err)
+		}
+		if len(output) != 0 {
+			return fmt.Errorf("Error iptables allow incoming: %s", output)
+		}
+	}
+	return nil
+}
+
+// createBridge creates the network bridge interface named by the
+// package variable bridgeIface, assigns it an address and brings it up.
+//
+// If bridgeIP is non-empty it must be in CIDR notation and is used as
+// the bridge address. Otherwise the first entry of addrs that overlaps
+// neither the nameservers from resolv.conf nor an existing route is
+// used. An error is returned if no such address is found or if any step
+// of creating and configuring the interface fails.
+func createBridge(bridgeIP string) error {
+	nameservers := []string{}
+	resolvConf, _ := utils.GetResolvConf()
+	// we don't check for an error here, because we don't really care
+	// if we can't read /etc/resolv.conf. So instead we skip the append
+	// if resolvConf is nil. It either doesn't exist, or we can't read it
+	// for some reason.
+	if resolvConf != nil {
+		nameservers = append(nameservers, utils.GetNameserversAsCIDR(resolvConf)...)
+	}
+
+	var ifaceAddr string
+	if len(bridgeIP) != 0 {
+		_, _, err := net.ParseCIDR(bridgeIP)
+		if err != nil {
+			return err
+		}
+		ifaceAddr = bridgeIP
+	} else {
+		for _, addr := range addrs {
+			_, dockerNetwork, err := net.ParseCIDR(addr)
+			if err != nil {
+				return err
+			}
+			if err := networkdriver.CheckNameserverOverlaps(nameservers, dockerNetwork); err == nil {
+				if err := networkdriver.CheckRouteOverlaps(dockerNetwork); err == nil {
+					ifaceAddr = addr
+					break
+				} else {
+					utils.Debugf("%s %s", addr, err)
+				}
+			}
+		}
+	}
+
+	if ifaceAddr == "" {
+		return fmt.Errorf("Could not find a free IP address range for interface '%s'. Please configure its address manually and run 'docker -b %s'", bridgeIface, bridgeIface)
+	}
+	utils.Debugf("Creating bridge %s with network %s", bridgeIface, ifaceAddr)
+
+	if err := createBridgeIface(bridgeIface); err != nil {
+		return err
+	}
+
+	iface, err := net.InterfaceByName(bridgeIface)
+	if err != nil {
+		return err
+	}
+
+	ipAddr, ipNet, err := net.ParseCIDR(ifaceAddr)
+	if err != nil {
+		return err
+	}
+
+	// The result must be bound to err here: without the assignment the
+	// condition tests the (nil) err left over from ParseCIDR and a
+	// failure to add the address goes unnoticed.
+	if err := netlink.NetworkLinkAddIp(iface, ipAddr, ipNet); err != nil {
+		return fmt.Errorf("Unable to add private network: %s", err)
+	}
+	if err := netlink.NetworkLinkUp(iface); err != nil {
+		return fmt.Errorf("Unable to start network bridge: %s", err)
+	}
+	return nil
+}
+
+// createBridgeIface creates the bridge interface called name through
+// netlink.CreateBridge. It asks for the bridge's MAC address to be set
+// only when the kernel version could be read and is 3.3 or newer.
+func createBridgeIface(name string) error {
+	kv, err := utils.GetKernelVersion()
+	// only set the bridge's mac address if the kernel version is >= 3.3
+	// before that it was not supported. Compare the two components as a
+	// pair so that 4.0, 5.1, ... qualify as well.
+	setBridgeMacAddr := err == nil && (kv.Kernel > 3 || (kv.Kernel == 3 && kv.Major >= 3))
+	utils.Debugf("setting bridge mac address = %v", setBridgeMacAddr)
+	return netlink.CreateBridge(name, setBridgeMacAddr)
+}
+
+// Allocate reserves an IP on the bridge network for the id given as the
+// job's first argument. If the job's RequestedIP parses as an IP, that
+// address is requested; otherwise the allocator picks one. The result
+// is recorded in currentInterfaces and written to the job's stdout as
+// an Env with the keys IP, Mask, Gateway, Bridge and IPPrefixLen.
+func Allocate(job *engine.Job) engine.Status {
+	id := job.Args[0]
+
+	// A nil wanted lets the allocator choose the address.
+	var wanted *net.IP
+	if requested := net.ParseIP(job.Getenv("RequestedIP")); requested != nil {
+		wanted = &requested
+	}
+
+	ip, err := ipallocator.RequestIP(bridgeNetwork, wanted)
+	if err != nil {
+		job.Error(err)
+		return engine.StatusErr
+	}
+
+	prefixLen, _ := bridgeNetwork.Mask.Size()
+
+	out := engine.Env{}
+	out.Set("IP", ip.String())
+	out.Set("Mask", bridgeNetwork.Mask.String())
+	out.Set("Gateway", bridgeNetwork.IP.String())
+	out.Set("Bridge", bridgeIface)
+	out.SetInt("IPPrefixLen", prefixLen)
+
+	currentInterfaces[id] = &networkInterface{IP: *ip}
+
+	out.WriteTo(job.Stdout)
+
+	return engine.StatusOK
+}
+
+// Release undoes the allocations made for the id given as the job's
+// first argument: every recorded port mapping is unmapped and its host
+// port released, then the interface's IP is released. Failures of the
+// individual steps are only logged. It returns an error status only
+// when nothing is recorded for id.
+func Release(job *engine.Job) engine.Status {
+	id := job.Args[0]
+	iface := currentInterfaces[id]
+	if iface == nil {
+		return job.Errorf("No network information to release for %s", id)
+	}
+
+	// Declared outside the loop on purpose: a mapping that is neither a
+	// TCP nor a UDP address leaves the previous values in place.
+	var (
+		ip    net.IP
+		port  int
+		proto string
+	)
+
+	for _, mapping := range iface.PortMappings {
+		if err := portmapper.Unmap(mapping); err != nil {
+			log.Printf("Unable to unmap port %s: %s", mapping, err)
+		}
+
+		// this is host mappings
+		if tcp, ok := mapping.(*net.TCPAddr); ok {
+			proto, ip, port = "tcp", tcp.IP, tcp.Port
+		} else if udp, ok := mapping.(*net.UDPAddr); ok {
+			proto, ip, port = "udp", udp.IP, udp.Port
+		}
+
+		if err := portallocator.ReleasePort(ip, proto, port); err != nil {
+			log.Printf("Unable to release port %s", mapping)
+		}
+	}
+
+	if err := ipallocator.ReleaseIP(bridgeNetwork, &iface.IP); err != nil {
+		log.Printf("Unable to release ip %s\n", err)
+	}
+	return engine.StatusOK
+}
+
+// AllocatePort reserves a host port and maps it to a container port for
+// the id given as the job's first argument.
+//
+// Environment read from the job: HostIP (defaultBindingIP when empty),
+// HostPort, ContainerPort and Proto ("tcp"; anything else is treated as
+// UDP). On success the host address is appended to the interface's
+// PortMappings and HostIP and HostPort are written to the job's stdout.
+//
+// An error status is returned, with nothing reserved, when no interface
+// was allocated for id or when HostIP is not a valid IP address. If the
+// port mapping fails, the reserved host port is released again.
+func AllocatePort(job *engine.Job) engine.Status {
+	var (
+		err error
+
+		ip            = defaultBindingIP
+		id            = job.Args[0]
+		hostIP        = job.Getenv("HostIP")
+		hostPort      = job.GetenvInt("HostPort")
+		containerPort = job.GetenvInt("ContainerPort")
+		proto         = job.Getenv("Proto")
+		network       = currentInterfaces[id]
+	)
+
+	// Checked before anything is reserved: network.IP is dereferenced
+	// below and would panic for an id without an allocated interface.
+	if network == nil {
+		return job.Errorf("No network information to allocate port for %s", id)
+	}
+
+	if hostIP != "" {
+		ip = net.ParseIP(hostIP)
+		if ip == nil {
+			return job.Errorf("Bad parameter: invalid host ip %s", hostIP)
+		}
+	}
+
+	// host ip, proto, and host port
+	hostPort, err = portallocator.RequestPort(ip, proto, hostPort)
+	if err != nil {
+		job.Error(err)
+		return engine.StatusErr
+	}
+
+	var (
+		container net.Addr
+		host      net.Addr
+	)
+
+	if proto == "tcp" {
+		host = &net.TCPAddr{IP: ip, Port: hostPort}
+		container = &net.TCPAddr{IP: network.IP, Port: containerPort}
+	} else {
+		host = &net.UDPAddr{IP: ip, Port: hostPort}
+		container = &net.UDPAddr{IP: network.IP, Port: containerPort}
+	}
+
+	if err := portmapper.Map(container, ip, hostPort); err != nil {
+		portallocator.ReleasePort(ip, proto, hostPort)
+
+		job.Error(err)
+		return engine.StatusErr
+	}
+	network.PortMappings = append(network.PortMappings, host)
+
+	out := engine.Env{}
+	out.Set("HostIP", ip.String())
+	out.SetInt("HostPort", hostPort)
+
+	if _, err := out.WriteTo(job.Stdout); err != nil {
+		job.Error(err)
+		return engine.StatusErr
+	}
+	return engine.StatusOK
+}
+
+// LinkContainers adds or removes the iptables FORWARD rules that let a
+// parent container reach a child container on the given ports.
+//
+// The job's first argument is the iptables action passed through
+// unchanged (for example "-I" or "-D"). Environment read from the job:
+// ChildIP, ParentIP, IgnoreErrors and Ports, a list of "port/proto"
+// entries. For each entry two rules are applied: parent to child with
+// --dport, and child to parent with --sport.
+//
+// All entries are validated before any rule is touched; an entry
+// without a "/" yields an error status. Errors from iptables are
+// ignored when IgnoreErrors is set, but any output from iptables is
+// always treated as a failure.
+func LinkContainers(job *engine.Job) engine.Status {
+	var (
+		action       = job.Args[0]
+		childIP      = job.Getenv("ChildIP")
+		parentIP     = job.Getenv("ParentIP")
+		ignoreErrors = job.GetenvBool("IgnoreErrors")
+		ports        = job.GetenvList("Ports")
+	)
+
+	// Split every entry up front so that a malformed one is reported
+	// before a partial set of rules has been applied.
+	specs := make([][2]string, 0, len(ports))
+	for _, p := range ports {
+		parts := strings.Split(p, "/")
+		if len(parts) < 2 {
+			return job.Errorf("Invalid port format %q, expected port/proto", p)
+		}
+		specs = append(specs, [2]string{parts[0], parts[1]})
+	}
+
+	for _, spec := range specs {
+		port, proto := spec[0], spec[1]
+		if output, err := iptables.Raw(action, "FORWARD",
+			"-i", bridgeIface, "-o", bridgeIface,
+			"-p", proto,
+			"-s", parentIP,
+			"--dport", port,
+			"-d", childIP,
+			"-j", "ACCEPT"); !ignoreErrors && err != nil {
+			job.Error(err)
+			return engine.StatusErr
+		} else if len(output) != 0 {
+			job.Errorf("Error toggle iptables forward: %s", output)
+			return engine.StatusErr
+		}
+
+		if output, err := iptables.Raw(action, "FORWARD",
+			"-i", bridgeIface, "-o", bridgeIface,
+			"-p", proto,
+			"-s", childIP,
+			"--sport", port,
+			"-d", parentIP,
+			"-j", "ACCEPT"); !ignoreErrors && err != nil {
+			job.Error(err)
+			return engine.StatusErr
+		} else if len(output) != 0 {
+			job.Errorf("Error toggle iptables forward: %s", output)
+			return engine.StatusErr
+		}
+	}
+	return engine.StatusOK
+}
diff --git a/runtime/networkdriver/ipallocator/allocator.go b/runtime/networkdriver/ipallocator/allocator.go
new file mode 100644
index 0000000000..70a7028bbe
--- /dev/null
+++ b/runtime/networkdriver/ipallocator/allocator.go
@@ -0,0 +1,159 @@
+package ipallocator
+
+import (
+ "encoding/binary"
+ "errors"
+ "github.com/dotcloud/docker/pkg/collections"
+ "github.com/dotcloud/docker/runtime/networkdriver"
+ "net"
+ "sync"
+)
+
+// networkSet maps a network, keyed by its String() form, to a set of
+// positions inside that network.
+type networkSet map[string]*collections.OrderedIntSet
+
+// Errors returned by the allocator.
+var (
+	// ErrNoAvailableIPs: no free address is left on the network.
+	ErrNoAvailableIPs = errors.New("no available ip addresses on network")
+	// ErrIPAlreadyAllocated: the explicitly requested address is taken.
+	ErrIPAlreadyAllocated = errors.New("ip already allocated")
+)
+
+// Allocator state. RequestIP and ReleaseIP hold lock while they touch
+// the two sets.
+var (
+	lock         sync.Mutex
+	allocatedIPs = make(networkSet)
+	availableIPS = make(networkSet)
+)
+
+// RequestIP hands out an IP on the given network. With a nil ip it
+// returns the next address chosen by the allocator. With a non-nil ip
+// it registers that specific address and returns it, or returns an
+// error if it is already allocated. The call is serialized through the
+// package lock.
+func RequestIP(address *net.IPNet, ip *net.IP) (*net.IP, error) {
+	lock.Lock()
+	defer lock.Unlock()
+
+	checkAddress(address)
+
+	if ip != nil {
+		if err := registerIP(address, ip); err != nil {
+			return nil, err
+		}
+		return ip, nil
+	}
+
+	next, err := getNextIp(address)
+	if err != nil {
+		return nil, err
+	}
+	return next, nil
+}
+
+// ReleaseIP returns ip to the pool of the given network: its position
+// is removed from the allocated set and pushed onto the available set.
+// It always returns nil. The call is serialized through the package
+// lock.
+func ReleaseIP(address *net.IPNet, ip *net.IP) error {
+	lock.Lock()
+	defer lock.Unlock()
+
+	checkAddress(address)
+
+	key := address.String()
+	pos := int(getPosition(address, ip))
+
+	allocatedIPs[key].Remove(pos)
+	availableIPS[key].Push(pos)
+
+	return nil
+}
+
+// getPosition converts ip into its position inside the subnet: the
+// difference between ip and the first address returned by NetworkRange,
+// both taken as 32-bit integers. Only positions are stored in the sets.
+func getPosition(address *net.IPNet, ip *net.IP) int32 {
+	first, _ := networkdriver.NetworkRange(address)
+	base := ipToInt(&first)
+	return ipToInt(ip) - base
+}
+
+// getNextIp returns a free IP on the network and marks it as allocated.
+//
+// A position popped from the available set is used first; a popped
+// value of 0 is treated as "nothing available". Otherwise positions are
+// scanned in a ring starting after the value returned by the allocated
+// set's PullBack. The scan skips the network's own IP, the first
+// address of the network plus one, and positions already allocated.
+// ErrNoAvailableIPs is returned when the scan finds nothing.
+func getNextIp(address *net.IPNet) (*net.IP, error) {
+	var (
+		key       = address.String()
+		ownIP     = ipToInt(&address.IP)
+		available = availableIPS[key]
+		allocated = allocatedIPs[key]
+	)
+
+	first, _ := networkdriver.NetworkRange(address)
+	base := ipToInt(&first)
+	size := int(networkdriver.NetworkSize(address.Mask))
+	// size -1 for the broadcast address, -1 for the gateway address
+	max := int32(size - 2)
+
+	// We pop and push the position not the ip
+	pos := int32(available.Pop())
+	if pos != 0 {
+		allocated.Push(int(pos))
+		return intToIP(base + pos), nil
+	}
+
+	firstNetIP := address.IP.To4().Mask(address.Mask)
+	firstAsInt := ipToInt(&firstNetIP) + 1
+
+	pos = int32(allocated.PullBack())
+	for tries := max; tries > 0; tries-- {
+		pos = pos%max + 1
+		candidate := base + pos
+
+		if candidate == ownIP || candidate == firstAsInt || allocated.Exists(int(pos)) {
+			continue
+		}
+
+		allocated.Push(int(pos))
+		return intToIP(candidate), nil
+	}
+	return nil, ErrNoAvailableIPs
+}
+
+// registerIP reserves the specific address ip on the given network. It
+// returns ErrIPAlreadyAllocated if the position of ip is already in the
+// allocated set. Otherwise the position is removed from the available
+// set and recorded in the allocated set, so that neither getNextIp nor
+// a second explicit request can hand out the same address again.
+func registerIP(address *net.IPNet, ip *net.IP) error {
+	var (
+		existing  = allocatedIPs[address.String()]
+		available = availableIPS[address.String()]
+		pos       = getPosition(address, ip)
+	)
+
+	if existing.Exists(int(pos)) {
+		return ErrIPAlreadyAllocated
+	}
+	available.Remove(int(pos))
+	// Mark the address as in use; without this the reservation is lost
+	// as soon as this function returns.
+	existing.Push(int(pos))
+
+	return nil
+}
+
+// ipToInt converts an IPv4 address into a 32 bit integer, reading its
+// four bytes in big-endian order.
+func ipToInt(ip *net.IP) int32 {
+	octets := ip.To4()
+	return int32(binary.BigEndian.Uint32(octets))
+}
+
+// intToIP converts a 32 bit integer into a 4 byte IP address, written
+// in big-endian order, and returns a pointer to it.
+func intToIP(n int32) *net.IP {
+	addr := make(net.IP, 4)
+	binary.BigEndian.PutUint32(addr, uint32(n))
+	return &addr
+}
+
+// checkAddress makes sure both the allocated and the available set
+// exist for the given network, creating them on first use.
+func checkAddress(address *net.IPNet) {
+	key := address.String()
+	if _, known := allocatedIPs[key]; known {
+		return
+	}
+	allocatedIPs[key] = collections.NewOrderedIntSet()
+	availableIPS[key] = collections.NewOrderedIntSet()
+}
diff --git a/runtime/networkdriver/ipallocator/allocator_test.go b/runtime/networkdriver/ipallocator/allocator_test.go
new file mode 100644
index 0000000000..5e9fcfc983
--- /dev/null
+++ b/runtime/networkdriver/ipallocator/allocator_test.go
@@ -0,0 +1,241 @@
+package ipallocator
+
+import (
+ "fmt"
+ "net"
+ "testing"
+)
+
+// reset drops all allocator state so that tests do not see each other's
+// allocations.
+func reset() {
+	allocatedIPs, availableIPS = networkSet{}, networkSet{}
+}
+
+// TestRequestNewIps requests eight addresses on 192.168.0.1/24 and
+// expects 192.168.0.2 through 192.168.0.9, in that order.
+func TestRequestNewIps(t *testing.T) {
+	defer reset()
+	network := &net.IPNet{
+		IP:   net.IP{192, 168, 0, 1},
+		Mask: net.IPMask{255, 255, 255, 0},
+	}
+
+	for last := 2; last < 10; last++ {
+		ip, err := RequestIP(network, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		want := fmt.Sprintf("192.168.0.%d", last)
+		if ip.String() != want {
+			t.Fatalf("Expected ip %s got %s", want, ip.String())
+		}
+	}
+}
+
+// TestReleaseIp checks that an address obtained from RequestIP can be
+// given back with ReleaseIP without an error.
+func TestReleaseIp(t *testing.T) {
+	defer reset()
+	network := &net.IPNet{
+		IP:   net.IP{192, 168, 0, 1},
+		Mask: net.IPMask{255, 255, 255, 0},
+	}
+
+	ip, err := RequestIP(network, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = ReleaseIP(network, ip)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// TestGetReleasedIp checks that an address which was requested and then
+// released is the one handed out by the next request.
+func TestGetReleasedIp(t *testing.T) {
+	defer reset()
+	network := &net.IPNet{
+		IP:   net.IP{192, 168, 0, 1},
+		Mask: net.IPMask{255, 255, 255, 0},
+	}
+
+	first, err := RequestIP(network, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	value := first.String()
+
+	if err := ReleaseIP(network, first); err != nil {
+		t.Fatal(err)
+	}
+
+	second, err := RequestIP(network, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if got := second.String(); got != value {
+		t.Fatalf("Expected to receive same ip %s got %s", value, got)
+	}
+}
+
+// TestRequesetSpecificIp checks that an explicitly requested address
+// that is still free can be reserved. The address is taken from inside
+// the network under test (192.168.0.0/24).
+func TestRequesetSpecificIp(t *testing.T) {
+	defer reset()
+	network := &net.IPNet{
+		IP:   []byte{192, 168, 0, 1},
+		Mask: []byte{255, 255, 255, 0},
+	}
+
+	ip := net.ParseIP("192.168.0.5")
+
+	if _, err := RequestIP(network, &ip); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// TestConversion checks that 127.0.0.1 survives a round trip through
+// ipToInt and intToIP and that the integer form is not zero.
+func TestConversion(t *testing.T) {
+	ip := net.ParseIP("127.0.0.1")
+
+	asInt := ipToInt(&ip)
+	if asInt == 0 {
+		t.Fatal("converted to zero")
+	}
+
+	if back := intToIP(asInt); !ip.Equal(*back) {
+		t.Error(back.String())
+	}
+}
+
+// TestIPAllocator drives the allocator through a full cycle on the
+// 127.0.0.1/29 network: it drains the pool (127.0.0.2 to 127.0.0.6),
+// checks that it is exhausted, releases three addresses out of order,
+// requests three again and expects them lowest first, then checks that
+// the pool is exhausted once more.
+func TestIPAllocator(t *testing.T) {
+	// Clear the allocator state afterwards, as the other tests do.
+	defer reset()
+
+	expectedIPs := []net.IP{
+		0: net.IPv4(127, 0, 0, 2),
+		1: net.IPv4(127, 0, 0, 3),
+		2: net.IPv4(127, 0, 0, 4),
+		3: net.IPv4(127, 0, 0, 5),
+		4: net.IPv4(127, 0, 0, 6),
+	}
+
+	gwIP, n, _ := net.ParseCIDR("127.0.0.1/29")
+	network := &net.IPNet{IP: gwIP, Mask: n.Mask}
+	// Pool after initialisation (f = free, u = used)
+	// 2(f) - 3(f) - 4(f) - 5(f) - 6(f)
+	// ↑
+
+	// Check that we get 5 IPs, from 127.0.0.2–127.0.0.6, in that
+	// order.
+	for i := 0; i < 5; i++ {
+		ip, err := RequestIP(network, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		assertIPEquals(t, &expectedIPs[i], ip)
+	}
+	// Before loop begin
+	// 2(f) - 3(f) - 4(f) - 5(f) - 6(f)
+	// ↑
+
+	// After i = 0
+	// 2(u) - 3(f) - 4(f) - 5(f) - 6(f)
+	// ↑
+
+	// After i = 1
+	// 2(u) - 3(u) - 4(f) - 5(f) - 6(f)
+	// ↑
+
+	// After i = 2
+	// 2(u) - 3(u) - 4(u) - 5(f) - 6(f)
+	// ↑
+
+	// After i = 3
+	// 2(u) - 3(u) - 4(u) - 5(u) - 6(f)
+	// ↑
+
+	// After i = 4
+	// 2(u) - 3(u) - 4(u) - 5(u) - 6(u)
+	// ↑
+
+	// Check that there are no more IPs
+	ip, err := RequestIP(network, nil)
+	if err == nil {
+		t.Fatalf("There shouldn't be any IP addresses at this point, got %s\n", ip)
+	}
+
+	// Release some IPs in non-sequential order
+	if err := ReleaseIP(network, &expectedIPs[3]); err != nil {
+		t.Fatal(err)
+	}
+	// 2(u) - 3(u) - 4(u) - 5(f) - 6(u)
+	// ↑
+
+	if err := ReleaseIP(network, &expectedIPs[2]); err != nil {
+		t.Fatal(err)
+	}
+	// 2(u) - 3(u) - 4(f) - 5(f) - 6(u)
+	// ↑
+
+	if err := ReleaseIP(network, &expectedIPs[4]); err != nil {
+		t.Fatal(err)
+	}
+	// 2(u) - 3(u) - 4(f) - 5(f) - 6(f)
+	// ↑
+
+	// Make sure that IPs are reused in sequential order, starting
+	// with the first released IP
+	newIPs := make([]*net.IP, 3)
+	for i := 0; i < 3; i++ {
+		ip, err := RequestIP(network, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		newIPs[i] = ip
+	}
+	// Before loop begin
+	// 2(u) - 3(u) - 4(f) - 5(f) - 6(f)
+	// ↑
+
+	// After i = 0
+	// 2(u) - 3(u) - 4(f) - 5(u) - 6(f)
+	// ↑
+
+	// After i = 1
+	// 2(u) - 3(u) - 4(f) - 5(u) - 6(u)
+	// ↑
+
+	// After i = 2
+	// 2(u) - 3(u) - 4(u) - 5(u) - 6(u)
+	// ↑
+
+	// Reordered these because the new set will always return the
+	// lowest ips first and not in the order that they were released
+	assertIPEquals(t, &expectedIPs[2], newIPs[0])
+	assertIPEquals(t, &expectedIPs[3], newIPs[1])
+	assertIPEquals(t, &expectedIPs[4], newIPs[2])
+
+	_, err = RequestIP(network, nil)
+	if err == nil {
+		t.Fatal("There shouldn't be any IP addresses at this point")
+	}
+}
+
+// TestAllocateFirstIP requests an address on 192.168.0.0/24 and checks
+// that the allocator did not hand out the first address of the network
+// plus one (192.168.0.1).
+func TestAllocateFirstIP(t *testing.T) {
+	defer reset()
+	network := &net.IPNet{
+		IP:   net.IP{192, 168, 0, 0},
+		Mask: net.IPMask{255, 255, 255, 0},
+	}
+
+	firstIP := network.IP.To4().Mask(network.Mask)
+	first := ipToInt(&firstIP) + 1
+
+	ip, err := RequestIP(network, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if allocated := ipToInt(ip); allocated == first {
+		t.Fatalf("allocated ip should not equal first ip: %d == %d", first, allocated)
+	}
+}
+
+// assertIPEquals fails the test unless ip1 and ip2 hold the same IP
+// address according to net.IP.Equal.
+func assertIPEquals(t *testing.T, ip1, ip2 *net.IP) {
+	if ip1.Equal(*ip2) {
+		return
+	}
+	t.Fatalf("Expected IP %s, got %s", ip1, ip2)
+}
diff --git a/runtime/networkdriver/network.go b/runtime/networkdriver/network.go
new file mode 100644
index 0000000000..8dda789d2f
--- /dev/null
+++ b/runtime/networkdriver/network.go
@@ -0,0 +1,10 @@
+package networkdriver
+
+import (
+ "errors"
+)
+
+var (
+ // ErrNetworkOverlapsWithNameservers is returned by CheckNameserverOverlaps
+ // when the checked network overlaps one of the given nameservers.
+ ErrNetworkOverlapsWithNameservers = errors.New("requested network overlaps with nameserver")
+ // ErrNetworkOverlaps is returned by CheckRouteOverlaps when the checked
+ // network overlaps one of the routes reported by the route lookup.
+ ErrNetworkOverlaps = errors.New("requested network overlaps with existing network")
+)
diff --git a/runtime/networkdriver/network_test.go b/runtime/networkdriver/network_test.go
new file mode 100644
index 0000000000..6224c2dffb
--- /dev/null
+++ b/runtime/networkdriver/network_test.go
@@ -0,0 +1,190 @@
+package networkdriver
+
+import (
+ "github.com/dotcloud/docker/pkg/netlink"
+ "net"
+ "testing"
+)
+
+func TestNonOverlapingNameservers(t *testing.T) {
+ network := &net.IPNet{
+ IP: []byte{192, 168, 0, 1},
+ Mask: []byte{255, 255, 255, 0},
+ }
+ nameservers := []string{
+ "127.0.0.1/32",
+ }
+
+ if err := CheckNameserverOverlaps(nameservers, network); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestOverlapingNameservers checks that a nameserver located inside the
+// network is rejected with ErrNetworkOverlapsWithNameservers.
+func TestOverlapingNameservers(t *testing.T) {
+	network := &net.IPNet{
+		IP:   []byte{192, 168, 0, 1},
+		Mask: []byte{255, 255, 255, 0},
+	}
+	nameservers := []string{
+		"192.168.0.1/32",
+	}
+
+	// Compare against the sentinel so an unrelated error (for example a CIDR
+	// parse failure) does not count as success. The message uses %v because
+	// err can be nil on this path, which %s would render as "%!s(<nil>)".
+	if err := CheckNameserverOverlaps(nameservers, network); err != ErrNetworkOverlapsWithNameservers {
+		t.Fatalf("Expected error %v got %v", ErrNetworkOverlapsWithNameservers, err)
+	}
+}
+
+// TestCheckRouteOverlaps replaces the route lookup with a fixed routing table
+// and checks that CheckRouteOverlaps accepts a network outside every stubbed
+// route and rejects one that contains a stubbed route.
+func TestCheckRouteOverlaps(t *testing.T) {
+ // Swap in a stub for the route lookup and restore the original on exit.
+ orig := networkGetRoutesFct
+ defer func() {
+ networkGetRoutesFct = orig
+ }()
+ networkGetRoutesFct = func() ([]netlink.Route, error) {
+ routesData := []string{"10.0.2.0/32", "10.0.3.0/24", "10.0.42.0/24", "172.16.42.0/24", "192.168.142.0/24"}
+
+ routes := []netlink.Route{}
+ for _, addr := range routesData {
+ _, netX, _ := net.ParseCIDR(addr)
+ routes = append(routes, netlink.Route{IPNet: netX})
+ }
+ return routes, nil
+ }
+
+ // 172.16.0.0/24 matches none of the stubbed routes.
+ _, netX, _ := net.ParseCIDR("172.16.0.1/24")
+ if err := CheckRouteOverlaps(netX); err != nil {
+ t.Fatal(err)
+ }
+
+ // 10.0.2.0/24 contains the stubbed 10.0.2.0/32 route.
+ _, netX, _ = net.ParseCIDR("10.0.2.0/24")
+ if err := CheckRouteOverlaps(netX); err == nil {
+ t.Fatalf("10.0.2.0/24 and 10.0.2.0 should overlap but it doesn't")
+ }
+}
+
+// TestCheckNameserverOverlaps checks one network that equals a listed
+// nameserver and one that matches none of them.
+func TestCheckNameserverOverlaps(t *testing.T) {
+	nameservers := []string{"10.0.2.3/32", "192.168.102.1/32"}
+
+	_, overlapping, _ := net.ParseCIDR("10.0.2.3/32")
+	if CheckNameserverOverlaps(nameservers, overlapping) == nil {
+		t.Fatalf("%s should overlap 10.0.2.3/32 but doesn't", overlapping)
+	}
+
+	_, disjoint, _ := net.ParseCIDR("192.168.102.2/32")
+	if CheckNameserverOverlaps(nameservers, disjoint) != nil {
+		t.Fatalf("%s should not overlap %v but it does", disjoint, nameservers)
+	}
+}
+
+// AssertOverlap parses both CIDR strings and records a test error when the
+// resulting networks do not overlap. A string that fails to parse aborts the
+// test instead of passing a nil network on to NetworkOverlaps.
+func AssertOverlap(CIDRx string, CIDRy string, t *testing.T) {
+	_, netX, err := net.ParseCIDR(CIDRx)
+	if err != nil {
+		t.Fatalf("invalid CIDR %q: %v", CIDRx, err)
+	}
+	_, netY, err := net.ParseCIDR(CIDRy)
+	if err != nil {
+		t.Fatalf("invalid CIDR %q: %v", CIDRy, err)
+	}
+	if !NetworkOverlaps(netX, netY) {
+		t.Errorf("%v and %v should overlap", netX, netY)
+	}
+}
+
+// AssertNoOverlap parses both CIDR strings and records a test error when the
+// resulting networks overlap. A string that fails to parse aborts the test
+// instead of passing a nil network on to NetworkOverlaps.
+func AssertNoOverlap(CIDRx string, CIDRy string, t *testing.T) {
+	_, netX, err := net.ParseCIDR(CIDRx)
+	if err != nil {
+		t.Fatalf("invalid CIDR %q: %v", CIDRx, err)
+	}
+	_, netY, err := net.ParseCIDR(CIDRy)
+	if err != nil {
+		t.Fatalf("invalid CIDR %q: %v", CIDRy, err)
+	}
+	if NetworkOverlaps(netX, netY) {
+		t.Errorf("%v and %v should not overlap", netX, netY)
+	}
+}
+
+// TestNetworkOverlaps walks through the relative positions two networks can
+// take (nested, partially covering, disjoint) and checks the verdict of
+// NetworkOverlaps for each through AssertOverlap / AssertNoOverlap.
+func TestNetworkOverlaps(t *testing.T) {
+ // netY starts at same IP and ends within netX
+ AssertOverlap("172.16.0.1/24", "172.16.0.1/25", t)
+ // netY starts within netX and ends at same IP
+ AssertOverlap("172.16.0.1/24", "172.16.0.128/25", t)
+ // netY starts and ends within netX
+ AssertOverlap("172.16.0.1/24", "172.16.0.64/25", t)
+ // netY starts at same IP and ends outside of netX
+ AssertOverlap("172.16.0.1/24", "172.16.0.1/23", t)
+ // netY starts before and ends at same IP of netX
+ AssertOverlap("172.16.1.1/24", "172.16.0.1/23", t)
+ // netY starts before and ends outside of netX
+ AssertOverlap("172.16.1.1/24", "172.16.0.1/22", t)
+ // netY starts and ends before netX
+ AssertNoOverlap("172.16.1.1/25", "172.16.0.1/24", t)
+ // netX starts and ends before netY
+ AssertNoOverlap("172.16.1.1/25", "172.16.2.1/24", t)
+}
+
+// TestNetworkRange checks the first/last address computed by NetworkRange
+// and, where a size is given, the host count computed by NetworkSize.
+func TestNetworkRange(t *testing.T) {
+	cases := []struct {
+		cidr  string
+		first string
+		last  string
+		size  int32 // negative: NetworkSize is not checked for this case
+	}{
+		// Simple class C test
+		{"192.168.0.1/24", "192.168.0.0", "192.168.0.255", 256},
+		// Class A test
+		{"10.0.0.1/8", "10.0.0.0", "10.255.255.255", 16777216},
+		// Class A, random IP address
+		{"10.1.2.3/8", "10.0.0.0", "10.255.255.255", -1},
+		// 32bit mask
+		{"10.1.2.3/32", "10.1.2.3", "10.1.2.3", 1},
+		// 31bit mask
+		{"10.1.2.3/31", "10.1.2.2", "10.1.2.3", 2},
+		// 26bit mask
+		{"10.1.2.3/26", "10.1.2.0", "10.1.2.63", 64},
+	}
+
+	for _, c := range cases {
+		_, network, _ := net.ParseCIDR(c.cidr)
+		first, last := NetworkRange(network)
+		if !first.Equal(net.ParseIP(c.first)) {
+			t.Error(first.String())
+		}
+		if !last.Equal(net.ParseIP(c.last)) {
+			t.Error(last.String())
+		}
+		if c.size < 0 {
+			continue
+		}
+		if size := NetworkSize(network.Mask); size != c.size {
+			t.Error(size)
+		}
+	}
+}
diff --git a/runtime/networkdriver/portallocator/portallocator.go b/runtime/networkdriver/portallocator/portallocator.go
new file mode 100644
index 0000000000..9ecd447116
--- /dev/null
+++ b/runtime/networkdriver/portallocator/portallocator.go
@@ -0,0 +1,188 @@
+package portallocator
+
+import (
+ "errors"
+ "github.com/dotcloud/docker/pkg/collections"
+ "net"
+ "sync"
+)
+
+// BeginPortRange and EndPortRange are the inclusive bounds of the range that
+// dynamically allocated ports are taken from (see nextPort).
+const (
+ BeginPortRange = 49153
+ EndPortRange = 65535
+)
+
+type (
+ // portMappings maps a protocol name ("tcp", "udp") to its set of
+ // allocated ports.
+ portMappings map[string]*collections.OrderedIntSet
+ // ipMapping maps the string form of an IP to that IP's portMappings.
+ ipMapping map[string]portMappings
+)
+
+var (
+ // ErrAllPortsAllocated is returned when a full pass over the dynamic
+ // range finds no free port.
+ ErrAllPortsAllocated = errors.New("all ports are allocated")
+ // ErrPortAlreadyAllocated is returned when an explicitly requested port
+ // is already in use.
+ ErrPortAlreadyAllocated = errors.New("port has already been allocated")
+ // ErrUnknownProtocol is returned for any protocol that has no entry in
+ // defaultAllocatedPorts.
+ ErrUnknownProtocol = errors.New("unknown protocol")
+)
+
+var (
+ // currentDynamicPort holds, per protocol, the last port handed to the
+ // dynamic search; it starts one below the range so the first candidate
+ // is BeginPortRange.
+ currentDynamicPort = map[string]int{
+ "tcp": BeginPortRange - 1,
+ "udp": BeginPortRange - 1,
+ }
+ // defaultIP is the address treated as "no specific IP" (see equalsDefault).
+ defaultIP = net.ParseIP("0.0.0.0")
+ // defaultAllocatedPorts tracks ports allocated on the default IP.
+ defaultAllocatedPorts = portMappings{}
+ // otherAllocatedPorts tracks ports allocated on specific IPs.
+ otherAllocatedPorts = ipMapping{}
+ // lock is taken by RequestPort, ReleasePort and ReleaseAll.
+ lock = sync.Mutex{}
+)
+
+// init creates the empty tcp and udp port sets for the default IP.
+func init() {
+	for _, proto := range []string{"tcp", "udp"} {
+		defaultAllocatedPorts[proto] = collections.NewOrderedIntSet()
+	}
+}
+
+// RequestPort allocates a port for the given ip and protocol.
+// A port value of 0 (or below) asks for the next free dynamic port;
+// a positive value asks for exactly that port and fails if it is taken.
+func RequestPort(ip net.IP, proto string, port int) (int, error) {
+	lock.Lock()
+	defer lock.Unlock()
+
+	if err := validateProtocol(proto); err != nil {
+		return 0, err
+	}
+
+	// No explicit port requested: hand out the next free dynamic one.
+	if port <= 0 {
+		return registerDynamicPort(ip, proto)
+	}
+
+	if err := registerSetPort(ip, proto, port); err != nil {
+		return 0, err
+	}
+	return port, nil
+}
+
+// ReleasePort will return the provided port back into the
+// pool for reuse.
+//
+// Only the set that belongs to ip is modified. Allocations made for a
+// specific IP are never recorded in the default set (see registerSetPort and
+// registerDynamicPort), so releasing a port on a specific IP must not remove
+// it from the default set: doing so would free an allocation that another
+// caller holds on the default IP.
+//
+// It returns ErrUnknownProtocol for an unsupported proto and nil otherwise,
+// including when the port was not allocated.
+func ReleasePort(ip net.IP, proto string, port int) error {
+	lock.Lock()
+	defer lock.Unlock()
+
+	if err := validateProtocol(proto); err != nil {
+		return err
+	}
+
+	if equalsDefault(ip) {
+		defaultAllocatedPorts[proto].Remove(port)
+		return nil
+	}
+
+	// Look the IP up instead of registering it, so that releasing a port on
+	// an address that was never used does not create an empty entry.
+	if mappings, exists := otherAllocatedPorts[ip.String()]; exists {
+		mappings[proto].Remove(port)
+	}
+	return nil
+}
+
+// ReleaseAll drops every allocation and rewinds the dynamic port cursor of
+// both protocols to just below BeginPortRange. It always returns nil.
+func ReleaseAll() error {
+	lock.Lock()
+	defer lock.Unlock()
+
+	for _, proto := range []string{"tcp", "udp"} {
+		currentDynamicPort[proto] = BeginPortRange - 1
+	}
+
+	defaultAllocatedPorts = portMappings{
+		"tcp": collections.NewOrderedIntSet(),
+		"udp": collections.NewOrderedIntSet(),
+	}
+	otherAllocatedPorts = ipMapping{}
+
+	return nil
+}
+
+// registerDynamicPort picks the next free dynamic port for proto and records
+// it in the set of the given ip: the per-IP set for a specific address, the
+// default set otherwise.
+func registerDynamicPort(ip net.IP, proto string) (int, error) {
+	set := defaultAllocatedPorts[proto]
+	if !equalsDefault(ip) {
+		registerIP(ip)
+		set = otherAllocatedPorts[ip.String()][proto]
+	}
+
+	port, err := findNextPort(proto, set)
+	if err != nil {
+		return 0, err
+	}
+	set.Push(port)
+	return port, nil
+}
+
+// registerSetPort records an explicitly requested port. The port must be
+// free in the default set and, for a specific ip, in that ip's set as well;
+// it is then added to the set that belongs to ip.
+func registerSetPort(ip net.IP, proto string, port int) error {
+	defaults := defaultAllocatedPorts[proto]
+	if defaults.Exists(port) {
+		return ErrPortAlreadyAllocated
+	}
+
+	if equalsDefault(ip) {
+		defaults.Push(port)
+		return nil
+	}
+
+	registerIP(ip)
+	perIP := otherAllocatedPorts[ip.String()][proto]
+	if perIP.Exists(port) {
+		return ErrPortAlreadyAllocated
+	}
+	perIP.Push(port)
+	return nil
+}
+
+// equalsDefault reports whether ip is nil or equal to defaultIP.
+func equalsDefault(ip net.IP) bool {
+	if ip == nil {
+		return true
+	}
+	return ip.Equal(defaultIP)
+}
+
+// findNextPort advances the dynamic port cursor of proto until it reaches a
+// port that is not in allocated, and returns that port. The cursor wraps
+// around inside nextPort; when it comes back to the first candidate without
+// finding a free port, ErrAllPortsAllocated is returned. The returned port is
+// not added to allocated here.
+func findNextPort(proto string, allocated *collections.OrderedIntSet) (int, error) {
+ port := nextPort(proto)
+ // Remember the first candidate so a full wrap-around can be detected.
+ startSearchPort := port
+ for allocated.Exists(port) {
+ port = nextPort(proto)
+ if startSearchPort == port {
+ return 0, ErrAllPortsAllocated
+ }
+ }
+ return port, nil
+}
+
+// nextPort moves the dynamic port cursor of proto forward by one, wrapping
+// to BeginPortRange once EndPortRange has been passed, and returns the new
+// cursor value.
+func nextPort(proto string) int {
+	port := BeginPortRange
+	if last := currentDynamicPort[proto]; last < EndPortRange {
+		port = last + 1
+	}
+	currentDynamicPort[proto] = port
+	return port
+}
+
+// registerIP makes sure otherAllocatedPorts has an entry for ip, creating
+// empty tcp and udp sets when the address is seen for the first time.
+func registerIP(ip net.IP) {
+	key := ip.String()
+	if _, exists := otherAllocatedPorts[key]; exists {
+		return
+	}
+	otherAllocatedPorts[key] = portMappings{
+		"tcp": collections.NewOrderedIntSet(),
+		"udp": collections.NewOrderedIntSet(),
+	}
+}
+
+// validateProtocol returns ErrUnknownProtocol unless proto has an entry in
+// defaultAllocatedPorts.
+func validateProtocol(proto string) error {
+	if _, known := defaultAllocatedPorts[proto]; known {
+		return nil
+	}
+	return ErrUnknownProtocol
+}
diff --git a/runtime/networkdriver/portallocator/portallocator_test.go b/runtime/networkdriver/portallocator/portallocator_test.go
new file mode 100644
index 0000000000..5a4765ddd4
--- /dev/null
+++ b/runtime/networkdriver/portallocator/portallocator_test.go
@@ -0,0 +1,213 @@
+package portallocator
+
+import (
+ "net"
+ "testing"
+)
+
+// reset clears all allocator state between tests by calling ReleaseAll.
+func reset() {
+ ReleaseAll()
+}
+
+// TestRequestNewPort checks that the first dynamic allocation on a fresh
+// allocator returns BeginPortRange.
+func TestRequestNewPort(t *testing.T) {
+	defer reset()
+
+	got, err := RequestPort(defaultIP, "tcp", 0)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got != BeginPortRange {
+		t.Fatalf("Expected port %d got %d", BeginPortRange, got)
+	}
+}
+
+func TestRequestSpecificPort(t *testing.T) {
+ defer reset()
+
+ port, err := RequestPort(defaultIP, "tcp", 5000)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if port != 5000 {
+ t.Fatalf("Expected port 5000 got %d", port)
+ }
+}
+
+func TestReleasePort(t *testing.T) {
+ defer reset()
+
+ port, err := RequestPort(defaultIP, "tcp", 5000)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if port != 5000 {
+ t.Fatalf("Expected port 5000 got %d", port)
+ }
+
+ if err := ReleasePort(defaultIP, "tcp", 5000); err != nil {
+ t.Fatal(err)
+ }
+}
+
+// TestReuseReleasedPort checks that an explicit port can be allocated again
+// after it has been released, and that the second allocation returns the
+// requested port number.
+func TestReuseReleasedPort(t *testing.T) {
+	defer reset()
+
+	port, err := RequestPort(defaultIP, "tcp", 5000)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if port != 5000 {
+		t.Fatalf("Expected port 5000 got %d", port)
+	}
+
+	if err := ReleasePort(defaultIP, "tcp", 5000); err != nil {
+		t.Fatal(err)
+	}
+
+	port, err = RequestPort(defaultIP, "tcp", 5000)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// The original assigned the result and never looked at it.
+	if port != 5000 {
+		t.Fatalf("Expected port 5000 got %d", port)
+	}
+}
+
+// TestReleaseUnreadledPort checks that requesting a port that is still
+// allocated (never released) fails with ErrPortAlreadyAllocated.
+func TestReleaseUnreadledPort(t *testing.T) {
+	defer reset()
+
+	port, err := RequestPort(defaultIP, "tcp", 5000)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if port != 5000 {
+		t.Fatalf("Expected port 5000 got %d", port)
+	}
+
+	// The returned port is irrelevant for the failing request, so it is
+	// discarded. %v is used because err may be nil on the failure path,
+	// which %s would render as "%!s(<nil>)".
+	if _, err = RequestPort(defaultIP, "tcp", 5000); err != ErrPortAlreadyAllocated {
+		t.Fatalf("Expected error %v got %v", ErrPortAlreadyAllocated, err)
+	}
+}
+
+// TestUnknowProtocol checks that a protocol other than tcp/udp is rejected
+// with ErrUnknownProtocol.
+func TestUnknowProtocol(t *testing.T) {
+	defer reset()
+
+	// %v is used because err may be nil on the failure path, which %s would
+	// render as "%!s(<nil>)".
+	if _, err := RequestPort(defaultIP, "tcpp", 0); err != ErrUnknownProtocol {
+		t.Fatalf("Expected error %v got %v", ErrUnknownProtocol, err)
+	}
+}
+
+// TestAllocateAllPorts exhausts the dynamic tcp range on the default IP,
+// checks that the next request fails while udp is unaffected, then releases
+// one port and checks that it is the one handed out next.
+func TestAllocateAllPorts(t *testing.T) {
+ defer reset()
+
+ // Ports must come out in ascending order starting at BeginPortRange.
+ for i := 0; i <= EndPortRange-BeginPortRange; i++ {
+ port, err := RequestPort(defaultIP, "tcp", 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if expected := BeginPortRange + i; port != expected {
+ t.Fatalf("Expected port %d got %d", expected, port)
+ }
+ }
+
+ if _, err := RequestPort(defaultIP, "tcp", 0); err != ErrAllPortsAllocated {
+ t.Fatalf("Expected error %s got %s", ErrAllPortsAllocated, err)
+ }
+
+ // udp allocations are tracked separately from tcp.
+ _, err := RequestPort(defaultIP, "udp", 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // release a port in the middle and ensure we get another tcp port
+ port := BeginPortRange + 5
+ if err := ReleasePort(defaultIP, "tcp", port); err != nil {
+ t.Fatal(err)
+ }
+ newPort, err := RequestPort(defaultIP, "tcp", 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if newPort != port {
+ t.Fatalf("Expected port %d got %d", port, newPort)
+ }
+}
+
+func BenchmarkAllocatePorts(b *testing.B) {
+ defer reset()
+
+ for i := 0; i < b.N; i++ {
+ for i := 0; i <= EndPortRange-BeginPortRange; i++ {
+ port, err := RequestPort(defaultIP, "tcp", 0)
+ if err != nil {
+ b.Fatal(err)
+ }
+
+ if expected := BeginPortRange + i; port != expected {
+ b.Fatalf("Expected port %d got %d", expected, port)
+ }
+ }
+ reset()
+ }
+}
+
+// TestPortAllocation exercises allocation on specific IPs: explicit and
+// dynamic requests, rejection of ports already in use, independence of two
+// different IPs, reuse after release, and that a dynamic request never
+// returns a port that was explicitly allocated just before it.
+func TestPortAllocation(t *testing.T) {
+ defer reset()
+
+ ip := net.ParseIP("192.168.0.1")
+ ip2 := net.ParseIP("192.168.0.2")
+ if port, err := RequestPort(ip, "tcp", 80); err != nil {
+ t.Fatal(err)
+ } else if port != 80 {
+ t.Fatalf("Acquire(80) should return 80, not %d", port)
+ }
+ port, err := RequestPort(ip, "tcp", 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if port <= 0 {
+ t.Fatalf("Acquire(0) should return a non-zero port")
+ }
+
+ // The dynamically assigned port is now taken on this IP.
+ if _, err := RequestPort(ip, "tcp", port); err == nil {
+ t.Fatalf("Acquiring a port already in use should return an error")
+ }
+
+ if newPort, err := RequestPort(ip, "tcp", 0); err != nil {
+ t.Fatal(err)
+ } else if newPort == port {
+ t.Fatalf("Acquire(0) allocated the same port twice: %d", port)
+ }
+
+ if _, err := RequestPort(ip, "tcp", 80); err == nil {
+ t.Fatalf("Acquiring a port already in use should return an error")
+ }
+ // Port 80 is tracked per IP, so a second address can still take it once.
+ if _, err := RequestPort(ip2, "tcp", 80); err != nil {
+ t.Fatalf("It should be possible to allocate the same port on a different interface")
+ }
+ if _, err := RequestPort(ip2, "tcp", 80); err == nil {
+ t.Fatalf("Acquiring a port already in use should return an error")
+ }
+ if err := ReleasePort(ip, "tcp", 80); err != nil {
+ t.Fatal(err)
+ }
+ if _, err := RequestPort(ip, "tcp", 80); err != nil {
+ t.Fatal(err)
+ }
+
+ // Explicitly take the port right after the dynamic cursor, then make sure
+ // the next dynamic request skips over it.
+ port, err = RequestPort(ip, "tcp", 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ port2, err := RequestPort(ip, "tcp", port+1)
+ if err != nil {
+ t.Fatal(err)
+ }
+ port3, err := RequestPort(ip, "tcp", 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if port3 == port2 {
+ t.Fatal("Requesting a dynamic port should never allocate a used port")
+ }
+}
diff --git a/runtime/networkdriver/portmapper/mapper.go b/runtime/networkdriver/portmapper/mapper.go
new file mode 100644
index 0000000000..e29959a245
--- /dev/null
+++ b/runtime/networkdriver/portmapper/mapper.go
@@ -0,0 +1,131 @@
+package portmapper
+
+import (
+ "errors"
+ "fmt"
+ "github.com/dotcloud/docker/pkg/iptables"
+ "github.com/dotcloud/docker/pkg/proxy"
+ "net"
+ "sync"
+)
+
+// mapping records one active host-to-container port mapping.
+type mapping struct {
+ proto string // "tcp" or "udp", chosen from the container address type in Map
+ userlandProxy proxy.Proxy // proxy created by newProxy for this mapping
+ host net.Addr // host-side address (IP and port)
+ container net.Addr // container-side address
+}
+
+var (
+ // chain is the iptables chain used by forward; while it is nil, forward
+ // does nothing.
+ chain *iptables.Chain
+ // lock is taken by Map and Unmap.
+ lock sync.Mutex
+
+ // currentMappings is keyed by the host address in the form produced by
+ // getKey: "ip:port/proto", e.g. "192.168.1.5:53/udp".
+ currentMappings = make(map[string]*mapping)
+ // newProxy creates the userland proxy; it is a variable so tests can
+ // replace it with a stub.
+ newProxy = proxy.NewProxy
+)
+
+var (
+ // ErrUnknownBackendAddressType is returned by Map when the container
+ // address is neither a *net.TCPAddr nor a *net.UDPAddr.
+ ErrUnknownBackendAddressType = errors.New("unknown container address type not supported")
+ // ErrPortMappedForIP is returned by Map when the host address already
+ // has a mapping.
+ ErrPortMappedForIP = errors.New("port is already mapped to ip")
+ // ErrPortNotMapped is returned by Unmap when the host address has no
+ // mapping.
+ ErrPortNotMapped = errors.New("port is not mapped")
+)
+
+// SetIptablesChain sets the iptables chain that later Map and Unmap calls
+// forward through. The assignment is done without taking lock.
+func SetIptablesChain(c *iptables.Chain) {
+ chain = c
+}
+
+// Map forwards hostIP:hostPort to the container address: it installs the
+// iptables forwarding rule, creates a userland proxy and starts it in its own
+// goroutine.
+//
+// It returns ErrUnknownBackendAddressType when container is neither a
+// *net.TCPAddr nor a *net.UDPAddr, ErrPortMappedForIP when the host address
+// is already mapped, and otherwise any error from the iptables rule or the
+// proxy creation.
+func Map(container net.Addr, hostIP net.IP, hostPort int) error {
+ lock.Lock()
+ defer lock.Unlock()
+
+ // The host address uses the same transport as the container address.
+ var m *mapping
+ switch container.(type) {
+ case *net.TCPAddr:
+ m = &mapping{
+ proto: "tcp",
+ host: &net.TCPAddr{IP: hostIP, Port: hostPort},
+ container: container,
+ }
+ case *net.UDPAddr:
+ m = &mapping{
+ proto: "udp",
+ host: &net.UDPAddr{IP: hostIP, Port: hostPort},
+ container: container,
+ }
+ default:
+ return ErrUnknownBackendAddressType
+ }
+
+ key := getKey(m.host)
+ if _, exists := currentMappings[key]; exists {
+ return ErrPortMappedForIP
+ }
+
+ containerIP, containerPort := getIPAndPort(m.container)
+ if err := forward(iptables.Add, m.proto, hostIP, hostPort, containerIP.String(), containerPort); err != nil {
+ return err
+ }
+
+ p, err := newProxy(m.host, m.container)
+ if err != nil {
+ // need to undo the iptables rules before we return; an error from
+ // this rollback is dropped and the proxy error is reported instead
+ forward(iptables.Delete, m.proto, hostIP, hostPort, containerIP.String(), containerPort)
+ return err
+ }
+
+ m.userlandProxy = p
+ currentMappings[key] = m
+
+ go p.Run()
+
+ return nil
+}
+
+// Unmap tears down the mapping registered for the host address: it closes
+// the userland proxy, forgets the mapping and deletes the iptables rule.
+// ErrPortNotMapped is returned when no mapping exists for host.
+func Unmap(host net.Addr) error {
+	lock.Lock()
+	defer lock.Unlock()
+
+	key := getKey(host)
+	m, mapped := currentMappings[key]
+	if !mapped {
+		return ErrPortNotMapped
+	}
+
+	m.userlandProxy.Close()
+	delete(currentMappings, key)
+
+	hostIP, hostPort := getIPAndPort(m.host)
+	containerIP, containerPort := getIPAndPort(m.container)
+	return forward(iptables.Delete, m.proto, hostIP, hostPort, containerIP.String(), containerPort)
+}
+
+// getKey builds the lookup key "ip:port/proto" for a TCP or UDP address.
+// Any other address type yields the empty string.
+func getKey(a net.Addr) string {
+	var (
+		ip    net.IP
+		port  int
+		proto string
+	)
+	switch addr := a.(type) {
+	case *net.TCPAddr:
+		ip, port, proto = addr.IP, addr.Port, "tcp"
+	case *net.UDPAddr:
+		ip, port, proto = addr.IP, addr.Port, "udp"
+	default:
+		return ""
+	}
+	return fmt.Sprintf("%s:%d/%s", ip.String(), port, proto)
+}
+
+func getIPAndPort(a net.Addr) (net.IP, int) {
+ switch t := a.(type) {
+ case *net.TCPAddr:
+ return t.IP, t.Port
+ case *net.UDPAddr:
+ return t.IP, t.Port
+ }
+ return nil, 0
+}
+
+// forward applies the iptables action to the configured chain. When no chain
+// has been set it does nothing and reports success.
+func forward(action iptables.Action, proto string, sourceIP net.IP, sourcePort int, containerIP string, containerPort int) error {
+	if chain != nil {
+		return chain.Forward(action, sourceIP, sourcePort, proto, containerIP, containerPort)
+	}
+	return nil
+}
diff --git a/runtime/networkdriver/portmapper/mapper_test.go b/runtime/networkdriver/portmapper/mapper_test.go
new file mode 100644
index 0000000000..4c09f3c651
--- /dev/null
+++ b/runtime/networkdriver/portmapper/mapper_test.go
@@ -0,0 +1,107 @@
+package portmapper
+
+import (
+ "github.com/dotcloud/docker/pkg/iptables"
+ "github.com/dotcloud/docker/pkg/proxy"
+ "net"
+ "testing"
+)
+
+// init replaces the proxy constructor for every test in this package.
+func init() {
+ // override this func to mock out the proxy server
+ newProxy = proxy.NewStubProxy
+}
+
+// reset clears the configured chain and drops all recorded mappings. The
+// dropped mappings are discarded as-is; their proxies are not closed here.
+func reset() {
+ chain = nil
+ currentMappings = make(map[string]*mapping)
+}
+
+// TestSetIptablesChain checks that the package-level chain starts out nil
+// and is populated by SetIptablesChain.
+func TestSetIptablesChain(t *testing.T) {
+	defer reset()
+
+	if chain != nil {
+		t.Fatal("chain should be nil at init")
+	}
+
+	SetIptablesChain(&iptables.Chain{
+		Name:   "TEST",
+		Bridge: "192.168.1.1",
+	})
+
+	if chain == nil {
+		t.Fatal("chain should not be nil after set")
+	}
+}
+
+// TestMapPorts maps port 80 on two host IPs and checks that a host address
+// can be mapped only once, that each mapping can be removed, and that
+// removing a mapping twice is reported as an error.
+func TestMapPorts(t *testing.T) {
+ // dst* are the host-side addresses, src* the container-side ones.
+ dstIp1 := net.ParseIP("192.168.0.1")
+ dstIp2 := net.ParseIP("192.168.0.2")
+ dstAddr1 := &net.TCPAddr{IP: dstIp1, Port: 80}
+ dstAddr2 := &net.TCPAddr{IP: dstIp2, Port: 80}
+
+ srcAddr1 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.1")}
+ srcAddr2 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.2")}
+
+ if err := Map(srcAddr1, dstIp1, 80); err != nil {
+ t.Fatalf("Failed to allocate port: %s", err)
+ }
+
+ // The host address is the key: a second mapping fails for any container.
+ if Map(srcAddr1, dstIp1, 80) == nil {
+ t.Fatalf("Port is in use - mapping should have failed")
+ }
+
+ if Map(srcAddr2, dstIp1, 80) == nil {
+ t.Fatalf("Port is in use - mapping should have failed")
+ }
+
+ if err := Map(srcAddr2, dstIp2, 80); err != nil {
+ t.Fatalf("Failed to allocate port: %s", err)
+ }
+
+ if Unmap(dstAddr1) != nil {
+ t.Fatalf("Failed to release port")
+ }
+
+ if Unmap(dstAddr2) != nil {
+ t.Fatalf("Failed to release port")
+ }
+
+ if Unmap(dstAddr2) == nil {
+ t.Fatalf("Port already released, but no error reported")
+ }
+}
+
+func TestGetUDPKey(t *testing.T) {
+ addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53}
+
+ key := getKey(addr)
+
+ if expected := "192.168.1.5:53/udp"; key != expected {
+ t.Fatalf("expected key %s got %s", expected, key)
+ }
+}
+
+func TestGetTCPKey(t *testing.T) {
+ addr := &net.TCPAddr{IP: net.ParseIP("192.168.1.5"), Port: 80}
+
+ key := getKey(addr)
+
+ if expected := "192.168.1.5:80/tcp"; key != expected {
+ t.Fatalf("expected key %s got %s", expected, key)
+ }
+}
+
+func TestGetUDPIPAndPort(t *testing.T) {
+ addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53}
+
+ ip, port := getIPAndPort(addr)
+ if expected := "192.168.1.5"; ip.String() != expected {
+ t.Fatalf("expected ip %s got %s", expected, ip)
+ }
+
+ if ep := 53; port != ep {
+ t.Fatalf("expected port %d got %d", ep, port)
+ }
+}
diff --git a/runtime/networkdriver/utils.go b/runtime/networkdriver/utils.go
new file mode 100644
index 0000000000..0a4ef70c95
--- /dev/null
+++ b/runtime/networkdriver/utils.go
@@ -0,0 +1,118 @@
+package networkdriver
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "net"
+
+ "github.com/dotcloud/docker/pkg/netlink"
+)
+
+var (
+ // networkGetRoutesFct returns the current routes; it is a variable so
+ // tests can substitute a fixed routing table.
+ networkGetRoutesFct = netlink.NetworkGetRoutes
+ // ErrNoDefaultRoute is returned by GetDefaultRouteIface when none of the
+ // routes is marked as default.
+ ErrNoDefaultRoute = errors.New("no default route")
+)
+
+// CheckNameserverOverlaps parses every nameserver as a CIDR and returns
+// ErrNetworkOverlapsWithNameservers for the first one that overlaps toCheck.
+// A nameserver that cannot be parsed aborts the check with the parse error.
+func CheckNameserverOverlaps(nameservers []string, toCheck *net.IPNet) error {
+	for _, ns := range nameservers {
+		_, nsNetwork, err := net.ParseCIDR(ns)
+		if err != nil {
+			return err
+		}
+		if NetworkOverlaps(toCheck, nsNetwork) {
+			return ErrNetworkOverlapsWithNameservers
+		}
+	}
+	return nil
+}
+
+// CheckRouteOverlaps fetches the current routes and returns
+// ErrNetworkOverlaps when toCheck overlaps the network of any of them.
+// Routes without a network are ignored.
+func CheckRouteOverlaps(toCheck *net.IPNet) error {
+	routes, err := networkGetRoutesFct()
+	if err != nil {
+		return err
+	}
+
+	for _, route := range routes {
+		if route.IPNet == nil {
+			continue
+		}
+		if NetworkOverlaps(toCheck, route.IPNet) {
+			return ErrNetworkOverlaps
+		}
+	}
+	return nil
+}
+
+// NetworkOverlaps reports whether two networks overlap, i.e. whether the
+// first address of either one lies inside the other.
+func NetworkOverlaps(netX *net.IPNet, netY *net.IPNet) bool {
+	startX, _ := NetworkRange(netX)
+	if netY.Contains(startX) {
+		return true
+	}
+	// netY's range is only computed when the first check did not match.
+	startY, _ := NetworkRange(netY)
+	return netX.Contains(startY)
+}
+
+// Calculates the first and last IP addresses in an IPNet
+func NetworkRange(network *net.IPNet) (net.IP, net.IP) {
+ var (
+ netIP = network.IP.To4()
+ firstIP = netIP.Mask(network.Mask)
+ lastIP = net.IPv4(0, 0, 0, 0).To4()
+ )
+
+ for i := 0; i < len(lastIP); i++ {
+ lastIP[i] = netIP[i] | ^network.Mask[i]
+ }
+ return firstIP, lastIP
+}
+
+// NetworkSize returns the number of addresses covered by a netmask, computed
+// from its first four bytes: the inverted mask read as a big-endian 32-bit
+// value, plus one.
+func NetworkSize(mask net.IPMask) int32 {
+	hostBits := ^binary.BigEndian.Uint32(mask)
+	return int32(hostBits) + 1
+}
+
+// GetIfaceAddr returns the first IPv4 address of the named network interface.
+//
+// An error is returned when the interface cannot be found, its addresses
+// cannot be listed, or it has no IPv4 address. When it has several, a notice
+// is printed to standard output and the first one is used.
+func GetIfaceAddr(name string) (net.Addr, error) {
+	iface, err := net.InterfaceByName(name)
+	if err != nil {
+		return nil, err
+	}
+	addrs, err := iface.Addrs()
+	if err != nil {
+		return nil, err
+	}
+	var addrs4 []net.Addr
+	for _, addr := range addrs {
+		// net.Addr does not promise a concrete type; skip anything that is
+		// not an *net.IPNet instead of panicking on the type assertion.
+		ipNet, ok := addr.(*net.IPNet)
+		if !ok {
+			continue
+		}
+		if ip4 := ipNet.IP.To4(); ip4 != nil {
+			addrs4 = append(addrs4, addr)
+		}
+	}
+	switch {
+	case len(addrs4) == 0:
+		return nil, fmt.Errorf("Interface %v has no IP addresses", name)
+	case len(addrs4) > 1:
+		fmt.Printf("Interface %v has more than 1 IPv4 address. Defaulting to using %v\n",
+			name, (addrs4[0].(*net.IPNet)).IP)
+	}
+	return addrs4[0], nil
+}
+
+// GetDefaultRouteIface returns the interface of the first route marked as
+// default, or ErrNoDefaultRoute when there is none.
+func GetDefaultRouteIface() (*net.Interface, error) {
+	routes, err := networkGetRoutesFct()
+	if err != nil {
+		return nil, fmt.Errorf("unable to get routes: %v", err)
+	}
+	for i := range routes {
+		if routes[i].Default {
+			return routes[i].Iface, nil
+		}
+	}
+	return nil, ErrNoDefaultRoute
+}
diff --git a/runtime/runtime.go b/runtime/runtime.go
new file mode 100644
index 0000000000..98903cfa08
--- /dev/null
+++ b/runtime/runtime.go
@@ -0,0 +1,993 @@
+package runtime
+
+import (
+ "container/list"
+ "fmt"
+ "github.com/dotcloud/docker/archive"
+ "github.com/dotcloud/docker/daemonconfig"
+ "github.com/dotcloud/docker/dockerversion"
+ "github.com/dotcloud/docker/engine"
+ "github.com/dotcloud/docker/graph"
+ "github.com/dotcloud/docker/image"
+ "github.com/dotcloud/docker/pkg/graphdb"
+ "github.com/dotcloud/docker/pkg/mount"
+ "github.com/dotcloud/docker/pkg/selinux"
+ "github.com/dotcloud/docker/pkg/sysinfo"
+ "github.com/dotcloud/docker/runconfig"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "github.com/dotcloud/docker/runtime/execdriver/execdrivers"
+ "github.com/dotcloud/docker/runtime/execdriver/lxc"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ _ "github.com/dotcloud/docker/runtime/graphdriver/vfs"
+ _ "github.com/dotcloud/docker/runtime/networkdriver/bridge"
+ "github.com/dotcloud/docker/runtime/networkdriver/portallocator"
+ "github.com/dotcloud/docker/utils"
+ "io"
+ "io/ioutil"
+ "log"
+ "os"
+ "path"
+ "regexp"
+ "strings"
+ "sync"
+ "time"
+)
+
+// MaxImageDepth is set to the aufs default that most
+// kernels are compiled with.
+// For more information see: http://sourceforge.net/p/aufs/aufs3-standalone/ci/aufs3.12/tree/config.mk
+const MaxImageDepth = 127
+
+var (
+ // DefaultDns lists two DNS server addresses; where they are applied is
+ // outside this part of the file.
+ DefaultDns = []string{"8.8.8.8", "8.8.4.4"}
+ // validContainerNameChars is the character class allowed in a container name.
+ validContainerNameChars = `[a-zA-Z0-9_.-]`
+ // validContainerNamePattern matches a whole name: an optional leading
+ // slash followed by one or more allowed characters.
+ validContainerNamePattern = regexp.MustCompile(`^/?` + validContainerNameChars + `+$`)
+)
+
+// Runtime holds the state shared by the container management methods in this
+// file: the registered containers, their indexes, and the drivers in use.
+type Runtime struct {
+ repository string // directory holding one sub-directory per container ID (see containerRoot)
+ sysInitPath string
+ containers *list.List // registered *Container values, appended by Register
+ graph *graph.Graph
+ repositories *graph.TagStore
+ idIndex *utils.TruncIndex // index of registered container IDs, queried by Get
+ sysInfo *sysinfo.SysInfo
+ volumes *graph.Graph
+ srv Server
+ eng *engine.Engine
+ config *daemonconfig.Config
+ containerGraph *graphdb.Database // name graph; ensureName stores name -> container ID in it
+ driver graphdriver.Driver // graph driver; Destroy removes container filesystems through it
+ execDriver execdriver.Driver // exec driver; Register queries and terminates containers through it
+}
+
+// remountPrivate makes mountPoint a private mount. If the path is not a
+// mount point yet it is first bind-mounted onto itself, then the mount is
+// switched to private.
+func remountPrivate(mountPoint string) error {
+	isMounted, err := mount.Mounted(mountPoint)
+	if err != nil {
+		return err
+	}
+
+	if !isMounted {
+		err = mount.Mount(mountPoint, mountPoint, "none", "bind,rw")
+		if err != nil {
+			return err
+		}
+	}
+	return mount.ForceMount("", mountPoint, "none", "private")
+}
+
+// List returns an array of all containers registered in the runtime.
+// Every element of the runtime's container list is passed to History.Add,
+// so the order of the result is whatever History.Add produces.
+func (runtime *Runtime) List() []*Container {
+ containers := new(History)
+ for e := runtime.containers.Front(); e != nil; e = e.Next() {
+ containers.Add(e.Value.(*Container))
+ }
+ return *containers
+}
+
+// getContainerElement returns the list element holding the container with
+// the given full ID, or nil when no registered container has that ID.
+func (runtime *Runtime) getContainerElement(id string) *list.Element {
+	e := runtime.containers.Front()
+	for e != nil && e.Value.(*Container).ID != id {
+		e = e.Next()
+	}
+	return e
+}
+
+// Get returns the container matching the given name or ID. The name lookup
+// is tried first, then the ID index. nil is returned when nothing matches
+// or when a lookup fails.
+func (runtime *Runtime) Get(name string) *Container {
+	if byName, _ := runtime.GetByName(name); byName != nil {
+		return byName
+	}
+
+	id, err := runtime.idIndex.Get(name)
+	if err != nil {
+		return nil
+	}
+
+	if e := runtime.getContainerElement(id); e != nil {
+		return e.Value.(*Container)
+	}
+	return nil
+}
+
+// Exists returns true if a container of the specified ID or name exists,
+// false otherwise. It is a thin wrapper around Get.
+func (runtime *Runtime) Exists(id string) bool {
+ return runtime.Get(id) != nil
+}
+
+// containerRoot returns the directory of the container with the given ID:
+// the ID joined onto the runtime's repository path.
+func (runtime *Runtime) containerRoot(id string) string {
+ return path.Join(runtime.repository, id)
+}
+
+// load reads the contents of a container from disk.
+// This is typically done at startup.
+//
+// When the ID stored on disk differs from the requested one, the loaded
+// container is returned together with an error. A container whose saved
+// state says it is running is flagged as a ghost.
+func (runtime *Runtime) load(id string) (*Container, error) {
+ container := &Container{root: runtime.containerRoot(id)}
+ if err := container.FromDisk(); err != nil {
+ return nil, err
+ }
+ if container.ID != id {
+ return container, fmt.Errorf("Container %s is stored at %s", container.ID, id)
+ }
+ if container.State.IsRunning() {
+ container.State.SetGhost(true)
+ }
+ return container, nil
+}
+
+// Register makes a container object usable by the runtime as <container.ID>.
+//
+// It rejects containers that are already attached to a runtime or whose ID or
+// name is already known, validates the ID, makes sure the container has a
+// name, wires up its stdio streams and adds it to the runtime's list and ID
+// index. For a container whose saved state says "running" it then reconciles
+// that state with reality: ghosts are killed and marked stopped, and a
+// container that is not actually running is either restarted (AutoRestart)
+// or marked stopped with exit code -127.
+func (runtime *Runtime) Register(container *Container) error {
+ if container.runtime != nil || runtime.Exists(container.ID) {
+ return fmt.Errorf("Container is already loaded")
+ }
+ if err := validateID(container.ID); err != nil {
+ return err
+ }
+ if err := runtime.ensureName(container); err != nil {
+ return err
+ }
+
+ container.runtime = runtime
+
+ // Attach to stdout and stderr
+ container.stderr = utils.NewWriteBroadcaster()
+ container.stdout = utils.NewWriteBroadcaster()
+ // Attach to stdin
+ if container.Config.OpenStdin {
+ container.stdin, container.stdinPipe = io.Pipe()
+ } else {
+ container.stdinPipe = utils.NopWriteCloser(ioutil.Discard) // Silently drop stdin
+ }
+ // done
+ runtime.containers.PushBack(container)
+ runtime.idIndex.Add(container.ID)
+
+ // FIXME: if the container is supposed to be running but is not, auto restart it?
+ // if so, then we need to restart monitor and init a new lock
+ // If the container is supposed to be running, make sure of it
+ if container.State.IsRunning() {
+ if container.State.IsGhost() {
+ utils.Debugf("killing ghost %s", container.ID)
+
+ // Remember the pid before the state is reset to stopped.
+ existingPid := container.State.Pid
+ container.State.SetGhost(false)
+ container.State.SetStopped(0)
+
+ // We only have to handle this for lxc because the other drivers will ensure that
+ // no ghost processes are left when docker dies
+ if container.ExecDriver == "" || strings.Contains(container.ExecDriver, "lxc") {
+ lxc.KillLxc(container.ID, 9)
+ } else {
+ // use the current driver and ensure that the container is dead x.x
+ cmd := &execdriver.Command{
+ ID: container.ID,
+ }
+ var err error
+ cmd.Process, err = os.FindProcess(existingPid)
+ if err != nil {
+ utils.Debugf("cannot find existing process for %d", existingPid)
+ }
+ // The result of Terminate is not checked.
+ runtime.execDriver.Terminate(cmd)
+ }
+ // Unmount and save failures are only logged; registration continues.
+ if err := container.Unmount(); err != nil {
+ utils.Debugf("ghost unmount error %s", err)
+ }
+ if err := container.ToDisk(); err != nil {
+ utils.Debugf("saving ghost state to disk %s", err)
+ }
+ }
+
+ info := runtime.execDriver.Info(container.ID)
+ if !info.IsRunning() {
+ utils.Debugf("Container %s was supposed to be running but is not.", container.ID)
+ if runtime.config.AutoRestart {
+ utils.Debugf("Restarting")
+ if err := container.Unmount(); err != nil {
+ utils.Debugf("restart unmount error %s", err)
+ }
+
+ container.State.SetGhost(false)
+ container.State.SetStopped(0)
+ if err := container.Start(); err != nil {
+ return err
+ }
+ } else {
+ utils.Debugf("Marking as stopped")
+ container.State.SetStopped(-127)
+ if err := container.ToDisk(); err != nil {
+ return err
+ }
+ }
+ }
+ } else {
+ // When the container is not running, we still initialize the waitLock
+ // chan and close it. Receiving on nil chan blocks whereas receiving on a
+ // closed chan does not. In this case we do not want to block.
+ container.waitLock = make(chan struct{})
+ close(container.waitLock)
+ }
+ return nil
+}
+
+// ensureName gives a container without a name a generated one, falling back
+// to the truncated container ID when generation fails. The name is saved to
+// disk and recorded in the container graph unless it already exists there.
+// Failures while saving or recording are only logged; the function always
+// returns nil.
+func (runtime *Runtime) ensureName(container *Container) error {
+ if container.Name == "" {
+ name, err := generateRandomName(runtime)
+ if err != nil {
+ name = utils.TruncateID(container.ID)
+ }
+ container.Name = name
+
+ if err := container.ToDisk(); err != nil {
+ utils.Debugf("Error saving container name %s", err)
+ }
+ if !runtime.containerGraph.Exists(name) {
+ if _, err := runtime.containerGraph.Set(name, container.ID); err != nil {
+ utils.Debugf("Setting default id - %s", err)
+ }
+ }
+ }
+ return nil
+}
+
+// LogToDisk opens dst for appending (creating it with mode 0600 if it does
+// not exist) and attaches the file to src as a writer for the given stream.
+// The opened file is handed to src; this function does not close it.
+func (runtime *Runtime) LogToDisk(src *utils.WriteBroadcaster, dst, stream string) error {
+	const openFlags = os.O_RDWR | os.O_APPEND | os.O_CREATE
+
+	logFile, openErr := os.OpenFile(dst, openFlags, 0600)
+	if openErr != nil {
+		return openErr
+	}
+	src.AddWriter(logFile, stream)
+	return nil
+}
+
+// Destroy unregisters a container from the runtime and removes its contents
+// from the filesystem.
+//
+// The container is stopped via Stop(3), its own layer and its "-init" layer
+// are removed through the graph driver, its entries are purged from the link
+// graph (a failure there is only logged), it is removed from the runtime's
+// index and container list, and finally its directory is deleted.
+//
+// An error is returned when container is nil, when it is not registered with
+// this runtime, when it cannot be stopped, or when a driver or filesystem
+// removal fails.
+func (runtime *Runtime) Destroy(container *Container) error {
+	if container == nil {
+		return fmt.Errorf("The given container is <nil>")
+	}
+
+	id := container.ID
+	registered := runtime.getContainerElement(id)
+	if registered == nil {
+		return fmt.Errorf("Container %v not found - maybe it was already destroyed?", id)
+	}
+
+	if stopErr := container.Stop(3); stopErr != nil {
+		return stopErr
+	}
+
+	// Remove the container's own layer first, then the init layer beneath it.
+	if rmErr := runtime.driver.Remove(id); rmErr != nil {
+		return fmt.Errorf("Driver %s failed to remove root filesystem %s: %s", runtime.driver, id, rmErr)
+	}
+	initLayer := id + "-init"
+	if rmErr := runtime.driver.Remove(initLayer); rmErr != nil {
+		return fmt.Errorf("Driver %s failed to remove init filesystem %s: %s", runtime.driver, initLayer, rmErr)
+	}
+
+	if _, purgeErr := runtime.containerGraph.Purge(id); purgeErr != nil {
+		utils.Debugf("Unable to remove container from link graph: %s", purgeErr)
+	}
+
+	// Deregister the container before removing its directory, to avoid race conditions
+	runtime.idIndex.Delete(id)
+	runtime.containers.Remove(registered)
+	if rmErr := os.RemoveAll(container.root); rmErr != nil {
+		return fmt.Errorf("Unable to remove filesystem for %v: %v", id, rmErr)
+	}
+	return nil
+}
+
+// restore loads every container found in the runtime's repository directory
+// and registers it with the runtime.
+//
+// Containers that fail to load are logged and skipped. A container is kept
+// only if it was created with the current graph driver, or if it records no
+// driver at all and the current driver is "aufs". Containers known to the
+// link graph are registered first; any that remain are given a generated
+// name (or their truncated ID), added to the graph, and then registered.
+// Registration failures are only logged. The only error returned is a
+// failure to read the repository directory.
+func (runtime *Runtime) restore() error {
+	// Progress output is printed only when neither DEBUG nor TEST is set.
+	progress := func(text string) {
+		if os.Getenv("DEBUG") == "" && os.Getenv("TEST") == "" {
+			fmt.Print(text)
+		}
+	}
+
+	progress("Loading containers: ")
+	entries, err := ioutil.ReadDir(runtime.repository)
+	if err != nil {
+		return err
+	}
+
+	var (
+		pending       = make(map[string]*Container)
+		currentDriver = runtime.driver.String()
+	)
+
+	for _, entry := range entries {
+		id := entry.Name()
+		loaded, loadErr := runtime.load(id)
+		progress(".")
+		if loadErr != nil {
+			utils.Errorf("Failed to load container %v: %v", id, loadErr)
+			continue
+		}
+
+		// Ignore the container if it does not support the current driver being used by the graph
+		legacyAufs := loaded.Driver == "" && currentDriver == "aufs"
+		if !legacyAufs && loaded.Driver != currentDriver {
+			utils.Debugf("Cannot load container %s because it was created with another graph driver.", loaded.ID)
+			continue
+		}
+		utils.Debugf("Loaded container %v", loaded.ID)
+		pending[loaded.ID] = loaded
+	}
+
+	register := func(c *Container) {
+		if regErr := runtime.Register(c); regErr != nil {
+			utils.Debugf("Failed to register container %s: %s", c.ID, regErr)
+		}
+	}
+
+	// First pass: register the containers the link graph already knows about.
+	entities := runtime.containerGraph.List("/", -1)
+	if entities != nil {
+		for _, p := range entities.Paths() {
+			progress(".")
+			entityID := entities[p].ID()
+			known, found := pending[entityID]
+			if !found {
+				continue
+			}
+			register(known)
+			delete(pending, entityID)
+		}
+	}
+
+	// Any containers that are left over do not exist in the graph
+	for _, orphan := range pending {
+		// Try to set the default name for a container if it exists prior to links
+		generated, genErr := generateRandomName(runtime)
+		if genErr != nil {
+			generated = utils.TruncateID(orphan.ID)
+		}
+		orphan.Name = generated
+
+		if _, setErr := runtime.containerGraph.Set(orphan.Name, orphan.ID); setErr != nil {
+			utils.Debugf("Setting default id - %s", setErr)
+		}
+		register(orphan)
+	}
+
+	progress(": done.\n")
+	return nil
+}
+
+// Create creates a new container from the given configuration with a given name.
+//
+// The steps run in order: look up the image named by config.Image, check its
+// depth, merge the image configuration into config and validate it, build
+// the Container value, create its root filesystem, save it with ToDisk and
+// register it with the runtime. The first failing step aborts creation and
+// its error is returned with a nil container and nil warnings. On success
+// the warnings produced by the configuration merge are returned.
+func (runtime *Runtime) Create(config *runconfig.Config, name string) (*Container, []string, error) {
+	img, err := runtime.repositories.LookupImage(config.Image)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	err = runtime.checkImageDepth(img)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	warnings, err := runtime.mergeAndVerifyConfig(config, img)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	container, err := runtime.newContainer(name, config, img)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	err = runtime.createRootfs(container, img)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	err = container.ToDisk()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	err = runtime.Register(container)
+	if err != nil {
+		return nil, nil, err
+	}
+	return container, warnings, nil
+}
+
+// checkImageDepth rejects images whose depth leaves no room for a container.
+//
+// Two layers are added to the image depth because the container's rw layer
+// and its init layer count against the same limit. An error is returned if
+// the depth cannot be determined or if depth+2 reaches MaxImageDepth.
+func (runtime *Runtime) checkImageDepth(img *image.Image) error {
+	depth, err := img.Depth()
+	switch {
+	case err != nil:
+		return err
+	case depth+2 >= MaxImageDepth:
+		return fmt.Errorf("Cannot create container with more than %d parents", MaxImageDepth)
+	}
+	return nil
+}
+
+// checkDeprecatedExpose reports whether any entry of config.PortSpecs
+// contains a ":". A nil config, or one without port specs, yields false.
+func (runtime *Runtime) checkDeprecatedExpose(config *runconfig.Config) bool {
+	if config == nil {
+		return false
+	}
+	// Ranging over a nil slice is a no-op, so no separate nil check is needed.
+	for _, spec := range config.PortSpecs {
+		if strings.Contains(spec, ":") {
+			return true
+		}
+	}
+	return false
+}
+
+// mergeAndVerifyConfig merges the image's configuration into config and
+// checks that the result names something to run.
+//
+// It returns a (possibly empty, never nil) list of warnings; currently the
+// only warning is emitted when either configuration uses the deprecated
+// port-spec form detected by checkDeprecatedExpose. An error is returned if
+// the merge fails or if both Entrypoint and Cmd are empty after the merge.
+func (runtime *Runtime) mergeAndVerifyConfig(config *runconfig.Config, img *image.Image) ([]string, error) {
+	// Evaluated before the merge so the image and user configs are inspected as given.
+	deprecated := runtime.checkDeprecatedExpose(img.Config) || runtime.checkDeprecatedExpose(config)
+
+	warnings := make([]string, 0, 1)
+	if deprecated {
+		warnings = append(warnings, "The mapping to public ports on your host via Dockerfile EXPOSE (host:port:port) has been deprecated. Use -p to publish the ports.")
+	}
+
+	if imgConfig := img.Config; imgConfig != nil {
+		if mergeErr := runconfig.Merge(config, imgConfig); mergeErr != nil {
+			return nil, mergeErr
+		}
+	}
+
+	if len(config.Entrypoint)+len(config.Cmd) == 0 {
+		return nil, fmt.Errorf("No command specified")
+	}
+	return warnings, nil
+}
+
+// generateIdAndName produces a fresh random container ID and reserves a name
+// for it in the container graph.
+//
+// An empty name is replaced by a generated one (or by the truncated ID if
+// generation fails); a non-empty name must match validContainerNamePattern.
+// The name is stored with a leading "/". If the name is already present in
+// the graph and resolves to a registered container, a conflict error is
+// returned. If it is present but the container lookup fails for a reason
+// other than "Could not find entity", the existing graph entry is deleted
+// and the id and name are returned.
+func (runtime *Runtime) generateIdAndName(name string) (string, string, error) {
+	id := utils.GenerateRandomID()
+
+	if name != "" {
+		if !validContainerNamePattern.MatchString(name) {
+			return "", "", fmt.Errorf("Invalid container name (%s), only %s are allowed", name, validContainerNameChars)
+		}
+	} else {
+		generated, genErr := generateRandomName(runtime)
+		if genErr != nil {
+			generated = utils.TruncateID(id)
+		}
+		name = generated
+	}
+	if name[0] != '/' {
+		name = "/" + name
+	}
+
+	// Set the entity in the graph using the default name specified
+	_, setErr := runtime.containerGraph.Set(name, id)
+	if setErr == nil {
+		return id, name, nil
+	}
+	if !graphdb.IsNonUniqueNameError(setErr) {
+		return "", "", setErr
+	}
+
+	// The name is taken: find out whether a live container still owns it.
+	holder, lookupErr := runtime.GetByName(name)
+	if lookupErr == nil {
+		nameAsKnownByUser := strings.TrimPrefix(name, "/")
+		return "", "", fmt.Errorf(
+			"Conflict, The name %s is already assigned to %s. You have to delete (or rename) that container to be able to assign %s to a container again.", nameAsKnownByUser,
+			utils.TruncateID(holder.ID), nameAsKnownByUser)
+	}
+	if strings.Contains(lookupErr.Error(), "Could not find entity") {
+		return "", "", lookupErr
+	}
+
+	// Remove name and continue starting the container
+	// NOTE(review): the name is deleted here but not set again for the new id
+	// in this function; confirm a later step re-registers it.
+	if delErr := runtime.containerGraph.Delete(name); delErr != nil {
+		return "", "", delErr
+	}
+	return id, name, nil
+}
+
+// generateHostname fills in config.Hostname with the first 12 characters of
+// id when no hostname was configured. id must be at least 12 bytes long.
+// FIXME: the lxc template no longer needs to set a default hostname
+func (runtime *Runtime) generateHostname(id string, config *runconfig.Config) {
+	if config.Hostname != "" {
+		return
+	}
+	// Generate default hostname
+	config.Hostname = id[:12]
+}
+
+// getEntrypointAndArgs splits the configured command line into the program
+// to run and its arguments.
+//
+// The full command line is config.Entrypoint followed by config.Cmd; its
+// first element is returned as the entrypoint and the rest as args. This
+// covers both cases: with an entrypoint, args are the remaining entrypoint
+// elements plus Cmd; without one, Cmd[0] is the program and Cmd[1:] its args.
+//
+// The returned args live in a freshly allocated slice, so later writes to
+// them cannot alter config.Entrypoint or config.Cmd (the previous code
+// appended onto a sub-slice of config.Entrypoint, sharing its backing
+// array). args is never nil, even when empty.
+//
+// At least one of config.Entrypoint and config.Cmd must be non-empty;
+// mergeAndVerifyConfig enforces this before containers are created.
+func (runtime *Runtime) getEntrypointAndArgs(config *runconfig.Config) (string, []string) {
+	command := make([]string, 0, len(config.Entrypoint)+len(config.Cmd))
+	command = append(command, config.Entrypoint...)
+	command = append(command, config.Cmd...)
+	return command[0], command[1:]
+}
+
+// newContainer builds the in-memory Container for a new container.
+//
+// It reserves an ID and name through generateIdAndName, sets a default
+// hostname on config if none was given, derives the executable path and
+// arguments from config, and records the resolved image ID together with the
+// names of the graph and exec drivers currently in use. Nothing is written
+// to disk here. An error is returned only if the ID/name cannot be reserved.
+func (runtime *Runtime) newContainer(name string, config *runconfig.Config, img *image.Image) (*Container, error) {
+	id, name, err := runtime.generateIdAndName(name)
+	if err != nil {
+		return nil, err
+	}
+
+	runtime.generateHostname(id, config)
+	path, arguments := runtime.getEntrypointAndArgs(config)
+
+	c := &Container{
+		ID:              id,
+		Created:         time.Now().UTC(),
+		Path:            path,
+		Args:            arguments, //FIXME: de-duplicate from config
+		Config:          config,
+		hostConfig:      &runconfig.HostConfig{},
+		Image:           img.ID, // Always use the resolved image id
+		NetworkSettings: &NetworkSettings{},
+		Name:            name,
+		Driver:          runtime.driver.String(),
+		ExecDriver:      runtime.execDriver.Name(),
+	}
+	c.root = runtime.containerRoot(c.ID)
+	return c, nil
+}
+
+// createRootfs prepares the on-disk layers for a new container.
+//
+// It creates the container directory, creates an "<id>-init" layer on top of
+// the image, populates that layer with graph.SetupInitLayer, and finally
+// creates the container's own layer on top of the init layer. The init layer
+// obtained with driver.Get is released with driver.Put when the function
+// returns. The first failing step's error is returned.
+func (runtime *Runtime) createRootfs(container *Container, img *image.Image) error {
+	// Step 1: create the container directory.
+	// This doubles as a barrier to avoid race conditions.
+	mkErr := os.Mkdir(container.root, 0700)
+	if mkErr != nil {
+		return mkErr
+	}
+
+	initLayer := container.ID + "-init"
+	createErr := runtime.driver.Create(initLayer, img.ID, "")
+	if createErr != nil {
+		return createErr
+	}
+
+	initPath, getErr := runtime.driver.Get(initLayer)
+	if getErr != nil {
+		return getErr
+	}
+	defer runtime.driver.Put(initLayer)
+
+	setupErr := graph.SetupInitLayer(initPath)
+	if setupErr != nil {
+		return setupErr
+	}
+
+	return runtime.driver.Create(container.ID, initLayer, "")
+}
+
+// Commit creates a new filesystem image from the current state of a container.
+// The image can optionally be tagged into a repository.
+//
+// The container is mounted for the duration of the call, its read-write
+// layer is exported, and a new image is created in the graph from that layer
+// using the container's ID, image and config plus the supplied comment,
+// author and config. When repository is non-empty the image is also tagged
+// as repository:tag; if tagging fails the created image is still returned
+// together with the error.
+//
+// A nil container is rejected up front. Previously the nil check came only
+// after the container had already been mounted and exported, so it could
+// never take effect.
+func (runtime *Runtime) Commit(container *Container, repository, tag, comment, author string, config *runconfig.Config) (*image.Image, error) {
+	if container == nil {
+		return nil, fmt.Errorf("The given container is <nil>")
+	}
+
+	// FIXME: freeze the container before copying it to avoid data corruption?
+	if err := container.Mount(); err != nil {
+		return nil, err
+	}
+	defer container.Unmount()
+
+	rwTar, err := container.ExportRw()
+	if err != nil {
+		return nil, err
+	}
+	defer rwTar.Close()
+
+	// Create a new image from the container's base layers + a new layer from container changes
+	img, err := runtime.graph.Create(rwTar, container.ID, container.Image, comment, author, container.Config, config)
+	if err != nil {
+		return nil, err
+	}
+
+	// Register the image if needed
+	if repository != "" {
+		if err := runtime.repositories.Set(repository, tag, img.ID, true); err != nil {
+			return img, err
+		}
+	}
+	return img, nil
+}
+
+// GetFullContainerName returns name with a leading "/" added if it does not
+// already start with one. An empty name is an error.
+func GetFullContainerName(name string) (string, error) {
+	switch {
+	case name == "":
+		return "", fmt.Errorf("Container name cannot be empty")
+	case name[0] == '/':
+		return name, nil
+	}
+	return "/" + name, nil
+}
+
+// GetByName returns the registered container whose graph name is name.
+//
+// The name is normalised with GetFullContainerName before it is looked up in
+// the container graph. An error is returned if the name is empty, if the
+// graph has no entity for it, or if the entity's ID does not correspond to a
+// container registered with this runtime.
+func (runtime *Runtime) GetByName(name string) (*Container, error) {
+	fullName, nameErr := GetFullContainerName(name)
+	if nameErr != nil {
+		return nil, nameErr
+	}
+
+	entity := runtime.containerGraph.Get(fullName)
+	if entity == nil {
+		return nil, fmt.Errorf("Could not find entity for %s", name)
+	}
+
+	entityID := entity.ID()
+	element := runtime.getContainerElement(entityID)
+	if element == nil {
+		return nil, fmt.Errorf("Could not find container for entity id %s", entityID)
+	}
+	return element.Value.(*Container), nil
+}
+
+// Children returns the containers found by walking the container graph from
+// name, keyed by the graph path at which each one was found.
+//
+// The name is normalised with GetFullContainerName first. The walk is made
+// with a depth argument of 0 (presumably "direct children only" - confirm
+// against graphdb.Walk). An error is returned if the name is empty, if the
+// walk fails, or if an entity in the graph has no matching container in this
+// runtime.
+func (runtime *Runtime) Children(name string) (map[string]*Container, error) {
+	name, err := GetFullContainerName(name)
+	if err != nil {
+		return nil, err
+	}
+	children := make(map[string]*Container)
+
+	err = runtime.containerGraph.Walk(name, func(p string, e *graphdb.Entity) error {
+		c := runtime.Get(e.ID())
+		if c == nil {
+			// p is the graph path (name) and e.ID() the container id; the
+			// arguments used to be passed in the opposite order.
+			return fmt.Errorf("Could not get container for name %s and id %s", p, e.ID())
+		}
+		children[p] = c
+		return nil
+	}, 0)
+
+	if err != nil {
+		return nil, err
+	}
+	return children, nil
+}
+
+// RegisterLink records child under parent in the container graph using the
+// path parent.Name/alias. If that path already exists nothing is changed and
+// nil is returned; otherwise the error from setting the path is returned.
+func (runtime *Runtime) RegisterLink(parent, child *Container, alias string) error {
+	linkPath := path.Join(parent.Name, alias)
+	if runtime.containerGraph.Exists(linkPath) {
+		return nil
+	}
+	_, setErr := runtime.containerGraph.Set(linkPath, child.ID)
+	return setErr
+}
+
+// NewRuntime creates a Runtime for the given daemon configuration and engine.
+// It delegates entirely to NewRuntimeFromDirectory.
+// FIXME: harmonize with NewGraph()
+func NewRuntime(config *daemonconfig.Config, eng *engine.Engine) (*Runtime, error) {
+	return NewRuntimeFromDirectory(config, eng)
+}
+
+// NewRuntimeFromDirectory builds a Runtime rooted at config.Root and restores
+// the containers already stored there.
+//
+// In order, it: disables SELinux support unless enabled in config, selects
+// and loads the graph driver, remounts config.Root as private, creates the
+// "containers" directory, runs the aufs migration (a no-op for other
+// drivers), opens the images graph, the volumes graph (always on the "vfs"
+// driver) and the tag store, runs the "init_networkdriver" engine job unless
+// networking is disabled, opens the link graph database, locates dockerinit
+// and copies it under config.Root, creates the exec driver, and finally
+// checks the DNS configuration and restores existing containers.
+//
+// Any failing step aborts construction and returns a nil Runtime with the
+// error.
+func NewRuntimeFromDirectory(config *daemonconfig.Config, eng *engine.Engine) (*Runtime, error) {
+	if !config.EnableSelinuxSupport {
+		selinux.SetDisabled()
+	}
+
+	// Set the default driver
+	graphdriver.DefaultDriver = config.GraphDriver
+
+	// Load storage driver
+	driver, err := graphdriver.New(config.Root)
+	if err != nil {
+		return nil, err
+	}
+	utils.Debugf("Using graph driver %s", driver)
+
+	if err := remountPrivate(config.Root); err != nil {
+		return nil, err
+	}
+
+	// Directory holding one sub-directory per container.
+	runtimeRepo := path.Join(config.Root, "containers")
+
+	if err := os.MkdirAll(runtimeRepo, 0700); err != nil && !os.IsExist(err) {
+		return nil, err
+	}
+
+	// Migrate the container if it is aufs and aufs is enabled
+	if err = migrateIfAufs(driver, config.Root); err != nil {
+		return nil, err
+	}
+
+	utils.Debugf("Creating images graph")
+	g, err := graph.NewGraph(path.Join(config.Root, "graph"), driver)
+	if err != nil {
+		return nil, err
+	}
+
+	// We don't want to use a complex driver like aufs or devmapper
+	// for volumes, just a plain filesystem
+	volumesDriver, err := graphdriver.GetDriver("vfs", config.Root)
+	if err != nil {
+		return nil, err
+	}
+	utils.Debugf("Creating volumes graph")
+	volumes, err := graph.NewGraph(path.Join(config.Root, "volumes"), volumesDriver)
+	if err != nil {
+		return nil, err
+	}
+	utils.Debugf("Creating repository list")
+	// The tag store file name is specific to the graph driver in use.
+	repositories, err := graph.NewTagStore(path.Join(config.Root, "repositories-"+driver.String()), g)
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't create Tag store: %s", err)
+	}
+
+	if !config.DisableNetwork {
+		// Network setup is delegated to an engine job; the relevant daemon
+		// settings are passed to it through its environment.
+		job := eng.Job("init_networkdriver")
+
+		job.SetenvBool("EnableIptables", config.EnableIptables)
+		job.SetenvBool("InterContainerCommunication", config.InterContainerCommunication)
+		job.SetenvBool("EnableIpForward", config.EnableIpForward)
+		job.Setenv("BridgeIface", config.BridgeIface)
+		job.Setenv("BridgeIP", config.BridgeIP)
+		job.Setenv("DefaultBindingIP", config.DefaultIp.String())
+
+		if err := job.Run(); err != nil {
+			return nil, err
+		}
+	}
+
+	graphdbPath := path.Join(config.Root, "linkgraph.db")
+	// NOTE: from here on the local variable "graph" shadows the imported
+	// graph package; the package is not used again below.
+	graph, err := graphdb.NewSqliteConn(graphdbPath)
+	if err != nil {
+		return nil, err
+	}
+
+	// localCopy is the version-specific location of dockerinit under config.Root.
+	localCopy := path.Join(config.Root, "init", fmt.Sprintf("dockerinit-%s", dockerversion.VERSION))
+	sysInitPath := utils.DockerInitPath(localCopy)
+	if sysInitPath == "" {
+		return nil, fmt.Errorf("Could not locate dockerinit: This usually means docker was built incorrectly. See http://docs.docker.io/en/latest/contributing/devenvironment for official build instructions.")
+	}
+
+	if sysInitPath != localCopy {
+		// A suitable dockerinit binary was found somewhere other than
+		// localCopy. Copy it into config.Root at localCopy for future use, so
+		// that the original can go away without that being a problem (for
+		// example during a package upgrade).
+		if err := os.Mkdir(path.Dir(localCopy), 0700); err != nil && !os.IsExist(err) {
+			return nil, err
+		}
+		if _, err := utils.CopyFile(sysInitPath, localCopy); err != nil {
+			return nil, err
+		}
+		if err := os.Chmod(localCopy, 0700); err != nil {
+			return nil, err
+		}
+		sysInitPath = localCopy
+	}
+
+	sysInfo := sysinfo.New(false)
+	ed, err := execdrivers.NewDriver(config.ExecDriver, config.Root, sysInitPath, sysInfo)
+	if err != nil {
+		return nil, err
+	}
+
+	runtime := &Runtime{
+		repository:     runtimeRepo,
+		containers:     list.New(),
+		graph:          g,
+		repositories:   repositories,
+		idIndex:        utils.NewTruncIndex(),
+		sysInfo:        sysInfo,
+		volumes:        volumes,
+		config:         config,
+		containerGraph: graph,
+		driver:         driver,
+		sysInitPath:    sysInitPath,
+		execDriver:     ed,
+		eng:            eng,
+	}
+
+	// May replace config.Dns with DefaultDns; see checkLocaldns.
+	if err := runtime.checkLocaldns(); err != nil {
+		return nil, err
+	}
+	// Load and register the containers already present on disk.
+	if err := runtime.restore(); err != nil {
+		return nil, err
+	}
+	return runtime, nil
+}
+
+// shutdown asks every running container to stop and waits for all of them.
+//
+// Each running container is sent signal 15 in its own goroutine, which then
+// blocks in Wait until the container has exited. A failure to deliver the
+// signal is only logged. The function returns once every goroutine is done
+// and always returns nil.
+func (runtime *Runtime) shutdown() error {
+	var pending sync.WaitGroup
+
+	utils.Debugf("starting clean shutdown of all containers...")
+	for _, container := range runtime.List() {
+		if !container.State.IsRunning() {
+			continue
+		}
+		utils.Debugf("stopping %s", container.ID)
+		pending.Add(1)
+
+		// The container is passed as an argument so that each goroutine
+		// works on its own container rather than the shared loop variable.
+		go func(c *Container) {
+			defer pending.Done()
+			if killErr := c.KillSig(15); killErr != nil {
+				utils.Debugf("kill 15 error for %s - %s", c.ID, killErr)
+			}
+			c.Wait()
+			utils.Debugf("container stopped %s", c.ID)
+		}(container)
+	}
+	pending.Wait()
+
+	return nil
+}
+
+// Close shuts the runtime down: it stops running containers, releases all
+// allocated ports, cleans up the graph driver and closes the container
+// graph. Every step is attempted even if an earlier one failed. Each failure
+// is logged, and if there were any, a single error is returned whose message
+// is the individual messages joined with ", ".
+func (runtime *Runtime) Close() error {
+	var failures []string
+	note := func(format string, err error) {
+		utils.Errorf(format, err)
+		failures = append(failures, err.Error())
+	}
+
+	if err := runtime.shutdown(); err != nil {
+		note("runtime.shutdown(): %s", err)
+	}
+	if err := portallocator.ReleaseAll(); err != nil {
+		note("portallocator.ReleaseAll(): %s", err)
+	}
+	if err := runtime.driver.Cleanup(); err != nil {
+		note("runtime.driver.Cleanup(): %s", err)
+	}
+	if err := runtime.containerGraph.Close(); err != nil {
+		note("runtime.containerGraph.Close(): %s", err)
+	}
+
+	if len(failures) == 0 {
+		return nil
+	}
+	return fmt.Errorf("%s", strings.Join(failures, ", "))
+}
+
+// Mount obtains the container's filesystem path from the graph driver and
+// records it in container.basefs.
+//
+// The first successful call stores the path. Later calls must get the same
+// path back from the driver; a different path is reported as an error. An
+// error is also returned when the driver cannot provide the path.
+func (runtime *Runtime) Mount(container *Container) error {
+	dir, getErr := runtime.driver.Get(container.ID)
+	if getErr != nil {
+		return fmt.Errorf("Error getting container %s from driver %s: %s", container.ID, runtime.driver, getErr)
+	}
+
+	switch container.basefs {
+	case "":
+		container.basefs = dir
+	case dir:
+		// Same path as recorded before; nothing to update.
+	default:
+		return fmt.Errorf("Error: driver %s is returning inconsistent paths for container %s ('%s' then '%s')",
+			runtime.driver, container.ID, container.basefs, dir)
+	}
+	return nil
+}
+
+// Unmount releases the container's filesystem by calling Put on the graph
+// driver for the container's ID. It always returns nil.
+func (runtime *Runtime) Unmount(container *Container) error {
+	runtime.driver.Put(container.ID)
+	return nil
+}
+
+// Changes returns the filesystem changes made in the container relative to
+// its init layer.
+//
+// If the graph driver implements graphdriver.Differ the driver computes the
+// changes. Otherwise both the container layer and its "<id>-init" layer are
+// obtained from the driver, compared with archive.ChangesDirs, and released
+// again when the function returns.
+//
+// Error messages name the receiver's driver (as Mount does) rather than
+// going through container.runtime, and the init-layer message reports the
+// init layer's id.
+func (runtime *Runtime) Changes(container *Container) ([]archive.Change, error) {
+	if differ, ok := runtime.driver.(graphdriver.Differ); ok {
+		return differ.Changes(container.ID)
+	}
+
+	cDir, err := runtime.driver.Get(container.ID)
+	if err != nil {
+		return nil, fmt.Errorf("Error getting container rootfs %s from driver %s: %s", container.ID, runtime.driver, err)
+	}
+	defer runtime.driver.Put(container.ID)
+
+	initID := container.ID + "-init"
+	initDir, err := runtime.driver.Get(initID)
+	if err != nil {
+		return nil, fmt.Errorf("Error getting container init rootfs %s from driver %s: %s", initID, runtime.driver, err)
+	}
+	defer runtime.driver.Put(initID)
+
+	return archive.ChangesDirs(cDir, initDir)
+}
+
+// Diff returns an archive of the filesystem changes made in the container.
+//
+// If the graph driver implements graphdriver.Differ the driver produces the
+// archive. Otherwise the change list is computed with Changes, the container
+// layer is obtained from the driver and the changes are exported from it.
+//
+// The container layer stays held while the returned archive is in use:
+// closing the archive also calls driver.Put for the container. If exporting
+// fails, the layer is released immediately, so a failed Diff no longer
+// leaves a driver reference outstanding.
+func (runtime *Runtime) Diff(container *Container) (archive.Archive, error) {
+	if differ, ok := runtime.driver.(graphdriver.Differ); ok {
+		return differ.Diff(container.ID)
+	}
+
+	changes, err := runtime.Changes(container)
+	if err != nil {
+		return nil, err
+	}
+
+	cDir, err := runtime.driver.Get(container.ID)
+	if err != nil {
+		return nil, fmt.Errorf("Error getting container rootfs %s from driver %s: %s", container.ID, runtime.driver, err)
+	}
+
+	// Named changesTar so the archive package is not shadowed.
+	changesTar, err := archive.ExportChanges(cDir, changes)
+	if err != nil {
+		// Balance the Get above; nobody will call the wrapper's close func.
+		runtime.driver.Put(container.ID)
+		return nil, err
+	}
+	return utils.NewReadCloserWrapper(changesTar, func() error {
+		err := changesTar.Close()
+		runtime.driver.Put(container.ID)
+		return err
+	}), nil
+}
+
+// Run hands the container's command, the given pipes and the start callback
+// to the exec driver's Run method and returns its results unchanged
+// (presumably the exit code and any error - confirm against execdriver).
+func (runtime *Runtime) Run(c *Container, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) {
+	return runtime.execDriver.Run(c.command, pipes, startCallback)
+}
+
+// Kill passes the container's command and the signal number sig to the exec
+// driver's Kill method and returns its error.
+func (runtime *Runtime) Kill(c *Container, sig int) error {
+	return runtime.execDriver.Kill(c.command, sig)
+}
+
+// Nuke kills all containers then removes all content
+// from the content root, including images, volumes and
+// container filesystems.
+// Again: this will remove your entire docker runtime!
+//
+// Containers are killed concurrently and the results of Kill and of Close
+// are ignored; the returned error is the one from removing config.Root.
+func (runtime *Runtime) Nuke() error {
+	all := runtime.List()
+
+	var killers sync.WaitGroup
+	killers.Add(len(all))
+	for _, container := range all {
+		go func(c *Container) {
+			defer killers.Done()
+			c.Kill()
+		}(container)
+	}
+	killers.Wait()
+
+	runtime.Close()
+	return os.RemoveAll(runtime.config.Root)
+}
+
+// Graph returns the runtime's images graph.
+//
+// FIXME: this is a convenience function for integration tests
+// which need direct access to runtime.graph.
+// Once the tests switch to using engine and jobs, this method
+// can go away.
+func (runtime *Runtime) Graph() *graph.Graph {
+	return runtime.graph
+}
+
+// Repositories returns the runtime's tag store.
+func (runtime *Runtime) Repositories() *graph.TagStore {
+	return runtime.repositories
+}
+
+// Config returns the daemon configuration the runtime was created with.
+// The pointer is returned as is, not a copy.
+func (runtime *Runtime) Config() *daemonconfig.Config {
+	return runtime.config
+}
+
+// SystemConfig returns the sysinfo.SysInfo value stored on the runtime.
+func (runtime *Runtime) SystemConfig() *sysinfo.SysInfo {
+	return runtime.sysInfo
+}
+
+// SystemInitPath returns the path of the dockerinit binary used by the runtime.
+func (runtime *Runtime) SystemInitPath() string {
+	return runtime.sysInitPath
+}
+
+// GraphDriver returns the graph (storage) driver used by the runtime.
+func (runtime *Runtime) GraphDriver() graphdriver.Driver {
+	return runtime.driver
+}
+
+// ExecutionDriver returns the exec driver used by the runtime.
+func (runtime *Runtime) ExecutionDriver() execdriver.Driver {
+	return runtime.execDriver
+}
+
+// Volumes returns the runtime's volumes graph.
+func (runtime *Runtime) Volumes() *graph.Graph {
+	return runtime.volumes
+}
+
+// ContainerGraph returns the graph database that maps container names and
+// links to container IDs.
+func (runtime *Runtime) ContainerGraph() *graphdb.Database {
+	return runtime.containerGraph
+}
+
+// SetServer stores server on the runtime, replacing any previous value.
+func (runtime *Runtime) SetServer(server Server) {
+	runtime.srv = server
+}
+
+// checkLocaldns switches the runtime to DefaultDns when no DNS servers are
+// configured and utils.CheckLocalDns reports a local resolver in the host's
+// resolv.conf. A message is logged when the switch happens. The only error
+// returned is a failure to read resolv.conf.
+func (runtime *Runtime) checkLocaldns() error {
+	resolvConf, readErr := utils.GetResolvConf()
+	if readErr != nil {
+		return readErr
+	}
+
+	// Explicitly configured servers win; resolv.conf is inspected only when there are none.
+	if len(runtime.config.Dns) != 0 || !utils.CheckLocalDns(resolvConf) {
+		return nil
+	}
+
+	log.Printf("Local (127.0.0.1) DNS resolver found in resolv.conf and containers can't use it. Using default external servers : %v\n", DefaultDns)
+	runtime.config.Dns = DefaultDns
+	return nil
+}
diff --git a/runtime/runtime_aufs.go b/runtime/runtime_aufs.go
new file mode 100644
index 0000000000..5a32615df5
--- /dev/null
+++ b/runtime/runtime_aufs.go
@@ -0,0 +1,22 @@
+// +build !exclude_graphdriver_aufs
+
+package runtime
+
+import (
+ "github.com/dotcloud/docker/graph"
+ "github.com/dotcloud/docker/runtime/graphdriver"
+ "github.com/dotcloud/docker/runtime/graphdriver/aufs"
+ "github.com/dotcloud/docker/utils"
+)
+
+// migrateIfAufs runs the aufs migration for root when driver is the aufs
+// graph driver, passing graph.SetupInitLayer to it, and returns the
+// migration's error. For any other driver it does nothing and returns nil.
+// If aufs driver is not built, this func is a noop.
+func migrateIfAufs(driver graphdriver.Driver, root string) error {
+	aufsDriver, isAufs := driver.(*aufs.Driver)
+	if !isAufs {
+		return nil
+	}
+	utils.Debugf("Migrating existing containers")
+	return aufsDriver.Migrate(root, graph.SetupInitLayer)
+}
diff --git a/runtime/runtime_btrfs.go b/runtime/runtime_btrfs.go
new file mode 100644
index 0000000000..c59b103ff9
--- /dev/null
+++ b/runtime/runtime_btrfs.go
@@ -0,0 +1,7 @@
+// +build !exclude_graphdriver_btrfs
+
+package runtime
+
+import (
+ _ "github.com/dotcloud/docker/runtime/graphdriver/btrfs"
+)
diff --git a/runtime/runtime_devicemapper.go b/runtime/runtime_devicemapper.go
new file mode 100644
index 0000000000..5b418b377a
--- /dev/null
+++ b/runtime/runtime_devicemapper.go
@@ -0,0 +1,7 @@
+// +build !exclude_graphdriver_devicemapper
+
+package runtime
+
+import (
+ _ "github.com/dotcloud/docker/runtime/graphdriver/devmapper"
+)
diff --git a/runtime/runtime_no_aufs.go b/runtime/runtime_no_aufs.go
new file mode 100644
index 0000000000..05a01fe151
--- /dev/null
+++ b/runtime/runtime_no_aufs.go
@@ -0,0 +1,11 @@
+// +build exclude_graphdriver_aufs
+
+package runtime
+
+import (
+ "github.com/dotcloud/docker/runtime/graphdriver"
+)
+
+// migrateIfAufs is the variant compiled when the aufs graph driver is
+// excluded from the build (exclude_graphdriver_aufs): there is nothing to
+// migrate, so it always returns nil.
+func migrateIfAufs(driver graphdriver.Driver, root string) error {
+	return nil
+}
diff --git a/runtime/server.go b/runtime/server.go
new file mode 100644
index 0000000000..a74c4d1200
--- /dev/null
+++ b/runtime/server.go
@@ -0,0 +1,10 @@
+package runtime
+
+import (
+ "github.com/dotcloud/docker/utils"
+)
+
+// Server is the part of the server that the runtime depends on. A value is
+// attached to the runtime with SetServer.
+type Server interface {
+	// LogEvent reports an event described by action, id and from and
+	// returns the resulting JSON message.
+	LogEvent(action, id, from string) *utils.JSONMessage
+	IsRunning() bool // returns true if the server is currently in operation
+}
diff --git a/runtime/sorter.go b/runtime/sorter.go
new file mode 100644
index 0000000000..c5af772dae
--- /dev/null
+++ b/runtime/sorter.go
@@ -0,0 +1,25 @@
+package runtime
+
+import "sort"
+
+// containerSorter adapts a slice of containers and a comparison function to
+// the sort.Interface methods (Len, Swap, Less).
+type containerSorter struct {
+	containers []*Container
+	// by reports whether container i must sort before container j.
+	by func(i, j *Container) bool
+}
+
+// Len returns the number of containers being sorted.
+func (s *containerSorter) Len() int {
+	return len(s.containers)
+}
+
+// Swap exchanges the containers at indexes i and j.
+func (s *containerSorter) Swap(i, j int) {
+	s.containers[i], s.containers[j] = s.containers[j], s.containers[i]
+}
+
+// Less applies the by function to the containers at indexes i and j.
+func (s *containerSorter) Less(i, j int) bool {
+	return s.by(s.containers[i], s.containers[j])
+}
+
+// sortContainers sorts containers in place; predicate(i, j) must report
+// whether container i belongs before container j. The sort is not stable.
+func sortContainers(containers []*Container, predicate func(i, j *Container) bool) {
+	sort.Sort(&containerSorter{
+		containers: containers,
+		by:         predicate,
+	})
+}
diff --git a/runtime/state.go b/runtime/state.go
new file mode 100644
index 0000000000..316b8a40f1
--- /dev/null
+++ b/runtime/state.go
@@ -0,0 +1,84 @@
+package runtime
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/utils"
+ "sync"
+ "time"
+)
+
+// State holds the run state of a container. The embedded RWMutex is taken by
+// the State methods in this file around every read and write of the fields.
+type State struct {
+	sync.RWMutex
+	Running    bool      // set by SetRunning, cleared by SetStopped
+	Pid        int       // pid passed to SetRunning; 0 once stopped
+	ExitCode   int       // exit code passed to SetStopped; reset to 0 by SetRunning
+	StartedAt  time.Time // UTC time of the last SetRunning call
+	FinishedAt time.Time // UTC time of the last SetStopped call
+	Ghost      bool      // set with SetGhost; cleared by SetRunning
+}
+
+// String returns a human-readable description of the state:
+//
+//   - "Ghost" when the container is running and flagged as a ghost;
+//   - "Up <duration>" when it is running, measured from StartedAt;
+//   - "" when it is not running and has never finished;
+//   - "Exited (<code>) <duration> ago" otherwise, measured from FinishedAt.
+func (s *State) String() string {
+	s.RLock()
+	defer s.RUnlock()
+
+	if s.Running {
+		if s.Ghost {
+			// A constant needs no formatting call.
+			return "Ghost"
+		}
+		return fmt.Sprintf("Up %s", utils.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
+	}
+	if s.FinishedAt.IsZero() {
+		return ""
+	}
+	return fmt.Sprintf("Exited (%d) %s ago", s.ExitCode, utils.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
+}
+
+// IsRunning returns the Running flag, read under the read lock.
+func (s *State) IsRunning() bool {
+	s.RLock()
+	defer s.RUnlock()
+
+	return s.Running
+}
+
+// IsGhost returns the Ghost flag, read under the read lock.
+func (s *State) IsGhost() bool {
+	s.RLock()
+	defer s.RUnlock()
+
+	return s.Ghost
+}
+
+// GetExitCode returns the recorded exit code, read under the read lock.
+func (s *State) GetExitCode() int {
+	s.RLock()
+	defer s.RUnlock()
+
+	return s.ExitCode
+}
+
+// SetGhost sets the Ghost flag to val under the write lock.
+func (s *State) SetGhost(val bool) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.Ghost = val
+}
+
+// SetRunning marks the state as running with the given pid. It also clears
+// the Ghost flag, resets ExitCode to 0 and records the current UTC time as
+// StartedAt. FinishedAt is left untouched.
+func (s *State) SetRunning(pid int) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.Running = true
+	s.Ghost = false
+	s.ExitCode = 0
+	s.Pid = pid
+	s.StartedAt = time.Now().UTC()
+}
+
+// SetStopped marks the state as not running, resets Pid to 0, stores
+// exitCode and records the current UTC time as FinishedAt. The Ghost flag
+// and StartedAt are left untouched.
+func (s *State) SetStopped(exitCode int) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.Running = false
+	s.Pid = 0
+	s.FinishedAt = time.Now().UTC()
+	s.ExitCode = exitCode
+}
diff --git a/runtime/utils.go b/runtime/utils.go
new file mode 100644
index 0000000000..b983e67d41
--- /dev/null
+++ b/runtime/utils.go
@@ -0,0 +1,64 @@
+package runtime
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/nat"
+ "github.com/dotcloud/docker/pkg/namesgenerator"
+ "github.com/dotcloud/docker/runconfig"
+ "strings"
+)
+
+// migratePortMappings converts the legacy config.PortSpecs into the newer
+// representation: parsed bindings go to hostConfig.PortBindings and parsed
+// ports are added to config.ExposedPorts. config.PortSpecs is set to nil
+// once it has been parsed. Nothing happens when PortSpecs is nil. The only
+// error returned is a parse failure, in which case config is left unchanged.
+func migratePortMappings(config *runconfig.Config, hostConfig *runconfig.HostConfig) error {
+	if config.PortSpecs == nil {
+		return nil
+	}
+
+	ports, bindings, parseErr := nat.ParsePortSpecs(config.PortSpecs)
+	if parseErr != nil {
+		return parseErr
+	}
+	config.PortSpecs = nil
+
+	if len(bindings) > 0 {
+		// NOTE(review): hostConfig is a pointer passed by value, so a
+		// HostConfig allocated here for a nil argument is not visible to the
+		// caller and the bindings are dropped in that case.
+		if hostConfig == nil {
+			hostConfig = &runconfig.HostConfig{}
+		}
+		hostConfig.PortBindings = bindings
+	}
+
+	exposed := config.ExposedPorts
+	if exposed == nil {
+		exposed = make(nat.PortSet, len(ports))
+		config.ExposedPorts = exposed
+	}
+	for port, value := range ports {
+		exposed[port] = value
+	}
+	return nil
+}
+
+// mergeLxcConfIntoOptions appends the lxc options found in
+// hostConfig.LxcConf to driverConfig["lxc"] as "key=value" strings.
+//
+// Keys normally carry the driver name as a prefix ("lxc.XXXX"); everything
+// up to and including the first "." is trimmed so that the lxc driver can
+// add it back later if needed. A key without any "." is used unchanged
+// (it used to cause an index-out-of-range panic).
+//
+// Nothing happens when hostConfig is nil or has a nil LxcConf. driverConfig
+// must be a non-nil map.
+func mergeLxcConfIntoOptions(hostConfig *runconfig.HostConfig, driverConfig map[string][]string) {
+	if hostConfig == nil {
+		return
+	}
+
+	// merge in the lxc conf options into the generic config map
+	if lxcConf := hostConfig.LxcConf; lxcConf != nil {
+		lxc := driverConfig["lxc"]
+		for _, pair := range lxcConf {
+			key := pair.Key
+			// because lxc conf gets the driver name lxc.XXXX we need to trim it off
+			// and let the lxc driver add it back later if needed
+			if parts := strings.SplitN(key, ".", 2); len(parts) == 2 {
+				key = parts[1]
+			}
+			lxc = append(lxc, fmt.Sprintf("%s=%s", key, pair.Value))
+		}
+		driverConfig["lxc"] = lxc
+	}
+}
+
+// checker is the existence checker handed to the names generator by
+// generateRandomName; it answers from the runtime's container graph.
+type checker struct {
+	runtime *Runtime
+}
+
+// Exists reports whether "/"+name is already present in the container graph.
+func (c *checker) Exists(name string) bool {
+	return c.runtime.containerGraph.Exists("/" + name)
+}
+
+// generateRandomName asks the names generator for a random name, giving it a
+// checker backed by the runtime's container graph so that names already in
+// the graph can be detected.
+func generateRandomName(runtime *Runtime) (string, error) {
+	return namesgenerator.GenerateRandomName(&checker{runtime})
+}
diff --git a/runtime/utils_test.go b/runtime/utils_test.go
new file mode 100644
index 0000000000..bdf3543a49
--- /dev/null
+++ b/runtime/utils_test.go
@@ -0,0 +1,29 @@
+package runtime
+
+import (
+ "testing"
+
+ "github.com/dotcloud/docker/runconfig"
+ "github.com/dotcloud/docker/utils"
+)
+
+// TestMergeLxcConfig checks that a single "lxc."-prefixed option is merged
+// into the driver configuration as exactly one entry with the prefix
+// trimmed.
+func TestMergeLxcConfig(t *testing.T) {
+	var (
+		hostConfig = &runconfig.HostConfig{
+			LxcConf: []utils.KeyValuePair{
+				{Key: "lxc.cgroups.cpuset", Value: "1,2"},
+			},
+		}
+		driverConfig = make(map[string][]string)
+	)
+
+	mergeLxcConfIntoOptions(hostConfig, driverConfig)
+	// Require exactly one entry: with "> 1" an empty result slipped through
+	// and the index below panicked instead of failing the test cleanly.
+	if l := len(driverConfig["lxc"]); l != 1 {
+		t.Fatalf("expected lxc options len of 1 got %d", l)
+	}
+
+	cpuset := driverConfig["lxc"][0]
+	if expected := "cgroups.cpuset=1,2"; cpuset != expected {
+		t.Fatalf("expected %s got %s", expected, cpuset)
+	}
+}
diff --git a/runtime/volumes.go b/runtime/volumes.go
new file mode 100644
index 0000000000..004f1bb024
--- /dev/null
+++ b/runtime/volumes.go
@@ -0,0 +1,287 @@
+package runtime
+
+import (
+ "fmt"
+ "github.com/dotcloud/docker/archive"
+ "github.com/dotcloud/docker/runtime/execdriver"
+ "github.com/dotcloud/docker/utils"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strings"
+ "syscall"
+)
+
+// BindMap is one parsed entry of a container's bind specifications, as
+// produced by getBindMap from a "src:dst[:mode]" string.
+type BindMap struct {
+	// SrcPath and DstPath are the first and second ":"-separated parts of the
+	// bind specification.
+	SrcPath, DstPath string
+	// Mode is the third part of the specification, or "rw" when it is omitted.
+	Mode string
+}
+
+// prepareVolumesForContainer gets the container's volume maps ready.
+//
+// When the container has no volumes recorded yet, the Volumes and VolumesRW
+// maps are (re)initialised and the volumes of the containers named in
+// volumes-from are applied. In every case createVolumes is then run, and its
+// result is returned.
+func prepareVolumesForContainer(container *Container) error {
+	// len of a nil map is 0, so this single test also covers Volumes == nil
+	if len(container.Volumes) == 0 {
+		container.Volumes = make(map[string]string)
+		container.VolumesRW = make(map[string]bool)
+		if err := applyVolumesFrom(container); err != nil {
+			return err
+		}
+	}
+
+	return createVolumes(container)
+}
+
+// setupMountsForContainer builds the mount list for the container and stores
+// it in container.command.Mounts. It currently always returns nil.
+//
+// The list starts with the init binary (/.dockerinit), the environment file
+// given by envPath (/.dockerenv) and resolv.conf. /etc/hostname and /etc/hosts
+// are added only when both HostnamePath and HostsPath are set. One mount per
+// entry of container.Volumes follows.
+func setupMountsForContainer(container *Container, envPath string) error {
+	mounts := []execdriver.Mount{
+		{container.runtime.sysInitPath, "/.dockerinit", false, true},
+		{envPath, "/.dockerenv", false, true},
+		{container.ResolvConfPath, "/etc/resolv.conf", false, true},
+	}
+
+	hasHostFiles := container.HostnamePath != "" && container.HostsPath != ""
+	if hasHostFiles {
+		mounts = append(mounts,
+			execdriver.Mount{container.HostnamePath, "/etc/hostname", false, true},
+			execdriver.Mount{container.HostsPath, "/etc/hosts", false, true},
+		)
+	}
+
+	// User specified volumes. These are deliberately not private (last field
+	// false): propagation of (un)mounts from host volumes may be wanted, e.g.
+	// with -v /usr:/usr a later host mount of /usr/share should show up in the
+	// container as well. The third field carries the volume's RW flag.
+	for dst, src := range container.Volumes {
+		mounts = append(mounts, execdriver.Mount{src, dst, container.VolumesRW[dst], false})
+	}
+
+	container.command.Mounts = mounts
+
+	return nil
+}
+
+// applyVolumesFrom imports into container the volumes of every container
+// listed in container.hostConfig.VolumesFrom, in order, stopping at the first
+// error.
+//
+// Each entry has the form "name" or "name:mode" where mode is "ro" or "rw".
+func applyVolumesFrom(container *Container) error {
+	for _, containerSpec := range container.hostConfig.VolumesFrom {
+		if err := applyVolumesFromSpec(container, containerSpec); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// applyVolumesFromSpec handles a single volumes-from entry.
+//
+// The referenced container is mounted for the duration of this call only, so
+// when several entries are processed each source container is unmounted as
+// soon as its volumes have been imported rather than when the whole list is
+// done.
+//
+// Volume paths already present in container.Volumes are left alone. For every
+// other volume of the source container, a mountpoint of the same kind (file or
+// directory) is created under container.basefs and the volume is recorded in
+// container.Volumes. Its RW flag is the source's flag AND-ed with the mode
+// requested in the spec; when the source has no RW entry for the path, no RW
+// entry is written (the map then reads as false).
+func applyVolumesFromSpec(container *Container, containerSpec string) error {
+	var (
+		mountRW   = true
+		specParts = strings.SplitN(containerSpec, ":", 2)
+	)
+
+	// SplitN always yields at least one element, so only the optional mode
+	// part needs validating.
+	if len(specParts) == 2 {
+		switch specParts[1] {
+		case "ro":
+			mountRW = false
+		case "rw": // mountRW is already true
+		default:
+			return fmt.Errorf("Malformed volumes-from specification: %s", containerSpec)
+		}
+	}
+
+	c := container.runtime.Get(specParts[0])
+	if c == nil {
+		return fmt.Errorf("Container %s not found. Impossible to mount its volumes", specParts[0])
+	}
+
+	if err := c.Mount(); err != nil {
+		// keep the underlying cause so the failure can be diagnosed
+		return fmt.Errorf("Container %s failed to mount. Impossible to mount its volumes: %s", specParts[0], err)
+	}
+	// best effort: the result of the unmount is intentionally not checked
+	defer c.Unmount()
+
+	for volPath, id := range c.Volumes {
+		if _, exists := container.Volumes[volPath]; exists {
+			continue
+		}
+		stat, err := os.Stat(filepath.Join(c.basefs, volPath))
+		if err != nil {
+			return err
+		}
+		if err := createIfNotExists(filepath.Join(container.basefs, volPath), stat.IsDir()); err != nil {
+			return err
+		}
+		container.Volumes[volPath] = id
+		if isRW, exists := c.VolumesRW[volPath]; exists {
+			container.VolumesRW[volPath] = isRW && mountRW
+		}
+	}
+	return nil
+}
+
+// getBindMap parses container.hostConfig.Binds into a map keyed by the
+// cleaned (filepath.Clean) destination path.
+//
+// Each bind must be "src:dst" (mode defaults to "rw") or "src:dst:mode"; any
+// other number of ":"-separated parts is an error. A destination that
+// normalises to "/" or "." is rejected. The check is done on the cleaned path
+// so that spellings such as "//", "/." or an empty destination cannot bypass
+// it. BindMap.DstPath keeps the destination exactly as written in the
+// specification. When two binds clean to the same destination the later one
+// wins.
+func getBindMap(container *Container) (map[string]BindMap, error) {
+	// Create the requested bind mounts
+	binds := make(map[string]BindMap)
+
+	for _, bind := range container.hostConfig.Binds {
+		// FIXME: factorize bind parsing in parseBind
+		var src, dst, mode string
+
+		arr := strings.Split(bind, ":")
+		switch len(arr) {
+		case 2:
+			src, dst, mode = arr[0], arr[1], "rw"
+		case 3:
+			src, dst, mode = arr[0], arr[1], arr[2]
+		default:
+			return nil, fmt.Errorf("Invalid bind specification: %s", bind)
+		}
+
+		// Bail if trying to mount to an illegal destination. Compare the
+		// normalised form, which is also the key used below.
+		cleanDst := filepath.Clean(dst)
+		switch cleanDst {
+		case "/", ".":
+			return nil, fmt.Errorf("Illegal bind destination: %s", dst)
+		}
+
+		binds[cleanDst] = BindMap{
+			SrcPath: src,
+			DstPath: dst,
+			Mode:    mode,
+		}
+	}
+	return binds, nil
+}
+
+// createVolumes sets up every volume listed in container.Config.Volumes that
+// is not already recorded in container.Volumes.
+//
+// For each such (cleaned) volume path:
+//   - if getBindMap has a bind for it, the bind's source is used; it must be an
+//     absolute path that can be stat'ed, and it is writable only when the bind
+//     mode equals "rw" (case-insensitively);
+//   - otherwise a new volume is created through container.runtime.volumes and
+//     its path obtained from the volumes driver; it is always writable.
+//
+// The source path (with symlinks resolved) and its RW flag are stored in
+// container.Volumes / container.VolumesRW, and the mountpoint is created under
+// container.basefs. For writable non-bind volumes, when the mountpoint already
+// holds entries and the new volume is empty, the mountpoint's content is
+// copied into the volume and the volume's owner is aligned with the
+// mountpoint's.
+//
+// The first error encountered is returned; entries recorded before it stay in
+// the container's maps.
+func createVolumes(container *Container) error {
+ binds, err := getBindMap(container)
+ if err != nil {
+ return err
+ }
+
+ volumesDriver := container.runtime.volumes.Driver()
+ // Create the requested volumes if they don't exist
+ for volPath := range container.Config.Volumes {
+ volPath = filepath.Clean(volPath)
+ // Assume a directory unless a bind source says otherwise below
+ volIsDir := true
+ // Skip existing volumes
+ if _, exists := container.Volumes[volPath]; exists {
+ continue
+ }
+ var srcPath string
+ var isBindMount bool
+ srcRW := false
+ // If an external bind is defined for this volume, use that as a source
+ if bindMap, exists := binds[volPath]; exists {
+ isBindMount = true
+ srcPath = bindMap.SrcPath
+ if !filepath.IsAbs(srcPath) {
+ return fmt.Errorf("%s must be an absolute path", srcPath)
+ }
+ // Any mode other than "rw" (in any letter case) leaves the bind read-only
+ if strings.ToLower(bindMap.Mode) == "rw" {
+ srcRW = true
+ }
+ // The mountpoint created later mirrors the source: file or directory
+ if stat, err := os.Stat(bindMap.SrcPath); err != nil {
+ return err
+ } else {
+ volIsDir = stat.IsDir()
+ }
+ // Otherwise create a directory in $ROOT/volumes/ and use that
+ } else {
+
+ // Do not pass a container as the parameter for the volume creation.
+ // The graph driver using the container's information ( Image ) to
+ // create the parent.
+ c, err := container.runtime.volumes.Create(nil, "", "", "", "", nil, nil)
+ if err != nil {
+ return err
+ }
+ srcPath, err = volumesDriver.Get(c.ID)
+ if err != nil {
+ return fmt.Errorf("Driver %s failed to get volume rootfs %s: %s", volumesDriver, c.ID, err)
+ }
+ srcRW = true // RW by default
+ }
+
+ // Record the source with all symlinks resolved
+ if p, err := filepath.EvalSymlinks(srcPath); err != nil {
+ return err
+ } else {
+ srcPath = p
+ }
+
+ container.Volumes[volPath] = srcPath
+ container.VolumesRW[volPath] = srcRW
+
+ // Create the mountpoint
+ // (from here on volPath is the path below container.basefs, no longer the map key)
+ volPath = filepath.Join(container.basefs, volPath)
+ rootVolPath, err := utils.FollowSymlinkInScope(volPath, container.basefs)
+ if err != nil {
+ return err
+ }
+ if err := createIfNotExists(rootVolPath, volIsDir); err != nil {
+ return err
+ }
+
+ // Do not copy or change permissions if we are mounting from the host
+ if srcRW && !isBindMount {
+ volList, err := ioutil.ReadDir(rootVolPath)
+ if err != nil {
+ return err
+ }
+ // Only seed the volume when the mountpoint has content to offer
+ if len(volList) > 0 {
+ srcList, err := ioutil.ReadDir(srcPath)
+ if err != nil {
+ return err
+ }
+ if len(srcList) == 0 {
+ // If the source volume is empty copy files from the root into the volume
+ if err := archive.CopyWithTar(rootVolPath, srcPath); err != nil {
+ return err
+ }
+
+ var stat syscall.Stat_t
+ if err := syscall.Stat(rootVolPath, &stat); err != nil {
+ return err
+ }
+ var srcStat syscall.Stat_t
+ if err := syscall.Stat(srcPath, &srcStat); err != nil {
+ return err
+ }
+ // Change the source volume's ownership if it differs from the root
+ // files that were just copied
+ if stat.Uid != srcStat.Uid || stat.Gid != srcStat.Gid {
+ if err := os.Chown(srcPath, int(stat.Uid), int(stat.Gid)); err != nil {
+ return err
+ }
+ }
+ }
+ }
+ }
+ }
+ return nil
+}
+
+// createIfNotExists makes sure something exists at path.
+//
+// If path can be stat'ed, nothing is done and nil is returned, whatever kind
+// of file is there. If path does not exist, it is created: as a directory
+// (with any missing parents) when isDir is true, otherwise as an empty file
+// after its parent directories have been created. Directories and the file
+// are requested with permission bits 0755.
+//
+// Any stat failure other than "does not exist" (for instance a parent that is
+// a regular file, or a permission problem) is returned to the caller instead
+// of being reported as success, since in that case the mountpoint is neither
+// known to exist nor created.
+func createIfNotExists(path string, isDir bool) error {
+	_, err := os.Stat(path)
+	if err == nil {
+		// already present: leave it untouched
+		return nil
+	}
+	if !os.IsNotExist(err) {
+		return err
+	}
+
+	if isDir {
+		return os.MkdirAll(path, 0755)
+	}
+
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	f, err := os.OpenFile(path, os.O_CREATE, 0755)
+	if err != nil {
+		return err
+	}
+	// nothing is written; release the descriptor right away
+	return f.Close()
+}