summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordanielsdeleo <dan@opscode.com>2013-12-09 10:17:26 -0800
committerdanielsdeleo <dan@opscode.com>2013-12-09 10:17:26 -0800
commitbf00ab2b68f61440766c3223efa184c7f5e1a074 (patch)
tree8b26b51554e3a5714b47c76d10ccd39cc4b544a0
parent834cfbdb98975845a44e9a11a4b8477386cf3ec8 (diff)
parenta4c67900863bf5baaa6766a325cc6ecf9eaecf7b (diff)
downloadmixlib-shellout-bf00ab2b68f61440766c3223efa184c7f5e1a074.tar.gz
Merge branch 'track-child-by-pgid'
* Closes https://tickets.opscode.com/browse/MIXLIB-24 * Expand timeout process cleanup to include grandchildren.
-rw-r--r--.rspec2
-rw-r--r--lib/mixlib/shellout/unix.rb47
-rw-r--r--spec/mixlib/shellout_spec.rb66
3 files changed, 103 insertions, 12 deletions
diff --git a/.rspec b/.rspec
index 7060880..62c58f0 100644
--- a/.rspec
+++ b/.rspec
@@ -1 +1 @@
--cbfs
+-cfs
diff --git a/lib/mixlib/shellout/unix.rb b/lib/mixlib/shellout/unix.rb
index 15ead60..a2518cf 100644
--- a/lib/mixlib/shellout/unix.rb
+++ b/lib/mixlib/shellout/unix.rb
@@ -29,13 +29,20 @@ module Mixlib
# to +stdout+ and +stderr+, and saving its exit status object to +status+
# === Returns
# returns +self+; +stdout+, +stderr+, +status+, and +exitstatus+ will be
- # populated with results of the command
+ # populated with results of the command.
# === Raises
# * Errno::EACCES when you are not privileged to execute the command
# * Errno::ENOENT when the command is not available on the system (or not
# in the current $PATH)
# * Chef::Exceptions::CommandTimeout when the command does not complete
- # within +timeout+ seconds (default: 600s)
+ # within +timeout+ seconds (default: 600s). When this happens, ShellOut
+ # will send a TERM and then KILL to the entire process group to ensure
+ # that any grandchild processes are terminated. If the invocation of
+ # the child process spawned multiple child processes (which commonly
+ # happens if the command is passed as a single string to be interpreted
+ # by bin/sh, and bin/sh is not bash), the exit status object may not
+ # contain the correct exit code of the process (of course there is no
+ # exit code if the command is killed by SIGKILL, also).
def run_command
@child_pid = fork_subprocess
@reaped = false
@@ -51,6 +58,7 @@ module Mixlib
# CHEF-3390: Marshall.load on Ruby < 1.8.7p369 also has a GC bug related
# to Marshall.load, so try disabling GC first.
propagate_pre_exec_failure
+ @child_pgid = -Process.getpgid(@child_pid)
@result = nil
@execution_time = 0
@@ -122,6 +130,12 @@ module Mixlib
Dir.chdir(cwd) if cwd
end
+ # Process group id of the child. Returned as a negative value so you can
+ # put it directly in arguments to kill, wait, etc.
+ def child_pgid
+ @child_pgid
+ end
+
def initialize_ipc
@stdin_pipe, @stdout_pipe, @stderr_pipe, @process_status_pipe = IO.pipe, IO.pipe, IO.pipe, IO.pipe
@process_status_pipe.last.fcntl(Fcntl::F_SETFD, Fcntl::FD_CLOEXEC)
@@ -263,6 +277,14 @@ module Mixlib
initialize_ipc
fork do
+ # Child processes may themselves fork off children. A common case
+ # is when the command is given as a single string (instead of
+ # command name plus Array of arguments) and /bin/sh does not
+ # support the "ONESHOT" optimization (where sh -c does exec without
+ # forking). To support cleaning up all the children, we need to
+ # ensure they're in a unique process group.
+ Process.setsid
+
configure_subprocess_file_descriptors
clean_parent_file_descriptors
@@ -302,14 +324,13 @@ module Mixlib
def reap_errant_child
return if attempt_reap
- @terminate_reason = "Command execeded allowed execution time, killed by TERM signal."
+ @terminate_reason = "Command execeded allowed execution time, process terminated"
logger.error("Command execeded allowed execution time, sending TERM") if logger
- Process.kill(:TERM, @child_pid)
+ Process.kill(:TERM, child_pgid)
sleep 3
- return if attempt_reap
- @terminate_reason = "Command execeded allowed execution time, did not respond to TERM. Killed by KILL signal."
- logger.error("Command did not exit from TERM, sending KILL") if logger
- Process.kill(:KILL, @child_pid)
+ attempt_reap
+ logger.error("Command execeded allowed execution time, sending KILL") if logger
+ Process.kill(:KILL, child_pgid)
reap
# Should not hit this but it's possible if something is calling waitall
@@ -323,12 +344,22 @@ module Mixlib
@child_pid && !@reaped
end
+ # Unconditionally reap the child process. This is used in scenarios where
+ # we can be confident the child will exit quickly, and has not spawned
+ # and grandchild processes.
def reap
results = Process.waitpid2(@child_pid)
@reaped = true
@status = results.last
+ rescue Errno::ECHILD
+ # When cleaning up timed-out processes, we might send SIGKILL to the
+ # whole process group after we've cleaned up the direct child. In that
+ # case the grandchildren will have been adopted by init so we can't
+ # reap them even if we wanted to (we don't).
+ nil
end
+ # Try to reap the child process but don't block if it isn't dead yet.
def attempt_reap
if results = Process.waitpid2(@child_pid, Process::WNOHANG)
@reaped = true
diff --git a/spec/mixlib/shellout_spec.rb b/spec/mixlib/shellout_spec.rb
index 71ecfef..0a6f9b2 100644
--- a/spec/mixlib/shellout_spec.rb
+++ b/spec/mixlib/shellout_spec.rb
@@ -826,8 +826,15 @@ describe Mixlib::ShellOut do
end
context 'with subprocess that takes longer than timeout' do
+ def ruby_wo_shell(code)
+ parts = %w[ruby]
+ parts << "--disable-gems" if ruby_19?
+ parts << "-e"
+ parts << code
+ end
+
let(:cmd) do
- ruby_eval.call(<<-CODE)
+ ruby_wo_shell(<<-CODE)
STDOUT.sync = true
trap(:TERM) { puts "got term"; exit!(123) }
sleep 10
@@ -849,7 +856,7 @@ describe Mixlib::ShellOut do
context "and the child is unresponsive" do
let(:cmd) do
- ruby_eval.call(<<-CODE)
+ ruby_wo_shell(<<-CODE)
STDOUT.sync = true
trap(:TERM) { puts "nanana cant hear you" }
sleep 10
@@ -877,11 +884,64 @@ describe Mixlib::ShellOut do
shell_cmd.status.termsig.should == 9
log_output.string.should include("Command execeded allowed execution time, sending TERM")
- log_output.string.should include("Command did not exit from TERM, sending KILL")
+ log_output.string.should include("Command execeded allowed execution time, sending KILL")
end
end
end
+
+ context "and the child process forks grandchildren" do
+ let(:cmd) do
+ ruby_wo_shell(<<-CODE)
+ STDOUT.sync = true
+ trap(:TERM) { print "got term in child\n"; exit!(123) }
+ fork do
+ trap(:TERM) { print "got term in grandchild\n"; exit!(142) }
+ sleep 10
+ end
+ sleep 10
+ CODE
+ end
+
+ it "should TERM the wayward child and grandchild" do
+ # note: let blocks don't correctly memoize if an exception is raised,
+ # so can't use executed_cmd
+ lambda { shell_cmd.run_command}.should raise_error(Mixlib::ShellOut::CommandTimeout)
+ shell_cmd.stdout.should include("got term in child")
+ shell_cmd.stdout.should include("got term in grandchild")
+ end
+
+ end
+ context "and the child process forks grandchildren that don't respond to TERM" do
+ let(:cmd) do
+ ruby_wo_shell(<<-CODE)
+ STDOUT.sync = true
+
+ trap(:TERM) { print "got term in child\n"; exit!(123) }
+ fork do
+ trap(:TERM) { print "got term in grandchild\n" }
+ sleep 10
+ end
+ sleep 10
+ CODE
+ end
+
+ it "should TERM the wayward child and grandchild, then KILL whoever is left" do
+ # note: let blocks don't correctly memoize if an exception is raised,
+ # so can't use executed_cmd
+ lambda { shell_cmd.run_command}.should raise_error(Mixlib::ShellOut::CommandTimeout)
+
+ shell_cmd.stdout.should include("got term in child")
+ shell_cmd.stdout.should include("got term in grandchild")
+
+ # A little janky. We get the process group id out of the command
+ # object, then try to kill a process in it to make sure none
+ # exists. Trusting the system under test like this isn't great but
+ # it's difficult to test otherwise.
+ lambda { Process.kill(:INT, shell_cmd.send(:child_pgid)) }.should raise_error(Errno::ESRCH)
+ end
+
+ end
end
context 'with subprocess that exceeds buffersize' do