diff options
Diffstat (limited to 'src/core/main.c')
-rw-r--r-- | src/core/main.c | 464 |
1 files changed, 308 insertions, 156 deletions
diff --git a/src/core/main.c b/src/core/main.c index 44dd8348be..839dc062ff 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -28,6 +28,7 @@ #include "bus-error.h" #include "bus-util.h" #include "capability-util.h" +#include "cgroup-util.h" #include "clock-util.h" #include "conf-parser.h" #include "cpu-set-util.h" @@ -57,6 +58,7 @@ #include "pager.h" #include "parse-util.h" #include "path-util.h" +#include "pretty-print.h" #include "proc-cmdline.h" #include "process-util.h" #include "raw-clone.h" @@ -73,6 +75,7 @@ #include "stdio-util.h" #include "strv.h" #include "switch-root.h" +#include "sysctl-util.h" #include "terminal-util.h" #include "umask-util.h" #include "user-util.h" @@ -95,11 +98,10 @@ static int arg_crash_chvt = -1; static bool arg_crash_shell = false; static bool arg_crash_reboot = false; static char *arg_confirm_spawn = NULL; -static ShowStatus arg_show_status = _SHOW_STATUS_UNSET; +static ShowStatus arg_show_status = _SHOW_STATUS_INVALID; static bool arg_switched_root = false; -static bool arg_no_pager = false; +static PagerFlags arg_pager_flags = 0; static bool arg_service_watchdogs = true; -static char ***arg_join_controllers = NULL; static ExecOutput arg_default_std_output = EXEC_OUTPUT_JOURNAL; static ExecOutput arg_default_std_error = EXEC_OUTPUT_INHERIT; static usec_t arg_default_restart_usec = DEFAULT_RESTART_USEC; @@ -109,6 +111,7 @@ static usec_t arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL; static unsigned arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST; static usec_t arg_runtime_watchdog = 0; static usec_t arg_shutdown_watchdog = 10 * USEC_PER_MINUTE; +static char *arg_early_core_pattern = NULL; static char *arg_watchdog_device = NULL; static char **arg_default_environment = NULL; static struct rlimit *arg_default_rlimit[_RLIMIT_MAX] = {}; @@ -118,7 +121,7 @@ static nsec_t arg_timer_slack_nsec = NSEC_INFINITY; static usec_t arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE; static Set* arg_syscall_archs = NULL; static FILE* arg_serialization = NULL; -static bool arg_default_cpu_accounting = false; +static int arg_default_cpu_accounting = -1; static bool arg_default_io_accounting = false; static bool arg_default_ip_accounting = false; static bool arg_default_blockio_accounting = false; @@ -128,7 +131,14 @@ static uint64_t arg_default_tasks_max = UINT64_MAX; static sd_id128_t arg_machine_id = {}; static EmergencyAction arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE; -_noreturn_ static void freeze_or_reboot(void) { +_noreturn_ static void freeze_or_exit_or_reboot(void) { + + /* If we are running in a contianer, let's prefer exiting, after all we can propagate an exit code to the + * container manager, and thus inform it that something went wrong. */ + if (detect_container() > 0) { + log_emergency("Exiting PID 1..."); + exit(EXIT_EXCEPTION); + } if (arg_crash_reboot) { log_notice("Rebooting in 10s..."); @@ -183,7 +193,7 @@ _noreturn_ static void crash(int sig) { (void) kill(pid, sig); /* raise() would kill the parent */ assert_not_reached("We shouldn't be here..."); - _exit(EXIT_FAILURE); + _exit(EXIT_EXCEPTION); } else { siginfo_t status; int r; @@ -226,17 +236,18 @@ _noreturn_ static void crash(int sig) { else if (pid == 0) { (void) setsid(); (void) make_console_stdio(); + (void) rlimit_nofile_safe(); (void) execle("/bin/sh", "/bin/sh", NULL, environ); log_emergency_errno(errno, "execle() failed: %m"); - _exit(EXIT_FAILURE); + _exit(EXIT_EXCEPTION); } else { log_info("Spawned crash shell as PID "PID_FMT".", pid); (void) wait_for_terminate(pid, NULL); } } - freeze_or_reboot(); + freeze_or_exit_or_reboot(); } static void install_crash_handler(void) { @@ -347,22 +358,35 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat r = value ? parse_boolean(value) : true; if (r < 0) - log_warning("Failed to parse dump core switch %s. Ignoring.", value); + log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value); else arg_dump_core = r; + } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (path_is_absolute(value)) + (void) parse_path_argument_and_warn(value, false, &arg_early_core_pattern); + else + log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value); + } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) { if (!value) arg_crash_chvt = 0; /* turn on */ - else if (parse_crash_chvt(value) < 0) - log_warning("Failed to parse crash chvt switch %s. Ignoring.", value); + else { + r = parse_crash_chvt(value); + if (r < 0) + log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value); + } } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) { r = value ? parse_boolean(value) : true; if (r < 0) - log_warning("Failed to parse crash shell switch %s. Ignoring.", value); + log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value); else arg_crash_shell = r; @@ -370,7 +394,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat r = value ? parse_boolean(value) : true; if (r < 0) - log_warning("Failed to parse crash reboot switch %s. Ignoring.", value); + log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value); else arg_crash_reboot = r; @@ -379,17 +403,15 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat r = parse_confirm_spawn(value, &s); if (r < 0) - log_warning_errno(r, "Failed to parse confirm_spawn switch %s. Ignoring.", value); - else { - free(arg_confirm_spawn); - arg_confirm_spawn = s; - } + log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value); + else + free_and_replace(arg_confirm_spawn, s); } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) { r = value ? parse_boolean(value) : true; if (r < 0) - log_warning("Failed to parse service watchdog switch %s. Ignoring.", value); + log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value); else arg_service_watchdogs = r; @@ -398,7 +420,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat if (value) { r = parse_show_status(value, &arg_show_status); if (r < 0) - log_warning("Failed to parse show status switch %s. Ignoring.", value); + log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value); } else arg_show_status = SHOW_STATUS_YES; @@ -409,7 +431,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat r = exec_output_from_string(value); if (r < 0) - log_warning("Failed to parse default standard output switch %s. Ignoring.", value); + log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value); else arg_default_std_output = r; @@ -420,7 +442,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat r = exec_output_from_string(value); if (r < 0) - log_warning("Failed to parse default standard error switch %s. Ignoring.", value); + log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value); else arg_default_std_error = r; @@ -447,7 +469,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat r = set_machine_id(value); if (r < 0) - log_warning("MachineID '%s' is not valid. Ignoring.", value); + log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value); } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) { @@ -456,7 +478,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat r = parse_sec(value, &arg_default_timeout_start_usec); if (r < 0) - log_warning_errno(r, "Failed to parse default start timeout: %s, ignoring.", value); + log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value); if (arg_default_timeout_start_usec <= 0) arg_default_timeout_start_usec = USEC_INFINITY; @@ -466,11 +488,11 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat if (proc_cmdline_value_missing(key, value)) return 0; - parse_path_argument_and_warn(value, false, &arg_watchdog_device); + (void) parse_path_argument_and_warn(value, false, &arg_watchdog_device); } else if (streq(key, "quiet") && !value) { - if (arg_show_status == _SHOW_STATUS_UNSET) + if (arg_show_status == _SHOW_STATUS_INVALID) arg_show_status = SHOW_STATUS_AUTO; } else if (streq(key, "debug") && !value) { @@ -604,8 +626,8 @@ static int config_parse_output_restricted( return 0; } - if (IN_SET(t, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE)) { - log_syntax(unit, LOG_ERR, filename, line, 0, "Standard output types socket, fd:, file: are not supported as defaults, ignoring: %s", rvalue); + if (IN_SET(t, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Standard output types socket, fd:, file:, append: are not supported as defaults, ignoring: %s", rvalue); return 0; } @@ -654,7 +676,7 @@ static int parse_config_file(void) { { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot }, { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status }, { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, NULL }, - { "Manager", "JoinControllers", config_parse_join_controllers, 0, &arg_join_controllers }, + { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, { "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog }, { "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_shutdown_watchdog }, { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device }, @@ -690,7 +712,7 @@ static int parse_config_file(void) { { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_default_rlimit }, { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_default_rlimit }, { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_default_rlimit }, - { "Manager", "DefaultCPUAccounting", config_parse_bool, 0, &arg_default_cpu_accounting }, + { "Manager", "DefaultCPUAccounting", config_parse_tristate, 0, &arg_default_cpu_accounting }, { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting }, { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting }, { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting }, @@ -739,7 +761,14 @@ static void set_manager_defaults(Manager *m) { m->default_restart_usec = arg_default_restart_usec; m->default_start_limit_interval = arg_default_start_limit_interval; m->default_start_limit_burst = arg_default_start_limit_burst; - m->default_cpu_accounting = arg_default_cpu_accounting; + + /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU + * controller to be enabled, so the default is to enable it unless we got told otherwise. */ + if (arg_default_cpu_accounting >= 0) + m->default_cpu_accounting = arg_default_cpu_accounting; + else + m->default_cpu_accounting = cpu_accounting_is_cheap(); + m->default_io_accounting = arg_default_io_accounting; m->default_ip_accounting = arg_default_ip_accounting; m->default_blockio_accounting = arg_default_blockio_accounting; @@ -747,8 +776,10 @@ static void set_manager_defaults(Manager *m) { m->default_tasks_accounting = arg_default_tasks_accounting; m->default_tasks_max = arg_default_tasks_max; - manager_set_default_rlimits(m, arg_default_rlimit); - manager_environment_add(m, NULL, arg_default_environment); + (void) manager_set_default_rlimits(m, arg_default_rlimit); + + (void) manager_default_environment(m); + (void) manager_transient_environment_add(m, arg_default_environment); } static void set_manager_settings(Manager *m) { @@ -838,19 +869,15 @@ static int parse_argv(int argc, char *argv[]) { case ARG_LOG_LEVEL: r = log_set_max_level_from_string(optarg); - if (r < 0) { - log_error("Failed to parse log level %s.", optarg); - return r; - } + if (r < 0) + return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg); break; case ARG_LOG_TARGET: r = log_set_target_from_string(optarg); - if (r < 0) { - log_error("Failed to parse log target %s.", optarg); - return r; - } + if (r < 0) + return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg); break; @@ -858,10 +885,9 @@ static int parse_argv(int argc, char *argv[]) { if (optarg) { r = log_show_color_from_string(optarg); - if (r < 0) { - log_error("Failed to parse log color setting %s.", optarg); - return r; - } + if (r < 0) + return log_error_errno(r, "Failed to parse log color setting \"%s\": %m", + optarg); } else log_show_color(true); @@ -870,10 +896,9 @@ static int parse_argv(int argc, char *argv[]) { case ARG_LOG_LOCATION: if (optarg) { r = log_show_location_from_string(optarg); - if (r < 0) { - log_error("Failed to parse log location setting %s.", optarg); - return r; - } + if (r < 0) + return log_error_errno(r, "Failed to parse log location setting \"%s\": %m", + optarg); } else log_show_location(true); @@ -881,26 +906,24 @@ static int parse_argv(int argc, char *argv[]) { case ARG_DEFAULT_STD_OUTPUT: r = exec_output_from_string(optarg); - if (r < 0) { - log_error("Failed to parse default standard output setting %s.", optarg); - return r; - } else - arg_default_std_output = r; + if (r < 0) + return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m", + optarg); + arg_default_std_output = r; break; case ARG_DEFAULT_STD_ERROR: r = exec_output_from_string(optarg); - if (r < 0) { - log_error("Failed to parse default standard error output setting %s.", optarg); - return r; - } else - arg_default_std_error = r; + if (r < 0) + return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m", + optarg); + arg_default_std_error = r; break; case ARG_UNIT: r = free_and_strdup(&arg_default_unit, optarg); if (r < 0) - return log_error_errno(r, "Failed to set default unit %s: %m", optarg); + return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg); break; @@ -917,7 +940,7 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_NO_PAGER: - arg_no_pager = true; + arg_pager_flags |= PAGER_DISABLE; break; case ARG_VERSION: @@ -938,7 +961,8 @@ static int parse_argv(int argc, char *argv[]) { else { r = parse_boolean(optarg); if (r < 0) - return log_error_errno(r, "Failed to parse dump core boolean: %s", optarg); + return log_error_errno(r, "Failed to parse dump core boolean: \"%s\": %m", + optarg); arg_dump_core = r; } break; @@ -946,7 +970,8 @@ static int parse_argv(int argc, char *argv[]) { case ARG_CRASH_CHVT: r = parse_crash_chvt(optarg); if (r < 0) - return log_error_errno(r, "Failed to parse crash virtual terminal index: %s", optarg); + return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m", + optarg); break; case ARG_CRASH_SHELL: @@ -955,7 +980,8 @@ static int parse_argv(int argc, char *argv[]) { else { r = parse_boolean(optarg); if (r < 0) - return log_error_errno(r, "Failed to parse crash shell boolean: %s", optarg); + return log_error_errno(r, "Failed to parse crash shell boolean: \"%s\": %m", + optarg); arg_crash_shell = r; } break; @@ -966,7 +992,8 @@ static int parse_argv(int argc, char *argv[]) { else { r = parse_boolean(optarg); if (r < 0) - return log_error_errno(r, "Failed to parse crash shell boolean: %s", optarg); + return log_error_errno(r, "Failed to parse crash shell boolean: \"%s\": %m", + optarg); arg_crash_reboot = r; } break; @@ -976,23 +1003,24 @@ static int parse_argv(int argc, char *argv[]) { r = parse_confirm_spawn(optarg, &arg_confirm_spawn); if (r < 0) - return log_error_errno(r, "Failed to parse confirm spawn option: %m"); + return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m", + optarg); break; case ARG_SERVICE_WATCHDOGS: r = parse_boolean(optarg); if (r < 0) - return log_error_errno(r, "Failed to parse service watchdogs boolean: %s", optarg); + return log_error_errno(r, "Failed to parse service watchdogs boolean: \"%s\": %m", + optarg); arg_service_watchdogs = r; break; case ARG_SHOW_STATUS: if (optarg) { r = parse_show_status(optarg, &arg_show_status); - if (r < 0) { - log_error("Failed to parse show status boolean %s.", optarg); - return r; - } + if (r < 0) + return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m", + optarg); } else arg_show_status = SHOW_STATUS_YES; break; @@ -1002,16 +1030,18 @@ static int parse_argv(int argc, char *argv[]) { FILE *f; r = safe_atoi(optarg, &fd); - if (r < 0 || fd < 0) { - log_error("Failed to parse deserialize option %s.", optarg); - return -EINVAL; - } + if (r < 0) + log_error_errno(r, "Failed to parse deserialize option \"%s\": %m", optarg); + if (fd < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid deserialize fd: %d", + fd); (void) fd_cloexec(fd, true); f = fdopen(fd, "r"); if (!f) - return log_error_errno(errno, "Failed to open serialization fd: %m"); + return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd); safe_fclose(arg_serialization); arg_serialization = f; @@ -1026,7 +1056,7 @@ static int parse_argv(int argc, char *argv[]) { case ARG_MACHINE_ID: r = set_machine_id(optarg); if (r < 0) - return log_error_errno(r, "MachineID '%s' is not valid.", optarg); + return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg); break; case 'h': @@ -1059,14 +1089,20 @@ static int parse_argv(int argc, char *argv[]) { /* Hmm, when we aren't run as init system * let's complain about excess arguments */ - log_error("Excess arguments."); - return -EINVAL; + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Excess arguments."); } return 0; } static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd", "1", &link); + if (r < 0) + return log_oom(); printf("%s [OPTIONS...]\n\n" "Starts up and maintains the system or user services.\n\n" @@ -1090,20 +1126,28 @@ static int help(void) { " --log-color[=BOOL] Highlight important log messages\n" " --log-location[=BOOL] Include code location in log messages\n" " --default-standard-output= Set default standard output for services\n" - " --default-standard-error= Set default standard error output for services\n", - program_invocation_short_name); + " --default-standard-error= Set default standard error output for services\n" + "\nSee the %s for details.\n" + , program_invocation_short_name + , link + ); return 0; } -static int prepare_reexecute(Manager *m, FILE **_f, FDSet **_fds, bool switching_root) { +static int prepare_reexecute( + Manager *m, + FILE **ret_f, + FDSet **ret_fds, + bool switching_root) { + _cleanup_fdset_free_ FDSet *fds = NULL; _cleanup_fclose_ FILE *f = NULL; int r; assert(m); - assert(_f); - assert(_fds); + assert(ret_f); + assert(ret_fds); r = manager_open_serialization(m, &f); if (r < 0) @@ -1119,7 +1163,7 @@ static int prepare_reexecute(Manager *m, FILE **_f, FDSet **_fds, bool switching r = manager_serialize(m, f, fds, switching_root); if (r < 0) - return log_error_errno(r, "Failed to serialize state: %m"); + return r; if (fseeko(f, 0, SEEK_SET) == (off_t) -1) return log_error_errno(errno, "Failed to rewind serialization fd: %m"); @@ -1132,24 +1176,108 @@ static int prepare_reexecute(Manager *m, FILE **_f, FDSet **_fds, bool switching if (r < 0) return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m"); - *_f = TAKE_PTR(f); - *_fds = TAKE_PTR(fds); + *ret_f = TAKE_PTR(f); + *ret_fds = TAKE_PTR(fds); return 0; } +static void bump_file_max_and_nr_open(void) { + + /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large numbers of file + * descriptors are no longer a performance problem and their memory is properly tracked by memcg, thus counting + * them and limiting them in another two layers of limits is unnecessary and just complicates things. This + * function hence turns off 2 of the 4 levels of limits on file descriptors, and makes RLIMIT_NOLIMIT (soft + + * hard) the only ones that really matter. */ + +#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN + _cleanup_free_ char *t = NULL; + int r; +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX + /* I so wanted to use STRINGIFY(ULONG_MAX) here, but alas we can't as glibc/gcc define that as + * "(0x7fffffffffffffffL * 2UL + 1UL)". Seriously. 😢 */ + if (asprintf(&t, "%lu\n", ULONG_MAX) < 0) { + log_oom(); + return; + } + + r = sysctl_write("fs/file-max", t); + if (r < 0) + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m"); +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX && BUMP_PROC_SYS_FS_NR_OPEN + t = mfree(t); +#endif + +#if BUMP_PROC_SYS_FS_NR_OPEN + int v = INT_MAX; + + /* Arg! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know what they + * are. The expression by which the maximum is determined is dependent on the architecture, and is something we + * don't really want to copy to userspace, as it is dependent on implementation details of the kernel. Since + * the kernel doesn't expose the maximum value to us, we can only try and hope. Hence, let's start with + * INT_MAX, and then keep halving the value until we find one that works. Ugly? Yes, absolutely, but kernel + * APIs are kernel APIs, so what do can we do... 🤯 */ + + for (;;) { + int k; + + v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */ + if (v < 1024) { + log_warning("Can't bump fs.nr_open, value too small."); + break; + } + + k = read_nr_open(); + if (k < 0) { + log_error_errno(k, "Failed to read fs.nr_open: %m"); + break; + } + if (k >= v) { /* Already larger */ + log_debug("Skipping bump, value is already larger."); + break; + } + + if (asprintf(&t, "%i\n", v) < 0) { + log_oom(); + return; + } + + r = sysctl_write("fs/nr_open", t); + t = mfree(t); + if (r == -EINVAL) { + log_debug("Couldn't write fs.nr_open as %i, halving it.", v); + v /= 2; + continue; + } + if (r < 0) { + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m"); + break; + } + + log_debug("Successfully bumped fs.nr_open to %i", v); + break; + } +#endif +} + static int bump_rlimit_nofile(struct rlimit *saved_rlimit) { int r, nr; assert(saved_rlimit); - /* Save the original RLIMIT_NOFILE so that we can reset it - * later when transitioning from the initrd to the main + /* Save the original RLIMIT_NOFILE so that we can reset it later when transitioning from the initrd to the main * systemd or suchlike. */ if (getrlimit(RLIMIT_NOFILE, saved_rlimit) < 0) return log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m"); - /* Make sure forked processes get the default kernel setting */ + /* Get the underlying absolute limit the kernel enforces */ + nr = read_nr_open(); + + /* Make sure forked processes get limits based on the original kernel setting */ if (!arg_default_rlimit[RLIMIT_NOFILE]) { struct rlimit *rl; @@ -1157,11 +1285,25 @@ static int bump_rlimit_nofile(struct rlimit *saved_rlimit) { if (!rl) return log_oom(); + /* Bump the hard limit for system services to a substantially higher value. The default hard limit + * current kernels set is pretty low (4K), mostly for historical reasons. According to kernel + * developers, the fd handling in recent kernels has been optimized substantially enough, so that we + * can bump the limit now, without paying too high a price in memory or performance. Note however that + * we only bump the hard limit, not the soft limit. That's because select() works the way it works, and + * chokes on fds >= 1024. If we'd bump the soft limit globally, it might accidentally happen to + * unexpecting programs that they get fds higher than what they can process using select(). By only + * bumping the hard limit but leaving the low limit as it is we avoid this pitfall: programs that are + * written by folks aware of the select() problem in mind (and thus use poll()/epoll instead of + * select(), the way everybody should) can explicitly opt into high fds by bumping their soft limit + * beyond 1024, to the hard limit we pass. */ + if (arg_system) + rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE)); + arg_default_rlimit[RLIMIT_NOFILE] = rl; } - /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows */ - nr = read_nr_open(); + /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows, for + * both hard and soft. */ r = setrlimit_closest(RLIMIT_NOFILE, &RLIMIT_MAKE_CONST(nr)); if (r < 0) return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m"); @@ -1173,16 +1315,15 @@ static int bump_rlimit_memlock(struct rlimit *saved_rlimit) { int r; assert(saved_rlimit); - assert(getuid() == 0); - /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even though we have CAP_IPC_LOCK which - * should normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's - * bump the value high enough for the root user. */ + /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK which should + * normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's bump + * the value high enough for our user. */ if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0) return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m"); - r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*16ULL)); + r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(HIGH_RLIMIT_MEMLOCK)); if (r < 0) return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m"); @@ -1219,7 +1360,7 @@ static int status_welcome(void) { _cleanup_free_ char *pretty_name = NULL, *ansi_color = NULL; int r; - if (arg_show_status <= 0) + if (IN_SET(arg_show_status, SHOW_STATUS_NO, SHOW_STATUS_AUTO)) return 0; r = parse_os_release(NULL, @@ -1231,12 +1372,12 @@ static int status_welcome(void) { "Failed to read os-release file, ignoring: %m"); if (log_get_show_color()) - return status_printf(NULL, false, false, + return status_printf(NULL, 0, "\nWelcome to \x1B[%sm%s\x1B[0m!\n", isempty(ansi_color) ? "1" : ansi_color, isempty(pretty_name) ? "Linux" : pretty_name); else - return status_printf(NULL, false, false, + return status_printf(NULL, 0, "\nWelcome to %s!\n", isempty(pretty_name) ? "Linux" : pretty_name); } @@ -1268,7 +1409,7 @@ static int bump_unix_max_dgram_qlen(void) { r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen); if (r < 0) - return log_warning_errno(r, "Failed to read AF_UNIX datagram queue length, ignoring: %m"); + return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to read AF_UNIX datagram queue length, ignoring: %m"); r = safe_atolu(qlen, &v); if (r < 0) @@ -1277,7 +1418,7 @@ static int bump_unix_max_dgram_qlen(void) { if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN) return 0; - r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", 0, "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN); + r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", WRITE_STRING_FILE_DISABLE_BUFFER, "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN); if (r < 0) return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump AF_UNIX datagram queue length, ignoring: %m"); @@ -1474,13 +1615,29 @@ static void initialize_coredump(bool skip_setup) { if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0) log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m"); - /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are stored - * until the systemd-coredump tool is enabled via sysctl. */ + /* But at the same time, turn off the core_pattern logic by default, so that no + * coredumps are stored until the systemd-coredump tool is enabled via + * sysctl. However it can be changed via the kernel command line later so core + * dumps can still be generated during early startup and in initramfs. */ if (!skip_setup) disable_coredumps(); #endif } +static void initialize_core_pattern(bool skip_setup) { + int r; + + if (skip_setup || !arg_early_core_pattern) + return; + + if (getpid_cached() != 1) + return; + + r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", arg_early_core_pattern); +} + static void do_reexecute( int argc, char *argv[], @@ -1577,6 +1734,7 @@ static void do_reexecute( /* Reenable any blocked signals, especially important if we switch from initial ramdisk to init=... */ (void) reset_all_signal_handlers(); (void) reset_signal_mask(); + (void) rlimit_nofile_safe(); if (switch_root_init) { args[0] = switch_root_init; @@ -1633,7 +1791,7 @@ static int invoke_main_loop( return log_emergency_errno(r, "Failed to run main loop: %m"); } - switch (m->exit_code) { + switch ((ManagerObjective) r) { case MANAGER_RELOAD: { LogTarget saved_log_target; @@ -1660,7 +1818,8 @@ static int invoke_main_loop( r = manager_reload(m); if (r < 0) - log_warning_errno(r, "Failed to reload, ignoring: %m"); + /* Reloading failed before the point of no return. Let's continue running as if nothing happened. */ + m->objective = MANAGER_OK; break; } @@ -1724,19 +1883,19 @@ static int invoke_main_loop( case MANAGER_POWEROFF: case MANAGER_HALT: case MANAGER_KEXEC: { - static const char * const table[_MANAGER_EXIT_CODE_MAX] = { - [MANAGER_EXIT] = "exit", - [MANAGER_REBOOT] = "reboot", + static const char * const table[_MANAGER_OBJECTIVE_MAX] = { + [MANAGER_EXIT] = "exit", + [MANAGER_REBOOT] = "reboot", [MANAGER_POWEROFF] = "poweroff", - [MANAGER_HALT] = "halt", - [MANAGER_KEXEC] = "kexec" + [MANAGER_HALT] = "halt", + [MANAGER_KEXEC] = "kexec", }; log_notice("Shutting down."); *ret_reexecute = false; *ret_retval = m->return_value; - assert_se(*ret_shutdown_verb = table[m->exit_code]); + assert_se(*ret_shutdown_verb = table[m->objective]); *ret_fds = NULL; *ret_switch_root_dir = *ret_switch_root_init = NULL; @@ -1744,7 +1903,7 @@ static int invoke_main_loop( } default: - assert_not_reached("Unknown exit code."); + assert_not_reached("Unknown or unexpected manager objective."); } } } @@ -1816,7 +1975,7 @@ static int initialize_runtime( install_crash_handler(); if (!skip_setup) { - r = mount_cgroup_controllers(arg_join_controllers); + r = mount_cgroup_controllers(); if (r < 0) { *ret_error_message = "Failed to mount cgroup hierarchies"; return r; @@ -1827,6 +1986,7 @@ static int initialize_runtime( machine_id_setup(NULL, arg_machine_id, NULL); loopback_setup(); bump_unix_max_dgram_qlen(); + bump_file_max_and_nr_open(); test_usr(); write_container_id(); } @@ -1879,11 +2039,9 @@ static int initialize_runtime( if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) log_warning_errno(errno, "Failed to make us a subreaper: %m"); - if (arg_system) { - /* Bump up RLIMIT_NOFILE for systemd itself */ - (void) bump_rlimit_nofile(saved_rlimit_nofile); - (void) bump_rlimit_memlock(saved_rlimit_memlock); - } + /* Bump up RLIMIT_NOFILE for systemd itself */ + (void) bump_rlimit_nofile(saved_rlimit_nofile); + (void) bump_rlimit_memlock(saved_rlimit_memlock); return 0; } @@ -1942,7 +2100,6 @@ static void free_arguments(void) { arg_default_unit = mfree(arg_default_unit); arg_confirm_spawn = mfree(arg_confirm_spawn); - arg_join_controllers = strv_free_free(arg_join_controllers); arg_default_environment = strv_free(arg_default_environment); arg_syscall_archs = set_free(arg_syscall_archs); } @@ -1985,7 +2142,7 @@ static int load_configuration(int argc, char **argv, const char **ret_error_mess } /* Initialize the show status setting if it hasn't been set explicitly yet */ - if (arg_show_status == _SHOW_STATUS_UNSET) + if (arg_show_status == _SHOW_STATUS_INVALID) arg_show_status = SHOW_STATUS_YES; return 0; @@ -1994,50 +2151,43 @@ static int load_configuration(int argc, char **argv, const char **ret_error_mess static int safety_checks(void) { if (getpid_cached() == 1 && - arg_action != ACTION_RUN) { - log_error("Unsupported execution mode while PID 1."); - return -EPERM; - } + arg_action != ACTION_RUN) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Unsupported execution mode while PID 1."); if (getpid_cached() == 1 && - !arg_system) { - log_error("Can't run --user mode as PID 1."); - return -EPERM; - } + !arg_system) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run --user mode as PID 1."); if (arg_action == ACTION_RUN && arg_system && - getpid_cached() != 1) { - log_error("Can't run system mode unless PID 1."); - return -EPERM; - } + getpid_cached() != 1) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run system mode unless PID 1."); if (arg_action == ACTION_TEST && - geteuid() == 0) { - log_error("Don't run test mode as root."); - return -EPERM; - } + geteuid() == 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Don't run test mode as root."); if (!arg_system && arg_action == ACTION_RUN && - sd_booted() <= 0) { - log_error("Trying to run as user instance, but the system has not been booted with systemd."); - return -EOPNOTSUPP; - } + sd_booted() <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Trying to run as user instance, but the system has not been booted with systemd."); if (!arg_system && arg_action == ACTION_RUN && - !getenv("XDG_RUNTIME_DIR")) { - log_error("Trying to run as user instance, but $XDG_RUNTIME_DIR is not set."); - return -EUNATCH; - } + !getenv("XDG_RUNTIME_DIR")) + return log_error_errno(SYNTHETIC_ERRNO(EUNATCH), + "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set."); if (arg_system && arg_action == ACTION_RUN && - running_in_chroot() > 0) { - log_error("Cannot be run in a chroot() environment."); - return -EOPNOTSUPP; - } + running_in_chroot() > 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Cannot be run in a chroot() environment."); return 0; } @@ -2309,13 +2459,13 @@ int main(int argc, char *argv[]) { goto finish; if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES)) - (void) pager_open(arg_no_pager, false); + (void) pager_open(arg_pager_flags); if (arg_action != ACTION_RUN) skip_setup = true; if (arg_action == ACTION_HELP) { - retval = help(); + retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS; goto finish; } else if (arg_action == ACTION_VERSION) { retval = version(); @@ -2337,6 +2487,9 @@ int main(int argc, char *argv[]) { if (arg_action == ACTION_RUN) { + /* A core pattern might have been specified via the cmdline. */ + initialize_core_pattern(skip_setup); + /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */ log_close(); @@ -2373,8 +2526,8 @@ int main(int argc, char *argv[]) { m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp; m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp; m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp; - m->timestamps[MANAGER_TIMESTAMP_SECURITY_START] = security_start_timestamp; - m->timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH] = security_finish_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp; set_manager_defaults(m); set_manager_settings(m); @@ -2387,7 +2540,6 @@ int main(int argc, char *argv[]) { r = manager_startup(m, arg_serialization, fds); if (r < 0) { - log_error_errno(r, "Failed to fully start up daemon: %m"); error_message = "Failed to start up manager"; goto finish; } @@ -2473,8 +2625,8 @@ finish: if (error_message) manager_status_printf(NULL, STATUS_TYPE_EMERGENCY, ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL, - "%s, freezing.", error_message); - freeze_or_reboot(); + "%s.", error_message); + freeze_or_exit_or_reboot(); } return retval; |