summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Haller <thaller@redhat.com>2019-09-01 09:38:31 +0200
committerThomas Haller <thaller@redhat.com>2019-09-25 17:30:30 +0200
commit586d0282b63ceac9964a952abd397a04821bac59 (patch)
treee6867c0d1e2bfae01a8656eadeb6a1b9f7d86d44
parent28a9c7a3cf02494ab88cdb744e431bedde71bc91 (diff)
downloadNetworkManager-th/dnsmasq-rework.tar.gz
dns: move ratelimiting and restart from NMDnsManager to NMDnsDnsmasqth/dnsmasq-rework
Note that the only DNS plugin that actually emits the FAILED signal was NMDnsDnsmasq. Let's not handle restart, retry and rate-limiting by NMDnsManager but by NMDnsDnsmasq itself. There are three goals here: (1) we want that when dnsmasq (infrequently) crashes, that we always keep retrying. A random crash should be automatically resolved and eventually dnsmasq should be working again. Note that we anyway cannot fully detect whether something is wrong. OK, we detect crashes, but if dnsmasq just gets catatonic, it's just as broken. Point being: our ability to detect non-working dnsmasq is limited. (2) when dnsmasq keeps crashing all the time, then rate limit the retry. Of course, at this point there is already something seriously wrong, but we shouldn't kill the system by respawning the process without rate limiting. (3) previously, when NMDnsManager noticed that the pluging was broken (and rate-limiting kicked in), it would temporarily disable the plugin. Basically, that meant to write the real name servers to /etc/resolv.conf directly, instead of setting localhost. This partly conflicts with (1), because we want to retry and recover automatically. So what good is it to notice a problem, resort to plain /etc/resolv.conf for a short time, and then run into the issues again? If something is really broken, there is no way but to involve the user to investigate and fix the issue. Hence, we don't need to concern NMDnsManager with this either. The only thing that the manager notices is when the dnsmasq binary is not available. In that case, update() fails right away, and the manager falls back to configure the name servers in /etc/resolv.conf directly. Also, change the backoff time from 5 minutes to 1 minute (twice the burst interval). There is not particularly strong reason for either choice, I think that if the ratelimit kicks in, then something is already so wrong that it doesn't matter either way. Anyway, also 60 seconds is long enough to not kill the machine otherwise.
-rw-r--r--src/dns/nm-dns-dnsmasq.c117
-rw-r--r--src/dns/nm-dns-manager.c74
-rw-r--r--src/dns/nm-dns-plugin.c30
-rw-r--r--src/dns/nm-dns-plugin.h7
4 files changed, 90 insertions, 138 deletions
diff --git a/src/dns/nm-dns-dnsmasq.c b/src/dns/nm-dns-dnsmasq.c
index 49c5203966..4265a66660 100644
--- a/src/dns/nm-dns-dnsmasq.c
+++ b/src/dns/nm-dns-dnsmasq.c
@@ -31,6 +31,9 @@
#define DNSMASQ_DBUS_SERVICE "org.freedesktop.NetworkManager.dnsmasq"
#define DNSMASQ_DBUS_PATH "/uk/org/thekelleys/dnsmasq"
+#define RATELIMIT_INTERVAL_MSEC 30000
+#define RATELIMIT_BURST 5
+
#define _NMLOG_DOMAIN LOGD_DNS
/*****************************************************************************/
@@ -647,11 +650,19 @@ typedef struct {
char *name_owner;
+ gint64 burst_start_at;
+
GPid process_pid;
guint name_owner_changed_id;
guint main_timeout_id;
+ guint burst_retry_timeout_id;
+
+ guint8 burst_count;
+
+ bool is_stopped:1;
+
} NMDnsDnsmasqPrivate;
struct _NMDnsDnsmasq {
@@ -674,6 +685,10 @@ G_DEFINE_TYPE (NMDnsDnsmasq, nm_dns_dnsmasq, NM_TYPE_DNS_PLUGIN)
/*****************************************************************************/
+static gboolean start_dnsmasq (NMDnsDnsmasq *self, gboolean force_start, GError **error);
+
+/*****************************************************************************/
+
static void
add_dnsmasq_nameserver (NMDnsDnsmasq *self,
GVariantBuilder *servers,
@@ -839,30 +854,27 @@ send_dnsmasq_update (NMDnsDnsmasq *self)
{
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
- if (!priv->set_server_ex_args)
- return;
+ if ( !priv->name_owner
+ || !priv->set_server_ex_args)
+ return;
- if (priv->name_owner) {
- _LOGD ("trying to update dnsmasq nameservers");
-
- nm_clear_g_cancellable (&priv->update_cancellable);
- priv->update_cancellable = g_cancellable_new ();
-
- g_dbus_connection_call (priv->dbus_connection,
- priv->name_owner,
- DNSMASQ_DBUS_PATH,
- DNSMASQ_DBUS_SERVICE,
- "SetServersEx",
- priv->set_server_ex_args,
- NULL,
- G_DBUS_CALL_FLAGS_NO_AUTO_START,
- 20000,
- priv->update_cancellable,
- dnsmasq_update_done,
- self);
- g_clear_pointer (&priv->set_server_ex_args, g_variant_unref);
- } else
- _LOGD ("dnsmasq is starting... The nameserver update will be sent when dnsmasq appears");
+ _LOGD ("trying to update dnsmasq nameservers");
+
+ nm_clear_g_cancellable (&priv->update_cancellable);
+ priv->update_cancellable = g_cancellable_new ();
+
+ g_dbus_connection_call (priv->dbus_connection,
+ priv->name_owner,
+ DNSMASQ_DBUS_PATH,
+ DNSMASQ_DBUS_SERVICE,
+ "SetServersEx",
+ priv->set_server_ex_args,
+ NULL,
+ G_DBUS_CALL_FLAGS_NO_AUTO_START,
+ 20000,
+ priv->update_cancellable,
+ dnsmasq_update_done,
+ self);
}
/*****************************************************************************/
@@ -888,8 +900,11 @@ _main_cleanup (NMDnsDnsmasq *self, gboolean emit_failed)
* process in the background. */
nm_clear_g_cancellable (&priv->main_cancellable);
- if (emit_failed)
- _nm_dns_plugin_emit_failed (self, TRUE);
+ if ( !priv->is_stopped
+ && priv->burst_retry_timeout_id == 0) {
+ start_dnsmasq (self, FALSE, NULL);
+ send_dnsmasq_update (self);
+ }
}
static void
@@ -1001,10 +1016,24 @@ spawn_notify (GCancellable *cancellable,
}
static gboolean
-start_dnsmasq (NMDnsDnsmasq *self, GError **error)
+_burst_retry_timeout_cb (gpointer user_data)
+{
+ NMDnsDnsmasq *self = user_data;
+ NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
+
+ priv->burst_retry_timeout_id = 0;
+
+ start_dnsmasq (self, TRUE, NULL);
+ send_dnsmasq_update (self);
+ return G_SOURCE_REMOVE;
+}
+
+static gboolean
+start_dnsmasq (NMDnsDnsmasq *self, gboolean force_start, GError **error)
{
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
const char *dm_binary;
+ gint64 now;
if (G_LIKELY (priv->main_cancellable)) {
/* The process is already running or about to be started. Nothing to do. */
@@ -1030,6 +1059,31 @@ start_dnsmasq (NMDnsDnsmasq *self, GError **error)
}
}
+ now = nm_utils_get_monotonic_timestamp_ms ();
+ if ( force_start
+ || priv->burst_start_at == 0
+ || priv->burst_start_at + RATELIMIT_INTERVAL_MSEC <= now) {
+ priv->burst_start_at = now;
+ priv->burst_count = 1;
+ nm_clear_g_source (&priv->burst_retry_timeout_id);
+ _LOGT ("rate-limit: start burst interval of %d seconds %s",
+ RATELIMIT_INTERVAL_MSEC / 1000,
+ force_start ? " (force)" : "");
+ } else if (priv->burst_count < RATELIMIT_BURST) {
+ nm_assert (priv->burst_retry_timeout_id == 0);
+ priv->burst_count++;
+ _LOGT ("rate-limit: %u try within burst interval of %d seconds",
+ (guint) priv->burst_count,
+ RATELIMIT_INTERVAL_MSEC / 1000);
+ } else {
+ if (priv->burst_retry_timeout_id == 0) {
+ _LOGW ("dnsmasq dies and gets respawned too quickly. Back off. Something is very wrong");
+ priv->burst_retry_timeout_id = g_timeout_add_seconds ((2 * RATELIMIT_INTERVAL_MSEC) / 1000, _burst_retry_timeout_cb, self);
+ } else
+ _LOGT ("rate-limit: currently rate-limited from restart");
+ return TRUE;
+ }
+
priv->main_timeout_id = g_timeout_add (10000,
spawn_timeout_cb,
self);
@@ -1053,7 +1107,7 @@ update (NMDnsPlugin *plugin,
NMDnsDnsmasq *self = NM_DNS_DNSMASQ (plugin);
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
- if (!start_dnsmasq (self, error))
+ if (!start_dnsmasq (self, TRUE, error))
return FALSE;
nm_clear_pointer (&priv->set_server_ex_args, g_variant_unref);
@@ -1072,6 +1126,11 @@ static void
stop (NMDnsPlugin *plugin)
{
NMDnsDnsmasq *self = NM_DNS_DNSMASQ (plugin);
+ NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
+
+ priv->is_stopped = TRUE;
+ priv->burst_start_at = 0;
+ nm_clear_g_source (&priv->burst_retry_timeout_id);
/* Cancelling the cancellable will also terminate the
* process (in the background). */
@@ -1097,6 +1156,10 @@ dispose (GObject *object)
NMDnsDnsmasq *self = NM_DNS_DNSMASQ (object);
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
+ priv->is_stopped = TRUE;
+
+ nm_clear_g_source (&priv->burst_retry_timeout_id);
+
_main_cleanup (self, FALSE);
g_clear_pointer (&priv->set_server_ex_args, g_variant_unref);
diff --git a/src/dns/nm-dns-manager.c b/src/dns/nm-dns-manager.c
index 4c619d55f6..3234c4e798 100644
--- a/src/dns/nm-dns-manager.c
+++ b/src/dns/nm-dns-manager.c
@@ -50,10 +50,6 @@
#define NETCONFIG_PATH "/sbin/netconfig"
#endif
-#define PLUGIN_RATELIMIT_INTERVAL 30
-#define PLUGIN_RATELIMIT_BURST 5
-#define PLUGIN_RATELIMIT_DELAY 300
-
/*****************************************************************************/
typedef enum {
@@ -1557,71 +1553,6 @@ update_dns (NMDnsManager *self,
return !update || result == SR_SUCCESS;
}
-static void
-plugin_failed (NMDnsManager *self, NMDnsPlugin *plugin)
-{
- GError *error = NULL;
-
- /* Errors with non-caching plugins aren't fatal */
- if (!nm_dns_plugin_is_caching (plugin))
- return;
-
- /* Disable caching until the next DNS update */
- if (!update_dns (self, TRUE, &error)) {
- _LOGW ("could not commit DNS changes: %s", error->message);
- g_clear_error (&error);
- }
-}
-
-static gboolean
-plugin_child_quit_update_dns (gpointer user_data)
-{
- GError *error = NULL;
- NMDnsManager *self = NM_DNS_MANAGER (user_data);
-
- /* Let the plugin try to spawn the child again */
- if (!update_dns (self, FALSE, &error)) {
- _LOGW ("could not commit DNS changes: %s", error->message);
- g_clear_error (&error);
- }
-
- return G_SOURCE_REMOVE;
-}
-
-static void
-plugin_failed_cb (NMDnsPlugin *plugin, gboolean is_fatal, NMDnsManager *self)
-{
- NMDnsManagerPrivate *priv = NM_DNS_MANAGER_GET_PRIVATE (self);
-
- if (is_fatal) {
- gint64 ts = nm_utils_get_monotonic_timestamp_ms ();
-
- _LOGW ("plugin %s died unexpectedly", nm_dns_plugin_get_name (plugin));
-
- if ( !priv->plugin_ratelimit.ts
- || (ts - priv->plugin_ratelimit.ts) / 1000 > PLUGIN_RATELIMIT_INTERVAL) {
- priv->plugin_ratelimit.ts = ts;
- priv->plugin_ratelimit.num_restarts = 0;
- } else {
- priv->plugin_ratelimit.num_restarts++;
- if (priv->plugin_ratelimit.num_restarts > PLUGIN_RATELIMIT_BURST) {
- plugin_failed (self, plugin);
- _LOGW ("plugin %s child respawning too fast, delaying update for %u seconds",
- nm_dns_plugin_get_name (plugin), PLUGIN_RATELIMIT_DELAY);
- priv->plugin_ratelimit.timer = g_timeout_add_seconds (PLUGIN_RATELIMIT_DELAY,
- plugin_child_quit_update_dns,
- self);
- return;
- }
- }
-
- plugin_child_quit_update_dns (self);
- return;
- }
-
- plugin_failed (self, plugin);
-}
-
/*****************************************************************************/
static void
@@ -1856,7 +1787,6 @@ _clear_plugin (NMDnsManager *self)
nm_clear_g_source (&priv->plugin_ratelimit.timer);
if (priv->plugin) {
- g_signal_handlers_disconnect_by_func (priv->plugin, plugin_failed_cb, self);
nm_dns_plugin_stop (priv->plugin);
g_clear_object (&priv->plugin);
return TRUE;
@@ -2079,10 +2009,6 @@ again:
} else if (nm_clear_g_object (&priv->sd_resolve_plugin))
systemd_resolved_changed = TRUE;
- if ( plugin_changed
- && priv->plugin)
- g_signal_connect (priv->plugin, NM_DNS_PLUGIN_FAILED, G_CALLBACK (plugin_failed_cb), self);
-
g_object_freeze_notify (G_OBJECT (self));
if (!nm_streq0 (priv->mode, mode)) {
diff --git a/src/dns/nm-dns-plugin.c b/src/dns/nm-dns-plugin.c
index d7d3672ec7..6930b7a6cd 100644
--- a/src/dns/nm-dns-plugin.c
+++ b/src/dns/nm-dns-plugin.c
@@ -17,13 +17,6 @@
/*****************************************************************************/
-enum {
- FAILED,
- LAST_SIGNAL,
-};
-
-static guint signals[LAST_SIGNAL] = { 0 };
-
typedef struct _NMDnsPluginPrivate {
GPid pid;
guint watch_id;
@@ -105,15 +98,6 @@ nm_dns_plugin_stop (NMDnsPlugin *self)
klass->stop (self);
}
-void
-_nm_dns_plugin_emit_failed (gpointer /* NMDnsPlugin * */ self,
- gboolean is_fatal)
-{
- nm_assert (NM_IS_DNS_PLUGIN (self));
-
- g_signal_emit (self, signals[FAILED], 0, (gboolean) (!!is_fatal));
-}
-
/*****************************************************************************/
static void
@@ -124,18 +108,4 @@ nm_dns_plugin_init (NMDnsPlugin *self)
static void
nm_dns_plugin_class_init (NMDnsPluginClass *plugin_class)
{
- GObjectClass *object_class = G_OBJECT_CLASS (plugin_class);
-
- /* Emitted by the plugin and consumed by NMDnsManager when
- * some error happens with the nameserver subprocess. Causes NM to fall
- * back to writing out a non-local-caching resolv.conf until the next
- * DNS update.
- */
- signals[FAILED] =
- g_signal_new (NM_DNS_PLUGIN_FAILED,
- G_OBJECT_CLASS_TYPE (object_class),
- G_SIGNAL_RUN_FIRST,
- 0, NULL, NULL,
- g_cclosure_marshal_VOID__BOOLEAN,
- G_TYPE_NONE, 1, G_TYPE_BOOLEAN);
}
diff --git a/src/dns/nm-dns-plugin.h b/src/dns/nm-dns-plugin.h
index 326bc50eae..e96d67f097 100644
--- a/src/dns/nm-dns-plugin.h
+++ b/src/dns/nm-dns-plugin.h
@@ -15,8 +15,6 @@
#define NM_IS_DNS_PLUGIN_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), NM_TYPE_DNS_PLUGIN))
#define NM_DNS_PLUGIN_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), NM_TYPE_DNS_PLUGIN, NMDnsPluginClass))
-#define NM_DNS_PLUGIN_FAILED "failed"
-
typedef struct {
GObject parent;
} NMDnsPlugin;
@@ -61,9 +59,4 @@ gboolean nm_dns_plugin_update (NMDnsPlugin *self,
void nm_dns_plugin_stop (NMDnsPlugin *self);
-/*****************************************************************************/
-
-void _nm_dns_plugin_emit_failed (gpointer /* NMDnsPlugin * */ self,
- gboolean is_fatal);
-
#endif /* __NM_DNS_PLUGIN_H__ */