summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Haller <thaller@redhat.com>2017-12-05 16:32:04 +0100
committerThomas Haller <thaller@redhat.com>2017-12-14 11:32:46 +0100
commite2ad949df45042a98e4a2109792e118d3962c131 (patch)
treeb7a3236759cd80fe1f2a10bab0dfca5bb4f3c95d
parent54b452a2ceb8a4361407e8a7fd646ba167de32fe (diff)
downloadNetworkManager-th/device-route-metric-rh1505893.tar.gz
device: generate unique default route-metrics per interfaceth/device-route-metric-rh1505893
In the past we had NMDefaultRouteManager which would coordinate adding the default-route with identical metrics. That especially happened, when activating two devices of the same type, without explicitly specifying ipv4.route-metric. For example, with ethernet devices, the routes on both interfaces would get a metric of 100. Coordinating routes was especially necessary, because we added routes with NLM_F_EXCL flag, akin to `ip route replace`. We not only had to avoid that activating two devices in NetworkManager would result in a fight over the default-route, but more importently to preserve externally added default-routes on unmanaged interfaces. NMDefaultRouteManager would ensure that in case of duplicate metrics, that the device that activated first would keep the best default-route. It would do so by bumping the metric of the second device to find a unused metric. The bumping itself was not very important -- MDefaultRouteManager could also just not configure any default-routes that show up as second, the result would be quite similar. More important was to keep the best default-route on the first activating device until the device deactivates or a device activates that really has a better default-route.. Likewise, NMRouteManager would globally manage non-default-routes. It would not do any bumping of metrics, but it would also ensure that the routes of the device that activates first are not overwritten by a device activating later. However, the `ip route replace` approach has downsides, especially that it messes with routes on other interfaces, interfaces that are possibly not managed by NetworkManager. Another downside is, that binding a socket to an interface might not result in correct routes, because the route might just not be there (in case of NMRouteManager, which wouldn't configure duplicate routes by bumping their metric). Since commit 77ec302714795f905301d500b9aab6c88001f32e we would no longer use NLM_F_EXCL, but add routes akin to `ip route append`. When activating for example two ethernet devices with no explict route metric configuration, there are two routes like default via 10.16.122.254 dev eth0 proto dhcp metric 100 default via 192.168.100.1 dev eth1 proto dhcp metric 100 This does not only affect default routes. In case of a multi-homing setup you'd get 192.168.100.0/24 dev eth0 proto kernel scope link src 192.168.100.1 metric 100 192.168.100.0/24 dev eth1 proto kernel scope link src 192.168.100.1 metric 100 but it's visible the most for default-routes. Note that we would append the routes that are activated later, as the order of `ip route show` confirms. One might hence expect, that kernel selects a route based on the order in the routing tables. However, that isn't the case, and activating the second interface will non-deterministically re-route traffic via the new interface. That will interfere badly with with NAT, stateful firewalls, and existing connections (like TCP). The solution is to have NMManager keep a global index of the default route-metrics currently in use. So, instead of determining the default-route metric based solely on the device-type, we now in addition generate default metrics that do not overlap. For example, if you activate eth0 first, it gets route-metric 100, and if you then activate eth1, it gets 101. Note that if you deactivate and re-activate eth0, then it will get route-metric 102, because the best route should stick on eth1 (which reserves the range 100 to 101). Note that when a connection explititly selects a particular metric, then that choice is honored (contrary to NMDefaultRouteManager which was more concerned with avoiding conflicts, then keeping the exact metric). https://bugzilla.redhat.com/show_bug.cgi?id=1505893
-rw-r--r--src/devices/nm-device.c10
-rw-r--r--src/nm-manager.c237
-rw-r--r--src/nm-manager.h10
3 files changed, 255 insertions, 2 deletions
diff --git a/src/devices/nm-device.c b/src/devices/nm-device.c
index 61f7395c60..fda2126410 100644
--- a/src/devices/nm-device.c
+++ b/src/devices/nm-device.c
@@ -1870,7 +1870,10 @@ nm_device_get_route_metric (NMDevice *self,
if (route_metric >= 0)
goto out;
}
- route_metric = nm_device_get_route_metric_default (nm_device_get_device_type (self));
+
+ route_metric = nm_manager_device_route_metric_reserve (nm_manager_get (),
+ nm_device_get_ip_ifindex (self),
+ nm_device_get_device_type (self));
out:
return nm_utils_ip_route_metric_normalize (addr_family, route_metric);
}
@@ -12609,6 +12612,11 @@ _cleanup_generic_pre (NMDevice *self, CleanupType cleanup_type)
_cancel_activation (self);
+ if (cleanup_type != CLEANUP_TYPE_KEEP) {
+ nm_manager_device_route_metric_clear (nm_manager_get (),
+ nm_device_get_ip_ifindex (self));
+ }
+
if ( cleanup_type == CLEANUP_TYPE_DECONFIGURE
&& priv->fw_state >= FIREWALL_STATE_INITIALIZED
&& priv->fw_mgr
diff --git a/src/nm-manager.c b/src/nm-manager.c
index e2e53f0344..5c84dc463e 100644
--- a/src/nm-manager.c
+++ b/src/nm-manager.c
@@ -161,6 +161,8 @@ typedef struct {
NMAuthManager *auth_mgr;
+ GHashTable *device_route_metrics;
+
GSList *auth_chains;
GHashTable *sleep_devices;
@@ -325,6 +327,237 @@ static NM_CACHED_QUARK_FCN ("autoconnect-root", autoconnect_root_quark)
/*****************************************************************************/
+typedef struct {
+ int ifindex;
+ guint32 aspired_metric;
+ guint32 effective_metric;
+} DeviceRouteMetricData;
+
+static DeviceRouteMetricData *
+_device_route_metric_data_new (int ifindex, guint32 metric)
+{
+ DeviceRouteMetricData *data;
+
+ nm_assert (ifindex > 0);
+
+ /* For IPv4, metrics can use the entire uint32 bit range. For IPv6,
+ * zero is treated like 1024. Since we handle IPv4 and IPv6 identically,
+ * we cannot allow a zero metric here.
+ */
+ nm_assert (metric > 0);
+
+ data = g_slice_new0 (DeviceRouteMetricData);
+ data->ifindex = ifindex;
+ data->aspired_metric = metric;
+ data->effective_metric = metric;
+ return data;
+}
+
+static guint
+_device_route_metric_data_by_ifindex_hash (gconstpointer p)
+{
+ const DeviceRouteMetricData *data = p;
+ NMHashState h;
+
+ nm_hash_init (&h, 1030338191);
+ nm_hash_update_vals (&h, data->ifindex);
+ return nm_hash_complete (&h);
+}
+
+static gboolean
+_device_route_metric_data_by_ifindex_equal (gconstpointer pa, gconstpointer pb)
+{
+ const DeviceRouteMetricData *a = pa;
+ const DeviceRouteMetricData *b = pb;
+
+ return a->ifindex == b->ifindex;
+}
+
+static guint32
+_device_route_metric_get (NMManager *self,
+ int ifindex,
+ NMDeviceType device_type,
+ gboolean lookup_only)
+{
+ NMManagerPrivate *priv;
+ const DeviceRouteMetricData *d2;
+ DeviceRouteMetricData *data;
+ DeviceRouteMetricData data_lookup;
+ const NMDedupMultiHeadEntry *all_links_head;
+ NMPObject links_needle;
+ guint n_links;
+ gboolean cleaned = FALSE;
+ GHashTableIter h_iter;
+
+ g_return_val_if_fail (NM_IS_MANAGER (self), 0);
+
+ if (ifindex <= 0) {
+ if (lookup_only)
+ return 0;
+ return nm_device_get_route_metric_default (device_type);
+ }
+
+ priv = NM_MANAGER_GET_PRIVATE (self);
+
+ if ( lookup_only
+ && !priv->device_route_metrics)
+ return 0;
+
+ if (G_UNLIKELY (!priv->device_route_metrics)) {
+ const GHashTable *h;
+ const NMConfigDeviceStateData *device_state;
+
+ priv->device_route_metrics = g_hash_table_new_full (_device_route_metric_data_by_ifindex_hash,
+ _device_route_metric_data_by_ifindex_equal,
+ NULL,
+ nm_g_slice_free_fcn (DeviceRouteMetricData));
+ cleaned = TRUE;
+
+ /* we need to pre-populate the cache for all (still existing) devices from the state-file */
+ h = nm_config_device_state_get_all (priv->config);
+ if (!h)
+ goto initited;
+
+ g_hash_table_iter_init (&h_iter, (GHashTable *) h);
+ while (g_hash_table_iter_next (&h_iter, NULL, (gpointer *) &device_state)) {
+ if (!device_state->route_metric_default)
+ continue;
+ if (!nm_platform_link_get (priv->platform, device_state->ifindex)) {
+ /* we have the entry in the state file, but (currently) no such
+ * ifindex exists in platform. Most likely the entry is obsolete,
+ * hence we skip it. */
+ continue;
+ }
+ if (!nm_g_hash_table_add (priv->device_route_metrics,
+ _device_route_metric_data_new (device_state->ifindex,
+ device_state->route_metric_default)))
+ nm_assert_not_reached ();
+ }
+ }
+
+initited:
+ data_lookup.ifindex = ifindex;
+
+ data = g_hash_table_lookup (priv->device_route_metrics, &data_lookup);
+ if (data)
+ return data->effective_metric;
+ if (lookup_only)
+ return 0;
+
+ if (!cleaned) {
+ /* get the number of all links in the platform cache. */
+ all_links_head = nm_platform_lookup_all (priv->platform,
+ NMP_CACHE_ID_TYPE_OBJECT_TYPE,
+ nmp_object_stackinit_id_link (&links_needle, 1));
+ n_links = all_links_head ? all_links_head->len : 0;
+
+ /* on systems where a lot of devices are created and go away, the index contains
+ * a lot of stale entries. We must from time to time clean them up.
+ *
+ * Do do this cleanup, whenever we have more enties then 2 times the number of links. */
+ if (G_UNLIKELY (g_hash_table_size (priv->device_route_metrics) > NM_MAX (20, n_links * 2))) {
+ /* from time to time, we need to do some house-keeping and prune stale entries.
+ * Otherwise, on a system where interfaces frequently come and go (docker), we
+ * keep growing this cache for ifindexes that no longer exist. */
+ g_hash_table_iter_init (&h_iter, priv->device_route_metrics);
+ while (g_hash_table_iter_next (&h_iter, NULL, (gpointer *) &d2)) {
+ if (!nm_platform_link_get (priv->platform, d2->ifindex))
+ g_hash_table_iter_remove (&h_iter);
+ }
+ cleaned = TRUE;
+ }
+ }
+
+ data = _device_route_metric_data_new (ifindex, nm_device_get_route_metric_default (device_type));
+
+ /* unfortunately, there is no stright forward way to lookup all reserved metrics.
+ * Note, that we don't only have to know which metrics are currently reserved,
+ * but also, which metrics are now seemingly un-used but caused another reserved
+ * metric to be bumped. Hence, the naive O(n^2) search :( */
+again:
+ g_hash_table_iter_init (&h_iter, priv->device_route_metrics);
+ while (g_hash_table_iter_next (&h_iter, NULL, (gpointer *) &d2)) {
+ if ( data->effective_metric < d2->aspired_metric
+ || data->effective_metric > d2->effective_metric) {
+ /* no overlap. Skip. */
+ continue;
+ }
+ if ( !cleaned
+ && !nm_platform_link_get (priv->platform, d2->ifindex)) {
+ /* the metric seems taken, but there is no such interface. This entry
+ * is stale, forget about it. */
+ g_hash_table_iter_remove (&h_iter);
+ continue;
+ }
+ data->effective_metric = d2->effective_metric;
+ if (data->effective_metric == G_MAXUINT32) {
+ /* we cannot bump any further. Done. */
+ break;
+ }
+
+ if (data->effective_metric - data->aspired_metric > 50) {
+ /* as one active interface reserves an entire range of metrics
+ * (from aspired_metric to effective_metric), that means if you
+ * alternatingly activate two interfaces, their metric will
+ * juggle up.
+ *
+ * Limit this, don't bump the metric more then 50 times. */
+ break;
+ }
+
+ /* bump the metric, and search again. */
+ data->effective_metric++;
+ goto again;
+ }
+
+ _LOGT (LOGD_DEVICE, "default-route-metric: ifindex %d reserves metric %u (aspired %u)",
+ data->ifindex, data->effective_metric, data->aspired_metric);
+
+ if (!nm_g_hash_table_add (priv->device_route_metrics, data))
+ nm_assert_not_reached ();
+
+ return data->effective_metric;
+}
+
+guint32
+nm_manager_device_route_metric_reserve (NMManager *self,
+ int ifindex,
+ NMDeviceType device_type)
+{
+ guint32 metric;
+
+ metric = _device_route_metric_get (self, ifindex, device_type, FALSE);
+ nm_assert (metric != 0);
+ return metric;
+}
+
+guint32
+nm_manager_device_route_metric_get (NMManager *self,
+ int ifindex)
+{
+ return _device_route_metric_get (self, ifindex, NM_DEVICE_TYPE_UNKNOWN, TRUE);
+}
+
+void
+nm_manager_device_route_metric_clear (NMManager *self,
+ int ifindex)
+{
+ NMManagerPrivate *priv;
+ DeviceRouteMetricData data_lookup;
+
+ priv = NM_MANAGER_GET_PRIVATE (self);
+
+ if (!priv->device_route_metrics)
+ return;
+ data_lookup.ifindex = ifindex;
+ if (g_hash_table_remove (priv->device_route_metrics, &data_lookup)) {
+ _LOGT (LOGD_DEVICE, "default-route-metric: ifindex %d released",
+ ifindex);
+ }
+}
+
+/*****************************************************************************/
+
static void
_delete_volatile_connection_do (NMManager *self,
NMSettingsConnection *connection)
@@ -5229,7 +5462,7 @@ nm_manager_write_device_state (NMManager *self)
nm_owned = nm_device_is_software (device) ? nm_device_is_nm_owned (device) : -1;
- route_metric_default = 0;
+ route_metric_default = nm_manager_device_route_metric_get (self, ifindex);
if (nm_config_device_state_write (ifindex,
managed_type,
@@ -6615,6 +6848,8 @@ dispose (GObject *object)
nm_clear_g_source (&priv->timestamp_update_id);
+ g_clear_pointer (&priv->device_route_metrics, g_hash_table_destroy);
+
G_OBJECT_CLASS (nm_manager_parent_class)->dispose (object);
}
diff --git a/src/nm-manager.h b/src/nm-manager.h
index d52b2655de..34e868a9cb 100644
--- a/src/nm-manager.h
+++ b/src/nm-manager.h
@@ -115,6 +115,16 @@ NMDevice * nm_manager_get_device_by_ifindex (NMManager *manager,
NMDevice * nm_manager_get_device_by_path (NMManager *manager,
const char *path);
+guint32 nm_manager_device_route_metric_reserve (NMManager *self,
+ int ifindex,
+ NMDeviceType device_type);
+
+guint32 nm_manager_device_route_metric_get (NMManager *self,
+ int ifindex);
+
+void nm_manager_device_route_metric_clear (NMManager *self,
+ int ifindex);
+
char * nm_manager_get_connection_iface (NMManager *self,
NMConnection *connection,
NMDevice **out_parent,