diff options
author | Lubomir Rintel <lkundrak@v3.sk> | 2022-07-29 00:02:20 +0200 |
---|---|---|
committer | Lubomir Rintel <lkundrak@v3.sk> | 2022-07-29 00:03:18 +0200 |
commit | 1dc8b7f4774827c889718d3b6748efe330a9ec6e (patch) | |
tree | 0c27af2e03bb07aaf68d1ce1a1f39662b95939be | |
parent | 7864e75e96582d0bac5c56669b2294e8fac473b6 (diff) | |
download | NetworkManager-lr/up-carrier-wait.tar.gz |
device: wait for carrier even if it wasn't us who brought the device IFF_UP (branch: lr/up-carrier-wait)
The devices generally need to be IFF_UP and wait a little before the
carrier detection is reliable. Some devices actually need to wait
more than a little -- r8169 needs up to 5 seconds.
For this reason, we delay startup complete while the carrier is down
after we bring the device up. We do this so that we don't reject
activations due to carrier down until we're sure it's really down.
This works well as long as it's us who brought the device up.
If we're restarting the daemon, the device is going to be already up
when we start up the daemon for the second time. There's, however, a
slim chance that the device was brought down and up very shortly before
the restart and therefore the carrier reporting is still not reliable.
As a matter of fact, we bring the devices down and back up on some
occasions, such as when enslaving to a team device.
Therefore, the following events in quick succession cause trouble:
# nmcli con up team-slave-eth0
[20099.205355] Generic FE-GE Realtek PHY r8169-0-300:00: attached PHY driver (mii_bus:phy_addr=r8169-0-300:00, irq=MAC)
[20099.365641] nm-team: Port device eth0 added
[20099.370728] r8169 0000:03:00.0 eth0: Link is Down
[20099.436631] nm-team: Port device eth0 removed
[20099.463422] Generic FE-GE Realtek PHY r8169-0-300:00: attached PHY driver (mii_bus:phy_addr=r8169-0-300:00, irq=MAC)
[20099.628505] r8169 0000:03:00.0 eth0: Link is Down
[20099.669425] Generic FE-GE Realtek PHY r8169-0-300:00: attached PHY driver (mii_bus:phy_addr=r8169-0-300:00, irq=MAC)
[20099.833457] r8169 0000:03:00.0 eth0: Link is Down
[20099.838471] nm-team: Port device eth0 added
The device has been brought down, enslaved and brought up.
"Link is Down" indicates carrier not being detected.
Connection successfully activated (D-Bus active path: /org/freedesktop/NetworkManager/ActiveConnection/7)
# systemctl restart NetworkManager
Now NM sees the device being up, but carrier down.
# nmcli con up testeth0
Error: Connection activation failed: No suitable device found for this connection (...).
Activation failed, because eth0 carrier still appears down.
# [20102.943464] r8169 0000:03:00.0 eth0: Link is Up - 1Gbps/Full - flow control rx/tx
Now it's up, but the party is already over. Shiet.
Let's wait whenever the device reaches the unavailable state, whether we
bring it up at that point or not.
Fixes-test: @restart_L2_only_lacp
https://bugzilla.redhat.com/show_bug.cgi?id=2092361
-rw-r--r-- | src/core/devices/nm-device.c | 59 |
1 files changed, 37 insertions, 22 deletions
diff --git a/src/core/devices/nm-device.c b/src/core/devices/nm-device.c index f2de8c9c89..6cc823b2ee 100644 --- a/src/core/devices/nm-device.c +++ b/src/core/devices/nm-device.c @@ -13875,10 +13875,38 @@ _get_carrier_wait_ms(NMDevice *self) CARRIER_WAIT_TIME_MS); } +/* + * Devices that support carrier detect must be IFF_UP to report carrier + * changes; so after setting the device IFF_UP we must suppress startup + * complete (via a pending action) until either the carrier turns on, or + * a timeout is reached. + */ +static void +carrier_detect_wait(NMDevice *self) +{ + NMDevicePrivate *priv = NM_DEVICE_GET_PRIVATE(self); + gint64 now_ms, until_ms; + + if (!nm_device_has_capability(self, NM_DEVICE_CAP_CARRIER_DETECT)) + return; + + /* we start a grace period of 5 seconds during which we will schedule + * a pending action whenever we have no carrier. + * + * If during that time carrier goes away, we declare the interface + * as not ready. */ + nm_clear_g_source(&priv->carrier_wait_id); + if (!priv->carrier) + nm_device_add_pending_action(self, NM_PENDING_ACTION_CARRIER_WAIT, FALSE); + + now_ms = nm_utils_get_monotonic_timestamp_msec(); + until_ms = NM_MAX(now_ms + _get_carrier_wait_ms(self), priv->carrier_wait_until_ms); + priv->carrier_wait_id = g_timeout_add(until_ms - now_ms, carrier_wait_timeout, self); +} + gboolean nm_device_bring_up(NMDevice *self, gboolean block, gboolean *no_firmware) { - NMDevicePrivate *priv = NM_DEVICE_GET_PRIVATE(self); gboolean device_is_up = FALSE; NMDeviceCapabilities capabilities; int ifindex; @@ -13934,27 +13962,7 @@ nm_device_bring_up(NMDevice *self, gboolean block, gboolean *no_firmware) capabilities |= NM_DEVICE_GET_CLASS(self)->get_generic_capabilities(self); _add_capabilities(self, capabilities); - /* Devices that support carrier detect must be IFF_UP to report carrier - * changes; so after setting the device IFF_UP we must suppress startup - * complete (via a pending action) until either the carrier turns on, or - * a timeout 
is reached. - */ - if (nm_device_has_capability(self, NM_DEVICE_CAP_CARRIER_DETECT)) { - gint64 now_ms, until_ms; - - /* we start a grace period of 5 seconds during which we will schedule - * a pending action whenever we have no carrier. - * - * If during that time carrier goes away, we declare the interface - * as not ready. */ - nm_clear_g_source(&priv->carrier_wait_id); - if (!priv->carrier) - nm_device_add_pending_action(self, NM_PENDING_ACTION_CARRIER_WAIT, FALSE); - - now_ms = nm_utils_get_monotonic_timestamp_msec(); - until_ms = NM_MAX(now_ms + _get_carrier_wait_ms(self), priv->carrier_wait_until_ms); - priv->carrier_wait_id = g_timeout_add(until_ms - now_ms, carrier_wait_timeout, self); - } + carrier_detect_wait(self); /* Can only get HW address of some devices when they are up */ nm_device_update_hw_address(self); @@ -15713,6 +15721,13 @@ _set_state_full(NMDevice *self, NMDeviceState state, NMDeviceStateReason reason, if (!nm_device_bring_up(self, TRUE, &no_firmware) && no_firmware) _LOGW(LOGD_PLATFORM, "firmware may be missing."); nm_device_set_firmware_missing(self, no_firmware ? TRUE : FALSE); + } else { + /* We didn't bring the device up and we have little idea + * when was it brought up. Play it safe and assume it could + * have been brought up very recently and it might one of + * those who take time to detect carrier. + */ + carrier_detect_wait(self); } /* Ensure the device gets deactivated in response to stuff like |