summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBalazs Gibizer <balazs.gibizer@est.tech>2021-10-11 14:41:37 +0200
committerBalazs Gibizer <balazs.gibizer@est.tech>2021-11-19 15:43:29 +0100
commitc531fdcc192afb5af628ac567cb0ff8aa3eab052 (patch)
treec866cf25ccaa94e84e4fc0fd945e428f6297bee5
parent34e0c0205b1053d3bbe064177740aba654997fe0 (diff)
downloadnova-c531fdcc192afb5af628ac567cb0ff8aa3eab052.tar.gz
Add a WA flag waiting for vif-plugged event during reboot
The libvirt driver's power on and hard reboot destroy the domain first and unplug the vifs, then recreate the domain and replug the vifs. However nova does not wait for the network-vif-plugged event before unpausing the domain. This can cause the domain to start running and requesting an IP via DHCP before the networking backend has finished plugging the vifs. So this patch adds a workaround config option to nova to wait for network-vif-plugged events during hard reboot the same way as nova waits for this event during new instance spawn. This logic cannot be enabled unconditionally as not all neutron networking backends send plug time events to wait for. Also the logic needs to be vnic_type dependent as ml2/ovs and the in-tree sriov backend are often deployed together on the same compute. While ml2/ovs sends a plug time event, the sriov backend does not send it reliably. So the configuration is not just a boolean flag but a list of vnic_types instead. This way waiting for the plug time event for a vif that is handled by ml2/ovs is possible while the instance has other vifs handled by the sriov backend where no event can be expected. Conflicts: nova/virt/libvirt/driver.py both I73305e82da5d8da548961b801a8e75fb0e8c4cf1 and I0b93bdc12cdce591c7e642ab8830e92445467b9a are not in stable/victoria The stable/victoria specific changes: * The list of accepted vnic_type-s is adapted to what is supported by neutron in this release. So vdpa, accelerator-direct, and accelerator-direct-physical are removed as they were only added in stable/wallaby Change-Id: Ie904d1513b5cf76d6d5f6877545e8eb378dd5499 Closes-Bug: #1946729 (cherry picked from commit 68c970ea9915a95f9828239006559b84e4ba2581) (cherry picked from commit 0c41bfb8c5c60f1cc930ae432e6be460ee2e97ac) (cherry picked from commit 89c4ff5f7b45f1a5bed8b6b9b4586fceaa391bfb)
-rw-r--r--.zuul.yaml6
-rw-r--r--nova/conf/workarounds.py53
-rw-r--r--nova/tests/unit/virt/libvirt/test_driver.py43
-rw-r--r--nova/virt/libvirt/driver.py23
-rw-r--r--releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml18
5 files changed, 141 insertions, 2 deletions
diff --git a/.zuul.yaml b/.zuul.yaml
index c00865e504..aa371db06c 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -191,6 +191,12 @@
# reduce the number of placement calls in steady state. Added in
# Stein.
resource_provider_association_refresh: 0
+ workarounds:
+ # This workaround is an improvement on hard reboot that cannot be turned
+ # on unconditionally. But we know that ml2/ovs sends plug time
+ # events so we can enable this in this ovs job for vnic_type
+ # normal
+ wait_for_vif_plugged_event_during_hard_reboot: normal
$NOVA_CONF:
quota:
# Added in Train.
diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py
index 8eadc0b6ec..4e64d87578 100644
--- a/nova/conf/workarounds.py
+++ b/nova/conf/workarounds.py
@@ -346,6 +346,59 @@ Related options:
* :oslo.config:option:`image_cache.subdirectory_name`
* :oslo.config:option:`update_resources_interval`
"""),
+ cfg.ListOpt('wait_for_vif_plugged_event_during_hard_reboot',
+ item_type=cfg.types.String(
+ choices=[
+ "normal",
+ "direct",
+ "macvtap",
+ "baremetal",
+ "direct-physical",
+ "virtio-forwarder",
+ "smart-nic",
+ ]),
+ default=[],
+ help="""
+The libvirt virt driver implements power on and hard reboot by tearing down
+every vif of the instance being rebooted then plug them again. By default nova
+does not wait for network-vif-plugged event from neutron before it lets the
+instance run. This can cause the instance to request the IP via DHCP before
+the neutron backend has a chance to set up the networking backend after the vif
+plug.
+
+This flag defines which vifs nova expects network-vif-plugged events from
+during hard reboot. The possible values are neutron port vnic types:
+
+* normal
+* direct
+* macvtap
+* baremetal
+* direct-physical
+* virtio-forwarder
+* smart-nic
+
+Adding a ``vnic_type`` to this configuration makes Nova wait for a
+network-vif-plugged event for each of the instance's vifs having the specific
+``vnic_type`` before unpausing the instance, similarly to how new instance
+creation works.
+
+Please note that not all neutron networking backends send plug time events for
+certain ``vnic_type``; therefore this config is empty by default.
+
+The ml2/ovs and the networking-odl backends are known to send plug time events
+for ports with ``normal`` ``vnic_type`` so it is safe to add ``normal`` to this
+config if you are using only those backends in the compute host.
+
+The neutron in-tree SRIOV backend does not reliably send network-vif-plugged
+event during plug time for ports with ``direct`` vnic_type and never sends
+that event for port with ``direct-physical`` vnic_type during plug time. For
+other ``vnic_type`` and backend pairs, please consult the developers of the
+backend.
+
+Related options:
+
+* :oslo.config:option:`DEFAULT.vif_plugging_timeout`
+"""),
]
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 3a93eaa2a6..7576a6a60d 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -16288,7 +16288,48 @@ class LibvirtConnTestCase(test.NoDBTestCase,
accel_info=accel_info)
mock_create_guest_with_network.assert_called_once_with(self.context,
dummyxml, instance, network_info, block_device_info,
- vifs_already_plugged=True)
+ vifs_already_plugged=True, external_events=[])
+
+ @mock.patch('oslo_utils.fileutils.ensure_tree', new=mock.Mock())
+ @mock.patch('nova.virt.libvirt.LibvirtDriver.get_info')
+ @mock.patch('nova.virt.libvirt.LibvirtDriver._create_guest_with_network')
+ @mock.patch('nova.virt.libvirt.LibvirtDriver._get_guest_xml')
+ @mock.patch('nova.virt.libvirt.LibvirtDriver.destroy', new=mock.Mock())
+ @mock.patch(
+ 'nova.virt.libvirt.LibvirtDriver._get_all_assigned_mediated_devices',
+ new=mock.Mock(return_value={}))
+ def test_hard_reboot_wait_for_plug(
+ self, mock_get_guest_xml, mock_create_guest_with_network, mock_get_info
+ ):
+ self.flags(
+ group="workarounds",
+ wait_for_vif_plugged_event_during_hard_reboot=["normal"])
+ self.context.auth_token = None
+ instance = objects.Instance(**self.test_instance)
+ network_info = _fake_network_info(self, num_networks=4)
+ network_info[0]["vnic_type"] = "normal"
+ network_info[1]["vnic_type"] = "direct"
+ network_info[2]["vnic_type"] = "normal"
+ network_info[3]["vnic_type"] = "direct-physical"
+ block_device_info = None
+ return_values = [hardware.InstanceInfo(state=power_state.SHUTDOWN),
+ hardware.InstanceInfo(state=power_state.RUNNING)]
+ mock_get_info.side_effect = return_values
+ mock_get_guest_xml.return_value = mock.sentinel.xml
+
+ drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+ drvr._hard_reboot(
+ self.context, instance, network_info, block_device_info)
+
+ mock_create_guest_with_network.assert_called_once_with(
+ self.context, mock.sentinel.xml, instance, network_info,
+ block_device_info,
+ vifs_already_plugged=False,
+ external_events=[
+ ('network-vif-plugged', uuids.vif1),
+ ('network-vif-plugged', uuids.vif3),
+ ]
+ )
@mock.patch('oslo_utils.fileutils.ensure_tree')
@mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index fbd033690a..2558a49f7d 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -3383,11 +3383,32 @@ class LibvirtDriver(driver.ComputeDriver):
# on which vif type we're using and we are working with a stale network
# info cache here, so won't rely on waiting for neutron plug events.
# vifs_already_plugged=True means "do not wait for neutron plug events"
+ external_events = []
+ vifs_already_plugged = True
+ event_expected_for_vnic_types = (
+ CONF.workarounds.wait_for_vif_plugged_event_during_hard_reboot)
+ if event_expected_for_vnic_types:
+ # NOTE(gibi): We unplugged every vif during destroy above and we
+ # will replug them with _create_guest_with_network. As the
+ # workaround config has some vnic_types configured we expect
+ # vif-plugged events for every vif with those vnic_types.
+ # TODO(gibi): only wait for events if we know that the networking
+ # backend sends plug time events. For that we need to finish
+ # https://bugs.launchpad.net/neutron/+bug/1821058 first in Neutron
+ # then create a driver -> plug-time event mapping in nova.
+ external_events = [
+ ('network-vif-plugged', vif['id'])
+ for vif in network_info
+ if vif['vnic_type'] in event_expected_for_vnic_types
+ ]
+ vifs_already_plugged = False
+
# NOTE(efried): The instance should already have a vtpm_secret_uuid
# registered if appropriate.
self._create_guest_with_network(
context, xml, instance, network_info, block_device_info,
- vifs_already_plugged=True)
+ vifs_already_plugged=vifs_already_plugged,
+ external_events=external_events)
self._prepare_pci_devices_for_use(
pci_manager.get_instance_pci_devs(instance, 'all'))
diff --git a/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml b/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml
new file mode 100644
index 0000000000..c3686a9978
--- /dev/null
+++ b/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml
@@ -0,0 +1,18 @@
+---
+issues:
+ - |
+ The libvirt virt driver in Nova implements power on and hard reboot by
+ destroying the domain first and unplugging the vifs then recreating the
+ domain and replugging the vifs. However nova does not wait for the
+ network-vif-plugged event before unpausing the domain. This can cause
+ the domain to start running and requesting IP via DHCP before the
+ networking backend has finished plugging the vifs. The config option
+ [workarounds]wait_for_vif_plugged_event_during_hard_reboot has been added,
+ defaulting to an empty list, that can be used to ensure that the libvirt
+ driver waits for the network-vif-plugged event for vifs with specific
+ ``vnic_type`` before it unpauses the domain during hard reboot. This should
+only be used if the deployment uses a networking backend that sends such an
+event for the given ``vif_type`` at vif plug time. The ml2/ovs and the
+networking-odl Neutron backends are known to send plug time events for ports
+ with ``normal`` ``vnic_type``. For more information see
+ https://bugs.launchpad.net/nova/+bug/1946729