115 files changed, 3550 insertions, 1100 deletions
diff --git a/.zuul.yaml b/.zuul.yaml index b4f67067e4..b5224367b2 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -56,21 +56,19 @@ bindep_profile: test py38 timeout: 3600 -# TODO(lyarwood): Remove once the new zuulv3 nova-multinode jobs are voting - job: - name: nova-live-migration - parent: nova-dsvm-multinode-base + name: nova-tox-validate-backport + parent: openstack-tox description: | - Run tempest live migration tests against both local storage and shared - storage using ceph (the environment is reconfigured for ceph after the - local storage tests are run). Also runs simple evacuate tests. - Config drive is forced on all instances. Runs with python 3. - run: playbooks/legacy/nova-live-migration/run.yaml - post-run: playbooks/legacy/nova-live-migration/post.yaml - irrelevant-files: *dsvm-irrelevant-files + Determine whether a backport is ready to be merged by checking whether it + has already been merged to master or more recent stable branches. + + Uses tox with the ``validate-backport`` environment. + vars: + tox_envlist: validate-backport - job: - name: nova-multinode-live-migration + name: nova-live-migration parent: tempest-multinode-full-py3 description: | Run tempest live migration tests against local qcow2 ephemeral storage @@ -86,6 +84,7 @@ volume_backed_live_migration: true block_migration_for_live_migration: true block_migrate_cinder_iscsi: true + post-run: playbooks/nova-live-migration/post-run.yaml # TODO(lyarwood): The following jobs need to be written as part of the # migration to zuulv3 before nova-live-migration can be removed: @@ -95,13 +94,6 @@ # description: | # Run tempest live migration tests against ceph ephemeral storage and # cinder volumes. -# -#- job: -# name: nova-multinode-evacuate -# description: | -# Verifiy the evacuation of instances with local qcow2 ephemeral disks -# from down compute hosts. -# #- job: # name: nova-multinode-evacuate-ceph # description: | @@ -199,6 +191,12 @@ # reduce the number of placement calls in steady state. Added in # Stein. resource_provider_association_refresh: 0 + workarounds: + # This wa is an improvement on hard reboot that cannot be turned + # on unconditionally. But we know that ml2/ovs sends plug time + # events so we can enable this in this ovs job for vnic_type + # normal + wait_for_vif_plugged_event_during_hard_reboot: normal $NOVA_CONF: quota: # Added in Train. @@ -272,22 +270,24 @@ - job: name: nova-grenade-multinode - parent: nova-dsvm-multinode-base + parent: grenade-multinode description: | - Multi-node grenade job which runs gate/live_migration/hooks tests under - python 3. - In other words, this tests live and cold migration and resize with - mixed-version compute services which is important for things like - rolling upgrade support. + Run a multinode grenade job and run the smoke, cold and live migration + tests with the controller upgraded and the compute on the older release. The former names for this job were "nova-grenade-live-migration" and "legacy-grenade-dsvm-neutron-multinode-live-migration". 
- run: playbooks/legacy/nova-grenade-multinode/run.yaml - post-run: playbooks/legacy/nova-grenade-multinode/post.yaml - required-projects: - - openstack/grenade - - openstack/devstack-gate - - openstack/nova irrelevant-files: *dsvm-irrelevant-files + vars: + devstack_local_conf: + test-config: + $TEMPEST_CONFIG: + compute-feature-enabled: + live_migration: true + volume_backed_live_migration: true + block_migration_for_live_migration: true + block_migrate_cinder_iscsi: true + tox_envlist: all + tempest_test_regex: ((tempest\.(api\.compute|scenario)\..*smoke.*)|(^tempest\.api\.compute\.admin\.(test_live_migration|test_migration))) - job: name: nova-multi-cell @@ -417,7 +417,6 @@ - check-requirements - integrated-gate-compute - openstack-cover-jobs - - openstack-lower-constraints-jobs - openstack-python3-victoria-jobs - periodic-stable-jobs - publish-openstack-docs-pti @@ -437,13 +436,12 @@ # so that we only run it on changes to networking and libvirt/vif # code; we don't need to run this on all changes. - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ - - nova-grenade-multinode - nova-live-migration - - nova-multinode-live-migration: - voting: false - nova-lvm - nova-multi-cell - nova-next + - nova-tox-validate-backport: + voting: false - nova-tox-functional-py38 - tempest-integrated-compute: # NOTE(gmann): Policies changes do not need to run all the @@ -464,7 +462,7 @@ - ^setup.cfg$ - ^tools/.*$ - ^tox.ini$ - - grenade: + - nova-grenade-multinode: irrelevant-files: *policies-irrelevant-files - tempest-ipv6-only: irrelevant-files: *dsvm-irrelevant-files @@ -478,11 +476,11 @@ voting: false gate: jobs: - - nova-grenade-multinode - nova-live-migration - nova-tox-functional-py38 - nova-multi-cell - nova-next + - nova-tox-validate-backport - nova-ceph-multistore: irrelevant-files: *dsvm-irrelevant-files - neutron-tempest-linuxbridge: @@ -493,7 +491,7 @@ - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ - tempest-integrated-compute: irrelevant-files: *policies-irrelevant-files - - grenade: + - nova-grenade-multinode: irrelevant-files: *policies-irrelevant-files - tempest-ipv6-only: irrelevant-files: *dsvm-irrelevant-files diff --git a/api-guide/source/port_with_resource_request.rst b/api-guide/source/port_with_resource_request.rst index 2d4cc113e2..b5bb5bc491 100644 --- a/api-guide/source/port_with_resource_request.rst +++ b/api-guide/source/port_with_resource_request.rst @@ -29,7 +29,9 @@ As of 20.0.0 (Train), nova supports cold migrating and resizing servers with neutron ports having resource requests if both the source and destination compute services are upgraded to 20.0.0 (Train) and the ``[upgrade_levels]/compute`` configuration does not prevent the computes from -using the latest RPC version. +using the latest RPC version. However cross cell resize and cross cell migrate +operations are still not supported with such ports and Nova will fall back to +same-cell resize if the server has such ports. As of 21.0.0 (Ussuri), nova supports evacuating, live migrating and unshelving servers with neutron ports having resource requests. diff --git a/devstack/nova-multi-cell-blacklist.txt b/devstack/nova-multi-cell-blacklist.txt index afda00fae1..d0a62e61aa 100644 --- a/devstack/nova-multi-cell-blacklist.txt +++ b/devstack/nova-multi-cell-blacklist.txt @@ -4,3 +4,9 @@ # Exclude tempest.scenario.test_network tests since they are slow and # only test advanced neutron features, unrelated to multi-cell testing. 
^tempest.scenario.test_network + +# Also exclude resize and migrate tests with qos ports as qos is currently +# not supported in cross cell resize case. See +# https://bugs.launchpad.net/nova/+bug/1907511 for details +test_migrate_with_qos_min_bw_allocation +test_resize_with_qos_min_bw_allocation diff --git a/doc/source/admin/configuration/cross-cell-resize.rst b/doc/source/admin/configuration/cross-cell-resize.rst index 8a82b60417..d17ee24109 100644 --- a/doc/source/admin/configuration/cross-cell-resize.rst +++ b/doc/source/admin/configuration/cross-cell-resize.rst @@ -237,7 +237,8 @@ These are known to not yet be supported in the code: * Instances with ports attached that have :doc:`bandwidth-aware </admin/ports-with-resource-requests>` resource - provider allocations. + provider allocations. Nova falls back to same-cell resize if the server has + such ports. * Rescheduling to alternative hosts within the same target cell in case the primary selected host fails the ``prep_snapshot_based_resize_at_dest`` call. diff --git a/doc/source/admin/configuration/hypervisor-xen-libvirt.rst b/doc/source/admin/configuration/hypervisor-xen-libvirt.rst index 3cb2cf5de0..3e548686c9 100644 --- a/doc/source/admin/configuration/hypervisor-xen-libvirt.rst +++ b/doc/source/admin/configuration/hypervisor-xen-libvirt.rst @@ -158,7 +158,7 @@ Use the following as a guideline for configuring Xen for use in OpenStack: $ openstack image set --property hypervisor_type=xen vm_mode=hvm IMAGE For more information on image metadata, refer to the `OpenStack Virtual - Image Guide <https://docs.openstack.org/image-guide/image-metadata.html>`__. + Image Guide <https://docs.openstack.org/image-guide/introduction.html#image-metadata>`__. #. **Libguestfs file injection**: OpenStack compute nodes can use `libguestfs <http://libguestfs.org/>`_ to inject files into an instance's image prior to diff --git a/doc/source/admin/cpu-topologies.rst b/doc/source/admin/cpu-topologies.rst index 5639abdd6a..5c2a1c9740 100644 --- a/doc/source/admin/cpu-topologies.rst +++ b/doc/source/admin/cpu-topologies.rst @@ -665,6 +665,6 @@ memory allocation turned on. The Hyper-V driver will ignore the configured instances with a NUMA topology. .. Links -.. _`Image metadata`: https://docs.openstack.org/image-guide/image-metadata.html +.. _`Image metadata`: https://docs.openstack.org/image-guide/introduction.html#image-metadata .. _`discussion`: http://lists.openstack.org/pipermail/openstack-dev/2016-March/090367.html .. _`MTTCG project`: http://wiki.qemu.org/Features/tcg-multithread diff --git a/doc/source/admin/huge-pages.rst b/doc/source/admin/huge-pages.rst index 3ce61d501b..3a77756564 100644 --- a/doc/source/admin/huge-pages.rst +++ b/doc/source/admin/huge-pages.rst @@ -239,4 +239,4 @@ guide. .. Links .. _`Linux THP guide`: https://www.kernel.org/doc/Documentation/vm/transhuge.txt .. _`Linux hugetlbfs guide`: https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt -.. _`Image metadata`: https://docs.openstack.org/image-guide/image-metadata.html +.. _`Image metadata`: https://docs.openstack.org/image-guide/introduction.html#image-metadata diff --git a/doc/source/admin/networking.rst b/doc/source/admin/networking.rst index 407a43aafe..626bb89592 100644 --- a/doc/source/admin/networking.rst +++ b/doc/source/admin/networking.rst @@ -24,6 +24,18 @@ A full guide on configuring and using SR-IOV is provided in the :neutron-doc:`OpenStack Networking service documentation <admin/config-sriov.html>` +..
note:: + + Nova only supports PCI addresses where the fields are restricted to the + following maximum value: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 + + Nova will ignore PCI devices reported by the hypervisor if the address is + outside of these ranges. NUMA Affinity ------------- diff --git a/doc/source/admin/pci-passthrough.rst b/doc/source/admin/pci-passthrough.rst index 227538edb0..5a4c001e6d 100644 --- a/doc/source/admin/pci-passthrough.rst +++ b/doc/source/admin/pci-passthrough.rst @@ -37,6 +37,18 @@ devices with potentially different capabilities. supported until the 14.0.0 Newton release, see `bug 1512800 <https://bugs.launchpad.net/nova/+bug/1512880>`_ for details. +.. note:: + + Nova only supports PCI addresses where the fields are restricted to the + following maximum value: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 + + Nova will ignore PCI devices reported by the hypervisor if the address is + outside of these ranges. Configure host (Compute) ------------------------ diff --git a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst index 012d78e93b..fb76f656af 100644 --- a/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst +++ b/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst @@ -120,10 +120,13 @@ Performing the migration (1) On all relevant compute nodes, enable the :oslo.config:option:`libvirt.live_migration_with_native_tls` - configuration attribute:: + configuration attribute and set the + :oslo.config:option:`libvirt.live_migration_scheme` + configuration attribute to tls:: [libvirt] live_migration_with_native_tls = true + live_migration_scheme = tls .. note:: Setting both @@ -131,6 +134,12 @@ Performing the migration :oslo.config:option:`libvirt.live_migration_tunnelled` at the same time is invalid (and disallowed). + .. note:: + Not setting + :oslo.config:option:`libvirt.live_migration_scheme` to ``tls`` + will result in libvirt using the unencrypted TCP connection + without displaying any error or a warning in the logs. + And restart the ``nova-compute`` service:: $ systemctl restart openstack-nova-compute diff --git a/doc/source/cli/nova-status.rst b/doc/source/cli/nova-status.rst index 9eae13b43d..96ae333c1c 100644 --- a/doc/source/cli/nova-status.rst +++ b/doc/source/cli/nova-status.rst @@ -142,6 +142,8 @@ Upgrade **22.0.0 (Victoria)** * Checks for the policy files is not JSON-formatted. + * Checks for computes older than the previous major release. This check was + backported from 23.0.0 (Wallaby). See Also ======== diff --git a/doc/source/contributor/ptl-guide.rst b/doc/source/contributor/ptl-guide.rst index 3e4b2ab18c..6683ef5e94 100644 --- a/doc/source/contributor/ptl-guide.rst +++ b/doc/source/contributor/ptl-guide.rst @@ -257,6 +257,9 @@ Immediately after RC * Example: https://review.opendev.org/543580 + * Bump the oldest supported compute service version + * https://review.opendev.org/#/c/738482/ + * Create the launchpad series for the next cycle * Set the development focus of the project to the new cycle series diff --git a/doc/source/install/verify.rst b/doc/source/install/verify.rst index 440abc7e71..9c67e4574e 100644 --- a/doc/source/install/verify.rst +++ b/doc/source/install/verify.rst @@ -131,3 +131,7 @@ Verify operation of the Compute service. 
| Result: Success | | Details: None | +--------------------------------------------------------------------+ + | Check: Older than N-1 computes | + | Result: Success | + | Details: None | + +--------------------------------------------------------------------+ diff --git a/gate/live_migration/hooks/ceph.sh b/gate/live_migration/hooks/ceph.sh deleted file mode 100755 index 3d596ff0b3..0000000000 --- a/gate/live_migration/hooks/ceph.sh +++ /dev/null @@ -1,208 +0,0 @@ -#!/bin/bash - -function prepare_ceph { - git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph - source /tmp/devstack-plugin-ceph/devstack/settings - source /tmp/devstack-plugin-ceph/devstack/lib/ceph - install_ceph - configure_ceph - #install ceph-common package and additional python3 ceph libraries on compute nodes - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m raw -a "executable=/bin/bash - USE_PYTHON3=${USE_PYTHON3:-True} - source $BASE/new/devstack/functions - source $BASE/new/devstack/functions-common - git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph - source /tmp/devstack-plugin-ceph/devstack/lib/ceph - install_ceph_remote - " - - #copy ceph admin keyring to compute nodes - sudo cp /etc/ceph/ceph.client.admin.keyring /tmp/ceph.client.admin.keyring - sudo chown ${STACK_USER}:${STACK_USER} /tmp/ceph.client.admin.keyring - sudo chmod 644 /tmp/ceph.client.admin.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.admin.keyring dest=/etc/ceph/ceph.client.admin.keyring owner=ceph group=ceph" - sudo rm -f /tmp/ceph.client.admin.keyring - #copy ceph.conf to compute nodes - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/etc/ceph/ceph.conf dest=/etc/ceph/ceph.conf owner=root group=root" - - start_ceph -} - -function _ceph_configure_glance { - GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf} - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${GLANCE_CEPH_POOL} ${GLANCE_CEPH_POOL_PG} ${GLANCE_CEPH_POOL_PGP} - sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${GLANCE_CEPH_USER} \ - mon "allow r" \ - osd "allow class-read object_prefix rbd_children, allow rwx pool=${GLANCE_CEPH_POOL}" | \ - sudo tee ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring - sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=DEFAULT option=show_image_direct_url value=True" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=default_store value=rbd" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=stores value='file, http, rbd'" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_ceph_conf value=$CEPH_CONF_FILE" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_user value=$GLANCE_CEPH_USER" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_pool value=$GLANCE_CEPH_POOL" - - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c 
${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} crush_ruleset ${RULE_ID} - fi - - #copy glance keyring to compute only node - sudo cp /etc/ceph/ceph.client.glance.keyring /tmp/ceph.client.glance.keyring - sudo chown $STACK_USER:$STACK_USER /tmp/ceph.client.glance.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.glance.keyring dest=/etc/ceph/ceph.client.glance.keyring" - sudo rm -f /tmp/ceph.client.glance.keyring -} - -function configure_and_start_glance { - _ceph_configure_glance - echo 'check processes before glance-api stop' - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api" - - # restart glance - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@g-api" - - echo 'check processes after glance-api stop' - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api" -} - -function _ceph_configure_nova { - #setup ceph for nova, we don't reuse configure_ceph_nova - as we need to emulate case where cinder is not configured for ceph - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${NOVA_CEPH_POOL} ${NOVA_CEPH_POOL_PG} ${NOVA_CEPH_POOL_PGP} - NOVA_CONF=${NOVA_CPU_CONF:-/etc/nova/nova.conf} - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_user value=${CINDER_CEPH_USER}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_secret_uuid value=${CINDER_CEPH_UUID}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_key value=false" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_partition value=-2" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=disk_cachemodes value='network=writeback'" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_type value=rbd" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_pool value=${NOVA_CEPH_POOL}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_ceph_conf value=${CEPH_CONF_FILE}" - - sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${CINDER_CEPH_USER} \ - mon "allow r" \ - osd "allow class-read object_prefix rbd_children, allow rwx pool=${CINDER_CEPH_POOL}, allow rwx pool=${NOVA_CEPH_POOL},allow rwx pool=${GLANCE_CEPH_POOL}" | \ - sudo tee ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring > /dev/null - sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring - - #copy cinder keyring to compute only node - sudo cp /etc/ceph/ceph.client.cinder.keyring /tmp/ceph.client.cinder.keyring - sudo chown stack:stack /tmp/ceph.client.cinder.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.cinder.keyring dest=/etc/ceph/ceph.client.cinder.keyring" - sudo rm -f /tmp/ceph.client.cinder.keyring - - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} crush_ruleset ${RULE_ID} - fi -} - -function 
_wait_for_nova_compute_service_state { - source $BASE/new/devstack/openrc admin admin - local status=$1 - local attempt=1 - local max_attempts=24 - local attempt_sleep=5 - local computes_count=$(openstack compute service list | grep -c nova-compute) - local computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l) - - echo "Waiting for $computes_count computes to report as $status" - while [ "$computes_ready" -ne "$computes_count" ]; do - if [ "$attempt" -eq "$max_attempts" ]; then - echo "Failed waiting for computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${max_attempts} attempts" - exit 4 - fi - echo "Waiting ${attempt_sleep} seconds for ${computes_count} computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${attempt}/${max_attempts} attempts" - sleep $attempt_sleep - attempt=$((attempt+1)) - computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l) - done - echo "All computes are now reporting as ${status} after ${attempt} attempts" -} - -function configure_and_start_nova { - - echo "Checking all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute" - - # stop nova-compute - echo "Stopping all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl stop devstack@n-cpu" - - # Wait for the service to be marked as down - _wait_for_nova_compute_service_state "down" - - _ceph_configure_nova - - #import secret to libvirt - _populate_libvirt_secret - - # start nova-compute - echo "Starting all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl start devstack@n-cpu" - - echo "Checking all n-cpu services" - # test that they are all running again - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute" - - # Wait for the service to be marked as up - _wait_for_nova_compute_service_state "up" -} - -function _ceph_configure_cinder { - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${CINDER_CEPH_POOL} ${CINDER_CEPH_POOL_PG} ${CINDER_CEPH_POOL_PGP} - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} crush_ruleset ${RULE_ID} - fi - - CINDER_CONF=${CINDER_CONF:-/etc/cinder/cinder.conf} - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_backend_name value=ceph" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_driver value=cinder.volume.drivers.rbd.RBDDriver" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_ceph_conf value=$CEPH_CONF_FILE" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_pool value=$CINDER_CEPH_POOL" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_user value=$CINDER_CEPH_USER" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_uuid value=$CINDER_CEPH_UUID" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_flatten_volume_from_snapshot value=False" - $ANSIBLE primary 
--become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_max_clone_depth value=5" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT option=default_volume_type value=ceph" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT option=enabled_backends value=ceph" - -} - -function configure_and_start_cinder { - _ceph_configure_cinder - - # restart cinder - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@c-vol" - - source $BASE/new/devstack/openrc - - export OS_USERNAME=admin - export OS_PROJECT_NAME=admin - lvm_type=$(cinder type-list | awk -F "|" 'NR==4{ print $2}') - cinder type-delete $lvm_type - openstack volume type create --os-volume-api-version 1 --property volume_backend_name="ceph" ceph -} - -function _populate_libvirt_secret { - cat > /tmp/secret.xml <<EOF -<secret ephemeral='no' private='no'> - <uuid>${CINDER_CEPH_UUID}</uuid> - <usage type='ceph'> - <name>client.${CINDER_CEPH_USER} secret</name> - </usage> -</secret> -EOF - - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/secret.xml dest=/tmp/secret.xml" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-define --file /tmp/secret.xml" - local secret=$(sudo ceph -c ${CEPH_CONF_FILE} auth get-key client.${CINDER_CEPH_USER}) - # TODO(tdurakov): remove this escaping as https://github.com/ansible/ansible/issues/13862 fixed - secret=${secret//=/'\='} - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-set-value --secret ${CINDER_CEPH_UUID} --base64 $secret" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m file -a "path=/tmp/secret.xml state=absent" - -} diff --git a/gate/live_migration/hooks/nfs.sh b/gate/live_migration/hooks/nfs.sh deleted file mode 100755 index acadb36d6c..0000000000 --- a/gate/live_migration/hooks/nfs.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -function nfs_setup { - if uses_debs; then - module=apt - elif is_fedora; then - module=yum - fi - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m $module \ - -a "name=nfs-common state=present" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m $module \ - -a "name=nfs-kernel-server state=present" - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=/etc/idmapd.conf section=Mapping option=Nobody-User value=nova" - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=/etc/idmapd.conf section=Mapping option=Nobody-Group value=nova" - - for SUBNODE in $SUBNODES ; do - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m lineinfile -a "dest=/etc/exports line='/opt/stack/data/nova/instances $SUBNODE(rw,fsid=0,insecure,no_subtree_check,async,no_root_squash)'" - done - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "exportfs -a" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=nfs-kernel-server state=restarted" - GetDistro - if [[ ! 
${DISTRO} =~ (xenial) ]]; then - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=idmapd state=restarted" - fi - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p tcp --dport 111 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p udp --dport 111 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p tcp --dport 2049 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p udp --dport 2049 -j ACCEPT" - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "mount -t nfs4 -o proto\=tcp,port\=2049 $primary_node:/ /opt/stack/data/nova/instances/" -} - -function nfs_configure_tempest { - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$BASE/new/tempest/etc/tempest.conf section=compute-feature-enabled option=block_migration_for_live_migration value=False" -} - -function nfs_verify_setup { - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m file -a "path=/opt/stack/data/nova/instances/test_file state=touch" - if [ ! -e '/opt/stack/data/nova/instances/test_file' ]; then - die $LINENO "NFS configuration failure" - fi -} - -function nfs_teardown { - #teardown nfs shared storage - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "umount -t nfs4 /opt/stack/data/nova/instances/" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=nfs-kernel-server state=stopped" -}
\ No newline at end of file diff --git a/gate/live_migration/hooks/run_tests.sh b/gate/live_migration/hooks/run_tests.sh deleted file mode 100755 index 331f2fa204..0000000000 --- a/gate/live_migration/hooks/run_tests.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -# Live migration dedicated ci job will be responsible for testing different -# environments based on underlying storage, used for ephemerals. -# This hook allows to inject logic of environment reconfiguration in ci job. -# Base scenario for this would be: -# -# 1. test with all local storage (use default for volumes) -# 2. test with NFS for root + ephemeral disks -# 3. test with Ceph for root + ephemeral disks -# 4. test with Ceph for volumes and root + ephemeral disk - -set -xe -cd $BASE/new/tempest - -source $BASE/new/devstack/functions -source $BASE/new/devstack/functions-common -source $BASE/new/devstack/lib/nova -source $WORKSPACE/devstack-gate/functions.sh -source $BASE/new/nova/gate/live_migration/hooks/utils.sh -source $BASE/new/nova/gate/live_migration/hooks/nfs.sh -source $BASE/new/nova/gate/live_migration/hooks/ceph.sh -primary_node=$(cat /etc/nodepool/primary_node_private) -SUBNODES=$(cat /etc/nodepool/sub_nodes_private) -SERVICE_HOST=$primary_node -STACK_USER=${STACK_USER:-stack} - -echo '1. test with all local storage (use default for volumes)' -echo 'NOTE: test_volume_backed_live_migration is skipped due to https://bugs.launchpad.net/nova/+bug/1524898' -run_tempest "block migration test" "^.*test_live_migration(?!.*(test_volume_backed_live_migration))" - -# TODO(mriedem): Run $BASE/new/nova/gate/test_evacuate.sh for local storage - -#all tests bellow this line use shared storage, need to update tempest.conf -echo 'disabling block_migration in tempest' -$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$BASE/new/tempest/etc/tempest.conf section=compute-feature-enabled option=block_migration_for_live_migration value=False" - -echo '2. NFS testing is skipped due to setup failures with Ubuntu 16.04' -#echo '2. test with NFS for root + ephemeral disks' - -#nfs_setup -#nfs_configure_tempest -#nfs_verify_setup -#run_tempest "NFS shared storage test" "live_migration" -#nfs_teardown - -# The nova-grenade-multinode job also runs resize and cold migration tests -# so we check for a grenade-only variable. -if [[ -n "$GRENADE_NEW_BRANCH" ]]; then - echo '3. test cold migration and resize' - run_tempest "cold migration and resize test" "test_resize_server|test_cold_migration|test_revert_cold_migration" -else - echo '3. cold migration and resize is skipped for non-grenade jobs' -fi - -echo '4. test with Ceph for root + ephemeral disks' -# Discover and set variables for the OS version so the devstack-plugin-ceph -# scripts can find the correct repository to install the ceph packages. -GetOSVersion -USE_PYTHON3=${USE_PYTHON3:-True} -prepare_ceph -GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf} -configure_and_start_glance - -configure_and_start_nova -run_tempest "Ceph nova&glance test" "^.*test_live_migration(?!.*(test_volume_backed_live_migration))" - -set +e -#echo '5. 
test with Ceph for volumes and root + ephemeral disk' - -#configure_and_start_cinder -#run_tempest "Ceph nova&glance&cinder test" "live_migration" diff --git a/gate/live_migration/hooks/utils.sh b/gate/live_migration/hooks/utils.sh deleted file mode 100755 index 9f98ca2e25..0000000000 --- a/gate/live_migration/hooks/utils.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -function run_tempest { - local message=$1 - local tempest_regex=$2 - sudo -H -u tempest tox -eall -- $tempest_regex --concurrency=$TEMPEST_CONCURRENCY - exitcode=$? - if [[ $exitcode -ne 0 ]]; then - die $LINENO "$message failure" - fi -} diff --git a/lower-constraints.txt b/lower-constraints.txt deleted file mode 100644 index 04af416aa6..0000000000 --- a/lower-constraints.txt +++ /dev/null @@ -1,165 +0,0 @@ -alembic==0.9.8 -amqp==2.5.0 -appdirs==1.4.3 -asn1crypto==0.24.0 -attrs==17.4.0 -automaton==1.14.0 -bandit==1.1.0 -cachetools==2.0.1 -castellan==0.16.0 -cffi==1.14.0 -cliff==2.11.0 -cmd2==0.8.1 -colorama==0.3.9 -coverage==4.0 -cryptography==2.7 -cursive==0.2.1 -dataclasses==0.7 -ddt==1.2.1 -debtcollector==1.19.0 -decorator==4.1.0 -deprecation==2.0 -dogpile.cache==0.6.5 -enum-compat==0.0.2 -eventlet==0.22.0 -extras==1.0.0 -fasteners==0.14.1 -fixtures==3.0.0 -future==0.16.0 -futurist==1.8.0 -gabbi==1.35.0 -gitdb2==2.0.3 -GitPython==2.1.8 -greenlet==0.4.15 -idna==2.6 -iso8601==0.1.11 -Jinja2==2.10 -jmespath==0.9.3 -jsonpatch==1.21 -jsonpath-rw==1.4.0 -jsonpath-rw-ext==1.1.3 -jsonpointer==2.0 -jsonschema==3.2.0 -keystoneauth1==3.16.0 -keystonemiddleware==4.20.0 -kombu==4.6.1 -linecache2==1.0.0 -lxml==4.5.0 -Mako==1.0.7 -MarkupSafe==1.1.1 -microversion-parse==0.2.1 -mock==3.0.0 -msgpack==0.5.6 -msgpack-python==0.5.6 -munch==2.2.0 -mypy==0.761 -netaddr==0.7.18 -netifaces==0.10.4 -networkx==2.1.0 -numpy==1.19.0 -openstacksdk==0.35.0 -os-brick==3.1.0 -os-client-config==1.29.0 -os-resource-classes==0.4.0 -os-service-types==1.7.0 -os-traits==2.4.0 -os-vif==1.14.0 -os-win==4.2.0 -os-xenapi==0.3.4 -osc-lib==1.10.0 -oslo.cache==1.26.0 -oslo.concurrency==3.29.0 -oslo.config==6.8.0 -oslo.context==2.22.0 -oslo.db==4.44.0 -oslo.i18n==3.15.3 -oslo.log==3.36.0 -oslo.messaging==10.3.0 -oslo.middleware==3.31.0 -oslo.policy==3.4.0 -oslo.privsep==1.33.2 -oslo.reports==1.18.0 -oslo.rootwrap==5.8.0 -oslo.serialization==2.21.1 -oslo.service==1.40.1 -oslo.upgradecheck==0.1.1 -oslo.utils==4.5.0 -oslo.versionedobjects==1.35.0 -oslo.vmware==2.17.0 -oslotest==3.8.0 -osprofiler==1.4.0 -ovs==2.10.0 -ovsdbapp==0.15.0 -packaging==17.1 -paramiko==2.7.1 -Paste==2.0.2 -PasteDeploy==1.5.0 -pbr==2.0.0 -pluggy==0.6.0 -ply==3.11 -prettytable==0.7.1 -psutil==3.2.2 -psycopg2==2.8 -py==1.5.2 -pyasn1==0.4.2 -pyasn1-modules==0.2.1 -pycadf==2.7.0 -pycparser==2.18 -pyinotify==0.9.6 -pyroute2==0.5.4 -PyJWT==1.7.0 -PyMySQL==0.8.0 -pyOpenSSL==17.5.0 -pyparsing==2.2.0 -pyperclip==1.6.0 -pypowervm==1.1.15 -pytest==3.4.2 -python-barbicanclient==4.5.2 -python-cinderclient==3.3.0 -python-dateutil==2.5.3 -python-editor==1.0.3 -python-glanceclient==2.8.0 -python-ironicclient==3.0.0 -python-keystoneclient==3.15.0 -python-mimeparse==1.6.0 -python-neutronclient==6.7.0 -python-subunit==1.4.0 -pytz==2018.3 -PyYAML==3.13 -repoze.lru==0.7 -requests==2.23.0 -requests-mock==1.2.0 -requestsexceptions==1.4.0 -retrying==1.3.3 -rfc3986==1.2.0 -Routes==2.3.1 -simplejson==3.13.2 -six==1.11.0 -smmap2==2.0.3 -sortedcontainers==2.1.0 -SQLAlchemy==1.2.19 -sqlalchemy-migrate==0.13.0 -sqlparse==0.2.4 -statsd==3.2.2 -stestr==2.0.0 -stevedore==1.20.0 -suds-jurko==0.6 -taskflow==3.8.0 -Tempita==0.5.2 
-tenacity==6.0.0 -testrepository==0.0.20 -testresources==2.0.0 -testscenarios==0.4 -testtools==2.2.0 -tooz==1.58.0 -traceback2==1.4.0 -unittest2==1.1.0 -urllib3==1.22 -vine==1.1.4 -voluptuous==0.11.1 -warlock==1.3.1 -WebOb==1.8.2 -websockify==0.9.0 -wrapt==1.10.11 -wsgi-intercept==1.7.0 -zVMCloudConnector==1.3.0 diff --git a/nova/api/metadata/base.py b/nova/api/metadata/base.py index a069301b29..164d0ffefc 100644 --- a/nova/api/metadata/base.py +++ b/nova/api/metadata/base.py @@ -124,8 +124,13 @@ class InstanceMetadata(object): if not content: content = [] + # NOTE(gibi): this is not a cell targeted context even if we are called + # in a situation when the instance is in a different cell than the + # metadata service itself. ctxt = context.get_admin_context() + self.mappings = _format_instance_mapping(instance) + # NOTE(danms): Sanitize the instance to limit the amount of stuff # inside that may not pickle well (i.e. context). We also touch # some of the things we'll lazy load later to make sure we keep their @@ -147,8 +152,6 @@ class InstanceMetadata(object): self.security_groups = security_group_api.get_instance_security_groups( ctxt, instance) - self.mappings = _format_instance_mapping(ctxt, instance) - if instance.user_data is not None: self.userdata_raw = base64.decode_as_bytes(instance.user_data) else: @@ -696,9 +699,8 @@ def get_metadata_by_instance_id(instance_id, address, ctxt=None): return InstanceMetadata(instance, address) -def _format_instance_mapping(ctxt, instance): - bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( - ctxt, instance.uuid) +def _format_instance_mapping(instance): + bdms = instance.get_bdms() return block_device.instance_block_mapping(instance, bdms) diff --git a/nova/api/openstack/compute/admin_actions.py b/nova/api/openstack/compute/admin_actions.py index 6de6956bcf..f6df60e78c 100644 --- a/nova/api/openstack/compute/admin_actions.py +++ b/nova/api/openstack/compute/admin_actions.py @@ -19,8 +19,10 @@ from nova.api.openstack.compute.schemas import reset_server_state from nova.api.openstack import wsgi from nova.api import validation from nova.compute import api as compute +from nova.compute import instance_actions from nova.compute import vm_states from nova import exception +from nova import objects from nova.policies import admin_actions as aa_policies # States usable in resetState action @@ -73,9 +75,14 @@ class AdminActionsController(wsgi.Controller): context.can(aa_policies.POLICY_ROOT % 'reset_state', target={'project_id': instance.project_id}) + # Log os-resetState as an instance action + instance_action = objects.InstanceAction.action_start( + context, instance.uuid, instance_actions.RESET_STATE) + # Identify the desired state from the body state = state_map[body["os-resetState"]["state"]] instance.vm_state = state instance.task_state = None instance.save(admin_state_reset=True) + instance_action.finish() diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py index b462b1a9d9..be35f62ee3 100644 --- a/nova/api/openstack/compute/services.py +++ b/nova/api/openstack/compute/services.py @@ -267,10 +267,25 @@ class ServiceController(wsgi.Controller): # service delete since we could orphan resource providers and # break the ability to do things like confirm/revert instances # in VERIFY_RESIZE status. 
- compute_nodes = objects.ComputeNodeList.get_all_by_host( - context, service.host) - self._assert_no_in_progress_migrations( - context, id, compute_nodes) + compute_nodes = [] + try: + compute_nodes = objects.ComputeNodeList.get_all_by_host( + context, service.host) + self._assert_no_in_progress_migrations( + context, id, compute_nodes) + except exception.ComputeHostNotFound: + # NOTE(artom) Consider the following situation: + # - Using the Ironic virt driver + # - Replacing (so removing and re-adding) all baremetal + # nodes associated with a single nova-compute service + # The update resources periodic will have destroyed the + # compute node records because they're no longer being + # reported by the virt driver. If we then attempt to + # manually delete the compute service record, + # get_all_host() above will raise, as there are no longer + # any compute node records for the host. Catch it here and + # continue to allow compute service deletion. + pass aggrs = self.aggregate_api.get_aggregates_by_host(context, service.host) diff --git a/nova/api/openstack/wsgi_app.py b/nova/api/openstack/wsgi_app.py index c789da51bb..9039b888cb 100644 --- a/nova/api/openstack/wsgi_app.py +++ b/nova/api/openstack/wsgi_app.py @@ -23,11 +23,14 @@ from nova import context from nova import exception from nova import objects from nova import service +from nova import utils CONF = cfg.CONF CONFIG_FILES = ['api-paste.ini', 'nova.conf'] +LOG = logging.getLogger(__name__) + objects.register_all() @@ -40,6 +43,11 @@ def _get_config_files(env=None): def _setup_service(host, name): + try: + utils.raise_if_old_compute() + except exception.TooOldComputeService as e: + logging.getLogger(__name__).warning(str(e)) + binary = name if name.startswith('nova-') else "nova-%s" % name ctxt = context.get_admin_context() @@ -71,8 +79,12 @@ def error_application(exc, name): return application -def init_application(name): - conf_files = _get_config_files() +@utils.run_once('Global data already initialized, not re-initializing.', + LOG.info) +def init_global_data(conf_files): + # NOTE(melwitt): parse_args initializes logging and calls global rpc.init() + # and db_api.configure(). The db_api.configure() call does not initiate any + # connection to the database. config.parse_args([], default_config_files=conf_files) logging.setup(CONF, "nova") @@ -87,11 +99,25 @@ def init_application(name): logging.getLogger(__name__), logging.DEBUG) + +def init_application(name): + conf_files = _get_config_files() + + # NOTE(melwitt): The init_application method can be called multiple times + # within a single python interpreter instance if any exception is raised + # during it (example: DBConnectionError while setting up the service) and + # apache/mod_wsgi reloads the init_application script. So, we initialize + # global data separately and decorate the method to run only once in a + # python interpreter instance. + init_global_data(conf_files) + try: _setup_service(CONF.host, name) except exception.ServiceTooOld as exc: return error_application(exc, name) + # This global init is safe because if we got here, we already successfully + # set up the service and setting up the profile cannot fail. 
service.setup_profiler(name, CONF.host) conf = conf_files[0] diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 92c48c71e6..e704336d51 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -2668,7 +2668,7 @@ class PlacementCommands(object): """ url = '/resource_providers' if 'uuid' in kwargs: - url += '&uuid=%s' % kwargs['uuid'] + url += '?uuid=%s' % kwargs['uuid'] resp = placement.get(url, global_request_id=context.global_id, version='1.14') diff --git a/nova/cmd/status.py b/nova/cmd/status.py index fd3c307dca..8036d550fa 100644 --- a/nova/cmd/status.py +++ b/nova/cmd/status.py @@ -427,6 +427,16 @@ class UpgradeCommands(upgradecheck.UpgradeCommands): status = upgradecheck.Result(upgradecheck.Code.FAILURE, msg) return status + def _check_old_computes(self): + # warn if there are computes in the system older than the previous + # major release + try: + utils.raise_if_old_compute() + except exception.TooOldComputeService as e: + return upgradecheck.Result(upgradecheck.Code.WARNING, str(e)) + + return upgradecheck.Result(upgradecheck.Code.SUCCESS) + # The format of the check functions is to return an upgradecheck.Result # object with the appropriate upgradecheck.Code and details set. If the # check hits warnings or failures then those should be stored in the @@ -447,6 +457,8 @@ class UpgradeCommands(upgradecheck.UpgradeCommands): (_('Policy Scope-based Defaults'), _check_policy), # Added in Victoria (_('Policy File JSON to YAML Migration'), _check_policy_json), + # Backported from Wallaby + (_('Older than N-1 computes'), _check_old_computes) ) diff --git a/nova/compute/api.py b/nova/compute/api.py index 59d2186c2c..97868490c3 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -2179,22 +2179,22 @@ class API(base.Base): return True return False - def _local_delete_cleanup(self, context, instance): + def _local_delete_cleanup(self, context, instance_uuid): # NOTE(aarents) Ensure instance allocation is cleared and instance # mapping queued as deleted before _delete() return try: self.placementclient.delete_allocation_for_instance( - context, instance.uuid) + context, instance_uuid) except exception.AllocationDeleteFailed: LOG.info("Allocation delete failed during local delete cleanup.", - instance=instance) + instance_uuid=instance_uuid) try: - self._update_queued_for_deletion(context, instance, True) + self._update_queued_for_deletion(context, instance_uuid, True) except exception.InstanceMappingNotFound: LOG.info("Instance Mapping does not exist while attempting " "local delete cleanup.", - instance=instance) + instance_uuid=instance_uuid) def _attempt_delete_of_buildrequest(self, context, instance): # If there is a BuildRequest then the instance may not have been @@ -2231,7 +2231,7 @@ class API(base.Base): if not instance.host and not may_have_ports_or_volumes: try: if self._delete_while_booting(context, instance): - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance.uuid) return # If instance.host was not set it's possible that the Instance # object here was pulled from a BuildRequest object and is not @@ -2240,6 +2240,11 @@ class API(base.Base): # properly. A lookup is attempted which will either return a # full Instance or None if not found. If not found then it's # acceptable to skip the rest of the delete processing. + + # Save a copy of the instance UUID early, in case + # _lookup_instance returns instance = None, to pass to + # _local_delete_cleanup if needed. 
+ instance_uuid = instance.uuid cell, instance = self._lookup_instance(context, instance.uuid) if cell and instance: try: @@ -2250,11 +2255,11 @@ class API(base.Base): except exception.InstanceNotFound: pass # The instance was deleted or is already gone. - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance.uuid) return if not instance: # Instance is already deleted. - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance_uuid) return except exception.ObjectActionError: # NOTE(melwitt): This means the instance.host changed @@ -2267,7 +2272,7 @@ class API(base.Base): cell, instance = self._lookup_instance(context, instance.uuid) if not instance: # Instance is already deleted - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance_uuid) return bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( @@ -2311,7 +2316,7 @@ class API(base.Base): 'field, its vm_state is %(state)s.', {'state': instance.vm_state}, instance=instance) - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance.uuid) return except exception.ObjectActionError as ex: # The instance's host likely changed under us as @@ -2496,7 +2501,7 @@ class API(base.Base): instance.destroy() @staticmethod - def _update_queued_for_deletion(context, instance, qfd): + def _update_queued_for_deletion(context, instance_uuid, qfd): # NOTE(tssurya): We query the instance_mapping record of this instance # and update the queued_for_delete flag to True (or False according to # the state of the instance). This just means that the instance is @@ -2505,7 +2510,7 @@ class API(base.Base): # value could be stale which is fine, considering its use is only # during down cell (desperate) situation. im = objects.InstanceMapping.get_by_instance_uuid(context, - instance.uuid) + instance_uuid) im.queued_for_delete = qfd im.save() @@ -2517,7 +2522,7 @@ class API(base.Base): instance.save() else: self.compute_rpcapi.terminate_instance(context, instance, bdms) - self._update_queued_for_deletion(context, instance, True) + self._update_queued_for_deletion(context, instance.uuid, True) def _do_soft_delete(self, context, instance, bdms, local=False): if local: @@ -2527,7 +2532,7 @@ class API(base.Base): instance.save() else: self.compute_rpcapi.soft_delete_instance(context, instance) - self._update_queued_for_deletion(context, instance, True) + self._update_queued_for_deletion(context, instance.uuid, True) # NOTE(maoy): we allow delete to be called no matter what vm_state says. @check_instance_lock @@ -2580,7 +2585,7 @@ class API(base.Base): instance.task_state = None instance.deleted_at = None instance.save(expected_task_state=[None]) - self._update_queued_for_deletion(context, instance, False) + self._update_queued_for_deletion(context, instance.uuid, False) @check_instance_lock @check_instance_state(task_state=None, @@ -3870,8 +3875,7 @@ class API(base.Base): migration, migration.source_compute) - @staticmethod - def _allow_cross_cell_resize(context, instance): + def _allow_cross_cell_resize(self, context, instance): """Determine if the request can perform a cross-cell resize on this instance. 
@@ -3901,7 +3905,17 @@ class API(base.Base): 'version in the deployment %s is less than %s so ' 'cross-cell resize is not allowed at this time.', min_compute_version, MIN_COMPUTE_CROSS_CELL_RESIZE) - allowed = False + return False + + if self.network_api.get_requested_resource_for_instance( + context, instance.uuid): + LOG.info( + 'Request is allowed by policy to perform cross-cell ' + 'resize but the instance has ports with resource request ' + 'and cross-cell resize is not supported with such ports.', + instance=instance) + return False + return allowed @staticmethod diff --git a/nova/compute/instance_actions.py b/nova/compute/instance_actions.py index e6e51c95e4..1089975049 100644 --- a/nova/compute/instance_actions.py +++ b/nova/compute/instance_actions.py @@ -70,3 +70,4 @@ LOCK = 'lock' UNLOCK = 'unlock' BACKUP = 'createBackup' CREATE_IMAGE = 'createImage' +RESET_STATE = 'resetState' diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 20f5f76020..281f2733aa 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1411,6 +1411,13 @@ class ComputeManager(manager.Manager): eventlet.semaphore.BoundedSemaphore( CONF.compute.max_concurrent_disk_ops) + if CONF.compute.max_disk_devices_to_attach == 0: + msg = _('[compute]max_disk_devices_to_attach has been set to 0, ' + 'which will prevent instances from being able to boot. ' + 'Set -1 for unlimited or set >= 1 to limit the maximum ' + 'number of disk devices.') + raise exception.InvalidConfiguration(msg) + self.driver.init_host(host=self.host) context = nova.context.get_admin_context() instances = objects.InstanceList.get_by_host( @@ -1639,7 +1646,11 @@ class ComputeManager(manager.Manager): return [_decode(f) for f in injected_files] def _validate_instance_group_policy(self, context, instance, - scheduler_hints): + scheduler_hints=None): + + if CONF.workarounds.disable_group_policy_check_upcall: + return + # NOTE(russellb) Instance group policy is enforced by the scheduler. # However, there is a race condition with the enforcement of # the policy. Since more than one instance may be scheduled at the @@ -1648,29 +1659,63 @@ class ComputeManager(manager.Manager): # multiple instances with an affinity policy could end up on different # hosts. This is a validation step to make sure that starting the # instance here doesn't violate the policy. - group_hint = scheduler_hints.get('group') - if not group_hint: - return - - # The RequestSpec stores scheduler_hints as key=list pairs so we need - # to check the type on the value and pull the single entry out. The - # API request schema validates that the 'group' hint is a single value. - if isinstance(group_hint, list): - group_hint = group_hint[0] + if scheduler_hints is not None: + # only go through here if scheduler_hints is provided, even if it + # is empty. + group_hint = scheduler_hints.get('group') + if not group_hint: + return + else: + # The RequestSpec stores scheduler_hints as key=list pairs so + # we need to check the type on the value and pull the single + # entry out. The API request schema validates that + # the 'group' hint is a single value. 
+ if isinstance(group_hint, list): + group_hint = group_hint[0] + + group = objects.InstanceGroup.get_by_hint(context, group_hint) + else: + # TODO(ganso): a call to DB can be saved by adding request_spec + # to rpcapi payload of live_migration, pre_live_migration and + # check_can_live_migrate_destination + try: + group = objects.InstanceGroup.get_by_instance_uuid( + context, instance.uuid) + except exception.InstanceGroupNotFound: + return - @utils.synchronized(group_hint) - def _do_validation(context, instance, group_hint): - group = objects.InstanceGroup.get_by_hint(context, group_hint) + @utils.synchronized(group['uuid']) + def _do_validation(context, instance, group): if group.policy and 'anti-affinity' == group.policy: + + # instances on host instances_uuids = objects.InstanceList.get_uuids_by_host( context, self.host) ins_on_host = set(instances_uuids) + + # instance param is just for logging, the nodename obtained is + # not actually related to the instance at all + nodename = self._get_nodename(instance) + + # instances being migrated to host + migrations = ( + objects.MigrationList.get_in_progress_by_host_and_node( + context, self.host, nodename)) + migration_vm_uuids = set([mig['instance_uuid'] + for mig in migrations]) + + total_instances = migration_vm_uuids | ins_on_host + + # refresh group to get updated members within locked block + group = objects.InstanceGroup.get_by_uuid(context, + group['uuid']) members = set(group.members) # Determine the set of instance group members on this host # which are not the instance in question. This is used to # determine how many other members from the same anti-affinity # group can be on this host. - members_on_host = ins_on_host & members - set([instance.uuid]) + members_on_host = (total_instances & members - + set([instance.uuid])) rules = group.rules if rules and 'max_server_per_host' in rules: max_server = rules['max_server_per_host'] @@ -1682,6 +1727,12 @@ class ComputeManager(manager.Manager): raise exception.RescheduledException( instance_uuid=instance.uuid, reason=msg) + + # NOTE(ganso): The check for affinity below does not work and it + # can easily be violated because the lock happens in different + # compute hosts. + # The only fix seems to be a DB lock to perform the check whenever + # setting the host field to an instance. elif group.policy and 'affinity' == group.policy: group_hosts = group.get_hosts(exclude=[instance.uuid]) if group_hosts and self.host not in group_hosts: @@ -1690,8 +1741,7 @@ class ComputeManager(manager.Manager): instance_uuid=instance.uuid, reason=msg) - if not CONF.workarounds.disable_group_policy_check_upcall: - _do_validation(context, instance, group_hint) + _do_validation(context, instance, group) def _log_original_error(self, exc_info, instance_uuid): LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid, @@ -2230,8 +2280,6 @@ class ComputeManager(manager.Manager): instance=instance) self._cleanup_allocated_networks(context, instance, requested_networks) - self._cleanup_volumes(context, instance, - block_device_mapping, raise_exc=False) compute_utils.add_instance_fault_from_exc(context, instance, e, sys.exc_info(), fault_message=e.kwargs['reason']) @@ -3325,7 +3373,8 @@ class ComputeManager(manager.Manager): injected_files, new_pass, orig_sys_metadata, bdms, recreate, on_shared_storage, preserve_ephemeral, migration, - scheduled_node, limits, request_spec, accel_uuids): + scheduled_node, limits, request_spec, + accel_uuids=None): """Destroy and re-make this instance. 
A 'rebuild' effectively purges all existing data from the system and @@ -3356,7 +3405,8 @@ class ComputeManager(manager.Manager): :param limits: Overcommit limits set by the scheduler. If a host was specified by the user, this will be None :param request_spec: a RequestSpec object used to schedule the instance - :param accel_uuids: a list of cyborg ARQ uuids. + :param accel_uuids: a list of cyborg ARQ uuids or None if the RPC API + is <=5.11 """ # recreate=True means the instance is being evacuated from a failed @@ -3470,19 +3520,11 @@ class ComputeManager(manager.Manager): self._notify_instance_rebuild_error(context, instance, e, bdms) raise else: - instance.apply_migration_context() - # NOTE (ndipanov): This save will now update the host and node - # attributes making sure that next RT pass is consistent since - # it will be based on the instance and not the migration DB - # entry. - instance.host = self.host - instance.node = scheduled_node - instance.save() - instance.drop_migration_context() - - # NOTE (ndipanov): Mark the migration as done only after we - # mark the instance as belonging to this host. - self._set_migration_status(migration, 'done') + # NOTE(gibi): Let the resource tracker set the instance + # host and drop the migration context as we need to hold the + # COMPUTE_RESOURCE_SEMAPHORE to avoid the race with + # _update_available_resources. See bug 1896463. + self.rt.finish_evacuation(instance, scheduled_node, migration) def _do_rebuild_instance_with_claim( self, context, instance, orig_image_ref, image_meta, @@ -5218,10 +5260,24 @@ class ComputeManager(manager.Manager): with self._error_out_instance_on_exception( context, instance, instance_state=instance_state),\ errors_out_migration_ctxt(migration): + self._send_prep_resize_notifications( context, instance, fields.NotificationPhase.START, instance_type) try: + scheduler_hints = self._get_scheduler_hints(filter_properties, + request_spec) + # Error out if this host cannot accept the new instance due + # to anti-affinity. At this point the migration is already + # in-progress, so this is the definitive moment to abort due to + # the policy violation. Also, exploding here is covered by the + # cleanup methods in except block. + try: + self._validate_instance_group_policy(context, instance, + scheduler_hints) + except exception.RescheduledException as e: + raise exception.InstanceFaultRollback(inner_exception=e) + self._prep_resize(context, image, instance, instance_type, filter_properties, node, migration, request_spec, @@ -5598,6 +5654,14 @@ class ComputeManager(manager.Manager): instance.host = migration.dest_compute instance.node = migration.dest_node + # NOTE(gibi): as the instance now tracked on the destination we + # have to make sure that the source compute resource track can + # track this instance as a migration. For that the resource tracker + # needs to see the old_flavor set on the instance. 
The old_flavor + # setting used to be done on the destination host in finish_resize + # but that is racy with a source host update_available_resource + # periodic run + instance.old_flavor = instance.flavor instance.task_state = task_states.RESIZE_MIGRATED instance.save(expected_task_state=task_states.RESIZE_MIGRATING) @@ -5711,6 +5775,10 @@ class ComputeManager(manager.Manager): # to ACTIVE for backwards compatibility old_vm_state = instance.system_metadata.get('old_vm_state', vm_states.ACTIVE) + # NOTE(gibi): this is already set by the resize_instance on the source + # node before calling finish_resize on destination but during upgrade + # it can be that the source node is not having the fix for bug 1944759 + # yet. This assignment can be removed in Z release. instance.old_flavor = old_flavor if old_instance_type_id != new_instance_type_id: @@ -7320,9 +7388,33 @@ class ComputeManager(manager.Manager): @wrap_instance_fault def swap_volume(self, context, old_volume_id, new_volume_id, instance, new_attachment_id): - """Swap volume for an instance.""" - context = context.elevated() + """Replace the old volume with the new volume within the active server + :param context: User request context + :param old_volume_id: Original volume id + :param new_volume_id: New volume id being swapped to + :param instance: Instance with original_volume_id attached + :param new_attachment_id: ID of the new attachment for new_volume_id + """ + @utils.synchronized(instance.uuid) + def _do_locked_swap_volume(context, old_volume_id, new_volume_id, + instance, new_attachment_id): + self._do_swap_volume(context, old_volume_id, new_volume_id, + instance, new_attachment_id) + _do_locked_swap_volume(context, old_volume_id, new_volume_id, instance, + new_attachment_id) + + def _do_swap_volume(self, context, old_volume_id, new_volume_id, + instance, new_attachment_id): + """Replace the old volume with the new volume within the active server + + :param context: User request context + :param old_volume_id: Original volume id + :param new_volume_id: New volume id being swapped to + :param instance: Instance with original_volume_id attached + :param new_attachment_id: ID of the new attachment for new_volume_id + """ + context = context.elevated() compute_utils.notify_about_volume_swap( context, instance, self.host, fields.NotificationPhase.START, @@ -7800,6 +7892,20 @@ class ComputeManager(manager.Manager): :param limits: objects.SchedulerLimits object for this live migration. :returns: a LiveMigrateData object (hypervisor-dependent) """ + + # Error out if this host cannot accept the new instance due + # to anti-affinity. This check at this moment is not very accurate, as + # multiple requests may be happening concurrently and miss the lock, + # but when it works it provides a better user experience by failing + # earlier. Also, it should be safe to explode here, error becomes + # NoValidHost and instance status remains ACTIVE. + try: + self._validate_instance_group_policy(ctxt, instance) + except exception.RescheduledException as e: + msg = ("Failed to validate instance group policy " + "due to: {}".format(e)) + raise exception.MigrationPreCheckError(reason=msg) + src_compute_info = obj_base.obj_to_primitive( self._get_compute_info(ctxt, instance.host)) dst_compute_info = obj_base.obj_to_primitive( @@ -7942,6 +8048,13 @@ class ComputeManager(manager.Manager): """ LOG.debug('pre_live_migration data is %s', migrate_data) + # Error out if this host cannot accept the new instance due + # to anti-affinity. 
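Returning briefly to the swap_volume change above: it serializes volume swaps per instance by taking a lock named after the instance UUID, and because the lock name is only known at runtime, the locked function is defined inline and invoked immediately. A minimal sketch of the same shape using oslo.concurrency directly (``do_swap`` is a hypothetical worker, not Nova's code):

    from oslo_concurrency import lockutils

    def swap_volume(context, old_volume_id, new_volume_id, instance_uuid,
                    new_attachment_id, do_swap):
        """Run ``do_swap`` while holding a per-instance lock.

        Two concurrent swap requests for the same instance are forced to run
        one after the other; swaps on different instances do not block each
        other.
        """
        @lockutils.synchronized(instance_uuid)
        def _do_locked_swap_volume():
            return do_swap(context, old_volume_id, new_volume_id,
                           instance_uuid, new_attachment_id)

        return _do_locked_swap_volume()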
At this point the migration is already in-progress, + # so this is the definitive moment to abort due to the policy + # violation. Also, it should be safe to explode here. The instance + # status remains ACTIVE, migration status failed. + self._validate_instance_group_policy(context, instance) + migrate_data.old_vol_attachment_ids = {} bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) @@ -8059,8 +8172,8 @@ class ComputeManager(manager.Manager): # We don't generate events if CONF.vif_plugging_timeout=0 # meaning that the operator disabled using them. if CONF.vif_plugging_timeout: - return [('network-vif-plugged', vif['id']) - for vif in instance.get_network_info()] + return (instance.get_network_info() + .get_live_migration_plug_time_events()) else: return [] diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 33926e0d47..685f799233 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -933,14 +933,41 @@ class ResourceTracker(object): 'flavor', 'migration_context', 'resources']) - # Now calculate usage based on instance utilization: - instance_by_uuid = self._update_usage_from_instances( - context, instances, nodename) - # Grab all in-progress migrations and error migrations: migrations = objects.MigrationList.get_in_progress_and_error( context, self.host, nodename) + # Check for tracked instances with in-progress, incoming, but not + # finished migrations. For those instance the migration context + # is not applied yet (it will be during finish_resize when the + # migration goes to finished state). We need to manually and + # temporary apply the migration context here when the resource usage is + # updated. See bug 1953359 for more details. + instance_by_uuid = {instance.uuid: instance for instance in instances} + for migration in migrations: + if ( + migration.instance_uuid in instance_by_uuid and + migration.dest_compute == self.host and + migration.dest_node == nodename + ): + # we does not check for the 'post-migrating' migration status + # as applying the migration context for an instance already + # in finished migration status is a no-op anyhow. + instance = instance_by_uuid[migration.instance_uuid] + LOG.debug( + 'Applying migration context for instance %s as it has an ' + 'incoming, in-progress migration %s. ' + 'Migration status is %s', + migration.instance_uuid, migration.uuid, migration.status + ) + # It is OK not to revert the migration context at the end of + # the periodic as the instance is not saved during the periodic + instance.apply_migration_context() + + # Now calculate usage based on instance utilization: + instance_by_uuid = self._update_usage_from_instances( + context, instances, nodename) + self._pair_instances_to_migrations(migrations, instance_by_uuid) self._update_usage_from_migrations(context, migrations, nodename) @@ -1970,3 +1997,21 @@ class ResourceTracker(object): """ self.pci_tracker.free_instance_claims(context, instance) self.pci_tracker.save(context) + + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def finish_evacuation(self, instance, node, migration): + instance.apply_migration_context() + # NOTE (ndipanov): This save will now update the host and node + # attributes making sure that next RT pass is consistent since + # it will be based on the instance and not the migration DB + # entry. 
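The resource tracker hunk above temporarily applies the migration context of any instance that has an incoming, in-progress migration to this node before usage is calculated. A stripped-down sketch of that selection (attribute names are assumed to match the objects used above; this is not Nova's real code):

    def apply_incoming_migration_contexts(host, nodename, instances, migrations):
        """Apply migration contexts for incoming, unfinished migrations.

        This makes the destination-side resize claim visible to the usage
        calculation even though finish_resize has not run yet. The instances
        are never saved by the periodic task, so the temporary context does
        not need to be reverted afterwards.
        """
        instance_by_uuid = {inst.uuid: inst for inst in instances}
        for migration in migrations:
            instance = instance_by_uuid.get(migration.instance_uuid)
            if (instance is not None and
                    migration.dest_compute == host and
                    migration.dest_node == nodename):
                # Re-applying the context for an already finished migration
                # is a no-op, so the migration status is not checked here.
                instance.apply_migration_context()
        return instance_by_uuid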
+ instance.host = self.host + instance.node = node + instance.save() + instance.drop_migration_context() + + # NOTE (ndipanov): Mark the migration as done only after we + # mark the instance as belonging to this host. + if migration: + migration.status = 'done' + migration.save() diff --git a/nova/conf/compute.py b/nova/conf/compute.py index dbdd051d9f..92b5ab7918 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -962,10 +962,16 @@ on compute host B. The configured maximum is not enforced on shelved offloaded servers, as they have no compute host. +.. warning:: If this option is set to 0, the ``nova-compute`` service will fail + to start, as 0 disk devices is an invalid configuration that would + prevent instances from being able to boot. + Possible values: * -1 means unlimited -* Any integer >= 0 represents the maximum allowed +* Any integer >= 1 represents the maximum allowed. A value of 0 will cause the + ``nova-compute`` service to fail to start, as 0 disk devices is an invalid + configuration that would prevent instances from being able to boot. """), cfg.StrOpt('provider_config_location', default='/etc/nova/provider_config/', diff --git a/nova/conf/pci.py b/nova/conf/pci.py index b812b39676..b383d0a69f 100644 --- a/nova/conf/pci.py +++ b/nova/conf/pci.py @@ -116,7 +116,13 @@ Possible values: ``address`` PCI address of the device. Both traditional glob style and regular - expression syntax is supported. + expression syntax is supported. Please note that the address fields are + restricted to the following maximum values: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 ``devname`` Device name of the device (for e.g. interface name). Not all PCI devices diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py index 8eadc0b6ec..4e64d87578 100644 --- a/nova/conf/workarounds.py +++ b/nova/conf/workarounds.py @@ -346,6 +346,59 @@ Related options: * :oslo.config:option:`image_cache.subdirectory_name` * :oslo.config:option:`update_resources_interval` """), + cfg.ListOpt('wait_for_vif_plugged_event_during_hard_reboot', + item_type=cfg.types.String( + choices=[ + "normal", + "direct", + "macvtap", + "baremetal", + "direct-physical", + "virtio-forwarder", + "smart-nic", + ]), + default=[], + help=""" +The libvirt virt driver implements power on and hard reboot by tearing down +every vif of the instance being rebooted then plug them again. By default nova +does not wait for network-vif-plugged event from neutron before it lets the +instance run. This can cause the instance to requests the IP via DHCP before +the neutron backend has a chance to set up the networking backend after the vif +plug. + +This flag defines which vifs nova expects network-vif-plugged events from +during hard reboot. The possible values are neutron port vnic types: + +* normal +* direct +* macvtap +* baremetal +* direct-physical +* virtio-forwarder +* smart-nic + +Adding a ``vnic_type`` to this configuration makes Nova wait for a +network-vif-plugged event for each of the instance's vifs having the specific +``vnic_type`` before unpausing the instance, similarly to how new instance +creation works. + +Please note that not all neutron networking backends send plug time events, for +certain ``vnic_type`` therefore this config is empty by default. + +The ml2/ovs and the networking-odl backends are known to send plug time events +for ports with ``normal`` ``vnic_type`` so it is safe to add ``normal`` to this +config if you are using only those backends in the compute host. 
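To see how this option is meant to be consumed, here is a rough sketch of deriving the events to wait for during a hard reboot from the configured vnic types (a hypothetical helper; the actual libvirt driver change is not part of this hunk):

    def events_for_hard_reboot(network_info, waited_vnic_types):
        """Build (event name, tag) pairs to wait for while hard rebooting.

        ``network_info`` is assumed to be an iterable of vif dicts exposing
        ``id`` and ``vnic_type``; ``waited_vnic_types`` is the value of
        [workarounds]wait_for_vif_plugged_event_during_hard_reboot.
        """
        return [
            ('network-vif-plugged', vif['id'])
            for vif in network_info
            if vif.get('vnic_type') in waited_vnic_types
        ]

With the option set to ``normal`` only vifs of that vnic type are waited for; with the default empty list the instance is unpaused without waiting, matching the pre-existing behaviour.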
+ +The neutron in-tree SRIOV backend does not reliably send network-vif-plugged +event during plug time for ports with ``direct`` vnic_type and never sends +that event for port with ``direct-physical`` vnic_type during plug time. For +other ``vnic_type`` and backend pairs, please consult the developers of the +backend. + +Related options: + +* :oslo.config:option:`DEFAULT.vif_plugging_timeout` +"""), ] diff --git a/nova/console/websocketproxy.py b/nova/console/websocketproxy.py index 5f2985dfbb..8512ac62ad 100644 --- a/nova/console/websocketproxy.py +++ b/nova/console/websocketproxy.py @@ -19,6 +19,8 @@ Leverages websockify.py by Joel Martin ''' import copy +from http import HTTPStatus +import os import socket import sys @@ -281,6 +283,22 @@ class NovaProxyRequestHandler(websockify.ProxyRequestHandler): def socket(self, *args, **kwargs): return websockifyserver.WebSockifyServer.socket(*args, **kwargs) + def send_head(self): + # This code is copied from this example patch: + # https://bugs.python.org/issue32084#msg306545 + path = self.translate_path(self.path) + if os.path.isdir(path): + parts = urlparse.urlsplit(self.path) + if not parts.path.endswith('/'): + # Browsers interpret "Location: //uri" as an absolute URI + # like "http://URI" + if self.path.startswith('//'): + self.send_error(HTTPStatus.BAD_REQUEST, + "URI must not start with //") + return None + + return super(NovaProxyRequestHandler, self).send_head() + class NovaWebSocketProxy(websockify.WebSocketProxy): def __init__(self, *args, **kwargs): diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index 973622a8f2..9bc673faa8 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -47,6 +47,7 @@ from sqlalchemy import or_ from sqlalchemy.orm import aliased from sqlalchemy.orm import joinedload from sqlalchemy.orm import noload +from sqlalchemy.orm import subqueryload from sqlalchemy.orm import undefer from sqlalchemy.schema import Table from sqlalchemy import sql @@ -1266,13 +1267,27 @@ def _build_instance_get(context, columns_to_join=None): continue if 'extra.' in column: query = query.options(undefer(column)) + elif column in ['metadata', 'system_metadata']: + # NOTE(melwitt): We use subqueryload() instead of joinedload() for + # metadata and system_metadata because of the one-to-many + # relationship of the data. Directly joining these columns can + # result in a large number of additional rows being queried if an + # instance has a large number of (system_)metadata items, resulting + # in a large data transfer. Instead, the subqueryload() will + # perform additional queries to obtain metadata and system_metadata + # for the instance. + query = query.options(subqueryload(column)) else: query = query.options(joinedload(column)) # NOTE(alaski) Stop lazy loading of columns not needed. for col in ['metadata', 'system_metadata']: if col not in columns_to_join: query = query.options(noload(col)) - return query + # NOTE(melwitt): We need to use order_by(<unique column>) so that the + # additional queries emitted by subqueryload() include the same ordering as + # used by the parent query. 
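The switch from joinedload() to subqueryload() for the one-to-many metadata relationships, together with the order_by() explained above, can be reproduced in isolation. A minimal SQLAlchemy 1.3-style sketch with toy models (not Nova's real schema):

    from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import relationship, sessionmaker, subqueryload

    Base = declarative_base()

    class Instance(Base):
        __tablename__ = 'instances'
        id = Column(Integer, primary_key=True)
        uuid = Column(String(36))
        metadata_items = relationship('InstanceMetadata')

    class InstanceMetadata(Base):
        __tablename__ = 'instance_metadata'
        id = Column(Integer, primary_key=True)
        instance_id = Column(Integer, ForeignKey('instances.id'))
        key = Column(String(255))
        value = Column(String(255))

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    # subqueryload() issues a second SELECT for the child rows instead of
    # joining them into the parent query, so an instance with many metadata
    # items no longer multiplies the parent rows transferred. The explicit
    # order_by() on a unique column keeps that second SELECT aligned with
    # the parent query, as the comment above describes.
    instances = (session.query(Instance)
                 .options(subqueryload(Instance.metadata_items))
                 .order_by(Instance.id)
                 .all())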
+ # https://docs.sqlalchemy.org/en/13/orm/loading_relationships.html#the-importance-of-ordering + return query.order_by(models.Instance.id) def _instances_fill_metadata(context, instances, manual_joins=None): @@ -4083,64 +4098,132 @@ def task_log_end_task(context, task_name, period_beginning, period_ending, ################## -def _archive_if_instance_deleted(table, shadow_table, instances, conn, - max_rows, before): - """Look for records that pertain to deleted instances, but may not be - deleted themselves. This catches cases where we delete an instance, - but leave some residue because of a failure in a cleanup path or - similar. - - Logic is: if I have a column called instance_uuid, and that instance - is deleted, then I can be deleted. - """ - - # NOTE(jake): handle instance_actions_events differently as it relies on - # instance_actions.id not instances.uuid - if table.name == "instance_actions_events": - instance_actions = models.BASE.metadata.tables["instance_actions"] - query_select = sql.select( - [table], - and_(instances.c.deleted != instances.c.deleted.default.arg, - instances.c.uuid == instance_actions.c.instance_uuid, - instance_actions.c.id == table.c.action_id)) - - else: - query_select = sql.select( - [table], - and_(instances.c.deleted != instances.c.deleted.default.arg, - instances.c.uuid == table.c.instance_uuid)) - - if before: - query_select = query_select.where(instances.c.deleted_at < before) +def _get_tables_with_fk_to_table(table): + """Get a list of tables that refer to the given table by foreign key (FK). - query_select = query_select.order_by(table.c.id).limit(max_rows) + :param table: Table object (parent) for which to find references by FK - query_insert = shadow_table.insert(inline=True).\ - from_select([c.name for c in table.c], query_select) - - delete_statement = DeleteFromSelect(table, query_select, - table.c.id) + :returns: A list of Table objects that refer to the specified table by FK + """ + tables = [] + for t in models.BASE.metadata.tables.values(): + for fk in t.foreign_keys: + if fk.references(table): + tables.append(t) + return tables + + +def _get_fk_stmts(metadata, conn, table, column, records): + """Find records related to this table by foreign key (FK) and create and + return insert/delete statements for them. + + Logic is: find the tables that reference the table passed to this method + and walk the tree of references by FK. As child records are found, prepend + them to deques to execute later in a single database transaction (to avoid + orphaning related records if any one insert/delete fails or the archive + process is otherwise interrupted). + + :param metadata: Metadata object to use to construct a shadow Table object + :param conn: Connection object to use to select records related by FK + :param table: Table object (parent) for which to find references by FK + :param column: Column object (parent) to use to select records related by + FK + :param records: A list of records (column values) to use to select records + related by FK + + :returns: tuple of (insert statements, delete statements) for records + related by FK to insert into shadow tables and delete from main tables + """ + inserts = collections.deque() + deletes = collections.deque() + fk_tables = _get_tables_with_fk_to_table(table) + for fk_table in fk_tables: + # Create the shadow table for the referencing table. 
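The new ``_get_tables_with_fk_to_table()`` is just a walk over the reflected metadata; the same idea in standalone form with toy tables (not the real Nova models):

    from sqlalchemy import Column, ForeignKey, Integer, MetaData, Table

    metadata = MetaData()
    instances = Table(
        'instances', metadata,
        Column('id', Integer, primary_key=True))
    instance_extra = Table(
        'instance_extra', metadata,
        Column('id', Integer, primary_key=True),
        Column('instance_id', Integer, ForeignKey('instances.id')))

    def tables_referencing(parent, metadata):
        """Return the tables that point at ``parent`` through a foreign key."""
        referrers = []
        for candidate in metadata.tables.values():
            if any(fk.references(parent) for fk in candidate.foreign_keys):
                referrers.append(candidate)
        return referrers

    # Prints ['instance_extra'] -- these child rows must be archived together
    # with (and deleted before) their parent 'instances' rows.
    print([t.name for t in tables_referencing(instances, metadata)])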
+ fk_shadow_tablename = _SHADOW_TABLE_PREFIX + fk_table.name + try: + fk_shadow_table = Table(fk_shadow_tablename, metadata, + autoload=True) + except NoSuchTableError: + # No corresponding shadow table; skip it. + continue - try: - with conn.begin(): - conn.execute(query_insert) - result_delete = conn.execute(delete_statement) - return result_delete.rowcount - except db_exc.DBReferenceError as ex: - LOG.warning('Failed to archive %(table)s: %(error)s', - {'table': table.name, - 'error': six.text_type(ex)}) - return 0 + # TODO(stephenfin): Drop this when we drop the table + if fk_table.name == "dns_domains": + # We have one table (dns_domains) where the key is called + # "domain" rather than "id" + fk_column = fk_table.c.domain + else: + fk_column = fk_table.c.id + + for fk in fk_table.foreign_keys: + # We need to find the records in the referring (child) table that + # correspond to the records in our (parent) table so we can archive + # them. + + # First, select the column in the parent referenced by the child + # table that corresponds to the parent table records that were + # passed in. + # Example: table = 'instances' and fk_table = 'instance_extra' + # fk.parent = instance_extra.instance_uuid + # fk.column = instances.uuid + # SELECT instances.uuid FROM instances, instance_extra + # WHERE instance_extra.instance_uuid = instances.uuid + # AND instance.id IN (<ids>) + # We need the instance uuids for the <ids> in order to + # look up the matching instance_extra records. + select = sql.select([fk.column]).where( + sql.and_(fk.parent == fk.column, column.in_(records))) + rows = conn.execute(select).fetchall() + p_records = [r[0] for r in rows] + # Then, select rows in the child table that correspond to the + # parent table records that were passed in. + # Example: table = 'instances' and fk_table = 'instance_extra' + # fk.parent = instance_extra.instance_uuid + # fk.column = instances.uuid + # SELECT instance_extra.id FROM instance_extra, instances + # WHERE instance_extra.instance_uuid = instances.uuid + # AND instances.uuid IN (<uuids>) + # We will get the instance_extra ids we need to archive + # them. + fk_select = sql.select([fk_column]).where( + sql.and_(fk.parent == fk.column, fk.column.in_(p_records))) + fk_rows = conn.execute(fk_select).fetchall() + fk_records = [r[0] for r in fk_rows] + if fk_records: + # If we found any records in the child table, create shadow + # table insert statements for them and prepend them to the + # deque. + fk_columns = [c.name for c in fk_table.c] + fk_insert = fk_shadow_table.insert(inline=True).\ + from_select(fk_columns, sql.select([fk_table], + fk_column.in_(fk_records))) + inserts.appendleft(fk_insert) + # Create main table delete statements and prepend them to the + # deque. + fk_delete = fk_table.delete().where(fk_column.in_(fk_records)) + deletes.appendleft(fk_delete) + # Repeat for any possible nested child tables. + i, d = _get_fk_stmts(metadata, conn, fk_table, fk_column, fk_records) + inserts.extendleft(i) + deletes.extendleft(d) + + return inserts, deletes def _archive_deleted_rows_for_table(metadata, tablename, max_rows, before): """Move up to max_rows rows from one tables to the corresponding shadow table. - :returns: 2-item tuple: + Will also follow FK constraints and archive all referring rows. + Example: archving a record from the 'instances' table will also archive + the 'instance_extra' record before archiving the 'instances' record. 
+ + :returns: 3-item tuple: - number of rows archived - list of UUIDs of instances that were archived + - number of extra rows archived (due to FK constraints) + dict of {tablename: rows_archived} """ conn = metadata.bind.connect() # NOTE(tdurakov): table metadata should be received @@ -4156,7 +4239,7 @@ def _archive_deleted_rows_for_table(metadata, tablename, max_rows, before): shadow_table = Table(shadow_tablename, metadata, autoload=True) except NoSuchTableError: # No corresponding shadow table; skip it. - return rows_archived, deleted_instance_uuids + return rows_archived, deleted_instance_uuids, {} # TODO(stephenfin): Drop this when we drop the table if tablename == "dns_domains": @@ -4179,10 +4262,29 @@ def _archive_deleted_rows_for_table(metadata, tablename, max_rows, before): rows = conn.execute(select).fetchall() records = [r[0] for r in rows] + # We will archive deleted rows for this table and also generate insert and + # delete statements for extra rows we may archive by following FK + # relationships. Because we are iterating over the sorted_tables (list of + # Table objects sorted in order of foreign key dependency), new inserts and + # deletes ("leaves") will be added to the fronts of the deques created in + # _get_fk_stmts. This way, we make sure we delete child table records + # before we delete their parent table records. + + # Keep track of any extra tablenames to number of rows that we archive by + # following FK relationships. + # {tablename: extra_rows_archived} + extras = collections.defaultdict(int) if records: insert = shadow_table.insert(inline=True).\ from_select(columns, sql.select([table], column.in_(records))) delete = table.delete().where(column.in_(records)) + # Walk FK relationships and add insert/delete statements for rows that + # refer to this table via FK constraints. fk_inserts and fk_deletes + # will be prepended to by _get_fk_stmts if referring rows are found by + # FK constraints. + fk_inserts, fk_deletes = _get_fk_stmts( + metadata, conn, table, column, records) + # NOTE(tssurya): In order to facilitate the deletion of records from # instance_mappings, request_specs and instance_group_member tables in # the nova_api DB, the rows of deleted instances from the instances @@ -4196,9 +4298,14 @@ def _archive_deleted_rows_for_table(metadata, tablename, max_rows, before): try: # Group the insert and delete in a transaction. with conn.begin(): + for fk_insert in fk_inserts: + conn.execute(fk_insert) + for fk_delete in fk_deletes: + result_fk_delete = conn.execute(fk_delete) + extras[fk_delete.table.name] += result_fk_delete.rowcount conn.execute(insert) result_delete = conn.execute(delete) - rows_archived = result_delete.rowcount + rows_archived += result_delete.rowcount except db_exc.DBReferenceError as ex: # A foreign key constraint keeps us from deleting some of # these rows until we clean up a dependent table. 
Just @@ -4207,18 +4314,7 @@ def _archive_deleted_rows_for_table(metadata, tablename, max_rows, before): "%(tablename)s: %(error)s", {'tablename': tablename, 'error': six.text_type(ex)}) - # NOTE(jake): instance_actions_events doesn't have a instance_uuid column - # but still needs to be archived as it is a FK constraint - if ((max_rows is None or rows_archived < max_rows) and - ('instance_uuid' in columns or - tablename == 'instance_actions_events')): - instances = models.BASE.metadata.tables['instances'] - limit = max_rows - rows_archived if max_rows is not None else None - extra = _archive_if_instance_deleted(table, shadow_table, instances, - conn, limit, before) - rows_archived += extra - - return rows_archived, deleted_instance_uuids + return rows_archived, deleted_instance_uuids, extras def archive_deleted_rows(context=None, max_rows=None, before=None): @@ -4242,13 +4338,18 @@ def archive_deleted_rows(context=None, max_rows=None, before=None): - list of UUIDs of instances that were archived - total number of rows that were archived """ - table_to_rows_archived = {} + table_to_rows_archived = collections.defaultdict(int) deleted_instance_uuids = [] total_rows_archived = 0 meta = MetaData(get_engine(use_slave=True, context=context)) meta.reflect() - # Reverse sort the tables so we get the leaf nodes first for processing. - for table in reversed(meta.sorted_tables): + # Get the sorted list of tables in order of foreign key dependency. + # Process the parent tables and find their dependent records in order to + # archive the related records in a single database transactions. The goal + # is to avoid a situation where, for example, an 'instances' table record + # is missing its corresponding 'instance_extra' record due to running the + # archive_deleted_rows command with max_rows. + for table in meta.sorted_tables: tablename = table.name rows_archived = 0 # skip the special sqlalchemy-migrate migrate_version table and any @@ -4256,7 +4357,7 @@ def archive_deleted_rows(context=None, max_rows=None, before=None): if (tablename == 'migrate_version' or tablename.startswith(_SHADOW_TABLE_PREFIX)): continue - rows_archived, _deleted_instance_uuids = ( + rows_archived, _deleted_instance_uuids, extras = ( _archive_deleted_rows_for_table( meta, tablename, max_rows=max_rows - total_rows_archived, @@ -4267,6 +4368,9 @@ def archive_deleted_rows(context=None, max_rows=None, before=None): # Only report results for tables that had updates. 
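Seen from a distance, the reworked archive_deleted_rows() is a budgeted walk over the FK-sorted table list in which child rows archived along the way count against the same budget. A simplified sketch of that accounting (``archive_one_table`` is a stand-in for the per-table helper above, not a real Nova function):

    import collections

    def archive_in_batches(tablenames, archive_one_table, max_rows):
        """Archive at most ``max_rows`` rows in total across all tables.

        ``archive_one_table(name, limit)`` is assumed to return a tuple of
        (rows_archived, extras_by_table), mirroring the helper above, where
        the extras are child rows archived by following FK constraints.
        """
        per_table = collections.defaultdict(int)
        total = 0
        for name in tablenames:
            rows, extras = archive_one_table(name, max_rows - total)
            total += rows
            if rows:
                per_table[name] += rows
            for extra_name, extra_rows in extras.items():
                per_table[extra_name] += extra_rows
                total += extra_rows
            if total >= max_rows:
                break
        return dict(per_table), total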
if rows_archived: table_to_rows_archived[tablename] = rows_archived + for tablename, extra_rows_archived in extras.items(): + table_to_rows_archived[tablename] += extra_rows_archived + total_rows_archived += extra_rows_archived if total_rows_archived >= max_rows: break return table_to_rows_archived, deleted_instance_uuids, total_rows_archived diff --git a/nova/exception.py b/nova/exception.py index f58705d8c1..fd9dc4f5d0 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -559,6 +559,14 @@ class ServiceTooOld(Invalid): "Unable to continue.") +class TooOldComputeService(Invalid): + msg_fmt = _("Current Nova version does not support computes older than " + "%(oldest_supported_version)s but the minimum compute service " + "level in your %(scope)s is %(min_service_level)d and the " + "oldest supported service level is " + "%(oldest_supported_service)d.") + + class DestinationDiskExists(Invalid): msg_fmt = _("The supplied disk path (%(path)s) already exists, " "it is expected not to exist.") @@ -2281,6 +2289,10 @@ class PMEMNamespaceConfigInvalid(NovaException): "please check your conf file. ") +class GetPMEMNamespacesFailed(NovaException): + msg_fmt = _("Get PMEM namespaces on host failed: %(reason)s.") + + class VPMEMCleanupFailed(NovaException): msg_fmt = _("Failed to clean up the vpmem backend device %(dev)s: " "%(error)s") diff --git a/nova/middleware.py b/nova/middleware.py index 717fecd4ef..8b0fc59561 100644 --- a/nova/middleware.py +++ b/nova/middleware.py @@ -24,11 +24,15 @@ def set_defaults(): 'X-Roles', 'X-Service-Catalog', 'X-User-Id', - 'X-Tenant-Id'], + 'X-Tenant-Id', + 'X-OpenStack-Nova-API-Version', + 'OpenStack-API-Version'], expose_headers=['X-Auth-Token', 'X-Openstack-Request-Id', 'X-Subject-Token', - 'X-Service-Token'], + 'X-Service-Token', + 'X-OpenStack-Nova-API-Version', + 'OpenStack-API-Version'], allow_methods=['GET', 'PUT', 'POST', diff --git a/nova/network/model.py b/nova/network/model.py index d8119fae72..7ed9d2d1b8 100644 --- a/nova/network/model.py +++ b/nova/network/model.py @@ -469,6 +469,14 @@ class VIF(Model): return (self.is_hybrid_plug_enabled() and not migration.is_same_host()) + @property + def has_live_migration_plug_time_event(self): + """Returns whether this VIF's network-vif-plugged external event will + be sent by Neutron at "plugtime" - in other words, as soon as neutron + completes configuring the network backend. + """ + return self.is_hybrid_plug_enabled() + def is_hybrid_plug_enabled(self): return self['details'].get(VIF_DETAILS_OVS_HYBRID_PLUG, False) @@ -530,15 +538,22 @@ class NetworkInfo(list): return jsonutils.dumps(self) def get_bind_time_events(self, migration): - """Returns whether any of our VIFs have "bind-time" events. See - has_bind_time_event() docstring for more details. + """Returns a list of external events for any VIFs that have + "bind-time" events during cold migration. """ return [('network-vif-plugged', vif['id']) for vif in self if vif.has_bind_time_event(migration)] + def get_live_migration_plug_time_events(self): + """Returns a list of external events for any VIFs that have + "plug-time" events during live migration. + """ + return [('network-vif-plugged', vif['id']) + for vif in self if vif.has_live_migration_plug_time_event] + def get_plug_time_events(self, migration): - """Complementary to get_bind_time_events(), any event that does not - fall in that category is a plug-time event. + """Returns a list of external events for any VIFs that have + "plug-time" events during cold migration. 
""" return [('network-vif-plugged', vif['id']) for vif in self if not vif.has_bind_time_event(migration)] diff --git a/nova/network/neutron.py b/nova/network/neutron.py index db4cb44d47..b05ab1a055 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -271,6 +271,7 @@ def _get_ksa_client(context, admin=False): client = utils.get_ksa_adapter( 'network', ksa_auth=auth_plugin, ksa_session=session) client.additional_headers = {'accept': 'application/json'} + client.connect_retries = CONF.neutron.http_retries return client @@ -802,9 +803,15 @@ class API(base.Base): # TODO(arosen) Should optimize more to do direct query for security # group if len(security_groups) == 1 if len(security_groups): + # NOTE(slaweq): fields other than name and id aren't really needed + # so asking only about those fields will allow Neutron to not + # prepare list of rules for each found security group. That may + # speed processing of this request a lot in case when tenant has + # got many security groups + sg_fields = ['id', 'name'] search_opts = {'tenant_id': instance.project_id} user_security_groups = neutron.list_security_groups( - **search_opts).get('security_groups') + fields=sg_fields, **search_opts).get('security_groups') for security_group in security_groups: name_match = None diff --git a/nova/objects/service.py b/nova/objects/service.py index 33d1a5e2e8..bc1782e7dd 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -192,6 +192,13 @@ SERVICE_VERSION_HISTORY = ( {'compute_rpc': '5.12'}, ) +# This is used to raise an error at service startup if older than N-1 computes +# are detected. Update this at the beginning of every release cycle +OLDEST_SUPPORTED_SERVICE_VERSION = 'Ussuri' +SERVICE_VERSION_ALIASES = { + 'Ussuri': 41 +} + # TODO(berrange): Remove NovaObjectDictCompat @base.NovaObjectRegistry.register diff --git a/nova/pci/manager.py b/nova/pci/manager.py index 3084643f5e..c55e732c01 100644 --- a/nova/pci/manager.py +++ b/nova/pci/manager.py @@ -117,8 +117,42 @@ class PciDevTracker(object): devices = [] for dev in jsonutils.loads(devices_json): - if self.dev_filter.device_assignable(dev): - devices.append(dev) + try: + if self.dev_filter.device_assignable(dev): + devices.append(dev) + except exception.PciConfigInvalidWhitelist as e: + # The raised exception is misleading as the problem is not with + # the whitelist config but with the host PCI device reported by + # libvirt. The code that matches the host PCI device to the + # withelist spec reuses the WhitelistPciAddress object to parse + # the host PCI device address. That parsing can fail if the + # PCI address has a 32 bit domain. But this should not prevent + # processing the rest of the devices. So we simply skip this + # device and continue. + # Please note that this except block does not ignore the + # invalid whitelist configuration. The whitelist config has + # already been parsed or rejected in case it was invalid. At + # this point the self.dev_filter representes the parsed and + # validated whitelist config. + LOG.debug( + 'Skipping PCI device %s reported by the hypervisor: %s', + {k: v for k, v in dev.items() + if k in ['address', 'parent_addr']}, + # NOTE(gibi): this is ugly but the device_assignable() call + # uses the PhysicalPciAddress class to parse the PCI + # addresses and that class reuses the code from + # PciAddressSpec that was originally designed to parse + # whitelist spec. Hence the raised exception talks about + # whitelist config. 
This is misleading as in our case the + # PCI address that we failed to parse came from the + # hypervisor. + # TODO(gibi): refactor the false abstraction to make the + # code reuse clean from the false assumption that we only + # parse whitelist config with + # devspec.PciAddressSpec._set_pci_dev_info() + str(e).replace( + 'Invalid PCI devices Whitelist config:', 'The')) + self._set_hvdevs(devices) @staticmethod @@ -225,6 +259,7 @@ class PciDevTracker(object): self.stale[new_value['address']] = new_value else: existed.update_device(new_value) + self.stats.update_device(existed) # Track newly discovered devices. for dev in [dev for dev in devices if diff --git a/nova/pci/stats.py b/nova/pci/stats.py index b911410848..232e94ff6f 100644 --- a/nova/pci/stats.py +++ b/nova/pci/stats.py @@ -104,6 +104,32 @@ class PciDeviceStats(object): pool['parent_ifname'] = dev.extra_info['parent_ifname'] return pool + def _get_pool_with_device_type_mismatch(self, dev): + """Check for device type mismatch in the pools for a given device. + + Return (pool, device) if device type does not match or a single None + if the device type matches. + """ + for pool in self.pools: + for device in pool['devices']: + if device.address == dev.address: + if dev.dev_type != pool["dev_type"]: + return pool, device + return None + + return None + + def update_device(self, dev): + """Update a device to its matching pool.""" + pool_device_info = self._get_pool_with_device_type_mismatch(dev) + if pool_device_info is None: + return + + pool, device = pool_device_info + pool['devices'].remove(device) + self._decrease_pool_count(self.pools, pool) + self.add_device(dev) + def add_device(self, dev): """Add a device to its matching pool.""" dev_pool = self._create_pool_keys_from_dev(dev) diff --git a/nova/service.py b/nova/service.py index 5af20ed0b9..064275e97a 100644 --- a/nova/service.py +++ b/nova/service.py @@ -255,6 +255,17 @@ class Service(service.Service): periodic_fuzzy_delay=periodic_fuzzy_delay, periodic_interval_max=periodic_interval_max) + # NOTE(gibi): This have to be after the service object creation as + # that is the point where we can safely use the RPC to the conductor. + # E.g. the Service.__init__ actually waits for the conductor to start + # up before it allows the service to be created. The + # raise_if_old_compute() depends on the RPC to be up and does not + # implement its own retry mechanism to connect to the conductor. + try: + utils.raise_if_old_compute() + except exception.TooOldComputeService as e: + LOG.warning(str(e)) + return service_obj def kill(self): diff --git a/nova/storage/rbd_utils.py b/nova/storage/rbd_utils.py index 22bafe5053..431dfc9aec 100644 --- a/nova/storage/rbd_utils.py +++ b/nova/storage/rbd_utils.py @@ -405,7 +405,14 @@ class RBDDriver(object): # MAX_AVAIL stat will divide by the replication size when doing the # calculation. args = ['ceph', 'df', '--format=json'] + self.ceph_args() - out, _ = processutils.execute(*args) + + try: + out, _ = processutils.execute(*args) + except processutils.ProcessExecutionError: + LOG.exception('Could not determine disk usage') + raise exception.StorageError( + reason='Could not determine disk usage') + stats = jsonutils.loads(out) # Find the pool for which we are configured. 
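The rbd_utils change above wraps the ``ceph df`` call so a failing binary surfaces as a StorageError instead of an unhandled ProcessExecutionError. A standalone sketch of the same pattern, using the standard library instead of oslo processutils and assuming the usual layout of ``ceph df --format=json`` output:

    import json
    import subprocess

    class StorageError(Exception):
        pass

    def get_pool_stats(pool_name, ceph_args=()):
        """Return the stats dict reported by ``ceph df`` for ``pool_name``."""
        cmd = ['ceph', 'df', '--format=json'] + list(ceph_args)
        try:
            out = subprocess.check_output(cmd)
        except (OSError, subprocess.CalledProcessError):
            # Mirror the change above: convert the raw execution error into
            # a storage error the caller can handle.
            raise StorageError('Could not determine disk usage')

        for pool in json.loads(out).get('pools', []):
            if pool.get('name') == pool_name:
                return pool.get('stats', {})
        raise StorageError('Pool %s not found in ceph df output' % pool_name)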
diff --git a/nova/test.py b/nova/test.py index dd0fc4daa5..9bb6e5e032 100644 --- a/nova/test.py +++ b/nova/test.py @@ -51,10 +51,13 @@ from oslotest import base from oslotest import mock_fixture import six from six.moves import builtins +from sqlalchemy.dialects import sqlite import testtools +from nova.api.openstack import wsgi_app from nova.compute import rpcapi as compute_rpcapi from nova import context +from nova.db.sqlalchemy import api as sqlalchemy_api from nova import exception from nova import objects from nova.objects import base as objects_base @@ -283,6 +286,10 @@ class TestCase(base.BaseTestCase): self.useFixture(nova_fixtures.GenericPoisonFixture()) + # make sure that the wsgi app is fully initialized for all testcase + # instead of only once initialized for test worker + wsgi_app.init_global_data.reset() + def _setup_cells(self): """Setup a normal cellsv2 environment. @@ -376,6 +383,22 @@ class TestCase(base.BaseTestCase): for k, v in kw.items(): CONF.set_override(k, v, group) + def enforce_fk_constraints(self, engine=None): + if engine is None: + engine = sqlalchemy_api.get_engine() + dialect = engine.url.get_dialect() + if dialect == sqlite.dialect: + # We're seeing issues with foreign key support in SQLite 3.6.20 + # SQLAlchemy doesn't support it at all with < SQLite 3.6.19 + # It works fine in SQLite 3.7. + # So return early to skip this test if running SQLite < 3.7 + import sqlite3 + tup = sqlite3.sqlite_version_info + if tup[0] < 3 or (tup[0] == 3 and tup[1] < 7): + self.skipTest( + 'sqlite version too old for reliable SQLA foreign_keys') + engine.connect().execute("PRAGMA foreign_keys = ON") + def start_service(self, name, host=None, cell_name=None, **kwargs): # Disallow starting multiple scheduler services if name == 'scheduler' and self._service_fixture_count[name]: diff --git a/nova/tests/functional/db/test_archive.py b/nova/tests/functional/db/test_archive.py index 0bda64eb43..04abf032cd 100644 --- a/nova/tests/functional/db/test_archive.py +++ b/nova/tests/functional/db/test_archive.py @@ -17,7 +17,6 @@ import re from dateutil import parser as dateutil_parser from oslo_utils import timeutils -from sqlalchemy.dialects import sqlite from sqlalchemy import func from sqlalchemy import MetaData from sqlalchemy import select @@ -33,22 +32,7 @@ class TestDatabaseArchive(integrated_helpers._IntegratedTestBase): def setUp(self): super(TestDatabaseArchive, self).setUp() - # TODO(mriedem): pull this out so we can re-use it in - # test_archive_deleted_rows_fk_constraint - # SQLite doesn't enforce foreign key constraints without a pragma. - engine = sqlalchemy_api.get_engine() - dialect = engine.url.get_dialect() - if dialect == sqlite.dialect: - # We're seeing issues with foreign key support in SQLite 3.6.20 - # SQLAlchemy doesn't support it at all with < SQLite 3.6.19 - # It works fine in SQLite 3.7. - # So return early to skip this test if running SQLite < 3.7 - import sqlite3 - tup = sqlite3.sqlite_version_info - if tup[0] < 3 or (tup[0] == 3 and tup[1] < 7): - self.skipTest( - 'sqlite version too old for reliable SQLA foreign_keys') - engine.connect().execute("PRAGMA foreign_keys = ON") + self.enforce_fk_constraints() def test_archive_deleted_rows(self): # Boots a server, deletes it, and then tries to archive it. @@ -114,6 +98,19 @@ class TestDatabaseArchive(integrated_helpers._IntegratedTestBase): # Verify we have some system_metadata since we'll check that later. 
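The new TestCase.enforce_fk_constraints() helper used by this test's setUp() is essentially the old inline SQLite pragma dance factored out for reuse. A standalone approximation of the same check, without the test-skip plumbing:

    import sqlite3

    from sqlalchemy import create_engine
    from sqlalchemy.dialects import sqlite as sqlite_dialect

    def enforce_fk_constraints(engine=None):
        """Enable FK enforcement on SQLite engines, when the library allows.

        Returns False when SQLite is older than 3.7, in which case callers
        should skip tests that rely on foreign key enforcement.
        """
        if engine is None:
            engine = create_engine('sqlite://')
        if engine.url.get_dialect() == sqlite_dialect.dialect:
            if sqlite3.sqlite_version_info[:2] < (3, 7):
                return False
            engine.connect().execute('PRAGMA foreign_keys = ON')
        return True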
self.assertTrue(len(instance.system_metadata), 'No system_metadata for instance: %s' % server_id) + # Create a pci_devices record to simulate an instance that had a PCI + # device allocated at the time it was deleted. There is a window of + # time between deletion of the instance record and freeing of the PCI + # device in nova-compute's _complete_deletion method during RT update. + db.pci_device_update(admin_context, 1, 'fake-address', + {'compute_node_id': 1, + 'address': 'fake-address', + 'vendor_id': 'fake', + 'product_id': 'fake', + 'dev_type': 'fake', + 'label': 'fake', + 'status': 'allocated', + 'instance_uuid': instance.uuid}) # Now try and archive the soft deleted records. results, deleted_instance_uuids, archived = \ db.archive_deleted_rows(max_rows=100) @@ -128,6 +125,49 @@ class TestDatabaseArchive(integrated_helpers._IntegratedTestBase): self.assertIn('instance_actions', results) self.assertIn('instance_actions_events', results) self.assertEqual(sum(results.values()), archived) + # Verify that the pci_devices record has not been dropped + self.assertNotIn('pci_devices', results) + + def test_archive_deleted_rows_incomplete(self): + """This tests a scenario where archive_deleted_rows is run with + --max_rows and does not run to completion. + + That is, the archive is stopped before all archivable records have been + archived. Specifically, the problematic state is when a single instance + becomes partially archived (example: 'instance_extra' record for one + instance has been archived while its 'instances' record remains). Any + access of the instance (example: listing deleted instances) that + triggers the retrieval of a dependent record that has been archived + away, results in undefined behavior that may raise an error. + + We will force the system into a state where a single deleted instance + is partially archived. We want to verify that we can, for example, + successfully do a GET /servers/detail at any point between partial + archive_deleted_rows runs without errors. + """ + # Boots a server, deletes it, and then tries to archive it. + server = self._create_server() + server_id = server['id'] + # Assert that there are instance_actions. instance_actions are + # interesting since we don't soft delete them but they have a foreign + # key back to the instances table. + actions = self.api.get_instance_actions(server_id) + self.assertTrue(len(actions), + 'No instance actions for server: %s' % server_id) + self._delete_server(server) + # Archive deleted records iteratively, 1 row at a time, and try to do a + # GET /servers/detail between each run. All should succeed. + exceptions = [] + while True: + _, _, archived = db.archive_deleted_rows(max_rows=1) + try: + # Need to use the admin API to list deleted servers. 
+ self.admin_api.get_servers(search_opts={'deleted': True}) + except Exception as ex: + exceptions.append(ex) + if archived == 0: + break + self.assertFalse(exceptions) def _get_table_counts(self): engine = sqlalchemy_api.get_engine() diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index f0e7e148eb..fcbbdce9d4 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -385,7 +385,15 @@ class InstanceHelperMixin: """ # if forcing the server onto a host, we have to use the admin API if not api: - api = self.api if not az else getattr(self, 'admin_api', self.api) + api = self.api if not az and not host else getattr( + self, 'admin_api', self.api) + + if host and not api.microversion: + api.microversion = '2.74' + # with 2.74 networks param needs to use 'none' instead of None + # if no network is needed + if networks is None: + networks = 'none' body = self._build_server( name, image_uuid, flavor_id, networks, az, host) diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py index 28f8463aea..db61b5fd4b 100644 --- a/nova/tests/functional/libvirt/test_numa_servers.py +++ b/nova/tests/functional/libvirt/test_numa_servers.py @@ -21,6 +21,7 @@ from oslo_config import cfg from oslo_log import log as logging import nova +from nova.compute import manager from nova.conf import neutron as neutron_conf from nova import context as nova_context from nova import objects @@ -711,6 +712,211 @@ class NUMAServersTest(NUMAServersTestBase): server = self._wait_for_state_change(server, 'ACTIVE') + def _assert_pinned_cpus(self, hostname, expected_number_of_pinned): + numa_topology = objects.NUMATopology.obj_from_db_obj( + objects.ComputeNode.get_by_nodename( + self.ctxt, hostname, + ).numa_topology, + ) + self.assertEqual( + expected_number_of_pinned, len(numa_topology.cells[0].pinned_cpus)) + + def _create_server_and_resize_bug_1944759(self): + self.flags( + cpu_dedicated_set='0-3', cpu_shared_set='4-7', group='compute') + self.flags(vcpu_pin_set=None) + + # start services + self.start_compute(hostname='test_compute0') + self.start_compute(hostname='test_compute1') + + flavor_a_id = self._create_flavor( + vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'}) + server = self._create_server(flavor_id=flavor_a_id) + + src_host = server['OS-EXT-SRV-ATTR:host'] + self._assert_pinned_cpus(src_host, 2) + + # we don't really care what the new flavor is, so long as the old + # flavor is using pinning. We use a similar flavor for simplicity. 
+ flavor_b_id = self._create_flavor( + vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'}) + + orig_rpc_finish_resize = nova.compute.rpcapi.ComputeAPI.finish_resize + + # Simulate that the finish_resize call overlaps with an + # update_available_resource periodic job + def inject_periodic_to_finish_resize(*args, **kwargs): + self._run_periodics() + return orig_rpc_finish_resize(*args, **kwargs) + + self.stub_out( + 'nova.compute.rpcapi.ComputeAPI.finish_resize', + inject_periodic_to_finish_resize, + ) + + # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should + # probably be less...dumb + with mock.patch( + 'nova.virt.libvirt.driver.LibvirtDriver' + '.migrate_disk_and_power_off', return_value='{}', + ): + post = {'resize': {'flavorRef': flavor_b_id}} + self.api.post_server_action(server['id'], post) + server = self._wait_for_state_change(server, 'VERIFY_RESIZE') + + dst_host = server['OS-EXT-SRV-ATTR:host'] + + # we have 2 cpus pinned on both computes. The source should have it + # due to the outbound migration and the destination due to the + # instance running there + self._assert_pinned_cpus(src_host, 2) + self._assert_pinned_cpus(dst_host, 2) + + return server, src_host, dst_host + + def test_resize_confirm_bug_1944759(self): + server, src_host, dst_host = ( + self._create_server_and_resize_bug_1944759()) + + # Now confirm the resize + post = {'confirmResize': None} + + self.api.post_server_action(server['id'], post) + self._wait_for_state_change(server, 'ACTIVE') + + # the resource allocation reflects that the VM is running on the dest + # node + self._assert_pinned_cpus(src_host, 0) + self._assert_pinned_cpus(dst_host, 2) + + # and running periodics does not break it either + self._run_periodics() + + self._assert_pinned_cpus(src_host, 0) + self._assert_pinned_cpus(dst_host, 2) + + def test_resize_revert_bug_1944759(self): + server, src_host, dst_host = ( + self._create_server_and_resize_bug_1944759()) + + # Now revert the resize + post = {'revertResize': None} + + # reverts actually succeeds (not like confirm) but the resource + # allocation is still flaky + self.api.post_server_action(server['id'], post) + self._wait_for_state_change(server, 'ACTIVE') + + # After the revert the source host should have 2 cpus pinned due to + # the instance. 
+ self._assert_pinned_cpus(src_host, 2) + self._assert_pinned_cpus(dst_host, 0) + + # running the periodic job will not break it either + self._run_periodics() + + self._assert_pinned_cpus(src_host, 2) + self._assert_pinned_cpus(dst_host, 0) + + def test_resize_dedicated_policy_race_on_dest_bug_1953359(self): + + self.flags(cpu_dedicated_set='0-2', cpu_shared_set=None, + group='compute') + self.flags(vcpu_pin_set=None) + + host_info = fakelibvirt.HostInfo(cpu_nodes=1, cpu_sockets=1, + cpu_cores=2, cpu_threads=1, + kB_mem=15740000) + self.start_compute(host_info=host_info, hostname='compute1') + + extra_spec = { + 'hw:cpu_policy': 'dedicated', + } + flavor_id = self._create_flavor(vcpu=1, extra_spec=extra_spec) + expected_usage = {'DISK_GB': 20, 'MEMORY_MB': 2048, 'PCPU': 1} + + server = self._run_build_test(flavor_id, expected_usage=expected_usage) + + inst = objects.Instance.get_by_uuid(self.ctxt, server['id']) + self.assertEqual(1, len(inst.numa_topology.cells)) + # assert that the pcpu 0 is used on compute1 + self.assertEqual({'0': 0}, inst.numa_topology.cells[0].cpu_pinning_raw) + + # start another compute with the same config + self.start_compute(host_info=host_info, hostname='compute2') + + # boot another instance but now on compute2 so that it occupies the + # pcpu 0 on compute2 + # NOTE(gibi): _run_build_test cannot be used here as it assumes only + # compute1 exists + server2 = self._create_server( + flavor_id=flavor_id, + host='compute2', + ) + inst2 = objects.Instance.get_by_uuid(self.ctxt, server2['id']) + self.assertEqual(1, len(inst2.numa_topology.cells)) + # assert that the pcpu 0 is used + self.assertEqual( + {'0': 0}, inst2.numa_topology.cells[0].cpu_pinning_raw) + + # migrate the first instance from compute1 to compute2 but stop + # migrating at the start of finish_resize. Then start a racing periodic + # update_available_resources. + orig_finish_resize = manager.ComputeManager.finish_resize + + def fake_finish_resize(*args, **kwargs): + # start a racing update_available_resource periodic + self._run_periodics() + # we expect it that CPU pinning fails on the destination node + # as the resource_tracker will use the source node numa_topology + # and that does not fit to the dest node as pcpu 0 in the dest + # is already occupied. + log = self.stdlog.logger.output + # The resize_claim correctly calculates that the instance should be + # pinned to pcpu id 1 instead of 0 + self.assertIn( + 'Computed NUMA topology CPU pinning: usable pCPUs: [[1]], ' + 'vCPUs mapping: [(0, 1)]', + log, + ) + # We expect that the periodic not fails as bug 1953359 is fixed. 
+ log = self.stdlog.logger.output + self.assertIn('Running periodic for compute (compute2)', log) + self.assertNotIn('Error updating resources for node compute2', log) + self.assertNotIn( + 'nova.exception.CPUPinningInvalid: CPU set to pin [0] must be ' + 'a subset of free CPU set [1]', + log, + ) + + # now let the resize finishes + return orig_finish_resize(*args, **kwargs) + + # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should + # probably be less...dumb + with mock.patch('nova.virt.libvirt.driver.LibvirtDriver' + '.migrate_disk_and_power_off', return_value='{}'): + with mock.patch( + 'nova.compute.manager.ComputeManager.finish_resize', + new=fake_finish_resize, + ): + post = {'migrate': None} + # this is expected to succeed + self.admin_api.post_server_action(server['id'], post) + + server = self._wait_for_state_change(server, 'VERIFY_RESIZE') + + # As bug 1953359 is fixed the revert should succeed too + post = {'revertResize': {}} + self.admin_api.post_server_action(server['id'], post) + self._wait_for_state_change(server, 'ACTIVE') + self.assertNotIn( + 'nova.exception.CPUUnpinningInvalid: CPU set to unpin [1] must be ' + 'a subset of pinned CPU set [0]', + self.stdlog.logger.output, + ) + class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest): diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py index d494d66a07..0d9463165a 100644 --- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py +++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py @@ -70,6 +70,7 @@ class SRIOVServersTest(_PCIServersTestBase): { 'vendor_id': fakelibvirt.PCI_VEND_ID, 'product_id': fakelibvirt.PF_PROD_ID, + 'physical_network': 'physnet4', }, { 'vendor_id': fakelibvirt.PCI_VEND_ID, @@ -103,6 +104,20 @@ class SRIOVServersTest(_PCIServersTestBase): # fixture already stubbed. self.neutron = self.useFixture(base.LibvirtNeutronFixture(self)) + def _disable_sriov_in_pf(self, pci_info): + # Check for PF and change the capability from virt_functions + # Delete all the VFs + vfs_to_delete = [] + + for device_name, device in pci_info.devices.items(): + if 'virt_functions' in device.pci_device: + device.generate_xml(skip_capability=True) + elif 'phys_function' in device.pci_device: + vfs_to_delete.append(device_name) + + for device in vfs_to_delete: + del pci_info.devices[device] + def test_create_server_with_VF(self): """Create a server with an SR-IOV VF-type PCI device.""" @@ -266,6 +281,69 @@ class SRIOVServersTest(_PCIServersTestBase): ) self.assertIsNone(diagnostics['nic_details'][1]['tx_packets']) + def test_create_server_after_change_in_nonsriov_pf_to_sriov_pf(self): + # Starts a compute with PF not configured with SRIOV capabilities + # Updates the PF with SRIOV capability and restart the compute service + # Then starts a VM with the sriov port. The VM should be in active + # state with sriov port attached. + + # To emulate the device type changing, we first create a + # HostPCIDevicesInfo object with PFs and VFs. Then we make a copy + # and remove the VFs and the virt_function capability. This is + # done to ensure the physical function product id is same in both + # the versions. 
+ pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=1) + pci_info_no_sriov = copy.deepcopy(pci_info) + + # Disable SRIOV capabilties in PF and delete the VFs + self._disable_sriov_in_pf(pci_info_no_sriov) + + fake_connection = self._get_connection(pci_info=pci_info_no_sriov, + hostname='test_compute0') + self.mock_conn.return_value = fake_connection + + self.compute = self.start_service('compute', host='test_compute0') + + ctxt = context.get_admin_context() + pci_devices = objects.PciDeviceList.get_by_compute_node( + ctxt, + objects.ComputeNode.get_by_nodename( + ctxt, 'test_compute0', + ).id, + ) + self.assertEqual(1, len(pci_devices)) + self.assertEqual('type-PCI', pci_devices[0].dev_type) + + # Update connection with original pci info with sriov PFs + fake_connection = self._get_connection(pci_info=pci_info, + hostname='test_compute0') + self.mock_conn.return_value = fake_connection + + # Restart the compute service + self.restart_compute_service(self.compute) + + # Verify if PCI devices are of type type-PF or type-VF + pci_devices = objects.PciDeviceList.get_by_compute_node( + ctxt, + objects.ComputeNode.get_by_nodename( + ctxt, 'test_compute0', + ).id, + ) + for pci_device in pci_devices: + self.assertIn(pci_device.dev_type, ['type-PF', 'type-VF']) + + # create the port + self.neutron.create_port({'port': self.neutron.network_4_port_1}) + + # create a server using the VF via neutron + flavor_id = self._create_flavor() + self._create_server( + flavor_id=flavor_id, + networks=[ + {'port': base.LibvirtNeutronFixture.network_4_port_1['id']}, + ], + ) + class SRIOVAttachDetachTest(_PCIServersTestBase): # no need for aliases as these test will request SRIOV via neutron diff --git a/nova/tests/functional/regressions/test_bug_1896463.py b/nova/tests/functional/regressions/test_bug_1896463.py new file mode 100644 index 0000000000..dc74791e0e --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1896463.py @@ -0,0 +1,222 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import fixtures +import time + +from oslo_config import cfg + +from nova import context +from nova import objects +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import fixtures as func_fixtures +from nova.tests.functional import integrated_helpers +from nova import utils +from nova.virt import fake + + +CONF = cfg.CONF + + +class TestEvacuateResourceTrackerRace( + test.TestCase, integrated_helpers.InstanceHelperMixin, +): + """Demonstrate bug #1896463. + + Trigger a race condition between an almost finished evacuation that is + dropping the migration context, and the _update_available_resource() + periodic task that already loaded the instance list but haven't loaded the + migration list yet. The result is that the PCI allocation made by the + evacuation is deleted by the overlapping periodic task run and the instance + will not have PCI allocation after the evacuation. 
+ """ + + def setUp(self): + super().setUp() + self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self)) + self.glance = self.useFixture(nova_fixtures.GlanceFixture(self)) + self.placement = self.useFixture(func_fixtures.PlacementFixture()).api + + self.api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + + self.admin_api = self.api_fixture.admin_api + self.admin_api.microversion = 'latest' + self.api = self.admin_api + + self.start_service('conductor') + self.start_service('scheduler') + + self.flags(compute_driver='fake.FakeDriverWithPciResources') + self.useFixture( + fake.FakeDriverWithPciResources. + FakeDriverWithPciResourcesConfigFixture()) + + self.compute1 = self._start_compute('host1') + self.compute1_id = self._get_compute_node_id_by_host('host1') + self.compute1_service_id = self.admin_api.get_services( + host='host1', binary='nova-compute')[0]['id'] + + self.compute2 = self._start_compute('host2') + self.compute2_id = self._get_compute_node_id_by_host('host2') + self.compute2_service_id = self.admin_api.get_services( + host='host2', binary='nova-compute')[0]['id'] + + # add extra ports and the related network to the neutron fixture + # specifically for these tests. It cannot be added globally in the + # fixture init as it adds a second network that makes auto allocation + # based test to fail due to ambiguous networks. + self.neutron._ports[self.neutron.sriov_port['id']] = \ + copy.deepcopy(self.neutron.sriov_port) + self.neutron._networks[ + self.neutron.network_2['id']] = self.neutron.network_2 + self.neutron._subnets[ + self.neutron.subnet_2['id']] = self.neutron.subnet_2 + + self.ctxt = context.get_admin_context() + + def _get_compute_node_id_by_host(self, host): + # we specifically need the integer id of the node not the UUID so we + # need to use the old microversion + with utils.temporary_mutation(self.admin_api, microversion='2.52'): + hypers = self.admin_api.api_get( + 'os-hypervisors').body['hypervisors'] + for hyper in hypers: + if hyper['hypervisor_hostname'] == host: + return hyper['id'] + + self.fail('Hypervisor with hostname=%s not found' % host) + + def _assert_pci_device_allocated( + self, instance_uuid, compute_node_id, num=1): + """Assert that a given number of PCI devices are allocated to the + instance on the given host. + """ + + devices = objects.PciDeviceList.get_by_instance_uuid( + self.ctxt, instance_uuid) + devices_on_host = [dev for dev in devices + if dev.compute_node_id == compute_node_id] + self.assertEqual(num, len(devices_on_host)) + + def test_evacuate_races_with_update_available_resource(self): + # Create a server with a direct port to have PCI allocation + server = self._create_server( + name='test-server-for-bug-1896463', + networks=[{'port': self.neutron.sriov_port['id']}], + host='host1' + ) + + self._assert_pci_device_allocated(server['id'], self.compute1_id) + self._assert_pci_device_allocated( + server['id'], self.compute2_id, num=0) + + # stop and force down the compute the instance is on to allow + # evacuation + self.compute1.stop() + self.admin_api.put_service( + self.compute1_service_id, {'forced_down': 'true'}) + + # Inject some sleeps both in the Instance.drop_migration_context and + # the MigrationList.get_in_progress_and_error code to make them + # overlap. + # We want to create the following execution scenario: + # 1) The evacuation makes a move claim on the dest including the PCI + # claim. This means there is a migration context. 
But the evacuation + # is not complete yet so the instance.host does not point to the + # dest host. + # 2) The dest resource tracker starts an _update_available_resource() + # periodic task and this task loads the list of instances on its + # host from the DB. Our instance is not in this list due to #1. + # 3) The evacuation finishes, the instance.host is set to the dest host + # and the migration context is deleted. + # 4) The periodic task now loads the list of in-progress migration from + # the DB to check for incoming our outgoing migrations. However due + # to #3 our instance is not in this list either. + # 5) The periodic task cleans up every lingering PCI claim that is not + # connected to any instance collected above from the instance list + # and from the migration list. As our instance is not in either of + # the lists, the resource tracker cleans up the PCI allocation for + # the already finished evacuation of our instance. + # + # Unfortunately we cannot reproduce the above situation without sleeps. + # We need that the evac starts first then the periodic starts, but not + # finishes, then evac finishes, then periodic finishes. If I trigger + # and run the whole periodic in a wrapper of drop_migration_context + # then I could not reproduce the situation described at #4). In general + # it is not + # + # evac + # | + # | + # | periodic + # | | + # | | + # | x + # | + # | + # x + # + # but + # + # evac + # | + # | + # | periodic + # | | + # | | + # | | + # x | + # | + # x + # + # what is needed need. + # + # Starting the periodic from the test in a separate thread at + # drop_migration_context() might work but that is an extra complexity + # in the test code. Also it might need a sleep still to make the + # reproduction stable but only one sleep instead of two. + orig_drop = objects.Instance.drop_migration_context + + def slow_drop(*args, **kwargs): + time.sleep(1) + return orig_drop(*args, **kwargs) + + self.useFixture( + fixtures.MockPatch( + 'nova.objects.instance.Instance.drop_migration_context', + new=slow_drop)) + + orig_get_mig = objects.MigrationList.get_in_progress_and_error + + def slow_get_mig(*args, **kwargs): + time.sleep(2) + return orig_get_mig(*args, **kwargs) + + self.useFixture( + fixtures.MockPatch( + 'nova.objects.migration.MigrationList.' + 'get_in_progress_and_error', + new=slow_get_mig)) + + self.admin_api.post_server_action(server['id'], {'evacuate': {}}) + # we trigger the _update_available_resource periodic to overlap with + # the already started evacuation + self._run_periodics() + + self._wait_for_server_parameter( + server, {'OS-EXT-SRV-ATTR:host': 'host2', 'status': 'ACTIVE'}) + + self._assert_pci_device_allocated(server['id'], self.compute1_id) + self._assert_pci_device_allocated(server['id'], self.compute2_id) diff --git a/nova/tests/functional/regressions/test_bug_1899649.py b/nova/tests/functional/regressions/test_bug_1899649.py new file mode 100644 index 0000000000..be75ea947f --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1899649.py @@ -0,0 +1,100 @@ +# Copyright 2020, Red Hat, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional.libvirt import base +from nova.tests.unit.virt.libvirt import fakelibvirt + + +class TestVolAttachmentsAfterFailureToScheduleOrBuild(base.ServersTestBase): + """Regression test for bug . + + This regression test aims to ensure a volume attachment remains in place + after a failure to either schedule a server or when building a server + directly on a compute after skipping the scheduler. + + A volume attachment is required to remain after such failures to ensure the + volume itself remains marked as reserved. + + To ensure this is as accurate as possible the tests use the libvirt + functional base class to mimic a real world example with NUMA nodes being + requested via flavor extra specs. The underlying compute being unable to + meet this request ensuring a failure. + """ + + microversion = 'latest' + + def setUp(self): + super().setUp() + + # Launch a single libvirt based compute service with a single NUMA node + host_info = fakelibvirt.HostInfo( + cpu_nodes=1, cpu_sockets=1, cpu_cores=2, kB_mem=15740000) + self.start_compute(host_info=host_info, hostname='compute1') + + # Use a flavor requesting 2 NUMA nodes that we know will always fail + self.flavor_id = self._create_flavor(extra_spec={'hw:numa_nodes': '2'}) + + # Craft a common bfv server request for use within each test + self.volume_id = nova_fixtures.CinderFixture.IMAGE_BACKED_VOL + self.server = { + 'name': 'test', + 'flavorRef': self.flavor_id, + 'imageRef': '', + 'networks': 'none', + 'block_device_mapping_v2': [{ + 'source_type': 'volume', + 'destination_type': 'volume', + 'boot_index': 0, + 'uuid': self.volume_id}] + } + + def _assert_failure_and_volume_attachments(self, server): + # Assert that the server is in an ERROR state + self._wait_for_state_change(server, 'ERROR') + + # Assert that the volume is in a reserved state. As this isn't modelled + # by the CinderFixture we just assert that a single volume attachment + # remains after the failure and that it is referenced by the server. 
+ attachments = self.cinder.volume_to_attachment.get(self.volume_id) + self.assertEqual(1, len(attachments)) + self.assertIn( + self.volume_id, self.cinder.volume_ids_for_instance(server['id'])) + + def test_failure_to_schedule(self): + # Assert that a volume attachment remains after a failure to schedule + server = self.api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) + + def test_failure_to_schedule_with_az(self): + # Assert that a volume attachment remains after a failure to schedule + # with the addition of an availability_zone in the request + self.server['availability_zone'] = 'nova' + server = self.api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) + + def test_failure_to_schedule_with_host(self): + # Assert that a volume attachment remains after a failure to schedule + # using the optional host parameter introduced in microversion 2.74 + self.server['host'] = 'compute1' + server = self.admin_api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) + + def test_failure_to_build_with_az_and_host(self): + # Assert that a volume attachments remain after a failure to + # build and reschedule by providing an availability_zone *and* host, + # skipping the scheduler. This is bug #1899649. + self.server['availability_zone'] = 'nova:compute1' + server = self.admin_api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) diff --git a/nova/tests/functional/regressions/test_bug_1902925.py b/nova/tests/functional/regressions/test_bug_1902925.py new file mode 100644 index 0000000000..fb5f5251e5 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1902925.py @@ -0,0 +1,44 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
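# Regression test for bug 1902925: exercises rebuilding a server while the
# compute RPC API is pinned to an older 5.x version via the
# [upgrade_levels]/compute option; both the oldest (5.0) and a more recent
# (5.12) cap are covered below. The CastAsCall fixture used in setUp turns
# RPC casts into calls so that any failure in the pinned rebuild path
# surfaces directly in the test.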
+from nova.tests.functional import integrated_helpers +from nova.tests.unit import cast_as_call + + +class ComputeVersion5xPinnedRpcTests(integrated_helpers._IntegratedTestBase): + + compute_driver = 'fake.MediumFakeDriver' + ADMIN_API = True + api_major_version = 'v2.1' + microversion = 'latest' + + def setUp(self): + super(ComputeVersion5xPinnedRpcTests, self).setUp() + self.useFixture(cast_as_call.CastAsCall(self)) + + self.compute1 = self._start_compute(host='host1') + + def _test_rebuild_instance_with_compute_rpc_pin(self, version_cap): + self.flags(compute=version_cap, group='upgrade_levels') + + server_req = self._build_server(networks='none') + server = self.api.post_server({'server': server_req}) + server = self._wait_for_state_change(server, 'ACTIVE') + + self.api.post_server_action(server['id'], {'rebuild': { + 'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6' + }}) + + def test_rebuild_instance_5_0(self): + self._test_rebuild_instance_with_compute_rpc_pin('5.0') + + def test_rebuild_instance_5_12(self): + self._test_rebuild_instance_with_compute_rpc_pin('5.12') diff --git a/nova/tests/functional/regressions/test_bug_1914777.py b/nova/tests/functional/regressions/test_bug_1914777.py new file mode 100644 index 0000000000..651ddfeb3d --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1914777.py @@ -0,0 +1,138 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import mock + +from nova import context as nova_context +from nova import exception +from nova import objects +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import integrated_helpers +from nova.tests.unit import policy_fixture + + +class TestDeleteWhileBooting(test.TestCase, + integrated_helpers.InstanceHelperMixin): + """This tests race scenarios where an instance is deleted while booting. + + In these scenarios, the nova-api service is racing with nova-conductor + service; nova-conductor is in the middle of booting the instance when + nova-api begins fulfillment of a delete request. As the two services + delete records out from under each other, both services need to handle + it properly such that a delete request will always be fulfilled. + + Another scenario where two requests can race and delete things out from + under each other is if two or more delete requests are racing while the + instance is booting. + + In order to force things into states where bugs have occurred, we must + mock some object retrievals from the database to simulate the different + points at which a delete request races with a create request or another + delete request. We aim to mock only the bare minimum necessary to recreate + the bug scenarios. 
+ """ + def setUp(self): + super(TestDeleteWhileBooting, self).setUp() + self.useFixture(policy_fixture.RealPolicyFixture()) + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(nova_fixtures.GlanceFixture(self)) + + api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + self.api = api_fixture.api + + self.ctxt = nova_context.get_context() + + # We intentionally do not start a conductor or scheduler service, since + # our goal is to simulate an instance that has not been scheduled yet. + + # Kick off a server create request and move on once it's in the BUILD + # state. Since we have no conductor or scheduler service running, the + # server will "hang" in an unscheduled state for testing. + self.server = self._create_server(expected_state='BUILD') + # Simulate that a different request has deleted the build request + # record after this delete request has begun processing. (The first + # lookup of the build request occurs in the servers API to get the + # instance object in order to delete it). + # We need to get the build request now before we mock the method. + self.br = objects.BuildRequest.get_by_instance_uuid( + self.ctxt, self.server['id']) + + @mock.patch('nova.objects.build_request.BuildRequest.get_by_instance_uuid') + def test_build_request_and_instance_not_found(self, mock_get_br): + """This tests a scenario where another request has deleted the build + request record and the instance record ahead of us. + """ + # The first lookup at the beginning of the delete request in the + # ServersController succeeds and the second lookup to handle "delete + # while booting" in compute/api fails after a different request has + # deleted it. + br_not_found = exception.BuildRequestNotFound(uuid=self.server['id']) + mock_get_br.side_effect = [self.br, br_not_found, br_not_found] + self._delete_server(self.server) + + @mock.patch('nova.objects.build_request.BuildRequest.get_by_instance_uuid') + @mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid') + @mock.patch('nova.objects.instance.Instance.get_by_uuid') + def test_deleting_instance_at_the_same_time(self, mock_get_i, mock_get_im, + mock_get_br): + """This tests the scenario where another request is trying to delete + the instance record at the same time we are, while the instance is + booting. An example of this: while the create and delete are running at + the same time, the delete request deletes the build request, the create + request finds the build request already deleted when it tries to delete + it. The create request deletes the instance record and then delete + request tries to lookup the instance after it deletes the build + request. Its attempt to lookup the instance fails because the create + request already deleted it. + """ + # First lookup at the beginning of the delete request in the + # ServersController succeeds, second lookup to handle "delete while + # booting" in compute/api fails after the conductor has deleted it. + br_not_found = exception.BuildRequestNotFound(uuid=self.server['id']) + mock_get_br.side_effect = [self.br, br_not_found] + # Simulate the instance transitioning from having no cell assigned to + # having a cell assigned while the delete request is being processed. + # First lookup of the instance mapping has the instance unmapped (no + # cell) and subsequent lookups have the instance mapped to cell1. 
+ no_cell_im = objects.InstanceMapping( + context=self.ctxt, instance_uuid=self.server['id'], + cell_mapping=None) + has_cell_im = objects.InstanceMapping( + context=self.ctxt, instance_uuid=self.server['id'], + cell_mapping=self.cell_mappings['cell1']) + mock_get_im.side_effect = [ + no_cell_im, has_cell_im, has_cell_im, has_cell_im, has_cell_im] + # Simulate that the instance object has been created by the conductor + # in the create path while the delete request is being processed. + # First lookups are before the instance has been deleted and the last + # lookup is after the conductor has deleted the instance. Use the build + # request to make an instance object for testing. + i = self.br.get_new_instance(self.ctxt) + i_not_found = exception.InstanceNotFound(instance_id=self.server['id']) + mock_get_i.side_effect = [i, i, i, i_not_found, i_not_found] + + # Simulate that the conductor is running instance_destroy at the same + # time as we are. + def fake_instance_destroy(*args, **kwargs): + # NOTE(melwitt): This is a misleading exception, as it is not only + # raised when a constraint on 'host' is not met, but also when two + # instance_destroy calls are racing. In this test, the soft delete + # returns 0 rows affected because another request soft deleted the + # record first. + raise exception.ObjectActionError( + action='destroy', reason='host changed') + + self.stub_out( + 'nova.objects.instance.Instance.destroy', fake_instance_destroy) + self._delete_server(self.server) diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py index 8c81ac1cf3..311bc1b1cb 100644 --- a/nova/tests/functional/test_nova_manage.py +++ b/nova/tests/functional/test_nova_manage.py @@ -1767,6 +1767,7 @@ class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase): def setUp(self): super(TestDBArchiveDeletedRows, self).setUp() + self.enforce_fk_constraints() self.cli = manage.DbCommands() self.output = StringIO() self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output)) @@ -1812,6 +1813,7 @@ class TestDBArchiveDeletedRowsMultiCell(integrated_helpers.InstanceHelperMixin, def setUp(self): super(TestDBArchiveDeletedRowsMultiCell, self).setUp() + self.enforce_fk_constraints() self.useFixture(nova_fixtures.NeutronFixture(self)) self.useFixture(nova_fixtures.GlanceFixture(self)) self.useFixture(func_fixtures.PlacementFixture()) diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index dd52864056..426745cb8a 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -40,6 +40,8 @@ from nova.network import constants from nova.network import neutron as neutronapi from nova import objects from nova.objects import block_device as block_device_obj +from nova.policies import base as base_policies +from nova.policies import servers as servers_policies from nova.scheduler import utils from nova import test from nova.tests import fixtures as nova_fixtures @@ -8256,3 +8258,115 @@ class AcceleratorServerOpsTest(AcceleratorServerBase): 'OS-DCF:diskConfig': 'AUTO'}}) self.assertEqual(403, ex.response.status_code) self._check_allocations_usage(self.server) + + +class CrossCellResizeWithQoSPort(PortResourceRequestBasedSchedulingTestBase): + NUMBER_OF_CELLS = 2 + + def setUp(self): + # Use our custom weigher defined above to make sure that we have + # a predictable host order in the alternate list returned by the + # scheduler for migration. 
+ self.useFixture(nova_fixtures.HostNameWeigherFixture()) + super(CrossCellResizeWithQoSPort, self).setUp() + # start compute2 in cell2, compute1 is started in cell1 by default + self.compute2 = self._start_compute('host2', cell_name='cell2') + self.compute2_rp_uuid = self._get_provider_uuid_by_host('host2') + self._create_networking_rp_tree('host2', self.compute2_rp_uuid) + self.compute2_service_id = self.admin_api.get_services( + host='host2', binary='nova-compute')[0]['id'] + + # Enable cross-cell resize policy since it defaults to not allow + # anyone to perform that type of operation. For these tests we'll + # just allow admins to perform cross-cell resize. + self.policy.set_rules({ + servers_policies.CROSS_CELL_RESIZE: + base_policies.RULE_ADMIN_API}, + overwrite=False) + + def test_cross_cell_migrate_server_with_qos_ports(self): + """Test that cross cell migration is not supported with qos ports and + nova therefore falls back to do a same cell migration instead. + To test this properly we first make sure that there is no valid host + in the same cell but there is valid host in another cell and observe + that the migration fails with NoValidHost. Then we start a new compute + in the same cell the instance is in and retry the migration that is now + expected to pass. + """ + + non_qos_normal_port = self.neutron.port_1 + qos_normal_port = self.neutron.port_with_resource_request + qos_sriov_port = self.neutron.port_with_sriov_resource_request + + server = self._create_server_with_ports_and_check_allocation( + non_qos_normal_port, qos_normal_port, qos_sriov_port) + + orig_create_binding = neutronapi.API._create_port_binding + + hosts = { + 'host1': self.compute1_rp_uuid, 'host2': self.compute2_rp_uuid} + + # Add an extra check to our neutron fixture. This check makes sure that + # the RP sent in the binding corresponds to host of the binding. In a + # real deployment this is checked by the Neutron server. As bug + # 1907522 showed we fail this check for cross cell migration with qos + # ports in a real deployment. So to reproduce that bug we need to have + # the same check in our test env too. + def spy_on_create_binding(context, client, port_id, data): + host_rp_uuid = hosts[data['binding']['host']] + device_rp_uuid = data['binding']['profile'].get('allocation') + if port_id == qos_normal_port['id']: + if device_rp_uuid != self.ovs_bridge_rp_per_host[host_rp_uuid]: + raise exception.PortBindingFailed(port_id=port_id) + elif port_id == qos_sriov_port['id']: + if (device_rp_uuid not in + self.sriov_dev_rp_per_host[host_rp_uuid].values()): + raise exception.PortBindingFailed(port_id=port_id) + + return orig_create_binding(context, client, port_id, data) + + with mock.patch( + 'nova.network.neutron.API._create_port_binding', + side_effect=spy_on_create_binding, autospec=True + ): + # We expect the migration to fail as the only available target + # host is in a different cell and while cross cell migration is + # enabled it is not supported for neutron ports with resource + # request. 
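            # The migrate request below is expected to fail cleanly with
            # NoValidHost once nova falls back to a same-cell resize, since
            # host1 is the only compute in cell1 at this point. The log
            # assertions further down also guard against the failure modes
            # seen in bug 1907522: a PortBindingFailed and the "'NoneType'
            # object has no attribute 'version'" AttributeError.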
+ self.api.post_server_action(server['id'], {'migrate': None}) + self._wait_for_migration_status(server, ['error']) + self._wait_for_server_parameter( + server, + {'status': 'ACTIVE', 'OS-EXT-SRV-ATTR:host': 'host1'}) + event = self._wait_for_action_fail_completion( + server, 'migrate', 'conductor_migrate_server') + self.assertIn( + 'exception.NoValidHost', event['traceback']) + log = self.stdlog.logger.output + self.assertIn( + 'Request is allowed by policy to perform cross-cell resize ' + 'but the instance has ports with resource request and ' + 'cross-cell resize is not supported with such ports.', + log) + self.assertNotIn( + 'nova.exception.PortBindingFailed: Binding failed for port', + log) + self.assertNotIn( + "AttributeError: 'NoneType' object has no attribute 'version'", + log) + + # Now start a new compute in the same cell as the instance and retry + # the migration. + self._start_compute('host3', cell_name='cell1') + self.compute3_rp_uuid = self._get_provider_uuid_by_host('host3') + self._create_networking_rp_tree('host3', self.compute3_rp_uuid) + + with mock.patch( + 'nova.network.neutron.API._create_port_binding', + side_effect=spy_on_create_binding, autospec=True + ): + server = self._migrate_server(server) + self.assertEqual('host3', server['OS-EXT-SRV-ATTR:host']) + + self._delete_server_and_check_allocations( + server, qos_normal_port, qos_sriov_port) diff --git a/nova/tests/functional/test_service.py b/nova/tests/functional/test_service.py index 962245cf70..6e7fdc6b30 100644 --- a/nova/tests/functional/test_service.py +++ b/nova/tests/functional/test_service.py @@ -10,7 +10,10 @@ # License for the specific language governing permissions and limitations # under the License. +from unittest import mock + from nova import context as nova_context +from nova.objects import service from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional import fixtures as func_fixtures @@ -97,3 +100,40 @@ class ServiceTestCase(test.TestCase, self.metadata.start() # Cell cache should be empty after the service reset. 
self.assertEqual({}, nova_context.CELL_CACHE) + + +class TestOldComputeCheck( + test.TestCase, integrated_helpers.InstanceHelperMixin): + + def test_conductor_warns_if_old_compute(self): + old_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=old_version): + self.start_service('conductor') + self.assertIn( + 'Current Nova version does not support computes older than', + self.stdlog.logger.output) + + def test_api_warns_if_old_compute(self): + old_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=old_version): + self.useFixture(nova_fixtures.OSAPIFixture(api_version='v2.1')) + self.assertIn( + 'Current Nova version does not support computes older than', + self.stdlog.logger.output) + + def test_compute_warns_if_old_compute(self): + old_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=old_version): + self._start_compute('host1') + self.assertIn( + 'Current Nova version does not support computes older than', + self.stdlog.logger.output) diff --git a/nova/tests/unit/api/openstack/compute/test_server_reset_state.py b/nova/tests/unit/api/openstack/compute/test_server_reset_state.py index 591ab493a4..3462cf21ac 100644 --- a/nova/tests/unit/api/openstack/compute/test_server_reset_state.py +++ b/nova/tests/unit/api/openstack/compute/test_server_reset_state.py @@ -18,6 +18,7 @@ import webob from nova.api.openstack.compute import admin_actions \ as admin_actions_v21 +from nova.compute import instance_actions from nova.compute import vm_states from nova import exception from nova import objects @@ -59,18 +60,27 @@ class ResetStateTestsV21(test.NoDBTestCase): def _get_request(self): return fakes.HTTPRequest.blank('') + @mock.patch( + 'nova.objects.instance_action.InstanceAction.action_start', + new=mock.Mock(spec=objects.InstanceAction)) def test_no_state(self): self.assertRaises(self.bad_request, self.admin_api._reset_state, self.request, self.uuid, body={"os-resetState": None}) + @mock.patch( + 'nova.objects.instance_action.InstanceAction.action_start', + new=mock.Mock(spec=objects.InstanceAction)) def test_bad_state(self): self.assertRaises(self.bad_request, self.admin_api._reset_state, self.request, self.uuid, body={"os-resetState": {"state": "spam"}}) + @mock.patch( + 'nova.objects.instance_action.InstanceAction.action_start', + new=mock.Mock(spec=objects.InstanceAction)) def test_no_instance(self): self.compute_api.get = mock.MagicMock( side_effect=exception.InstanceNotFound(instance_id='inst_ud')) @@ -84,11 +94,14 @@ class ResetStateTestsV21(test.NoDBTestCase): self.context, self.uuid, expected_attrs=None, cell_down_support=False) - def test_reset_active(self): + @mock.patch('nova.objects.instance_action.InstanceAction.action_start') + def test_reset_active(self, mock_instance_action_start): expected = dict(vm_state=vm_states.ACTIVE, task_state=None) self.instance.save = mock.MagicMock( side_effect=lambda **kw: self._check_instance_state(expected)) self.compute_api.get = mock.MagicMock(return_value=self.instance) + mock_instance_action = mock.Mock(spec=objects.InstanceAction) + mock_instance_action_start.return_value = mock_instance_action body = {"os-resetState": {"state": "active"}} result = 
self.admin_api._reset_state(self.request, self.uuid, @@ -107,11 +120,18 @@ class ResetStateTestsV21(test.NoDBTestCase): cell_down_support=False) self.instance.save.assert_called_once_with(admin_state_reset=True) - def test_reset_error(self): + mock_instance_action_start.assert_called_once_with( + self.context, self.instance.uuid, instance_actions.RESET_STATE) + mock_instance_action.finish.assert_called_once() + + @mock.patch('nova.objects.instance_action.InstanceAction.action_start') + def test_reset_error(self, mock_instance_action_start): expected = dict(vm_state=vm_states.ERROR, task_state=None) self.instance.save = mock.MagicMock( side_effect=lambda **kw: self._check_instance_state(expected)) self.compute_api.get = mock.MagicMock(return_value=self.instance) + mock_instance_action = mock.Mock(spec=objects.InstanceAction) + mock_instance_action_start.return_value = mock_instance_action body = {"os-resetState": {"state": "error"}} result = self.admin_api._reset_state(self.request, self.uuid, @@ -129,3 +149,7 @@ class ResetStateTestsV21(test.NoDBTestCase): self.context, self.instance.uuid, expected_attrs=None, cell_down_support=False) self.instance.save.assert_called_once_with(admin_state_reset=True) + + mock_instance_action_start.assert_called_once_with( + self.context, self.instance.uuid, instance_actions.RESET_STATE) + mock_instance_action.finish.assert_called_once() diff --git a/nova/tests/unit/api/openstack/compute/test_services.py b/nova/tests/unit/api/openstack/compute/test_services.py index 170b39f208..94263833c7 100644 --- a/nova/tests/unit/api/openstack/compute/test_services.py +++ b/nova/tests/unit/api/openstack/compute/test_services.py @@ -701,6 +701,21 @@ class ServicesTestV21(test.TestCase): mock_get_compute_nodes.assert_called_once_with( self.req.environ['nova.context'], compute.host) + @mock.patch( + 'nova.objects.ComputeNodeList.get_all_by_host', + side_effect=exception.ComputeHostNotFound(host='fake-compute-host')) + def test_services_delete_compute_host_not_found( + self, mock_get_all_by_host): + compute = objects.Service(self.ctxt, + **{'host': 'fake-compute-host', + 'binary': 'nova-compute', + 'topic': 'compute', + 'report_count': 0}) + compute.create() + self.controller.delete(self.req, compute.id) + mock_get_all_by_host.assert_called_with( + self.req.environ['nova.context'], 'fake-compute-host') + def test_services_delete_not_found(self): self.assertRaises(webob.exc.HTTPNotFound, diff --git a/nova/tests/unit/api/openstack/test_requestlog.py b/nova/tests/unit/api/openstack/test_requestlog.py index 386b79a528..0a25f7c51f 100644 --- a/nova/tests/unit/api/openstack/test_requestlog.py +++ b/nova/tests/unit/api/openstack/test_requestlog.py @@ -42,7 +42,8 @@ class TestRequestLogMiddleware(testtools.TestCase): # this is the minimal set of magic mocks needed to convince # the API service it can start on it's own without a database. mocks = ['nova.objects.Service.get_by_host_and_binary', - 'nova.objects.Service.create'] + 'nova.objects.Service.create', + 'nova.utils.raise_if_old_compute'] self.stdlog = fixtures.StandardLogging() self.useFixture(self.stdlog) for m in mocks: diff --git a/nova/tests/unit/api/openstack/test_wsgi_app.py b/nova/tests/unit/api/openstack/test_wsgi_app.py new file mode 100644 index 0000000000..4cb7459c98 --- /dev/null +++ b/nova/tests/unit/api/openstack/test_wsgi_app.py @@ -0,0 +1,85 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tempfile + +import fixtures +import mock +from oslo_config import fixture as config_fixture +from oslotest import base + +from nova.api.openstack import wsgi_app +from nova import test +from nova.tests import fixtures as nova_fixtures + + +class WSGIAppTest(base.BaseTestCase): + + _paste_config = """ +[app:nova-api] +use = egg:Paste#static +document_root = /tmp + """ + + def setUp(self): + # Ensure BaseTestCase's ConfigureLogging fixture is disabled since + # we're using our own (StandardLogging). + with fixtures.EnvironmentVariable('OS_LOG_CAPTURE', '0'): + super(WSGIAppTest, self).setUp() + self.stdlog = self.useFixture(nova_fixtures.StandardLogging()) + self.conf = tempfile.NamedTemporaryFile(mode='w+t') + self.conf.write(self._paste_config.lstrip()) + self.conf.seek(0) + self.conf.flush() + self.addCleanup(self.conf.close) + # Use of this fixture takes care of cleaning up config settings for + # subsequent tests. + self.useFixture(config_fixture.Config()) + + @mock.patch('sys.argv', return_value=mock.sentinel.argv) + @mock.patch('nova.db.sqlalchemy.api.configure') + @mock.patch('nova.api.openstack.wsgi_app._setup_service') + @mock.patch('nova.api.openstack.wsgi_app._get_config_files') + def test_init_application_called_twice(self, mock_get_files, mock_setup, + mock_db_configure, mock_argv): + """Test that init_application can tolerate being called twice in a + single python interpreter instance. + + When nova-api is run via mod_wsgi, if any exception is raised during + init_application, mod_wsgi will re-run the WSGI script without + restarting the daemon process even when configured for Daemon Mode. + + We access the database as part of init_application, so if nova-api + starts up before the database is up, we'll get, for example, a + DBConnectionError raised during init_application and our WSGI script + will get reloaded/re-run by mod_wsgi. + """ + mock_get_files.return_value = [self.conf.name] + mock_setup.side_effect = [test.TestingException, None] + # We need to mock the global database configure() method, else we will + # be affected by global database state altered by other tests that ran + # before this test, causing this test to fail with + # oslo_db.sqlalchemy.enginefacade.AlreadyStartedError. We can instead + # mock the method to raise an exception if it's called a second time in + # this test to simulate the fact that the database does not tolerate + # re-init [after a database query has been made]. + mock_db_configure.side_effect = [None, test.TestingException] + # Run init_application the first time, simulating an exception being + # raised during it. + self.assertRaises(test.TestingException, wsgi_app.init_application, + 'nova-api') + # Now run init_application a second time, it should succeed since no + # exception is being raised (the init of global data should not be + # re-attempted). 
+ wsgi_app.init_application('nova-api') + self.assertIn('Global data already initialized, not re-initializing.', + self.stdlog.logger.output) diff --git a/nova/tests/unit/cmd/test_status.py b/nova/tests/unit/cmd/test_status.py index 6922a2a6e0..f5f2a8b634 100644 --- a/nova/tests/unit/cmd/test_status.py +++ b/nova/tests/unit/cmd/test_status.py @@ -44,6 +44,7 @@ from nova import exception # NOTE(mriedem): We only use objects as a convenience to populate the database # in the tests, we don't use them in the actual CLI. from nova import objects +from nova.objects import service from nova import policy from nova import test from nova.tests import fixtures as nova_fixtures @@ -667,3 +668,32 @@ class TestUpgradeCheckPolicyJSON(test.NoDBTestCase): jsonutils.dump(self.data, fh) self.assertEqual(upgradecheck.Code.FAILURE, self.cmd._check_policy_json().code) + + +class TestUpgradeCheckOldCompute(test.NoDBTestCase): + + def setUp(self): + super(TestUpgradeCheckOldCompute, self).setUp() + self.cmd = status.UpgradeCommands() + + def test_no_compute(self): + self.assertEqual( + upgradecheck.Code.SUCCESS, self.cmd._check_old_computes().code) + + def test_only_new_compute(self): + last_supported_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=last_supported_version): + self.assertEqual( + upgradecheck.Code.SUCCESS, self.cmd._check_old_computes().code) + + def test_old_compute(self): + too_old = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=too_old): + result = self.cmd._check_old_computes() + self.assertEqual(upgradecheck.Code.WARNING, result.code) diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py index 88dd94ac70..34a866788d 100644 --- a/nova/tests/unit/compute/test_api.py +++ b/nova/tests/unit/compute/test_api.py @@ -4211,7 +4211,7 @@ class _ComputeAPIUnitTestMixIn(object): 'cores': 1 + instance.flavor.vcpus, 'ram': 512 + instance.flavor.memory_mb}, project_id=instance.project_id) - update_qfd.assert_called_once_with(admin_context, instance, False) + update_qfd.assert_called_once_with(admin_context, instance.uuid, False) @mock.patch('nova.objects.Quotas.get_all_by_project_and_user', new=mock.MagicMock()) @@ -4252,7 +4252,7 @@ class _ComputeAPIUnitTestMixIn(object): 'cores': 1 + instance.flavor.vcpus, 'ram': 512 + instance.flavor.memory_mb}, project_id=instance.project_id) - update_qfd.assert_called_once_with(self.context, instance, False) + update_qfd.assert_called_once_with(self.context, instance.uuid, False) @mock.patch.object(objects.InstanceAction, 'action_start') def test_external_instance_event(self, mock_action_start): @@ -5971,7 +5971,8 @@ class _ComputeAPIUnitTestMixIn(object): inst = objects.Instance(uuid=uuid) im = objects.InstanceMapping(instance_uuid=uuid) mock_get.return_value = im - self.compute_api._update_queued_for_deletion(self.context, inst, True) + self.compute_api._update_queued_for_deletion( + self.context, inst.uuid, True) self.assertTrue(im.queued_for_delete) mock_get.assert_called_once_with(self.context, inst.uuid) mock_save.assert_called_once_with() @@ -7522,28 +7523,54 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase): version is not new enough. 
""" instance = objects.Instance( - project_id='fake-project', user_id='fake-user') + project_id='fake-project', user_id='fake-user', + uuid=uuids.instance) + with mock.patch.object(self.context, 'can', return_value=True) as can: + self.assertFalse(self.compute_api._allow_cross_cell_resize( + self.context, instance)) + can.assert_called_once() + mock_get_min_ver.assert_called_once_with( + self.context, ['nova-compute']) + + @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance', + return_value=[objects.RequestGroup()]) + @mock.patch('nova.objects.service.get_minimum_version_all_cells', + return_value=compute_api.MIN_COMPUTE_CROSS_CELL_RESIZE) + def test_allow_cross_cell_resize_false_port_with_resource_req( + self, mock_get_min_ver, mock_get_res_req): + """Policy allows cross-cell resize but minimum nova-compute service + version is not new enough. + """ + instance = objects.Instance( + project_id='fake-project', user_id='fake-user', + uuid=uuids.instance) with mock.patch.object(self.context, 'can', return_value=True) as can: self.assertFalse(self.compute_api._allow_cross_cell_resize( self.context, instance)) can.assert_called_once() mock_get_min_ver.assert_called_once_with( self.context, ['nova-compute']) + mock_get_res_req.assert_called_once_with(self.context, uuids.instance) + @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance', + return_value=[]) @mock.patch('nova.objects.service.get_minimum_version_all_cells', return_value=compute_api.MIN_COMPUTE_CROSS_CELL_RESIZE) - def test_allow_cross_cell_resize_true(self, mock_get_min_ver): + def test_allow_cross_cell_resize_true( + self, mock_get_min_ver, mock_get_res_req): """Policy allows cross-cell resize and minimum nova-compute service version is new enough. """ instance = objects.Instance( - project_id='fake-project', user_id='fake-user') + project_id='fake-project', user_id='fake-user', + uuid=uuids.instance) with mock.patch.object(self.context, 'can', return_value=True) as can: self.assertTrue(self.compute_api._allow_cross_cell_resize( self.context, instance)) can.assert_called_once() mock_get_min_ver.assert_called_once_with( self.context, ['nova-compute']) + mock_get_res_req.assert_called_once_with(self.context, uuids.instance) def _test_block_accelerators(self, instance, args_info, until_service=None): diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 3265f8056b..bb6629908a 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -1111,6 +1111,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, "time this service is starting on this host, then you can ignore " "this warning.", 'fake-node1') + def test_init_host_disk_devices_configuration_failure(self): + self.flags(max_disk_devices_to_attach=0, group='compute') + self.assertRaises(exception.InvalidConfiguration, + self.compute.init_host) + @mock.patch.object(objects.InstanceList, 'get_by_host', new=mock.Mock()) @mock.patch('nova.compute.manager.ComputeManager.' 
@@ -3382,12 +3387,16 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, CONF.host, instance.uuid, graceful_exit=False) return result + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_success(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_fail(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', @@ -3397,7 +3406,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self._test_check_can_live_migrate_destination, do_raise=True) - def test_check_can_live_migrate_destination_contins_vifs(self): + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) + def test_check_can_live_migrate_destination_contains_vifs(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) @@ -3405,6 +3416,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.assertIn('vifs', migrate_data) self.assertIsNotNone(migrate_data.vifs) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_no_binding_extended(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', @@ -3412,18 +3425,40 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, migrate_data = self._test_check_can_live_migrate_destination() self.assertNotIn('vifs', migrate_data) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_src_numa_lm_false(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination(src_numa_lm=False) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_src_numa_lm_true(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination(src_numa_lm=True) + @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') + def test_check_can_live_migrate_destination_fail_group_policy( + self, mock_fail_db): + + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.ACTIVE, + node='fake-node') + + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + exception.MigrationPreCheckError, + self.compute.check_can_live_migrate_destination, + self.context, instance, None, None, None, None) + def test_dest_can_numa_live_migrate(self): positive_dest_check_data = objects.LibvirtLiveMigrateData( dst_supports_numa_live_migration=True) @@ 
-5340,19 +5375,21 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, node = uuidutils.generate_uuid() # ironic node uuid instance = fake_instance.fake_instance_obj(self.context, node=node) instance.migration_context = None + migration = objects.Migration(status='accepted') with test.nested( mock.patch.object(self.compute, '_get_compute_info'), mock.patch.object(self.compute, '_do_rebuild_instance_with_claim'), mock.patch.object(objects.Instance, 'save'), - mock.patch.object(self.compute, '_set_migration_status'), - ) as (mock_get, mock_rebuild, mock_save, mock_set): + mock.patch.object(migration, 'save'), + ) as (mock_get, mock_rebuild, mock_save, mock_migration_save): self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, None, False, - False, False, None, None, {}, None, []) + False, False, migration, None, {}, None, []) self.assertFalse(mock_get.called) self.assertEqual(node, instance.node) - mock_set.assert_called_once_with(None, 'done') + self.assertEqual('done', migration.status) + mock_migration_save.assert_called_once_with() def test_rebuild_node_updated_if_recreate(self): dead_node = uuidutils.generate_uuid() @@ -5365,16 +5402,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, with test.nested( mock.patch.object(self.compute, '_get_compute_info'), mock.patch.object(self.compute, '_do_rebuild_instance'), - mock.patch.object(objects.Instance, 'save'), - mock.patch.object(self.compute, '_set_migration_status'), - ) as (mock_get, mock_rebuild, mock_save, mock_set): + ) as (mock_get, mock_rebuild): mock_get.return_value.hypervisor_hostname = 'new-node' self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, - None, True, False, False, None, None, {}, None, []) + None, True, False, False, mock.sentinel.migration, None, {}, + None, []) mock_get.assert_called_once_with(mock.ANY, self.compute.host) - self.assertEqual('new-node', instance.node) - mock_set.assert_called_once_with(None, 'done') + mock_rt.finish_evacuation.assert_called_once_with( + instance, 'new-node', mock.sentinel.migration) # Make sure the rebuild_claim was called with the proper image_meta # from the instance. 
mock_rt.rebuild_claim.assert_called_once() @@ -6875,14 +6911,13 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): @mock.patch.object(objects.Instance, 'save') @mock.patch.object(manager.ComputeManager, '_nil_out_instance_obj_host_and_node') - @mock.patch.object(manager.ComputeManager, '_cleanup_volumes') @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks') @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state') @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') @mock.patch.object(manager.ComputeManager, '_build_and_run_instance') def test_rescheduled_exception_without_retry(self, - mock_build_run, mock_add, mock_set, mock_clean_net, mock_clean_vol, - mock_nil, mock_save, mock_start, mock_finish): + mock_build_run, mock_add, mock_set, mock_clean_net, mock_nil, + mock_save, mock_start, mock_finish): self._do_build_instance_update(mock_save) mock_build_run.side_effect = exception.RescheduledException(reason='', instance_uuid=self.instance.uuid) @@ -6918,9 +6953,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): self.accel_uuids) mock_clean_net.assert_called_once_with(self.context, self.instance, self.requested_networks) - mock_clean_vol.assert_called_once_with(self.context, - self.instance, self.block_device_mapping, - raise_exc=False) mock_add.assert_called_once_with(self.context, self.instance, mock.ANY, mock.ANY, fault_message=mock.ANY) mock_nil.assert_called_once_with(self.instance) @@ -7530,7 +7562,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): def test_validate_policy_honors_workaround_disabled(self, mock_get): instance = objects.Instance(uuid=uuids.instance) hints = {'group': 'foo'} - mock_get.return_value = objects.InstanceGroup(policy=None) + mock_get.return_value = objects.InstanceGroup(policy=None, + uuid=uuids.group) self.compute._validate_instance_group_policy(self.context, instance, hints) mock_get.assert_called_once_with(self.context, 'foo') @@ -7556,10 +7589,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): instance, hints) mock_get.assert_called_once_with(self.context, uuids.group_hint) + @mock.patch('nova.objects.InstanceGroup.get_by_uuid') @mock.patch('nova.objects.InstanceList.get_uuids_by_host') @mock.patch('nova.objects.InstanceGroup.get_by_hint') - def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint, - mock_get_by_host): + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + @mock.patch('nova.objects.MigrationList.get_in_progress_by_host_and_node') + def test_validate_instance_group_policy_with_rules( + self, migration_list, nodes, mock_get_by_hint, mock_get_by_host, + mock_get_by_uuid): # Create 2 instance in same host, inst2 created before inst1 instance = objects.Instance(uuid=uuids.inst1) hints = {'group': [uuids.group_hint]} @@ -7568,17 +7605,26 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): mock_get_by_host.return_value = existing_insts # if group policy rules limit to 1, raise RescheduledException - mock_get_by_hint.return_value = objects.InstanceGroup( + group = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': '1'}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group + mock_get_by_uuid.return_value = group + nodes.return_value = ['nodename'] + migration_list.return_value = [objects.Migration( + uuid=uuids.migration, instance_uuid=uuids.instance)] 
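        # The extra mocks above (InstanceGroup.get_by_uuid, the driver's
        # get_available_nodes and
        # MigrationList.get_in_progress_by_host_and_node) feed the updated
        # policy check, which re-reads the group from the DB and also counts
        # in-progress migrations targeting this host (hence the Migration
        # stub above) when enforcing max_server_per_host.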
self.assertRaises(exception.RescheduledException, self.compute._validate_instance_group_policy, self.context, instance, hints) # if group policy rules limit change to 2, validate OK - mock_get_by_hint.return_value = objects.InstanceGroup( + group2 = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': 2}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group2 + mock_get_by_uuid.return_value = group2 self.compute._validate_instance_group_policy(self.context, instance, hints) @@ -9105,6 +9151,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, manager.ComputeManager() mock_executor.assert_called_once_with() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_cinder_v3_api(self): # This tests that pre_live_migration with a bdm with an # attachment_id, will create a new attachment and update @@ -9182,6 +9230,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exception_cinder_v3_api(self): # The instance in this test has 2 attachments. The second attach_create # will throw an exception. This will test that the first attachment @@ -9251,6 +9301,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.assertGreater(len(m.mock_calls), 0) _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exceptions_delete_attachments(self): # The instance in this test has 2 attachments. The call to # driver.pre_live_migration will raise an exception. This will test @@ -9319,12 +9371,18 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, """Tests the various ways that _get_neutron_events_for_live_migration will return an empty list. """ + migration = mock.Mock() + migration.is_same_host = lambda: False + self.assertFalse(migration.is_same_host()) + # 1. no timeout self.flags(vif_plugging_timeout=0) with mock.patch.object(self.instance, 'get_network_info') as nw_info: nw_info.return_value = network_model.NetworkInfo( - [network_model.VIF(uuids.port1)]) + [network_model.VIF(uuids.port1, details={ + network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True})]) + self.assertTrue(nw_info.return_value[0].is_hybrid_plug_enabled()) self.assertEqual( [], self.compute._get_neutron_events_for_live_migration( self.instance)) @@ -9333,7 +9391,18 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.flags(vif_plugging_timeout=300) with mock.patch.object(self.instance, 'get_network_info') as nw_info: - nw_info.return_value = [] + nw_info.return_value = network_model.NetworkInfo([]) + self.assertEqual( + [], self.compute._get_neutron_events_for_live_migration( + self.instance)) + + # 3. 
no plug time events + with mock.patch.object(self.instance, 'get_network_info') as nw_info: + nw_info.return_value = network_model.NetworkInfo( + [network_model.VIF( + uuids.port1, details={ + network_model.VIF_DETAILS_OVS_HYBRID_PLUG: False})]) + self.assertFalse(nw_info.return_value[0].is_hybrid_plug_enabled()) self.assertEqual( [], self.compute._get_neutron_events_for_live_migration( self.instance)) @@ -9351,9 +9420,11 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, wait_for_vif_plugged=True) mock_get_bdms.return_value = objects.BlockDeviceMappingList(objects=[]) mock_pre_live_mig.return_value = migrate_data + details = {network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True} self.instance.info_cache = objects.InstanceInfoCache( network_info=network_model.NetworkInfo([ - network_model.VIF(uuids.port1), network_model.VIF(uuids.port2) + network_model.VIF(uuids.port1, details=details), + network_model.VIF(uuids.port2, details=details) ])) self.compute._waiting_live_migrations[self.instance.uuid] = ( self.migration, mock.MagicMock() @@ -9383,11 +9454,12 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, of not waiting. """ migrate_data = objects.LibvirtLiveMigrateData() + details = {network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True} mock_get_bdms.return_value = objects.BlockDeviceMappingList(objects=[]) mock_pre_live_mig.return_value = migrate_data self.instance.info_cache = objects.InstanceInfoCache( network_info=network_model.NetworkInfo([ - network_model.VIF(uuids.port1)])) + network_model.VIF(uuids.port1, details=details)])) self.compute._waiting_live_migrations[self.instance.uuid] = ( self.migration, mock.MagicMock() ) @@ -9531,9 +9603,11 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, mock_get_bdms.return_value = source_bdms migrate_data = objects.LibvirtLiveMigrateData( wait_for_vif_plugged=True) + details = {network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True} self.instance.info_cache = objects.InstanceInfoCache( network_info=network_model.NetworkInfo([ - network_model.VIF(uuids.port1), network_model.VIF(uuids.port2) + network_model.VIF(uuids.port1, details=details), + network_model.VIF(uuids.port2, details=details) ])) self.compute._waiting_live_migrations = {} fake_migration = objects.Migration( @@ -10607,6 +10681,54 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, # (_error_out_instance_on_exception will set to ACTIVE by default). self.assertEqual(vm_states.STOPPED, instance.vm_state) + @mock.patch('nova.compute.utils.notify_usage_exists') + @mock.patch('nova.compute.manager.ComputeManager.' + '_notify_about_instance_usage') + @mock.patch('nova.compute.utils.notify_about_resize_prep_instance') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.manager.ComputeManager._revert_allocation') + @mock.patch('nova.compute.manager.ComputeManager.' 
+ '_reschedule_resize_or_reraise') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + # this is almost copy-paste from test_prep_resize_fails_rollback + def test_prep_resize_fails_group_validation( + self, add_instance_fault_from_exc, _reschedule_resize_or_reraise, + _revert_allocation, mock_instance_save, + notify_about_resize_prep_instance, _notify_about_instance_usage, + notify_usage_exists): + """Tests that if _validate_instance_group_policy raises + InstanceFaultRollback, the instance.vm_state is reset properly in + _error_out_instance_on_exception + """ + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.STOPPED, + node='fake-node', expected_attrs=['system_metadata', 'flavor']) + migration = mock.MagicMock(spec='nova.objects.Migration') + request_spec = mock.MagicMock(spec='nova.objects.RequestSpec') + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + ex2 = exception.InstanceFaultRollback( + inner_exception=ex) + + def fake_reschedule_resize_or_reraise(*args, **kwargs): + raise ex2 + + _reschedule_resize_or_reraise.side_effect = ( + fake_reschedule_resize_or_reraise) + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + # _error_out_instance_on_exception should reraise the + # RescheduledException inside InstanceFaultRollback. + exception.RescheduledException, self.compute.prep_resize, + self.context, instance.image_meta, instance, instance.flavor, + request_spec, filter_properties={}, node=instance.node, + clean_shutdown=True, migration=migration, host_list=[]) + # The instance.vm_state should remain unchanged + # (_error_out_instance_on_exception will set to ACTIVE by default). + self.assertEqual(vm_states.STOPPED, instance.vm_state) + @mock.patch('nova.compute.rpcapi.ComputeAPI.resize_instance') @mock.patch('nova.compute.resource_tracker.ResourceTracker.resize_claim') @mock.patch('nova.objects.Instance.save') diff --git a/nova/tests/unit/console/test_websocketproxy.py b/nova/tests/unit/console/test_websocketproxy.py index 3c234df891..92883134a2 100644 --- a/nova/tests/unit/console/test_websocketproxy.py +++ b/nova/tests/unit/console/test_websocketproxy.py @@ -15,6 +15,7 @@ """Tests for nova websocketproxy.""" import copy +import io import socket import mock @@ -626,6 +627,72 @@ class NovaProxyRequestHandlerTestCase(test.NoDBTestCase): self.wh.server.top_new_client(conn, address) self.assertIsNone(self.wh._compute_rpcapi) + def test_reject_open_redirect(self): + # This will test the behavior when an attempt is made to cause an open + # redirect. It should be rejected. + mock_req = mock.MagicMock() + mock_req.makefile().readline.side_effect = [ + b'GET //example.com/%2F.. HTTP/1.1\r\n', + b'' + ] + + client_addr = ('8.8.8.8', 54321) + mock_server = mock.MagicMock() + # This specifies that the server will be able to handle requests other + # than only websockets. + mock_server.only_upgrade = False + + # Constructing a handler will process the mock_req request passed in. + handler = websocketproxy.NovaProxyRequestHandler( + mock_req, client_addr, mock_server) + + # Collect the response data to verify at the end. The + # SimpleHTTPRequestHandler writes the response data to a 'wfile' + # attribute. + output = io.BytesIO() + handler.wfile = output + # Process the mock_req again to do the capture. 
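        # Without the rejection, SimpleHTTPRequestHandler (which the proxy
        # handler builds on) could answer a path such as '//example.com/%2F..'
        # with a 301 redirect whose Location is a protocol-relative URL,
        # sending the client to an attacker-chosen host. do_GET() is expected
        # to short-circuit with a 400 instead.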
+ handler.do_GET() + output.seek(0) + result = output.readlines() + + # Verify no redirect happens and instead a 400 Bad Request is returned. + self.assertIn('400 URI must not start with //', result[0].decode()) + + def test_reject_open_redirect_3_slashes(self): + # This will test the behavior when an attempt is made to cause an open + # redirect. It should be rejected. + mock_req = mock.MagicMock() + mock_req.makefile().readline.side_effect = [ + b'GET ///example.com/%2F.. HTTP/1.1\r\n', + b'' + ] + + # Collect the response data to verify at the end. The + # SimpleHTTPRequestHandler writes the response data by calling the + # request socket sendall() method. + self.data = b'' + + def fake_sendall(data): + self.data += data + + mock_req.sendall.side_effect = fake_sendall + + client_addr = ('8.8.8.8', 54321) + mock_server = mock.MagicMock() + # This specifies that the server will be able to handle requests other + # than only websockets. + mock_server.only_upgrade = False + + # Constructing a handler will process the mock_req request passed in. + websocketproxy.NovaProxyRequestHandler( + mock_req, client_addr, mock_server) + + # Verify no redirect happens and instead a 400 Bad Request is returned. + self.data = self.data.decode() + self.assertIn('Error code: 400', self.data) + self.assertIn('Message: URI must not start with //', self.data) + @mock.patch('websockify.websocketproxy.select_ssl_version') def test_ssl_min_version_is_not_set(self, mock_select_ssl): websocketproxy.NovaWebSocketProxy() diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py index cf662aad73..6b80f21661 100644 --- a/nova/tests/unit/db/test_db_api.py +++ b/nova/tests/unit/db/test_db_api.py @@ -39,7 +39,6 @@ from oslo_utils import uuidutils import six from six.moves import range from sqlalchemy import Column -from sqlalchemy.dialects import sqlite from sqlalchemy.exc import OperationalError from sqlalchemy.exc import SQLAlchemyError from sqlalchemy import inspect @@ -1693,6 +1692,14 @@ class InstanceTestCase(test.TestCase, ModelsObjectComparatorMixin): sys_meta = utils.metadata_to_dict(inst['system_metadata']) self.assertEqual(sys_meta, self.sample_data['system_metadata']) + def test_instance_get_with_meta(self): + inst_id = self.create_instance_with_args().id + inst = db.instance_get(self.ctxt, inst_id) + meta = utils.metadata_to_dict(inst['metadata']) + self.assertEqual(meta, self.sample_data['metadata']) + sys_meta = utils.metadata_to_dict(inst['system_metadata']) + self.assertEqual(sys_meta, self.sample_data['system_metadata']) + def test_instance_update(self): instance = self.create_instance_with_args() metadata = {'host': 'bar', 'key2': 'wuff'} @@ -6288,17 +6295,24 @@ class ArchiveTestCase(test.TestCase, ModelsObjectComparatorMixin): instance_actions_events=1) self._assertEqualObjects(expected, results[0]) - # Archive 1 row deleted before 2017-01-03. 
instance_action_events - # should be the table with row deleted due to FK contraints + # Archive 1 row deleted before 2017-01-03 + # Because the instances table will be processed first, tables that + # refer to it (instance_actions and instance_action_events) will be + # visited and archived in the same transaction as the instance, to + # avoid orphaning the instance record (archive dependent records in one + # transaction) before_date = dateutil_parser.parse('2017-01-03', fuzzy=True) results = db.archive_deleted_rows(max_rows=1, before=before_date) - expected = dict(instance_actions_events=1) + expected = dict(instances=1, + instance_actions=1, + instance_actions_events=1) self._assertEqualObjects(expected, results[0]) - # Archive all other rows deleted before 2017-01-03. This should - # delete row in instance_actions, then row in instances due to FK - # constraints + # Try to archive all other rows deleted before 2017-01-03. This should + # not archive anything because the instances table and tables that + # refer to it (instance_actions and instance_action_events) were all + # archived in the last run. results = db.archive_deleted_rows(max_rows=100, before=before_date) - expected = dict(instances=1, instance_actions=1) + expected = {} self._assertEqualObjects(expected, results[0]) # Verify we have 4 left in main @@ -6432,18 +6446,7 @@ class ArchiveTestCase(test.TestCase, ModelsObjectComparatorMixin): def _check_sqlite_version_less_than_3_7(self): # SQLite doesn't enforce foreign key constraints without a pragma. - dialect = self.engine.url.get_dialect() - if dialect == sqlite.dialect: - # We're seeing issues with foreign key support in SQLite 3.6.20 - # SQLAlchemy doesn't support it at all with < SQLite 3.6.19 - # It works fine in SQLite 3.7. - # So return early to skip this test if running SQLite < 3.7 - import sqlite3 - tup = sqlite3.sqlite_version_info - if tup[0] < 3 or (tup[0] == 3 and tup[1] < 7): - self.skipTest( - 'sqlite version too old for reliable SQLA foreign_keys') - self.conn.execute("PRAGMA foreign_keys = ON") + self.enforce_fk_constraints(engine=self.engine) def test_archive_deleted_rows_for_migrations(self): # migrations.instance_uuid depends on instances.uuid @@ -6457,19 +6460,8 @@ class ArchiveTestCase(test.TestCase, ModelsObjectComparatorMixin): ins_stmt = self.migrations.insert().values(instance_uuid=instance_uuid, deleted=0) self.conn.execute(ins_stmt) - # The first try to archive instances should fail, due to FK. - num = sqlalchemy_api._archive_deleted_rows_for_table(self.metadata, - "instances", - max_rows=None, - before=None) - self.assertEqual(0, num[0]) - # Then archiving migrations should work. - num = sqlalchemy_api._archive_deleted_rows_for_table(self.metadata, - "migrations", - max_rows=None, - before=None) - self.assertEqual(1, num[0]) - # Then archiving instances should work. + # Archiving instances should result in migrations related to the + # instances also being archived. 
num = sqlalchemy_api._archive_deleted_rows_for_table(self.metadata,
 "instances",
 max_rows=None,
diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py
index e23203c04a..fb7fa8713c 100644
--- a/nova/tests/unit/network/test_neutron.py
+++ b/nova/tests/unit/network/test_neutron.py
@@ -253,6 +253,8 @@ class TestNeutronClient(test.NoDBTestCase):
 auth_token='token')
 cl = neutronapi.get_client(my_context)
 self.assertEqual(retries, cl.httpclient.connect_retries)
+ kcl = neutronapi._get_ksa_client(my_context)
+ self.assertEqual(retries, kcl.connect_retries)
 class TestAPIBase(test.TestCase):
diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py
index fe7d918e27..c1b26d9726 100644
--- a/nova/tests/unit/pci/test_manager.py
+++ b/nova/tests/unit/pci/test_manager.py
@@ -236,6 +236,40 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
 tracker.update_devices_from_hypervisor_resources(fake_pci_devs_json)
 self.assertEqual(2, len(tracker.pci_devs))
+ @mock.patch("nova.pci.manager.LOG.debug")
+ def test_update_devices_from_hypervisor_resources_32bit_domain(
+ self, mock_debug):
+ self.flags(
+ group='pci',
+ passthrough_whitelist=[
+ '{"product_id":"2032", "vendor_id":"8086"}'])
+ # There are systems where a 32 bit PCI domain is used. See bug 1897528
+ # for example. While nova (and qemu) does not support assigning such
+ # devices, the existence of such a device in the system should not
+ # lead to an error.
+ fake_pci = {
+ 'compute_node_id': 1,
+ 'address': '10000:00:02.0',
+ 'product_id': '2032',
+ 'vendor_id': '8086',
+ 'request_id': None,
+ 'status': fields.PciDeviceStatus.AVAILABLE,
+ 'dev_type': fields.PciDeviceType.STANDARD,
+ 'parent_addr': None,
+ 'numa_node': 0}
+
+ fake_pci_devs = [fake_pci]
+ fake_pci_devs_json = jsonutils.dumps(fake_pci_devs)
+ tracker = manager.PciDevTracker(self.fake_context)
+ # We expect that the device with 32bit PCI domain is ignored
+ tracker.update_devices_from_hypervisor_resources(fake_pci_devs_json)
+ self.assertEqual(0, len(tracker.pci_devs))
+ mock_debug.assert_called_once_with(
+ 'Skipping PCI device %s reported by the hypervisor: %s',
+ {'address': '10000:00:02.0', 'parent_addr': None},
+ 'The property domain (10000) is greater than the maximum '
+ 'allowable value (FFFF).')
+
 def test_set_hvdev_new_dev(self):
 fake_pci_3 = dict(fake_pci, address='0000:00:00.4', vendor_id='v2')
 fake_pci_devs = [copy.deepcopy(fake_pci), copy.deepcopy(fake_pci_1),
diff --git a/nova/tests/unit/pci/test_stats.py b/nova/tests/unit/pci/test_stats.py
index 1671e9f000..6c1e55f8cd 100644
--- a/nova/tests/unit/pci/test_stats.py
+++ b/nova/tests/unit/pci/test_stats.py
@@ -561,6 +561,30 @@ class PciDeviceStatsWithTagsTestCase(test.NoDBTestCase):
 self.pci_stats.remove_device(dev2)
 self._assertPools()
+ def test_update_device(self):
+ # Update the device type of one of the devices from type-PCI to
+ # type-PF. Verify that the existing pool is updated and a new
+ # pool is created with dev_type type-PF.
+ self._create_pci_devices() + dev1 = self.pci_tagged_devices.pop() + dev1.dev_type = 'type-PF' + self.pci_stats.update_device(dev1) + self.assertEqual(3, len(self.pci_stats.pools)) + self._assertPoolContent(self.pci_stats.pools[0], '1137', '0072', + len(self.pci_untagged_devices)) + self.assertEqual(self.pci_untagged_devices, + self.pci_stats.pools[0]['devices']) + self._assertPoolContent(self.pci_stats.pools[1], '1137', '0071', + len(self.pci_tagged_devices), + physical_network='physnet1') + self.assertEqual(self.pci_tagged_devices, + self.pci_stats.pools[1]['devices']) + self._assertPoolContent(self.pci_stats.pools[2], '1137', '0071', + 1, + physical_network='physnet1') + self.assertEqual(dev1, + self.pci_stats.pools[2]['devices'][0]) + class PciDeviceVFPFStatsTestCase(test.NoDBTestCase): diff --git a/nova/tests/unit/storage/__init__.py b/nova/tests/unit/storage/__init__.py new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/nova/tests/unit/storage/__init__.py diff --git a/nova/tests/unit/storage/test_rbd.py b/nova/tests/unit/storage/test_rbd.py index 5a39bdbd5a..396f22c643 100644 --- a/nova/tests/unit/storage/test_rbd.py +++ b/nova/tests/unit/storage/test_rbd.py @@ -13,6 +13,7 @@ from eventlet import tpool import mock +from oslo_concurrency import processutils from oslo_serialization import jsonutils from oslo_utils.fixture import uuidsentinel as uuids @@ -110,7 +111,7 @@ class RbdTestCase(test.NoDBTestCase): self.rbd_pool = 'rbd' self.rbd_connect_timeout = 5 self.flags( - images_rbd_pool=self.images_rbd_pool, + images_rbd_pool=self.rbd_pool, images_rbd_ceph_conf='/foo/bar.conf', rbd_connect_timeout=self.rbd_connect_timeout, rbd_user='foo', group='libvirt') @@ -118,13 +119,15 @@ class RbdTestCase(test.NoDBTestCase): rados_patcher = mock.patch.object(rbd_utils, 'rados') self.mock_rados = rados_patcher.start() self.addCleanup(rados_patcher.stop) - self.mock_rados.Rados = mock.Mock - self.mock_rados.Rados.ioctx = mock.Mock() - self.mock_rados.Rados.connect = mock.Mock() - self.mock_rados.Rados.shutdown = mock.Mock() - self.mock_rados.Rados.open_ioctx = mock.Mock() - self.mock_rados.Rados.open_ioctx.return_value = \ - self.mock_rados.Rados.ioctx + self.mock_rados.Rados = mock.Mock() + self.rados_inst = mock.Mock() + self.mock_rados.Rados.return_value = self.rados_inst + self.rados_inst.ioctx = mock.Mock() + self.rados_inst.connect = mock.Mock() + self.rados_inst.shutdown = mock.Mock() + self.rados_inst.open_ioctx = mock.Mock() + self.rados_inst.open_ioctx.return_value = \ + self.rados_inst.ioctx self.mock_rados.Error = Exception rbd_patcher = mock.patch.object(rbd_utils, 'rbd') @@ -338,33 +341,31 @@ class RbdTestCase(test.NoDBTestCase): def test_connect_to_rados_default(self): ret = self.driver._connect_to_rados() - self.mock_rados.Rados.connect.assert_called_once_with( + self.rados_inst.connect.assert_called_once_with( timeout=self.rbd_connect_timeout) - self.assertTrue(self.mock_rados.Rados.open_ioctx.called) - self.assertIsInstance(ret[0], self.mock_rados.Rados) - self.assertEqual(self.mock_rados.Rados.ioctx, ret[1]) - self.mock_rados.Rados.open_ioctx.assert_called_with(self.rbd_pool) + self.assertTrue(self.rados_inst.open_ioctx.called) + self.assertEqual(self.rados_inst.ioctx, ret[1]) + self.rados_inst.open_ioctx.assert_called_with(self.rbd_pool) def test_connect_to_rados_different_pool(self): ret = self.driver._connect_to_rados('alt_pool') - self.mock_rados.Rados.connect.assert_called_once_with( + self.rados_inst.connect.assert_called_once_with( 
timeout=self.rbd_connect_timeout) - self.assertTrue(self.mock_rados.Rados.open_ioctx.called) - self.assertIsInstance(ret[0], self.mock_rados.Rados) - self.assertEqual(self.mock_rados.Rados.ioctx, ret[1]) - self.mock_rados.Rados.open_ioctx.assert_called_with('alt_pool') + self.assertTrue(self.rados_inst.open_ioctx.called) + self.assertEqual(self.rados_inst.ioctx, ret[1]) + self.rados_inst.open_ioctx.assert_called_with('alt_pool') def test_connect_to_rados_error(self): - self.mock_rados.Rados.open_ioctx.side_effect = self.mock_rados.Error + self.rados_inst.open_ioctx.side_effect = self.mock_rados.Error self.assertRaises(self.mock_rados.Error, self.driver._connect_to_rados) - self.mock_rados.Rados.open_ioctx.assert_called_once_with( + self.rados_inst.open_ioctx.assert_called_once_with( self.rbd_pool) - self.mock_rados.Rados.shutdown.assert_called_once_with() + self.rados_inst.shutdown.assert_called_once_with() def test_connect_to_rados_unicode_arg(self): self.driver._connect_to_rados(u'unicode_pool') - self.mock_rados.Rados.open_ioctx.assert_called_with( + self.rados_inst.open_ioctx.assert_called_with( test.MatchType(str)) def test_ceph_args_none(self): @@ -653,6 +654,11 @@ class RbdTestCase(test.NoDBTestCase): 'used': ceph_df_json['pools'][1]['stats']['bytes_used']} self.assertDictEqual(expected, self.driver.get_pool_info()) + @mock.patch('oslo_concurrency.processutils.execute', autospec=True, + side_effect=processutils.ProcessExecutionError("failed")) + def test_get_pool_info_execute_failed(self, mock_execute): + self.assertRaises(exception.StorageError, self.driver.get_pool_info) + @mock.patch('oslo_concurrency.processutils.execute') def test_get_pool_info_not_found(self, mock_execute): # Make the pool something other than self.rbd_pool so it won't be found diff --git a/nova/tests/unit/test_fixtures.py b/nova/tests/unit/test_fixtures.py index fbc7a3f6a1..797a4ef630 100644 --- a/nova/tests/unit/test_fixtures.py +++ b/nova/tests/unit/test_fixtures.py @@ -90,6 +90,7 @@ class TestLogging(testtools.TestCase): class TestOSAPIFixture(testtools.TestCase): @mock.patch('nova.objects.Service.get_by_host_and_binary') @mock.patch('nova.objects.Service.create') + @mock.patch('nova.utils.raise_if_old_compute', new=mock.Mock()) def test_responds_to_version(self, mock_service_create, mock_get): """Ensure the OSAPI server responds to calls sensibly.""" self.useFixture(output.CaptureOutput()) diff --git a/nova/tests/unit/test_hacking.py b/nova/tests/unit/test_hacking.py index 9222daa451..da9d83c68a 100644 --- a/nova/tests/unit/test_hacking.py +++ b/nova/tests/unit/test_hacking.py @@ -897,7 +897,7 @@ class HackingTestCase(test.NoDBTestCase): expected_errors=errors, filename="nova/tests/unit/test_context.py") # Check no errors in other than 'nova/tests' directory. 
self._assert_has_no_errors( - code, checks.nonexistent_assertion_methods_and_attributes, + code, checks.useless_assertion, filename="nova/compute/api.py") code = """ self.assertIsNone(None_test_var, "Fails") diff --git a/nova/tests/unit/test_metadata.py b/nova/tests/unit/test_metadata.py index 1dfa5eea65..45c2b6a6a8 100644 --- a/nova/tests/unit/test_metadata.py +++ b/nova/tests/unit/test_metadata.py @@ -322,12 +322,14 @@ class MetadataTestCase(test.TestCase): 'uuid': 'e5fe5518-0288-4fa3-b0c4-c79764101b85', 'root_device_name': None, 'default_ephemeral_device': None, - 'default_swap_device': None}) + 'default_swap_device': None, + 'context': self.context}) instance_ref1 = objects.Instance(**{'id': 0, 'uuid': 'b65cee2f-8c69-4aeb-be2f-f79742548fc2', 'root_device_name': '/dev/sda1', 'default_ephemeral_device': None, - 'default_swap_device': None}) + 'default_swap_device': None, + 'context': self.context}) def fake_bdm_get(ctxt, uuid): return [fake_block_device.FakeDbBlockDeviceDict( @@ -366,10 +368,12 @@ class MetadataTestCase(test.TestCase): 'swap': '/dev/sdc', 'ebs0': '/dev/sdh'} - self.assertEqual(base._format_instance_mapping(self.context, - instance_ref0), block_device._DEFAULT_MAPPINGS) - self.assertEqual(base._format_instance_mapping(self.context, - instance_ref1), expected) + self.assertEqual( + base._format_instance_mapping(instance_ref0), + block_device._DEFAULT_MAPPINGS) + self.assertEqual( + base._format_instance_mapping(instance_ref1), + expected) def test_pubkey(self): md = fake_InstanceMetadata(self, self.instance.obj_clone()) diff --git a/nova/tests/unit/test_service.py b/nova/tests/unit/test_service.py index 8e0ed56ae0..676eb0aa42 100644 --- a/nova/tests/unit/test_service.py +++ b/nova/tests/unit/test_service.py @@ -268,6 +268,24 @@ class ServiceTestCase(test.NoDBTestCase): serv.reset() mock_reset.assert_called_once_with() + @mock.patch('nova.conductor.api.API.wait_until_ready') + @mock.patch('nova.utils.raise_if_old_compute') + def test_old_compute_version_check_happens_after_wait_for_conductor( + self, mock_check_old, mock_wait): + obj_base.NovaObject.indirection_api = mock.MagicMock() + + def fake_wait(*args, **kwargs): + mock_check_old.assert_not_called() + + mock_wait.side_effect = fake_wait + + service.Service.create( + self.host, self.binary, self.topic, + 'nova.tests.unit.test_service.FakeManager') + + mock_check_old.assert_called_once_with() + mock_wait.assert_called_once_with(mock.ANY) + class TestWSGIService(test.NoDBTestCase): diff --git a/nova/tests/unit/test_utils.py b/nova/tests/unit/test_utils.py index e9930fcb7a..b7516ef7e8 100644 --- a/nova/tests/unit/test_utils.py +++ b/nova/tests/unit/test_utils.py @@ -37,6 +37,7 @@ from nova import context from nova import exception from nova.objects import base as obj_base from nova.objects import instance as instance_obj +from nova.objects import service as service_obj from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.unit.objects import test_objects @@ -1207,3 +1208,187 @@ class TestGetSDKAdapter(test.NoDBTestCase): self.mock_get_confgrp.assert_called_once_with(self.service_type) self.mock_connection.assert_not_called() self.mock_get_auth_sess.assert_not_called() + + +class TestOldComputeCheck(test.NoDBTestCase): + + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_no_compute(self, mock_get_min_service): + mock_get_min_service.return_value = 0 + + utils.raise_if_old_compute() + + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + 
+ @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_old_but_supported_compute(self, mock_get_min_service): + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get_min_service.return_value = oldest + + utils.raise_if_old_compute() + + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_new_compute(self, mock_get_min_service): + mock_get_min_service.return_value = service_obj.SERVICE_VERSION + + utils.raise_if_old_compute() + + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + + @mock.patch('nova.objects.service.Service.get_minimum_version') + def test_too_old_compute_cell(self, mock_get_min_service): + self.flags(group='api_database', connection=None) + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get_min_service.return_value = oldest - 1 + + ex = self.assertRaises( + exception.TooOldComputeService, utils.raise_if_old_compute) + + self.assertIn('cell', str(ex)) + mock_get_min_service.assert_called_once_with(mock.ANY, 'nova-compute') + + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_too_old_compute_top_level(self, mock_get_min_service): + self.flags(group='api_database', connection='fake db connection') + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get_min_service.return_value = oldest - 1 + + ex = self.assertRaises( + exception.TooOldComputeService, utils.raise_if_old_compute) + + self.assertIn('system', str(ex)) + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + + @mock.patch.object(utils.LOG, 'warning') + @mock.patch('nova.objects.service.Service.get_minimum_version') + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_api_db_is_configured_but_the_service_cannot_access_db( + self, mock_get_all, mock_get, mock_warn): + # This is the case when the nova-compute service is wrongly configured + # with db credentials but nova-compute is never allowed to access the + # db directly. + mock_get_all.side_effect = exception.DBNotAllowed( + binary='nova-compute') + + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get.return_value = oldest - 1 + + ex = self.assertRaises( + exception.TooOldComputeService, utils.raise_if_old_compute) + + self.assertIn('cell', str(ex)) + mock_get_all.assert_called_once_with(mock.ANY, ['nova-compute']) + mock_get.assert_called_once_with(mock.ANY, 'nova-compute') + mock_warn.assert_called_once_with( + 'This service is configured for access to the API database but is ' + 'not allowed to directly access the database. You should run this ' + 'service without the [api_database]/connection config option. 
The ' + 'service version check will only query the local cell.') + + +class RunOnceTests(test.NoDBTestCase): + + fake_logger = mock.MagicMock() + + @utils.run_once("already ran once", fake_logger) + def dummy_test_func(self, fail=False): + if fail: + raise ValueError() + return True + + def setUp(self): + super(RunOnceTests, self).setUp() + self.dummy_test_func.reset() + RunOnceTests.fake_logger.reset_mock() + + def test_wrapped_funtions_called_once(self): + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + self.assertTrue(self.dummy_test_func.called) + + # assert that on second invocation no result + # is returned and that the logger is invoked. + result = self.dummy_test_func() + RunOnceTests.fake_logger.assert_called_once() + self.assertIsNone(result) + + def test_wrapped_funtions_called_once_raises(self): + self.assertFalse(self.dummy_test_func.called) + self.assertRaises(ValueError, self.dummy_test_func, fail=True) + self.assertTrue(self.dummy_test_func.called) + + # assert that on second invocation no result + # is returned and that the logger is invoked. + result = self.dummy_test_func() + RunOnceTests.fake_logger.assert_called_once() + self.assertIsNone(result) + + def test_wrapped_funtions_can_be_reset(self): + # assert we start with a clean state + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + + self.dummy_test_func.reset() + # assert we restored a clean state + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + + # assert that we never called the logger + RunOnceTests.fake_logger.assert_not_called() + + def test_reset_calls_cleanup(self): + mock_clean = mock.Mock() + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + f() + self.assertTrue(f.called) + + f.reset() + self.assertFalse(f.called) + mock_clean.assert_called_once_with() + + def test_clean_is_not_called_at_reset_if_wrapped_not_called(self): + mock_clean = mock.Mock() + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + self.assertFalse(f.called) + + f.reset() + self.assertFalse(f.called) + self.assertFalse(mock_clean.called) + + def test_reset_works_even_if_cleanup_raises(self): + mock_clean = mock.Mock(side_effect=ValueError()) + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + f() + self.assertTrue(f.called) + + self.assertRaises(ValueError, f.reset) + self.assertFalse(f.called) + mock_clean.assert_called_once_with() diff --git a/nova/tests/unit/virt/disk/vfs/fakeguestfs.py b/nova/tests/unit/virt/disk/vfs/fakeguestfs.py index 96c97edf79..168400e956 100644 --- a/nova/tests/unit/virt/disk/vfs/fakeguestfs.py +++ b/nova/tests/unit/virt/disk/vfs/fakeguestfs.py @@ -109,7 +109,7 @@ class GuestFS(object): "mode": 0o700 } - return self.files[path]["content"] + return bytes(self.files[path]["content"].encode()) def write(self, path, content): if path not in self.files: diff --git a/nova/tests/unit/virt/libvirt/fakelibvirt.py b/nova/tests/unit/virt/libvirt/fakelibvirt.py index 7caf0ad44c..2abda6b657 100644 --- a/nova/tests/unit/virt/libvirt/fakelibvirt.py +++ b/nova/tests/unit/virt/libvirt/fakelibvirt.py @@ -284,49 +284,62 @@ class FakePCIDevice(object): :param multiple_gpu_types: (bool) Supports different vGPU types """ + self.dev_type = dev_type + self.slot = slot + self.function = function + 
self.iommu_group = iommu_group + self.numa_node = numa_node + self.vf_ratio = vf_ratio + self.multiple_gpu_types = multiple_gpu_types + self.parent = parent + self.generate_xml() + + def generate_xml(self, skip_capability=False): vend_id = PCI_VEND_ID vend_name = PCI_VEND_NAME - if dev_type == 'PCI': - if vf_ratio: + capability = '' + if self.dev_type == 'PCI': + if self.vf_ratio: raise ValueError('vf_ratio does not apply for PCI devices') prod_id = PCI_PROD_ID prod_name = PCI_PROD_NAME driver = PCI_DRIVER_NAME - capability = '' - elif dev_type == 'PF': + elif self.dev_type == 'PF': prod_id = PF_PROD_ID prod_name = PF_PROD_NAME driver = PF_DRIVER_NAME - capability = self.cap_templ % { - 'cap_type': PF_CAP_TYPE, - 'addresses': '\n'.join([ - self.addr_templ % { - # these are the slot, function values of the child VFs - # we can only assign 8 functions to a slot (0-7) so - # bump the slot each time we exceed this - 'slot': slot + (x // 8), - # ...and wrap the function value - 'function': x % 8, - # the offset is because the PF is occupying function 0 - } for x in range(1, vf_ratio + 1)]) - } - elif dev_type == 'VF': + if not skip_capability: + capability = self.cap_templ % { + 'cap_type': PF_CAP_TYPE, + 'addresses': '\n'.join([ + self.addr_templ % { + # these are the slot, function values of the child + # VFs, we can only assign 8 functions to a slot + # (0-7) so bump the slot each time we exceed this + 'slot': self.slot + (x // 8), + # ...and wrap the function value + 'function': x % 8, + # the offset is because the PF is occupying function 0 + } for x in range(1, self.vf_ratio + 1)]) + } + elif self.dev_type == 'VF': prod_id = VF_PROD_ID prod_name = VF_PROD_NAME driver = VF_DRIVER_NAME - capability = self.cap_templ % { - 'cap_type': VF_CAP_TYPE, - 'addresses': self.addr_templ % { - # this is the slot, function value of the parent PF - # if we're e.g. device 8, we'll have a different slot - # to our parent so reverse this - 'slot': slot - ((vf_ratio + 1) // 8), - # the parent PF is always function 0 - 'function': 0, + if not skip_capability: + capability = self.cap_templ % { + 'cap_type': VF_CAP_TYPE, + 'addresses': self.addr_templ % { + # this is the slot, function value of the parent PF + # if we're e.g. 
device 8, we'll have a different slot + # to our parent so reverse this + 'slot': self.slot - ((self.vf_ratio + 1) // 8), + # the parent PF is always function 0 + 'function': 0, + } } - } - elif dev_type == 'MDEV_TYPES': + elif self.dev_type == 'MDEV_TYPES': prod_id = MDEV_CAPABLE_PROD_ID prod_name = MDEV_CAPABLE_PROD_NAME driver = MDEV_CAPABLE_DRIVER_NAME @@ -336,36 +349,37 @@ class FakePCIDevice(object): 'type_id': NVIDIA_11_VGPU_TYPE, 'instances': 16, }] - if multiple_gpu_types: + if self.multiple_gpu_types: types.append(self.mdevtypes_templ % { 'type_id': NVIDIA_12_VGPU_TYPE, 'instances': 8, }) - capability = self.cap_templ % { - 'cap_type': MDEV_CAPABLE_CAP_TYPE, - 'addresses': '\n'.join(types) - } + if not skip_capability: + capability = self.cap_templ % { + 'cap_type': MDEV_CAPABLE_CAP_TYPE, + 'addresses': '\n'.join(types) + } self.is_capable_of_mdevs = True else: raise ValueError('Expected one of: PCI, VF, PCI') self.pci_device = self.pci_device_template % { - 'slot': slot, - 'function': function, + 'slot': self.slot, + 'function': self.function, 'vend_id': vend_id, 'vend_name': vend_name, 'prod_id': prod_id, 'prod_name': prod_name, 'driver': driver, 'capability': capability, - 'iommu_group': iommu_group, - 'numa_node': numa_node, - 'parent': parent or self.pci_default_parent + 'iommu_group': self.iommu_group, + 'numa_node': self.numa_node, + 'parent': self.parent or self.pci_default_parent } # -1 is the sentinel set in /sys/bus/pci/devices/*/numa_node # for no NUMA affinity. When the numa_node is set to -1 on a device # Libvirt omits the NUMA element so we remove it. - if numa_node == -1: + if self.numa_node == -1: self.pci_device = self.pci_device.replace("<numa node='-1'/>", "") def XMLDesc(self, flags): @@ -943,6 +957,20 @@ class Domain(object): nic_info['source'] = source.get('network') elif nic_info['type'] == 'bridge': nic_info['source'] = source.get('bridge') + elif nic_info['type'] == 'hostdev': + # <interface type='hostdev'> is for VF when vnic_type + # is direct. Add sriov vf pci information in nic_info + address = source.find('./address') + pci_type = address.get('type') + pci_domain = address.get('domain').replace('0x', '') + pci_bus = address.get('bus').replace('0x', '') + pci_slot = address.get('slot').replace('0x', '') + pci_function = address.get('function').replace( + '0x', '') + pci_device = "%s_%s_%s_%s_%s" % (pci_type, pci_domain, + pci_bus, pci_slot, + pci_function) + nic_info['source'] = pci_device nics_info += [nic_info] @@ -984,11 +1012,32 @@ class Domain(object): return definition + def verify_hostdevs_interface_are_vfs(self): + """Verify for interface type hostdev if the pci device is VF or not. 
+ """ + + error_message = ("Interface type hostdev is currently supported on " + "SR-IOV Virtual Functions only") + + nics = self._def['devices'].get('nics', []) + for nic in nics: + if nic['type'] == 'hostdev': + pci_device = nic['source'] + pci_info_from_connection = self._connection.pci_info.devices[ + pci_device] + if 'phys_function' not in pci_info_from_connection.pci_device: + raise make_libvirtError( + libvirtError, + error_message, + error_code=VIR_ERR_CONFIG_UNSUPPORTED, + error_domain=VIR_FROM_DOMAIN) + def create(self): self.createWithFlags(0) def createWithFlags(self, flags): # FIXME: Not handling flags at the moment + self.verify_hostdevs_interface_are_vfs() self._state = VIR_DOMAIN_RUNNING self._connection._mark_running(self) self._has_saved_state = False @@ -1112,7 +1161,7 @@ class Domain(object): nics = '' for nic in self._def['devices']['nics']: - if 'source' in nic: + if 'source' in nic and nic['type'] != 'hostdev': nics += '''<interface type='%(type)s'> <mac address='%(mac)s'/> <source %(type)s='%(source)s'/> @@ -1803,6 +1852,16 @@ virConnect = Connection virSecret = Secret +# A private libvirt-python class and global only provided here for testing to +# ensure it's not returned by libvirt.host.Host.get_libvirt_proxy_classes. +class FakeHandler(object): + def __init__(self): + pass + + +_EventAddHandleFunc = FakeHandler + + class FakeLibvirtFixture(fixtures.Fixture): """Performs global setup/stubbing for all libvirt tests. """ diff --git a/nova/tests/unit/virt/libvirt/test_config.py b/nova/tests/unit/virt/libvirt/test_config.py index f0c2fa3b51..9a9342efae 100644 --- a/nova/tests/unit/virt/libvirt/test_config.py +++ b/nova/tests/unit/virt/libvirt/test_config.py @@ -339,6 +339,26 @@ class LibvirtConfigCPUFeatureTest(LibvirtConfigBaseTest): <feature name="mtrr"/> """) + def test_config_parse_require(self): + xml = """ + <feature name="mtrr" policy="require"/> + """ + xmldoc = etree.fromstring(xml) + obj = config.LibvirtConfigCPUFeature() + obj.parse_dom(xmldoc) + + self.assertEqual(obj.policy, "require") + + def test_config_parse_disable(self): + xml = """ + <feature name="mtrr" policy="disable"/> + """ + xmldoc = etree.fromstring(xml) + obj = config.LibvirtConfigCPUFeature() + obj.parse_dom(xmldoc) + + self.assertEqual(obj.policy, "disable") + class LibvirtConfigGuestCPUFeatureTest(LibvirtConfigBaseTest): @@ -437,6 +457,27 @@ class LibvirtConfigCPUTest(LibvirtConfigBaseTest): </cpu> """) + def test_config_disabled_features(self): + obj = config.LibvirtConfigCPU() + obj.model = "Penryn" + obj.vendor = "Intel" + obj.arch = obj_fields.Architecture.X86_64 + + disabled_feature = config.LibvirtConfigCPUFeature("mtrr") + disabled_feature.policy = "disable" + obj.add_feature(disabled_feature) + obj.add_feature(config.LibvirtConfigCPUFeature("apic")) + + xml = obj.to_xml() + self.assertXmlEqual(xml, """ + <cpu> + <arch>x86_64</arch> + <model>Penryn</model> + <vendor>Intel</vendor> + <feature name="apic"/> + </cpu> + """) + def test_only_uniq_cpu_featues(self): obj = config.LibvirtConfigCPU() obj.model = "Penryn" diff --git a/nova/tests/unit/virt/libvirt/test_designer.py b/nova/tests/unit/virt/libvirt/test_designer.py index bb5b8e9f72..14a896eb17 100644 --- a/nova/tests/unit/virt/libvirt/test_designer.py +++ b/nova/tests/unit/virt/libvirt/test_designer.py @@ -207,6 +207,19 @@ class DesignerTestCase(test.NoDBTestCase): self.assertEqual(512, conf.vhost_rx_queue_size) self.assertIsNone(conf.vhost_tx_queue_size) + def test_set_vif_host_backend_vhostuser_config_tapname(self): + conf = 
config.LibvirtConfigGuestInterface() + designer.set_vif_host_backend_vhostuser_config(conf, 'fake-mode', + 'fake-path', None, None, + 'fake-tap') + self.assertEqual('vhostuser', conf.net_type) + self.assertEqual('unix', conf.vhostuser_type) + self.assertEqual('fake-mode', conf.vhostuser_mode) + self.assertEqual('fake-path', conf.vhostuser_path) + self.assertIsNone(conf.vhost_rx_queue_size) + self.assertIsNone(conf.vhost_tx_queue_size) + self.assertEqual('fake-tap', conf.target_dev) + def test_set_vif_mtu_config(self): conf = config.LibvirtConfigGuestInterface() designer.set_vif_mtu_config(conf, 9000) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index f1cd319b77..f947641c97 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -6739,7 +6739,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, self.assertEqual(cfg.devices[4].type, "spice") self.assertEqual(cfg.devices[5].type, "qxl") - self.assertEqual(cfg.devices[5].vram, 64 * units.Mi / units.Ki) + self.assertEqual(cfg.devices[5].vram, 65536) def _test_add_video_driver(self, model): self.flags(virt_type='kvm', group='libvirt') @@ -6750,15 +6750,19 @@ class LibvirtConnTestCase(test.NoDBTestCase, drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) guest = vconfig.LibvirtConfigGuest() - instance_ref = objects.Instance(**self.test_instance) - flavor = instance_ref.get_flavor() + flavor = objects.Flavor( + extra_specs={'hw_video:ram_max_mb': '512'}) image_meta = objects.ImageMeta.from_dict({ - 'properties': {'hw_video_model': model}}) + 'properties': { + 'hw_video_model': model, + 'hw_video_ram': 8, + }, + }) self.assertTrue(drvr._guest_add_video_device(guest)) - video = drvr._add_video_driver(guest, image_meta, - flavor) + video = drvr._add_video_driver(guest, image_meta, flavor) self.assertEqual(model, video.type) + self.assertEqual(8192, video.vram) # should be in bytes def test__add_video_driver(self): self._test_add_video_driver('qxl') @@ -9155,6 +9159,9 @@ class LibvirtConnTestCase(test.NoDBTestCase, 'encryption_key_id': uuids.encryption_key_id} instance = mock.sentinel.instance + # Mock out find_secret so we don't skip ahead + drvr._host.find_secret.return_value = None + # Mock out the encryptors mock_encryptor = mock.Mock() mock_get_volume_encryptor.return_value = mock_encryptor @@ -10036,6 +10043,8 @@ class LibvirtConnTestCase(test.NoDBTestCase, 'serial': uuids.volume_id, 'driver_volume_type': 'rbd', 'data': {'name': 'pool/volume', + 'auth_enabled': 'true', + 'auth_username': 'username', 'access_mode': 'rw'} } disk_1 = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk, @@ -10077,7 +10086,8 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_get_encryption_metadata.assert_called_once_with( self.context, drvr._volume_api, uuids.volume_id, connection_info) - mock_qemu_img_info.assert_called_once_with('rbd:pool/volume') + mock_qemu_img_info.assert_called_once_with( + 'rbd:pool/volume:id=username') # Assert that the Libvirt call to resize the device within the instance # is called with the LUKSv1 payload offset taken into account. 
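The hunk above updates the LUKSv1 resize test to expect a qemu-img target path that embeds the Ceph auth user taken from the connection info. As a rough standalone sketch of the path construction being asserted (the rbd_disk_path helper name below is made up for illustration; the real logic lives in the LibvirtDriver hunk later in this patch):

    # Illustrative only: mirrors the rbd path handling exercised by the
    # updated assertion above; rbd_disk_path is not a real nova helper.
    def rbd_disk_path(connection_info):
        volume_name = connection_info['data']['name']
        path = f"rbd:{volume_name}"
        if connection_info['data'].get('auth_enabled'):
            # Cephx auth is enabled, so qemu-img must be told which user
            # (client id) to connect as when probing the image.
            username = connection_info['data']['auth_username']
            path = f"rbd:{volume_name}:id={username}"
        return path

    connection_info = {
        'driver_volume_type': 'rbd',
        'data': {'name': 'pool/volume',
                 'auth_enabled': 'true',
                 'auth_username': 'username',
                 'access_mode': 'rw'},
    }
    assert rbd_disk_path(connection_info) == 'rbd:pool/volume:id=username'

Without auth_enabled in the connection data the path stays rbd:pool/volume, which is why the previous assertion used the bare volume name.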
@@ -10215,6 +10225,21 @@ class LibvirtConnTestCase(test.NoDBTestCase, crt_scrt.assert_called_once_with( 'volume', uuids.serial, password=key) + @mock.patch.object(key_manager, 'API') + def test_attach_encryptor_secret_exists(self, mock_key_manager_api): + connection_info = {'data': {'volume_id': uuids.volume_id}} + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + with test.nested( + mock.patch.object(drvr, '_get_volume_encryption'), + mock.patch.object(drvr._host, 'find_secret') + ) as (mock_get_volume_encryption, mock_find_secret): + drvr._attach_encryptor(self.context, connection_info, None) + + # Assert we called find_secret and nothing else + mock_find_secret.assert_called_once_with('volume', uuids.volume_id) + mock_get_volume_encryption.assert_not_called() + mock_key_manager_api.assert_not_called() + @mock.patch('os_brick.encryptors.get_encryption_metadata') @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_encryptor') def test_detach_encryptor_connection_info_incomplete(self, @@ -16286,7 +16311,48 @@ class LibvirtConnTestCase(test.NoDBTestCase, accel_info=accel_info) mock_create_guest_with_network.assert_called_once_with(self.context, dummyxml, instance, network_info, block_device_info, - vifs_already_plugged=True) + vifs_already_plugged=True, external_events=[]) + + @mock.patch('oslo_utils.fileutils.ensure_tree', new=mock.Mock()) + @mock.patch('nova.virt.libvirt.LibvirtDriver.get_info') + @mock.patch('nova.virt.libvirt.LibvirtDriver._create_guest_with_network') + @mock.patch('nova.virt.libvirt.LibvirtDriver._get_guest_xml') + @mock.patch('nova.virt.libvirt.LibvirtDriver.destroy', new=mock.Mock()) + @mock.patch( + 'nova.virt.libvirt.LibvirtDriver._get_all_assigned_mediated_devices', + new=mock.Mock(return_value={})) + def test_hard_reboot_wait_for_plug( + self, mock_get_guest_xml, mock_create_guest_with_network, mock_get_info + ): + self.flags( + group="workarounds", + wait_for_vif_plugged_event_during_hard_reboot=["normal"]) + self.context.auth_token = None + instance = objects.Instance(**self.test_instance) + network_info = _fake_network_info(self, num_networks=4) + network_info[0]["vnic_type"] = "normal" + network_info[1]["vnic_type"] = "direct" + network_info[2]["vnic_type"] = "normal" + network_info[3]["vnic_type"] = "direct-physical" + block_device_info = None + return_values = [hardware.InstanceInfo(state=power_state.SHUTDOWN), + hardware.InstanceInfo(state=power_state.RUNNING)] + mock_get_info.side_effect = return_values + mock_get_guest_xml.return_value = mock.sentinel.xml + + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + drvr._hard_reboot( + self.context, instance, network_info, block_device_info) + + mock_create_guest_with_network.assert_called_once_with( + self.context, mock.sentinel.xml, instance, network_info, + block_device_info, + vifs_already_plugged=False, + external_events=[ + ('network-vif-plugged', uuids.vif1), + ('network-vif-plugged', uuids.vif3), + ] + ) @mock.patch('oslo_utils.fileutils.ensure_tree') @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall') @@ -21196,6 +21262,54 @@ class TestUpdateProviderTree(test.NoDBTestCase): for trait in ['HW_CPU_X86_AVX512F', 'HW_CPU_X86_BMI']: self.assertIn(trait, self.pt.data(self.cn_rp['uuid']).traits) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_gpu_inventories') + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' 
+ '_get_cpu_feature_traits', + new=mock.Mock(return_value=cpu_traits)) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_local_gb_info', + new=mock.Mock(return_value={'total': 0})) + @mock.patch('nova.virt.libvirt.host.Host.get_memory_mb_total', + new=mock.Mock(return_value=0)) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_pcpu_available', + new=mock.Mock(return_value=range(0))) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vcpu_available', + new=mock.Mock(return_value=range(0))) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' + '_update_provider_tree_for_pcpu', + new=mock.Mock()) + def test_update_provider_tree_zero_total(self, mock_gpu_invs): + # Verify that we omit various resources from inventory when there are + # zero total quantity of those resources. Placement does not allow + # inventory updates with total=0 as they fail API schema validation. + + # Use total=0 for vgpus. + gpu_inventory_dicts = { + 'pci_0000_06_00_0': {'total': 0, + 'max_unit': 16, + 'min_unit': 1, + 'step_size': 1, + 'reserved': 0, + 'allocation_ratio': 1.0, + }, + } + mock_gpu_invs.return_value = gpu_inventory_dicts + # Use an empty list for vpmems. + self.driver._vpmems_by_rc = {'CUSTOM_PMEM_NAMESPACE_4GB': []} + # Before we update_provider_tree, we have 2 providers from setUp(): + # self.cn_rp and self.shared_rp and they are both empty {}. + self.assertEqual(2, len(self.pt.get_provider_uuids())) + # Update the provider tree. + self.driver.update_provider_tree(self.pt, self.cn_rp['name']) + # After we update_provider_tree, we should still have 2 providers + # because VGPU has total=0 and we would skip adding a child provider + # for it. + self.assertEqual(2, len(self.pt.get_provider_uuids())) + # All providers should have an empty dict because (1) we never updated + # the self.shared_rp provider and (2) the other 2 providers have zero + # for resource totals. + for uuid in self.pt.get_provider_uuids(): + self.assertEqual({}, self.pt.data(uuid).inventory) + def test_update_provider_tree_with_vgpus(self): pci_devices = ['pci_0000_06_00_0', 'pci_0000_07_00_0'] gpu_inventory_dicts = { @@ -24938,6 +25052,20 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): libvirt_driver.LibvirtDriver, fake.FakeVirtAPI(), False) + @mock.patch.object(nova.conf.devices, 'register_dynamic_opts') + def test_get_supported_vgpu_types_registering_dynamic_opts(self, rdo): + self.flags(enabled_vgpu_types=['nvidia-11', 'nvidia-12'], + group='devices') + + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + drvr._get_supported_vgpu_types() + + # Okay below is confusing, but remember, ._get_supported_vgpu_types() + # is first called by the LibvirtDriver object creation, so when + # calling the above drvr._get_supported_vgpu_types() method, it will + # be the second time that register_dynamic_opts() will be called. + rdo.assert_has_calls([mock.call(CONF), mock.call(CONF)]) + def test_get_vgpu_type_per_pgpu(self): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) device = 'pci_0000_84_00_0' @@ -26628,8 +26756,23 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase): not running should trigger a blockRebase using qemu-img not libvirt. In this test, we rebase the image with another image as backing file. 
""" + dom_xml = """ + <domain type='kvm'> + <devices> + <disk type='file'> + <source file='/var/lib/nova/instances/%s/disk1_file'/> + <target dev='vda' bus='virtio'/> + <serial>0e38683e-f0af-418f-a3f1-6b67ea0f919d</serial> + </disk> + <disk type='block'> + <source dev='/path/to/dev/1'/> + <target dev='vdb' bus='virtio' serial='1234'/> + </disk> + </devices> + </domain>""" % self.inst['uuid'] + mock_domain, guest = self._setup_block_rebase_domain_and_guest_mocks( - self.dom_xml) + dom_xml) instance = objects.Instance(**self.inst) snapshot_id = 'snapshot-1234' @@ -26640,10 +26783,13 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase): self.delete_info_1) mock_disk_op_sema.__enter__.assert_called_once() - mock_qemu_img_info.assert_called_once_with("snap.img") - mock_execute.assert_called_once_with('qemu-img', 'rebase', - '-b', 'snap.img', '-F', - 'fake_fmt', 'disk1_file') + mock_qemu_img_info.assert_called_once_with( + "/var/lib/nova/instances/%s/snap.img" % instance.uuid) + mock_execute.assert_called_once_with( + 'qemu-img', 'rebase', + '-b', '/var/lib/nova/instances/%s/snap.img' % instance.uuid, + '-F', 'fake_fmt', + '/var/lib/nova/instances/%s/disk1_file' % instance.uuid) @mock.patch.object(compute_utils, 'disk_ops_semaphore') @mock.patch.object(host.Host, "has_min_version", @@ -27590,6 +27736,15 @@ class LibvirtPMEMNamespaceTests(test.NoDBTestCase): self.assertRaises(exception.PMEMNamespaceConfigInvalid, drvr._discover_vpmems, vpmem_conf) + @mock.patch('nova.privsep.libvirt.get_pmem_namespaces') + def test_get_vpmems_on_host__exception(self, mock_get_ns): + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) + mock_get_ns.side_effect = Exception('foo') + + self.assertRaises( + exception.GetPMEMNamespacesFailed, + drvr._get_vpmems_on_host) + @mock.patch('nova.virt.hardware.get_vpmems') def test_get_ordered_vpmems(self, mock_labels): # get orgered vpmems based on flavor extra_specs diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py index 89ef74865b..358ae6f6a6 100644 --- a/nova/tests/unit/virt/libvirt/test_host.py +++ b/nova/tests/unit/virt/libvirt/test_host.py @@ -1387,8 +1387,9 @@ class LibvirtTpoolProxyTestCase(test.NoDBTestCase): self.assertIn(fakelibvirt.virNodeDevice, proxy_classes) self.assertIn(fakelibvirt.virSecret, proxy_classes) - # Assert that we filtered out libvirtError + # Assert that we filtered out libvirtError and any private classes self.assertNotIn(fakelibvirt.libvirtError, proxy_classes) + self.assertNotIn(fakelibvirt._EventAddHandleFunc, proxy_classes) def test_tpool_get_connection(self): # Test that Host.get_connection() returns a tpool.Proxy diff --git a/nova/tests/unit/virt/libvirt/test_migration.py b/nova/tests/unit/virt/libvirt/test_migration.py index d4fba48397..37bc6ad4a4 100644 --- a/nova/tests/unit/virt/libvirt/test_migration.py +++ b/nova/tests/unit/virt/libvirt/test_migration.py @@ -991,7 +991,48 @@ class UtilityMigrationTestCase(test.NoDBTestCase): doc = etree.fromstring(original_xml) ex = self.assertRaises(KeyError, migration._update_vif_xml, doc, data, get_vif_config) - self.assertIn("CA:FE:DE:AD:BE:EF", six.text_type(ex)) + self.assertIn("ca:fe:de:ad:be:ef", six.text_type(ex)) + + def test_update_vif_xml_lower_case_mac(self): + """Tests that the vif in the migrate data is not found in the existing + guest interfaces. 
+ """ + conf = vconfig.LibvirtConfigGuestInterface() + conf.net_type = "bridge" + conf.source_dev = "qbra188171c-ea" + conf.target_dev = "tapa188171c-ea" + conf.mac_addr = "DE:AD:BE:EF:CA:FE" + conf.model = "virtio" + original_xml = """<domain> + <uuid>3de6550a-8596-4937-8046-9d862036bca5</uuid> + <devices> + <interface type="bridge"> + <mac address="de:ad:be:ef:ca:fe"/> + <model type="virtio"/> + <source bridge="qbra188171c-ea"/> + <target dev="tapa188171c-ea"/> + <virtualport type="openvswitch"> + <parameters interfaceid="%s"/> + </virtualport> + <address type='pci' domain='0x0000' bus='0x00' slot='0x04' + function='0x0'/> + </interface> + </devices> + </domain>""" % uuids.ovs + expected_xml = """<domain> + <uuid>3de6550a-8596-4937-8046-9d862036bca5</uuid> + <devices> + <interface type="bridge"> + <mac address="DE:AD:BE:EF:CA:FE"/> + <model type="virtio"/> + <source bridge="qbra188171c-ea"/> + <target dev="tapa188171c-ea"/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x04' + function='0x0'/> + </interface> + </devices> + </domain>""" + self._test_update_vif_xml(conf, original_xml, expected_xml) class MigrationMonitorTestCase(test.NoDBTestCase): diff --git a/nova/tests/unit/virt/libvirt/test_vif.py b/nova/tests/unit/virt/libvirt/test_vif.py index 1bfc532da6..24df5d5aa7 100644 --- a/nova/tests/unit/virt/libvirt/test_vif.py +++ b/nova/tests/unit/virt/libvirt/test_vif.py @@ -379,6 +379,10 @@ class LibvirtVifTestCase(test.NoDBTestCase): uuid='f0000000-0000-0000-0000-000000000001', project_id=723) + flavor_1vcpu = objects.Flavor(vcpus=1, memory=512, root_gb=1) + + flavor_2vcpu = objects.Flavor(vcpus=2, memory=512, root_gb=1) + bandwidth = { 'quota:vif_inbound_peak': '200', 'quota:vif_outbound_peak': '20', @@ -556,6 +560,12 @@ class LibvirtVifTestCase(test.NoDBTestCase): pci_slot_want = vif['profile']['pci_slot'] self.assertEqual(pci_slot, pci_slot_want) + def _assertQueueSizeEquals(self, node, rx_want, tx_want): + rx_queue_size = node.find("driver").get("rx_queue_size") + tx_queue_size = node.find("driver").get("tx_queue_size") + self.assertEqual(rx_queue_size, rx_want) + self.assertEqual(tx_queue_size, tx_want) + def _assertXmlEqual(self, expectedXmlstr, actualXmlstr): if not isinstance(actualXmlstr, six.string_types): actualXmlstr = etree.tostring(actualXmlstr, encoding='unicode', @@ -1057,30 +1067,50 @@ class LibvirtVifTestCase(test.NoDBTestCase): @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) @mock.patch('nova.privsep.linux_net.set_device_mtu') @mock.patch('nova.privsep.linux_net.create_tap_dev') - def test_plug_tap_kvm_virtio(self, mock_create_tap_dev, mock_set_mtu, - mock_device_exists): + def test_plug_tap_kvm_virtio( + self, mock_create_tap_dev, mock_set_mtu, mock_device_exists): d1 = vif.LibvirtGenericVIFDriver() ins = objects.Instance( id=1, uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={} ) d1.plug(ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', None, - multiqueue=False) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) mock_create_tap_dev.reset_mock() d2 = vif.LibvirtGenericVIFDriver() mq_ins = objects.Instance( id=1, uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={ 'image_hw_vif_multiqueue_enabled': 'True' } ) d2.plug(mq_ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', None, - multiqueue=True) + 
mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=True) + + @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) + @mock.patch('nova.privsep.linux_net.set_device_mtu') + @mock.patch('nova.privsep.linux_net.create_tap_dev') + def test_plug_tap_mq_ignored_1vcpu( + self, mock_create_tap_dev, mock_set_mtu, mock_device_exists): + + d1 = vif.LibvirtGenericVIFDriver() + mq_ins = objects.Instance( + id=1, uuid='f0000000-0000-0000-0000-000000000001', + image_ref=uuids.image_ref, flavor=self.flavor_1vcpu, + project_id=723, system_metadata={ + 'image_hw_vif_multiqueue_enabled': 'True', + } + ) + d1.plug(mq_ins, self.vif_tap) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) @mock.patch('nova.privsep.linux_net.set_device_mtu') @@ -1095,14 +1125,14 @@ class LibvirtVifTestCase(test.NoDBTestCase): d1 = vif.LibvirtGenericVIFDriver() ins = objects.Instance( id=1, uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={ 'image_hw_vif_multiqueue_enabled': 'True' } ) d1.plug(ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', - None, - multiqueue=False) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) @mock.patch('nova.privsep.linux_net.set_device_mtu') @@ -1113,15 +1143,15 @@ class LibvirtVifTestCase(test.NoDBTestCase): d1 = vif.LibvirtGenericVIFDriver() ins = objects.Instance( id=1, uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={ 'image_hw_vif_multiqueue_enabled': 'True', 'image_hw_vif_model': 'e1000', } ) d1.plug(ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', - None, - multiqueue=False) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) def test_unplug_tap(self): d = vif.LibvirtGenericVIFDriver() @@ -1296,24 +1326,8 @@ class LibvirtVifTestCase(test.NoDBTestCase): self.flags(tx_queue_size=1024, group='libvirt') d = vif.LibvirtGenericVIFDriver() xml = self._get_instance_xml(d, self.vif_vhostuser) - self._assertXmlEqual(""" - <domain type="qemu"> - <uuid>fake-uuid</uuid> - <name>fake-name</name> - <memory>102400</memory> - <vcpu>4</vcpu> - <os> - <type>None</type> - </os> - <devices> - <interface type="vhostuser"> - <mac address="ca:fe:de:ad:be:ef"/> - <model type="virtio"/> - <driver rx_queue_size="512" tx_queue_size="1024"/> - <source mode="client" path="/tmp/vif-xxx-yyy-zzz" type="unix"/> - </interface> - </devices> - </domain>""", xml) + node = self._get_node(xml) + self._assertQueueSizeEquals(node, "512", "1024") def test_vhostuser_driver_no_path(self): d = vif.LibvirtGenericVIFDriver() @@ -1554,6 +1568,7 @@ class LibvirtVifTestCase(test.NoDBTestCase): <model type="virtio"/> <source mode="client" path="/var/run/openvswitch/vhudc065497-3c" type="unix"/> + <target dev="vhudc065497-3c"/> </interface>""" self._test_config_os_vif(os_vif_type, vif_type, expected_xml) @@ -1591,6 +1606,7 @@ class LibvirtVifTestCase(test.NoDBTestCase): <model type="virtio"/> <source mode="client" path="/var/run/openvswitch/vhudc065497-3c" type="unix"/> + <target dev="nicdc065497-3c"/> </interface>""" self._test_config_os_vif(os_vif_type, vif_type, expected_xml) diff --git a/nova/utils.py b/nova/utils.py index 0a40fa6ffc..ec5e3e86dc 100644 --- 
a/nova/utils.py +++ b/nova/utils.py @@ -1053,3 +1053,99 @@ def normalize_rc_name(rc_name): norm_name = norm_name.upper() norm_name = orc.CUSTOM_NAMESPACE + norm_name return norm_name + + +def raise_if_old_compute(): + # to avoid circular imports + from nova import context as nova_context + from nova.objects import service + + ctxt = nova_context.get_admin_context() + + if CONF.api_database.connection is not None: + scope = 'system' + try: + current_service_version = service.get_minimum_version_all_cells( + ctxt, ['nova-compute']) + except exception.DBNotAllowed: + # This most likely means we are in a nova-compute service + # which is configured with a connection to the API database. + # We should not be attempting to "get out" of our cell to + # look at the minimum versions of nova-compute services in other + # cells, so DBNotAllowed was raised. Leave a warning message + # and fall back to only querying computes in our cell. + LOG.warning( + 'This service is configured for access to the API database ' + 'but is not allowed to directly access the database. You ' + 'should run this service without the ' + '[api_database]/connection config option. The service version ' + 'check will only query the local cell.') + scope = 'cell' + current_service_version = service.Service.get_minimum_version( + ctxt, 'nova-compute') + else: + scope = 'cell' + # We in a cell so target our query to the current cell only + current_service_version = service.Service.get_minimum_version( + ctxt, 'nova-compute') + + if current_service_version == 0: + # 0 means no compute in the system, + # probably a fresh install before the computes are registered + return + + oldest_supported_service_level = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] + + if current_service_version < oldest_supported_service_level: + raise exception.TooOldComputeService( + oldest_supported_version=service.OLDEST_SUPPORTED_SERVICE_VERSION, + scope=scope, + min_service_level=current_service_version, + oldest_supported_service=oldest_supported_service_level) + + +def run_once(message, logger, cleanup=None): + """This is a utility function decorator to ensure a function + is run once and only once in an interpreter instance. + + Note: this is copied from the placement repo (placement/util.py) + + The decorated function object can be reset by calling its + reset function. All exceptions raised by the wrapped function, + logger and cleanup function will be propagated to the caller. + """ + def outer_wrapper(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not wrapper.called: + # Note(sean-k-mooney): the called state is always + # updated even if the wrapped function completes + # by raising an exception. If the caller catches + # the exception it is their responsibility to call + # reset if they want to re-execute the wrapped function. + try: + return func(*args, **kwargs) + finally: + wrapper.called = True + else: + logger(message) + + wrapper.called = False + + def reset(wrapper, *args, **kwargs): + # Note(sean-k-mooney): we conditionally call the + # cleanup function if one is provided only when the + # wrapped function has been called previously. We catch + # and reraise any exception that may be raised and update + # the called state in a finally block to ensure its + # always updated if reset is called. 
+ try: + if cleanup and wrapper.called: + return cleanup(*args, **kwargs) + finally: + wrapper.called = False + + wrapper.reset = functools.partial(reset, wrapper) + return wrapper + return outer_wrapper diff --git a/nova/virt/disk/api.py b/nova/virt/disk/api.py index 79d629f283..9902c0608b 100644 --- a/nova/virt/disk/api.py +++ b/nova/virt/disk/api.py @@ -615,8 +615,8 @@ def _set_passwd(username, admin_passwd, passwd_data, shadow_data): :param username: the username :param admin_passwd: the admin password - :param passwd_data: path to the passwd file - :param shadow_data: path to the shadow password file + :param passwd_data: Data from the passwd file decoded as a string + :param shadow_data: Data from the shadow file decoded as a string :returns: nothing :raises: exception.NovaException(), IOError() diff --git a/nova/virt/disk/vfs/guestfs.py b/nova/virt/disk/vfs/guestfs.py index db260d9a4a..ce5f48794a 100644 --- a/nova/virt/disk/vfs/guestfs.py +++ b/nova/virt/disk/vfs/guestfs.py @@ -308,7 +308,14 @@ class VFSGuestFS(vfs.VFS): def read_file(self, path): LOG.debug("Read file path=%s", path) path = self._canonicalize_path(path) - return self.handle.read_file(path) + data = self.handle.read_file(path) + # NOTE(lyarwood): libguestfs v1.41.1 (0ee02e0117527) switched the + # return type of read_file from string to bytes and as such we need to + # handle both here, decoding and returning a string if bytes is + # provided. https://bugzilla.redhat.com/show_bug.cgi?id=1661871 + if isinstance(data, bytes): + return data.decode() + return data def has_file(self, path): LOG.debug("Has file path=%s", path) diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py index 7cfdb4218b..ea525648b3 100644 --- a/nova/virt/libvirt/config.py +++ b/nova/virt/libvirt/config.py @@ -674,11 +674,13 @@ class LibvirtConfigCPUFeature(LibvirtConfigObject): **kwargs) self.name = name + self.policy = "require" def parse_dom(self, xmldoc): super(LibvirtConfigCPUFeature, self).parse_dom(xmldoc) self.name = xmldoc.get("name") + self.policy = xmldoc.get("policy", "require") def format_dom(self): ft = super(LibvirtConfigCPUFeature, self).format_dom() @@ -730,7 +732,8 @@ class LibvirtConfigCPU(LibvirtConfigObject): elif c.tag == "feature": f = LibvirtConfigCPUFeature() f.parse_dom(c) - self.add_feature(f) + if f.policy != "disable": + self.add_feature(f) def format_dom(self): cpu = super(LibvirtConfigCPU, self).format_dom() @@ -753,7 +756,8 @@ class LibvirtConfigCPU(LibvirtConfigObject): # sorting the features to allow more predictable tests for f in sorted(self.features, key=lambda x: x.name): - cpu.append(f.format_dom()) + if f.policy != "disable": + cpu.append(f.format_dom()) return cpu diff --git a/nova/virt/libvirt/designer.py b/nova/virt/libvirt/designer.py index c18d104e6b..3677ed5280 100644 --- a/nova/virt/libvirt/designer.py +++ b/nova/virt/libvirt/designer.py @@ -133,7 +133,7 @@ def set_vif_host_backend_direct_config(conf, devname, mode="passthrough"): def set_vif_host_backend_vhostuser_config(conf, mode, path, rx_queue_size, - tx_queue_size): + tx_queue_size, tapname=None): """Populate a LibvirtConfigGuestInterface instance with host backend details for vhostuser socket. 
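For context on the ``run_once`` decorator added to ``nova/utils.py`` above, a minimal usage sketch follows; the function and logger names are illustrative assumptions only, and the sketch simply assumes ``nova.utils`` is importable once this change is applied::

    import logging

    from nova import utils

    LOG = logging.getLogger(__name__)


    @utils.run_once('my_setup has already run in this process', LOG.warning)
    def my_setup():
        # Expensive one-time initialisation would go here.
        LOG.info('running setup')


    my_setup()        # executes the body and marks it as called
    my_setup()        # body is skipped; LOG.warning(message) is emitted
    my_setup.reset()  # clears the guard (and runs cleanup, if one was given)
    my_setup()        # executes the body again

The second and later calls only log the supplied message; calling ``reset()`` re-arms the decorator, which is how callers are expected to re-execute the wrapped function after handling an error.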
@@ -147,6 +147,8 @@ def set_vif_host_backend_vhostuser_config(conf, mode, path, rx_queue_size, conf.vhost_rx_queue_size = rx_queue_size if tx_queue_size: conf.vhost_tx_queue_size = tx_queue_size + if tapname: + conf.target_dev = tapname def set_vif_mtu_config(conf, mtu): diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 7860e43110..8242065c89 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -1791,6 +1791,17 @@ class LibvirtDriver(driver.ComputeDriver): to determine if an attempt to attach the encryptor should be made. """ + # NOTE(lyarwood): Skip any attempt to fetch encryption metadata or the + # actual passphrase from the key manager if a libvirt secert already + # exists locally for the volume. This suggests that the instance was + # only powered off or the underlying host rebooted. + volume_id = driver_block_device.get_volume_id(connection_info) + if self._host.find_secret('volume', volume_id): + LOG.debug("A libvirt secret for volume %s has been found on the " + "host, skipping any attempt to create another or attach " + "an os-brick encryptor.", volume_id) + return + if encryption is None: encryption = self._get_volume_encryption(context, connection_info) @@ -1822,7 +1833,6 @@ class LibvirtDriver(driver.ComputeDriver): # NOTE(lyarwood): Store the passphrase as a libvirt secret locally # on the compute node. This secret is used later when generating # the volume config. - volume_id = driver_block_device.get_volume_id(connection_info) self._host.create_secret('volume', volume_id, password=passphrase) elif encryption: encryptor = self._get_volume_encryptor(connection_info, @@ -2150,7 +2160,11 @@ class LibvirtDriver(driver.ComputeDriver): if 'device_path' in connection_info['data']: path = connection_info['data']['device_path'] elif connection_info['driver_volume_type'] == 'rbd': - path = 'rbd:%s' % (connection_info['data']['name']) + volume_name = connection_info['data']['name'] + path = f"rbd:{volume_name}" + if connection_info['data'].get('auth_enabled'): + username = connection_info['data']['auth_username'] + path = f"rbd:{volume_name}:id={username}" else: path = 'unknown' raise exception.DiskNotFound(location='unknown') @@ -3008,7 +3022,16 @@ class LibvirtDriver(driver.ComputeDriver): # If the rebased image is going to have a backing file then # explicitly set the backing file format to avoid any security # concerns related to file format auto detection. - backing_file = rebase_base + if os.path.isabs(rebase_base): + backing_file = rebase_base + else: + # this is a probably a volume snapshot case where the + # rebase_base is relative. See bug + # https://bugs.launchpad.net/nova/+bug/1885528 + backing_file_name = os.path.basename(rebase_base) + volume_path = os.path.dirname(source_path) + backing_file = os.path.join(volume_path, backing_file_name) + b_file_fmt = images.qemu_img_info(backing_file).file_format qemu_img_extra_arg = ['-F', b_file_fmt] @@ -3360,11 +3383,32 @@ class LibvirtDriver(driver.ComputeDriver): # on which vif type we're using and we are working with a stale network # info cache here, so won't rely on waiting for neutron plug events. # vifs_already_plugged=True means "do not wait for neutron plug events" + external_events = [] + vifs_already_plugged = True + event_expected_for_vnic_types = ( + CONF.workarounds.wait_for_vif_plugged_event_during_hard_reboot) + if event_expected_for_vnic_types: + # NOTE(gibi): We unplugged every vif during destroy above and we + # will replug them with _create_guest_with_network. 
As the + # workaround config has some vnic_types configured we expect + # vif-plugged events for every vif with those vnic_types. + # TODO(gibi): only wait for events if we know that the networking + # backend sends plug time events. For that we need to finish + # https://bugs.launchpad.net/neutron/+bug/1821058 first in Neutron + # then create a driver -> plug-time event mapping in nova. + external_events = [ + ('network-vif-plugged', vif['id']) + for vif in network_info + if vif['vnic_type'] in event_expected_for_vnic_types + ] + vifs_already_plugged = False + # NOTE(efried): The instance should already have a vtpm_secret_uuid # registered if appropriate. self._create_guest_with_network( context, xml, instance, network_info, block_device_info, - vifs_already_plugged=True) + vifs_already_plugged=vifs_already_plugged, + external_events=external_events) self._prepare_pci_devices_for_use( pci_manager.get_instance_pci_devs(instance, 'all')) @@ -5522,7 +5566,7 @@ class LibvirtDriver(driver.ComputeDriver): raise exception.RequestedVRamTooHigh(req_vram=video_ram, max_vram=max_vram) if max_vram and video_ram: - video.vram = video_ram * units.Mi / units.Ki + video.vram = video_ram * units.Mi // units.Ki guest.add_device(video) # NOTE(sean-k-mooney): return the video device we added @@ -6871,6 +6915,11 @@ class LibvirtDriver(driver.ComputeDriver): if not CONF.devices.enabled_vgpu_types: return [] + # Make sure we register all the types as the compute service could + # be calling this method before init_host() + if len(CONF.devices.enabled_vgpu_types) > 1: + nova.conf.devices.register_dynamic_opts(CONF) + for vgpu_type in CONF.devices.enabled_vgpu_types: group = getattr(CONF, 'vgpu_%s' % vgpu_type, None) if group is None or not group.device_addresses: @@ -7787,16 +7836,17 @@ class LibvirtDriver(driver.ComputeDriver): resources: ty.Dict[str, ty.Set['objects.Resource']] = ( collections.defaultdict(set) ) - result = { - orc.MEMORY_MB: { + + result = {} + if memory_mb: + result[orc.MEMORY_MB] = { 'total': memory_mb, 'min_unit': 1, 'max_unit': memory_mb, 'step_size': 1, 'allocation_ratio': ratios[orc.MEMORY_MB], 'reserved': CONF.reserved_host_memory_mb, - }, - } + } # NOTE(stephenfin): We have to optionally report these since placement # forbids reporting inventory with total=0 @@ -7837,15 +7887,17 @@ class LibvirtDriver(driver.ComputeDriver): # compute RP once the issues from bug #1784020 have been resolved. if provider_tree.has_sharing_provider(orc.DISK_GB): LOG.debug('Ignoring sharing provider - see bug #1784020') - result[orc.DISK_GB] = { - 'total': disk_gb, - 'min_unit': 1, - 'max_unit': disk_gb, - 'step_size': 1, - 'allocation_ratio': ratios[orc.DISK_GB], - 'reserved': (self._get_reserved_host_disk_gb_from_config() + - self._get_disk_size_reserved_for_image_cache()), - } + + if disk_gb: + result[orc.DISK_GB] = { + 'total': disk_gb, + 'min_unit': 1, + 'max_unit': disk_gb, + 'step_size': 1, + 'allocation_ratio': ratios[orc.DISK_GB], + 'reserved': (self._get_reserved_host_disk_gb_from_config() + + self._get_disk_size_reserved_for_image_cache()), + } # TODO(sbauza): Use traits to providing vGPU types. For the moment, # it will be only documentation support by explaining to use @@ -7880,6 +7932,10 @@ class LibvirtDriver(driver.ComputeDriver): """Update resources and inventory for vpmems in provider tree.""" prov_data = provider_tree.data(nodename) for rc, vpmems in self._vpmems_by_rc.items(): + # Skip (and omit) inventories with total=0 because placement does + # not allow setting total=0 for inventory. 
+ if not len(vpmems): + continue inventory[rc] = { 'total': len(vpmems), 'max_unit': len(vpmems), @@ -7992,6 +8048,10 @@ class LibvirtDriver(driver.ComputeDriver): # Dict of PGPU RPs keyed by their libvirt PCI name pgpu_rps = {} for pgpu_dev_id, inventory in inventories_dict.items(): + # Skip (and omit) inventories with total=0 because placement does + # not allow setting total=0 for inventory. + if not inventory['total']: + continue # For each physical GPU, we make sure to have a child provider pgpu_rp_name = '%s_%s' % (nodename, pgpu_dev_id) if not provider_tree.exists(pgpu_rp_name): diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py index eb0ca35618..dae4759263 100644 --- a/nova/virt/libvirt/guest.py +++ b/nova/virt/libvirt/guest.py @@ -367,8 +367,8 @@ class Guest(object): return devs def detach_device_with_retry(self, get_device_conf_func, device, live, - max_retry_count=7, inc_sleep_time=2, - max_sleep_time=30, + max_retry_count=7, inc_sleep_time=10, + max_sleep_time=60, alternative_device_name=None, supports_device_missing_error_code=False): """Detaches a device from the guest. After the initial detach request, diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py index 4d0c38c999..a56b4395e6 100644 --- a/nova/virt/libvirt/host.py +++ b/nova/virt/libvirt/host.py @@ -127,15 +127,15 @@ class Host(object): @staticmethod def _get_libvirt_proxy_classes(libvirt_module): """Return a tuple for tpool.Proxy's autowrap argument containing all - classes defined by the libvirt module except libvirtError. + public vir* classes defined by the libvirt module. """ # Get a list of (name, class) tuples of libvirt classes classes = inspect.getmembers(libvirt_module, inspect.isclass) - # Return a list of just the classes, filtering out libvirtError because - # we don't need to proxy that - return tuple([cls[1] for cls in classes if cls[0] != 'libvirtError']) + # Return a list of just the vir* classes, filtering out libvirtError + # and any private globals pointing at private internal classes. + return tuple([cls[1] for cls in classes if cls[0].startswith("vir")]) def _wrap_libvirt_proxy(self, obj): """Return an object wrapped in a tpool.Proxy using autowrap appropriate diff --git a/nova/virt/libvirt/migration.py b/nova/virt/libvirt/migration.py index 9543adda7a..e601e46a9b 100644 --- a/nova/virt/libvirt/migration.py +++ b/nova/virt/libvirt/migration.py @@ -344,14 +344,21 @@ def _update_vif_xml(xml_doc, migrate_data, get_vif_config): instance_uuid = xml_doc.findtext('uuid') parser = etree.XMLParser(remove_blank_text=True) interface_nodes = xml_doc.findall('./devices/interface') - migrate_vif_by_mac = {vif.source_vif['address']: vif + # MAC address stored for port in neutron DB and in domain XML + # might be in different cases, so to harmonize that + # we convert MAC to lower case for dict key. + migrate_vif_by_mac = {vif.source_vif['address'].lower(): vif for vif in migrate_data.vifs} for interface_dev in interface_nodes: mac = interface_dev.find('mac') mac = mac if mac is not None else {} mac_addr = mac.get('address') if mac_addr: - migrate_vif = migrate_vif_by_mac[mac_addr] + # MAC address stored in libvirt should always be normalized + # and stored in lower case. But just to be extra safe here + # we still normalize MAC retrieved from XML to be absolutely + # sure it will be the same with the Neutron provided one. 
+ migrate_vif = migrate_vif_by_mac[mac_addr.lower()] vif = migrate_vif.get_dest_vif() # get_vif_config is a partial function of # nova.virt.libvirt.vif.LibvirtGenericVIFDriver.get_config diff --git a/nova/virt/libvirt/vif.py b/nova/virt/libvirt/vif.py index bf332a6b85..67e0771ad9 100644 --- a/nova/virt/libvirt/vif.py +++ b/nova/virt/libvirt/vif.py @@ -462,7 +462,7 @@ class LibvirtGenericVIFDriver(object): designer.set_vif_host_backend_vhostuser_config( conf, vif.mode, vif.path, CONF.libvirt.rx_queue_size, - CONF.libvirt.tx_queue_size) + CONF.libvirt.tx_queue_size, vif.vif_name) def _set_config_VIFHostDevice(self, instance, vif, conf): if vif.dev_type == osv_fields.VIFHostDeviceDevType.ETHERNET: @@ -666,7 +666,8 @@ class LibvirtGenericVIFDriver(object): vif_model = self.get_vif_model(image_meta=image_meta) # TODO(ganso): explore whether multiqueue works for other vif models # that go through this code path. - multiqueue = (self._requests_multiqueue(image_meta) and + multiqueue = (instance.get_flavor().vcpus > 1 and + self._requests_multiqueue(image_meta) and vif_model == network_model.VIF_MODEL_VIRTIO) nova.privsep.linux_net.create_tap_dev(dev, mac, multiqueue=multiqueue) network = vif.get('network') diff --git a/playbooks/ceph/glance-copy-policy.yaml b/playbooks/ceph/glance-copy-policy.yaml index 3e9f7f0201..2b9d25fc50 100644 --- a/playbooks/ceph/glance-copy-policy.yaml +++ b/playbooks/ceph/glance-copy-policy.yaml @@ -7,4 +7,8 @@ create: True mode: 0777 block: | - echo $'{"copy_image": "\'public\':%(visibility)s"}' > /etc/glance/policy.json + # This policy defaults to admin only in glance. Override + # it here to allow everyone to copy every type of image + # (private or public). This way we can test image copy as a + # non-admin as well as on private images.
+ echo $'{"copy_image": ""}' > /etc/glance/policy.json diff --git a/playbooks/legacy/nova-grenade-multinode/post.yaml b/playbooks/legacy/nova-grenade-multinode/post.yaml deleted file mode 100644 index e07f5510ae..0000000000 --- a/playbooks/legacy/nova-grenade-multinode/post.yaml +++ /dev/null @@ -1,15 +0,0 @@ -- hosts: primary - tasks: - - - name: Copy files from {{ ansible_user_dir }}/workspace/ on node - synchronize: - src: '{{ ansible_user_dir }}/workspace/' - dest: '{{ zuul.executor.log_root }}' - mode: pull - copy_links: true - verify_host: true - rsync_opts: - - --include=/logs/** - - --include=*/ - - --exclude=* - - --prune-empty-dirs diff --git a/playbooks/legacy/nova-grenade-multinode/run.yaml b/playbooks/legacy/nova-grenade-multinode/run.yaml deleted file mode 100644 index 18f7c753eb..0000000000 --- a/playbooks/legacy/nova-grenade-multinode/run.yaml +++ /dev/null @@ -1,65 +0,0 @@ -- hosts: primary - name: nova-grenade-multinode - tasks: - - - name: Ensure legacy workspace directory - file: - path: '{{ ansible_user_dir }}/workspace' - state: directory - - - shell: - cmd: | - set -e - set -x - cat > clonemap.yaml << EOF - clonemap: - - name: openstack/devstack-gate - dest: devstack-gate - EOF - /usr/zuul-env/bin/zuul-cloner -m clonemap.yaml --cache-dir /opt/git \ - https://opendev.org \ - openstack/devstack-gate - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - shell: - cmd: | - set -e - set -x - export PROJECTS="openstack/grenade $PROJECTS" - export PYTHONUNBUFFERED=true - export DEVSTACK_GATE_CONFIGDRIVE=0 - export DEVSTACK_GATE_NEUTRON=1 - # NOTE(mriedem): Run tempest smoke tests specific to compute on the - # new side of the grenade environment. The post-test hook script will - # run non-smoke migration tests in a local/block and shared/ceph - # setup. Note that grenade hard-codes "tox -esmoke" for tempest on - # the old side so the regex is not appied there. - export DEVSTACK_GATE_TEMPEST=1 - export DEVSTACK_GATE_TEMPEST_REGEX="tempest\.(api\.compute|scenario)\..*smoke.*" - export DEVSTACK_GATE_GRENADE=pullup - export DEVSTACK_GATE_USE_PYTHON3=True - # By default grenade runs only smoke tests so we need to set - # RUN_SMOKE to False in order to run live migration tests using - # grenade - export DEVSTACK_LOCAL_CONFIG="RUN_SMOKE=False" - # LIVE_MIGRATE_BACK_AND_FORTH will tell Tempest to run a live - # migration of the same instance to one compute node and then back - # to the other, which is mostly only interesting for grenade since - # we have mixed level computes. 
- export DEVSTACK_LOCAL_CONFIG+=$'\n'"LIVE_MIGRATE_BACK_AND_FORTH=True" - export BRANCH_OVERRIDE=default - export DEVSTACK_GATE_TOPOLOGY="multinode" - if [ "$BRANCH_OVERRIDE" != "default" ] ; then - export OVERRIDE_ZUUL_BRANCH=$BRANCH_OVERRIDE - fi - function post_test_hook { - /opt/stack/new/nova/gate/live_migration/hooks/run_tests.sh - } - export -f post_test_hook - cp devstack-gate/devstack-vm-gate-wrap.sh ./safe-devstack-vm-gate-wrap.sh - ./safe-devstack-vm-gate-wrap.sh - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' diff --git a/playbooks/legacy/nova-live-migration/post.yaml b/playbooks/legacy/nova-live-migration/post.yaml deleted file mode 100644 index e07f5510ae..0000000000 --- a/playbooks/legacy/nova-live-migration/post.yaml +++ /dev/null @@ -1,15 +0,0 @@ -- hosts: primary - tasks: - - - name: Copy files from {{ ansible_user_dir }}/workspace/ on node - synchronize: - src: '{{ ansible_user_dir }}/workspace/' - dest: '{{ zuul.executor.log_root }}' - mode: pull - copy_links: true - verify_host: true - rsync_opts: - - --include=/logs/** - - --include=*/ - - --exclude=* - - --prune-empty-dirs diff --git a/playbooks/legacy/nova-live-migration/run.yaml b/playbooks/legacy/nova-live-migration/run.yaml deleted file mode 100644 index ef8853135b..0000000000 --- a/playbooks/legacy/nova-live-migration/run.yaml +++ /dev/null @@ -1,60 +0,0 @@ -- hosts: primary - name: nova-live-migration - tasks: - - - name: Ensure legacy workspace directory - file: - path: '{{ ansible_user_dir }}/workspace' - state: directory - - - shell: - cmd: | - set -e - set -x - cat > clonemap.yaml << EOF - clonemap: - - name: openstack/devstack-gate - dest: devstack-gate - EOF - /usr/zuul-env/bin/zuul-cloner -m clonemap.yaml --cache-dir /opt/git \ - https://opendev.org \ - openstack/devstack-gate - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - name: Configure devstack - shell: - # Force config drive. 
- cmd: | - set -e - set -x - cat << 'EOF' >>"/tmp/dg-local.conf" - [[local|localrc]] - FORCE_CONFIG_DRIVE=True - - EOF - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - shell: - cmd: | - set -e - set -x - export PYTHONUNBUFFERED=true - export DEVSTACK_GATE_CONFIGDRIVE=0 - export DEVSTACK_GATE_TEMPEST=1 - export DEVSTACK_GATE_TEMPEST_NOTESTS=1 - export DEVSTACK_GATE_TOPOLOGY="multinode" - export DEVSTACK_GATE_USE_PYTHON3=True - function post_test_hook { - /opt/stack/new/nova/gate/live_migration/hooks/run_tests.sh - $BASE/new/nova/gate/test_evacuate.sh - } - export -f post_test_hook - cp devstack-gate/devstack-vm-gate-wrap.sh ./safe-devstack-vm-gate-wrap.sh - ./safe-devstack-vm-gate-wrap.sh - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' diff --git a/playbooks/nova-evacuate/run.yaml b/playbooks/nova-evacuate/run.yaml new file mode 100644 index 0000000000..35e330a6de --- /dev/null +++ b/playbooks/nova-evacuate/run.yaml @@ -0,0 +1,8 @@ +--- +- hosts: all + roles: + - orchestrate-devstack + +- hosts: controller + roles: + - run-evacuate-hook diff --git a/playbooks/nova-live-migration/post-run.yaml b/playbooks/nova-live-migration/post-run.yaml new file mode 100644 index 0000000000..845a1b15b2 --- /dev/null +++ b/playbooks/nova-live-migration/post-run.yaml @@ -0,0 +1,10 @@ +--- +- hosts: tempest + become: true + roles: + - role: fetch-subunit-output + zuul_work_dir: '{{ devstack_base_dir }}/tempest' + - role: process-stackviz +- hosts: controller + roles: + - run-evacuate-hook diff --git a/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml new file mode 100644 index 0000000000..4c6135311b --- /dev/null +++ b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Improved detection of anti-affinity policy violations when performing live + and cold migrations. Most of the violations caused by race conditions due + to performing concurrent live or cold migrations should now be addressed + by extra checks in the compute service. Upon detection, cold migration + operations are automatically rescheduled. Live migrations have two + checks: if the first check detects a violation the migration is + rescheduled, otherwise the live migration fails cleanly and the instance + state is reverted to its previous value. diff --git a/releasenotes/notes/bug-1892361-pci-deivce-type-update-c407a66fd37f6405.yaml b/releasenotes/notes/bug-1892361-pci-deivce-type-update-c407a66fd37f6405.yaml new file mode 100644 index 0000000000..ba35c25b02 --- /dev/null +++ b/releasenotes/notes/bug-1892361-pci-deivce-type-update-c407a66fd37f6405.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + Fixes `bug 1892361`_ in which the pci stat pools are not updated when an + existing device is enabled with SRIOV capability. A restart of the + nova-compute service updates the pci device type from type-PCI to type-PF + but the pools still maintain the device type as type-PCI, so the PF is + considered for allocation to an instance that requests vnic_type=direct. + With this fix, the pci device type updates are detected and the pci stat + pools are updated properly. + + ..
_bug 1892361: https://bugs.launchpad.net/nova/+bug/1892361 diff --git a/releasenotes/notes/bug-1911924-6e93d8a5038d18c1.yaml b/releasenotes/notes/bug-1911924-6e93d8a5038d18c1.yaml new file mode 100644 index 0000000000..ff27f936aa --- /dev/null +++ b/releasenotes/notes/bug-1911924-6e93d8a5038d18c1.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + The `os-resetState`_ API will now log an instance action when called. The + resulting instance action is visible via the `os-instance-actions`_ API + to users and admins, resolving `bug 1911924`_. + + .. _bug 1911924: https://launchpad.net/bugs/1911924 + .. _os-instance-actions: https://docs.openstack.org/api-ref/compute/?expanded=reset-server-state-os-resetstate-action-detail,list-actions-for-server-detail + .. _os-resetState: https://docs.openstack.org/api-ref/compute/?expanded=reset-server-state-os-resetstate-action-detail diff --git a/releasenotes/notes/bug-1939604-547c729b7741831b.yaml b/releasenotes/notes/bug-1939604-547c729b7741831b.yaml new file mode 100644 index 0000000000..e14327c285 --- /dev/null +++ b/releasenotes/notes/bug-1939604-547c729b7741831b.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Addressed an issue that prevented instances with 1 vcpu using the multiqueue + feature from being created successfully when their vif_type is TAP. diff --git a/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml b/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml new file mode 100644 index 0000000000..c3686a9978 --- /dev/null +++ b/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml @@ -0,0 +1,18 @@ +--- +issues: + - | + The libvirt virt driver in Nova implements power on and hard reboot by + destroying the domain first and unplugging the vifs, then recreating the + domain and replugging the vifs. However nova does not wait for the + network-vif-plugged event before unpausing the domain. This can cause + the domain to start running and requesting an IP via DHCP before the + networking backend has finished plugging the vifs. The config option + [workarounds]wait_for_vif_plugged_event_during_hard_reboot has been added, + defaulting to an empty list, which can be used to ensure that the libvirt + driver waits for the network-vif-plugged event for vifs with specific + ``vnic_type`` before it unpauses the domain during hard reboot. This should + only be used if the deployment uses a networking backend that sends such + events for the given ``vif_type`` at vif plug time. The ml2/ovs and the + networking-odl Neutron backends are known to send plug time events for + ports with ``normal`` ``vnic_type``. For more information see + https://bugs.launchpad.net/nova/+bug/1946729 diff --git a/releasenotes/notes/bug_1902925-351f563340a1e9a5.yaml b/releasenotes/notes/bug_1902925-351f563340a1e9a5.yaml new file mode 100644 index 0000000000..5e3fc7b9e0 --- /dev/null +++ b/releasenotes/notes/bug_1902925-351f563340a1e9a5.yaml @@ -0,0 +1,11 @@ +--- + +fixes: + - | + When upgrading compute services from Ussuri to Victoria one by one, the + Compute RPC API was pinned to 5.11 (either automatically or by using + the specific rpc version in the option) but when rebuilding an instance, + a TypeError was raised as an argument was not provided. This error is + now fixed; see `bug 1902925`_ for details. + + ..
_bug 1902925: https://bugs.launchpad.net/nova/+bug/1902925/ diff --git a/releasenotes/notes/bug_1905701-fdc7402ffe70d104.yaml b/releasenotes/notes/bug_1905701-fdc7402ffe70d104.yaml new file mode 100644 index 0000000000..b46936b14a --- /dev/null +++ b/releasenotes/notes/bug_1905701-fdc7402ffe70d104.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + The libvirt virt driver will no longer attempt to fetch volume + encryption metadata or the associated secret key when attaching ``LUKSv1`` + encrypted volumes if a libvirt secret already exists on the host. + + This resolves `bug 1905701`_ where instances with ``LUKSv1`` encrypted + volumes could not be restarted automatically by the ``nova-compute`` + service after a host reboot when the + ``[DEFAULT]/resume_guests_state_on_host_boot`` configurable was enabled. + + .. _bug 1905701: https://launchpad.net/bugs/1905701 diff --git a/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml b/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml new file mode 100644 index 0000000000..ce05b9a867 --- /dev/null +++ b/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml @@ -0,0 +1,19 @@ +--- +security: + - | + A vulnerability in the console proxies (novnc, serial, spice) that allowed + open redirection has been `patched`_. The novnc, serial, and spice console + proxies are implemented as websockify servers and the request handler + inherits from the python standard SimpleHTTPRequestHandler. There is a + `known issue`_ in the SimpleHTTPRequestHandler which allows open redirects + by way of URLs in the following format:: + + http://vncproxy.my.domain.com//example.com/%2F.. + + which if visited, will redirect a user to example.com. + + The novnc, serial, and spice console proxies will now reject requests that + pass a redirection URL beginning with "//" with a 400 Bad Request. + + .. _patched: https://bugs.launchpad.net/nova/+bug/1927677 + .. _known issue: https://bugs.python.org/issue32084 diff --git a/releasenotes/notes/cros-scell-resize-not-supported-with-ports-having-resource-request-a8e1029ef5983793.yaml b/releasenotes/notes/cros-scell-resize-not-supported-with-ports-having-resource-request-a8e1029ef5983793.yaml new file mode 100644 index 0000000000..31498a9d9c --- /dev/null +++ b/releasenotes/notes/cros-scell-resize-not-supported-with-ports-having-resource-request-a8e1029ef5983793.yaml @@ -0,0 +1,9 @@ +--- +issues: + - | + When the tempest test coverage was added for resize and cold migrate + with neutron ports having QoS minimum bandwidth policy rules we + discovered that the cross cell resize code path cannot handle such ports. + See bug https://bugs.launchpad.net/nova/+bug/1907522 for details. A fix + was implemented that makes sure that Nova falls back to same-cell resize if + the server has such ports. diff --git a/releasenotes/notes/warn-when-services-started-with-old-compute-fc80b4ff58a2aaea.yaml b/releasenotes/notes/warn-when-services-started-with-old-compute-fc80b4ff58a2aaea.yaml new file mode 100644 index 0000000000..f74128dc6e --- /dev/null +++ b/releasenotes/notes/warn-when-services-started-with-old-compute-fc80b4ff58a2aaea.yaml @@ -0,0 +1,9 @@ +--- +upgrade: + - | + Nova services only support old computes if the compute is not + older than the previous major nova release. From now on nova services will + emit a warning at startup if the deployment contains too old compute + services. 
From the 23.0.0 (Wallaby) release nova services will refuse to + start if the deployment contains too old compute services to prevent + compatibility issues. diff --git a/roles/run-evacuate-hook/README.rst b/roles/run-evacuate-hook/README.rst new file mode 100644 index 0000000000..e423455aee --- /dev/null +++ b/roles/run-evacuate-hook/README.rst @@ -0,0 +1 @@ +Run Nova evacuation tests against a multinode environment. diff --git a/roles/run-evacuate-hook/files/setup_evacuate_resources.sh b/roles/run-evacuate-hook/files/setup_evacuate_resources.sh new file mode 100755 index 0000000000..c8c385d7ff --- /dev/null +++ b/roles/run-evacuate-hook/files/setup_evacuate_resources.sh @@ -0,0 +1,34 @@ +#!/bin/bash +source /opt/stack/devstack/openrc admin +set -x +set -e + +image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}') +flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}') +network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}') + +echo "Creating ephemeral test server on subnode" +openstack --os-compute-api-version 2.74 server create --image ${image_id} --flavor ${flavor_id} \ +--nic net-id=${network_id} --host $SUBNODE_HOSTNAME --wait evacuate-test + +# TODO(lyarwood) Use osc to launch the bfv volume +echo "Creating boot from volume test server on subnode" +nova --os-compute-api-version 2.74 boot --flavor ${flavor_id} --poll \ +--block-device id=${image_id},source=image,dest=volume,size=1,bootindex=0,shutdown=remove \ +--nic net-id=${network_id} --host ${SUBNODE_HOSTNAME} evacuate-bfv-test + +echo "Forcing down the subnode so we can evacuate from it" +openstack --os-compute-api-version 2.11 compute service set --down ${SUBNODE_HOSTNAME} nova-compute + +count=0 +status=$(openstack compute service list --host ${SUBNODE_HOSTNAME} --service nova-compute -f value -c State) +while [ "${status}" != "down" ] +do + sleep 1 + count=$((count+1)) + if [ ${count} -eq 30 ]; then + echo "Timed out waiting for subnode compute service to be marked as down" + exit 5 + fi + status=$(openstack compute service list --host ${SUBNODE_HOSTNAME} --service nova-compute -f value -c State) +done diff --git a/roles/run-evacuate-hook/files/test_evacuate.sh b/roles/run-evacuate-hook/files/test_evacuate.sh new file mode 100755 index 0000000000..bdf8d92441 --- /dev/null +++ b/roles/run-evacuate-hook/files/test_evacuate.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Source tempest to determine the build timeout configuration. +source /opt/stack/devstack/lib/tempest +source /opt/stack/devstack/openrc admin +set -x +set -e + +# Wait for the controller compute service to be enabled. +count=0 +status=$(openstack compute service list --host ${CONTROLLER_HOSTNAME} --service nova-compute -f value -c Status) +while [ "${status}" != "enabled" ] +do + sleep 1 + count=$((count+1)) + if [ ${count} -eq 30 ]; then + echo "Timed out waiting for controller compute service to be enabled" + exit 5 + fi + status=$(openstack compute service list --host ${CONTROLLER_HOSTNAME} --service nova-compute -f value -c Status) +done + +function evacuate_and_wait_for_active() { + local server="$1" + + nova evacuate ${server} + # Wait for the instance to go into ACTIVE state from the evacuate. 
+ count=0 + status=$(openstack server show ${server} -f value -c status) + while [ "${status}" != "ACTIVE" ] + do + sleep 1 + count=$((count+1)) + if [ ${count} -eq ${BUILD_TIMEOUT} ]; then + echo "Timed out waiting for server ${server} to go to ACTIVE status" + exit 6 + fi + status=$(openstack server show ${server} -f value -c status) + done +} + +evacuate_and_wait_for_active evacuate-test +evacuate_and_wait_for_active evacuate-bfv-test + +# Make sure the servers moved. +for server in evacuate-test evacuate-bfv-test; do + host=$(openstack server show ${server} -f value -c OS-EXT-SRV-ATTR:host) + if [[ ${host} != ${CONTROLLER_HOSTNAME} ]]; then + echo "Unexpected host ${host} for server ${server} after evacuate." + exit 7 + fi +done + +# Cleanup test servers +openstack server delete --wait evacuate-test +openstack server delete --wait evacuate-bfv-test diff --git a/roles/run-evacuate-hook/files/test_negative_evacuate.sh b/roles/run-evacuate-hook/files/test_negative_evacuate.sh new file mode 100755 index 0000000000..b1f5f7a4af --- /dev/null +++ b/roles/run-evacuate-hook/files/test_negative_evacuate.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Source tempest to determine the build timeout configuration. +source /opt/stack/devstack/lib/tempest +source /opt/stack/devstack/openrc admin +set -x +set -e + +# Now force the evacuation to the controller; we have to force to bypass the +# scheduler since we killed libvirtd which will trigger the libvirt compute +# driver to auto-disable the nova-compute service and then the ComputeFilter +# would filter out this host and we'd get NoValidHost. Normally forcing a host +# during evacuate and bypassing the scheduler is a very bad idea, but we're +# doing a negative test here. + +function evacuate_and_wait_for_error() { + local server="$1" + + echo "Forcing evacuate of ${server} to local host" + # TODO(mriedem): Use OSC when it supports evacuate. + nova --os-compute-api-version "2.67" evacuate --force ${server} ${CONTROLLER_HOSTNAME} + # Wait for the instance to go into ERROR state from the failed evacuate. 
+ count=0 + status=$(openstack server show ${server} -f value -c status) + while [ "${status}" != "ERROR" ] + do + sleep 1 + count=$((count+1)) + if [ ${count} -eq ${BUILD_TIMEOUT} ]; then + echo "Timed out waiting for server ${server} to go to ERROR status" + exit 4 + fi + status=$(openstack server show ${server} -f value -c status) + done +} + +evacuate_and_wait_for_error evacuate-test +evacuate_and_wait_for_error evacuate-bfv-test diff --git a/roles/run-evacuate-hook/tasks/main.yaml b/roles/run-evacuate-hook/tasks/main.yaml new file mode 100644 index 0000000000..184b9d18f9 --- /dev/null +++ b/roles/run-evacuate-hook/tasks/main.yaml @@ -0,0 +1,82 @@ +- name: Setup resources and mark the subnode as forced down + become: true + become_user: stack + shell: "/opt/stack/nova/roles/run-evacuate-hook/files/setup_evacuate_resources.sh" + environment: + SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}" + +- name: Fence subnode by stopping q-agt and n-cpu + delegate_to: compute1 + become: true + systemd: + name: "{{ item }}" + state: stopped + with_items: + - devstack@q-agt + - devstack@n-cpu + +- name: Register running domains on subnode + delegate_to: compute1 + become: true + virt: + command: list_vms + state: running + register: subnode_vms + +- name: Destroy running domains on subnode + delegate_to: compute1 + become: true + virt: + name: "{{ item }}" + state: destroyed + with_items: "{{ subnode_vms.list_vms }}" + +- name: Stop libvirtd on "{{ inventory_hostname }}" + become: true + systemd: + name: "{{ item }}" + state: stopped + enabled: no + with_items: + - libvirtd.service + - libvirtd.socket + - libvirtd-admin.socket + - libvirtd-ro.socket + - virtlogd.service + - virtlogd-admin.socket + - virtlogd.socket + - virtlockd.service + - virtlockd-admin.socket + - virtlockd.socket + +- name: Run negative evacuate tests + become: true + become_user: stack + shell: "/opt/stack/nova/roles/run-evacuate-hook/files/test_negative_evacuate.sh" + environment: + CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}" + +- name: Start libvirtd on "{{ inventory_hostname }}" + become: true + systemd: + name: "{{ item }}" + state: started + enabled: yes + with_items: + - libvirtd.service + - libvirtd.socket + - libvirtd-admin.socket + - libvirtd-ro.socket + - virtlogd.service + - virtlogd-admin.socket + - virtlogd.socket + - virtlockd.service + - virtlockd-admin.socket + - virtlockd.socket + +- name: Run evacuate tests + become: true + become_user: stack + shell: "/opt/stack/nova/roles/run-evacuate-hook/files/test_evacuate.sh" + environment: + CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}" diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh index 32627e59b6..5a449c520b 100755 --- a/tools/check-cherry-picks.sh +++ b/tools/check-cherry-picks.sh @@ -32,7 +32,7 @@ if [ $checked -eq 0 ]; then echo "Checked $checked cherry-pick hashes: OK" exit 0 else - if ! git show --format='%B' --quiet | grep -qi 'stable.*only'; then + if ! git show --format='%B' --quiet $commit_hash | grep -qi 'stable.*only'; then echo 'Stable branch requires either cherry-pick -x headers or [stable-only] tag!' exit 1 fi @@ -56,7 +56,6 @@ commands = bash -c "! find doc/ -type f -name *.json | xargs grep -U -n $'\r'" # Check that all included JSON files are valid JSON bash -c '! 
find doc/ -type f -name *.json | xargs -t -n1 python -m json.tool 2>&1 > /dev/null | grep -B1 -v ^python' - bash tools/check-cherry-picks.sh [testenv:fast8] description = @@ -65,6 +64,15 @@ envdir = {toxworkdir}/shared commands = bash tools/flake8wrap.sh -HEAD +[testenv:validate-backport] +description = + Determine whether a backport is ready to be merged by checking whether it has + already been merged to master or more recent stable branches. +deps = +skipsdist = true +commands = + bash tools/check-cherry-picks.sh + [testenv:functional] description = Run functional tests using python3. @@ -118,6 +126,16 @@ deps = {[testenv:functional]deps} commands = {[testenv:functional]commands} +[testenv:functional-without-sample-db-tests] +description = + Run functional tests by excluding the API|Notification + sample tests and DB tests. This env is used in + placement-nova-tox-functional-py38 job which is defined and + run in placement. +deps = {[testenv:functional]deps} +commands = + stestr --test-path=./nova/tests/functional run --exclude-regex '((?:api|notification)_sample_tests|functional\.db\.)' {posargs} + [testenv:api-samples] setenv = {[testenv]setenv} @@ -174,6 +192,7 @@ description = # to install (test-)requirements.txt for docs. deps = -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/victoria} + -r{toxinidir}/requirements.txt -r{toxinidir}/doc/requirements.txt commands = rm -rf doc/build/html doc/build/doctrees @@ -327,10 +346,3 @@ usedevelop = False deps = bindep commands = bindep test - -[testenv:lower-constraints] -usedevelop = False -deps = - -c{toxinidir}/lower-constraints.txt - -r{toxinidir}/test-requirements.txt - -r{toxinidir}/requirements.txt |
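As a closing illustration of the console proxy hardening described in the ``console-proxy-reject-open-redirect`` release note above: the fix amounts to refusing any request path that the stdlib request handler would otherwise turn into a protocol-relative redirect. The following is only a simplified sketch of that idea under that assumption, not the exact nova code path, and the helper name is hypothetical::

    def is_open_redirect(path: str) -> bool:
        """Return True when a request path such as //example.com/%2F..
        should be answered with 400 Bad Request instead of a redirect.
        """
        # A path starting with '//' produces a redirect whose Location is a
        # protocol-relative URL, which browsers resolve against another host.
        return path.startswith('//')


    assert is_open_redirect('//example.com/%2F..')
    assert not is_open_redirect('/vnc_auto.html?token=abc123')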