# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import time

import mock

from nova import context as nova_context
from nova import objects
from nova.tests.functional import integrated_helpers


class ComputeManagerInitHostTestCase(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Tests various actions performed when the nova-compute service starts."""

    compute_driver = 'fake.MediumFakeDriver'

    def test_migrate_disk_and_power_off_crash_finish_revert_migration(self):
        """Tests the scenario that the compute service crashes while the
        driver's migrate_disk_and_power_off method is running (we could be
        slow transferring disks or something when it crashed) and on restart
        of the compute service the driver's finish_revert_migration method
        is called to cleanup the source host and reset the instance task_state.
        """
        # Start two compute services so we migrate across hosts.
        for x in range(2):
            self._start_compute('host%d' % x)
        # Create a server; it does not matter which host it lands on.
        server = self._build_server(networks='auto')
        server = self.api.post_server({'server': server})
        server = self._wait_for_state_change(server, 'ACTIVE')
        # Save the source hostname for assertions later.
        source_host = server['OS-EXT-SRV-ATTR:host']

        def fake_migrate_disk_and_power_off(*args, **kwargs):
            # Simulate the source compute service crashing by restarting it.
            self.restart_compute_service(self.computes[source_host])
            # We have to keep the method from returning until we have
            # asserted the _init_instance restart behavior, otherwise
            # resize_instance will fail and set the instance to ERROR status,
            # revert allocations, etc., which is not realistic if the service
            # actually crashed while migrate_disk_and_power_off was running.
            # The sleep value needs to be large so this does not wake up and
            # interfere with other tests running on the same worker.
            time.sleep(1000000)

        source_driver = self.computes[source_host].manager.driver
        with mock.patch.object(source_driver, 'migrate_disk_and_power_off',
                               side_effect=fake_migrate_disk_and_power_off):
            # Initiate a cold migration from the source host.
            self.admin_api.post_server_action(server['id'], {'migrate': None})
            # Now wait for the task_state to be reset to None during
            # _init_instance.
            server = self._wait_for_server_parameter(server, {
                    'status': 'ACTIVE',
                    'OS-EXT-STS:task_state': None,
                    'OS-EXT-SRV-ATTR:host': source_host
                }
            )

        # Assert we went through the _init_instance processing we expect.
        log_out = self.stdlog.logger.output
        self.assertIn('Instance found in migrating state during startup. '
                      'Resetting task_state', log_out)
        # Assert that driver.finish_revert_migration did not raise an error.
        self.assertNotIn('Failed to revert crashed migration', log_out)

        # The migration status should be "error" rather than stuck as
        # "migrating".
        context = nova_context.get_admin_context()
        # FIXME(mriedem): This is bug 1836369 because we would normally expect
        # Migration.get_by_instance_and_status to raise
        # MigrationNotFoundByStatus since the status should be "error".
        objects.Migration.get_by_instance_and_status(
            context, server['id'], 'migrating')
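        # Once bug 1836369 is fixed, the check above would presumably flip to
        # something like the following (nova.exception would need to be
        # imported):
        #   self.assertRaises(exception.MigrationNotFoundByStatus,
        #                     objects.Migration.get_by_instance_and_status,
        #                     context, server['id'], 'migrating')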

        # Assert things related to the resize get cleaned up:
        # - things set on the instance during prep_resize like:
        #   - migration_context
        #   - new_flavor
        #   - stashed old_vm_state in system_metadata
        # - migration-based allocations from conductor/scheduler, i.e. that the
        #   allocations created by the scheduler for the instance and dest host
        #   are gone and the source host allocations are back on the instance
        #   rather than the migration record
        instance = objects.Instance.get_by_uuid(
            context, server['id'], expected_attrs=[
                'migration_context', 'flavor', 'system_metadata'
            ])
        # FIXME(mriedem): Leaving these fields set on the instance is
        # bug 1836369.
        self.assertIsNotNone(instance.migration_context)
        self.assertIsNotNone(instance.new_flavor)
        self.assertEqual('active', instance.system_metadata['old_vm_state'])

        dest_host = 'host0' if source_host == 'host1' else 'host1'
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_host)
        dest_allocations = self._get_allocations_by_provider_uuid(dest_rp_uuid)
        # FIXME(mriedem): This is bug 1836369 because we orphaned the
        # allocations created by the scheduler for the server on the dest host.
        self.assertIn(server['id'], dest_allocations)
        source_rp_uuid = self._get_provider_uuid_by_host(source_host)
        source_allocations = self._get_allocations_by_provider_uuid(
            source_rp_uuid)
        # FIXME(mriedem): This is bug 1836369 because the server is running on
        # the source host but is not tracking allocations against the source
        # host.
        self.assertNotIn(server['id'], source_allocations)
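        # NOTE: once bug 1836369 is fixed, the two allocation assertions
        # above should flip: the dest host allocations should be gone and
        # the source host allocations should be back on the instance.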


class TestComputeRestartInstanceStuckInBuild(
        integrated_helpers.ProviderUsageBaseTestCase):

    compute_driver = 'fake.SmallFakeDriver'

    def setUp(self):
        super(TestComputeRestartInstanceStuckInBuild, self).setUp()
        self.compute1 = self._start_compute(host='host1')

    def test_restart_compute_while_instance_waiting_for_resource_claim(self):
        """Test for bug 1833581 where an instance is stuck in
        BUILD state forever due to compute service is restarted before the
        resource claim finished.
        """

        # To reproduce the problem we need to stop / kill the compute service
        # when an instance build request has already reached the service but
        # instance_claim() has not finished yet. One way this happens in
        # practice is when multiple builds are waiting for the
        # 'nova-compute-resource' semaphore. So one way to reproduce this in
        # the test would be to grab that semaphore, boot an instance, wait
        # for it to reach the compute, then stop the compute.
        # Unfortunately, when we release the semaphore after the simulated
        # compute restart, the original instance_claim execution continues
        # because the stopped compute is not 100% stopped in the functional
        # test env. Also, we cannot really hold the semaphore forever since
        # this named semaphore is shared between the old and new compute
        # service.
        # There is another way to trigger the issue: we can inject a sleep
        # into instance_claim() to block it. This is less realistic but it
        # works in the test env.
        server_req = self._build_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            networks='none')

        def sleep_forever(*args, **kwargs):
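            # Block instance_claim indefinitely so the simulated compute
            # restart below happens mid-claim; as in the test case above, the
            # large sleep value avoids this waking up and interfering with
            # other tests running on the same worker.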
            time.sleep(1000000)

        with mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                        'instance_claim') as mock_instance_claim:
            mock_instance_claim.side_effect = sleep_forever

            server = self.api.post_server({'server': server_req})
            self._wait_for_state_change(server, 'BUILD')

            # The instance.create.start notification is the closest thing to
            # the instance_claim call that we can wait for in the test.
            self.notifier.wait_for_versioned_notifications(
                'instance.create.start')

            with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
                self.restart_compute_service(self.compute1)

        # We expect that the instance is pushed to ERROR state during the
        # compute restart.
        self._wait_for_state_change(server, 'ERROR')
        mock_log.assert_called_with(
            'Instance spawn was interrupted before instance_claim, setting '
            'instance to ERROR state',
            instance=mock.ANY)