summaryrefslogtreecommitdiff
path: root/tools/libs/guest/xg_sr_common.h
blob: 2f058ee3a6ff60bd157c66d685e8cd7bf3b1605c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
#ifndef __COMMON__H
#define __COMMON__H

#include <stdbool.h>

#include "xg_private.h"
#include "xg_save_restore.h"
#include "xc_bitops.h"

#include "xg_sr_stream_format.h"

/* String representation of Domain Header types. */
const char *dhdr_type_to_str(uint32_t type);

/* String representation of Record types. */
const char *rec_type_to_str(uint32_t type);

struct xc_sr_context;
struct xc_sr_record;

/**
 * Save operations.  To be implemented for each type of guest, for use by the
 * common save algorithm.
 *
 * Every function must be implemented, even if only with a no-op stub.
 */
struct xc_sr_save_ops
{
    /* Convert a PFN to GFN.  May return ~0UL for an invalid mapping. */
    xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /**
     * Optionally transform the contents of a page from being specific to the
     * sending environment, to being generic for the stream.
     *
     * The page of data at the end of 'page' may be a read-only mapping of a
     * running guest; it must not be modified.  If no transformation is
     * required, the callee should leave '*page' untouched.
     *
     * If a transformation is required, the callee should allocate itself
     * a local page using malloc() and return it via '*page'.
     *
     * The caller shall free() '*page' in all cases.  In the case that the
     * callee encounters an error, it should *NOT* free() the memory it
     * allocated for '*page'.
     *
     * It is valid to fail with EAGAIN if the transformation is not able to be
     * completed at this point.  The page shall be retried later.
     *
     * @returns 0 for success, -1 for failure, with errno appropriately set.
     */
    int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type,
                          void **page);

    /**
     * Set up local environment to save a domain. (Typically querying
     * running domain state, setting up mappings etc.)
     *
     * This is called once before any common setup has occurred, allowing for
     * guest-specific adjustments to be made to common state.
     */
    int (*setup)(struct xc_sr_context *ctx);

    /**
     * Send static records at the head of the stream.  This is called once,
     * after the Image and Domain headers are written.
     */
    int (*static_data)(struct xc_sr_context *ctx);

    /**
     * Send dynamic records which need to be at the start of the stream.  This
     * is called after the STATIC_DATA_END record is written.
     */
    int (*start_of_stream)(struct xc_sr_context *ctx);

    /**
     * Send records which need to be at the start of a checkpoint.  This is
     * called once, or once per checkpoint in a checkpointed stream, and is
     * ahead of memory data.
     */
    int (*start_of_checkpoint)(struct xc_sr_context *ctx);

    /**
     * Send records which need to be at the end of the checkpoint.  This is
     * called once, or once per checkpoint in a checkpointed stream, and is
     * after the memory data.
     */
    int (*end_of_checkpoint)(struct xc_sr_context *ctx);

    /**
     * Check state of guest to decide whether it makes sense to continue
     * migration.  This is called in each iteration or checkpoint to check
     * whether all criteria for the migration are still met.  If that's not
     * the case either migration is cancelled via a bad rc or the situation
     * is handled, e.g. by sending appropriate records.
     */
    int (*check_vm_state)(struct xc_sr_context *ctx);

    /**
     * Clean up the local environment.  Will be called exactly once, either
     * after a successful save, or upon encountering an error.
     */
    int (*cleanup)(struct xc_sr_context *ctx);
};


/**
 * Restore operations.  To be implemented for each type of guest, for use by
 * the common restore algorithm.
 *
 * Every function must be implemented, even if only with a no-op stub.
 */
struct xc_sr_restore_ops
{
    /* Convert a PFN to GFN.  May return ~0UL for an invalid mapping. */
    xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /* Check to see whether a PFN is valid. */
    bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /* Set the GFN of a PFN. */
    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);

    /* Set the type of a PFN. */
    void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
                          xen_pfn_t type);

    /**
     * Optionally transform the contents of a page from being generic in the
     * stream, to being specific to the restoring environment.
     *
     * 'page' is expected to be modified in-place if a transformation is
     * required.
     *
     * @returns 0 for success, -1 for failure, with errno appropriately set.
     */
    int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page);

    /**
     * Set up local environment to restore a domain.
     *
     * This is called once before any common setup has occurred, allowing for
     * guest-specific adjustments to be made to common state.
     */
    int (*setup)(struct xc_sr_context *ctx);

    /**
     * Process an individual record from the stream.  The caller shall take
     * care of processing common records (e.g. END, PAGE_DATA).
     *
     * @return 0 for success, -1 for failure, or the following sentinels:
     *  - RECORD_NOT_PROCESSED
     *  - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and
     *    a failover is needed.
     */
#define RECORD_NOT_PROCESSED 1
#define BROKEN_CHANNEL 2
    int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);

    /**
     * Perform any actions required after the static data has arrived.  Called
     * when the STATIC_DATA_COMPLETE record has been received/inferred.
     * 'missing' should be filled in for any data item the higher level
     * toolstack needs to provide compatibility for.
     */
    int (*static_data_complete)(struct xc_sr_context *ctx,
                                unsigned int *missing);

    /**
     * Perform any actions required after the stream has been finished. Called
     * after the END record has been received.
     */
    int (*stream_complete)(struct xc_sr_context *ctx);

    /**
     * Clean up the local environment.  Will be called exactly once, either
     * after a successful restore, or upon encountering an error.
     */
    int (*cleanup)(struct xc_sr_context *ctx);
};

/* Wrapper for blobs of data heading Xen-wards. */
struct xc_sr_blob
{
    void *ptr;
    size_t size;
};

/*
 * Update a blob.  Duplicate src/size, freeing the old blob if necessary.  May
 * fail due to memory allocation.
 *
 * Returns 0 on success.  Returns -1 with errno set (EINVAL for bad
 * parameters, or malloc()'s errno) leaving the blob unchanged.
 */
static inline int update_blob(struct xc_sr_blob *blob,
                              const void *src, size_t size)
{
    void *copy;

    if ( src == NULL || size == 0 )
    {
        errno = EINVAL;
        return -1;
    }

    copy = malloc(size);
    if ( copy == NULL )
        return -1;

    memcpy(copy, src, size);

    /* Only release the previous contents once the new copy is in hand. */
    free(blob->ptr);
    blob->ptr = copy;
    blob->size = size;

    return 0;
}

/*
 * Full context for a save or restore operation in progress.  The first union
 * holds the direction-specific (save vs restore) common state; the second
 * holds guest-arch specific state.
 */
struct xc_sr_context
{
    xc_interface *xch;
    uint32_t domid;
    int fd;

    /* Plain VM, or checkpoints over time. */
    xc_stream_type_t stream_type;

    xc_domaininfo_t dominfo;

    union /* Common save or restore data. */
    {
        struct /* Save data. */
        {
            int recv_fd;

            struct xc_sr_save_ops ops;
            struct save_callbacks *callbacks;

            /* Live migrate vs non live suspend. */
            bool live;

            /* Further debugging information in the stream. */
            bool debug;

            unsigned long p2m_size;

            struct precopy_stats stats;

            xen_pfn_t *batch_pfns;
            unsigned int nr_batch_pfns;
            unsigned long *deferred_pages;
            unsigned long nr_deferred_pages;
            xc_hypercall_buffer_t dirty_bitmap_hbuf;
        } save;

        struct /* Restore data. */
        {
            struct xc_sr_restore_ops ops;
            struct restore_callbacks *callbacks;

            int send_back_fd;
            unsigned long p2m_size;
            xc_hypercall_buffer_t dirty_bitmap_hbuf;

            /* From Image Header. */
            uint32_t format_version;

            /* From Domain Header. */
            uint32_t guest_type;
            uint32_t guest_page_size;

            /* Currently buffering records between a checkpoint. */
            bool buffer_all_records;

            /* Whether a STATIC_DATA_END record has been seen/inferred. */
            bool seen_static_data_end;

/*
 * With Remus/COLO, the records sent by the primary at each checkpoint are
 * buffered, so that if the primary fails we can recover from the last
 * complete checkpoint state.
 * This should be enough for most of the cases because the primary only
 * sends dirty pages at a checkpoint.
 */
#define DEFAULT_BUF_RECORDS 1024
            struct xc_sr_record *buffered_records;
            unsigned int allocated_rec_num;
            unsigned int buffered_rec_num;

            /*
             * Xenstore and Console parameters.
             * INPUT:  evtchn & domid
             * OUTPUT: gfn
             */
            xen_pfn_t    xenstore_gfn,    console_gfn;
            unsigned int xenstore_evtchn, console_evtchn;
            uint32_t     xenstore_domid,  console_domid;

            /* Bitmap of currently populated PFNs during restore. */
            unsigned long *populated_pfns;
            xen_pfn_t max_populated_pfn;

            /* Sender has invoked verify mode on the stream. */
            bool verify;
        } restore;
    };

    union /* Guest-arch specific data. */
    {
        struct /* x86 */
        {
            /* Common save/restore data. */
            union
            {
                struct
                {
                    /* X86_{CPUID,MSR}_DATA blobs for CPU Policy. */
                    struct xc_sr_blob cpuid, msr;
                } restore;
            };

            struct /* x86 PV guest. */
            {
                /* 4 or 8; 32 or 64 bit domain */
                unsigned int width;
                /* 3 or 4 pagetable levels */
                unsigned int levels;

                /* Maximum Xen frame */
                xen_pfn_t max_mfn;
                /* Read-only machine to phys map */
                xen_pfn_t *m2p;
                /* First mfn of the compat m2p (only set for 32bit PV guests) */
                xen_pfn_t compat_m2p_mfn0;
                /* Number of m2p frames mapped */
                unsigned long nr_m2p_frames;

                /* Maximum guest frame */
                xen_pfn_t max_pfn;

                /* Number of frames making up the p2m */
                unsigned int p2m_frames;
                /* Guest's phys to machine map.  Mapped read-only (save) or
                 * allocated locally (restore).  Uses guest unsigned longs. */
                void *p2m;
                /* The guest pfns containing the p2m leaves */
                xen_pfn_t *p2m_pfns;

                /* Read-only mapping of guest's shared info page */
                shared_info_any_t *shinfo;

                /* p2m generation count for verifying validity of local p2m. */
                uint64_t p2m_generation;

                union
                {
                    struct
                    {
                        /* State machine for the order of received records. */
                        bool seen_pv_info;

                        /* Types for each page (bounded by max_pfn). */
                        uint32_t *pfn_types;

                        /* x86 PV per-vcpu storage structure for blobs. */
                        struct xc_sr_x86_pv_restore_vcpu
                        {
                            struct xc_sr_blob basic, extd, xsave, msr;
                        } *vcpus;
                        unsigned int nr_vcpus;
                    } restore;
                };
            } pv;

            struct /* x86 HVM guest. */
            {
                union
                {
                    struct
                    {
                        /* Whether qemu enabled logdirty mode, and we should
                         * disable on cleanup. */
                        bool qemu_enabled_logdirty;
                    } save;

                    struct
                    {
                        /* HVM context blob. */
                        struct xc_sr_blob context;
                    } restore;
                };
            } hvm;

        } x86;
    };
};

extern struct xc_sr_save_ops save_ops_x86_pv;
extern struct xc_sr_save_ops save_ops_x86_hvm;

extern struct xc_sr_restore_ops restore_ops_x86_pv;
extern struct xc_sr_restore_ops restore_ops_x86_hvm;

/* An in-memory representation of a single stream record. */
struct xc_sr_record
{
    uint32_t type;   /* Record type (see rec_type_to_str()). */
    uint32_t length; /* Length of 'data' in bytes. */
    void *data;      /* Payload; NULL when length is 0 (see read_record()). */
};

/*
 * Writes a split record to the stream, applying correct padding where
 * appropriate.  It is common when sending records containing blobs from Xen
 * that the header and blob data are separate.  This function accepts a second
 * buffer and length, and will merge it with the main record when sending.
 *
 * Records with a non-zero length must provide a valid data field; records
 * with a 0 length shall have their data field ignored.
 *
 * Returns 0 on success and non-0 on failure.
 */
int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
                       void *buf, size_t sz);

/*
 * Writes a record to the stream, applying correct padding where appropriate.
 * Records with a non-zero length must provide a valid data field; records
 * with a 0 length shall have their data field ignored.
 *
 * Returns 0 on success and non-0 on failure.
 */
static inline int write_record(struct xc_sr_context *ctx,
                               struct xc_sr_record *rec)
{
    /* A plain record is a split record with no trailing buffer. */
    return write_split_record(ctx, rec, NULL, 0);
}

/*
 * Reads a record from the stream, and fills in the record structure.
 *
 * Returns 0 on success and non-0 on failure.
 *
 * On success, the records type and size shall be valid.
 * - If size is 0, data shall be NULL.
 * - If size is non-0, data shall be a buffer allocated by malloc() which must
 *   be passed to free() by the caller.
 *
 * On failure, the contents of the record structure are undefined.
 */
int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);

/*
 * This would ideally be private in restore.c, but is needed by
 * x86_pv_localise_page() if we receive pagetable frames ahead of the
 * contents of the frames they point at.
 */
int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
                  const xen_pfn_t *original_pfns, const uint32_t *types);

/* Handle a STATIC_DATA_END record. */
int handle_static_data_end(struct xc_sr_context *ctx);

/* Page type known to the migration logic? */
static inline bool is_known_page_type(uint32_t type)
{
    /*
     * Recognised types: plain RAM, each pagetable level with or without the
     * pinned attribute, and the synthetic types.
     */
    return (type == XEN_DOMCTL_PFINFO_NOTAB ||

            type == XEN_DOMCTL_PFINFO_L1TAB ||
            type == (XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L2TAB ||
            type == (XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L3TAB ||
            type == (XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L4TAB ||
            type == (XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_XTAB ||
            /* XEN_DOMCTL_PFINFO_XALLOC: Synthetic type in Xen 4.2 - 4.5 */
            type == XEN_DOMCTL_PFINFO_XALLOC ||
            type == XEN_DOMCTL_PFINFO_BROKEN);
}

/* Page type backed by RAM in the guest? */
static inline bool page_type_to_populate(uint32_t type)
{
    /*
     * All normal and pagetable types, plus XALLOC, occupy a frame of guest
     * RAM.  XTAB, BROKEN and anything unrecognised do not.
     */
    return (type == XEN_DOMCTL_PFINFO_NOTAB ||

            type == XEN_DOMCTL_PFINFO_L1TAB ||
            type == (XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L2TAB ||
            type == (XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L3TAB ||
            type == (XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L4TAB ||
            type == (XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_XALLOC);
}

/* Does a page of this type carry its contents in the stream? */
static inline bool page_type_has_stream_data(uint32_t type)
{
    /*
     * Only plain RAM and the pagetable types carry page contents.  XTAB,
     * BROKEN, XALLOC and anything unrecognised have no stream data.
     */
    return (type == XEN_DOMCTL_PFINFO_NOTAB ||

            type == XEN_DOMCTL_PFINFO_L1TAB ||
            type == (XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L2TAB ||
            type == (XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L3TAB ||
            type == (XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB) ||

            type == XEN_DOMCTL_PFINFO_L4TAB ||
            type == (XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB));
}

#endif
/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */