/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * alternative.c
 */

#include <xen/delay.h>
#include <xen/types.h>
#include <asm/apic.h>
#include <asm/endbr.h>
#include <asm/processor.h>
#include <asm/alternative.h>
#include <xen/init.h>
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/traps.h>
#include <asm/nmi.h>
#include <asm/nops.h>
#include <xen/livepatch.h>

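/*
 * The length fields in struct alt_instr are single bytes, bounding the size
 * of any individual patch we need to stage locally.
 */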
#define MAX_PATCH_LEN (255-1)

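/*
 * Bounds of the .altinstructions table, into which every ALTERNATIVE() use
 * (C and assembly alike) emits a struct alt_instr entry.
 */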
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];

#ifdef K8_NOP1
static const unsigned char k8nops[] init_or_livepatch_const = {
    K8_NOP1,
    K8_NOP2,
    K8_NOP3,
    K8_NOP4,
    K8_NOP5,
    K8_NOP6,
    K8_NOP7,
    K8_NOP8,
    K8_NOP9,
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    k8nops,
    k8nops + 1,
    k8nops + 1 + 2,
    k8nops + 1 + 2 + 3,
    k8nops + 1 + 2 + 3 + 4,
    k8nops + 1 + 2 + 3 + 4 + 5,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] init_or_livepatch_const = {
    P6_NOP1,
    P6_NOP2,
    P6_NOP3,
    P6_NOP4,
    P6_NOP5,
    P6_NOP6,
    P6_NOP7,
    P6_NOP8,
    P6_NOP9,
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    p6nops,
    p6nops + 1,
    p6nops + 1 + 2,
    p6nops + 1 + 2 + 3,
    p6nops + 1 + 2 + 3 + 4,
    p6nops + 1 + 2 + 3 + 4 + 5,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;

#ifdef HAVE_AS_NOPS_DIRECTIVE

/* Nops in .init.rodata to compare against the runtime ideal nops. */
asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
      ".popsection\n\t");
extern char toolchain_nops[ASM_NOP_MAX];
static bool init_or_livepatch_read_mostly toolchain_nops_are_ideal;

#else
# define toolchain_nops_are_ideal false
#endif

static void __init arch_init_ideal_nops(void)
{
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        /*
         * Due to a decoder implementation quirk, some specific Intel CPUs
         * actually perform better with the "k8_nops" than with the SDM-
         * recommended NOPs.
         */
        if ( boot_cpu_data.x86 != 6 )
            break;

        switch ( boot_cpu_data.x86_model )
        {
        case 0x0f ... 0x1b:
        case 0x1d ... 0x25:
        case 0x28 ... 0x2f:
            ideal_nops = k8_nops;
            break;
        }
        break;

    case X86_VENDOR_AMD:
        if ( boot_cpu_data.x86 <= 0xf )
            ideal_nops = k8_nops;
        break;
    }

#ifdef HAVE_AS_NOPS_DIRECTIVE
    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
        toolchain_nops_are_ideal = true;
#endif
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
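/* e.g. add_nops(buf, 3) writes 0f 1f 00 ("nopl (%rax)") with the P6 nops. */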
void init_or_livepatch add_nops(void *insns, unsigned int len)
{
    while ( len > 0 )
    {
        unsigned int noplen = len;
        if ( noplen > ASM_NOP_MAX )
            noplen = ASM_NOP_MAX;
        memcpy(insns, ideal_nops[noplen], noplen);
        insns += noplen;
        len -= noplen;
    }
}

/*
 * text_poke - Update instructions on a live kernel or non-executed code.
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread may be preempted in the middle of these instructions, and
 * on the local CPU you need to be protected against NMI or MCE handlers
 * seeing an inconsistent instruction while you patch.
 *
 * You should run this with interrupts disabled or on code that is not
 * executing.
 *
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void *init_or_livepatch noinline
text_poke(void *addr, const void *opcode, size_t len)
{
    return memcpy(addr, opcode, len);
}

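/* Bounds of .init.{ro,}data.cf_clobber; see the endbr64 clobbering below. */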
extern void *const __initdata_cf_clobber_start[];
extern void *const __initdata_cf_clobber_end[];

/*
 * Replace instructions with better alternatives for this CPU type.
 * This runs before SMP is initialized to avoid SMP problems with
 * self-modifying code. This implies that asymmetric systems where
 * APs have fewer capabilities than the boot processor are not handled.
 * Tough. Make sure you disable such features by hand.
 *
 * The caller will set the "force" argument to true for the final
 * invocation, such that no CALLs/JMPs to NULL pointers will be left
 * around. See also the further comment below.
 */
static void init_or_livepatch _apply_alternatives(struct alt_instr *start,
                                                  struct alt_instr *end,
                                                  bool force)
{
    struct alt_instr *a, *base;

    printk(KERN_INFO "alt table %p -> %p\n", start, end);

    /*
     * The scan order should be from start to end: an alternative scanned
     * later can overwrite the patching done by one scanned earlier, and
     * some code (e.g. ALTERNATIVE_2()) relies on this order of patching.
     *
     * So be careful if you want to change the scan order.
     */
    for ( a = base = start; a < end; a++ )
    {
        uint8_t *orig = ALT_ORIG_PTR(a);
        uint8_t *repl = ALT_REPL_PTR(a);
        uint8_t buf[MAX_PATCH_LEN];
        unsigned int total_len = a->orig_len + a->pad_len;

        BUG_ON(a->repl_len > total_len);
        BUG_ON(total_len > sizeof(buf));
        BUG_ON(a->cpuid >= NCAPINTS * 32);

        /*
         * Detect sequences of alt_instr entries patching the same origin
         * site, and keep base pointing at the first such entry.  This is so
         * we can refer to a single ->priv field for some of our patching
         * decisions, in particular the NOP optimisation.  We deliberately use
         * the alt_instr itself rather than a local variable in case we end up
         * making multiple passes.
         *
         * ->priv being nonzero means that the origin site has already been
         * modified, and we shouldn't try to optimise the nops again.
         */
        if ( ALT_ORIG_PTR(base) != orig )
            base = a;

        /* Skip patch sites already handled during the first pass. */
        if ( a->priv )
        {
            ASSERT(force);
            continue;
        }

        /* If there is no replacement to make, see about optimising the nops. */
        if ( !boot_cpu_has(a->cpuid) )
        {
            /* Origin site already touched?  Don't nop anything. */
            if ( base->priv )
                continue;

            a->priv = 1;

            /* Nothing useful to do? */
            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
                continue;

            add_nops(buf, a->pad_len);
            text_poke(orig + a->orig_len, buf, a->pad_len);
            continue;
        }

        memcpy(buf, repl, a->repl_len);

        /* 0xe8/0xe9 are relative branches; fix the offset. */
        if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
        {
            /*
             * Detect the special case of indirect-to-direct branch patching:
             * - replacement is a direct CALL/JMP (opcodes 0xE8/0xE9; already
             *   checked above),
             * - replacement's displacement is -5 (pointing back at the very
             *   insn, which makes no sense in a real replacement insn),
             * - original is an indirect CALL/JMP (opcodes 0xFF/2 or 0xFF/4)
             *   using RIP-relative addressing.
             * Some branch destinations may still be NULL when we come here
             * the first time. Defer patching of those until the post-presmp-
             * initcalls re-invocation (with force set to true). If at that
             * point the branch destination is still NULL, insert "UD2; UD0"
             * (for ease of recognition) instead of CALL/JMP.
             */
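            /*
             * Illustrative byte layout for the CALL case (hypothetical
             * displacements xx/yy):
             *   original:    ff 15 xx xx xx xx    call *disp32(%rip)
             *   replacement: e8 fb ff ff ff       call . (disp -5, marker)
             *   patched:     e8 yy yy yy yy 90    direct call + NOP pad
             */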
            if ( a->cpuid == X86_FEATURE_ALWAYS &&
                 *(int32_t *)(buf + 1) == -5 &&
                 a->orig_len >= 6 &&
                 orig[0] == 0xff &&
                 orig[1] == (*buf & 1 ? 0x25 : 0x15) )
            {
                long disp = *(int32_t *)(orig + 2);
                const uint8_t *dest = *(void **)(orig + 6 + disp);

                if ( dest )
                {
                    /*
                     * When building for CET-IBT, all function pointer targets
                     * should have an endbr64 instruction.
                     *
                     * If this is not the case, leave a warning because
                     * something is probably wrong with the build.  A CET-IBT
                     * enabled system might have exploded already.
                     *
                     * Otherwise, skip the endbr64 instruction.  This is a
                     * marginal perf improvement which saves on instruction
                     * decode bandwidth.
                     */
                    if ( IS_ENABLED(CONFIG_XEN_IBT) )
                    {
                        if ( is_endbr64(dest) )
                            dest += ENDBR64_LEN;
                        else
                            printk(XENLOG_WARNING
                                   "altcall %ps dest %ps has no endbr64\n",
                                   orig, dest);
                    }

                    disp = dest - (orig + 5);
                    ASSERT(disp == (int32_t)disp);
                    *(int32_t *)(buf + 1) = disp;
                }
                else if ( force )
                {
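                    /* ud2 (0f 0b) followed by ud0 (0f ff ff): 5 bytes. */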
                    buf[0] = 0x0f;
                    buf[1] = 0x0b;
                    buf[2] = 0x0f;
                    buf[3] = 0xff;
                    buf[4] = 0xff;
                }
                else
                    continue;
            }
            else if ( force && system_state < SYS_STATE_active )
                ASSERT_UNREACHABLE();
            else
                *(int32_t *)(buf + 1) += repl - orig;
        }
        else if ( force && system_state < SYS_STATE_active )
            ASSERT_UNREACHABLE();

        a->priv = 1;

        add_nops(buf + a->repl_len, total_len - a->repl_len);
        text_poke(orig, buf, total_len);
    }

    /*
     * Clobber endbr64 instructions now that altcall has finished optimising
     * all indirect branches to direct ones.
     */
    if ( force && cpu_has_xen_ibt )
    {
        void *const *val;
        unsigned int clobbered = 0;

        /*
         * This is some minor structure (ab)use.  We walk the entire contents
         * of .init.{ro,}data.cf_clobber as if it were an array of pointers.
         *
         * If the pointer points into .text, and at an endbr64 instruction,
         * nop out the endbr64.  This causes the pointer to no longer be a
         * legal indirect branch target under CET-IBT.  This is a
         * defence-in-depth measure, to reduce the options available to an
         * adversary who has managed to hijack a function pointer.
         */
        for ( val = __initdata_cf_clobber_start;
              val < __initdata_cf_clobber_end;
              val++ )
        {
            void *ptr = *val;

            if ( !is_kernel_text(ptr) || !is_endbr64(ptr) )
                continue;

            place_endbr64_poison(ptr);
            clobbered++;
        }

        printk("altcall: Optimised away %u endbr64 instructions\n", clobbered);
    }
}

void init_or_livepatch apply_alternatives(struct alt_instr *start,
                                          struct alt_instr *end)
{
    _apply_alternatives(start, end, true);
}

static unsigned int __initdata alt_todo;
static unsigned int __initdata alt_done;

/*
 * At boot time, we patch alternatives in NMI context.  This means that the
 * active NMI-shadow will defer any further NMIs, removing the slim race
 * condition where an NMI hits while we are midway through patching some
 * instructions in the NMI path.
 */
static int __init cf_check nmi_apply_alternatives(
    const struct cpu_user_regs *regs, int cpu)
{
    /*
     * More than one NMI may occur between the two set_nmi_callback() calls
     * below.  We only need to apply alternatives once.
     */
    if ( !(alt_done & alt_todo) )
    {
        /*
         * Relax perms on .text to be RWX, so we can modify them.
         *
         * This relaxes perms globally, but we run ahead of bringing APs
         * online, so only have our own TLB to worry about.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RWX);
        flush_local(FLUSH_TLB_GLOBAL);

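        /*
         * alt_done is 0 on the boot-time pass and non-zero on the second,
         * post-presmp-initcalls pass, so it doubles as the "force" flag.
         */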
        _apply_alternatives(__alt_instructions, __alt_instructions_end,
                            alt_done);

        /*
         * Reinstate perms on .text to be RX.  This also cleans out the dirty
         * bits, which matters when CET Shstk is active.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RX);
        flush_local(FLUSH_TLB_GLOBAL);

        alt_done |= alt_todo;
    }

    return 1;
}

/*
 * This routine is called with local interrupts disabled and is used during
 * bootup.
 */
static void __init _alternative_instructions(bool force)
{
    unsigned int i;
    nmi_callback_t *saved_nmi_callback;

    /*
     * Don't stop machine check exceptions while patching.
     * MCEs only happen when something got corrupted, and in that
     * case we must do something about the corruption.
     * Ignoring it is worse than an unlikely patching race.
     * Also, machine checks tend to be broadcast, and if one CPU
     * goes into machine check the others follow quickly, so we don't
     * expect a machine check to cause undue problems during code
     * patching.
     */
    ASSERT(!local_irq_is_enabled());

    /* Set what operation to perform /before/ setting the callback. */
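    /* Bit 0 marks the boot pass (force false), bit 1 the final forced one. */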
    alt_todo = 1u << force;
    barrier();

    /*
     * As soon as the callback is set up, the next NMI will trigger patching,
     * even an NMI ahead of our explicit self-NMI.
     */
    saved_nmi_callback = set_nmi_callback(nmi_apply_alternatives);

    /* Send ourselves an NMI to trigger the callback. */
    self_nmi();

    /*
     * In practice, the self_nmi() above appears to act synchronously.
     * However, synchronous behaviour is not architecturally guaranteed.  To
     * cover the (hopefully never) async case, poll alt_done for up to one
     * second.
     */
    for ( i = 0; !(ACCESS_ONCE(alt_done) & alt_todo) && i < 1000; ++i )
        mdelay(1);

    if ( !(ACCESS_ONCE(alt_done) & alt_todo) )
        panic("Timed out waiting for alternatives self-NMI to hit\n");

    set_nmi_callback(saved_nmi_callback);
}

void __init alternative_instructions(void)
{
    arch_init_ideal_nops();
    _alternative_instructions(false);
}

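/*
 * Second pass, run after pre-SMP initcalls: force is set so that altcall
 * sites whose destination is still NULL get poisoned rather than skipped.
 */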
void __init alternative_branches(void)
{
    local_irq_disable();
    _alternative_instructions(true);
    local_irq_enable();
}