/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * alternative.c
 */

#include <xen/delay.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/smp.h>
#include <xen/vmap.h>
#include <xen/livepatch.h>
#include <asm/alternative.h>
#include <asm/endbr.h>
#include <asm/fixmap.h>
#include <asm/nmi.h>
#include <asm/nops.h>
#include <asm/setup.h>
#include <asm/system.h>

#define MAX_PATCH_LEN (255-1)

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];

#ifdef K8_NOP1
static const unsigned char k8nops[] init_or_livepatch_const = {
    K8_NOP1,
    K8_NOP2,
    K8_NOP3,
    K8_NOP4,
    K8_NOP5,
    K8_NOP6,
    K8_NOP7,
    K8_NOP8,
    K8_NOP9,
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+1]
    init_or_livepatch_constrel =
{
    NULL,
    k8nops,
    k8nops + 1,
    k8nops + 1 + 2,
    k8nops + 1 + 2 + 3,
    k8nops + 1 + 2 + 3 + 4,
    k8nops + 1 + 2 + 3 + 4 + 5,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] init_or_livepatch_const = {
    P6_NOP1,
    P6_NOP2,
    P6_NOP3,
    P6_NOP4,
    P6_NOP5,
    P6_NOP6,
    P6_NOP7,
    P6_NOP8,
    P6_NOP9,
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+1]
    init_or_livepatch_constrel =
{
    NULL,
    p6nops,
    p6nops + 1,
    p6nops + 1 + 2,
    p6nops + 1 + 2 + 3,
    p6nops + 1 + 2 + 3 + 4,
    p6nops + 1 + 2 + 3 + 4 + 5,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;

#ifdef HAVE_AS_NOPS_DIRECTIVE

/* Nops in .init.rodata to compare against the runtime ideal nops. */
asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
      ".popsection\n\t");
extern char toolchain_nops[ASM_NOP_MAX];
static bool init_or_livepatch_read_mostly toolchain_nops_are_ideal;

#else
# define toolchain_nops_are_ideal false
#endif

static void __init arch_init_ideal_nops(void)
{
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        /*
         * Due to a decoder implementation quirk, some specific Intel CPUs
         * actually perform better with the "k8_nops" than with the SDM-
         * recommended NOPs.
         */
        if ( boot_cpu_data.x86 != 6 )
            break;

        switch ( boot_cpu_data.x86_model )
        {
        case 0x0f ... 0x1b:
        case 0x1d ... 0x25:
        case 0x28 ... 0x2f:
            ideal_nops = k8_nops;
            break;
        }
        break;

    case X86_VENDOR_AMD:
        if ( boot_cpu_data.x86 <= 0xf )
            ideal_nops = k8_nops;
        break;
    }

#ifdef HAVE_AS_NOPS_DIRECTIVE
    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
        toolchain_nops_are_ideal = true;
#endif
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
void init_or_livepatch add_nops(void *insns, unsigned int len)
{
    while ( len > 0 )
    {
        unsigned int noplen = len;

        if ( noplen > ASM_NOP_MAX )
            noplen = ASM_NOP_MAX;

        memcpy(insns, ideal_nops[noplen], noplen);
        insns += noplen;
        len -= noplen;
    }
}
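/*
 * Illustration only (not part of the build): requests longer than
 * ASM_NOP_MAX (9 bytes, given the tables above) are split into
 * maximal-length ideal NOPs.  For example, padding a hypothetical
 * 12-byte patch site at "site":
 *
 *     uint8_t buf[12];
 *
 *     add_nops(buf, 12);            // one 9-byte NOP, then one 3-byte NOP
 *     text_poke(site, buf, 12);     // then poke the whole buffer at once
 */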
/*
 * text_poke - Update instructions on a live kernel or non-executed code.
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in
 * parallel.  Also no thread must be currently preempted in the middle of
 * these instructions.  And on the local CPU you need to be protected
 * against NMI or MCE handlers seeing an inconsistent instruction while
 * you patch.
 *
 * You should run this with interrupts disabled or on code that is not
 * executing.
 *
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void *init_or_livepatch noinline
text_poke(void *addr, const void *opcode, size_t len)
{
    return memcpy(addr, opcode, len);
}

extern void *const __initdata_cf_clobber_start[];
extern void *const __initdata_cf_clobber_end[];

/*
 * Replace instructions with better alternatives for this CPU type.
 * This runs before SMP is initialized to avoid SMP problems with
 * self modifying code.  This implies that asymmetric systems where
 * APs have fewer capabilities than the boot processor are not handled.
 * Tough.  Make sure you disable such features by hand.
 *
 * The caller will set the "force" argument to true for the final
 * invocation, such that no CALLs/JMPs to NULL pointers will be left
 * around.  See also the further comment below.
 */
static void init_or_livepatch _apply_alternatives(struct alt_instr *start,
                                                  struct alt_instr *end,
                                                  bool force)
{
    struct alt_instr *a, *base;

    printk(KERN_INFO "alt table %p -> %p\n", start, end);

    /*
     * The scan order should be from start to end.  Alternative code
     * scanned later can overwrite alternative code scanned earlier.
     * Some code (e.g. ALTERNATIVE_2()) relies on this order of patching.
     *
     * So be careful if you want to change the scan order to any other
     * order.
     */
    for ( a = base = start; a < end; a++ )
    {
        uint8_t *orig = ALT_ORIG_PTR(a);
        uint8_t *repl = ALT_REPL_PTR(a);
        uint8_t buf[MAX_PATCH_LEN];
        unsigned int total_len = a->orig_len + a->pad_len;

        BUG_ON(a->repl_len > total_len);
        BUG_ON(total_len > sizeof(buf));
        BUG_ON(a->cpuid >= NCAPINTS * 32);

        /*
         * Detect sequences of alt_instr's patching the same origin site,
         * and keep base pointing at the first alt_instr entry.  This is
         * so we can refer to a single ->priv field for some of our
         * patching decisions, in particular the NOP optimization.  We
         * deliberately use the alt_instr itself rather than a local
         * variable in case we end up making multiple passes.
         *
         * ->priv being nonzero means that the origin site has already
         * been modified, and we shouldn't try to optimise the nops again.
         */
        if ( ALT_ORIG_PTR(base) != orig )
            base = a;

        /* Skip patch sites already handled during the first pass. */
        if ( a->priv )
        {
            ASSERT(force);
            continue;
        }

        /* If there is no replacement to make, see about optimising the nops. */
        if ( !boot_cpu_has(a->cpuid) )
        {
            /* Origin site already touched?  Don't nop anything. */
            if ( base->priv )
                continue;

            a->priv = 1;

            /* Nothing useful to do? */
            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
                continue;

            add_nops(buf, a->pad_len);
            text_poke(orig + a->orig_len, buf, a->pad_len);
            continue;
        }

        memcpy(buf, repl, a->repl_len);
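        /*
         * Worked example (byte values illustrative only) of the two fixups
         * below.  A plain relative branch in the replacement,
         *
         *     e8 xx xx xx xx      call rel32  (encoded relative to repl)
         *
         * keeps its target after being copied to the origin site by having
         * (repl - orig) added to its displacement.  For the indirect-to-
         * direct altcall case, the origin site is
         *
         *     ff 15 xx xx xx xx   call *disp32(%rip)  (or ff 25 for jmp)
         *
         * and the replacement is "e8 fb ff ff ff" (displacement -5, i.e.
         * "call ."), which gets rewritten to reach the function pointer's
         * current value, *(orig + 6 + disp32), directly.
         */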
        /* 0xe8/0xe9 are relative branches; fix the offset. */
        if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
        {
            /*
             * Detect the special case of indirect-to-direct branch patching:
             * - replacement is a direct CALL/JMP (opcodes 0xE8/0xE9; already
             *   checked above),
             * - replacement's displacement is -5 (pointing back at the very
             *   insn, which makes no sense in a real replacement insn),
             * - original is an indirect CALL/JMP (opcodes 0xFF/2 or 0xFF/4)
             *   using RIP-relative addressing.
             * Some branch destinations may still be NULL when we come here
             * the first time.  Defer patching of those until the post-presmp-
             * initcalls re-invocation (with force set to true).  If at that
             * point the branch destination is still NULL, insert "UD2; UD0"
             * (for ease of recognition) instead of CALL/JMP.
             */
            if ( a->cpuid == X86_FEATURE_ALWAYS &&
                 *(int32_t *)(buf + 1) == -5 &&
                 a->orig_len >= 6 &&
                 orig[0] == 0xff &&
                 orig[1] == (*buf & 1 ? 0x25 : 0x15) )
            {
                long disp = *(int32_t *)(orig + 2);
                const uint8_t *dest = *(void **)(orig + 6 + disp);

                if ( dest )
                {
                    /*
                     * When building for CET-IBT, all function pointer targets
                     * should have an endbr64 instruction.
                     *
                     * If this is not the case, leave a warning because
                     * something is probably wrong with the build.  A CET-IBT
                     * enabled system might have exploded already.
                     *
                     * Otherwise, skip the endbr64 instruction.  This is a
                     * marginal perf improvement which saves on instruction
                     * decode bandwidth.
                     */
                    if ( IS_ENABLED(CONFIG_XEN_IBT) )
                    {
                        if ( is_endbr64(dest) )
                            dest += ENDBR64_LEN;
                        else
                            printk(XENLOG_WARNING
                                   "altcall %ps dest %ps has no endbr64\n",
                                   orig, dest);
                    }

                    disp = dest - (orig + 5);
                    ASSERT(disp == (int32_t)disp);
                    *(int32_t *)(buf + 1) = disp;
                }
                else if ( force )
                {
                    buf[0] = 0x0f;
                    buf[1] = 0x0b;
                    buf[2] = 0x0f;
                    buf[3] = 0xff;
                    buf[4] = 0xff;
                }
                else
                    continue;
            }
            else if ( force && system_state < SYS_STATE_active )
                ASSERT_UNREACHABLE();
            else
                *(int32_t *)(buf + 1) += repl - orig;
        }
        else if ( force && system_state < SYS_STATE_active )
            ASSERT_UNREACHABLE();

        a->priv = 1;

        add_nops(buf + a->repl_len, total_len - a->repl_len);
        text_poke(orig, buf, total_len);
    }

    /*
     * Clobber endbr64 instructions now that altcall has finished optimising
     * all indirect branches to direct ones.
     */
    if ( force && cpu_has_xen_ibt )
    {
        void *const *val;
        unsigned int clobbered = 0;

        /*
         * This is some minor structure (ab)use.  We walk the entire contents
         * of .init.{ro,}data.cf_clobber as if it were an array of pointers.
         *
         * If the pointer points into .text, and at an endbr64 instruction,
         * nop out the endbr64.  This causes the pointer to no longer be a
         * legal indirect branch target under CET-IBT.  This is a
         * defence-in-depth measure, to reduce the options available to an
         * adversary who has managed to hijack a function pointer.
         */
        for ( val = __initdata_cf_clobber_start;
              val < __initdata_cf_clobber_end;
              val++ )
        {
            void *ptr = *val;

            if ( !is_kernel_text(ptr) || !is_endbr64(ptr) )
                continue;

            place_endbr64_poison(ptr);
            clobbered++;
        }

        printk("altcall: Optimised away %u endbr64 instructions\n", clobbered);
    }
}

void init_or_livepatch apply_alternatives(struct alt_instr *start,
                                          struct alt_instr *end)
{
    _apply_alternatives(start, end, true);
}
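/*
 * Patching runs twice.  A sketch of the intended boot sequence (both entry
 * points are defined at the bottom of this file):
 *
 *     alternative_instructions();   // boot pass:  force=false, alt_todo = 1
 *     // ... presmp initcalls run, filling in function pointers ...
 *     alternative_branches();       // final pass: force=true,  alt_todo = 2
 *
 * "1u << force" below encodes which pass is pending, and alt_done
 * accumulates the passes which have completed.
 */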
static unsigned int __initdata alt_todo;
static unsigned int __initdata alt_done;

/*
 * At boot time, we patch alternatives in NMI context.  This means that the
 * active NMI-shadow will defer any further NMIs, removing the slim race
 * condition where an NMI hits while we are midway through patching some
 * instructions in the NMI path.
 */
static int __init cf_check nmi_apply_alternatives(
    const struct cpu_user_regs *regs, int cpu)
{
    /*
     * More than one NMI may occur between the two set_nmi_callback() below.
     * We only need to apply alternatives once.
     */
    if ( !(alt_done & alt_todo) )
    {
        /*
         * Relax perms on .text to be RWX, so we can modify them.
         *
         * This relaxes perms globally, but we run ahead of bringing APs
         * online, so only have our own TLB to worry about.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RWX);
        flush_local(FLUSH_TLB_GLOBAL);

        _apply_alternatives(__alt_instructions, __alt_instructions_end,
                            alt_done);

        /*
         * Reinstate perms on .text to be RX.  This also cleans out the
         * dirty bits, which matters when CET Shstk is active.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RX);
        flush_local(FLUSH_TLB_GLOBAL);

        alt_done |= alt_todo;
    }

    return 1;
}

/*
 * This routine is called with local interrupts disabled and is used during
 * bootup.
 */
static void __init _alternative_instructions(bool force)
{
    unsigned int i;
    nmi_callback_t *saved_nmi_callback;

    /*
     * Don't stop machine check exceptions while patching.  MCEs only
     * happen when something got corrupted, and in that case we must do
     * something about the corruption.  Ignoring it is worse than an
     * unlikely patching race.  Also, machine checks tend to be broadcast,
     * and if one CPU goes into machine check the others follow quickly,
     * so we don't expect a machine check to cause undue problems during
     * code patching.
     */
    ASSERT(!local_irq_is_enabled());

    /* Set what operation to perform /before/ setting the callback. */
    alt_todo = 1u << force;
    barrier();

    /*
     * As soon as the callback is set up, the next NMI will trigger patching,
     * even an NMI ahead of our explicit self-NMI.
     */
    saved_nmi_callback = set_nmi_callback(nmi_apply_alternatives);

    /* Send ourselves an NMI to trigger the callback. */
    self_nmi();

    /*
     * In practice, the self_nmi() above appears to act synchronously.
     * However, synchronous behaviour is not architecturally guaranteed.  To
     * cover the (hopefully never) async case, poll alt_done for up to one
     * second.
     */
    for ( i = 0; !(ACCESS_ONCE(alt_done) & alt_todo) && i < 1000; ++i )
        mdelay(1);

    if ( !(ACCESS_ONCE(alt_done) & alt_todo) )
        panic("Timed out waiting for alternatives self-NMI to hit\n");

    set_nmi_callback(saved_nmi_callback);
}

void __init alternative_instructions(void)
{
    arch_init_ideal_nops();
    _alternative_instructions(false);
}

void __init alternative_branches(void)
{
    local_irq_disable();
    _alternative_instructions(true);
    local_irq_enable();
}
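/*
 * For reference, a minimal sketch (not the authoritative definition, which
 * lives in asm/alternative.h) of the metadata this file consumes, matching
 * the fields and ALT_*_PTR() accessors used in _apply_alternatives():
 *
 *     struct __packed alt_instr {
 *         int32_t  orig_offset;   // ALT_ORIG_PTR(): the origin site
 *         int32_t  repl_offset;   // ALT_REPL_PTR(): the replacement code
 *         uint16_t cpuid;         // feature bit gating the replacement
 *         uint8_t  orig_len;      // length of the original instruction(s)
 *         uint8_t  repl_len;      // length of the replacement
 *         uint8_t  pad_len;       // build-time padding after the origin
 *         uint8_t  priv;          // scratch space for this file
 *     };
 */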