/******************************************************************************
 * wait.c
 *
 * Sleep in hypervisor context for some event to occur.
 *
 * Copyright (c) 2010, Keir Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/softirq.h>
#include <xen/wait.h>

struct waitqueue_vcpu {
    struct list_head list;
    struct vcpu *vcpu;
#ifdef CONFIG_X86
    /*
     * Xen/x86 does not have per-vcpu hypervisor stacks. So we must save the
     * hypervisor context before sleeping (descheduling), setjmp/longjmp-style.
     */
    void *esp;
    char *stack;
#endif
};

int init_waitqueue_vcpu(struct vcpu *v)
{
    struct waitqueue_vcpu *wqv;

    wqv = xzalloc(struct waitqueue_vcpu);
    if ( wqv == NULL )
        return -ENOMEM;

#ifdef CONFIG_X86
    wqv->stack = alloc_xenheap_page();
    if ( wqv->stack == NULL )
    {
        xfree(wqv);
        return -ENOMEM;
    }
#endif

    INIT_LIST_HEAD(&wqv->list);
    wqv->vcpu = v;

    v->waitqueue_vcpu = wqv;

    return 0;
}

void destroy_waitqueue_vcpu(struct vcpu *v)
{
    struct waitqueue_vcpu *wqv;

    wqv = v->waitqueue_vcpu;
    if ( wqv == NULL )
        return;

    BUG_ON(!list_empty(&wqv->list));
#ifdef CONFIG_X86
    free_xenheap_page(wqv->stack);
#endif
    xfree(wqv);

    v->waitqueue_vcpu = NULL;
}

void init_waitqueue_head(struct waitqueue_head *wq)
{
    spin_lock_init(&wq->lock);
    INIT_LIST_HEAD(&wq->list);
}

void destroy_waitqueue_head(struct waitqueue_head *wq)
{
    wake_up_all(wq);
}

void wake_up_nr(struct waitqueue_head *wq, unsigned int nr)
{
    struct waitqueue_vcpu *wqv;

    spin_lock(&wq->lock);

    while ( !list_empty(&wq->list) && nr-- )
    {
        wqv = list_entry(wq->list.next, struct waitqueue_vcpu, list);
        list_del_init(&wqv->list);
        vcpu_unpause(wqv->vcpu);
        put_domain(wqv->vcpu->domain);
    }

    spin_unlock(&wq->lock);
}

void wake_up_one(struct waitqueue_head *wq)
{
    wake_up_nr(wq, 1);
}

void wake_up_all(struct waitqueue_head *wq)
{
    wake_up_nr(wq, UINT_MAX);
}

#ifdef CONFIG_X86

static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
{
    struct cpu_info *cpu_info = get_cpu_info();
    struct vcpu *curr = current;
    unsigned long dummy;

    ASSERT(wqv->esp == 0);

    /* Save current VCPU affinity; force wakeup on *this* CPU only. */
    if ( vcpu_temporary_affinity(curr, smp_processor_id(), VCPU_AFFINITY_WAIT) )
    {
        gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
        domain_crash(curr->domain);

        for ( ; ; )
            do_softirq();
    }

    /*
     * Hand-rolled setjmp().
     *
     * __prepare_to_wait() is the leaf of a deep calltree. Preserve the GPRs,
     * bounds check what we want to stash in wqv->stack, copy the active stack
     * (up to cpu_info) into wqv->stack, then return normally. Our caller
     * will shortly schedule() and discard the current context.
     *
     * The copy out is performed with a rep movsb. When
     * check_wakeup_from_wait() longjmp()'s back into us, %rsp is pre-adjusted
     * to be suitable and %rsi/%rdi are swapped, so the rep movsb instead
     * copies in from wqv->stack over the active stack.
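     *
     * Register roles around the rep movsb (an illustrative summary of the asm
     * constraints below, not additional behaviour):
     *   save path:    %rsi = current %rsp (source), %rdi = wqv->stack (dest),
     *                 %rcx = bytes between %rsp and cpu_info
     *   resume path:  %rsi = wqv->stack (source), %rdi = the saved stack
     *                 pointer (dest), with %rcx recomputed by
     *                 check_wakeup_from_wait()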
     */
    asm volatile (
        "push %%rbx; push %%rbp; push %%r12;"
        "push %%r13; push %%r14; push %%r15;"

        "sub %%esp,%%ecx;"
        "cmp %[sz], %%ecx;"
        "ja .L_skip;"               /* Bail if >4k */
        "mov %%rsp,%%rsi;"

        /* check_wakeup_from_wait() longjmp()'s to this point. */
        ".L_wq_resume: rep movsb;"
        "mov %%rsp,%%rsi;"

        ".L_skip:"
        "pop %%r15; pop %%r14; pop %%r13;"
        "pop %%r12; pop %%rbp; pop %%rbx"
        : "=&S" (wqv->esp), "=&c" (dummy), "=&D" (dummy)
        : "0" (0), "1" (cpu_info), "2" (wqv->stack),
          [sz] "i" (PAGE_SIZE)
        : "memory", "rax", "rdx", "r8", "r9", "r10", "r11" );

    if ( unlikely(wqv->esp == 0) )
    {
        gdprintk(XENLOG_ERR, "Stack too large in %s\n", __func__);
        domain_crash(curr->domain);

        for ( ; ; )
            do_softirq();
    }
}

static void __finish_wait(struct waitqueue_vcpu *wqv)
{
    wqv->esp = NULL;
    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
}

void check_wakeup_from_wait(void)
{
    struct vcpu *curr = current;
    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;

    ASSERT(list_empty(&wqv->list));

    if ( likely(wqv->esp == NULL) )
        return;

    /* Check if we are still pinned. */
    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
    {
        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
        domain_crash(curr->domain);

        /* Re-initiate scheduler and don't longjmp(). */
        raise_softirq(SCHEDULE_SOFTIRQ);
        for ( ; ; )
            do_softirq();
    }

    /*
     * We are about to jump into a deeper call tree. In principle, this risks
     * executing more RET than CALL instructions, and underflowing the RSB.
     *
     * However, we are pinned to the same CPU as previously. Therefore,
     * either:
     *
     *   1) We've scheduled another vCPU in the meantime, and the context
     *      switch path has (by default) issued IBPB which flushes the RSB,
     *      or
     *
     *   2) We're still in the same context. Returning back to the deeper
     *      call tree is resuming the execution path we left, and remains
     *      balanced as far as that logic is concerned.
     *
     *      In fact, the path through the scheduler will execute more CALL
     *      than RET instructions, making the RSB unbalanced in the safe
     *      direction.
     *
     * Therefore, no actions are necessary here to maintain RSB safety.
     */

    /*
     * Hand-rolled longjmp().
     *
     * check_wakeup_from_wait() is always called with a shallow stack,
     * immediately after the vCPU has been rescheduled.
     *
     * Adjust %rsp to be the correct depth for the (deeper) stack we want to
     * restore, then prepare %rsi, %rdi and %rcx such that when we rejoin the
     * rep movs in __prepare_to_wait(), it copies from wqv->stack over the
     * active stack.
     *
     * All other GPRs are available for use; they're restored from the stack,
     * or explicitly clobbered.
     */
    asm volatile (
        "mov %%rdi, %%rsp;"
        "jmp .L_wq_resume"
        : : "S" (wqv->stack), "D" (wqv->esp),
            "c" ((char *)get_cpu_info() - (char *)wqv->esp)
        : "memory" );
    unreachable();
}

#else /* !CONFIG_X86 */

#define __prepare_to_wait(wqv) ((void)0)
#define __finish_wait(wqv) ((void)0)

#endif

void prepare_to_wait(struct waitqueue_head *wq)
{
    struct vcpu *curr = current;
    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;

    ASSERT_NOT_IN_ATOMIC();
    __prepare_to_wait(wqv);

    ASSERT(list_empty(&wqv->list));
    spin_lock(&wq->lock);
    list_add_tail(&wqv->list, &wq->list);
    vcpu_pause_nosync(curr);
    get_knownalive_domain(curr->domain);
    spin_unlock(&wq->lock);
}

void finish_wait(struct waitqueue_head *wq)
{
    struct vcpu *curr = current;
    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;

    __finish_wait(wqv);

    if ( list_empty(&wqv->list) )
        return;

    spin_lock(&wq->lock);
    if ( !list_empty(&wqv->list) )
    {
        list_del_init(&wqv->list);
        vcpu_unpause(curr);
        put_domain(curr->domain);
    }
    spin_unlock(&wq->lock);
}
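
/*
 * Usage sketch (illustrative only): callers normally go through the
 * wait_event() macro declared in xen/wait.h rather than pairing
 * prepare_to_wait()/finish_wait() by hand.  The pattern is roughly:
 *
 *     for ( ; ; )
 *     {
 *         prepare_to_wait(&wq);   // queue and pause the current vCPU
 *         if ( condition )        // re-check after queuing to avoid lost wakeups
 *             break;
 *         wait();                 // deschedule; resumes via the longjmp path above
 *     }
 *     finish_wait(&wq);           // dequeue and drop the temporary affinity
 *
 * 'wq' and 'condition' are placeholders; see xen/wait.h for the exact
 * expansion.
 */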