/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/param.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/err.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h>
#include <xen/multicall.h>
#include <xen/cpu.h>
#include <xen/preempt.h>
#include <public/sched.h>
#include <xsm/xsm.h>

#include "private.h"

#ifdef CONFIG_XEN_GUEST
#include <asm/guest.h>
#else
#define pv_shim false
#endif

/* opt_sched: scheduler - default to configured value */
static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
string_param("sched", opt_sched);

/*
 * If sched_smt_power_savings is set, the scheduler will give preference to
 * a partially idle package over a fully idle one when picking a pCPU to
 * schedule a vCPU on.
 */
bool sched_smt_power_savings;
boolean_param("sched_smt_power_savings", sched_smt_power_savings);

/*
 * Default scheduling rate limit: 1ms
 * The behavior when sched_ratelimit_us is greater than
 * sched_credit_tslice_ms is undefined.
 */
int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
integer_param("sched_ratelimit_us", sched_ratelimit_us);

/* Number of vcpus per struct sched_unit. */
bool __read_mostly sched_disable_smt_switching;

cpumask_t sched_res_mask;

/* Common lock for free cpus. */
static DEFINE_SPINLOCK(sched_free_cpu_lock);

/* Various timer handlers. */
static void cf_check s_timer_fn(void *unused);
static void cf_check vcpu_periodic_timer_fn(void *data);
static void cf_check vcpu_singleshot_timer_fn(void *data);
static void cf_check poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res);
static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx);
DEFINE_RCU_READ_LOCK(sched_res_rculock);

/* Scratch space for cpumasks. */
DEFINE_PER_CPU(cpumask_t, cpumask_scratch);

/* How many urgent vcpus. */
DEFINE_PER_CPU(atomic_t, sched_urgent_count);

extern const struct scheduler *__start_schedulers_array[],
                              *__end_schedulers_array[];
#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
#define schedulers __start_schedulers_array

static struct scheduler __read_mostly ops;

static bool scheduler_active;

static void sched_set_affinity(
    struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft);

static struct sched_resource *cf_check
sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
{
    return unit->res;
}

static void *cf_check
sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
                       void *dd)
{
    /*
     * Any non-NULL pointer is fine here.
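     * The idle scheduler keeps no per-unit state, so instead of allocating
     * anything we hand back the shared ZERO_BLOCK_PTR sentinel: it is
     * distinguishable from the NULL "allocation failed" case while costing
     * nothing.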
     */
    return ZERO_BLOCK_PTR;
}

static void cf_check sched_idle_free_udata(const struct scheduler *ops,
                                           void *priv)
{
}

static void cf_check sched_idle_schedule(
    const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
    bool tasklet_work_scheduled)
{
    const unsigned int cpu = smp_processor_id();

    unit->next_time = -1;
    unit->next_task = sched_idle_unit(cpu);
}

static struct scheduler sched_idle_ops = {
    .name           = "Idle Scheduler",
    .opt_name       = "idle",
    .sched_data     = NULL,

    .pick_resource  = sched_idle_res_pick,
    .do_schedule    = sched_idle_schedule,

    .alloc_udata    = sched_idle_alloc_udata,
    .free_udata     = sched_idle_free_udata,
};

static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
                                         unsigned int cpu)
{
    unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
    const struct domain *d = unit->domain;

    return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
}

static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
                                               unsigned int cpu)
{
    struct vcpu *v = unit2vcpu_cpu(unit, cpu);

    return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
}

static inline struct scheduler *dom_scheduler(const struct domain *d)
{
    if ( likely(d->cpupool != NULL) )
        return d->cpupool->sched;

    /*
     * If d->cpupool is NULL, this is the idle domain. This is special
     * because the idle domain does not really belong to any cpupool, and,
     * hence, does not really have a scheduler.
     *
     * This is (should be!) only called like this for allocating the idle
     * vCPUs for the first time, during boot, in which case what we want
     * is the default scheduler that has been chosen at boot.
     */
    ASSERT(is_idle_domain(d));
    return &ops;
}

static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
{
    const struct domain *d = unit->domain;

    if ( likely(d->cpupool != NULL) )
        return d->cpupool->sched;

    /*
     * If d->cpupool is NULL, this is a unit of the idle domain. And this
     * case is special because the idle domain does not really belong to
     * a cpupool and, hence, doesn't really have a scheduler. In fact, its
     * units (may) run on pCPUs which are in different pools, with different
     * schedulers.
     *
     * What we want, in this case, is the scheduler of the pCPU where this
     * particular idle unit is running. And, since unit->res never changes
     * for idle units, it is safe to use it, with no locks, to figure that
     * out.
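     * (As a consequence, two idle units whose pCPUs sit in different
     * cpupools can legitimately report different schedulers here.)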
*/ ASSERT(is_idle_domain(d)); return unit->res->scheduler; } static inline struct scheduler *vcpu_scheduler(const struct vcpu *v) { return unit_scheduler(v->sched_unit); } #define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain) static inline void trace_runstate_change(const struct vcpu *v, int new_state) { struct { uint32_t vcpu:16, domain:16; } d; uint32_t event; if ( likely(!tb_init_done) ) return; d.vcpu = v->vcpu_id; d.domain = v->domain->domain_id; event = TRC_SCHED_RUNSTATE_CHANGE; event |= ( v->runstate.state & 0x3 ) << 8; event |= ( new_state & 0x3 ) << 4; __trace_var(event, 1/*tsc*/, sizeof(d), &d); } static inline void trace_continue_running(const struct vcpu *v) { struct { uint32_t vcpu:16, domain:16; } d; if ( likely(!tb_init_done) ) return; d.vcpu = v->vcpu_id; d.domain = v->domain->domain_id; __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); } static inline void vcpu_urgent_count_update(struct vcpu *v) { if ( is_idle_vcpu(v) ) return; if ( unlikely(v->is_urgent) ) { if ( !(v->pause_flags & VPF_blocked) || !test_bit(v->vcpu_id, v->domain->poll_mask) ) { v->is_urgent = 0; atomic_dec(&per_cpu(sched_urgent_count, v->processor)); } } else { if ( unlikely(v->pause_flags & VPF_blocked) && unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) ) { v->is_urgent = 1; atomic_inc(&per_cpu(sched_urgent_count, v->processor)); } } } static inline void vcpu_runstate_change( struct vcpu *v, int new_state, s_time_t new_entry_time) { s_time_t delta; struct sched_unit *unit = v->sched_unit; ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); if ( v->runstate.state == new_state ) return; vcpu_urgent_count_update(v); trace_runstate_change(v, new_state); if ( !is_idle_vcpu(v) ) { unit->runstate_cnt[v->runstate.state]--; unit->runstate_cnt[new_state]++; } delta = new_entry_time - v->runstate.state_entry_time; if ( delta > 0 ) { v->runstate.time[v->runstate.state] += delta; v->runstate.state_entry_time = new_entry_time; } v->runstate.state = new_state; } void sched_guest_idle(void (*idle) (void), unsigned int cpu) { /* * Another vcpu of the unit is active in guest context while this one is * idle. In case of a scheduling event we don't want to have high latencies * due to a cpu needing to wake up from deep C state for joining the * rendezvous, so avoid those deep C states by incrementing the urgent * count of the cpu. */ atomic_inc(&per_cpu(sched_urgent_count, cpu)); idle(); atomic_dec(&per_cpu(sched_urgent_count, cpu)); } void vcpu_runstate_get(const struct vcpu *v, struct vcpu_runstate_info *runstate) { spinlock_t *lock; s_time_t delta; struct sched_unit *unit; rcu_read_lock(&sched_res_rculock); /* * Be careful in case of an idle vcpu: the assignment to a unit might * change even with the scheduling lock held, so be sure to use the * correct unit for locking in order to avoid triggering an ASSERT() in * the unlock function. */ unit = is_idle_vcpu(v) ? get_sched_res(v->processor)->sched_unit_idle : v->sched_unit; lock = likely(v == current) ? 
                                          NULL : unit_schedule_lock_irq(unit);

    memcpy(runstate, &v->runstate, sizeof(*runstate));

    delta = NOW() - runstate->state_entry_time;
    if ( delta > 0 )
        runstate->time[runstate->state] += delta;

    if ( unlikely(lock != NULL) )
        unit_schedule_unlock_irq(lock, unit);

    rcu_read_unlock(&sched_res_rculock);
}

uint64_t get_cpu_idle_time(unsigned int cpu)
{
    struct vcpu_runstate_info state = { 0 };
    const struct vcpu *v = idle_vcpu[cpu];

    if ( cpu_online(cpu) && get_sched_res(cpu) )
        vcpu_runstate_get(v, &state);

    return state.time[RUNSTATE_running];
}

/*
 * If locks are different, take the one with the lower address first.
 * This avoids dead- or live-locks when this code is running on both
 * cpus at the same time.
 */
static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
                                   unsigned long *flags)
{
    if ( lock1 == lock2 )
    {
        spin_lock_irqsave(lock1, *flags);
    }
    else if ( lock1 < lock2 )
    {
        spin_lock_irqsave(lock1, *flags);
        spin_lock(lock2);
    }
    else
    {
        spin_lock_irqsave(lock2, *flags);
        spin_lock(lock1);
    }
}

static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
                                     unsigned long flags)
{
    if ( lock1 != lock2 )
        spin_unlock(lock2);
    spin_unlock_irqrestore(lock1, flags);
}

static void sched_free_unit_mem(struct sched_unit *unit)
{
    struct sched_unit *prev_unit;
    struct domain *d = unit->domain;

    if ( d->sched_unit_list == unit )
        d->sched_unit_list = unit->next_in_list;
    else
    {
        for_each_sched_unit ( d, prev_unit )
        {
            if ( prev_unit->next_in_list == unit )
            {
                prev_unit->next_in_list = unit->next_in_list;
                break;
            }
        }
    }

    free_cpumask_var(unit->cpu_hard_affinity);
    free_cpumask_var(unit->cpu_hard_affinity_saved);
    free_cpumask_var(unit->cpu_soft_affinity);

    xfree(unit);
}

static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
{
    const struct vcpu *vunit;
    unsigned int cnt = 0;

    /* Don't count the vcpu being released: it might not be in the list yet. */
    for_each_sched_unit_vcpu ( unit, vunit )
        if ( vunit != v )
            cnt++;

    v->sched_unit = NULL;
    unit->runstate_cnt[v->runstate.state]--;

    if ( unit->vcpu_list == v )
        unit->vcpu_list = v->next_in_list;

    if ( !cnt )
        sched_free_unit_mem(unit);
}

static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
{
    v->sched_unit = unit;

    /* All but idle vcpus are allocated with sequential vcpu_id. */
    if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
    {
        unit->vcpu_list = v;
        /*
         * unit_id is always the same as the lowest vcpu_id of the unit.
         * This is used for stopping the for_each_sched_unit_vcpu() loop and
         * in order to support cpupools with different granularities.
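         * E.g. with a scheduling granularity of 2, vcpus 0 and 1 share the
         * unit with unit_id 0, vcpus 2 and 3 the unit with unit_id 2, and
         * so on: vcpu_id / granularity identifies the unit a vcpu lives in.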
*/ unit->unit_id = v->vcpu_id; } unit->runstate_cnt[v->runstate.state]++; } static struct sched_unit *sched_alloc_unit_mem(void) { struct sched_unit *unit; unit = xzalloc(struct sched_unit); if ( !unit ) return NULL; if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) || !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) || !zalloc_cpumask_var(&unit->cpu_soft_affinity) ) { sched_free_unit_mem(unit); unit = NULL; } return unit; } static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d) { struct sched_unit **prev_unit; unit->domain = d; for ( prev_unit = &d->sched_unit_list; *prev_unit; prev_unit = &(*prev_unit)->next_in_list ) if ( (*prev_unit)->next_in_list && (*prev_unit)->next_in_list->unit_id > unit->unit_id ) break; unit->next_in_list = *prev_unit; *prev_unit = unit; } static struct sched_unit *sched_alloc_unit(struct vcpu *v) { struct sched_unit *unit; struct domain *d = v->domain; unsigned int gran = cpupool_get_granularity(d->cpupool); for_each_sched_unit ( d, unit ) if ( unit->unit_id / gran == v->vcpu_id / gran ) break; if ( unit ) { sched_unit_add_vcpu(unit, v); return unit; } if ( (unit = sched_alloc_unit_mem()) == NULL ) return NULL; sched_unit_add_vcpu(unit, v); sched_domain_insert_unit(unit, d); return unit; } static unsigned int sched_select_initial_cpu(const struct vcpu *v) { const struct domain *d = v->domain; nodeid_t node; spinlock_t *lock; unsigned long flags; unsigned int cpu_ret, cpu = smp_processor_id(); cpumask_t *cpus = cpumask_scratch_cpu(cpu); lock = pcpu_schedule_lock_irqsave(cpu, &flags); cpumask_clear(cpus); for_each_node_mask ( node, d->node_affinity ) cpumask_or(cpus, cpus, &node_to_cpumask(node)); cpumask_and(cpus, cpus, d->cpupool->cpu_valid); if ( cpumask_empty(cpus) ) cpumask_copy(cpus, d->cpupool->cpu_valid); if ( v->vcpu_id == 0 ) cpu_ret = cpumask_first(cpus); else { /* We can rely on previous vcpu being available. */ ASSERT(!is_idle_domain(d)); cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus); } pcpu_schedule_unlock_irqrestore(lock, flags, cpu); return cpu_ret; } int sched_init_vcpu(struct vcpu *v) { const struct domain *d = v->domain; struct sched_unit *unit; unsigned int processor; if ( (unit = sched_alloc_unit(v)) == NULL ) return 1; if ( is_idle_domain(d) ) processor = v->vcpu_id; else processor = sched_select_initial_cpu(v); /* Initialise the per-vcpu timers. */ spin_lock_init(&v->periodic_timer_lock); init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor); init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor); init_timer(&v->poll_timer, poll_timer_fn, v, processor); /* If this is not the first vcpu of the unit we are done. */ if ( unit->priv != NULL ) { v->processor = processor; return 0; } rcu_read_lock(&sched_res_rculock); /* The first vcpu of an unit can be set via sched_set_res(). */ sched_set_res(unit, get_sched_res(processor)); unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv); if ( unit->priv == NULL ) { sched_free_unit(unit, v); rcu_read_unlock(&sched_res_rculock); return 1; } if ( is_idle_domain(d) ) { /* Idle vCPUs are always pinned onto their respective pCPUs */ sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); } else if ( pv_shim && v->vcpu_id == 0 ) { /* * PV-shim: vcpus are pinned 1:1. Initially only 1 cpu is online, * others will be dealt with when onlining them. This avoids pinning * a vcpu to a not yet online cpu here. 
         */
        sched_set_affinity(unit, cpumask_of(0), cpumask_of(0));
    }
    else if ( d->domain_id == 0 && opt_dom0_vcpus_pin )
    {
        /*
         * If dom0_vcpus_pin is specified, dom0 vCPUs are pinned 1:1 to
         * their respective pCPUs too.
         */
        sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
    }
#ifdef CONFIG_X86
    else if ( d->domain_id == 0 )
    {
        /*
         * In the absence of dom0_vcpus_pin, the hard and soft affinity of
         * dom0 is controlled by the (x86 only) dom0_nodes parameter. At this
         * point it has been parsed and decoded into the dom0_cpus mask.
         *
         * Note that we always honor what the user explicitly requested, for
         * both hard and soft affinity, without doing any dynamic computation
         * of either of them.
         */
        if ( !dom0_affinity_relaxed )
            sched_set_affinity(unit, &dom0_cpus, &cpumask_all);
        else
            sched_set_affinity(unit, &cpumask_all, &dom0_cpus);
    }
#endif
    else
        sched_set_affinity(unit, &cpumask_all, &cpumask_all);

    /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
    if ( is_idle_domain(d) )
    {
        get_sched_res(v->processor)->curr = unit;
        get_sched_res(v->processor)->sched_unit_idle = unit;
        v->is_running = true;
        unit->is_running = true;
        unit->state_entry_time = NOW();
    }
    else
    {
        sched_insert_unit(dom_scheduler(d), unit);
    }

    rcu_read_unlock(&sched_res_rculock);

    return 0;
}

static void vcpu_move_irqs(struct vcpu *v)
{
    arch_move_irqs(v);
    evtchn_move_pirqs(v);
}

static void sched_move_irqs(const struct sched_unit *unit)
{
    struct vcpu *v;

    for_each_sched_unit_vcpu ( unit, v )
        vcpu_move_irqs(v);
}

/*
 * Move a domain from one cpupool to another.
 *
 * A domain with any vcpu having temporary affinity settings will be denied
 * the move. Hard and soft affinities will be reset.
 *
 * In order to support cpupools with different scheduling granularities all
 * scheduling units are replaced by new ones.
 *
 * The complete move is done in the following steps:
 * - check prerequisites (no vcpu with temporary affinities)
 * - allocate all new data structures (scheduler specific domain data, unit
 *   memory, scheduler specific unit data)
 * - pause domain
 * - temporarily move all (old) units to the same scheduling resource (this
 *   makes the final resource assignment easier in case the new cpupool has
 *   a larger granularity than the old one, as the scheduling locks for all
 *   vcpus must be held for that operation)
 * - remove old units from scheduling
 * - set new cpupool and scheduler domain data pointers in struct domain
 * - switch all vcpus to new units, still assigned to the old scheduling
 *   resource
 * - migrate all new units to scheduling resources of the new cpupool
 * - unpause the domain
 * - free the old memory (scheduler specific domain data, unit memory,
 *   scheduler specific unit data)
 */
int sched_move_domain(struct domain *d, struct cpupool *c)
{
    struct vcpu *v;
    struct sched_unit *unit, *old_unit;
    struct sched_unit *new_units = NULL, *old_units;
    struct sched_unit **unit_ptr = &new_units;
    unsigned int new_p, unit_idx;
    void *domdata;
    struct scheduler *old_ops = dom_scheduler(d);
    void *old_domdata;
    unsigned int gran = cpupool_get_granularity(c);
    unsigned int n_units = d->vcpu[0] ? DIV_ROUND_UP(d->max_vcpus, gran) : 0;
    int ret = 0;

    for_each_vcpu ( d, v )
    {
        if ( v->affinity_broken )
            return -EBUSY;
    }

    rcu_read_lock(&sched_res_rculock);

    domdata = sched_alloc_domdata(c->sched, d);
    if ( IS_ERR(domdata) )
    {
        ret = PTR_ERR(domdata);
        goto out;
    }

    for ( unit_idx = 0; unit_idx < n_units; unit_idx++ )
    {
        unit = sched_alloc_unit_mem();
        if ( unit )
        {
            /*
             * Initialize unit for sched_alloc_udata() to work.
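             * At least unit->domain, unit->unit_id and unit->vcpu_list are
             * expected to be valid here, as the scheduler specific
             * allocators may look at any of them.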
*/ unit->domain = d; unit->unit_id = unit_idx * gran; unit->vcpu_list = d->vcpu[unit->unit_id]; unit->priv = sched_alloc_udata(c->sched, unit, domdata); *unit_ptr = unit; } if ( !unit || !unit->priv ) { old_units = new_units; old_domdata = domdata; ret = -ENOMEM; goto out_free; } unit_ptr = &unit->next_in_list; } domain_pause(d); old_domdata = d->sched_priv; /* * Temporarily move all units to same processor to make locking * easier when moving the new units to the new processors. */ new_p = cpumask_first(d->cpupool->cpu_valid); for_each_sched_unit ( d, unit ) { spinlock_t *lock = unit_schedule_lock_irq(unit); sched_set_res(unit, get_sched_res(new_p)); spin_unlock_irq(lock); sched_remove_unit(old_ops, unit); } old_units = d->sched_unit_list; d->cpupool = c; d->sched_priv = domdata; unit = new_units; for_each_vcpu ( d, v ) { old_unit = v->sched_unit; if ( unit->unit_id + gran == v->vcpu_id ) unit = unit->next_in_list; unit->state_entry_time = old_unit->state_entry_time; unit->runstate_cnt[v->runstate.state]++; /* Temporarily use old resource assignment */ unit->res = get_sched_res(new_p); v->sched_unit = unit; } d->sched_unit_list = new_units; new_p = cpumask_first(c->cpu_valid); for_each_sched_unit ( d, unit ) { spinlock_t *lock; unsigned int unit_p = new_p; for_each_sched_unit_vcpu ( unit, v ) { migrate_timer(&v->periodic_timer, new_p); migrate_timer(&v->singleshot_timer, new_p); migrate_timer(&v->poll_timer, new_p); new_p = cpumask_cycle(new_p, c->cpu_valid); } lock = unit_schedule_lock_irq(unit); sched_set_affinity(unit, &cpumask_all, &cpumask_all); sched_set_res(unit, get_sched_res(unit_p)); /* * With v->processor modified we must not * - make any further changes assuming we hold the scheduler lock, * - use unit_schedule_unlock_irq(). */ spin_unlock_irq(lock); if ( !d->is_dying ) sched_move_irqs(unit); sched_insert_unit(c->sched, unit); unit_idx++; } domain_update_node_affinity(d); domain_unpause(d); out_free: for ( unit = old_units; unit; ) { if ( unit->priv ) sched_free_udata(c->sched, unit->priv); old_unit = unit; unit = unit->next_in_list; xfree(old_unit); } sched_free_domdata(old_ops, old_domdata); out: rcu_read_unlock(&sched_res_rculock); return ret; } void sched_destroy_vcpu(struct vcpu *v) { struct sched_unit *unit = v->sched_unit; kill_timer(&v->periodic_timer); kill_timer(&v->singleshot_timer); kill_timer(&v->poll_timer); if ( test_and_clear_bool(v->is_urgent) ) atomic_dec(&per_cpu(sched_urgent_count, v->processor)); /* * Vcpus are being destroyed top-down. So being the first vcpu of an unit * is the same as being the only one. 
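     * E.g. with two vcpus per unit, vcpu 1 is destroyed first and leaves
     * the unit in place; the subsequent call for vcpu 0 then removes and
     * frees the unit.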
*/ if ( unit->vcpu_list == v ) { rcu_read_lock(&sched_res_rculock); sched_remove_unit(vcpu_scheduler(v), unit); sched_free_udata(vcpu_scheduler(v), unit->priv); sched_free_unit(unit, v); rcu_read_unlock(&sched_res_rculock); } } int sched_init_domain(struct domain *d, unsigned int poolid) { void *sdom; int ret; ASSERT(d->cpupool == NULL); ASSERT(d->domain_id < DOMID_FIRST_RESERVED); if ( (ret = cpupool_add_domain(d, poolid)) ) return ret; SCHED_STAT_CRANK(dom_init); TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id); rcu_read_lock(&sched_res_rculock); sdom = sched_alloc_domdata(dom_scheduler(d), d); rcu_read_unlock(&sched_res_rculock); if ( IS_ERR(sdom) ) return PTR_ERR(sdom); d->sched_priv = sdom; return 0; } void sched_destroy_domain(struct domain *d) { ASSERT(d->domain_id < DOMID_FIRST_RESERVED); if ( d->cpupool ) { SCHED_STAT_CRANK(dom_destroy); TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); rcu_read_lock(&sched_res_rculock); sched_free_domdata(dom_scheduler(d), d->sched_priv); d->sched_priv = NULL; rcu_read_unlock(&sched_res_rculock); cpupool_rm_domain(d); } } static void vcpu_sleep_nosync_locked(struct vcpu *v) { struct sched_unit *unit = v->sched_unit; ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); if ( likely(!vcpu_runnable(v)) ) { if ( v->runstate.state == RUNSTATE_runnable ) vcpu_runstate_change(v, RUNSTATE_offline, NOW()); /* Only put unit to sleep in case all vcpus are not runnable. */ if ( likely(!unit_runnable(unit)) ) sched_sleep(unit_scheduler(unit), unit); else if ( unit_running(unit) > 1 && v->is_running && !v->force_context_switch ) { v->force_context_switch = true; cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); } } } void vcpu_sleep_nosync(struct vcpu *v) { unsigned long flags; spinlock_t *lock; TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); rcu_read_lock(&sched_res_rculock); lock = unit_schedule_lock_irqsave(v->sched_unit, &flags); vcpu_sleep_nosync_locked(v); unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit); rcu_read_unlock(&sched_res_rculock); } void vcpu_sleep_sync(struct vcpu *v) { vcpu_sleep_nosync(v); while ( !vcpu_runnable(v) && v->is_running ) cpu_relax(); sync_vcpu_execstate(v); } void vcpu_wake(struct vcpu *v) { unsigned long flags; spinlock_t *lock; struct sched_unit *unit = v->sched_unit; TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); rcu_read_lock(&sched_res_rculock); lock = unit_schedule_lock_irqsave(unit, &flags); if ( likely(vcpu_runnable(v)) ) { if ( v->runstate.state >= RUNSTATE_blocked ) vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); /* * Call sched_wake() unconditionally, even if unit is running already. * We might have not been de-scheduled after vcpu_sleep_nosync_locked() * and are now to be woken up again. */ sched_wake(unit_scheduler(unit), unit); if ( unit->is_running && !v->is_running && !v->force_context_switch ) { v->force_context_switch = true; cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); } } else if ( !(v->pause_flags & VPF_blocked) ) { if ( v->runstate.state == RUNSTATE_blocked ) vcpu_runstate_change(v, RUNSTATE_offline, NOW()); } unit_schedule_unlock_irqrestore(lock, flags, unit); rcu_read_unlock(&sched_res_rculock); } void vcpu_unblock(struct vcpu *v) { if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) return; /* Polling period ends when a VCPU is unblocked. */ if ( unlikely(v->poll_evtchn != 0) ) { v->poll_evtchn = 0; /* * We *must* re-clear _VPF_blocked to avoid racing other wakeups of * this VCPU (and it then going back to sleep on poll_mask). 
         * Clearing the poll_mask bit atomically via test_and_clear_bit()
         * ensures only the caller which actually ended the polling period
         * re-clears _VPF_blocked.
         */
        if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
            clear_bit(_VPF_blocked, &v->pause_flags);
    }

    vcpu_wake(v);
}

/*
 * Do the actual movement of an unit from old to new CPU. Locks for *both*
 * CPUs need to have been taken already when calling this!
 */
static void sched_unit_move_locked(struct sched_unit *unit,
                                   unsigned int new_cpu)
{
    unsigned int old_cpu = unit->res->master_cpu;
    const struct vcpu *v;

    rcu_read_lock(&sched_res_rculock);

    /*
     * Transfer urgency status to new CPU before switching CPUs, as
     * once the switch occurs, v->is_urgent is no longer protected by
     * the per-CPU scheduler lock we are holding.
     */
    for_each_sched_unit_vcpu ( unit, v )
    {
        if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
        {
            atomic_inc(&per_cpu(sched_urgent_count, new_cpu));
            atomic_dec(&per_cpu(sched_urgent_count, old_cpu));
        }
    }

    /*
     * Actual CPU switch to new CPU. This is safe because the lock
     * pointer can't change while the current lock is held.
     */
    sched_migrate(unit_scheduler(unit), unit, new_cpu);

    rcu_read_unlock(&sched_res_rculock);
}

/*
 * Initiating migration
 *
 * In order to migrate, we need the unit in question to have stopped
 * running and have called sched_sleep() (to take it off any
 * runqueues, for instance); and if it is currently running, it needs
 * to be scheduled out. Finally, we need to hold the scheduling locks
 * for both the processor we're migrating from, and the processor
 * we're migrating to.
 *
 * In order to avoid deadlock while satisfying the final requirement,
 * we must release any scheduling lock we hold, then try to grab both
 * locks we want, then double-check to make sure that what we started
 * to do hasn't been changed in the meantime.
 *
 * These steps are encapsulated in the following two functions; they
 * should be called like this:
 *
 *     lock = unit_schedule_lock_irq(unit);
 *     sched_unit_migrate_start(unit);
 *     unit_schedule_unlock_irq(lock, unit)
 *     sched_unit_migrate_finish(unit);
 *
 * sched_unit_migrate_finish() will do the work now if it can, or simply
 * return if it can't (because unit is still running); in that case
 * sched_unit_migrate_finish() will be called by unit_context_saved().
 */
static void sched_unit_migrate_start(struct sched_unit *unit)
{
    struct vcpu *v;

    for_each_sched_unit_vcpu ( unit, v )
    {
        set_bit(_VPF_migrating, &v->pause_flags);
        vcpu_sleep_nosync_locked(v);
    }
}

static void sched_unit_migrate_finish(struct sched_unit *unit)
{
    unsigned long flags;
    unsigned int old_cpu, new_cpu;
    spinlock_t *old_lock, *new_lock;
    bool pick_called = false;
    struct vcpu *v;

    /*
     * If the unit is currently running, this will be handled by
     * unit_context_saved(); and in any case, if the bit is cleared, then
     * someone else has already done the work so we don't need to.
     */
    if ( unit->is_running )
        return;
    for_each_sched_unit_vcpu ( unit, v )
        if ( !test_bit(_VPF_migrating, &v->pause_flags) )
            return;

    old_cpu = new_cpu = unit->res->master_cpu;
    for ( ; ; )
    {
        /*
         * We need another iteration if the pre-calculated lock addresses
         * are not correct any longer after evaluating old and new cpu
         * holding the locks.
         */
        old_lock = get_sched_res(old_cpu)->schedule_lock;
        new_lock = get_sched_res(new_cpu)->schedule_lock;

        sched_spin_lock_double(old_lock, new_lock, &flags);

        old_cpu = unit->res->master_cpu;
        if ( old_lock == get_sched_res(old_cpu)->schedule_lock )
        {
            /*
             * If we selected a CPU on the previous iteration, check if it
             * remains suitable for running this vCPU.
             */
            if ( pick_called &&
                 (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
                 cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) &&
                 cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
                break;

            /* Select a new CPU. */
            new_cpu = sched_pick_resource(unit_scheduler(unit),
                                          unit)->master_cpu;
            if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
                 cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
                break;
            pick_called = true;
        }
        else
        {
            /*
             * We do not hold the scheduler lock appropriate for this vCPU.
             * Thus we cannot select a new CPU on this iteration. Try again.
             */
            pick_called = false;
        }

        sched_spin_unlock_double(old_lock, new_lock, flags);
    }

    /*
     * NB. Check of v->running happens /after/ setting migration flag
     * because they both happen in (different) spinlock regions, and those
     * regions are strictly serialised.
     */
    if ( unit->is_running )
    {
        sched_spin_unlock_double(old_lock, new_lock, flags);
        return;
    }

    for_each_sched_unit_vcpu ( unit, v )
    {
        if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
        {
            sched_spin_unlock_double(old_lock, new_lock, flags);
            return;
        }
    }

    sched_unit_move_locked(unit, new_cpu);

    sched_spin_unlock_double(old_lock, new_lock, flags);

    if ( old_cpu != new_cpu )
        sched_move_irqs(unit);

    /* Wake on new CPU. */
    for_each_sched_unit_vcpu ( unit, v )
        vcpu_wake(v);
}

static bool sched_check_affinity_broken(const struct sched_unit *unit)
{
    const struct vcpu *v;

    for_each_sched_unit_vcpu ( unit, v )
        if ( v->affinity_broken )
            return true;

    return false;
}

/*
 * This function is used by cpu_hotplug code via cpu notifier chain
 * and from cpupools to switch schedulers on a cpu.
 * Caller must get domlist_read_lock.
 */
int cpu_disable_scheduler(unsigned int cpu)
{
    struct domain *d;
    const struct cpupool *c;
    int ret = 0;

    rcu_read_lock(&sched_res_rculock);

    c = get_sched_res(cpu)->cpupool;
    if ( c == NULL )
        goto out;

    for_each_domain_in_cpupool ( d, c )
    {
        struct sched_unit *unit;

        for_each_sched_unit ( d, unit )
        {
            unsigned long flags;
            spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);

            if ( !cpumask_intersects(unit->cpu_hard_affinity, c->cpu_valid) &&
                 cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
            {
                if ( sched_check_affinity_broken(unit) )
                {
                    /* The unit is temporarily pinned, can't move it. */
                    unit_schedule_unlock_irqrestore(lock, flags, unit);
                    ret = -EADDRINUSE;
                    break;
                }

                printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
                       unit->vcpu_list);

                sched_set_affinity(unit, &cpumask_all, NULL);
            }

            if ( unit->res != get_sched_res(cpu) )
            {
                /* The unit is not on this cpu, so we can move on. */
                unit_schedule_unlock_irqrestore(lock, flags, unit);
                continue;
            }

            /* If it is on this cpu, we must send it away.
             * We are doing some cpupool manipulations:
             *  * we want to call the scheduler, and let it re-evaluate
             *    the placement of the vcpu, taking into account the new
             *    cpupool configuration;
             *  * the scheduler will always find a suitable solution, or
             *    things would have failed before getting in here.
             */
            sched_unit_migrate_start(unit);
            unit_schedule_unlock_irqrestore(lock, flags, unit);
            sched_unit_migrate_finish(unit);

            /*
             * The only caveat, in this case, is that a vcpu active in the
             * hypervisor may not be migratable. In that case, the caller
             * should try again after releasing and reacquiring all locks.
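             * (This is what the -EAGAIN below is about: it tells the caller
             * that at least one unit could not be moved away yet.)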
             */
            if ( unit->res == get_sched_res(cpu) )
                ret = -EAGAIN;
        }
    }

 out:
    rcu_read_unlock(&sched_res_rculock);

    return ret;
}

static int cpu_disable_scheduler_check(unsigned int cpu)
{
    struct domain *d;
    const struct vcpu *v;
    const struct cpupool *c;

    c = get_sched_res(cpu)->cpupool;
    if ( c == NULL )
        return 0;

    for_each_domain_in_cpupool ( d, c )
        for_each_vcpu ( d, v )
            if ( v->affinity_broken )
                return -EADDRINUSE;

    return 0;
}

/*
 * Called after a cpu has come up again in a suspend/resume cycle.
 * Migrate all timers for this cpu (they have been migrated to cpu 0 when the
 * cpu was going down).
 * Note that only timers related to a physical cpu are migrated, not the ones
 * related to a vcpu or domain.
 */
void sched_migrate_timers(unsigned int cpu)
{
    struct sched_resource *sr;

    rcu_read_lock(&sched_res_rculock);

    sr = get_sched_res(cpu);

    /*
     * Note that on a system with parked cpus (e.g. smt=0 on Intel cpus) this
     * will be called for the parked cpus, too, so the case for no scheduling
     * resource being available must be considered.
     */
    if ( sr && sr->master_cpu == cpu )
    {
        migrate_timer(&sr->s_timer, cpu);
        sched_move_timers(sr->scheduler, sr);
    }

    rcu_read_unlock(&sched_res_rculock);
}

/*
 * In general, this must be called with the scheduler lock held, because the
 * adjust_affinity hook may want to modify the vCPU state. However, when the
 * vCPU is being initialized (either for dom0 or domU) there is no risk of
 * races, and it's fine to not take the lock (we're talking about
 * sched_setup_dom0_vcpus() and sched_init_vcpu()).
 */
static void sched_set_affinity(
    struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft)
{
    rcu_read_lock(&sched_res_rculock);
    sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft);
    rcu_read_unlock(&sched_res_rculock);

    if ( hard )
        cpumask_copy(unit->cpu_hard_affinity, hard);
    if ( soft )
        cpumask_copy(unit->cpu_soft_affinity, soft);

    unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity,
                                               unit->cpu_soft_affinity) &&
                               cpumask_intersects(unit->cpu_soft_affinity,
                                                  unit->cpu_hard_affinity);
}

static int vcpu_set_affinity(
    struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which)
{
    struct sched_unit *unit = v->sched_unit;
    spinlock_t *lock;
    int ret = 0;

    rcu_read_lock(&sched_res_rculock);

    lock = unit_schedule_lock_irq(unit);

    if ( v->affinity_broken )
        ret = -EBUSY;
    else
    {
        /*
         * Tell the scheduler we changed something about affinity,
         * and ask to re-evaluate vcpu placement.
         */
        if ( which == unit->cpu_hard_affinity )
        {
            sched_set_affinity(unit, affinity, NULL);
        }
        else
        {
            ASSERT(which == unit->cpu_soft_affinity);
            sched_set_affinity(unit, NULL, affinity);
        }
        sched_unit_migrate_start(unit);
    }

    unit_schedule_unlock_irq(lock, unit);

    domain_update_node_affinity(v->domain);

    sched_unit_migrate_finish(unit);

    rcu_read_unlock(&sched_res_rculock);

    return ret;
}

int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
{
    cpumask_t *online;

    online = VCPU2ONLINE(v);
    if ( !cpumask_intersects(online, affinity) )
        return -EINVAL;

    return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity);
}

static int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
{
    return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity);
}

/* Block the currently-executing domain until a pertinent event occurs. */
void vcpu_block(void)
{
    struct vcpu *v = current;

    set_bit(_VPF_blocked, &v->pause_flags);

    smp_mb__after_atomic();

    arch_vcpu_block(v);

    /*
     * Check for events /after/ blocking: avoids wakeup waiting race.
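     * The ordering matters: _VPF_blocked must be set, and thanks to the
     * barrier above be visible, before we look at pending events. In the
     * reverse order the following interleaving would lose a wakeup:
     *
     *     vcpu_block()                    vcpu_unblock()
     *     check events (none pending)
     *                                     deliver event
     *                                     test_and_clear(_VPF_blocked) fails
     *     set_bit(_VPF_blocked)
     *     ... blocks with an event pending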
*/ if ( local_events_need_delivery() ) { clear_bit(_VPF_blocked, &v->pause_flags); } else { TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id); raise_softirq(SCHEDULE_SOFTIRQ); } } static void vcpu_block_enable_events(void) { local_event_delivery_enable(); vcpu_block(); } static long do_poll(struct sched_poll *sched_poll) { struct vcpu *v = current; struct domain *d = v->domain; evtchn_port_t port = 0; long rc; unsigned int i; /* Fairly arbitrary limit. */ if ( sched_poll->nr_ports > 128 ) return -EINVAL; if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) ) return -EFAULT; set_bit(_VPF_blocked, &v->pause_flags); v->poll_evtchn = -1; set_bit(v->vcpu_id, d->poll_mask); arch_vcpu_block(v); #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ /* Check for events /after/ setting flags: avoids wakeup waiting race. */ smp_mb(); /* * Someone may have seen we are blocked but not that we are polling, or * vice versa. We are certainly being woken, so clean up and bail. Beyond * this point others can be guaranteed to clean up for us if they wake us. */ rc = 0; if ( (v->poll_evtchn == 0) || !test_bit(_VPF_blocked, &v->pause_flags) || !test_bit(v->vcpu_id, d->poll_mask) ) goto out; #endif rc = 0; if ( local_events_need_delivery() ) goto out; for ( i = 0; i < sched_poll->nr_ports; i++ ) { rc = -EFAULT; if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) ) goto out; rc = evtchn_port_poll(d, port); if ( rc ) { if ( rc > 0 ) rc = 0; goto out; } } if ( sched_poll->nr_ports == 1 ) v->poll_evtchn = port; if ( sched_poll->timeout != 0 ) set_timer(&v->poll_timer, sched_poll->timeout); TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id); raise_softirq(SCHEDULE_SOFTIRQ); return 0; out: v->poll_evtchn = 0; clear_bit(v->vcpu_id, d->poll_mask); clear_bit(_VPF_blocked, &v->pause_flags); return rc; } /* Voluntarily yield the processor for this allocation. */ long vcpu_yield(void) { struct vcpu * v=current; spinlock_t *lock; rcu_read_lock(&sched_res_rculock); lock = unit_schedule_lock_irq(v->sched_unit); sched_yield(vcpu_scheduler(v), v->sched_unit); unit_schedule_unlock_irq(lock, v->sched_unit); rcu_read_unlock(&sched_res_rculock); SCHED_STAT_CRANK(vcpu_yield); TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); raise_softirq(SCHEDULE_SOFTIRQ); return 0; } static void cf_check domain_watchdog_timeout(void *data) { struct domain *d = data; if ( d->is_shutting_down || d->is_dying ) return; printk("Watchdog timer fired for domain %u\n", d->domain_id); domain_shutdown(d, SHUTDOWN_watchdog); } static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout) { if ( id > NR_DOMAIN_WATCHDOG_TIMERS ) return -EINVAL; spin_lock(&d->watchdog_lock); if ( id == 0 ) { for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ ) { if ( test_and_set_bit(id, &d->watchdog_inuse_map) ) continue; set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); break; } spin_unlock(&d->watchdog_lock); return id == NR_DOMAIN_WATCHDOG_TIMERS ? 
                                          -ENOSPC : id + 1;
    }

    id -= 1;
    if ( !test_bit(id, &d->watchdog_inuse_map) )
    {
        spin_unlock(&d->watchdog_lock);
        return -EINVAL;
    }

    if ( timeout == 0 )
    {
        stop_timer(&d->watchdog_timer[id]);
        clear_bit(id, &d->watchdog_inuse_map);
    }
    else
    {
        set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
    }

    spin_unlock(&d->watchdog_lock);
    return 0;
}

void watchdog_domain_init(struct domain *d)
{
    unsigned int i;

    spin_lock_init(&d->watchdog_lock);

    d->watchdog_inuse_map = 0;

    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
        init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
}

void watchdog_domain_destroy(struct domain *d)
{
    unsigned int i;

    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
        kill_timer(&d->watchdog_timer[i]);
}

/*
 * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if
 * cpu is NR_CPUS).
 * Temporary pinning can be done due to two reasons, which may be nested:
 * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case
 *   of a conflict (e.g. in case cpupool doesn't include requested CPU, or
 *   another conflicting temporary pinning is already in effect).
 * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to
 *   the CPU it is just running on. Can't fail if used properly.
 */
int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
{
    struct sched_unit *unit = v->sched_unit;
    spinlock_t *lock;
    int ret = -EINVAL;
    bool migrate;

    rcu_read_lock(&sched_res_rculock);

    lock = unit_schedule_lock_irq(unit);

    if ( cpu == NR_CPUS )
    {
        if ( v->affinity_broken & reason )
        {
            ret = 0;
            v->affinity_broken &= ~reason;
        }
        if ( !ret && !sched_check_affinity_broken(unit) )
            sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
    }
    else if ( cpu < nr_cpu_ids )
    {
        if ( (v->affinity_broken & reason) ||
             (sched_check_affinity_broken(unit) && v->processor != cpu) )
            ret = -EBUSY;
        else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
        {
            if ( !sched_check_affinity_broken(unit) )
            {
                cpumask_copy(unit->cpu_hard_affinity_saved,
                             unit->cpu_hard_affinity);
                sched_set_affinity(unit, cpumask_of(cpu), NULL);
            }
            v->affinity_broken |= reason;
            ret = 0;
        }
    }

    migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity);
    if ( migrate )
        sched_unit_migrate_start(unit);

    unit_schedule_unlock_irq(lock, unit);

    if ( migrate )
        sched_unit_migrate_finish(unit);

    rcu_read_unlock(&sched_res_rculock);

    return ret;
}

static inline
int vcpuaffinity_params_invalid(const struct xen_domctl_vcpuaffinity *vcpuaff)
{
    return vcpuaff->flags == 0 ||
           ((vcpuaff->flags & XEN_VCPUAFFINITY_HARD) &&
            guest_handle_is_null(vcpuaff->cpumap_hard.bitmap)) ||
           ((vcpuaff->flags & XEN_VCPUAFFINITY_SOFT) &&
            guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
}

int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
                         struct xen_domctl_vcpuaffinity *vcpuaff)
{
    struct vcpu *v;
    const struct sched_unit *unit;
    int ret = 0;

    if ( vcpuaff->vcpu >= d->max_vcpus )
        return -EINVAL;

    if ( (v = d->vcpu[vcpuaff->vcpu]) == NULL )
        return -ESRCH;

    if ( vcpuaffinity_params_invalid(vcpuaff) )
        return -EINVAL;

    unit = v->sched_unit;

    if ( cmd == XEN_DOMCTL_setvcpuaffinity )
    {
        cpumask_var_t new_affinity, old_affinity;
        cpumask_t *online = cpupool_domain_master_cpumask(v->domain);

        /*
         * We want to be able to restore hard affinity if we are trying to
         * set both and changing soft affinity (which happens later, when
         * hard affinity has been successfully changed already) fails.
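         * The resulting sequence is, in effect:
         *
         *     cpumask_copy(old_affinity, unit->cpu_hard_affinity);
         *     vcpu_set_hard_affinity(v, new_affinity);
         *     if ( vcpu_set_soft_affinity(v, new_affinity) fails )
         *         vcpu_set_hard_affinity(v, old_affinity);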
         */
        if ( !alloc_cpumask_var(&old_affinity) )
            return -ENOMEM;

        cpumask_copy(old_affinity, unit->cpu_hard_affinity);

        if ( !alloc_cpumask_var(&new_affinity) )
        {
            free_cpumask_var(old_affinity);
            return -ENOMEM;
        }

        /* Undo a stuck SCHED_pin_override? */
        if ( vcpuaff->flags & XEN_VCPUAFFINITY_FORCE )
            vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_OVERRIDE);

        ret = 0;

        /*
         * We both set a new affinity and report back to the caller what
         * the scheduler will be effectively using.
         */
        if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
        {
            ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
                                          &vcpuaff->cpumap_hard, nr_cpu_ids);
            if ( !ret )
                ret = vcpu_set_hard_affinity(v, new_affinity);
            if ( ret )
                goto setvcpuaffinity_out;

            /*
             * For hard affinity, what we return is the intersection of
             * cpupool's online mask and the new hard affinity.
             */
            cpumask_and(new_affinity, online, unit->cpu_hard_affinity);
            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
                                           new_affinity);
        }

        if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
        {
            ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
                                          &vcpuaff->cpumap_soft, nr_cpu_ids);
            if ( !ret )
                ret = vcpu_set_soft_affinity(v, new_affinity);
            if ( ret )
            {
                /*
                 * Since we're returning error, the caller expects nothing
                 * happened, so we rollback the changes to hard affinity
                 * (if any).
                 */
                if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
                    vcpu_set_hard_affinity(v, old_affinity);
                goto setvcpuaffinity_out;
            }

            /*
             * For soft affinity, we return the intersection between the
             * new soft affinity, the cpupool's online map and the (new)
             * hard affinity.
             */
            cpumask_and(new_affinity, new_affinity, online);
            cpumask_and(new_affinity, new_affinity, unit->cpu_hard_affinity);
            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
                                           new_affinity);
        }

 setvcpuaffinity_out:
        free_cpumask_var(new_affinity);
        free_cpumask_var(old_affinity);
    }
    else
    {
        if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
                                           unit->cpu_hard_affinity);
        if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
                                           unit->cpu_soft_affinity);
    }

    return ret;
}

bool alloc_affinity_masks(struct affinity_masks *affinity)
{
    if ( !alloc_cpumask_var(&affinity->hard) )
        return false;
    if ( !alloc_cpumask_var(&affinity->soft) )
    {
        free_cpumask_var(affinity->hard);
        return false;
    }

    return true;
}

void free_affinity_masks(struct affinity_masks *affinity)
{
    free_cpumask_var(affinity->soft);
    free_cpumask_var(affinity->hard);
}

void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity)
{
    struct affinity_masks masks;
    cpumask_t *dom_affinity;
    const cpumask_t *online;
    struct sched_unit *unit;
    unsigned int cpu;

    /* Do we have vcpus already? If not, no need to update node-affinity. */
    if ( !d->vcpu || !d->vcpu[0] )
        return;

    if ( !affinity )
    {
        affinity = &masks;
        if ( !alloc_affinity_masks(affinity) )
            return;
    }

    cpumask_clear(affinity->hard);
    cpumask_clear(affinity->soft);

    online = cpupool_domain_master_cpumask(d);

    spin_lock(&d->node_affinity_lock);

    /*
     * If d->auto_node_affinity is true, let's compute the domain's
     * node-affinity and update d->node_affinity accordingly. If false,
     * just leave d->node_affinity alone.
     */
    if ( d->auto_node_affinity )
    {
        /*
         * We want the narrowest possible set of pcpus (to get the narrowest
         * possible set of nodes). What we need is the cpumask of where the
         * domain can run (the union of the hard affinity of all its vcpus),
         * and the full mask of where it would prefer to run (the union of
         * the soft affinity of all its various vcpus).
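         * That is, hard = union of unit->cpu_hard_affinity and soft = union
         * of unit->cpu_soft_affinity over all of the domain's units.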
Let's build them. */ for_each_sched_unit ( d, unit ) { cpumask_or(affinity->hard, affinity->hard, unit->cpu_hard_affinity); cpumask_or(affinity->soft, affinity->soft, unit->cpu_soft_affinity); } /* Filter out non-online cpus */ cpumask_and(affinity->hard, affinity->hard, online); ASSERT(!cpumask_empty(affinity->hard)); /* And compute the intersection between hard, online and soft */ cpumask_and(affinity->soft, affinity->soft, affinity->hard); /* * If not empty, the intersection of hard, soft and online is the * narrowest set we want. If empty, we fall back to hard&online. */ dom_affinity = cpumask_empty(affinity->soft) ? affinity->hard : affinity->soft; nodes_clear(d->node_affinity); for_each_cpu ( cpu, dom_affinity ) node_set(cpu_to_node(cpu), d->node_affinity); } spin_unlock(&d->node_affinity_lock); if ( affinity == &masks ) free_affinity_masks(affinity); } typedef long ret_t; #endif /* !COMPAT */ ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { ret_t ret = 0; switch ( cmd ) { case SCHEDOP_yield: { ret = vcpu_yield(); break; } case SCHEDOP_block: { vcpu_block_enable_events(); break; } case SCHEDOP_shutdown: { struct sched_shutdown sched_shutdown; ret = -EFAULT; if ( copy_from_guest(&sched_shutdown, arg, 1) ) break; TRACE_3D(TRC_SCHED_SHUTDOWN, current->domain->domain_id, current->vcpu_id, sched_shutdown.reason); ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason); break; } case SCHEDOP_shutdown_code: { struct sched_shutdown sched_shutdown; struct domain *d = current->domain; ret = -EFAULT; if ( copy_from_guest(&sched_shutdown, arg, 1) ) break; TRACE_3D(TRC_SCHED_SHUTDOWN_CODE, d->domain_id, current->vcpu_id, sched_shutdown.reason); spin_lock(&d->shutdown_lock); if ( d->shutdown_code == SHUTDOWN_CODE_INVALID ) d->shutdown_code = (u8)sched_shutdown.reason; spin_unlock(&d->shutdown_lock); ret = 0; break; } case SCHEDOP_poll: { struct sched_poll sched_poll; ret = -EFAULT; if ( copy_from_guest(&sched_poll, arg, 1) ) break; ret = do_poll(&sched_poll); break; } case SCHEDOP_remote_shutdown: { struct domain *d; struct sched_remote_shutdown sched_remote_shutdown; ret = -EFAULT; if ( copy_from_guest(&sched_remote_shutdown, arg, 1) ) break; ret = -ESRCH; d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id); if ( d == NULL ) break; ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d); if ( likely(!ret) ) domain_shutdown(d, sched_remote_shutdown.reason); rcu_unlock_domain(d); break; } case SCHEDOP_watchdog: { struct sched_watchdog sched_watchdog; ret = -EFAULT; if ( copy_from_guest(&sched_watchdog, arg, 1) ) break; ret = domain_watchdog( current->domain, sched_watchdog.id, sched_watchdog.timeout); break; } case SCHEDOP_pin_override: { struct sched_pin_override sched_pin_override; unsigned int cpu; ret = -EPERM; if ( !is_hardware_domain(current->domain) ) break; ret = -EFAULT; if ( copy_from_guest(&sched_pin_override, arg, 1) ) break; ret = -EINVAL; if ( sched_pin_override.pcpu >= NR_CPUS ) break; cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu; ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE); break; } default: ret = -ENOSYS; } return ret; } #ifndef COMPAT /* Per-vcpu oneshot-timer hypercall. */ long do_set_timer_op(s_time_t timeout) { struct vcpu *v = current; s_time_t offset = timeout - NOW(); if ( timeout == 0 ) { stop_timer(&v->singleshot_timer); } else if ( unlikely(timeout < 0) || /* overflow into 64th bit? 
*/ unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) ) { /* * Linux workaround: occasionally we will see timeouts a long way in * the future due to wrapping in Linux's jiffy time handling. We check * for timeouts wrapped negative, and for positive timeouts more than * about 13 days in the future (2^50ns). The correct fix is to trigger * an interrupt immediately (since Linux in fact has pending work to * do in this situation). However, older guests also set a long timeout * when they have *no* pending timers at all: setting an immediate * timeout in this case can burn a lot of CPU. We therefore go for a * reasonable middleground of triggering a timer event in 100ms. */ gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n", timeout); set_timer(&v->singleshot_timer, NOW() + MILLISECS(100)); } else { migrate_timer(&v->singleshot_timer, smp_processor_id()); set_timer(&v->singleshot_timer, timeout); } return 0; } /* sched_id - fetch ID of current scheduler */ int sched_id(void) { return ops.sched_id; } /* Adjust scheduling parameter for a given domain. */ long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) { long ret; ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd); if ( ret ) return ret; if ( op->sched_id != dom_scheduler(d)->sched_id ) return -EINVAL; switch ( op->cmd ) { case XEN_DOMCTL_SCHEDOP_putinfo: case XEN_DOMCTL_SCHEDOP_getinfo: case XEN_DOMCTL_SCHEDOP_putvcpuinfo: case XEN_DOMCTL_SCHEDOP_getvcpuinfo: break; default: return -EINVAL; } /* NB: the pluggable scheduler code needs to take care * of locking by itself. */ rcu_read_lock(&sched_res_rculock); if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 ) TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); rcu_read_unlock(&sched_res_rculock); return ret; } long sched_adjust_global(struct xen_sysctl_scheduler_op *op) { struct cpupool *pool; int rc; rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd); if ( rc ) return rc; if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) && (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) ) return -EINVAL; pool = cpupool_get_by_id(op->cpupool_id); if ( pool == NULL ) return -ESRCH; rcu_read_lock(&sched_res_rculock); rc = ((op->sched_id == pool->sched->sched_id) ? sched_adjust_cpupool(pool->sched, op) : -EINVAL); rcu_read_unlock(&sched_res_rculock); cpupool_put(pool); return rc; } static void vcpu_periodic_timer_work_locked(struct vcpu *v) { s_time_t now; s_time_t periodic_next_event; now = NOW(); periodic_next_event = v->periodic_last_event + v->periodic_period; if ( now >= periodic_next_event ) { send_timer_event(v); v->periodic_last_event = now; periodic_next_event = now + v->periodic_period; } migrate_timer(&v->periodic_timer, v->processor); set_timer(&v->periodic_timer, periodic_next_event); } static void vcpu_periodic_timer_work(struct vcpu *v) { if ( v->periodic_period == 0 ) return; spin_lock(&v->periodic_timer_lock); if ( v->periodic_period ) vcpu_periodic_timer_work_locked(v); spin_unlock(&v->periodic_timer_lock); } /* * Set the periodic timer of a vcpu. 
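 * A value of zero switches the periodic timer off.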
*/ void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value) { spin_lock(&v->periodic_timer_lock); stop_timer(&v->periodic_timer); v->periodic_period = value; if ( value ) vcpu_periodic_timer_work_locked(v); spin_unlock(&v->periodic_timer_lock); } static void sched_switch_units(struct sched_resource *sr, struct sched_unit *next, struct sched_unit *prev, s_time_t now) { unsigned int cpu; ASSERT(unit_running(prev)); if ( prev != next ) { sr->curr = next; sr->prev = prev; TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, prev->unit_id, now - prev->state_entry_time); TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, next->unit_id, (next->vcpu_list->runstate.state == RUNSTATE_runnable) ? (now - next->state_entry_time) : 0, prev->next_time); TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id, next->domain->domain_id, next->unit_id); ASSERT(!unit_running(next)); /* * NB. Don't add any trace records from here until the actual context * switch, else lost_records resume will not work properly. */ ASSERT(!next->is_running); next->is_running = true; next->state_entry_time = now; if ( is_idle_unit(prev) ) { prev->runstate_cnt[RUNSTATE_running] = 0; prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity; } if ( is_idle_unit(next) ) { next->runstate_cnt[RUNSTATE_running] = sr->granularity; next->runstate_cnt[RUNSTATE_runnable] = 0; } } for_each_cpu ( cpu, sr->cpus ) { struct vcpu *vprev = get_cpu_current(cpu); struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu); if ( vprev != vnext || vprev->runstate.state != vnext->new_state ) { vcpu_runstate_change(vprev, ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked : (vcpu_runnable(vprev) ? RUNSTATE_runnable : RUNSTATE_offline)), now); vcpu_runstate_change(vnext, vnext->new_state, now); } vnext->is_running = true; if ( is_idle_vcpu(vnext) ) vnext->sched_unit = next; } } static bool sched_tasklet_check_cpu(unsigned int cpu) { unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu); switch ( *tasklet_work ) { case TASKLET_enqueued: set_bit(_TASKLET_scheduled, tasklet_work); /* fallthrough */ case TASKLET_enqueued|TASKLET_scheduled: return true; break; case TASKLET_scheduled: clear_bit(_TASKLET_scheduled, tasklet_work); /* fallthrough */ case 0: /* return false; */ break; default: BUG(); } return false; } static bool sched_tasklet_check(unsigned int cpu) { bool tasklet_work_scheduled = false; const cpumask_t *mask = get_sched_res(cpu)->cpus; unsigned int cpu_iter; for_each_cpu ( cpu_iter, mask ) if ( sched_tasklet_check_cpu(cpu_iter) ) tasklet_work_scheduled = true; return tasklet_work_scheduled; } static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now, unsigned int cpu) { struct sched_resource *sr = get_sched_res(cpu); struct scheduler *sched = sr->scheduler; struct sched_unit *next; /* get policy-specific decision on scheduling... */ sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu)); next = prev->next_task; if ( prev->next_time >= 0 ) /* -ve means no limit */ set_timer(&sr->s_timer, now + prev->next_time); sched_switch_units(sr, next, prev, now); return next; } static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext) { /* Clear running flag /after/ writing context to memory. 
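     * The smp_wmb() below pairs with the readers of v->is_running (e.g. the
     * busy wait in vcpu_sleep_sync()): they must not observe the flag as
     * clear before the context has actually been saved.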
     */
    smp_wmb();

    if ( vprev != vnext )
        vprev->is_running = false;
}

static void unit_context_saved(struct sched_resource *sr)
{
    struct sched_unit *unit = sr->prev;

    if ( !unit )
        return;

    unit->is_running = false;
    unit->state_entry_time = NOW();
    sr->prev = NULL;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    sched_context_saved(unit_scheduler(unit), unit);

    /* Idle never migrates and idle vcpus might belong to other units. */
    if ( !is_idle_unit(unit) )
        sched_unit_migrate_finish(unit);
}

/*
 * Rendezvous on end of context switch.
 * As no lock is protecting this rendezvous function we need to use atomic
 * access functions on the counter.
 * The counter will be 0 in case no rendezvous is needed. For the rendezvous
 * case it is initialised to the number of cpus to rendezvous plus 1. Each
 * member entering decrements the counter. The last one will decrement it to
 * 1 and perform the final needed action in that case (call of
 * unit_context_saved()), and then set the counter to zero. The other members
 * will wait until the counter becomes zero before they proceed.
 */
void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
{
    struct sched_unit *next = vnext->sched_unit;
    struct sched_resource *sr;

    rcu_read_lock(&sched_res_rculock);

    sr = get_sched_res(smp_processor_id());

    if ( atomic_read(&next->rendezvous_out_cnt) )
    {
        int cnt = atomic_dec_return(&next->rendezvous_out_cnt);

        vcpu_context_saved(vprev, vnext);

        /* Call unit_context_saved() before releasing other waiters. */
        if ( cnt == 1 )
        {
            unit_context_saved(sr);
            atomic_set(&next->rendezvous_out_cnt, 0);
        }
        else
            while ( atomic_read(&next->rendezvous_out_cnt) )
                cpu_relax();
    }
    else
    {
        vcpu_context_saved(vprev, vnext);

        if ( sr->granularity == 1 )
            unit_context_saved(sr);
    }

    if ( is_idle_vcpu(vprev) && vprev != vnext )
        vprev->sched_unit = sr->sched_unit_idle;

    rcu_read_unlock(&sched_res_rculock);
}

/*
 * Switch to a new context or keep the current one running.
 * On x86 it won't return, so it needs to drop the still held
 * sched_res_rculock.
 */
static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
                                 bool reset_idle_unit, s_time_t now)
{
    if ( unlikely(vprev == vnext) )
    {
        TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
                 vnext->domain->domain_id, vnext->sched_unit->unit_id,
                 now - vprev->runstate.state_entry_time,
                 vprev->sched_unit->next_time);
        sched_context_switched(vprev, vnext);

        /*
         * We are switching from a non-idle to an idle unit.
         * A vcpu of the idle unit might have been running before due to
         * the guest vcpu being blocked. We must adjust the unit of the idle
         * vcpu which might have been set to the guest's one.
         */
        if ( reset_idle_unit )
            vnext->sched_unit =
                get_sched_res(smp_processor_id())->sched_unit_idle;

        rcu_read_unlock(&sched_res_rculock);

        trace_continue_running(vnext);
        return continue_running(vprev);
    }

    SCHED_STAT_CRANK(sched_ctx);

    stop_timer(&vprev->periodic_timer);

    if ( vnext->sched_unit->migrated )
        vcpu_move_irqs(vnext);

    vcpu_periodic_timer_work(vnext);

    rcu_read_unlock(&sched_res_rculock);

    context_switch(vprev, vnext);
}

/*
 * Force a context switch of a single vcpu of an unit.
 * Might be called either if a vcpu of an already running unit is woken up
 * or if a vcpu of a running unit is put to sleep with other vcpus of the
 * same unit still running.
 * Returns either NULL if v is already in the correct state or the vcpu to
 * run next.
/*
 * Force a context switch of a single vcpu of a unit.
 * Might be called either if a vcpu of an already running unit is woken up
 * or if a vcpu of a running unit is put asleep with other vcpus of the same
 * unit still running.
 * Returns either NULL if v is already in the correct state or the vcpu to
 * run next.
 */
static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
                                               struct vcpu *v,
                                               unsigned int cpu, s_time_t now)
{
    v->force_context_switch = false;

    if ( vcpu_runnable(v) == v->is_running )
        return NULL;

    if ( vcpu_runnable(v) )
    {
        if ( is_idle_vcpu(vprev) )
        {
            vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
            vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
        }
        vcpu_runstate_change(v, RUNSTATE_running, now);
    }
    else
    {
        /* Make sure not to switch the last vcpu of a unit away. */
        if ( unit_running(v->sched_unit) == 1 )
            return NULL;

        v->new_state = vcpu_runstate_blocked(v);
        vcpu_runstate_change(v, v->new_state, now);

        v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
        if ( v != vprev )
        {
            if ( is_idle_vcpu(vprev) )
            {
                vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
                vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
            }
            else
            {
                v->sched_unit = vprev->sched_unit;
                vcpu_runstate_change(v, RUNSTATE_running, now);
            }
        }
    }

    /* This vcpu will be switched to. */
    v->is_running = true;

    /* Make sure not to lose another slave call. */
    raise_softirq(SCHED_SLAVE_SOFTIRQ);

    return v;
}
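/*
 * Added commentary: the v->force_context_switch flag consumed above is the
 * mechanism by which a wakeup or sleep of a single vcpu inside an already
 * running multi-vcpu unit is propagated to the pcpu owning that vcpu (via
 * the SCHED_SLAVE softirq).  The early "vcpu_runnable(v) == v->is_running"
 * exit covers requests that became stale or were already satisfied before
 * the softirq was handled (e.g. the vcpu was woken and put to sleep again
 * in between).
 */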
/*
 * Rendezvous before taking a scheduling decision.
 * Called with schedule lock held, so all accesses to the rendezvous counter
 * can be normal ones (no atomic accesses needed).
 * The counter is initialized to the number of cpus to rendezvous.
 * Each cpu entering will decrement the counter. When the counter becomes
 * zero, do_schedule() is called and the rendezvous counter for leaving
 * context_switch() is set. All other members will wait until the counter
 * becomes zero, dropping the schedule lock in between.
 * Either returns the new unit to run, or NULL if no context switch is
 * required or (on Arm) has already been performed. If NULL is returned
 * sched_res_rculock has been dropped.
 */
static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
                                                   spinlock_t **lock, int cpu,
                                                   s_time_t now)
{
    struct sched_unit *next;
    struct vcpu *v;
    struct sched_resource *sr = get_sched_res(cpu);
    unsigned int gran = sr->granularity;

    if ( !--prev->rendezvous_in_cnt )
    {
        next = do_schedule(prev, now, cpu);
        atomic_set(&next->rendezvous_out_cnt, gran + 1);
        return next;
    }

    v = unit2vcpu_cpu(prev, cpu);
    while ( prev->rendezvous_in_cnt )
    {
        if ( v && v->force_context_switch )
        {
            struct vcpu *vprev = current;

            v = sched_force_context_switch(vprev, v, cpu, now);

            if ( v )
            {
                /* We'll come back another time, so adjust rendezvous_in_cnt. */
                prev->rendezvous_in_cnt++;
                atomic_set(&prev->rendezvous_out_cnt, 0);

                pcpu_schedule_unlock_irq(*lock, cpu);

                sched_context_switch(vprev, v, false, now);

                return NULL;    /* ARM only. */
            }

            v = unit2vcpu_cpu(prev, cpu);
        }

        /*
         * Check for any work to be done which might need cpu synchronization.
         * This is either pending RCU work, or tasklet work when coming from
         * idle. It is mandatory that RCU softirqs are of higher priority
         * than scheduling ones as otherwise a deadlock might occur.
         * In order to avoid deadlocks we can't process that work here, but
         * have to schedule the previous vcpu again, which will lead to the
         * desired processing being done.
         * Undo the rendezvous_in_cnt decrement and schedule another call of
         * sched_slave().
         */
        BUILD_BUG_ON(RCU_SOFTIRQ > SCHED_SLAVE_SOFTIRQ ||
                     RCU_SOFTIRQ > SCHEDULE_SOFTIRQ);
        if ( rcu_pending(cpu) ||
             (is_idle_unit(prev) && sched_tasklet_check_cpu(cpu)) )
        {
            struct vcpu *vprev = current;

            prev->rendezvous_in_cnt++;
            atomic_set(&prev->rendezvous_out_cnt, 0);

            pcpu_schedule_unlock_irq(*lock, cpu);

            raise_softirq(SCHED_SLAVE_SOFTIRQ);
            sched_context_switch(vprev, vprev, false, now);

            return NULL;        /* ARM only. */
        }

        pcpu_schedule_unlock_irq(*lock, cpu);

        cpu_relax();

        *lock = pcpu_schedule_lock_irq(cpu);

        /*
         * Check whether the scheduling resource was switched. This happens
         * when we are moved away from our cpupool and the cpus are subject
         * to the idle scheduler now.
         *
         * This is also a bail out case when scheduler_disable() has been
         * called.
         */
        if ( unlikely(sr != get_sched_res(cpu) || !scheduler_active) )
        {
            ASSERT(is_idle_unit(prev));
            atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
            prev->rendezvous_in_cnt = 0;
            pcpu_schedule_unlock_irq(*lock, cpu);
            rcu_read_unlock(&sched_res_rculock);
            return NULL;
        }
    }

    return prev->next_task;
}

static void cf_check sched_slave(void)
{
    struct vcpu *v, *vprev = current;
    struct sched_unit *prev = vprev->sched_unit, *next;
    s_time_t now;
    spinlock_t *lock;
    bool do_softirq = false;
    unsigned int cpu = smp_processor_id();

    ASSERT_NOT_IN_ATOMIC();

    rcu_read_lock(&sched_res_rculock);

    lock = pcpu_schedule_lock_irq(cpu);

    now = NOW();

    v = unit2vcpu_cpu(prev, cpu);
    if ( v && v->force_context_switch )
    {
        v = sched_force_context_switch(vprev, v, cpu, now);

        if ( v )
        {
            pcpu_schedule_unlock_irq(lock, cpu);

            sched_context_switch(vprev, v, false, now);

            return;
        }

        do_softirq = true;
    }

    if ( !prev->rendezvous_in_cnt )
    {
        pcpu_schedule_unlock_irq(lock, cpu);

        rcu_read_unlock(&sched_res_rculock);

        /* Check for failed forced context switch. */
        if ( do_softirq )
            raise_softirq(SCHEDULE_SOFTIRQ);

        return;
    }

    stop_timer(&get_sched_res(cpu)->s_timer);

    next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
    if ( !next )
        return;

    pcpu_schedule_unlock_irq(lock, cpu);

    sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
                         is_idle_unit(next) && !is_idle_unit(prev), now);
}
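/*
 * Added commentary (illustrative walk-through for granularity 2): the cpu
 * receiving SCHEDULE_SOFTIRQ acts as master and runs schedule() below; it
 * sets rendezvous_in_cnt to the granularity and kicks its siblings with
 * SCHED_SLAVE_SOFTIRQ.  Each sibling enters sched_slave() above and meets
 * the master in sched_wait_rendezvous_in(); whichever cpu decrements the
 * counter to zero last is the one that actually calls do_schedule() for
 * the whole unit, while the others wait for its decision.
 */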
/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void cf_check schedule(void)
{
    struct vcpu *vnext, *vprev = current;
    struct sched_unit *prev = vprev->sched_unit, *next = NULL;
    s_time_t now;
    struct sched_resource *sr;
    spinlock_t *lock;
    int cpu = smp_processor_id();
    unsigned int gran;

    ASSERT_NOT_IN_ATOMIC();

    SCHED_STAT_CRANK(sched_run);

    rcu_read_lock(&sched_res_rculock);

    lock = pcpu_schedule_lock_irq(cpu);

    sr = get_sched_res(cpu);
    gran = sr->granularity;

    if ( prev->rendezvous_in_cnt )
    {
        /*
         * We have a race: sched_slave() should be called, so raise a softirq
         * in order to re-enter schedule() later and call sched_slave() now.
         */
        pcpu_schedule_unlock_irq(lock, cpu);

        rcu_read_unlock(&sched_res_rculock);

        raise_softirq(SCHEDULE_SOFTIRQ);
        return sched_slave();
    }

    stop_timer(&sr->s_timer);

    now = NOW();

    if ( gran > 1 )
    {
        cpumask_t *mask = cpumask_scratch_cpu(cpu);

        prev->rendezvous_in_cnt = gran;
        cpumask_andnot(mask, sr->cpus, cpumask_of(cpu));
        cpumask_raise_softirq(mask, SCHED_SLAVE_SOFTIRQ);
        next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
        if ( !next )
            return;
    }
    else
    {
        prev->rendezvous_in_cnt = 0;
        next = do_schedule(prev, now, cpu);
        atomic_set(&next->rendezvous_out_cnt, 0);
    }

    pcpu_schedule_unlock_irq(lock, cpu);

    vnext = sched_unit2vcpu_cpu(next, cpu);
    sched_context_switch(vprev, vnext,
                         !is_idle_unit(prev) && is_idle_unit(next), now);
}

/* The scheduler timer: force a run through the scheduler */
static void cf_check s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    SCHED_STAT_CRANK(sched_irq);
}

/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
static void cf_check vcpu_periodic_timer_fn(void *data)
{
    struct vcpu *v = data;

    vcpu_periodic_timer_work(v);
}

/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
static void cf_check vcpu_singleshot_timer_fn(void *data)
{
    struct vcpu *v = data;

    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void cf_check poll_timer_fn(void *data)
{
    struct vcpu *v = data;

    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
        vcpu_unblock(v);
}

static struct sched_resource *sched_alloc_res(void)
{
    struct sched_resource *sr;

    sr = xzalloc(struct sched_resource);
    if ( sr == NULL )
        return NULL;
    if ( !zalloc_cpumask_var(&sr->cpus) )
    {
        xfree(sr);
        return NULL;
    }
    return sr;
}

static int cpu_schedule_up(unsigned int cpu)
{
    struct sched_resource *sr;

    sr = sched_alloc_res();
    if ( sr == NULL )
        return -ENOMEM;

    sr->master_cpu = cpu;
    cpumask_copy(sr->cpus, cpumask_of(cpu));
    set_sched_res(cpu, sr);

    sr->scheduler = &sched_idle_ops;
    spin_lock_init(&sr->_lock);
    sr->schedule_lock = &sched_free_cpu_lock;
    init_timer(&sr->s_timer, s_timer_fn, NULL, cpu);
    atomic_set(&per_cpu(sched_urgent_count, cpu), 0);

    /* We start with cpu granularity. */
    sr->granularity = 1;

    cpumask_set_cpu(cpu, &sched_res_mask);

    /* Boot CPU is dealt with later in scheduler_init(). */
    if ( cpu == 0 )
        return 0;

    /*
     * Guard in particular against the compiler suspecting out-of-bounds
     * array accesses below when NR_CPUS=1.
     */
    BUG_ON(cpu >= NR_CPUS);

    if ( idle_vcpu[cpu] == NULL )
        vcpu_create(idle_vcpu[0]->domain, cpu);
    else
        idle_vcpu[cpu]->sched_unit->res = sr;

    if ( idle_vcpu[cpu] == NULL )
        return -ENOMEM;

    idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0;

    /*
     * No need to allocate any scheduler data, as cpus coming online are
     * free initially and the idle scheduler doesn't need any data areas
     * allocated.
     */

    sr->curr = idle_vcpu[cpu]->sched_unit;
    sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit;

    sr->sched_priv = NULL;

    return 0;
}

static void cf_check sched_res_free(struct rcu_head *head)
{
    struct sched_resource *sr = container_of(head, struct sched_resource, rcu);

    free_cpumask_var(sr->cpus);
    if ( sr->sched_unit_idle )
        sched_free_unit_mem(sr->sched_unit_idle);
    xfree(sr);
}
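/*
 * Added commentary: when a sched_resource that was reachable from a live
 * cpu is torn down (cpu_schedule_down() below, or the merge loop in
 * schedule_cpu_add()), it is freed via call_rcu() rather than immediately.
 * Another cpu may still be spinning on sr->schedule_lock through a pointer
 * it fetched before the resource was unlinked; the RCU grace period keeps
 * that spin from turning into a use-after-free.
 */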
static void cpu_schedule_down(unsigned int cpu)
{
    struct sched_resource *sr;

    rcu_read_lock(&sched_res_rculock);

    sr = get_sched_res(cpu);

    kill_timer(&sr->s_timer);

    cpumask_clear_cpu(cpu, &sched_res_mask);
    set_sched_res(cpu, NULL);

    /* Keep the idle unit: clearing the pointer stops sched_res_free()
     * from freeing it. */
    sr->sched_unit_idle = NULL;
    call_rcu(&sr->rcu, sched_res_free);

    rcu_read_unlock(&sched_res_rculock);
}

void sched_rm_cpu(unsigned int cpu)
{
    int rc;

    rcu_read_lock(&domlist_read_lock);
    rc = cpu_disable_scheduler(cpu);
    BUG_ON(rc);
    rcu_read_unlock(&domlist_read_lock);
    cpu_schedule_down(cpu);
}

static int cf_check cpu_schedule_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    /*
     * All scheduler related suspend/resume handling needed is done in
     * cpupool.c.
     */
    if ( system_state > SYS_STATE_active )
        return NOTIFY_DONE;

    rcu_read_lock(&sched_res_rculock);

    /*
     * From the scheduler perspective, bringing up a pCPU requires
     * allocating and initializing the per-pCPU scheduler specific data,
     * as well as "registering" this pCPU to the scheduler (which may
     * involve modifying some scheduler wide data structures).
     * As new pCPUs always start as "free" cpus with the minimal idle
     * scheduler being in charge, we don't need any of that.
     *
     * On the other hand, at teardown, we need to reverse what has been done
     * during initialization, and then free the per-pCPU specific data. A
     * pCPU brought down is not forced through "free" cpus, so here we need
     * to use the appropriate hooks.
     *
     * This happens by calling the deinit_pdata and free_pdata hooks, in this
     * order. If no per-pCPU memory was allocated, there is no need to
     * provide an implementation of free_pdata. deinit_pdata may, however,
     * be necessary/useful in this case too (e.g., it can undo something done
     * on scheduler wide data structures during switch_sched). Both
     * deinit_pdata and free_pdata are called during CPU_DEAD.
     *
     * If something goes wrong during bringup, we go to CPU_UP_CANCELED.
     */
    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_schedule_up(cpu);
        break;
    case CPU_DOWN_PREPARE:
        rcu_read_lock(&domlist_read_lock);
        rc = cpu_disable_scheduler_check(cpu);
        rcu_read_unlock(&domlist_read_lock);
        break;
    case CPU_DEAD:
        sched_rm_cpu(cpu);
        break;
    case CPU_UP_CANCELED:
        cpu_schedule_down(cpu);
        break;
    default:
        break;
    }

    rcu_read_unlock(&sched_res_rculock);

    return notifier_from_errno(rc);
}

static struct notifier_block cpu_schedule_nfb = {
    .notifier_call = cpu_schedule_callback
};

const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu)
{
    const cpumask_t *mask;

    switch ( opt )
    {
    case SCHED_GRAN_cpu:
        mask = cpumask_of(cpu);
        break;
    case SCHED_GRAN_core:
        mask = per_cpu(cpu_sibling_mask, cpu);
        break;
    case SCHED_GRAN_socket:
        mask = per_cpu(cpu_core_mask, cpu);
        break;
    default:
        ASSERT_UNREACHABLE();
        return NULL;
    }

    return mask;
}

static void cf_check schedule_dummy(void)
{
    sched_tasklet_check_cpu(smp_processor_id());
}

void scheduler_disable(void)
{
    scheduler_active = false;
    open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy);
    open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy);
}

void scheduler_enable(void)
{
    open_softirq(SCHEDULE_SOFTIRQ, schedule);
    open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave);
    scheduler_active = true;
}

static inline
const struct scheduler *__init sched_get_by_name(const char *sched_name)
{
    unsigned int i;

    for ( i = 0; i < NUM_SCHEDULERS; i++ )
        if ( schedulers[i] && !strcmp(schedulers[i]->opt_name, sched_name) )
            return schedulers[i];

    return NULL;
}

int __init sched_get_id_by_name(const char *sched_name)
{
    const struct scheduler *scheduler = sched_get_by_name(sched_name);

    return scheduler ? scheduler->sched_id : -1;
}
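/*
 * Usage sketch (illustrative): resolving a scheduler id from a name, e.g.
 * on a hypothetical cpupool creation path.  "credit2" is used only as an
 * example name; whether it resolves depends on the build configuration:
 *
 *     int sched_id = sched_get_id_by_name("credit2");
 *
 *     if ( sched_id < 0 )
 *         // name unknown, or scheduler not compiled in
 */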
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    struct domain *idle_domain;
    const struct scheduler *scheduler;
    int i;

    scheduler_enable();

    for ( i = 0; i < NUM_SCHEDULERS; i++ )
    {
        /* Skip entries already dropped by an earlier failed test. */
#define sched_test_func(f)                               \
        if ( schedulers[i] && !schedulers[i]->f )        \
        {                                                \
            printk("scheduler %s misses .%s, dropped\n", \
                   schedulers[i]->opt_name, #f);         \
            schedulers[i] = NULL;                        \
        }

        sched_test_func(init);
        sched_test_func(deinit);
        sched_test_func(pick_resource);
        sched_test_func(alloc_udata);
        sched_test_func(free_udata);
        sched_test_func(switch_sched);
        sched_test_func(do_schedule);

#undef sched_test_func

        if ( schedulers[i] && schedulers[i]->global_init &&
             schedulers[i]->global_init() < 0 )
        {
            printk("scheduler %s failed initialization, dropped\n",
                   schedulers[i]->opt_name);
            schedulers[i] = NULL;
        }
    }

    scheduler = sched_get_by_name(opt_sched);
    if ( !scheduler )
    {
        printk("Could not find scheduler: %s\n", opt_sched);
        scheduler = sched_get_by_name(CONFIG_SCHED_DEFAULT);
        BUG_ON(!scheduler);
        printk("Using '%s' (%s)\n", scheduler->name, scheduler->opt_name);
    }

    ops = *scheduler;

    if ( cpu_schedule_up(0) )
        BUG();
    register_cpu_notifier(&cpu_schedule_nfb);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    if ( sched_init(&ops) )
        panic("scheduler returned error on init\n");

    if ( sched_ratelimit_us &&
         (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX ||
          sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
    {
        printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
               " Resetting to default %u\n",
               XEN_SYSCTL_SCHED_RATELIMIT_MIN,
               XEN_SYSCTL_SCHED_RATELIMIT_MAX,
               SCHED_DEFAULT_RATELIMIT_US);
        sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
    }

    /*
     * The idle dom is created privileged to ensure unrestricted access
     * during setup and will be demoted by xsm_set_system_active() when
     * setup is complete.
     */
    idle_domain = domain_create(DOMID_IDLE, NULL, CDF_privileged);
    BUG_ON(IS_ERR(idle_domain));
    BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu));
    idle_domain->vcpu = idle_vcpu;
    idle_domain->max_vcpus = nr_cpu_ids;
    if ( vcpu_create(idle_domain, 0) == NULL )
        BUG();

    rcu_read_lock(&sched_res_rculock);

    get_sched_res(0)->curr = idle_vcpu[0]->sched_unit;
    get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit;

    rcu_read_unlock(&sched_res_rculock);
}
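/*
 * Added commentary: schedule_cpu_add() and schedule_cpu_rm() below rely on
 * the retry loop inside the pcpu_schedule_lock() helpers.  In essence (a
 * simplified sketch, not the literal implementation):
 *
 *     for ( ; ; )
 *     {
 *         spinlock_t *lock = get_sched_res(cpu)->schedule_lock;
 *
 *         spin_lock(lock);
 *         if ( lock == get_sched_res(cpu)->schedule_lock )
 *             break;             // pointer stable: we hold the right lock
 *         spin_unlock(lock);     // lock was rerouted under our feet: retry
 *     }
 *
 * This is what makes it safe for both functions below to switch
 * sr->schedule_lock while holding the old lock.
 */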
/*
 * Move a pCPU from free cpus (running the idle scheduler) to a cpupool
 * using any "real" scheduler.
 * The cpu is still marked as "free" and not yet valid for its cpupool.
 */
int schedule_cpu_add(unsigned int cpu, struct cpupool *c)
{
    struct vcpu *idle;
    void *ppriv, *vpriv;
    struct scheduler *new_ops = c->sched;
    struct sched_resource *sr;
    spinlock_t *old_lock, *new_lock;
    unsigned long flags;
    int ret = 0;

    rcu_read_lock(&sched_res_rculock);

    sr = get_sched_res(cpu);

    ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
    ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid));
    ASSERT(get_sched_res(cpu)->cpupool == NULL);

    /*
     * To setup the cpu for the new scheduler we need:
     *  - a valid instance of per-CPU scheduler specific data, as it is
     *    allocated by sched_alloc_pdata(). Note that we do not want to
     *    initialize it yet, as that will be done by the target scheduler,
     *    in sched_switch_sched(), in proper ordering and with locking.
     *  - a valid instance of per-vCPU scheduler specific data, for the idle
     *    vCPU of cpu. That is what the target scheduler will use for the
     *    sched_priv field of the per-vCPU info of the idle domain.
     */
    idle = idle_vcpu[cpu];
    ppriv = sched_alloc_pdata(new_ops, cpu);
    if ( IS_ERR(ppriv) )
    {
        ret = PTR_ERR(ppriv);
        goto out;
    }

    vpriv = sched_alloc_udata(new_ops, idle->sched_unit,
                              idle->domain->sched_priv);
    if ( vpriv == NULL )
    {
        sched_free_pdata(new_ops, ppriv, cpu);
        ret = -ENOMEM;
        goto out;
    }

    /*
     * The actual switch, including the rerouting of the scheduler lock to
     * whatever new_ops prefers, needs to happen in one critical section,
     * protected by old_ops' lock, or races are possible.
     * It is, in fact, the lock of the idle scheduler that we are taking.
     * But that is ok as anyone trying to schedule on this cpu will spin
     * until we release that lock (bottom of this function). Once it gets
     * the lock, thanks to the loop inside the *_schedule_lock() functions,
     * it will notice that the lock itself changed, and retry acquiring the
     * new one (which will be the correct, remapped one, at that point).
     */
    old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);

    if ( cpupool_get_granularity(c) > 1 )
    {
        const cpumask_t *mask;
        unsigned int cpu_iter, idx = 0;
        struct sched_unit *master_unit;
        struct sched_resource *sr_old;

        /*
         * We need to merge multiple idle_vcpu units and sched_resource
         * structs into one. As the free cpus all share the same lock we are
         * fine doing that now. The worst which could happen would be someone
         * waiting for the lock, thus dereferencing sched_res->schedule_lock.
         * This is the reason we are freeing struct sched_res via call_rcu()
         * to avoid the lock pointer suddenly disappearing.
         */
        mask = sched_get_opt_cpumask(c->gran, cpu);
        master_unit = idle_vcpu[cpu]->sched_unit;

        for_each_cpu ( cpu_iter, mask )
        {
            if ( idx )
                cpumask_clear_cpu(cpu_iter, &sched_res_mask);

            per_cpu(sched_res_idx, cpu_iter) = idx++;

            if ( cpu == cpu_iter )
                continue;

            sr_old = get_sched_res(cpu_iter);
            kill_timer(&sr_old->s_timer);
            idle_vcpu[cpu_iter]->sched_unit = master_unit;
            master_unit->runstate_cnt[RUNSTATE_running]++;
            set_sched_res(cpu_iter, sr);
            cpumask_set_cpu(cpu_iter, sr->cpus);

            call_rcu(&sr_old->rcu, sched_res_free);
        }
    }

    new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);

    sr->scheduler = new_ops;
    sr->sched_priv = ppriv;

    /*
     * Reroute the lock to the per pCPU lock as /last/ thing. In fact,
     * if it is free (and it can be) we want that anyone that manages
     * taking it, finds all the initializations we've done above in place.
     */
    smp_wmb();
    sr->schedule_lock = new_lock;

    /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */
    spin_unlock_irqrestore(old_lock, flags);

    sr->granularity = cpupool_get_granularity(c);
    sr->cpupool = c;
    /* The cpu is added to a pool, trigger it to go pick up some work */
    cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);

 out:
    rcu_read_unlock(&sched_res_rculock);

    return ret;
}
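/*
 * Usage sketch (illustrative): alloc_cpu_rm_data() below exists as a
 * separate step because the actual removal may run in stop_machine()
 * context, where allocation is impossible.  A caller would therefore do
 * something along these lines (error handling elided):
 *
 *     data = alloc_cpu_rm_data(cpu, true);    // may allocate
 *     ...                                     // enter no-alloc context
 *     schedule_cpu_rm(cpu, data);             // consumes, never allocates
 */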
/*
 * Allocate all memory needed for free_cpu_rm_data(), as allocations cannot
 * be made in stop_machine() context.
 *
 * Between alloc_cpu_rm_data() and the real cpu removal action the relevant
 * contents of struct sched_resource can't change, as the cpu in question is
 * locked against any other movement to or from cpupools, and the data copied
 * by alloc_cpu_rm_data() is modified only in case the cpu in question is
 * being moved from or to a cpupool.
 */
struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc)
{
    struct cpu_rm_data *data;
    const struct sched_resource *sr;
    unsigned int idx;

    rcu_read_lock(&sched_res_rculock);

    sr = get_sched_res(cpu);
    data = xmalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1);
    if ( !data )
        goto out;

    if ( aff_alloc )
    {
        if ( !alloc_affinity_masks(&data->affinity) )
        {
            XFREE(data);
            goto out;
        }
    }
    else
        memset(&data->affinity, 0, sizeof(data->affinity));

    data->old_ops = sr->scheduler;
    data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
    data->ppriv_old = sr->sched_priv;

    for ( idx = 0; idx < sr->granularity - 1; idx++ )
    {
        data->sr[idx] = sched_alloc_res();
        if ( data->sr[idx] )
        {
            data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem();
            if ( !data->sr[idx]->sched_unit_idle )
            {
                sched_res_free(&data->sr[idx]->rcu);
                data->sr[idx] = NULL;
            }
        }
        if ( !data->sr[idx] )
        {
            while ( idx > 0 )
                sched_res_free(&data->sr[--idx]->rcu);
            free_affinity_masks(&data->affinity);
            XFREE(data);
            goto out;
        }

        data->sr[idx]->curr = data->sr[idx]->sched_unit_idle;
        data->sr[idx]->scheduler = &sched_idle_ops;
        data->sr[idx]->granularity = 1;

        /* We want the lock not to change when replacing the resource. */
        data->sr[idx]->schedule_lock = sr->schedule_lock;
    }

 out:
    rcu_read_unlock(&sched_res_rculock);

    return data;
}

void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu)
{
    sched_free_udata(mem->old_ops, mem->vpriv_old);
    sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu);
    free_affinity_masks(&mem->affinity);

    xfree(mem);
}
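/*
 * Added commentary (worked example): with core granularity and a core made
 * of cpus {4,5}, a single sched_resource covers both cpus while in a pool.
 * Removing cpu 4 via schedule_cpu_rm() below keeps the existing resource
 * for cpu 4 itself (idx is not incremented for it) and installs one of the
 * pre-allocated data->sr[] resources for cpu 5; each resource ends up with
 * granularity 1, the idle scheduler and the common free-cpu lock.
 */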
/*
 * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops
 * (the idle scheduler).
 * The cpu is already marked as "free" and not valid any longer for its
 * cpupool.
 */
int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *data)
{
    struct sched_resource *sr;
    struct sched_unit *unit;
    spinlock_t *old_lock;
    unsigned long flags;
    int idx = 0;
    unsigned int cpu_iter;
    bool free_data = !data;

    if ( !data )
        data = alloc_cpu_rm_data(cpu, false);
    if ( !data )
        return -ENOMEM;

    rcu_read_lock(&sched_res_rculock);

    sr = get_sched_res(cpu);

    ASSERT(sr->granularity);
    ASSERT(sr->cpupool != NULL);
    ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
    ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));

    /* See comment in schedule_cpu_add() regarding lock switching. */
    old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);

    for_each_cpu ( cpu_iter, sr->cpus )
    {
        per_cpu(sched_res_idx, cpu_iter) = 0;
        if ( cpu_iter == cpu )
        {
            unit = idle_vcpu[cpu_iter]->sched_unit;
            unit->priv = NULL;
            atomic_set(&unit->next_task->rendezvous_out_cnt, 0);
            unit->rendezvous_in_cnt = 0;
        }
        else
        {
            /* Initialize unit. */
            unit = data->sr[idx]->sched_unit_idle;
            unit->res = data->sr[idx];
            unit->is_running = true;
            sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
            sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);

            /* Adjust cpu masks of resources (old and new). */
            cpumask_clear_cpu(cpu_iter, sr->cpus);
            cpumask_set_cpu(cpu_iter, data->sr[idx]->cpus);
            cpumask_set_cpu(cpu_iter, &sched_res_mask);

            /* Init timer. */
            init_timer(&data->sr[idx]->s_timer, s_timer_fn, NULL, cpu_iter);

            /* Last resource initializations and insert resource pointer. */
            data->sr[idx]->master_cpu = cpu_iter;
            set_sched_res(cpu_iter, data->sr[idx]);

            /* Last action: set the new lock pointer. */
            smp_mb();
            data->sr[idx]->schedule_lock = &sched_free_cpu_lock;

            idx++;
        }
    }

    sr->scheduler = &sched_idle_ops;
    sr->sched_priv = NULL;
    sr->granularity = 1;
    sr->cpupool = NULL;

    smp_mb();
    sr->schedule_lock = &sched_free_cpu_lock;

    /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
    spin_unlock_irqrestore(old_lock, flags);

    sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu);

    rcu_read_unlock(&sched_res_rculock);

    if ( free_data )
        free_cpu_rm_data(data, cpu);

    return 0;
}

struct scheduler *scheduler_get_default(void)
{
    return &ops;
}

struct scheduler *scheduler_alloc(unsigned int sched_id)
{
    int i;
    int ret;
    struct scheduler *sched;

    for ( i = 0; i < NUM_SCHEDULERS; i++ )
        if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
            goto found;

    return ERR_PTR(-ENOENT);

 found:
    if ( (sched = xmalloc(struct scheduler)) == NULL )
        return ERR_PTR(-ENOMEM);
    memcpy(sched, schedulers[i], sizeof(*sched));
    if ( (ret = sched_init(sched)) != 0 )
    {
        xfree(sched);
        sched = ERR_PTR(ret);
    }

    return sched;
}

void scheduler_free(struct scheduler *sched)
{
    BUG_ON(sched == &ops);
    sched_deinit(sched);
    xfree(sched);
}

void schedule_dump(struct cpupool *c)
{
    unsigned int i, j;
    struct scheduler *sched;
    cpumask_t *cpus;

    /* Locking, if necessary, must be handled within each scheduler. */

    rcu_read_lock(&sched_res_rculock);

    if ( c != NULL )
    {
        sched = c->sched;
        cpus = c->res_valid;
        printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
        sched_dump_settings(sched);
    }
    else
    {
        sched = &ops;
        cpus = &cpupool_free_cpus;
    }

    printk("CPUs info:\n");
    for_each_cpu ( i, cpus )
    {
        struct sched_resource *sr = get_sched_res(i);
        unsigned long flags;
        spinlock_t *lock;

        lock = pcpu_schedule_lock_irqsave(i, &flags);

        printk("CPU[%02d] current=%pv, curr=%pv, prev=%pv\n", i,
               get_cpu_current(i), sr->curr ? sr->curr->vcpu_list : NULL,
               sr->prev ? sr->prev->vcpu_list : NULL);
        for_each_cpu ( j, sr->cpus )
            if ( i != j )
                printk("CPU[%02d] current=%pv\n", j, get_cpu_current(j));

        pcpu_schedule_unlock_irqrestore(lock, flags, i);

        sched_dump_cpu_state(sched, i);
    }

    rcu_read_unlock(&sched_res_rculock);
}

void wait(void)
{
    schedule();
}

#ifdef CONFIG_X86
void __init sched_setup_dom0_vcpus(struct domain *d)
{
    unsigned int i;

    for ( i = 1; i < d->max_vcpus; i++ )
        vcpu_create(d, i);

    domain_update_node_affinity(d);
}
#endif

#ifdef CONFIG_COMPAT
#include "compat.c"
#endif

#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */