/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/param.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/softirq.h>
#include <xen/errno.h>
#include <xen/keyhandler.h>
#include <xen/trace.h>
#include <xen/err.h>
#include <asm/atomic.h>
#include <asm/div64.h>

#include "private.h"

/*
 * Locking:
 * - Scheduler-lock (a.k.a. runqueue lock):
 *  + is per-runqueue, and there is one runqueue per-cpu;
 *  + serializes all runqueue manipulation operations;
 * - Private data lock (a.k.a. private scheduler lock):
 *  + serializes accesses to the scheduler global state (weight,
 *    credit, balance_credit, etc);
 *  + serializes updates to the domains' scheduling parameters.
 *
 * Ordering is "private lock always comes first":
 *  + if we need both locks, we must acquire the private
 *    scheduler lock first;
 *  + if we already own a runqueue lock, we must never acquire
 *    the private scheduler lock.
 */

/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
/* Default timeslice: 30ms */
#define CSCHED_DEFAULT_TSLICE_MS    30
#define CSCHED_CREDITS_PER_MSEC     10
/* Never set a timer shorter than this value. */
#define CSCHED_MIN_TIMER            XEN_SYSCTL_SCHED_RATELIMIT_MIN

/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE        -64      /* idle */

/*
 * Flags
 *
 * Note that svc->flags (where these flags live) is protected by an
 * inconsistent set of locks. Therefore atomic-safe bit operations must
 * be used for accessing it.
 */
#define CSCHED_FLAG_UNIT_PARKED    0x0  /* UNIT over capped credits */
#define CSCHED_FLAG_UNIT_YIELD     0x1  /* UNIT yielding */
#define CSCHED_FLAG_UNIT_MIGRATING 0x2  /* UNIT may have moved to a new pcpu */
#define CSCHED_FLAG_UNIT_PINNED    0x4  /* UNIT can run only on 1 pcpu */

/*
 * Useful macros
 */
#define CSCHED_PRIV(_ops)   \
    ((struct csched_private *)((_ops)->sched_data))
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)get_sched_res(_c)->sched_priv)
#define CSCHED_UNIT(unit)   ((struct csched_unit *) (unit)->priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))

/*
 * CSCHED_STATS
 *
 * Manage very basic per-unit counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#ifdef SCHED_STATS

#define CSCHED_STATS

#define SCHED_UNIT_STATS_RESET(_V)                      \
    do                                                  \
    {                                                   \
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    } while ( 0 )

#define SCHED_UNIT_STAT_CRANK(_V, _X)     (((_V)->stats._X)++)

#define SCHED_UNIT_STAT_SET(_V, _X, _Y)   (((_V)->stats._X) = (_Y))

#else /* !SCHED_STATS */

#undef CSCHED_STATS

#define SCHED_UNIT_STATS_RESET(_V)        do {} while ( 0 )
#define SCHED_UNIT_STAT_CRANK(_V, _X)     do {} while ( 0 )
#define SCHED_UNIT_STAT_SET(_V, _X, _Y)   do {} while ( 0 )

#endif /* SCHED_STATS */

/*
 * Credit tracing events ("only" 512 available!). Check
 * include/public/trace.h for more details.
*/ #define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1) #define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2) #define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3) #define TRC_CSCHED_STOLEN_UNIT TRC_SCHED_CLASS_EVT(CSCHED, 4) #define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5) #define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6) #define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7) #define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8) #define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9) #define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10) #define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11) /* * Boot parameters */ static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms); /* * Physical CPU */ struct csched_pcpu { struct list_head runq; uint32_t runq_sort_last; unsigned int idle_bias; unsigned int nr_runnable; unsigned int tick; struct timer ticker; }; /* * Virtual UNIT */ struct csched_unit { struct list_head runq_elem; struct list_head active_unit_elem; /* Up-pointers */ struct csched_dom *sdom; struct sched_unit *unit; s_time_t start_time; /* When we were scheduled (used for credit) */ unsigned flags; int pri; atomic_t credit; unsigned int residual; s_time_t last_sched_time; #ifdef CSCHED_STATS struct { int credit_last; uint32_t credit_incr; uint32_t state_active; uint32_t state_idle; uint32_t migrate_q; uint32_t migrate_r; uint32_t kicked_away; } stats; #endif }; /* * Domain */ struct csched_dom { struct list_head active_unit; struct list_head active_sdom_elem; struct domain *dom; uint16_t active_unit_count; uint16_t weight; uint16_t cap; }; /* * System-wide private data */ struct csched_private { /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ spinlock_t lock; cpumask_var_t idlers; cpumask_var_t cpus; uint32_t *balance_bias; uint32_t runq_sort; uint32_t ncpus; /* Period of master and tick in milliseconds */ unsigned int tick_period_us, ticks_per_tslice; s_time_t ratelimit, tslice, unit_migr_delay; struct list_head active_sdom; uint32_t weight; uint32_t credit; int credit_balance; unsigned int credits_per_tslice; unsigned int master; struct timer master_ticker; }; static void cf_check csched_tick(void *_cpu); static void cf_check csched_acct(void *dummy); static inline int __unit_on_runq(const struct csched_unit *svc) { return !list_empty(&svc->runq_elem); } static inline struct csched_unit * __runq_elem(struct list_head *elem) { return list_entry(elem, struct csched_unit, runq_elem); } /* Is the first element of cpu's runq (if any) cpu's idle unit? */ static inline bool is_runq_idle(unsigned int cpu) { /* * We're peeking at cpu's runq, we must hold the proper lock. 
*/ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); return list_empty(RUNQ(cpu)) || is_idle_unit(__runq_elem(RUNQ(cpu)->next)->unit); } static inline void inc_nr_runnable(unsigned int cpu) { ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); CSCHED_PCPU(cpu)->nr_runnable++; } static inline void dec_nr_runnable(unsigned int cpu) { ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1); CSCHED_PCPU(cpu)->nr_runnable--; } static inline void __runq_insert(struct csched_unit *svc) { unsigned int cpu = sched_unit_master(svc->unit); const struct list_head * const runq = RUNQ(cpu); struct list_head *iter; BUG_ON( __unit_on_runq(svc) ); list_for_each( iter, runq ) { const struct csched_unit * const iter_svc = __runq_elem(iter); if ( svc->pri > iter_svc->pri ) break; } /* If the unit yielded, try to put it behind one lower-priority * runnable unit if we can. The next runq_sort will bring it forward * within 30ms if the queue too long. */ if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags) && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) { iter=iter->next; /* Some sanity checks */ BUG_ON(iter == runq); } list_add_tail(&svc->runq_elem, iter); } static inline void runq_insert(struct csched_unit *svc) { __runq_insert(svc); inc_nr_runnable(sched_unit_master(svc->unit)); } static inline void __runq_remove(struct csched_unit *svc) { BUG_ON( !__unit_on_runq(svc) ); list_del_init(&svc->runq_elem); } static inline void runq_remove(struct csched_unit *svc) { dec_nr_runnable(sched_unit_master(svc->unit)); __runq_remove(svc); } static void burn_credits(struct csched_unit *svc, s_time_t now) { s_time_t delta; uint64_t val; unsigned int credits; /* Assert svc is current */ ASSERT( svc == CSCHED_UNIT(curr_on_cpu(sched_unit_master(svc->unit))) ); if ( (delta = now - svc->start_time) <= 0 ) return; val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual; svc->residual = do_div(val, MILLISECS(1)); credits = val; ASSERT(credits == val); /* make sure we haven't truncated val */ atomic_sub(credits, &svc->credit); svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC; } static bool __read_mostly opt_tickle_one_idle = true; boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle); static DEFINE_PER_CPU(unsigned int, last_tickle_cpu); static inline void __runq_tickle(const struct csched_unit *new) { unsigned int cpu = sched_unit_master(new->unit); const struct sched_resource *sr = get_sched_res(cpu); const struct sched_unit *unit = new->unit; struct csched_unit * const cur = CSCHED_UNIT(curr_on_cpu(cpu)); struct csched_private *prv = CSCHED_PRIV(sr->scheduler); cpumask_t mask, idle_mask, *online; int balance_step, idlers_empty; ASSERT(cur); cpumask_clear(&mask); online = cpupool_domain_master_cpumask(new->sdom->dom); cpumask_and(&idle_mask, prv->idlers, online); idlers_empty = cpumask_empty(&idle_mask); /* * Exclusive pinning is when a unit has hard-affinity with only one * cpu, and there is no other unit that has hard-affinity with that * same cpu. This is infrequent, but if it happens, is for achieving * the most possible determinism, and least possible overhead for * the units in question. * * Try to identify the vast majority of these situations, and deal * with them quickly. 
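 *
 * For example: a domain whose only unit the admin has pinned to a
 * single pCPU (e.g. with "xl vcpu-pin"), with nothing else pinned
 * there. If that pCPU is currently idle, we just tickle it directly
 * below and skip the whole affinity balancing loop.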
*/ if ( unlikely(test_bit(CSCHED_FLAG_UNIT_PINNED, &new->flags) && cpumask_test_cpu(cpu, &idle_mask)) ) { ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); SCHED_STAT_CRANK(tickled_idle_cpu_excl); __cpumask_set_cpu(cpu, &mask); goto tickle; } /* * If the pcpu is idle, or there are no idlers and the new * unit is a higher priority than the old unit, run it here. * * If there are idle cpus, first try to find one suitable to run * new, so we can avoid preempting cur. If we cannot find a * suitable idler on which to run new, run it here, but try to * find a suitable idler on which to run cur instead. */ if ( cur->pri == CSCHED_PRI_IDLE || (idlers_empty && new->pri > cur->pri) ) { if ( cur->pri != CSCHED_PRI_IDLE ) SCHED_STAT_CRANK(tickled_busy_cpu); else SCHED_STAT_CRANK(tickled_idle_cpu); __cpumask_set_cpu(cpu, &mask); } else if ( !idlers_empty ) { /* * Soft and hard affinity balancing loop. For units without * a useful soft affinity, consider hard affinity only. */ for_each_affinity_balance_step( balance_step ) { int new_idlers_empty; if ( balance_step == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) continue; /* Are there idlers suitable for new (for this balance step)? */ affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu)); cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), &idle_mask); new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu)); /* * Let's not be too harsh! If there aren't idlers suitable * for new in its soft affinity mask, make sure we check its * hard affinity as well, before taking final decisions. */ if ( new_idlers_empty && balance_step == BALANCE_SOFT_AFFINITY ) continue; /* * If there are no suitable idlers for new, and it's higher * priority than cur, check whether we can migrate cur away. * We have to do it indirectly, via _VPF_migrating (instead * of just tickling any idler suitable for cur) because cur * is running. * * If there are suitable idlers for new, no matter priorities, * leave cur alone (as it is running and is, likely, cache-hot) * and wake some of them (which is waking up and so is, likely, * cache cold anyway). */ if ( new_idlers_empty && new->pri > cur->pri ) { if ( cpumask_intersects(unit->cpu_hard_affinity, &idle_mask) ) { SCHED_UNIT_STAT_CRANK(cur, kicked_away); SCHED_UNIT_STAT_CRANK(cur, migrate_r); SCHED_STAT_CRANK(migrate_kicked_away); sched_set_pause_flags(cur->unit, _VPF_migrating); } /* Tickle cpu anyway, to let new preempt cur. */ SCHED_STAT_CRANK(tickled_busy_cpu); __cpumask_set_cpu(cpu, &mask); } else if ( !new_idlers_empty ) { /* Which of the idlers suitable for new shall we wake up? */ SCHED_STAT_CRANK(tickled_idle_cpu); if ( opt_tickle_one_idle ) { this_cpu(last_tickle_cpu) = cpumask_cycle(this_cpu(last_tickle_cpu), cpumask_scratch_cpu(cpu)); __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); } else cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu)); } /* Did we find anyone? */ if ( !cpumask_empty(&mask) ) break; } } tickle: if ( !cpumask_empty(&mask) ) { if ( unlikely(tb_init_done) ) { /* Avoid TRACE_*: saves checking !tb_init_done each step */ for_each_cpu(cpu, &mask) __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu); } /* * Mark the designated CPUs as busy and send them all the scheduler * interrupt. We need the for_each_cpu for dealing with the * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and * can't use cpumask_andnot(), because prv->idlers needs atomic access. 
* * In the default (and most common) case, when opt_rickle_one_idle is * true, the loop does only one step, and only one bit is cleared. */ for_each_cpu(cpu, &mask) cpumask_clear_cpu(cpu, prv->idlers); cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ); } else SCHED_STAT_CRANK(tickled_no_cpu); } static void cf_check csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) { const struct csched_private *prv = CSCHED_PRIV(ops); /* * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're * beeing called from CPU_UP_CANCELLED, because bringing up a pCPU failed * very early. xfree() does not really mind, but we want to be sure that, * when we get here, either init_pdata has never been called, or * deinit_pdata has been called already. */ ASSERT(!cpumask_test_cpu(cpu, prv->cpus)); xfree(pcpu); } static void cf_check csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) { struct csched_private *prv = CSCHED_PRIV(ops); struct csched_pcpu *spc = pcpu; unsigned int node = cpu_to_node(cpu); unsigned long flags; /* * Scheduler specific data for this pCPU must still be there and and be * valid. In fact, if we are here: * 1. alloc_pdata must have been called for this cpu, and free_pdata * must not have been called on it before us, * 2. init_pdata must have been called on this cpu, and deinit_pdata * (us!) must not have been called on it already. */ ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus)); spin_lock_irqsave(&prv->lock, flags); prv->credit -= prv->credits_per_tslice; prv->ncpus--; cpumask_clear_cpu(cpu, prv->idlers); cpumask_clear_cpu(cpu, prv->cpus); if ( (prv->master == cpu) && (prv->ncpus > 0) ) { prv->master = cpumask_first(prv->cpus); migrate_timer(&prv->master_ticker, prv->master); } if ( prv->balance_bias[node] == cpu ) { cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node)); if ( !cpumask_empty(cpumask_scratch) ) prv->balance_bias[node] = cpumask_first(cpumask_scratch); } kill_timer(&spc->ticker); if ( prv->ncpus == 0 ) kill_timer(&prv->master_ticker); spin_unlock_irqrestore(&prv->lock, flags); } static void *cf_check csched_alloc_pdata(const struct scheduler *ops, int cpu) { struct csched_pcpu *spc; /* Allocate per-PCPU info */ spc = xzalloc(struct csched_pcpu); if ( spc == NULL ) return ERR_PTR(-ENOMEM); return spc; } static void init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu) { ASSERT(spin_is_locked(&prv->lock)); /* cpu data needs to be allocated, but STILL uninitialized. */ ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL); /* Initialize/update system-wide config */ prv->credit += prv->credits_per_tslice; prv->ncpus++; cpumask_set_cpu(cpu, prv->cpus); if ( prv->ncpus == 1 ) { prv->master = cpu; init_timer(&prv->master_ticker, csched_acct, prv, cpu); set_timer(&prv->master_ticker, NOW() + prv->tslice); } cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu))); if ( cpumask_weight(cpumask_scratch) == 1 ) prv->balance_bias[cpu_to_node(cpu)] = cpu; init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); INIT_LIST_HEAD(&spc->runq); spc->runq_sort_last = prv->runq_sort; spc->idle_bias = nr_cpu_ids - 1; /* Start off idling... 
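 *
 * The pCPU gets recorded in prv->idlers below. From here on,
 * csched_schedule() keeps that bit in sync (set when it picks the
 * idle unit, cleared otherwise), and __runq_tickle() consults the
 * mask to find idle pCPUs worth waking.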
*/ BUG_ON(!is_idle_unit(curr_on_cpu(cpu))); cpumask_set_cpu(cpu, prv->idlers); spc->nr_runnable = 0; } static void cf_check csched_move_timers(const struct scheduler *ops, struct sched_resource *sr) { struct csched_private *prv = CSCHED_PRIV(ops); struct csched_pcpu *spc = sr->sched_priv; if ( sr->master_cpu == prv->master ) migrate_timer(&prv->master_ticker, prv->master); migrate_timer(&spc->ticker, sr->master_cpu); } /* Change the scheduler of cpu to us (Credit). */ static spinlock_t *cf_check csched_switch_sched(struct scheduler *new_ops, unsigned int cpu, void *pdata, void *vdata) { struct sched_resource *sr = get_sched_res(cpu); struct csched_private *prv = CSCHED_PRIV(new_ops); struct csched_unit *svc = vdata; ASSERT(svc && is_idle_unit(svc->unit)); sched_idle_unit(cpu)->priv = vdata; /* * We are holding the runqueue lock already (it's been taken in * schedule_cpu_switch()). It actually may or may not be the 'right' * one for this cpu, but that is ok for preventing races. */ ASSERT(!local_irq_is_enabled()); spin_lock(&prv->lock); init_pdata(prv, pdata, cpu); spin_unlock(&prv->lock); return &sr->_lock; } #ifndef NDEBUG static inline void __csched_unit_check(const struct sched_unit *unit) { struct csched_unit * const svc = CSCHED_UNIT(unit); struct csched_dom * const sdom = svc->sdom; BUG_ON( svc->unit != unit ); BUG_ON( sdom != CSCHED_DOM(unit->domain) ); if ( sdom ) { BUG_ON( is_idle_unit(unit) ); BUG_ON( sdom->dom != unit->domain ); } else { BUG_ON( !is_idle_unit(unit) ); } SCHED_STAT_CRANK(unit_check); } #define CSCHED_UNIT_CHECK(unit) (__csched_unit_check(unit)) #else #define CSCHED_UNIT_CHECK(unit) #endif /* * Delay, in microseconds, between migrations of a UNIT between PCPUs. * This prevents rapid fluttering of a UNIT between CPUs, and reduces the * implicit overheads such as cache-warming. 1ms (1000) has been measured * as a good value. */ static unsigned int vcpu_migration_delay_us; integer_param("vcpu_migration_delay", vcpu_migration_delay_us); static inline bool __csched_vcpu_is_cache_hot(const struct csched_private *prv, const struct csched_unit *svc) { bool hot = prv->unit_migr_delay && (NOW() - svc->last_sched_time) < prv->unit_migr_delay; if ( hot ) SCHED_STAT_CRANK(unit_hot); return hot; } static inline int __csched_unit_is_migrateable(const struct csched_private *prv, const struct sched_unit *unit, int dest_cpu, const cpumask_t *mask) { const struct csched_unit *svc = CSCHED_UNIT(unit); /* * Don't pick up work that's hot on peer PCPU, or that can't (or * would prefer not to) run on cpu. * * The caller is supposed to have already checked that unit is also * not running. */ ASSERT(!unit->is_running); return !__csched_vcpu_is_cache_hot(prv, svc) && cpumask_test_cpu(dest_cpu, mask); } static int _csched_cpu_pick(const struct scheduler *ops, const struct sched_unit *unit, bool commit) { int cpu = sched_unit_master(unit); /* We must always use cpu's scratch space */ cpumask_t *cpus = cpumask_scratch_cpu(cpu); cpumask_t idlers; const cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); struct csched_pcpu *spc = NULL; int balance_step; for_each_affinity_balance_step( balance_step ) { affinity_balance_cpumask(unit, balance_step, cpus); cpumask_and(cpus, online, cpus); /* * We want to pick up a pcpu among the ones that are online and * can accommodate vc. As far as hard affinity is concerned, there * always will be at least one of these pcpus in the scratch cpumask, * hence, the calls to cpumask_cycle() and cpumask_test_cpu() below * are ok. 
* * On the other hand, when considering soft affinity, it is possible * that the mask is empty (for instance, if the domain has been put * in a cpupool that does not contain any of the pcpus in its soft * affinity), which would result in the ASSERT()-s inside cpumask_*() * operations triggering (in debug builds). * * Therefore, if that is the case, we just skip the soft affinity * balancing step all together. */ if ( balance_step == BALANCE_SOFT_AFFINITY && (!has_soft_affinity(unit) || cpumask_empty(cpus)) ) continue; /* If present, prefer vc's current processor */ cpu = cpumask_test_cpu(sched_unit_master(unit), cpus) ? sched_unit_master(unit) : cpumask_cycle(sched_unit_master(unit), cpus); ASSERT(cpumask_test_cpu(cpu, cpus)); /* * Try to find an idle processor within the above constraints. * * In multi-core and multi-threaded CPUs, not all idle execution * vehicles are equal! * * We give preference to the idle execution vehicle with the most * idling neighbours in its grouping. This distributes work across * distinct cores first and guarantees we don't do something stupid * like run two UNITs on co-hyperthreads while there are idle cores * or sockets. * * Notice that, when computing the "idleness" of cpu, we may want to * discount unit. That is, iff unit is the currently running and the * only runnable unit on cpu, we add cpu to the idlers. */ cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); if ( sched_unit_master(unit) == cpu && is_runq_idle(cpu) ) __cpumask_set_cpu(cpu, &idlers); cpumask_and(cpus, &idlers, cpus); /* * It is important that cpu points to an idle processor, if a suitable * one exists (and we can use cpus to check and, possibly, choose a new * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and * cpu points to a busy thread with an idle sibling, both the threads * will be considered the same, from the "idleness" calculation point * of view", preventing unit from being moved to the thread that is * actually idle. * * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so * we check for it first. */ if ( !cpumask_test_cpu(cpu, cpus) && !cpumask_empty(cpus) ) cpu = cpumask_cycle(cpu, cpus); __cpumask_clear_cpu(cpu, cpus); while ( !cpumask_empty(cpus) ) { cpumask_t cpu_idlers; cpumask_t nxt_idlers; int nxt, weight_cpu, weight_nxt; int migrate_factor; nxt = cpumask_cycle(cpu, cpus); if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) { /* We're on the same socket, so check the busy-ness of threads. * Migrate if # of idlers is less at all */ ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); migrate_factor = 1; cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, cpu)); cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, nxt)); } else { /* We're on different sockets, so check the busy-ness of cores. * Migrate only if the other core is twice as idle */ ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); migrate_factor = 2; cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); } weight_cpu = cpumask_weight(&cpu_idlers); weight_nxt = cpumask_weight(&nxt_idlers); /* smt_power_savings: consolidate work rather than spreading it */ if ( sched_smt_power_savings ? 
weight_cpu > weight_nxt : weight_cpu * migrate_factor < weight_nxt ) { cpumask_and(&nxt_idlers, &nxt_idlers, cpus); spc = CSCHED_PCPU(nxt); cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); cpumask_andnot(cpus, cpus, per_cpu(cpu_sibling_mask, cpu)); } else { cpumask_andnot(cpus, cpus, &nxt_idlers); } } /* Stop if cpu is idle */ if ( cpumask_test_cpu(cpu, &idlers) ) break; } if ( commit && spc ) spc->idle_bias = cpu; TRACE_3D(TRC_CSCHED_PICKED_CPU, unit->domain->domain_id, unit->unit_id, cpu); return cpu; } static struct sched_resource *cf_check csched_res_pick(const struct scheduler *ops, const struct sched_unit *unit) { struct csched_unit *svc = CSCHED_UNIT(unit); /* * We have been called by vcpu_migrate() (in schedule.c), as part * of the process of seeing if vc can be migrated to another pcpu. * We make a note about this in svc->flags so that later, in * csched_unit_wake() (still called from vcpu_migrate()) we won't * get boosted, which we don't deserve as we are "only" migrating. */ set_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); return get_sched_res(_csched_cpu_pick(ops, unit, true)); } static inline void __csched_unit_acct_start(struct csched_private *prv, struct csched_unit *svc) { struct csched_dom * const sdom = svc->sdom; unsigned long flags; spin_lock_irqsave(&prv->lock, flags); if ( list_empty(&svc->active_unit_elem) ) { SCHED_UNIT_STAT_CRANK(svc, state_active); SCHED_STAT_CRANK(acct_unit_active); sdom->active_unit_count++; list_add(&svc->active_unit_elem, &sdom->active_unit); /* Make weight per-unit */ prv->weight += sdom->weight; if ( list_empty(&sdom->active_sdom_elem) ) { list_add(&sdom->active_sdom_elem, &prv->active_sdom); } } TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id, svc->unit->unit_id, sdom->active_unit_count); spin_unlock_irqrestore(&prv->lock, flags); } static inline void __csched_unit_acct_stop_locked(struct csched_private *prv, struct csched_unit *svc) { struct csched_dom * const sdom = svc->sdom; BUG_ON( list_empty(&svc->active_unit_elem) ); SCHED_UNIT_STAT_CRANK(svc, state_idle); SCHED_STAT_CRANK(acct_unit_idle); BUG_ON( prv->weight < sdom->weight ); sdom->active_unit_count--; list_del_init(&svc->active_unit_elem); prv->weight -= sdom->weight; if ( list_empty(&sdom->active_unit) ) { list_del_init(&sdom->active_sdom_elem); } TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id, svc->unit->unit_id, sdom->active_unit_count); } static void csched_unit_acct(struct csched_private *prv, unsigned int cpu) { struct sched_unit *currunit = current->sched_unit; struct csched_unit * const svc = CSCHED_UNIT(currunit); const struct sched_resource *sr = get_sched_res(cpu); const struct scheduler *ops = sr->scheduler; ASSERT( sched_unit_master(currunit) == cpu ); ASSERT( svc->sdom != NULL ); ASSERT( !is_idle_unit(svc->unit) ); /* * If this UNIT's priority was boosted when it last awoke, reset it. * If the UNIT is found here, then it's consuming a non-negligeable * amount of CPU resources and should no longer be boosted. */ if ( svc->pri == CSCHED_PRI_TS_BOOST ) { svc->pri = CSCHED_PRI_TS_UNDER; TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id, svc->unit->unit_id); } /* * Update credits */ burn_credits(svc, NOW()); /* * Put this UNIT and domain back on the active list if it was * idling. 
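 *
 * Only units on the active list earn credits in csched_acct(); a
 * unit that was dropped from the list for idling starts earning
 * again from the moment the tick catches it running here.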
*/ if ( list_empty(&svc->active_unit_elem) ) { __csched_unit_acct_start(prv, svc); } else { unsigned int new_cpu; unsigned long flags; spinlock_t *lock = unit_schedule_lock_irqsave(currunit, &flags); /* * If it's been active a while, check if we'd be better off * migrating it to run elsewhere (see multi-core and multi-thread * support in csched_res_pick()). */ new_cpu = _csched_cpu_pick(ops, currunit, false); unit_schedule_unlock_irqrestore(lock, flags, currunit); if ( new_cpu != cpu ) { SCHED_UNIT_STAT_CRANK(svc, migrate_r); SCHED_STAT_CRANK(migrate_running); sched_set_pause_flags(currunit, _VPF_migrating); /* * As we are about to tickle cpu, we should clear its bit in * idlers. But, if we are here, it means there is someone running * on it, and hence the bit must be zero already. */ ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(ops)->idlers)); cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); } } } static void *cf_check csched_alloc_udata( const struct scheduler *ops, struct sched_unit *unit, void *dd) { struct csched_unit *svc; /* Allocate per-UNIT info */ svc = xzalloc(struct csched_unit); if ( svc == NULL ) return NULL; INIT_LIST_HEAD(&svc->runq_elem); INIT_LIST_HEAD(&svc->active_unit_elem); svc->sdom = dd; svc->unit = unit; svc->pri = is_idle_unit(unit) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; SCHED_UNIT_STATS_RESET(svc); SCHED_STAT_CRANK(unit_alloc); return svc; } static void cf_check csched_unit_insert(const struct scheduler *ops, struct sched_unit *unit) { struct csched_unit *svc = unit->priv; spinlock_t *lock; BUG_ON( is_idle_unit(unit) ); /* csched_res_pick() looks in vc->processor's runq, so we need the lock. */ lock = unit_schedule_lock_irq(unit); sched_set_res(unit, csched_res_pick(ops, unit)); spin_unlock_irq(lock); lock = unit_schedule_lock_irq(unit); if ( !__unit_on_runq(svc) && unit_runnable(unit) && !unit->is_running ) runq_insert(svc); unit_schedule_unlock_irq(lock, unit); SCHED_STAT_CRANK(unit_insert); } static void cf_check csched_free_udata(const struct scheduler *ops, void *priv) { struct csched_unit *svc = priv; BUG_ON( !list_empty(&svc->runq_elem) ); xfree(svc); } static void cf_check csched_unit_remove(const struct scheduler *ops, struct sched_unit *unit) { struct csched_private *prv = CSCHED_PRIV(ops); struct csched_unit * const svc = CSCHED_UNIT(unit); struct csched_dom * const sdom = svc->sdom; SCHED_STAT_CRANK(unit_remove); ASSERT(!__unit_on_runq(svc)); if ( test_and_clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) { SCHED_STAT_CRANK(unit_unpark); sched_unit_unpause(svc->unit); } spin_lock_irq(&prv->lock); if ( !list_empty(&svc->active_unit_elem) ) __csched_unit_acct_stop_locked(prv, svc); spin_unlock_irq(&prv->lock); BUG_ON( sdom == NULL ); } static void cf_check csched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) { struct csched_unit * const svc = CSCHED_UNIT(unit); unsigned int cpu = sched_unit_master(unit); const struct sched_resource *sr = get_sched_res(cpu); SCHED_STAT_CRANK(unit_sleep); BUG_ON( is_idle_unit(unit) ); if ( curr_on_cpu(cpu) == unit ) { /* * We are about to tickle cpu, so we should clear its bit in idlers. * But, we are here because unit is going to sleep while running on cpu, * so the bit must be zero already. 
*/ ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(sr->scheduler)->idlers)); cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); } else if ( __unit_on_runq(svc) ) runq_remove(svc); } static void cf_check csched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) { struct csched_unit * const svc = CSCHED_UNIT(unit); bool migrating; BUG_ON( is_idle_unit(unit) ); if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) { SCHED_STAT_CRANK(unit_wake_running); return; } if ( unlikely(__unit_on_runq(svc)) ) { SCHED_STAT_CRANK(unit_wake_onrunq); return; } if ( likely(unit_runnable(unit)) ) SCHED_STAT_CRANK(unit_wake_runnable); else SCHED_STAT_CRANK(unit_wake_not_runnable); /* * We temporarily boost the priority of awaking UNITs! * * If this UNIT consumes a non negligible amount of CPU, it * will eventually find itself in the credit accounting code * path where its priority will be reset to normal. * * If on the other hand the UNIT consumes little CPU and is * blocking and awoken a lot (doing I/O for example), its * priority will remain boosted, optimizing it's wake-to-run * latencies. * * This allows wake-to-run latency sensitive UNITs to preempt * more CPU resource intensive UNITs without impacting overall * system fairness. * * There are two cases, when we don't want to boost: * - UNITs that are waking up after a migration, rather than * after having block; * - UNITs of capped domains unpausing after earning credits * they had overspent. */ migrating = test_and_clear_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER && !test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) { TRACE_2D(TRC_CSCHED_BOOST_START, unit->domain->domain_id, unit->unit_id); SCHED_STAT_CRANK(unit_boost); svc->pri = CSCHED_PRI_TS_BOOST; } /* Put the UNIT on the runq and tickle CPUs */ runq_insert(svc); __runq_tickle(svc); } static void cf_check csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit) { struct csched_unit * const svc = CSCHED_UNIT(unit); /* Let the scheduler know that this vcpu is trying to yield */ set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags); } static int cf_check csched_dom_cntl( const struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched_dom * const sdom = CSCHED_DOM(d); struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; int rc = 0; /* Protect both get and put branches with the pluggable scheduler * lock. Runq lock not needed anywhere in here. */ spin_lock_irqsave(&prv->lock, flags); switch ( op->cmd ) { case XEN_DOMCTL_SCHEDOP_getinfo: op->u.credit.weight = sdom->weight; op->u.credit.cap = sdom->cap; break; case XEN_DOMCTL_SCHEDOP_putinfo: if ( op->u.credit.weight != 0 ) { if ( !list_empty(&sdom->active_sdom_elem) ) { prv->weight -= sdom->weight * sdom->active_unit_count; prv->weight += op->u.credit.weight * sdom->active_unit_count; } sdom->weight = op->u.credit.weight; } if ( op->u.credit.cap != (uint16_t)~0U ) sdom->cap = op->u.credit.cap; break; default: rc = -EINVAL; break; } spin_unlock_irqrestore(&prv->lock, flags); return rc; } static void cf_check csched_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft) { struct csched_unit *svc = CSCHED_UNIT(unit); if ( !hard ) return; /* Are we becoming exclusively pinned? 
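 *
 * Note that the flag only records that this unit's hard affinity
 * contains exactly one pCPU. The other half of the "exclusive
 * pinning" condition (no other unit pinned to that same pCPU) is
 * approximated later, in __runq_tickle(), by taking the fast path
 * only if the pCPU is idle.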
*/ if ( cpumask_weight(hard) == 1 ) set_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); else clear_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); } static inline void __csched_set_tslice(struct csched_private *prv, unsigned int timeslice_ms) { prv->tslice = MILLISECS(timeslice_ms); prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE; if ( timeslice_ms < prv->ticks_per_tslice ) prv->ticks_per_tslice = 1; prv->tick_period_us = timeslice_ms * 1000 / prv->ticks_per_tslice; prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * timeslice_ms; prv->credit = prv->credits_per_tslice * prv->ncpus; } static int cf_check csched_sys_cntl(const struct scheduler *ops, struct xen_sysctl_scheduler_op *sc) { int rc = -EINVAL; struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit; struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; switch ( sc->cmd ) { case XEN_SYSCTL_SCHEDOP_putinfo: if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN || (params->ratelimit_us && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN)) || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) goto out; spin_lock_irqsave(&prv->lock, flags); __csched_set_tslice(prv, params->tslice_ms); if ( !prv->ratelimit && params->ratelimit_us ) printk(XENLOG_INFO "Enabling context switch rate limiting\n"); else if ( prv->ratelimit && !params->ratelimit_us ) printk(XENLOG_INFO "Disabling context switch rate limiting\n"); prv->ratelimit = MICROSECS(params->ratelimit_us); prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us); spin_unlock_irqrestore(&prv->lock, flags); /* FALLTHRU */ case XEN_SYSCTL_SCHEDOP_getinfo: params->tslice_ms = prv->tslice / MILLISECS(1); params->ratelimit_us = prv->ratelimit / MICROSECS(1); params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1); rc = 0; break; } out: return rc; } static void *cf_check csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; sdom = xzalloc(struct csched_dom); if ( sdom == NULL ) return ERR_PTR(-ENOMEM); /* Initialize credit and weight */ INIT_LIST_HEAD(&sdom->active_unit); INIT_LIST_HEAD(&sdom->active_sdom_elem); sdom->dom = dom; sdom->weight = CSCHED_DEFAULT_WEIGHT; return sdom; } static void cf_check csched_free_domdata(const struct scheduler *ops, void *data) { xfree(data); } /* * This is a O(n) optimized sort of the runq. * * Time-share UNITs can only be one of two priorities, UNDER or OVER. We walk * through the runq and move up any UNDERs that are preceded by OVERS. We * remember the last UNDER to make the move up operation O(1). */ static void csched_runq_sort(struct csched_private *prv, unsigned int cpu) { struct csched_pcpu * const spc = CSCHED_PCPU(cpu); struct list_head *runq, *elem, *next, *last_under; struct csched_unit *svc_elem; spinlock_t *lock; unsigned long flags; int sort_epoch; sort_epoch = prv->runq_sort; if ( sort_epoch == spc->runq_sort_last ) return; spc->runq_sort_last = sort_epoch; lock = pcpu_schedule_lock_irqsave(cpu, &flags); runq = &spc->runq; elem = runq->next; last_under = runq; while ( elem != runq ) { next = elem->next; svc_elem = __runq_elem(elem); if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) { /* does elem need to move up the runq? 
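 *
 * Example: with a runq of [U1, O1, U2, O2] (U = UNDER, O = OVER),
 * U1 stays put, and U2 gets re-inserted right after last_under
 * (i.e. U1), giving [U1, U2, O1, O2]. The relative order within
 * each priority class is preserved.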
*/ if ( elem->prev != last_under ) { list_del(elem); list_add(elem, last_under); } last_under = elem; } elem = next; } pcpu_schedule_unlock_irqrestore(lock, flags, cpu); } static void cf_check csched_acct(void* dummy) { struct csched_private *prv = dummy; unsigned long flags; struct list_head *iter_unit, *next_unit; struct list_head *iter_sdom, *next_sdom; struct csched_unit *svc; struct csched_dom *sdom; uint32_t credit_total; uint32_t weight_total; uint32_t weight_left; uint32_t credit_fair; uint32_t credit_peak; uint32_t credit_cap; int credit_balance; int credit_xtra; int credit; spin_lock_irqsave(&prv->lock, flags); weight_total = prv->weight; credit_total = prv->credit; /* Converge balance towards 0 when it drops negative */ if ( prv->credit_balance < 0 ) { credit_total -= prv->credit_balance; SCHED_STAT_CRANK(acct_balance); } if ( unlikely(weight_total == 0) ) { prv->credit_balance = 0; spin_unlock_irqrestore(&prv->lock, flags); SCHED_STAT_CRANK(acct_no_work); goto out; } SCHED_STAT_CRANK(acct_run); weight_left = weight_total; credit_balance = 0; credit_xtra = 0; credit_cap = 0U; list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) { sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); BUG_ON( is_idle_domain(sdom->dom) ); BUG_ON( sdom->active_unit_count == 0 ); BUG_ON( sdom->weight == 0 ); BUG_ON( (sdom->weight * sdom->active_unit_count) > weight_left ); weight_left -= ( sdom->weight * sdom->active_unit_count ); /* * A domain's fair share is computed using its weight in competition * with that of all other active domains. * * At most, a domain can use credits to run all its active UNITs * for one full accounting period. We allow a domain to earn more * only when the system-wide credit balance is negative. */ credit_peak = sdom->active_unit_count * prv->credits_per_tslice; if ( prv->credit_balance < 0 ) { credit_peak += ( ( -prv->credit_balance * sdom->weight * sdom->active_unit_count) + (weight_total - 1) ) / weight_total; } if ( sdom->cap != 0U ) { credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100; if ( credit_cap < credit_peak ) credit_peak = credit_cap; /* FIXME -- set cap per-unit as well...? */ credit_cap = ( credit_cap + ( sdom->active_unit_count - 1 ) ) / sdom->active_unit_count; } credit_fair = ( ( credit_total * sdom->weight * sdom->active_unit_count ) + (weight_total - 1) ) / weight_total; if ( credit_fair < credit_peak ) { credit_xtra = 1; } else { if ( weight_left != 0U ) { /* Give other domains a chance at unused credits */ credit_total += ( ( ( credit_fair - credit_peak ) * weight_total ) + ( weight_left - 1 ) ) / weight_left; } if ( credit_xtra ) { /* * Lazily keep domains with extra credits at the head of * the queue to give others a chance at them in future * accounting periods. */ SCHED_STAT_CRANK(acct_reorder); list_del(&sdom->active_sdom_elem); list_add(&sdom->active_sdom_elem, &prv->active_sdom); } credit_fair = credit_peak; } /* Compute fair share per UNIT */ credit_fair = ( credit_fair + ( sdom->active_unit_count - 1 ) ) / sdom->active_unit_count; list_for_each_safe( iter_unit, next_unit, &sdom->active_unit ) { svc = list_entry(iter_unit, struct csched_unit, active_unit_elem); BUG_ON( sdom != svc->sdom ); /* Increment credit */ atomic_add(credit_fair, &svc->credit); credit = atomic_read(&svc->credit); /* * Recompute priority or, if UNIT is idling, remove it from * the active list. 
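 *
 * With the default 30ms timeslice, credits_per_tslice is 300
 * (CSCHED_CREDITS_PER_MSEC * 30): a unit's credits are clamped at
 * -300 below, while a unit that accumulates more than +300 is
 * deemed to be idling, stops earning, and has its credits halved.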
*/ if ( credit < 0 ) { svc->pri = CSCHED_PRI_TS_OVER; /* Park running UNITs of capped-out domains */ if ( sdom->cap != 0U && credit < -credit_cap && !test_and_set_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) { SCHED_STAT_CRANK(unit_park); sched_unit_pause_nosync(svc->unit); } /* Lower bound on credits */ if ( credit < -prv->credits_per_tslice ) { SCHED_STAT_CRANK(acct_min_credit); credit = -prv->credits_per_tslice; atomic_set(&svc->credit, credit); } } else { svc->pri = CSCHED_PRI_TS_UNDER; /* Unpark any capped domains whose credits go positive */ if ( test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) { /* * It's important to unset the flag AFTER the unpause() * call to make sure the UNIT's priority is not boosted * if it is woken up here. */ SCHED_STAT_CRANK(unit_unpark); sched_unit_unpause(svc->unit); clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags); } /* Upper bound on credits means UNIT stops earning */ if ( credit > prv->credits_per_tslice ) { __csched_unit_acct_stop_locked(prv, svc); /* Divide credits in half, so that when it starts * accounting again, it starts a little bit "ahead" */ credit /= 2; atomic_set(&svc->credit, credit); } } SCHED_UNIT_STAT_SET(svc, credit_last, credit); SCHED_UNIT_STAT_SET(svc, credit_incr, credit_fair); credit_balance += credit; } } prv->credit_balance = credit_balance; spin_unlock_irqrestore(&prv->lock, flags); /* Inform each CPU that its runq needs to be sorted */ prv->runq_sort++; out: set_timer( &prv->master_ticker, NOW() + prv->tslice); } static void cf_check csched_tick(void *_cpu) { unsigned int cpu = (unsigned long)_cpu; const struct sched_resource *sr = get_sched_res(cpu); struct csched_pcpu *spc = CSCHED_PCPU(cpu); struct csched_private *prv = CSCHED_PRIV(sr->scheduler); spc->tick++; /* * Accounting for running UNIT */ if ( !is_idle_unit(current->sched_unit) ) csched_unit_acct(prv, cpu); /* * Check if runq needs to be sorted * * Every physical CPU resorts the runq after the accounting master has * modified priorities. This is a special O(n) sort and runs at most * once per accounting period (currently 30 milliseconds). */ csched_runq_sort(prv, cpu); set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); } static struct csched_unit * csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) { const struct sched_resource *sr = get_sched_res(cpu); const struct csched_private * const prv = CSCHED_PRIV(sr->scheduler); const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); struct csched_unit *speer; struct list_head *iter; struct sched_unit *unit; ASSERT(peer_pcpu != NULL); /* * Don't steal from an idle CPU's runq because it's about to * pick up work from it itself. */ if ( unlikely(is_idle_unit(curr_on_cpu(peer_cpu))) ) goto out; list_for_each( iter, &peer_pcpu->runq ) { speer = __runq_elem(iter); /* * If next available UNIT here is not of strictly higher * priority than ours, this PCPU is useless to us. */ if ( speer->pri <= pri ) break; /* Is this UNIT runnable on our PCPU? */ unit = speer->unit; BUG_ON( is_idle_unit(unit) ); /* * If the unit is still in peer_cpu's scheduling tail, or if it * has no useful soft affinity, skip it. * * In fact, what we want is to check if we have any "soft-affine * work" to steal, before starting to look at "hard-affine work". * * Notice that, if not even one unit on this runq has a useful * soft affinity, we could have avoid considering this runq for * a soft balancing step in the first place. 
This, for instance, * can be implemented by taking note of on what runq there are * units with useful soft affinities in some sort of bitmap * or counter. */ if ( unit->is_running || (balance_step == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit)) ) continue; affinity_balance_cpumask(unit, balance_step, cpumask_scratch); if ( __csched_unit_is_migrateable(prv, unit, cpu, cpumask_scratch) ) { /* We got a candidate. Grab it! */ TRACE_3D(TRC_CSCHED_STOLEN_UNIT, peer_cpu, unit->domain->domain_id, unit->unit_id); SCHED_UNIT_STAT_CRANK(speer, migrate_q); SCHED_STAT_CRANK(migrate_queued); runq_remove(speer); sched_set_res(unit, get_sched_res(cpu)); /* * speer will start executing directly on cpu, without having to * go through runq_insert(). So we must update the runnable count * for cpu here. */ inc_nr_runnable(cpu); return speer; } } out: SCHED_STAT_CRANK(steal_peer_idle); return NULL; } static struct csched_unit * csched_load_balance(struct csched_private *prv, int cpu, struct csched_unit *snext, bool *stolen) { const struct cpupool *c = get_sched_res(cpu)->cpupool; struct csched_unit *speer; cpumask_t workers; const cpumask_t *online = c->res_valid; int peer_cpu, first_cpu, peer_node, bstep; int node = cpu_to_node(cpu); BUG_ON(get_sched_res(cpu) != snext->unit->res); /* If this CPU is going offline, we shouldn't steal work. */ if ( unlikely(!cpumask_test_cpu(cpu, online)) ) goto out; if ( snext->pri == CSCHED_PRI_IDLE ) SCHED_STAT_CRANK(load_balance_idle); else if ( snext->pri == CSCHED_PRI_TS_OVER ) SCHED_STAT_CRANK(load_balance_over); else SCHED_STAT_CRANK(load_balance_other); /* * Let's look around for work to steal, taking both hard affinity * and soft affinity into account. More specifically, we check all * the non-idle CPUs' runq, looking for: * 1. any "soft-affine work" to steal first, * 2. if not finding anything, any "hard-affine work" to steal. */ for_each_affinity_balance_step( bstep ) { /* * We peek at the non-idling CPUs in a node-wise fashion. In fact, * it is more likely that we find some affine work on our same * node, not to mention that migrating units within the same node * could well expected to be cheaper than across-nodes (memory * stays local, there might be some node-wide cache[s], etc.). */ peer_node = node; do { /* Select the pCPUs in this node that have work we can steal. */ cpumask_andnot(&workers, online, prv->idlers); cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); __cpumask_clear_cpu(cpu, &workers); first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers); if ( first_cpu >= nr_cpu_ids ) goto next_node; peer_cpu = first_cpu; do { spinlock_t *lock; /* * If there is only one runnable unit on peer_cpu, it means * there's no one to be stolen in its runqueue, so skip it. * * Checking this without holding the lock is racy... But that's * the whole point of this optimization! * * In more details: * - if we race with dec_nr_runnable(), we may try to take the * lock and call csched_runq_steal() for no reason. This is * not a functional issue, and should be infrequent enough. 
* And we can avoid that by re-checking nr_runnable after * having grabbed the lock, if we want; * - if we race with inc_nr_runnable(), we skip a pCPU that may * have runnable units in its runqueue, but that's not a * problem because: * + if racing with csched_unit_insert() or csched_unit_wake(), * __runq_tickle() will be called afterwords, so the unit * won't get stuck in the runqueue for too long; * + if racing with csched_runq_steal(), it may be that an * unit that we could have picked up, stays in a runqueue * until someone else tries to steal it again. But this is * no worse than what can happen already (without this * optimization), it the pCPU would schedule right after we * have taken the lock, and hence block on it. */ if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 ) { TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0); goto next_cpu; } /* * Get ahold of the scheduler lock for this peer CPU. * * Note: We don't spin on this lock but simply try it. Spinning * could cause a deadlock if the peer CPU is also load * balancing and trying to lock this CPU. */ lock = pcpu_schedule_trylock(peer_cpu); SCHED_STAT_CRANK(steal_trylock); if ( !lock ) { SCHED_STAT_CRANK(steal_trylock_failed); TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0); goto next_cpu; } TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1); /* Any work over there to steal? */ speer = cpumask_test_cpu(peer_cpu, online) ? csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; pcpu_schedule_unlock(lock, peer_cpu); /* As soon as one unit is found, balancing ends */ if ( speer != NULL ) { *stolen = true; /* * Next time we'll look for work to steal on this node, we * will start from the next pCPU, with respect to this one, * so we don't risk stealing always from the same ones. */ prv->balance_bias[peer_node] = peer_cpu; return speer; } next_cpu: peer_cpu = cpumask_cycle(peer_cpu, &workers); } while( peer_cpu != first_cpu ); next_node: peer_node = cycle_node(peer_node, node_online_map); } while( peer_node != node ); } out: /* Failed to find more important work elsewhere... */ __runq_remove(snext); return snext; } /* * This function is in the critical path. It is designed to be simple and * fast for the common case. */ static void cf_check csched_schedule( const struct scheduler *ops, struct sched_unit *unit, s_time_t now, bool tasklet_work_scheduled) { const unsigned int cur_cpu = smp_processor_id(); const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); struct csched_pcpu *spc = CSCHED_PCPU(cur_cpu); struct list_head * const runq = RUNQ(sched_cpu); struct csched_unit * const scurr = CSCHED_UNIT(unit); struct csched_private *prv = CSCHED_PRIV(ops); struct csched_unit *snext; s_time_t runtime, tslice; bool migrated = false; SCHED_STAT_CRANK(schedule); CSCHED_UNIT_CHECK(unit); /* * Here in Credit1 code, we usually just call TRACE_nD() helpers, and * don't care about packing. But scheduling happens very often, so it * actually is important that the record is as small as possible. */ if ( unlikely(tb_init_done) ) { struct { unsigned cpu:16, tasklet:8, idle:8; } d; d.cpu = cur_cpu; d.tasklet = tasklet_work_scheduled; d.idle = is_idle_unit(unit); __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d), (unsigned char *)&d); } runtime = now - unit->state_entry_time; if ( runtime < 0 ) /* Does this ever happen? */ runtime = 0; if ( !is_idle_unit(unit) ) { /* Update credits of a non-idle UNIT. 
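 *
 * burn_credits() charges CSCHED_CREDITS_PER_MSEC (i.e. 10) credits
 * for each millisecond scurr has run since start_time, keeping the
 * sub-millisecond remainder in svc->residual so that nothing is
 * lost to rounding.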
*/ burn_credits(scurr, now); scurr->start_time -= now; scurr->last_sched_time = now; } else { /* Re-instate a boosted idle UNIT as normal-idle. */ scurr->pri = CSCHED_PRI_IDLE; } /* Choices, choices: * - If we have a tasklet, we need to run the idle unit no matter what. * - If sched rate limiting is in effect, and the current unit has * run for less than that amount of time, continue the current one, * but with a shorter timeslice and return it immediately * - Otherwise, chose the one with the highest priority (which may * be the one currently running) * - If the currently running one is TS_OVER, see if there * is a higher priority one waiting on the runqueue of another * cpu and steal it. */ /* * If we have schedule rate limiting enabled, check to see * how long we've run for. * * If scurr is yielding, however, we don't let rate limiting kick in. * In fact, it may be the case that scurr is about to spin, and there's * no point forcing it to do so until rate limiting expires. */ if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags) && !tasklet_work_scheduled && prv->ratelimit && unit_runnable_state(unit) && !is_idle_unit(unit) && runtime < prv->ratelimit ) { snext = scurr; snext->start_time += now; perfc_incr(delay_ms); /* * Next timeslice must last just until we'll have executed for * ratelimit. However, to avoid setting a really short timer, which * will most likely be inaccurate and counterproductive, we never go * below CSCHED_MIN_TIMER. */ tslice = prv->ratelimit - runtime; if ( unlikely(runtime < CSCHED_MIN_TIMER) ) tslice = CSCHED_MIN_TIMER; if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; unsigned runtime; } d; d.dom = unit->domain->domain_id; d.unit = unit->unit_id; d.runtime = runtime; __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d), (unsigned char *)&d); } goto out; } tslice = prv->tslice; /* * Select next runnable local UNIT (ie top of local runq) */ if ( unit_runnable(unit) ) __runq_insert(scurr); else { BUG_ON( is_idle_unit(unit) || list_empty(runq) ); /* Current has blocked. Update the runnable counter for this cpu. */ dec_nr_runnable(sched_cpu); } /* * Clear YIELD flag before scheduling out */ clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags); do { snext = __runq_elem(runq->next); /* Tasklet work (which runs in idle UNIT context) overrides all else. */ if ( tasklet_work_scheduled ) { TRACE_0D(TRC_CSCHED_SCHED_TASKLET); snext = CSCHED_UNIT(sched_idle_unit(sched_cpu)); snext->pri = CSCHED_PRI_TS_BOOST; } /* * SMP Load balance: * * If the next highest priority local runnable UNIT has already eaten * through its credits, look on other PCPUs to see if we have more * urgent work... If not, csched_load_balance() will return snext, but * already removed from the runq. */ if ( snext->pri > CSCHED_PRI_TS_OVER ) __runq_remove(snext); else snext = csched_load_balance(prv, sched_cpu, snext, &migrated); } while ( !unit_runnable_state(snext->unit) ); /* * Update idlers mask if necessary. When we're idling, other CPUs * will tickle us when they get extra work. */ if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE ) { if ( !cpumask_test_cpu(sched_cpu, prv->idlers) ) cpumask_set_cpu(sched_cpu, prv->idlers); } else if ( cpumask_test_cpu(sched_cpu, prv->idlers) ) { cpumask_clear_cpu(sched_cpu, prv->idlers); } if ( !is_idle_unit(snext->unit) ) snext->start_time += now; out: /* * Return task to run next... */ unit->next_time = (is_idle_unit(snext->unit) ? 
-1 : tslice); unit->next_task = snext->unit; snext->unit->migrated = migrated; /* Stop credit tick when going to idle, restart it when coming from idle. */ if ( !is_idle_unit(unit) && is_idle_unit(unit->next_task) ) stop_timer(&spc->ticker); if ( is_idle_unit(unit) && !is_idle_unit(unit->next_task) ) set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us) - now % MICROSECS(prv->tick_period_us) ); CSCHED_UNIT_CHECK(unit->next_task); } static void csched_dump_unit(const struct csched_unit *svc) { struct csched_dom * const sdom = svc->sdom; printk("[%i.%i] pri=%i flags=%x cpu=%i", svc->unit->domain->domain_id, svc->unit->unit_id, svc->pri, svc->flags, sched_unit_master(svc->unit)); if ( sdom ) { printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit), sdom->weight, sdom->cap); #ifdef CSCHED_STATS printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}", svc->stats.credit_last, svc->stats.credit_incr, svc->stats.state_active, svc->stats.state_idle, svc->stats.migrate_q, svc->stats.migrate_r, svc->stats.kicked_away); #endif } printk("\n"); } static void cf_check csched_dump_pcpu(const struct scheduler *ops, int cpu) { const struct list_head *runq; struct list_head *iter; struct csched_private *prv = CSCHED_PRIV(ops); const struct csched_pcpu *spc; const struct csched_unit *svc; spinlock_t *lock; unsigned long flags; int loop; /* * We need both locks: * - csched_dump_unit() wants to access domains' scheduling * parameters, which are protected by the private scheduler lock; * - we scan through the runqueue, so we need the proper runqueue * lock (the one of the runqueue of this cpu). */ spin_lock_irqsave(&prv->lock, flags); lock = pcpu_schedule_lock(cpu); spc = CSCHED_PCPU(cpu); runq = &spc->runq; printk("CPU[%02d] nr_run=%d, sort=%d, sibling={%*pbl}, core={%*pbl}\n", cpu, spc->nr_runnable, spc->runq_sort_last, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); /* current UNIT (nothing to say if that's the idle unit). 
*/ svc = CSCHED_UNIT(curr_on_cpu(cpu)); if ( svc && !is_idle_unit(svc->unit) ) { printk("\trun: "); csched_dump_unit(svc); } loop = 0; list_for_each( iter, runq ) { svc = __runq_elem(iter); if ( svc ) { printk("\t%3d: ", ++loop); csched_dump_unit(svc); } } pcpu_schedule_unlock(lock, cpu); spin_unlock_irqrestore(&prv->lock, flags); } static void cf_check csched_dump(const struct scheduler *ops) { struct list_head *iter_sdom, *iter_svc; struct csched_private *prv = CSCHED_PRIV(ops); int loop; unsigned long flags; spin_lock_irqsave(&prv->lock, flags); printk("info:\n" "\tncpus = %u\n" "\tmaster = %u\n" "\tcredit = %u\n" "\tcredit balance = %d\n" "\tweight = %u\n" "\trunq_sort = %u\n" "\tdefault-weight = %d\n" "\ttslice = %"PRI_stime"ms\n" "\tratelimit = %"PRI_stime"us\n" "\tcredits per msec = %d\n" "\tticks per tslice = %d\n" "\tmigration delay = %"PRI_stime"us\n", prv->ncpus, prv->master, prv->credit, prv->credit_balance, prv->weight, prv->runq_sort, CSCHED_DEFAULT_WEIGHT, prv->tslice / MILLISECS(1), prv->ratelimit / MICROSECS(1), CSCHED_CREDITS_PER_MSEC, prv->ticks_per_tslice, prv->unit_migr_delay/ MICROSECS(1)); printk("idlers: %*pb\n", CPUMASK_PR(prv->idlers)); printk("active units:\n"); loop = 0; list_for_each( iter_sdom, &prv->active_sdom ) { const struct csched_dom *sdom; sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); list_for_each( iter_svc, &sdom->active_unit ) { const struct csched_unit *svc; spinlock_t *lock; svc = list_entry(iter_svc, struct csched_unit, active_unit_elem); lock = unit_schedule_lock(svc->unit); printk("\t%3d: ", ++loop); csched_dump_unit(svc); unit_schedule_unlock(lock, svc->unit); } } spin_unlock_irqrestore(&prv->lock, flags); } static int __init cf_check csched_global_init(void) { if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN ) { printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n" " Resetting to default %u\n", XEN_SYSCTL_CSCHED_TSLICE_MIN, XEN_SYSCTL_CSCHED_TSLICE_MAX, CSCHED_DEFAULT_TSLICE_MS); sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; } if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) printk("WARNING: sched_ratelimit_us >" "sched_credit_tslice_ms is undefined\n" "Setting ratelimit to tslice\n"); if ( vcpu_migration_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) { vcpu_migration_delay_us = 0; printk("WARNING: vcpu_migration_delay outside of valid range [0,%d]us.\n" "Resetting to default: %u\n", XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us); } return 0; } static int cf_check csched_init(struct scheduler *ops) { struct csched_private *prv; prv = xzalloc(struct csched_private); if ( prv == NULL ) return -ENOMEM; prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES); if ( prv->balance_bias == NULL ) { xfree(prv); return -ENOMEM; } if ( !zalloc_cpumask_var(&prv->cpus) || !zalloc_cpumask_var(&prv->idlers) ) { free_cpumask_var(prv->cpus); xfree(prv->balance_bias); xfree(prv); return -ENOMEM; } ops->sched_data = prv; spin_lock_init(&prv->lock); INIT_LIST_HEAD(&prv->active_sdom); prv->master = UINT_MAX; __csched_set_tslice(prv, sched_credit_tslice_ms); if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) prv->ratelimit = prv->tslice; else prv->ratelimit = MICROSECS(sched_ratelimit_us); prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us); return 0; } static void cf_check csched_deinit(struct scheduler *ops) { struct csched_private *prv; prv = CSCHED_PRIV(ops); if ( prv != NULL ) { 
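        /*
         * Undo csched_init(): detach the private data from ops, then free
         * the idlers/cpus masks, the per-node balance_bias array and the
         * private structure itself.
         */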
ops->sched_data = NULL; free_cpumask_var(prv->cpus); free_cpumask_var(prv->idlers); xfree(prv->balance_bias); xfree(prv); } } static const struct scheduler sched_credit_def = { .name = "SMP Credit Scheduler", .opt_name = "credit", .sched_id = XEN_SCHEDULER_CREDIT, .sched_data = NULL, .global_init = csched_global_init, .insert_unit = csched_unit_insert, .remove_unit = csched_unit_remove, .sleep = csched_unit_sleep, .wake = csched_unit_wake, .yield = csched_unit_yield, .adjust = csched_dom_cntl, .adjust_affinity= csched_aff_cntl, .adjust_global = csched_sys_cntl, .pick_resource = csched_res_pick, .do_schedule = csched_schedule, .dump_cpu_state = csched_dump_pcpu, .dump_settings = csched_dump, .init = csched_init, .deinit = csched_deinit, .alloc_udata = csched_alloc_udata, .free_udata = csched_free_udata, .alloc_pdata = csched_alloc_pdata, .deinit_pdata = csched_deinit_pdata, .free_pdata = csched_free_pdata, .switch_sched = csched_switch_sched, .alloc_domdata = csched_alloc_domdata, .free_domdata = csched_free_domdata, .move_timers = csched_move_timers, }; REGISTER_SCHEDULER(sched_credit_def);
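
/*
 * Usage notes (illustrative):
 *
 * The scheduler is selected at boot with "sched=credit"; the timeslice,
 * rate limit and migration delay handled above can be tuned there too,
 * for instance:
 *
 *   sched=credit sched_credit_tslice_ms=10 sched_ratelimit_us=1000
 *       vcpu_migration_delay=1000
 *
 * At run time, per-domain weight/cap and the system-wide parameters are
 * adjusted through csched_dom_cntl() / csched_sys_cntl(), typically via
 * the toolstack, e.g. (domain name is just a placeholder):
 *
 *   xl sched-credit -d mydomain -w 512 -c 50    # weight 512, cap 50%
 *   xl sched-credit -s -t 10 -r 1000            # 10ms tslice, 1ms ratelimit
 */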