/**************************************************************************** * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd **************************************************************************** * * File: common/sched_credit2.c * Author: George Dunlap * * Description: Credit-based SMP CPU scheduler * Based on an earlier verson by Emmanuel Ackaouy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "private.h" /* Meant only for helping developers during debugging. */ /* #define d2printk printk */ #define d2printk(x...) /* * Credit2 tracing events ("only" 512 available!). Check * include/public/trace.h for more details. */ #define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) #define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) #define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) #define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) #define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) #define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) #define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) #define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) #define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) #define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) #define TRC_CSCHED2_UPDATE_UNIT_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) #define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) #define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13) #define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14) #define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15) #define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16) #define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17) #define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19) #define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20) #define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21) #define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22) #define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23) /* * TODO: * + Hyperthreading * - "Discount" time run on a thread with busy siblings * + Algorithm: * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., * a flash animation in the background) can we schedule it with low enough latency * so that audio doesn't skip? * + Optimizing * - Profiling, making new algorithms, making math more efficient (no long division) */ /* * Design: * * VMs "burn" credits based on their weight; higher weight means * credits burn more slowly. The highest weight unit burns credits at * a rate of 1 credit per nanosecond. Others burn proportionally * more. * * units are inserted into the runqueue by credit order. * * Credits are "reset" when the next unit in the runqueue is less than * or equal to zero. At that point, everyone's credits are "clipped" * to a small value, and a fixed credit is added to everyone. */ /* * Utilization cap: * * Setting an pCPU utilization cap for a domain means the following: * * - a domain can have a cap, expressed in terms of % of physical CPU time. * A domain that must not use more than 1/4 of _one_ physical CPU, will * be given a cap of 25%; a domain that must not use more than 1+1/2 of * physical CPU time, will be given a cap of 150%; * * - caps are per-domain (not per-unit). If a domain has only 1 unit, and * a 40% cap, that one unit will use 40% of one pCPU. 
If a domain has 4
 * units, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be
 * split among the 4 units. How much each of the units will actually get,
 * during any given interval of time, is unspecified (as it depends on
 * various aspects: workload, system load, etc.). For instance, it is
 * possible that, during a given time interval, 2 units use 100% each,
 * and the other two use nothing; while during another time interval,
 * two units use 80%, one uses 10% and the other 30%; or that each uses
 * 50% (and so on and so forth).
 *
 * For implementing this, we use the following approach:
 *
 * - each domain is given a 'budget', and each domain has a timer, which
 *   replenishes the domain's budget periodically. The budget is the amount
 *   of time the units of the domain can use every 'period';
 *
 * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
 *   (but each domain has its own timer; so they are all periodic with the
 *   same period, but replenishment of the budgets of the various domains,
 *   at period boundaries, is not synchronous);
 *
 * - when units run, they consume budget. When they don't run, they don't
 *   consume budget. If there is no budget left for the domain, no unit of
 *   that domain can run. If an unit tries to run and finds that there is no
 *   budget, it blocks.
 *   At whatever time an unit wants to run, it must check the domain's budget,
 *   and if there is some, it can use it.
 *
 * - budget is replenished to the top of the capacity for the domain once
 *   per period. Even if there was some leftover budget from the previous
 *   period, though, the budget after a replenishment will always be at most
 *   equal to the total capacity of the domain ('tot_budget');
 *
 * - when a budget replenishment occurs, if there are units that had been
 *   blocked because of lack of budget, they'll be unblocked, and they will
 *   (potentially) be able to run again.
 *
 * Finally, some even more implementation-related details:
 *
 * - budget is stored in a domain-wide pool. Units of the domain that want
 *   to run go to such pool, and grab some. When they do so, the amount
 *   they grabbed is _immediately_ removed from the pool. This happens in
 *   unit_grab_budget();
 *
 * - when units stop running, if they've not consumed all the budget they
 *   took, the leftover is put back in the pool. This happens in
 *   unit_return_budget();
 *
 * - the above means that an unit can find out that there is no budget and
 *   block, not only if the cap has actually been reached (for this period),
 *   but also if some other units, in order to run, have grabbed a certain
 *   quota of budget, no matter whether they've already used it all or not.
 *   An unit blocking because of (any form of) lack of budget is said to be
 *   "parked", and such blocking happens in park_unit();
 *
 * - when an unit stops running, and puts back some budget in the domain pool,
 *   we need to check whether there is someone which has been parked and that
 *   can be unparked. This happens in unpark_parked_units(), called from
 *   csched2_context_saved();
 *
 * - of course, unparking happens also as a consequence of the domain's budget
 *   being replenished by the periodic timer. This also occurs by means of
 *   calling csched2_context_saved() (but from replenish_domain_budget());
 *
 * - parked units of a domain are kept in a (per-domain) list, called
 *   'parked_units'. Manipulation of the list and of the domain-wide budget
 *   pool must occur only when holding the 'budget_lock'.
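 *
 * As a worked example (the per-domain numbers here are illustrative
 * assumptions; only the 10ms period is the actual default): with
 * opt_cap_period = 10, CSCHED2_BDGT_REPL_PERIOD is 10ms, so a domain with
 * a 150% cap has tot_budget = 15ms of pCPU time per period. Every 10ms its
 * timer refills the pool up to (at most) those 15ms; units grab chunks of
 * that budget when they run (unit_grab_budget()), return what they did not
 * consume (unit_return_budget()), and get parked when the pool runs dry,
 * until the next replenishment.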
 */

/*
 * Locking:
 *
 * - runqueue lock
 *  + it is per-runqueue, so:
 *   * cpus in a runqueue take the runqueue lock, when using
 *     pcpu_schedule_lock() / unit_schedule_lock() (and friends),
 *   * a cpu may (try to) take a "remote" runqueue lock, e.g., for
 *     load balancing;
 *  + serializes runqueue operations (removing and inserting units);
 *  + protects runqueue-wide data in csched2_runqueue_data;
 *  + protects unit parameters in csched2_unit for the unit in the
 *    runqueue.
 *
 * - Private scheduler lock
 *  + protects scheduler-wide data in csched2_private, such as:
 *   * the list of domains active in this scheduler,
 *   * what cpus and what runqueues are active and in what
 *     runqueue each cpu is;
 *  + serializes the operation of changing the weights of domains;
 *
 * - Budget lock
 *  + it is per-domain;
 *  + protects, in domains that have a utilization cap:
 *   * manipulation of the total budget of the domain (as it is shared
 *     among all units of the domain),
 *   * manipulation of the list of units that are blocked waiting for
 *     some budget to be available.
 *
 * - Type:
 *  + runqueue locks are 'regular' spinlocks;
 *  + the private scheduler lock can be an rwlock. In fact, data
 *    it protects is modified only during initialization, cpupool
 *    manipulation and when changing weights, and read in all
 *    other cases (e.g., during load balancing);
 *  + budget locks are 'regular' spinlocks.
 *
 * Ordering:
 *  + trylock must be used when wanting to take a runqueue lock,
 *    if we already hold another one;
 *  + if taking both a runqueue lock and the private scheduler
 *    lock, the latter must always be taken first;
 *  + if taking both a runqueue lock and a budget lock, the former
 *    must always be taken first.
 */

/*
 * Basic constants
 */
/* Default weight: How much a new domain starts with. */
#define CSCHED2_DEFAULT_WEIGHT       256
/*
 * Min timer: Minimum length a timer will be set, to
 * achieve efficiency.
 */
#define CSCHED2_MIN_TIMER            MICROSECS(500)
/*
 * Amount of credit VMs begin with, and are reset to.
 * ATM, set so that highest-weight VMs can only run for 10ms
 * before a reset event.
 */
#define CSCHED2_CREDIT_INIT          MILLISECS(10)
/*
 * Minimum amount of credits VMs can have. Ideally, no VM would get
 * close to this (unless a vCPU manages to execute for a really long
 * time uninterrupted). In case it happens, it makes no sense to
 * track even deeper undershoots.
 *
 * NOTE: If making this smaller than -CSCHED2_CREDIT_INIT, adjust
 * reset_credit() accordingly.
 */
#define CSCHED2_CREDIT_MIN           (-CSCHED2_CREDIT_INIT)
/*
 * Amount of credit the idle units have. It never changes, as idle
 * units do not consume credits, and it must be lower than whatever
 * amount of credit a 'regular' unit would end up with.
 */
#define CSCHED2_IDLE_CREDIT          (CSCHED2_CREDIT_MIN-1)
/*
 * Carryover: How much "extra" credit may be carried over after
 * a reset.
 */
#define CSCHED2_CARRYOVER_MAX        CSCHED2_MIN_TIMER
/*
 * Stickiness: Cross-L2 migration resistance. Should be less than
 * MIN_TIMER.
 */
#define CSCHED2_MIGRATE_RESIST       ((opt_migrate_resist)*MICROSECS(1))
/* How much to "compensate" an unit for L2 migration. */
#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50)
/* How tolerant we should be when peeking at runtime of units on other cpus */
#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50)
/* Reset: Value below which credit will be reset. */
#define CSCHED2_CREDIT_RESET         0
/* Max timer: Maximum time a guest can be run for. */
#define CSCHED2_MAX_TIMER            CSCHED2_CREDIT_INIT
/* Period of the cap replenishment timer.
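 * Settable with the credit2_cap_period_ms boot parameter (stored in
 * opt_cap_period, defined below); the default is 10ms.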
*/ #define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1)) /* * Flags */ /* * CSFLAG_scheduled: Is this unit either running on, or context-switching off, * a physical cpu? * + Accessed only with runqueue lock held * + Set when chosen as next in csched2_schedule(). * + Cleared after context switch has been saved in csched2_context_saved() * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should * set CSFLAG_delayed_runq_add * + Checked to be false in runq_insert. */ #define __CSFLAG_scheduled 1 #define CSFLAG_scheduled (1U<<__CSFLAG_scheduled) /* * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done * being context switched out? * + Set when scheduling out in csched2_schedule() if prev is runnable * + Set in csched2_unit_wake if it finds CSFLAG_scheduled set * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and * clears the bit. */ #define __CSFLAG_delayed_runq_add 2 #define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add) /* * CSFLAG_runq_migrate_request: This unit is being migrated as a result of a * credit2-initiated runq migrate request; migrate it to the runqueue indicated * in the svc struct. */ #define __CSFLAG_runq_migrate_request 3 #define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request) /* * CSFLAG_unit_yield: this unit was running, and has called vcpu_yield(). The * scheduler is invoked to see if we can give the cpu to someone else, and * get back to the yielding unit in a while. */ #define __CSFLAG_unit_yield 4 #define CSFLAG_unit_yield (1U<<__CSFLAG_unit_yield) /* * CSFLAGS_pinned: this unit is currently 'pinned', i.e., has its hard * affinity set to one and only 1 cpu (and, hence, can only run there). */ #define __CSFLAG_pinned 5 #define CSFLAG_pinned (1U<<__CSFLAG_pinned) static unsigned int __read_mostly opt_migrate_resist = 500; integer_param("sched_credit2_migrate_resist", opt_migrate_resist); /* * Load tracking and load balancing * * Load history of runqueues and units is accounted for by using an * exponential weighted moving average algorithm. However, instead of using * fractions,we shift everything to left by the number of bits we want to * use for representing the fractional part (Q-format). * * We may also want to reduce the precision of time accounting, to * accommodate 'longer windows'. So, if that is the case, we just need to * shift all time samples to the right. * * The details of the formulas used for load tracking are explained close to * update_runq_load(). Let's just say here that, with full nanosecond time * granularity, a 30 bits wide 'decaying window' is ~1 second long. * * We want to consider the following equations: * * avg[0] = load*P * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W * * where W is the length of the window, P the multiplier for transitiong into * Q-format fixed point arithmetic and load is the instantaneous load of a * runqueue, which basically is the number of runnable units there are on the * runqueue (for the meaning of the other terms, look at the doc comment to * update_runq_load()). * * So, again, with full nanosecond granularity, and 1 second window, we have: * * W = 2^30 * P = 2^18 * * The maximum possible value for the average load, which we want to store in * s_time_t type variables (i.e., we have 63 bits available) is load*P. This * means that, with P 18 bits wide, load can occupy 45 bits. This in turn * means we can have 2^45 units in each runqueue, before overflow occurs! 
* * However, it can happen that, at step j+1, if: * * avg[j] = load*P * delta = W * * then: * * avg[j+i] = avg[j] + W*load*P/W - W*load*P/W * * So we must be able to deal with W*load*P. This means load can't be higher * than: * * 2^(63 - 30 - 18) = 2^15 = 32768 * * So 32768 is the maximum number of units the we can have in a runqueue, * at any given time, and still not have problems with the load tracking * calculations... and this is more than fine. * * As a matter of fact, since we are using microseconds granularity, we have * W=2^20. So, still with 18 fractional bits and a 1 second long window, there * may be 2^25 = 33554432 units in a runq before we have to start thinking * about overflow. */ /* If >0, decreases the granularity of time samples used for load tracking. */ #define LOADAVG_GRANULARITY_SHIFT (10) /* Time window during which we still give value to previous load history. */ #define LOADAVG_WINDOW_SHIFT (30) /* 18 bits by default (and not less than 4) for decimals. */ #define LOADAVG_PRECISION_SHIFT (18) #define LOADAVG_PRECISION_SHIFT_MIN (4) /* * Both the length of the window and the number of fractional bits can be * decided with boot parameters. * * The length of the window is always expressed in nanoseconds. The actual * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT. */ static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT; integer_param("credit2_load_window_shift", opt_load_window_shift); static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT; integer_param("credit2_load_precision_shift", opt_load_precision_shift); static int __read_mostly opt_underload_balance_tolerance = 0; integer_param("credit2_balance_under", opt_underload_balance_tolerance); static int __read_mostly opt_overload_balance_tolerance = -3; integer_param("credit2_balance_over", opt_overload_balance_tolerance); /* * Domains subject to a cap receive a replenishment of their runtime budget * once every opt_cap_period interval. Default is 10 ms. The amount of budget * they receive depends on their cap. For instance, a domain with a 50% cap * will receive 50% of 10 ms, so 5 ms. */ static unsigned int __read_mostly opt_cap_period = 10; /* ms */ integer_param("credit2_cap_period_ms", opt_cap_period); /* * Runqueue organization. * * The various cpus are to be assigned each one to a runqueue, and we * want that to happen basing on topology. At the moment, it is possible * to choose to arrange runqueues to be: * * - per-cpu: meaning that there will be one runqueue per logical cpu. This * will happen when if the opt_runqueue parameter is set to 'cpu'. * * - per-core: meaning that there will be one runqueue per each physical * core of the host. This will happen if the opt_runqueue * parameter is set to 'core'; * * - per-socket: meaning that there will be one runqueue per each physical * socket (AKA package, which often, but not always, also * matches a NUMA node) of the host; This will happen if * the opt_runqueue parameter is set to 'socket'; * * - per-node: meaning that there will be one runqueue per each physical * NUMA node of the host. This will happen if the opt_runqueue * parameter is set to 'node'; * * - global: meaning that there will be only one runqueue to which all the * (logical) processors of the host belong. This will happen if * the opt_runqueue parameter is set to 'all'. 
* * Depending on the value of opt_runqueue, therefore, cpus that are part of * either the same physical core, the same physical socket, the same NUMA * node, or just all of them, will be put together to form runqueues. */ #define OPT_RUNQUEUE_CPU 0 #define OPT_RUNQUEUE_CORE 1 #define OPT_RUNQUEUE_SOCKET 2 #define OPT_RUNQUEUE_NODE 3 #define OPT_RUNQUEUE_ALL 4 static const char *const opt_runqueue_str[] = { [OPT_RUNQUEUE_CPU] = "cpu", [OPT_RUNQUEUE_CORE] = "core", [OPT_RUNQUEUE_SOCKET] = "socket", [OPT_RUNQUEUE_NODE] = "node", [OPT_RUNQUEUE_ALL] = "all" }; static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET; static int __init cf_check parse_credit2_runqueue(const char *s) { unsigned int i; for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ ) { if ( !strcmp(s, opt_runqueue_str[i]) ) { opt_runqueue = i; return 0; } } return -EINVAL; } custom_param("credit2_runqueue", parse_credit2_runqueue); /* * How many CPUs will be put, at most, in each runqueue. * * Runqueues are still arranged according to the host topology (and according * to the value of the 'credit2_runqueue' parameter). But we also have a cap * to the number of CPUs that share runqueues. * * This should be considered an upper limit. In fact, we also try to balance * the number of CPUs in each runqueue. And, when doing that, it is possible * that fewer CPUs than what this parameters mandates will actually be put * in each runqueue. */ #define MAX_CPUS_RUNQ 16 static unsigned int __read_mostly opt_max_cpus_runqueue = MAX_CPUS_RUNQ; integer_param("sched_credit2_max_cpus_runqueue", opt_max_cpus_runqueue); /* * Per-runqueue data */ struct csched2_runqueue_data { spinlock_t lock; /* Lock for this runqueue */ struct list_head rql; /* List of runqueues */ struct list_head runq; /* Ordered list of runnable vms */ unsigned int refcnt; /* How many CPUs reference this runqueue */ /* (including not yet active ones) */ unsigned int nr_cpus; /* How many CPUs are sharing this runqueue */ /* (only active ones) */ int id; /* ID of this runqueue (-1 if invalid) */ int load; /* Instantaneous load (num of non-idle units) */ s_time_t load_last_update; /* Last time average was updated */ s_time_t avgload; /* Decaying queue load */ s_time_t b_avgload; /* Decaying queue load modified by balancing */ cpumask_t active, /* CPUs enabled for this runqueue */ smt_idle, /* Fully idle-and-untickled cores (see below) */ tickled, /* Have been asked to go through schedule */ idle; /* Currently idle pcpus */ struct list_head svc; /* List of all units assigned to the runqueue */ unsigned int max_weight; /* Max weight of the units in this runqueue */ unsigned int pick_bias; /* Last picked pcpu. 
Start from it next time */ }; /* * System-wide private data */ struct csched2_private { rwlock_t lock; /* Private scheduler lock */ unsigned int load_precision_shift; /* Precision of load calculations */ unsigned int load_window_shift; /* Lenght of load decaying window */ unsigned int ratelimit_us; /* Rate limiting for this scheduler */ unsigned int active_queues; /* Number of active runqueues */ struct list_head rql; /* List of runqueues */ cpumask_t initialized; /* CPUs part of this scheduler */ struct list_head sdom; /* List of domains (for debug key) */ }; /* * Physical CPU */ struct csched2_pcpu { cpumask_t sibling_mask; /* Siblings in the same runqueue */ struct csched2_runqueue_data *rqd; /* Runqueue for this CPU */ }; /* * Schedule Unit */ struct csched2_unit { struct csched2_dom *sdom; /* Up-pointer to domain */ struct sched_unit *unit; /* Up-pointer, to schedule unit */ struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */ int credit; /* Current amount of credit */ unsigned int weight; /* Weight of this unit */ unsigned int residual; /* Reminder of div(max_weight/weight) */ unsigned flags; /* Status flags (16 bits would be ok, */ s_time_t budget; /* Current budget (if domains has cap) */ /* but clear_bit() does not like that) */ s_time_t budget_quota; /* Budget to which unit is entitled */ s_time_t start_time; /* Time we were scheduled (for credit) */ /* Individual contribution to load */ s_time_t load_last_update; /* Last time average was updated */ s_time_t avgload; /* Decaying queue load */ struct list_head runq_elem; /* On the runqueue (rqd->runq) */ struct list_head parked_elem; /* On the parked_units list */ struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */ struct csched2_runqueue_data *migrate_rqd; /* Pre-determined migr. target */ int tickled_cpu; /* Cpu that will pick us (-1 if none) */ }; /* * Domain */ struct csched2_dom { struct domain *dom; /* Up-pointer to domain */ spinlock_t budget_lock; /* Serialized budget calculations */ s_time_t tot_budget; /* Total amount of budget */ s_time_t budget; /* Currently available budget */ struct timer repl_timer; /* Timer for periodic replenishment of budget */ s_time_t next_repl; /* Time at which next replenishment occurs */ struct list_head parked_units; /* List of CPUs waiting for budget */ struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list */ uint16_t weight; /* User specified weight */ uint16_t cap; /* User specified cap */ uint16_t nr_units; /* Number of units of this domain */ }; /* * Accessor helpers functions. */ static inline struct csched2_private *csched2_priv(const struct scheduler *ops) { return ops->sched_data; } static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu) { return get_sched_res(cpu)->sched_priv; } static inline struct csched2_unit *csched2_unit(const struct sched_unit *unit) { return unit->priv; } static inline struct csched2_dom *csched2_dom(const struct domain *d) { return d->sched_priv; } /* CPU to runq_id macro */ static inline int c2r(unsigned int cpu) { return csched2_pcpu(cpu)->rqd->id; } /* CPU to runqueue struct macro */ static inline struct csched2_runqueue_data *c2rqd(unsigned int cpu) { return csched2_pcpu(cpu)->rqd; } /* Does the domain of this unit have a cap? */ static inline bool has_cap(const struct csched2_unit *svc) { return svc->budget != STIME_MAX; } /* * Hyperthreading (SMT) support. 
* * We use a special per-runq mask (smt_idle) and update it according to the * following logic: * - when _all_ the SMT sibling in a core are idle, all their corresponding * bits are set in the smt_idle mask; * - when even _just_one_ of the SMT siblings in a core is not idle, all the * bits correspondings to it and to all its siblings are clear in the * smt_idle mask. * * Once we have such a mask, it is easy to implement a policy that, either: * - uses fully idle cores first: it is enough to try to schedule the units * on pcpus from smt_idle mask first. This is what happens if * sched_smt_power_savings was not set at boot (default), and it maximizes * true parallelism, and hence performance; * - uses already busy cores first: it is enough to try to schedule the units * on pcpus that are idle, but are not in smt_idle. This is what happens if * sched_smt_power_savings is set at boot, and it allows as more cores as * possible to stay in low power states, minimizing power consumption. * * This logic is entirely implemented in runq_tickle(), and that is enough. * In fact, in this scheduler, placement of an unit on one of the pcpus of a * runq, _always_ happens by means of tickling: * - when an unit wakes up, it calls csched2_unit_wake(), which calls * runq_tickle(); * - when a migration is initiated in schedule.c, we call csched2_res_pick(), * csched2_unit_migrate() (which calls migrate()) and csched2_unit_wake(). * csched2_res_pick() looks for the least loaded runq and return just any * of its processors. Then, csched2_unit_migrate() just moves the unit to * the chosen runq, and it is again runq_tickle(), called by * csched2_unit_wake() that actually decides what pcpu to use within the * chosen runq; * - when a migration is initiated in sched_credit2.c, by calling migrate() * directly, that again temporarily use a random pcpu from the new runq, * and then calls runq_tickle(), by itself. */ /* * If all the siblings of cpu (including cpu itself) are both idle and * untickled, set all their bits in mask. * * NB that rqd->smt_idle is different than rqd->idle. rqd->idle * records pcpus that at are merely idle (i.e., at the moment do not * have an unit running on them). But you have to manually filter out * which pcpus have been tickled in order to find cores that are not * going to be busy soon. Filtering out tickled cpus pairwise is a * lot of extra pain; so for rqd->smt_idle, we explicitly make so that * the bits of a pcpu are set only if all the threads on its core are * both idle *and* untickled. * * This means changing the mask when either rqd->idle or rqd->tickled * changes. */ static inline void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers, cpumask_t *mask) { const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; if ( cpumask_subset(cpu_siblings, idlers) ) cpumask_or(mask, mask, cpu_siblings); } /* * Clear the bits of all the siblings of cpu from mask (if necessary). */ static inline void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask) { const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; if ( cpumask_subset(cpu_siblings, mask) ) cpumask_andnot(mask, mask, cpu_siblings); } /* * In csched2_res_pick(), it may not be possible to actually look at remote * runqueues (the trylock-s on their spinlocks can fail!). 
If that happens, * we pick, in order of decreasing preference: * 1) svc's current pcpu, if it is part of svc's soft affinity; * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity; * 3) svc's current pcpu, if it is part of svc's hard affinity; * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity; * 5) just one valid pcpu from svc's hard affinity * * Of course, 1, 2 and 3 makes sense only if svc has a soft affinity. Also * note that at least 5 is guaranteed to _always_ return at least one pcpu. */ static int get_fallback_cpu(struct csched2_unit *svc) { const struct sched_unit *unit = svc->unit; unsigned int bs; SCHED_STAT_CRANK(need_fallback_cpu); for_each_affinity_balance_step( bs ) { int cpu = sched_unit_master(unit); if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) continue; affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpupool_domain_master_cpumask(unit->domain)); /* * This is cases 1 or 3 (depending on bs): if processor is (still) * in our affinity, go for it, for cache betterness. */ if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) return cpu; /* * This is cases 2 or 4 (depending on bs): v->processor isn't there * any longer, check if we at least can stay in our current runq. */ if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu), &svc->rqd->active)) ) { cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), &svc->rqd->active); return cpumask_first(cpumask_scratch_cpu(cpu)); } /* * We may well pick any valid pcpu from our soft-affinity, outside * of our current runqueue, but we decide not to. In fact, changing * runqueue is slow, affects load distribution, and is a source of * overhead for the units running on the other runqueue (we need the * lock). So, better do that as a consequence of a well informed * decision (or if we really don't have any other chance, as we will, * at step 5, if we get to there). * * Also, being here, looking for a fallback, is an unfortunate and * infrequent event, while the decision of putting us in the runqueue * wehere we are was (likely) made taking all the relevant factors * into account. So let's not disrupt that, just for the sake of * soft-affinity, and let's wait here to be able to made (hopefully, * soon), another similar well informed decision. */ if ( bs == BALANCE_SOFT_AFFINITY ) continue; /* * This is cases 5: last stand, just one valid pcpu from our hard * affinity. It's guaranteed that there is at least one valid cpu, * and therefore we are sure that we return it, and never really * exit the loop. */ ASSERT(bs == BALANCE_HARD_AFFINITY && !cpumask_empty(cpumask_scratch_cpu(cpu))); cpu = cpumask_first(cpumask_scratch_cpu(cpu)); if ( likely(cpu < nr_cpu_ids) ) return cpu; } ASSERT_UNREACHABLE(); /* * We can't be here. But if that somehow happen (in non-debug builds), * at least return something which both online and in our hard-affinity. */ return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit))); } /* * Time-to-credit, credit-to-time. * * We keep track of the "residual" time to make sure that frequent short * schedules still get accounted for in the end. * * FIXME: Do pre-calculated division? */ static void t2c_update(const struct csched2_runqueue_data *rqd, s_time_t time, struct csched2_unit *svc) { int64_t val = time * rqd->max_weight + svc->residual; svc->residual = do_div(val, svc->weight); /* Getting to lower credit than CSCHED2_CREDIT_MIN makes no sense. 
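 * (For reference, and using made-up numbers: the conversion above charges
 * time * max_weight / weight credits, so the highest-weight unit in the
 * runqueue burns exactly 1 credit per nanosecond, while a unit with half
 * the max weight that runs for 1ms is charged 2ms worth of credit. The
 * remainder of the division is kept in svc->residual, so that frequent
 * short runs are not under-accounted.)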
*/ val = svc->credit - val; if ( unlikely(val < CSCHED2_CREDIT_MIN) ) svc->credit = CSCHED2_CREDIT_MIN; else svc->credit = val; } static s_time_t c2t(const struct csched2_runqueue_data *rqd, s_time_t credit, const struct csched2_unit *svc) { return credit * svc->weight / rqd->max_weight; } /* * Runqueue related code. */ static inline int unit_on_runq(const struct csched2_unit *svc) { return !list_empty(&svc->runq_elem); } static inline struct csched2_unit * runq_elem(struct list_head *elem) { return list_entry(elem, struct csched2_unit, runq_elem); } static inline bool same_node(unsigned int cpua, unsigned int cpub) { return cpu_to_node(cpua) == cpu_to_node(cpub); } static inline bool same_socket(unsigned int cpua, unsigned int cpub) { return cpu_to_socket(cpua) == cpu_to_socket(cpub); } static inline bool same_core(unsigned int cpua, unsigned int cpub) { return same_socket(cpua, cpub) && cpu_to_core(cpua) == cpu_to_core(cpub); } static inline bool cpu_runqueue_match(const struct csched2_runqueue_data *rqd, unsigned int cpu) { unsigned int peer_cpu = rqd->pick_bias; BUG_ON(cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID); /* OPT_RUNQUEUE_CPU will never find an existing runqueue. */ return opt_runqueue == OPT_RUNQUEUE_ALL || (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) || (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) || (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)); } /* * Additional checks, to avoid separating siblings in different runqueues. * This deals with both Intel's HTs and AMD's CUs. An arch that does not have * any similar concept will just have cpu_nr_siblings() always return 1, and * setup the cpu_sibling_mask-s acordingly (as currently does ARM), and things * will just work as well. */ static bool cpu_runqueue_siblings_match(const struct csched2_runqueue_data *rqd, unsigned int cpu, unsigned int max_cpus_runq) { unsigned int nr_sibls = cpu_nr_siblings(cpu); unsigned int rcpu, tot_sibls = 0; /* * If we put the CPU in this runqueue, we must be sure that there will * be enough room for accepting its sibling(s) as well. */ cpumask_clear(cpumask_scratch_cpu(cpu)); for_each_cpu ( rcpu, &rqd->active ) { ASSERT(rcpu != cpu); if ( !cpumask_intersects(per_cpu(cpu_sibling_mask, rcpu), cpumask_scratch_cpu(cpu)) ) { /* * For each CPU already in the runqueue, account for it and for * its sibling(s), independently from whether they are in the * runqueue or not. Of course, we do this only once, for each CPU * that is already inside the runqueue and all its siblings! * * This way, even if there are CPUs in the runqueue with siblings * in a different cpupools, we still count all of them here. * The reason for this is that, if at some future point we will * move those sibling CPUs to this cpupool, we want them to land * in this runqueue. Hence we must be sure to leave space for them. */ cpumask_or(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), per_cpu(cpu_sibling_mask, rcpu)); tot_sibls += cpu_nr_siblings(rcpu); } } /* * We know that neither the CPU, nor any of its sibling are here, * or we wouldn't even have entered the function. */ ASSERT(!cpumask_intersects(cpumask_scratch_cpu(cpu), per_cpu(cpu_sibling_mask, cpu))); /* Try adding CPU and its sibling(s) to the count and check... 
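 * (Purely as an example: if max_cpus_runq is 16 and the loop above has
 * already accounted for 15 CPUs, siblings of current members included,
 * a CPU with 2 threads per core is refused, since 15 + 2 > 16.)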
*/ return tot_sibls + nr_sibls <= max_cpus_runq; } static struct csched2_runqueue_data * cpu_add_to_runqueue(const struct scheduler *ops, unsigned int cpu) { struct csched2_private *prv = csched2_priv(ops); struct csched2_runqueue_data *rqd, *rqd_new; struct csched2_runqueue_data *rqd_valid = NULL; struct list_head *rqd_ins; unsigned long flags; int rqi = 0; unsigned int min_rqs, max_cpus_runq; bool rqi_unused = false; /* Prealloc in case we need it - not allowed with interrupts off. */ rqd_new = xzalloc(struct csched2_runqueue_data); /* * While respecting the limit of not having more than the max number of * CPUs per runqueue, let's also try to "spread" the CPU, as evenly as * possible, among the runqueues. For doing that, we need to know upfront * how many CPUs we have, so let's use the number of CPUs that are online * for that. */ min_rqs = ((num_online_cpus() - 1) / opt_max_cpus_runqueue) + 1; max_cpus_runq = num_online_cpus() / min_rqs; write_lock_irqsave(&prv->lock, flags); rqd_ins = &prv->rql; list_for_each_entry ( rqd, &prv->rql, rql ) { /* Remember first unused queue index. */ if ( !rqi_unused && rqd->id > rqi ) rqi_unused = true; /* * First of all, let's check whether, according to the system * topology, this CPU belongs in this runqueue. */ if ( cpu_runqueue_match(rqd, cpu) ) { /* * If the CPU has any siblings, they are online and they are * being added to this cpupool, always keep them together. Even * if that means violating what the opt_max_cpus_runqueue param * indicates. However, if this happens, chances are high that a * too small value was used for the parameter, so warn the user * about that. * * Note that we cannot check this once and for all, say, during * scheduler initialization. In fact, at least in theory, the * number of siblings a CPU has may not be the same for all the * CPUs. */ if ( cpumask_intersects(&rqd->active, per_cpu(cpu_sibling_mask, cpu)) ) { if ( cpumask_weight(&rqd->active) >= opt_max_cpus_runqueue ) { printk("WARNING: %s: more than opt_max_cpus_runqueue " "in a runqueue (%u vs %u), due to topology constraints.\n" "Consider raising it!\n", __func__, opt_max_cpus_runqueue, cpumask_weight(&rqd->active)); } rqd_valid = rqd; break; } /* * If we're using core (or socket) scheduling, no need to do any * further checking beyond the number of CPUs already in this * runqueue respecting our upper bound. * * Otherwise, let's try to make sure that siblings stay in the * same runqueue, pretty much under any cinrcumnstances. * * Furthermore, try to respect credit2_runqueue=all, as long as * max_cpus_runq isn't violated. */ if ( rqd->refcnt < max_cpus_runq && (ops->cpupool->gran != SCHED_GRAN_cpu || cpu_runqueue_siblings_match(rqd, cpu, max_cpus_runq) || opt_runqueue == OPT_RUNQUEUE_ALL) ) { /* * This runqueue is ok, but as we said, we also want an even * distribution of the CPUs. So, unless this is the very first * match, we go on, check all runqueues and actually add the * CPU into the one that is less full. */ if ( !rqd_valid || rqd->refcnt < rqd_valid->refcnt ) rqd_valid = rqd; } } if ( !rqi_unused ) { rqi++; rqd_ins = &rqd->rql; } } if ( !rqd_valid ) { if ( !rqd_new ) { rqd = ERR_PTR(-ENOMEM); goto out; } rqd = rqd_new; rqd_new = NULL; list_add(&rqd->rql, rqd_ins); rqd->pick_bias = cpu; rqd->id = rqi; } else rqd = rqd_valid; rqd->refcnt++; out: write_unlock_irqrestore(&prv->lock, flags); xfree(rqd_new); return rqd; } /* Find the domain with the highest weight. 
*/ static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight, int old_weight) { /* Try to avoid brute-force search: * - If new_weight is larger, max_weigth <- new_weight * - If old_weight != max_weight, someone else is still max_weight * (No action required) * - If old_weight == max_weight, brute-force search for max weight */ if ( new_weight > rqd->max_weight ) { rqd->max_weight = new_weight; SCHED_STAT_CRANK(upd_max_weight_quick); } else if ( old_weight == rqd->max_weight ) { struct list_head *iter; int max_weight = 1; list_for_each( iter, &rqd->svc ) { const struct csched2_unit * svc = list_entry(iter, struct csched2_unit, rqd_elem); if ( svc->weight > max_weight ) max_weight = svc->weight; } rqd->max_weight = max_weight; SCHED_STAT_CRANK(upd_max_weight_full); } if ( unlikely(tb_init_done) ) { struct { unsigned rqi:16, max_weight:16; } d; d.rqi = rqd->id; d.max_weight = rqd->max_weight; __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1, sizeof(d), (unsigned char *)&d); } } /* Add and remove from runqueue assignment (not active run queue) */ static void _runq_assign(struct csched2_unit *svc, struct csched2_runqueue_data *rqd) { svc->rqd = rqd; list_add_tail(&svc->rqd_elem, &svc->rqd->svc); update_max_weight(svc->rqd, svc->weight, 0); /* Expected new load based on adding this unit */ rqd->b_avgload += svc->avgload; if ( unlikely(tb_init_done) ) { struct { uint16_t unit, dom; uint16_t rqi, _pad; } d = { .unit = svc->unit->unit_id, .dom = svc->unit->domain->domain_id, .rqi = rqd->id, }; __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, sizeof(d), (unsigned char *)&d); } } static void runq_assign(const struct sched_unit *unit) { struct csched2_unit *svc = unit->priv; ASSERT(svc->rqd == NULL); _runq_assign(svc, c2rqd(sched_unit_master(unit))); } static void _runq_deassign(struct csched2_unit *svc) { struct csched2_runqueue_data *rqd = svc->rqd; ASSERT(!unit_on_runq(svc)); ASSERT(!(svc->flags & CSFLAG_scheduled)); list_del_init(&svc->rqd_elem); update_max_weight(rqd, 0, svc->weight); /* Expected new load based on removing this unit */ rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); svc->rqd = NULL; } static void runq_deassign(const struct sched_unit *unit) { struct csched2_unit *svc = unit->priv; ASSERT(svc->rqd == c2rqd(sched_unit_master(unit))); _runq_deassign(svc); } /* * Track the runq load by gathering instantaneous load samples, and using * exponentially weighted moving average (EWMA) for the 'decaying'. * * We consider a window of length W=2^(prv->load_window_shift) nsecs * (which takes LOADAVG_GRANULARITY_SHIFT into account). * * If load is the instantaneous load, the formula for EWMA looks as follows, * for the i-eth sample: * * avg[i] = a*load + (1 - a)*avg[i-1] * * where avg[i] is the new value of the average load, avg[i-1] is the value * of the average load calculated so far, and a is a coefficient less or * equal to 1. * * So, for us, it becomes: * * avgload = a*load + (1 - a)*avgload * * For determining a, we consider _when_ we are doing the load update, wrt * the length of the window. We define delta as follows: * * delta = t - load_last_update * * where t is current time (i.e., time at which we are both sampling and * updating the load average) and load_last_update is the last time we did * that. 
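 *
 * (As a quick numerical sanity check of the formula derived below, with
 * made-up numbers: take W = 8 time units, a previous average representing
 * 0.5, and one runnable unit, i.e., load = 1, present for delta = 2.
 * Then avgload becomes 0.5 + 2*1/8 - 2*0.5/8 = 0.625, i.e., the average
 * has moved delta/W = 1/4 of the way from its old value towards load.)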
* * There are two possible situations: * * a) delta <= W * this means that, during the last window of length W, the runeuque load * was avgload for (W - detla) time, and load for delta time: * * |----------- W ---------| * | | * | load_last_update t * -------------------------|---------|--- * | | | * \__W - delta__/\_delta__/ * | | | * |___avgload___|__load___| * * So, what about using delta/W as our smoothing coefficient a. If we do, * here's what happens: * * a = delta / W * 1 - a = 1 - (delta / W) = (W - delta) / W * * Which matches the above description of what happened in the last * window of length W. * * Note that this also means that the weight that we assign to both the * latest load sample, and to previous history, varies at each update. * The longer the latest load sample has been in efect, within the last * window, the higher it weights (and the lesser the previous history * weights). * * This is some sort of extension of plain EWMA to fit even better to our * use case. * * b) delta > W * this means more than a full window has passed since the last update: * * |----------- W ---------| * | | * load_last_update t * ----|------------------------------|--- * | | * \_________________delta________/ * * Basically, it means the last load sample has been in effect for more * than W time, and hence we should just use it, and forget everything * before that. * * This can be seen as a 'reset condition', occurring when, for whatever * reason, load has not been updated for longer than we expected. (It is * also how avgload is assigned its first value.) * * The formula for avgload then becomes: * * avgload = (delta/W)*load + (W - delta)*avgload/W * avgload = delta*load/W + W*avgload/W - delta*avgload/W * avgload = avgload + delta*load/W - delta*avgload/W * * So, final form is: * * avgload_0 = load * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W * * As a confirmation, let's look at the extremes, when delta is 0 (i.e., * what happens if we update the load twice, at the same time instant?): * * avgload = avgload + 0*load/W - 0*avgload/W * avgload = avgload * * and when delta is W (i.e., what happens if we update at the last * possible instant before the window 'expires'?): * * avgload = avgload + W*load/W - W*avgload/W * avgload = avgload + load - avgload * avgload = load * * Which, in both cases, is what we expect. */ static void update_runq_load(const struct scheduler *ops, struct csched2_runqueue_data *rqd, int change, s_time_t now) { struct csched2_private *prv = csched2_priv(ops); s_time_t delta, load = rqd->load; unsigned int P, W; W = prv->load_window_shift; P = prv->load_precision_shift; now >>= LOADAVG_GRANULARITY_SHIFT; /* * To avoid using fractions, we shift to left by load_precision_shift, * and use the least last load_precision_shift bits as fractional part. * Looking back at the formula we want to use, we now have: * * P = 2^(load_precision_shift) * P*avgload = P*(avgload + delta*load/W - delta*avgload/W) * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W * * And if we are ok storing and using P*avgload, we can rewrite this as: * * P*avgload = avgload' * avgload' = avgload' + delta*P*load/W - delta*avgload'/W * * Coupled with, of course: * * avgload_0' = P*load */ if ( rqd->load_last_update + (1ULL << W) < now ) { rqd->avgload = load << P; rqd->b_avgload = load << P; } else { delta = now - rqd->load_last_update; if ( unlikely(delta < 0) ) { d2printk("WARNING: %s: Time went backwards? 
now %"PRI_stime" llu %"PRI_stime"\n", __func__, now, rqd->load_last_update); delta = 0; } /* * Note that, if we were to enforce (or check) some relationship * between P and W, we may save one shift. E.g., if we are sure * that P < W, we could write: * * (delta * (load << P)) >> W * * as: * * (delta * load) >> (W - P) */ rqd->avgload = rqd->avgload + ((delta * (load << P)) >> W) - ((delta * rqd->avgload) >> W); rqd->b_avgload = rqd->b_avgload + ((delta * (load << P)) >> W) - ((delta * rqd->b_avgload) >> W); } rqd->load += change; rqd->load_last_update = now; /* Overflow, capable of making the load look negative, must not occur. */ ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0); if ( unlikely(tb_init_done) ) { struct { uint64_t rq_avgload, b_avgload; uint16_t rq_load; uint8_t rq_id, shift; uint32_t _pad; } d = { .rq_avgload = rqd->avgload, .b_avgload = rqd->b_avgload, .rq_load = rqd->load, .rq_id = rqd->id, .shift = P, }; __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1, sizeof(d), (unsigned char *)&d); } } static void update_svc_load(const struct scheduler *ops, struct csched2_unit *svc, int change, s_time_t now) { const struct csched2_private *prv = csched2_priv(ops); s_time_t delta, unit_load; unsigned int P, W; if ( change == -1 ) unit_load = 1; else if ( change == 1 ) unit_load = 0; else unit_load = unit_runnable(svc->unit); W = prv->load_window_shift; P = prv->load_precision_shift; now >>= LOADAVG_GRANULARITY_SHIFT; if ( svc->load_last_update + (1ULL << W) < now ) { svc->avgload = unit_load << P; } else { delta = now - svc->load_last_update; if ( unlikely(delta < 0) ) { d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", __func__, now, svc->load_last_update); delta = 0; } svc->avgload = svc->avgload + ((delta * (unit_load << P)) >> W) - ((delta * svc->avgload) >> W); } svc->load_last_update = now; /* Overflow, capable of making the load look negative, must not occur. 
*/ ASSERT(svc->avgload >= 0); if ( unlikely(tb_init_done) ) { struct { uint64_t v_avgload; unsigned unit:16, dom:16; unsigned shift; } d; d.dom = svc->unit->domain->domain_id; d.unit = svc->unit->unit_id; d.v_avgload = svc->avgload; d.shift = P; __trace_var(TRC_CSCHED2_UPDATE_UNIT_LOAD, 1, sizeof(d), (unsigned char *)&d); } } static void update_load(const struct scheduler *ops, struct csched2_runqueue_data *rqd, struct csched2_unit *svc, int change, s_time_t now) { trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL); update_runq_load(ops, rqd, change, now); if ( svc ) update_svc_load(ops, svc, change, now); } static void runq_insert(struct csched2_unit *svc) { struct list_head *iter; unsigned int cpu = sched_unit_master(svc->unit); struct list_head *runq = &c2rqd(cpu)->runq; int pos = 0; ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); ASSERT(!unit_on_runq(svc)); ASSERT(c2r(cpu) == c2r(sched_unit_master(svc->unit))); ASSERT(&svc->rqd->runq == runq); ASSERT(!is_idle_unit(svc->unit)); ASSERT(!svc->unit->is_running); ASSERT(!(svc->flags & CSFLAG_scheduled)); list_for_each( iter, runq ) { struct csched2_unit * iter_svc = runq_elem(iter); if ( svc->credit > iter_svc->credit ) break; pos++; } list_add_tail(&svc->runq_elem, iter); if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; unsigned pos; } d; d.dom = svc->unit->domain->domain_id; d.unit = svc->unit->unit_id; d.pos = pos; __trace_var(TRC_CSCHED2_RUNQ_POS, 1, sizeof(d), (unsigned char *)&d); } } static inline void runq_remove(struct csched2_unit *svc) { ASSERT(unit_on_runq(svc)); list_del_init(&svc->runq_elem); } void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *, s_time_t); static inline void tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd) { __cpumask_set_cpu(cpu, &rqd->tickled); smt_idle_mask_clear(cpu, &rqd->smt_idle); cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); } /* * What we want to know is whether svc, which we assume to be running on some * pcpu, can be interrupted and preempted (which, so far, basically means * whether or not it already run for more than the ratelimit, to which we * apply some tolerance). */ static inline bool is_preemptable(const struct csched2_unit *svc, s_time_t now, s_time_t ratelimit) { if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE ) return true; ASSERT(svc->unit->is_running); return now - svc->unit->state_entry_time > ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE; } /* * Score to preempt the target cpu. Return a negative number if the * credit isn't high enough; if it is, favor a preemption on cpu in * this order: * - cpu is in new's soft-affinity, not in cur's soft-affinity * (2 x CSCHED2_CREDIT_INIT score bonus); * - cpu is in new's soft-affinity and cur's soft-affinity, or * cpu is not in new's soft-affinity, nor in cur's soft-affinity * (1x CSCHED2_CREDIT_INIT score bonus); * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity * (no bonus). * * Within the same class, the highest difference of credit. */ static s_time_t tickle_score(const struct scheduler *ops, s_time_t now, const struct csched2_unit *new, unsigned int cpu) { struct csched2_runqueue_data *rqd = c2rqd(cpu); struct csched2_unit * cur = csched2_unit(curr_on_cpu(cpu)); const struct csched2_private *prv = csched2_priv(ops); s_time_t score; /* * We are dealing with cpus that are marked non-idle (i.e., that are not * in rqd->idle). However, some of them may be running their idle unit, * if taking care of tasklets. In that case, we want to leave it alone. 
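 *
 * To put the scoring described above tickle_score() in perspective (the
 * numbers are illustrative assumptions): if new has 1ms more credit than
 * cur, and cpu is not new's current processor, the base score is
 * MILLISECS(1) - CSCHED2_MIGRATE_RESIST (500us by default). Being
 * positive, it is then boosted by CSCHED2_CREDIT_INIT (10ms worth of
 * credit) for each of the two soft-affinity conditions below that holds,
 * which is what makes the soft-affinity "classes" dominate over the raw
 * credit difference.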
*/ if ( unlikely(is_idle_unit(cur->unit) || !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) ) return -1; burn_credits(rqd, cur, now); score = new->credit - cur->credit; if ( sched_unit_master(new->unit) != cpu ) score -= CSCHED2_MIGRATE_RESIST; /* * If score is positive, it means new has enough credits (i.e., * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST). * * Let's compute the bonuses for soft-affinities. */ if ( score > 0 ) { if ( cpumask_test_cpu(cpu, new->unit->cpu_soft_affinity) ) score += CSCHED2_CREDIT_INIT; if ( !cpumask_test_cpu(cpu, cur->unit->cpu_soft_affinity) ) score += CSCHED2_CREDIT_INIT; } if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; int credit, score; } d; d.dom = cur->unit->domain->domain_id; d.unit = cur->unit->unit_id; d.credit = cur->credit; d.score = score; __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, sizeof(d), (unsigned char *)&d); } return score; } /* * Check what processor it is best to 'wake', for picking up an unit that has * just been put (back) in the runqueue. Logic is as follows: * 1. if there are idle processors in the runq, wake one of them; * 2. if there aren't idle processor, check the one were the unit was * running before to see if we can preempt what's running there now * (and hence doing just one migration); * 3. last stand: check all processors and see if the unit is in right * of preempting any of the other units running on them (this requires * two migrations, and that's indeed why it is left as the last stand). * * Note that when we say 'idle processors' what we really mean is (pretty * much always) both _idle_ and _not_already_tickled_. In fact, if a * processor has been tickled, it will run csched2_schedule() shortly, and * pick up some work, so it would be wrong to consider it idle. */ static void runq_tickle(const struct scheduler *ops, struct csched2_unit *new, s_time_t now) { int i, ipid = -1; s_time_t max = 0; struct sched_unit *unit = new->unit; unsigned int bs, cpu = sched_unit_master(unit); struct csched2_runqueue_data *rqd = c2rqd(cpu); const cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); cpumask_t mask; ASSERT(new->rqd == rqd); if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; unsigned processor; int credit; } d; d.dom = unit->domain->domain_id; d.unit = unit->unit_id; d.processor = cpu; d.credit = new->credit; __trace_var(TRC_CSCHED2_TICKLE_NEW, 1, sizeof(d), (unsigned char *)&d); } /* * Exclusive pinning is when an unit has hard-affinity with only one * cpu, and there is no other unit that has hard-affinity with that * same cpu. This is infrequent, but if it happens, is for achieving * the most possible determinism, and least possible overhead for * the units in question. * * Try to identify the vast majority of these situations, and deal * with them quickly. */ if ( unlikely((new->flags & CSFLAG_pinned) && cpumask_test_cpu(cpu, &rqd->idle) && !cpumask_test_cpu(cpu, &rqd->tickled)) ) { ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); SCHED_STAT_CRANK(tickled_idle_cpu_excl); ipid = cpu; goto tickle; } for_each_affinity_balance_step( bs ) { /* Just skip first step, if we don't have a soft affinity */ if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) continue; affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); /* * First of all, consider idle cpus, checking if we can just * re-use the pcpu where we were running before. 
* * If there are cores where all the siblings are idle, consider * them first, honoring whatever the spreading-vs-consolidation * SMT policy wants us to do. */ if ( unlikely(sched_smt_power_savings) ) { cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle); cpumask_and(&mask, &mask, online); } else cpumask_and(&mask, &rqd->smt_idle, online); cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); i = cpumask_test_or_cycle(cpu, &mask); if ( i < nr_cpu_ids ) { SCHED_STAT_CRANK(tickled_idle_cpu); ipid = i; goto tickle; } /* * If there are no fully idle cores, check all idlers, after * having filtered out pcpus that have been tickled but haven't * gone through the scheduler yet. */ cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online); cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); i = cpumask_test_or_cycle(cpu, &mask); if ( i < nr_cpu_ids ) { SCHED_STAT_CRANK(tickled_idle_cpu); ipid = i; goto tickle; } } /* * Note that, if we are here, it means we have done the hard-affinity * balancing step of the loop, and hence what we have in cpumask_scratch * is what we put there for last, i.e., new's unit_hard_affinity & online * which is exactly what we need for the next part of the function. */ /* * Otherwise, look for the non-idle (and non-tickled) processors with * the lowest credit, among the ones new is allowed to run on. Again, * the cpu were it was running on would be the best candidate. * * For deciding which cpu to tickle, we use tickle_score(), which will * factor in both new's soft-affinity, and the soft-affinity of the * unit running on each cpu that we consider. */ cpumask_andnot(&mask, &rqd->active, &rqd->idle); cpumask_andnot(&mask, &mask, &rqd->tickled); cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); if ( __cpumask_test_and_clear_cpu(cpu, &mask) ) { s_time_t score = tickle_score(ops, now, new, cpu); if ( score > max ) { max = score; ipid = cpu; /* If this is in new's soft affinity, just take it */ if ( cpumask_test_cpu(cpu, unit->cpu_soft_affinity) ) { SCHED_STAT_CRANK(tickled_busy_cpu); goto tickle; } } } for_each_cpu(i, &mask) { s_time_t score; /* Already looked at this one above */ ASSERT(i != cpu); score = tickle_score(ops, now, new, i); if ( score > max ) { max = score; ipid = i; } } if ( ipid == -1 ) { SCHED_STAT_CRANK(tickled_no_cpu); return; } ASSERT(!is_idle_unit(curr_on_cpu(ipid))); SCHED_STAT_CRANK(tickled_busy_cpu); tickle: BUG_ON(ipid == -1); if ( unlikely(tb_init_done) ) { struct { unsigned cpu:16, pad:16; } d; d.cpu = ipid; d.pad = 0; __trace_var(TRC_CSCHED2_TICKLE, 1, sizeof(d), (unsigned char *)&d); } tickle_cpu(ipid, rqd); if ( unlikely(new->tickled_cpu != -1) ) SCHED_STAT_CRANK(tickled_cpu_overwritten); new->tickled_cpu = ipid; } /* * Credit-related code */ static void reset_credit(int cpu, s_time_t now, struct csched2_unit *snext) { struct csched2_runqueue_data *rqd = c2rqd(cpu); struct list_head *iter; int reset = CSCHED2_CREDIT_INIT; /* * Under normal circumstances, snext->credit should never be less * than -CSCHED2_MIN_TIMER. However, under some circumstances, an * unit with low credits may be allowed to run long enough that * its credits are actually much lower than that. * (Instances have been observed, for example, where an unit with * 200us of credit was allowed to run for 11ms, giving it -10.8ms * of credit. Thus it was still negative even after the reset.) * * If this is the case for snext, we simply want to keep moving * everyone up until it is in the black again. 
This means that, * since CSCHED2_CREDIT_MIN is -CSCHED2_CREDIT_INIT, we need to * actually add 2*CSCHED2_CREDIT_INIT. */ ASSERT(snext->credit >= CSCHED2_CREDIT_MIN); if ( unlikely(snext->credit == CSCHED2_CREDIT_MIN) ) reset += CSCHED2_CREDIT_INIT; list_for_each( iter, &rqd->svc ) { unsigned int svc_cpu; struct csched2_unit * svc; int start_credit; svc = list_entry(iter, struct csched2_unit, rqd_elem); svc_cpu = sched_unit_master(svc->unit); ASSERT(!is_idle_unit(svc->unit)); ASSERT(svc->rqd == rqd); /* * If svc is running, it is our responsibility to make sure, here, * that the credit it has spent so far get accounted. */ if ( svc->unit == curr_on_cpu(svc_cpu) ) { burn_credits(rqd, svc, now); /* * And, similarly, in case it has run out of budget, as a * consequence of this round of accounting, we also must inform * its pCPU that it's time to park it, and pick up someone else. */ if ( unlikely(svc->budget <= 0) ) tickle_cpu(svc_cpu, rqd); } start_credit = svc->credit; svc->credit += reset; /* "Clip" credits to max carryover */ if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX ) svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX; svc->start_time = now; if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; int credit_start, credit_end; } d; d.dom = svc->unit->domain->domain_id; d.unit = svc->unit->unit_id; d.credit_start = start_credit; d.credit_end = svc->credit; __trace_var(TRC_CSCHED2_CREDIT_RESET, 1, sizeof(d), (unsigned char *)&d); } } ASSERT(snext->credit > 0); SCHED_STAT_CRANK(credit_reset); /* No need to resort runqueue, as everyone's order should be the same. */ } void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *svc, s_time_t now) { s_time_t delta; ASSERT(svc == csched2_unit(curr_on_cpu(sched_unit_master(svc->unit)))); if ( unlikely(is_idle_unit(svc->unit)) ) { ASSERT(svc->credit == CSCHED2_IDLE_CREDIT); return; } delta = now - svc->start_time; if ( unlikely(delta <= 0) ) { if ( unlikely(delta < 0) ) d2printk("WARNING: %s: Time went backwards? now %"PRI_stime " start_time %"PRI_stime"\n", __func__, now, svc->start_time); goto out; } SCHED_STAT_CRANK(burn_credits_t2c); t2c_update(rqd, delta, svc); if ( has_cap(svc) ) svc->budget -= delta; svc->start_time = now; out: if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; int credit, budget; int delta; } d; d.dom = svc->unit->domain->domain_id; d.unit = svc->unit->unit_id; d.credit = svc->credit; d.budget = has_cap(svc) ? svc->budget : INT_MIN; d.delta = delta; __trace_var(TRC_CSCHED2_CREDIT_BURN, 1, sizeof(d), (unsigned char *)&d); } } /* * Budget-related code. */ static void park_unit(struct csched2_unit *svc) { struct sched_unit *unit = svc->unit; ASSERT(spin_is_locked(&svc->sdom->budget_lock)); /* * It was impossible to find budget for this unit, so it has to be * "parked". This implies it is not runnable, so we mark it as such in * its pause_flags. If the unit is currently scheduled (which means we * are here after being called from within csched_schedule()), flagging * is enough, as we'll choose someone else, and then context_saved() * will take care of updating the load properly. * * If, OTOH, the unit is sitting in the runqueue (which means we are here * after being called from within runq_candidate()), we must go all the * way down to taking it out of there, and updating the load accordingly. * * In both cases, we also add it to the list of parked units of the domain. 
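 *
 * The reverse operation, unparking, happens in unpark_parked_units():
 * either because a unit gave some budget back to the pool, or because the
 * domain's budget has been replenished by its periodic timer (see the
 * "Utilization cap" comment at the top of the file).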
*/ sched_set_pause_flags(unit, _VPF_parked); if ( unit_on_runq(svc) ) { runq_remove(svc); update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW()); } list_add(&svc->parked_elem, &svc->sdom->parked_units); } static bool unit_grab_budget(struct csched2_unit *svc) { struct csched2_dom *sdom = svc->sdom; unsigned int cpu = sched_unit_master(svc->unit); ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); if ( svc->budget > 0 ) return true; /* budget_lock nests inside runqueue lock. */ spin_lock(&sdom->budget_lock); /* * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if * above!). That basically means the unit has overrun a bit --because of * various reasons-- and we want to take that into account. With the +=, * we are actually subtracting the amount of budget the unit has * overconsumed, from the total domain budget. */ sdom->budget += svc->budget; if ( sdom->budget > 0 ) { s_time_t budget; /* Get our quota, if there's at least as much budget */ if ( likely(sdom->budget >= svc->budget_quota) ) budget = svc->budget_quota; else budget = sdom->budget; svc->budget = budget; sdom->budget -= budget; } else { svc->budget = 0; park_unit(svc); } spin_unlock(&sdom->budget_lock); return svc->budget > 0; } static void unit_return_budget(struct csched2_unit *svc, struct list_head *parked) { struct csched2_dom *sdom = svc->sdom; unsigned int cpu = sched_unit_master(svc->unit); ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); ASSERT(list_empty(parked)); /* budget_lock nests inside runqueue lock. */ spin_lock(&sdom->budget_lock); /* * The unit is stopping running (e.g., because it's blocking, or it has * been preempted). If it hasn't consumed all the budget it got when, * starting to run, put that remaining amount back in the domain's budget * pool. */ sdom->budget += svc->budget; svc->budget = 0; /* * Making budget available again to the domain means that parked units * may be unparked and run. They are, if any, in the domain's parked_units * list, so we want to go through that and unpark them (so they can try * to get some budget). * * Touching the list requires the budget_lock, which we hold. Let's * therefore put everyone in that list in another, temporary list, which * then the caller will traverse, unparking the units it finds there. * * In fact, we can't do the actual unparking here, because that requires * taking the runqueue lock of the units being unparked, and we can't * take any runqueue locks while we hold a budget_lock. */ if ( sdom->budget > 0 ) list_splice_init(&sdom->parked_units, parked); spin_unlock(&sdom->budget_lock); } static void unpark_parked_units(const struct scheduler *ops, struct list_head *units) { struct csched2_unit *svc, *tmp; spinlock_t *lock; list_for_each_entry_safe ( svc, tmp, units, parked_elem ) { unsigned long flags; s_time_t now; lock = unit_schedule_lock_irqsave(svc->unit, &flags); sched_clear_pause_flags(svc->unit, _VPF_parked); if ( unlikely(svc->flags & CSFLAG_scheduled) ) { /* * We end here if a budget replenishment arrived between * csched2_schedule() (and, in particular, after a call to * unit_grab_budget() that returned false), and * context_saved(). By setting __CSFLAG_delayed_runq_add, * we tell context_saved() to put the unit back in the * runqueue, from where it will compete with the others * for the newly replenished budget. 
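     *
     * (We can't just runq_insert() the unit here: its context has not been
     * saved yet, so the actual re-queueing must wait until
     * csched2_context_saved() runs for it.)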
*/ ASSERT( svc->rqd != NULL ); ASSERT( c2rqd(sched_unit_master(svc->unit)) == svc->rqd ); __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); } else if ( unit_runnable(svc->unit) ) { /* * The unit should go back to the runqueue, and compete for * the newly replenished budget, but only if it is actually * runnable (and was therefore offline only because of the * lack of budget). */ now = NOW(); update_load(ops, svc->rqd, svc, 1, now); runq_insert(svc); runq_tickle(ops, svc, now); } list_del_init(&svc->parked_elem); unit_schedule_unlock_irqrestore(lock, flags, svc->unit); } } static inline void do_replenish(struct csched2_dom *sdom) { sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD; sdom->budget += sdom->tot_budget; } static void cf_check replenish_domain_budget(void *data) { struct csched2_dom *sdom = data; unsigned long flags; s_time_t now; LIST_HEAD(parked); spin_lock_irqsave(&sdom->budget_lock, flags); now = NOW(); /* * Let's do the replenishment. Note, though, that a domain may overrun, * which means the budget would have gone below 0 (reasons may be system * overbooking, accounting issues, etc.). It also may happen that we are * handling the replenishment (much) later than we should (reasons may * again be overbooking, or issues with timers). * * Even in cases of overrun or delay, however, we expect that in 99% of * cases, doing just one replenishment will be good enough for being able * to unpark the units that are waiting for some budget. */ do_replenish(sdom); /* * And now, the special cases: * 1) if we are late enough to have skipped (at least) one full period, * what we must do is doing more replenishments. Note that, however, * every time we add tot_budget to the budget, we also move next_repl * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always * respected. */ if ( unlikely(sdom->next_repl <= now) ) { do do_replenish(sdom); while ( sdom->next_repl <= now ); } /* * 2) if we overrun by more than tot_budget, then budget+tot_budget is * still < 0, which means that we can't unpark the units. Let's bail, * and wait for future replenishments. */ if ( unlikely(sdom->budget <= 0) ) { spin_unlock_irqrestore(&sdom->budget_lock, flags); goto out; } /* Since we do more replenishments, make sure we didn't overshot. */ sdom->budget = min(sdom->budget, sdom->tot_budget); /* * As above, let's prepare the temporary list, out of the domain's * parked_units list, now that we hold the budget_lock. Then, drop such * lock, and pass the list to the unparking function. 
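     *
     * (The two-step dance is dictated by lock ordering: budget_lock nests
     * inside the runqueue locks, and unpark_parked_units() needs to take
     * runqueue locks, so it can only be called after the budget_lock has
     * been dropped.)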
*/ list_splice_init(&sdom->parked_units, &parked); spin_unlock_irqrestore(&sdom->budget_lock, flags); unpark_parked_units(sdom->dom->cpupool->sched, &parked); out: set_timer(&sdom->repl_timer, sdom->next_repl); } #ifndef NDEBUG static inline void csched2_unit_check(const struct sched_unit *unit) { struct csched2_unit * const svc = csched2_unit(unit); struct csched2_dom * const sdom = svc->sdom; BUG_ON( svc->unit != unit ); BUG_ON( sdom != csched2_dom(unit->domain) ); if ( sdom ) { BUG_ON( is_idle_unit(unit) ); BUG_ON( sdom->dom != unit->domain ); } else { BUG_ON( !is_idle_unit(unit) ); } SCHED_STAT_CRANK(unit_check); } #define CSCHED2_UNIT_CHECK(unit) (csched2_unit_check(unit)) #else #define CSCHED2_UNIT_CHECK(unit) #endif static void *cf_check csched2_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd) { struct csched2_unit *svc; /* Allocate per-UNIT info */ svc = xzalloc(struct csched2_unit); if ( svc == NULL ) return NULL; INIT_LIST_HEAD(&svc->rqd_elem); INIT_LIST_HEAD(&svc->runq_elem); svc->sdom = dd; svc->unit = unit; svc->flags = 0U; if ( ! is_idle_unit(unit) ) { ASSERT(svc->sdom != NULL); svc->credit = CSCHED2_CREDIT_INIT; svc->weight = svc->sdom->weight; /* Starting load of 50% */ svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1); svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT; } else { ASSERT(svc->sdom == NULL); svc->credit = CSCHED2_IDLE_CREDIT; svc->weight = 0; } svc->tickled_cpu = -1; svc->budget = STIME_MAX; svc->budget_quota = 0; INIT_LIST_HEAD(&svc->parked_elem); SCHED_STAT_CRANK(unit_alloc); return svc; } static void cf_check csched2_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) { struct csched2_unit * const svc = csched2_unit(unit); ASSERT(!is_idle_unit(unit)); SCHED_STAT_CRANK(unit_sleep); if ( curr_on_cpu(sched_unit_master(unit)) == unit ) { tickle_cpu(sched_unit_master(unit), svc->rqd); } else if ( unit_on_runq(svc) ) { ASSERT(svc->rqd == c2rqd(sched_unit_master(unit))); update_load(ops, svc->rqd, svc, -1, NOW()); runq_remove(svc); } else __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); } static void cf_check csched2_unit_wake(const struct scheduler *ops, struct sched_unit *unit) { struct csched2_unit * const svc = csched2_unit(unit); unsigned int cpu = sched_unit_master(unit); s_time_t now; ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); ASSERT(!is_idle_unit(unit)); if ( unlikely(curr_on_cpu(cpu) == unit) ) { SCHED_STAT_CRANK(unit_wake_running); goto out; } if ( unlikely(unit_on_runq(svc)) ) { SCHED_STAT_CRANK(unit_wake_onrunq); goto out; } if ( likely(unit_runnable(unit)) ) SCHED_STAT_CRANK(unit_wake_runnable); else SCHED_STAT_CRANK(unit_wake_not_runnable); /* If the context hasn't been saved for this unit yet, we can't put it on * another runqueue. Instead, we set a flag so that it will be put on the runqueue * after the context has been saved. 
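     *
     * (The flag in question is __CSFLAG_delayed_runq_add, set just below;
     * it is csched2_context_saved() that will notice it and do the actual
     * insertion in the runqueue.)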
*/ if ( unlikely(svc->flags & CSFLAG_scheduled) ) { __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); goto out; } /* Add into the new runqueue if necessary */ if ( svc->rqd == NULL ) runq_assign(unit); else ASSERT(c2rqd(sched_unit_master(unit)) == svc->rqd ); now = NOW(); update_load(ops, svc->rqd, svc, 1, now); /* Put the UNIT on the runq */ runq_insert(svc); runq_tickle(ops, svc, now); out: return; } static void cf_check csched2_unit_yield(const struct scheduler *ops, struct sched_unit *unit) { struct csched2_unit * const svc = csched2_unit(unit); __set_bit(__CSFLAG_unit_yield, &svc->flags); } static void cf_check csched2_context_saved(const struct scheduler *ops, struct sched_unit *unit) { struct csched2_unit * const svc = csched2_unit(unit); spinlock_t *lock = unit_schedule_lock_irq(unit); s_time_t now = NOW(); LIST_HEAD(were_parked); ASSERT(is_idle_unit(unit) || svc->rqd == c2rqd(sched_unit_master(unit))); /* This unit is now eligible to be put on the runqueue again */ __clear_bit(__CSFLAG_scheduled, &svc->flags); if ( unlikely(has_cap(svc) && svc->budget > 0) ) unit_return_budget(svc, &were_parked); /* If someone wants it on the runqueue, put it there. */ /* * NB: We can get rid of CSFLAG_scheduled by checking for * vc->is_running and unit_on_runq(svc) here. However, * since we're accessing the flags cacheline anyway, * it seems a bit pointless; especially as we have plenty of * bits free. */ if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) && likely(unit_runnable(unit)) ) { ASSERT(!unit_on_runq(svc)); runq_insert(svc); runq_tickle(ops, svc, now); } else if ( !is_idle_unit(unit) ) update_load(ops, svc->rqd, svc, -1, now); unit_schedule_unlock_irq(lock, unit); unpark_parked_units(ops, &were_parked); } #define MAX_LOAD (STIME_MAX) static struct sched_resource *cf_check csched2_res_pick(const struct scheduler *ops, const struct sched_unit *unit) { struct csched2_private *prv = csched2_priv(ops); unsigned int new_cpu, cpu = sched_unit_master(unit); struct csched2_unit *svc = csched2_unit(unit); s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD; bool has_soft; struct csched2_runqueue_data *rqd, *min_rqd = NULL, *min_s_rqd = NULL; ASSERT(!list_empty(&prv->rql)); SCHED_STAT_CRANK(pick_resource); /* Locking: * - Runqueue lock of vc->processor is already locked * - Need to grab prv lock to make sure active runqueues don't * change * - Need to grab locks for other runqueues while checking * avgload * Locking constraint is: * - Lock prv before runqueue locks * - Trylock between runqueue locks (no ordering) * * Since one of the runqueue locks is already held, we can't * just grab the prv lock. Instead, we'll have to trylock, and * do something else reasonable if we fail. */ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); if ( !read_trylock(&prv->lock) ) { /* We may be here because someone requested us to migrate. */ __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); new_cpu = get_fallback_cpu(svc); /* * Tracing of runq and its load won't be accurate, since we could * not get the lock, but at least we will output the chosen pcpu. */ goto out; } cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, cpupool_domain_master_cpumask(unit->domain)); /* * First check to see if we're here because someone else suggested a place * for us to move. 
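     *
     * (The "someone else" is the load balancer: migrate(), when it finds
     * the unit running, stores the target runqueue in svc->migrate_rqd and
     * sets __CSFLAG_runq_migrate_request, which is what we test below.)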
*/ if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) { if ( unlikely(svc->migrate_rqd->id < 0) ) { printk(XENLOG_WARNING "%s: target runqueue disappeared!\n", __func__); } else if ( cpumask_intersects(cpumask_scratch_cpu(cpu), &svc->migrate_rqd->active) ) { /* * If we've been asked to move to migrate_rqd, we should just do * that, which we actually do by returning one cpu from that runq. * There is no need to take care of soft affinity, as that will * happen in runq_tickle(). */ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), &svc->migrate_rqd->active); new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias, cpumask_scratch_cpu(cpu)); svc->migrate_rqd->pick_bias = new_cpu; goto out_up; } /* Fall-through to normal cpu pick */ } /* * What we want is: * - if we have soft affinity, the runqueue with the lowest average * load, among the ones that contain cpus in our soft affinity; this * represents the best runq on which we would want to run. * - the runqueue with the lowest average load among the ones that * contains cpus in our hard affinity; this represent the best runq * on which we can run. * * Find both runqueues in one pass. */ has_soft = has_soft_affinity(unit); list_for_each_entry ( rqd, &prv->rql, rql ) { s_time_t rqd_avgload = MAX_LOAD; /* * If none of the cpus of this runqueue is in svc's hard-affinity, * skip the runqueue. * * Note that, in case svc's hard-affinity has changed, this is the * first time when we see such change, so it is indeed possible * that we end up skipping svc's current runqueue. */ if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) ) continue; /* * If checking a different runqueue, grab the lock, read the avg, * and then release the lock. * * If on our own runqueue, don't grab or release the lock; * but subtract our own load from the runqueue load to simulate * impartiality. */ if ( rqd == svc->rqd ) { rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); } else if ( spin_trylock(&rqd->lock) ) { rqd_avgload = rqd->b_avgload; spin_unlock(&rqd->lock); } /* * if svc has a soft-affinity, and some cpus of rqd are part of it, * see if we need to update the "soft-affinity minimum". */ if ( has_soft && rqd_avgload < min_s_avgload ) { cpumask_t mask; cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active); if ( cpumask_intersects(&mask, unit->cpu_soft_affinity) ) { min_s_avgload = rqd_avgload; min_s_rqd = rqd; } } /* In any case, keep the "hard-affinity minimum" updated too. */ if ( rqd_avgload < min_avgload ) { min_avgload = rqd_avgload; min_rqd = rqd; } } if ( has_soft && min_s_rqd ) { /* * We have soft affinity, and we have a candidate runq, so go for it. * * Note that, to obtain the soft-affinity mask, we "just" put what we * have in cpumask_scratch in && with unit->cpu_soft_affinity. This is * ok because: * - we know that unit->cpu_hard_affinity and ->cpu_soft_affinity have * a non-empty intersection (because has_soft is true); * - we have unit->cpu_hard_affinity & cpupool_domain_master_cpumask() * already in cpumask_scratch, we do save a lot doing like this. * * It's kind of like open coding affinity_balance_cpumask() but, in * this specific case, calling that would mean a lot of (unnecessary) * cpumask operations. 
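     *
     * (After the two cpumask_and() below, cpumask_scratch holds
     * hard-affinity & online & soft-affinity & min_s_rqd->active, i.e.,
     * the cpus of the chosen runqueue where the unit both can and would
     * prefer to run.)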
*/ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), unit->cpu_soft_affinity); cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), &min_s_rqd->active); } else if ( min_rqd ) { /* * Either we don't have soft-affinity, or we do, but we did not find * any suitable runq. But we did find one when considering hard * affinity, so go for it. * * cpumask_scratch already has unit->cpu_hard_affinity & * cpupool_domain_master_cpumask() in it, so it's enough that we filter * with the cpus of the runq. */ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), &min_rqd->active); } else { /* * We didn't find anyone at all (most likely because of spinlock * contention). */ new_cpu = get_fallback_cpu(svc); min_rqd = c2rqd(new_cpu); min_avgload = min_rqd->b_avgload; goto out_up; } new_cpu = cpumask_cycle(min_rqd->pick_bias, cpumask_scratch_cpu(cpu)); min_rqd->pick_bias = new_cpu; BUG_ON(new_cpu >= nr_cpu_ids); out_up: read_unlock(&prv->lock); out: if ( unlikely(tb_init_done) ) { struct { uint64_t b_avgload; unsigned unit:16, dom:16; unsigned rq_id:16, new_cpu:16; } d; d.dom = unit->domain->domain_id; d.unit = unit->unit_id; d.rq_id = min_rqd ? min_rqd->id : -1; d.b_avgload = min_avgload; d.new_cpu = new_cpu; __trace_var(TRC_CSCHED2_PICKED_CPU, 1, sizeof(d), (unsigned char *)&d); } return get_sched_res(new_cpu); } /* Working state of the load-balancing algorithm */ typedef struct { /* NB: Modified by consider() */ s_time_t load_delta; struct csched2_unit * best_push_svc, *best_pull_svc; /* NB: Read by consider() */ struct csched2_runqueue_data *lrqd; struct csched2_runqueue_data *orqd; } balance_state_t; static void consider(balance_state_t *st, struct csched2_unit *push_svc, struct csched2_unit *pull_svc) { s_time_t l_load, o_load, delta; l_load = st->lrqd->b_avgload; o_load = st->orqd->b_avgload; if ( push_svc ) { /* What happens to the load on both if we push? */ l_load -= push_svc->avgload; o_load += push_svc->avgload; } if ( pull_svc ) { /* What happens to the load on both if we pull? */ l_load += pull_svc->avgload; o_load -= pull_svc->avgload; } delta = l_load - o_load; if ( delta < 0 ) delta = -delta; if ( delta < st->load_delta ) { st->load_delta = delta; st->best_push_svc=push_svc; st->best_pull_svc=pull_svc; } } static void migrate(const struct scheduler *ops, struct csched2_unit *svc, struct csched2_runqueue_data *trqd, s_time_t now) { struct sched_unit *unit = svc->unit; int cpu = sched_unit_master(unit); if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; unsigned rqi:16, trqi:16; } d; d.dom = unit->domain->domain_id; d.unit = unit->unit_id; d.rqi = svc->rqd->id; d.trqi = trqd->id; __trace_var(TRC_CSCHED2_MIGRATE, 1, sizeof(d), (unsigned char *)&d); } if ( svc->flags & CSFLAG_scheduled ) { /* It's running; mark it to migrate. 
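     *
     * (The actual change of runqueue happens later: tickling the cpu below
     * forces a reschedule and, when the unit next goes through
     * csched2_res_pick(), the migrate_rqd / __CSFLAG_runq_migrate_request
     * pair set here is what steers it toward the target runqueue.)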
*/ svc->migrate_rqd = trqd; sched_set_pause_flags(unit, _VPF_migrating); __set_bit(__CSFLAG_runq_migrate_request, &svc->flags); SCHED_STAT_CRANK(migrate_requested); tickle_cpu(cpu, svc->rqd); } else { int on_runq = 0; /* It's not running; just move it */ if ( unit_on_runq(svc) ) { runq_remove(svc); update_load(ops, svc->rqd, NULL, -1, now); on_runq = 1; } _runq_deassign(svc); cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, cpupool_domain_master_cpumask(unit->domain)); cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), &trqd->active); sched_set_res(unit, get_sched_res(cpumask_cycle(trqd->pick_bias, cpumask_scratch_cpu(cpu)))); trqd->pick_bias = sched_unit_master(unit); ASSERT(sched_unit_master(unit) < nr_cpu_ids); _runq_assign(svc, trqd); if ( on_runq ) { update_load(ops, svc->rqd, NULL, 1, now); runq_insert(svc); runq_tickle(ops, svc, now); SCHED_STAT_CRANK(migrate_on_runq); } else SCHED_STAT_CRANK(migrate_no_runq); } } /* * It makes sense considering migrating svc to rqd, if: * - svc is not already flagged to migrate, * - if svc is allowed to run on at least one of the pcpus of rqd. */ static bool unit_is_migrateable(const struct csched2_unit *svc, const struct csched2_runqueue_data *rqd) { struct sched_unit *unit = svc->unit; int cpu = sched_unit_master(unit); cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, cpupool_domain_master_cpumask(unit->domain)); return !(svc->flags & CSFLAG_runq_migrate_request) && cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active); } static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) { struct csched2_private *prv = csched2_priv(ops); struct list_head *push_iter, *pull_iter; bool inner_load_updated = 0; struct csched2_runqueue_data *rqd, *max_delta_rqd; balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; /* * Basic algorithm: Push, pull, or swap. * - Find the runqueue with the furthest load distance * - Find a pair that makes the difference the least (where one * on either side may be empty). */ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); st.lrqd = c2rqd(cpu); update_runq_load(ops, st.lrqd, 0, now); retry: max_delta_rqd = NULL; if ( !read_trylock(&prv->lock) ) return; st.load_delta = 0; list_for_each_entry ( rqd, &prv->rql, rql ) { s_time_t delta; st.orqd = rqd; if ( st.orqd == st.lrqd || !spin_trylock(&st.orqd->lock) ) continue; update_runq_load(ops, st.orqd, 0, now); delta = st.lrqd->b_avgload - st.orqd->b_avgload; if ( delta < 0 ) delta = -delta; if ( delta > st.load_delta ) { st.load_delta = delta; max_delta_rqd = rqd; } spin_unlock(&st.orqd->lock); } /* Minimize holding the private scheduler lock. */ read_unlock(&prv->lock); if ( !max_delta_rqd ) goto out; { s_time_t load_max; int cpus_max; load_max = st.lrqd->b_avgload; if ( st.orqd->b_avgload > load_max ) load_max = st.orqd->b_avgload; cpus_max = max(st.lrqd->nr_cpus, st.orqd->nr_cpus); if ( unlikely(tb_init_done) ) { struct { unsigned lrq_id:16, orq_id:16; unsigned load_delta; } d; d.lrq_id = st.lrqd->id; d.orq_id = st.orqd->id; d.load_delta = st.load_delta; __trace_var(TRC_CSCHED2_LOAD_CHECK, 1, sizeof(d), (unsigned char *)&d); } /* * If we're under 100% capacaty, only shift if load difference * is > 1. 
otherwise, shift if under 12.5% */ if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) ) { if ( st.load_delta < (1ULL << (prv->load_precision_shift + opt_underload_balance_tolerance)) ) goto out; } else if ( st.load_delta < (1ULL << (prv->load_precision_shift + opt_overload_balance_tolerance)) ) goto out; } /* Try to grab the other runqueue lock; if it's been taken in the * meantime, try the process over again. This can't deadlock * because if it doesn't get any other rqd locks, it will simply * give up and return. */ st.orqd = max_delta_rqd; if ( !spin_trylock(&st.orqd->lock) ) goto retry; /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ if ( unlikely(st.orqd->id < 0) ) goto out_up; if ( unlikely(tb_init_done) ) { struct { uint64_t lb_avgload, ob_avgload; uint16_t lrq_id, orq_id; uint32_t _pad; } d = { .lb_avgload = st.lrqd->b_avgload, .ob_avgload = st.orqd->b_avgload, .lrq_id = st.lrqd->id, .orq_id = st.orqd->id, }; __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1, sizeof(d), (unsigned char *)&d); } SCHED_STAT_CRANK(acct_load_balance); /* Look for "swap" which gives the best load average * FIXME: O(n^2)! */ /* Reuse load delta (as we're trying to minimize it) */ list_for_each( push_iter, &st.lrqd->svc ) { struct csched2_unit * push_svc = list_entry(push_iter, struct csched2_unit, rqd_elem); update_svc_load(ops, push_svc, 0, now); if ( !unit_is_migrateable(push_svc, st.orqd) ) continue; list_for_each( pull_iter, &st.orqd->svc ) { struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); if ( !inner_load_updated ) update_svc_load(ops, pull_svc, 0, now); if ( !unit_is_migrateable(pull_svc, st.lrqd) ) continue; consider(&st, push_svc, pull_svc); } inner_load_updated = 1; /* Consider push only */ consider(&st, push_svc, NULL); } list_for_each( pull_iter, &st.orqd->svc ) { struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); if ( !unit_is_migrateable(pull_svc, st.lrqd) ) continue; /* Consider pull only */ consider(&st, NULL, pull_svc); } /* OK, now we have some candidates; do the moving */ if ( st.best_push_svc ) migrate(ops, st.best_push_svc, st.orqd, now); if ( st.best_pull_svc ) migrate(ops, st.best_pull_svc, st.lrqd, now); out_up: spin_unlock(&st.orqd->lock); out: return; } static void cf_check csched2_unit_migrate( const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu) { struct csched2_unit * const svc = csched2_unit(unit); struct csched2_runqueue_data *trqd; s_time_t now = NOW(); ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized)); ASSERT(cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity)); trqd = c2rqd(new_cpu); /* * Do the actual movement toward new_cpu, and update vc->processor. * If we are changing runqueue, migrate() takes care of everything. * If we are not changing runqueue, we need to update vc->processor * here. In fact, if, for instance, we are here because the unit's * hard affinity changed, we don't want to risk leaving vc->processor * pointing to a pcpu where we can't run any longer. 
*/ if ( trqd != svc->rqd ) migrate(ops, svc, trqd, now); else sched_set_res(unit, get_sched_res(new_cpu)); } static int cf_check csched2_dom_cntl( const struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched2_dom * const sdom = csched2_dom(d); struct csched2_private *prv = csched2_priv(ops); unsigned long flags; struct sched_unit *unit; int rc = 0; /* * Locking: * - we must take the private lock for accessing the weights of the * units of d, and/or the cap; * - in the putinfo case, we also need the runqueue lock(s), for * updating the max waight of the runqueue(s). * If changing the cap, we also need the budget_lock, for updating * the value of the domain budget pool (and the runqueue lock, * for adjusting the parameters and rescheduling any unit that is * running at the time of the change). */ switch ( op->cmd ) { case XEN_DOMCTL_SCHEDOP_getinfo: read_lock_irqsave(&prv->lock, flags); op->u.credit2.weight = sdom->weight; op->u.credit2.cap = sdom->cap; read_unlock_irqrestore(&prv->lock, flags); break; case XEN_DOMCTL_SCHEDOP_putinfo: write_lock_irqsave(&prv->lock, flags); /* Weight */ if ( op->u.credit2.weight != 0 ) { int old_weight; old_weight = sdom->weight; sdom->weight = op->u.credit2.weight; /* Update weights for units, and max_weight for runqueues on which they reside */ for_each_sched_unit ( d, unit ) { struct csched2_unit *svc = csched2_unit(unit); spinlock_t *lock = unit_schedule_lock(unit); ASSERT(svc->rqd == c2rqd(sched_unit_master(unit))); svc->weight = sdom->weight; update_max_weight(svc->rqd, svc->weight, old_weight); unit_schedule_unlock(lock, unit); } } /* Cap */ if ( op->u.credit2.cap != 0 ) { struct csched2_unit *svc; spinlock_t *lock; /* Cap is only valid if it's below 100 * nr_of_units */ if ( op->u.credit2.cap > 100 * sdom->nr_units ) { rc = -EINVAL; write_unlock_irqrestore(&prv->lock, flags); break; } spin_lock(&sdom->budget_lock); sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap); sdom->tot_budget /= 100; spin_unlock(&sdom->budget_lock); /* * When trying to get some budget and run, each unit will grab * from the pool 1/N (with N = nr of units of the domain) of * the total budget. Roughly speaking, this means each unit will * have at least one chance to run during every period. */ for_each_sched_unit ( d, unit ) { svc = csched2_unit(unit); lock = unit_schedule_lock(unit); /* * Too small quotas would in theory cause a lot of overhead, * which then won't happen because, in csched2_runtime(), * CSCHED2_MIN_TIMER is what would be used anyway. */ svc->budget_quota = max(sdom->tot_budget / sdom->nr_units, CSCHED2_MIN_TIMER); unit_schedule_unlock(lock, unit); } if ( sdom->cap == 0 ) { /* * We give to the domain the budget to which it is entitled, * and queue its first replenishment event. * * Since cap is currently disabled for this domain, we * know no unit is messing with the domain's budget, and * the replenishment timer is still off. * For these reasons, it is safe to do the following without * taking the budget_lock. */ sdom->budget = sdom->tot_budget; sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD; set_timer(&sdom->repl_timer, sdom->next_repl); /* * Now, let's enable budget accounting for all the units. * For making sure that they will start to honour the domain's * cap, we set their budget to 0. * This way, as soon as they will try to run, they will have * to get some budget. * * For the units that are already running, we trigger the * scheduler on their pCPU. 
When, as a consequence of this, * csched2_schedule() will run, it will figure out there is * no budget, and the unit will try to get some (and be parked, * if there's none, and we'll switch to someone else). */ for_each_sched_unit ( d, unit ) { svc = csched2_unit(unit); lock = unit_schedule_lock(unit); if ( unit->is_running ) { unsigned int cpu = sched_unit_master(unit); struct csched2_runqueue_data *rqd = c2rqd(cpu); ASSERT(curr_on_cpu(cpu) == unit); /* * We are triggering a reschedule on the unit's * pCPU. That will run burn_credits() and, since * the unit is capped now, it would charge all the * execution time of this last round as budget as * well. That will make the unit budget go negative, * potentially by a large amount, and it's unfair. * * To avoid that, call burn_credit() here, to do the * accounting of this current running instance now, * with budgetting still disabled. This does not * prevent some small amount of budget being charged * to the unit (i.e., the amount of time it runs from * now, to when scheduling happens). The budget will * also go below 0, but a lot less than how it would * if we don't do this. */ burn_credits(rqd, svc, NOW()); __cpumask_set_cpu(cpu, &rqd->tickled); ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle)); cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); } svc->budget = 0; unit_schedule_unlock(lock, unit); } } sdom->cap = op->u.credit2.cap; } else if ( sdom->cap != 0 ) { LIST_HEAD(parked); stop_timer(&sdom->repl_timer); /* Disable budget accounting for all the units. */ for_each_sched_unit ( d, unit ) { struct csched2_unit *svc = csched2_unit(unit); spinlock_t *lock = unit_schedule_lock(unit); svc->budget = STIME_MAX; svc->budget_quota = 0; unit_schedule_unlock(lock, unit); } sdom->cap = 0; /* * We are disabling the cap for this domain, which may have * units waiting for a replenishment, so we unpark them all. * Note that, since we have already disabled budget accounting * for all the units of the domain, no currently running unit * will be added to the parked units list any longer. */ spin_lock(&sdom->budget_lock); list_splice_init(&sdom->parked_units, &parked); spin_unlock(&sdom->budget_lock); unpark_parked_units(ops, &parked); } write_unlock_irqrestore(&prv->lock, flags); break; default: rc = -EINVAL; break; } return rc; } static void cf_check csched2_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft) { struct csched2_unit *svc = csched2_unit(unit); if ( !hard ) return; /* Are we becoming exclusively pinned? 
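     *
     * (If so, we record the fact in __CSFLAG_pinned, so that units which
     * can only ever run on a single pcpu can be recognised cheaply, with a
     * flag test, rather than by re-examining their affinity mask.)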
*/ if ( cpumask_weight(hard) == 1 ) __set_bit(__CSFLAG_pinned, &svc->flags); else __clear_bit(__CSFLAG_pinned, &svc->flags); } static int cf_check csched2_sys_cntl( const struct scheduler *ops, struct xen_sysctl_scheduler_op *sc) { struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2; struct csched2_private *prv = csched2_priv(ops); unsigned long flags; switch (sc->cmd ) { case XEN_SYSCTL_SCHEDOP_putinfo: if ( params->ratelimit_us && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN )) return -EINVAL; write_lock_irqsave(&prv->lock, flags); if ( !prv->ratelimit_us && params->ratelimit_us ) printk(XENLOG_INFO "Enabling context switch rate limiting\n"); else if ( prv->ratelimit_us && !params->ratelimit_us ) printk(XENLOG_INFO "Disabling context switch rate limiting\n"); prv->ratelimit_us = params->ratelimit_us; write_unlock_irqrestore(&prv->lock, flags); /* FALLTHRU */ case XEN_SYSCTL_SCHEDOP_getinfo: params->ratelimit_us = prv->ratelimit_us; break; } return 0; } static void *cf_check csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom) { struct csched2_private *prv = csched2_priv(ops); struct csched2_dom *sdom; unsigned long flags; sdom = xzalloc(struct csched2_dom); if ( sdom == NULL ) return ERR_PTR(-ENOMEM); /* Initialize credit, cap and weight */ INIT_LIST_HEAD(&sdom->sdom_elem); sdom->dom = dom; sdom->weight = CSCHED2_DEFAULT_WEIGHT; sdom->cap = 0U; sdom->nr_units = 0; init_timer(&sdom->repl_timer, replenish_domain_budget, sdom, cpumask_any(cpupool_domain_master_cpumask(dom))); spin_lock_init(&sdom->budget_lock); INIT_LIST_HEAD(&sdom->parked_units); write_lock_irqsave(&prv->lock, flags); list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom); write_unlock_irqrestore(&prv->lock, flags); return sdom; } static void cf_check csched2_free_domdata(const struct scheduler *ops, void *data) { struct csched2_dom *sdom = data; struct csched2_private *prv = csched2_priv(ops); if ( sdom ) { unsigned long flags; kill_timer(&sdom->repl_timer); write_lock_irqsave(&prv->lock, flags); list_del_init(&sdom->sdom_elem); write_unlock_irqrestore(&prv->lock, flags); xfree(sdom); } } static void cf_check csched2_unit_insert(const struct scheduler *ops, struct sched_unit *unit) { const struct csched2_unit *svc = unit->priv; struct csched2_dom * const sdom = svc->sdom; spinlock_t *lock; ASSERT(!is_idle_unit(unit)); ASSERT(list_empty(&svc->runq_elem)); /* csched2_res_pick() expects the pcpu lock to be held */ lock = unit_schedule_lock_irq(unit); sched_set_res(unit, csched2_res_pick(ops, unit)); spin_unlock_irq(lock); lock = unit_schedule_lock_irq(unit); /* Add unit to runqueue of initial processor */ runq_assign(unit); unit_schedule_unlock_irq(lock, unit); sdom->nr_units++; SCHED_STAT_CRANK(unit_insert); CSCHED2_UNIT_CHECK(unit); } static void cf_check csched2_free_udata(const struct scheduler *ops, void *priv) { struct csched2_unit *svc = priv; xfree(svc); } static void cf_check csched2_unit_remove(const struct scheduler *ops, struct sched_unit *unit) { struct csched2_unit * const svc = csched2_unit(unit); spinlock_t *lock; ASSERT(!is_idle_unit(unit)); ASSERT(list_empty(&svc->runq_elem)); SCHED_STAT_CRANK(unit_remove); /* Remove from runqueue */ lock = unit_schedule_lock_irq(unit); runq_deassign(unit); unit_schedule_unlock_irq(lock, unit); svc->sdom->nr_units--; } /* How long should we let this unit run for? 
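 *
 * (A rough illustration, ignoring the credit-to-time weight scaling done
 * by c2t(): if snext's credit corresponds to ~5ms of runtime and the next
 * waiter's credit to ~2ms, the proposed slice is the ~3ms difference,
 * i.e., the time snext needs to burn down to its competitor's level. The
 * result is then clamped to the unit's remaining budget, if its domain is
 * capped, and to the [CSCHED2_MIN_TIMER, CSCHED2_MAX_TIMER] / ratelimit
 * bounds.)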
*/ static s_time_t csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_unit *snext, s_time_t now) { s_time_t time, min_time; int rt_credit; /* Proposed runtime measured in credits */ struct csched2_runqueue_data *rqd = c2rqd(cpu); struct list_head *runq = &rqd->runq; const struct csched2_private *prv = csched2_priv(ops); /* * If we're idle, just stay so. Others (or external events) * will poke us when necessary. */ if ( is_idle_unit(snext->unit) ) return -1; /* General algorithm: * 1) Run until snext's credit will be 0. * 2) But if someone is waiting, run until snext's credit is equal * to his. * 3) But, if we are capped, never run more than our budget. * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or * the ratelimit time. */ /* Calculate mintime */ min_time = CSCHED2_MIN_TIMER; if ( prv->ratelimit_us ) { s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us); if ( snext->unit->is_running ) ratelimit_min = snext->unit->state_entry_time + MICROSECS(prv->ratelimit_us) - now; if ( ratelimit_min > min_time ) min_time = ratelimit_min; } /* 1) Run until snext's credit will be 0. */ rt_credit = snext->credit; /* * 2) If there's someone waiting whose credit is positive, * run until your credit ~= his. */ if ( ! list_empty(runq) ) { struct csched2_unit *swait = runq_elem(runq->next); if ( ! is_idle_unit(swait->unit) && swait->credit > 0 ) { rt_credit = snext->credit - swait->credit; } } /* * The next guy on the runqueue may actually have a higher credit, * if we've tried to avoid migrating him from a different cpu. * Setting time=0 will ensure the minimum timeslice is chosen. * * FIXME: See if we can eliminate this conversion if we know time * will be outside (MIN,MAX). Probably requires pre-calculating * credit values of MIN,MAX per unit, since each unit burns credit * at a different rate. */ if ( rt_credit > 0 ) time = c2t(rqd, rt_credit, snext); else time = 0; /* * 3) But, if capped, never run more than our budget. */ if ( has_cap(snext) ) time = snext->budget < time ? snext->budget : time; /* * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or * the rate_limit time. */ if ( time < min_time ) { time = min_time; SCHED_STAT_CRANK(runtime_min_timer); } else if (time > CSCHED2_MAX_TIMER) { time = CSCHED2_MAX_TIMER; SCHED_STAT_CRANK(runtime_max_timer); } return time; } /* * Find a candidate. */ static struct csched2_unit * runq_candidate(struct csched2_runqueue_data *rqd, struct csched2_unit *scurr, int cpu, s_time_t now) { struct list_head *iter, *temp; const struct sched_resource *sr = get_sched_res(cpu); struct csched2_unit *snext = NULL; struct csched2_private *prv = csched2_priv(sr->scheduler); bool yield = false, soft_aff_preempt = false; if ( unlikely(is_idle_unit(scurr->unit)) ) { snext = scurr; goto check_runq; } yield = __test_and_clear_bit(__CSFLAG_unit_yield, &scurr->flags); /* * Return the current unit if it has executed for less than ratelimit. * Adjuststment for the selected unit's credit and decision * for how long it will run will be taken in csched2_runtime. * * Note that, if scurr is yielding, we don't let rate limiting kick in. * In fact, it may be the case that scurr is about to spin, and there's * no point forcing it to do so until rate limiting expires. 
*/ if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) && (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) ) { if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; unsigned runtime; } d; d.dom = scurr->unit->domain->domain_id; d.unit = scurr->unit->unit_id; d.runtime = now - scurr->unit->state_entry_time; __trace_var(TRC_CSCHED2_RATELIMIT, 1, sizeof(d), (unsigned char *)&d); } return scurr; } /* If scurr has a soft-affinity, let's check whether cpu is part of it */ if ( has_soft_affinity(scurr->unit) ) { affinity_balance_cpumask(scurr->unit, BALANCE_SOFT_AFFINITY, cpumask_scratch); if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) ) { cpumask_t *online = cpupool_domain_master_cpumask(scurr->unit->domain); /* Ok, is any of the pcpus in scurr soft-affinity idle? */ cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle); cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled); soft_aff_preempt = cpumask_intersects(cpumask_scratch, online); } } /* * If scurr is runnable, and this cpu is in its soft-affinity, default to * it. We also default to it, even if cpu is not in its soft-affinity, if * there aren't any idle and not tickled cpu in its soft-affinity. In * fact, we don't want to risk leaving scurr in the runq and this cpu idle * only because scurr is running outside of its soft-affinity. * * On the other hand, if cpu is not in scurr's soft-affinity, and there * looks to be better options, go for them. That happens by defaulting to * idle here, which means scurr will be preempted, put back in runq, and * one of those idle and not tickled cpus from its soft-affinity will be * tickled to pick it up. * * Finally, if scurr does not have a valid soft-affinity, we also let it * continue to run here (in fact, soft_aff_preempt will still be false, * in this case). * * Of course, we also default to idle also if scurr is not runnable. */ if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt ) snext = scurr; else snext = csched2_unit(sched_idle_unit(cpu)); check_runq: list_for_each_safe( iter, temp, &rqd->runq ) { struct csched2_unit * svc = list_entry(iter, struct csched2_unit, runq_elem); if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; } d; d.dom = svc->unit->domain->domain_id; d.unit = svc->unit->unit_id; __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1, sizeof(d), (unsigned char *)&d); } /* * If the unit in the runqueue has more credits than current (or than * idle, if current is not runnable) or if current is yielding, we may * want to pick it up. Otherwise, there's no need to keep scanning the * runqueue any further. */ if ( !yield && svc->credit <= snext->credit ) break; /* Skip non runnable units that we (temporarily) have in the runq */ if ( unlikely(!unit_runnable_state(svc->unit)) ) continue; /* Only consider vcpus that are allowed to run on this processor. */ if ( !cpumask_test_cpu(cpu, svc->unit->cpu_hard_affinity) ) continue; /* * If an unit is meant to be picked up by another processor, and such * processor has not scheduled yet, leave it in the runqueue for him. */ if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu && cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) ) { SCHED_STAT_CRANK(deferred_to_tickled_cpu); continue; } /* * If this is on a different processor, don't pull it unless * its credit is at least CSCHED2_MIGRATE_RESIST higher. 
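     *
     * (This "migration resistance" is deliberate: bouncing a unit between
     * pcpus over a marginal credit difference would throw away cache
     * warmth for very little fairness gain.)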
*/ if ( sched_unit_master(svc->unit) != cpu && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit ) { SCHED_STAT_CRANK(migrate_resisted); continue; } /* * If we are here, we are almost sure we want to pick the unit in * the runqueue. Last thing we need to check is that it either is * not capped or, if it is, it has some budget. * * Note that budget availability must be the very last check that * we do in this loop, due to the side effects that unit_grab_budget() * causes. * * In fact, if there is budget available in the unit's domain's * budget pool, the function will pick some for running this unit. * And we clearly want to do that only if we're otherwise sure that * the unit will actually run, consume it, and return the leftover * (if any) in the usual way. */ if ( has_cap(svc) && !unit_grab_budget(svc) ) continue; /* In any case, if we got this far, break. */ snext = svc; break; } if ( unlikely(tb_init_done) ) { struct { unsigned unit:16, dom:16; unsigned tickled_cpu; int credit; } d; d.dom = snext->unit->domain->domain_id; d.unit = snext->unit->unit_id; d.credit = snext->credit; d.tickled_cpu = snext->tickled_cpu; __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1, sizeof(d), (unsigned char *)&d); } if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) ) SCHED_STAT_CRANK(tickled_cpu_overridden); /* * If snext is from a capped domain, it must have budget (or it * wouldn't have been in the runq). If it is not, it'd be STIME_MAX, * which still is >= 0. */ ASSERT(snext->budget >= 0); return snext; } /* * This function is in the critical path. It is designed to be simple and * fast for the common case. */ static void cf_check csched2_schedule( const struct scheduler *ops, struct sched_unit *currunit, s_time_t now, bool tasklet_work_scheduled) { const unsigned int cur_cpu = smp_processor_id(); const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); struct csched2_runqueue_data *rqd; struct csched2_unit * const scurr = csched2_unit(currunit); struct csched2_unit *snext = NULL; bool tickled; bool migrated = false; SCHED_STAT_CRANK(schedule); CSCHED2_UNIT_CHECK(currunit); BUG_ON(!cpumask_test_cpu(sched_cpu, &csched2_priv(ops)->initialized)); rqd = c2rqd(sched_cpu); BUG_ON(!cpumask_test_cpu(sched_cpu, &rqd->active)); ASSERT(spin_is_locked(get_sched_res(sched_cpu)->schedule_lock)); BUG_ON(!is_idle_unit(currunit) && scurr->rqd != rqd); /* Clear "tickled" bit now that we've been scheduled */ tickled = cpumask_test_cpu(sched_cpu, &rqd->tickled); if ( tickled ) { __cpumask_clear_cpu(sched_cpu, &rqd->tickled); cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); } if ( unlikely(tb_init_done) ) { struct { unsigned cpu:16, rq_id:16; unsigned tasklet:8, idle:8, smt_idle:8, tickled:8; } d; d.cpu = cur_cpu; d.rq_id = c2r(sched_cpu); d.tasklet = tasklet_work_scheduled; d.idle = is_idle_unit(currunit); d.smt_idle = cpumask_test_cpu(sched_cpu, &rqd->smt_idle); d.tickled = tickled; __trace_var(TRC_CSCHED2_SCHEDULE, 1, sizeof(d), (unsigned char *)&d); } /* Update credits (and budget, if necessary). */ burn_credits(rqd, scurr, now); /* * Below 0, means that we are capped and we have overrun our budget. * Let's try to get some more but, if we fail (e.g., because of the * other running units), we will be parked. */ if ( unlikely(scurr->budget <= 0) ) unit_grab_budget(scurr); /* * Select next runnable local UNIT (ie top of local runq). 
* * If the current unit is runnable, and has higher credit than * the next guy on the queue (or there is noone else), we want to * run him again. * * If there's tasklet work to do, we want to chose the idle unit * for this processor, and mark the current for delayed runqueue * add. * * If the current unit is runnable, and there's another runnable * candidate, we want to mark current for delayed runqueue add, * and remove the next guy from the queue. * * If the current unit is not runnable, we want to chose the idle * unit for this processor. */ if ( tasklet_work_scheduled ) { __clear_bit(__CSFLAG_unit_yield, &scurr->flags); trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL); snext = csched2_unit(sched_idle_unit(sched_cpu)); } else snext = runq_candidate(rqd, scurr, sched_cpu, now); /* If switching from a non-idle runnable unit, put it * back on the runqueue. */ if ( snext != scurr && !is_idle_unit(currunit) && unit_runnable(currunit) ) __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); /* Accounting for non-idle tasks */ if ( !is_idle_unit(snext->unit) ) { int top_credit; /* If switching, remove this from the runqueue and mark it scheduled */ if ( snext != scurr ) { ASSERT(snext->rqd == rqd); ASSERT(!snext->unit->is_running); runq_remove(snext); __set_bit(__CSFLAG_scheduled, &snext->flags); } else update_load(ops, rqd, snext, 0, now); /* Clear the idle mask if necessary */ if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) { __cpumask_clear_cpu(sched_cpu, &rqd->idle); smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); } /* * The reset condition is "has a scheduler epoch come to an end?". * The way this is enforced is checking whether the unit at the top * of the runqueue has negative credits. This means the epochs have * variable length, as in one epoch expores when: * 1) the unit at the top of the runqueue has executed for * around 10 ms (with default parameters); * 2) no other unit with higher credits wants to run. * * Here, where we want to check for reset, we need to make sure the * proper unit is being used. In fact, runq_candidate() may have not * returned the first unit in the runqueue, for various reasons * (e.g., affinity). Only trigger a reset when it does. */ if ( list_empty(&rqd->runq) ) top_credit = snext->credit; else top_credit = max(snext->credit, runq_elem(rqd->runq.next)->credit); if ( top_credit <= CSCHED2_CREDIT_RESET ) { reset_credit(sched_cpu, now, snext); balance_load(ops, sched_cpu, now); } snext->start_time = now; snext->tickled_cpu = -1; /* Safe because lock for old processor is held */ if ( sched_unit_master(snext->unit) != sched_cpu ) { snext->credit += CSCHED2_MIGRATE_COMPENSATION; sched_set_res(snext->unit, get_sched_res(sched_cpu)); SCHED_STAT_CRANK(migrated); migrated = true; } } else { /* * Update the idle mask if necessary. Note that, if we're scheduling * idle in order to carry on some tasklet work, we want to play busy! */ if ( tasklet_work_scheduled ) { if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) { __cpumask_clear_cpu(sched_cpu, &rqd->idle); smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); } } else if ( !cpumask_test_cpu(sched_cpu, &rqd->idle) ) { __cpumask_set_cpu(sched_cpu, &rqd->idle); cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); } /* Make sure avgload gets updated periodically even * if there's no activity */ update_load(ops, rqd, NULL, 0, now); } /* * Return task to run next... 
*/ currunit->next_time = csched2_runtime(ops, sched_cpu, snext, now); currunit->next_task = snext->unit; snext->unit->migrated = migrated; CSCHED2_UNIT_CHECK(currunit->next_task); } static void csched2_dump_unit(const struct csched2_private *prv, const struct csched2_unit *svc) { printk("[%i.%i] flags=%x cpu=%i", svc->unit->domain->domain_id, svc->unit->unit_id, svc->flags, sched_unit_master(svc->unit)); printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); if ( has_cap(svc) ) printk(" budget=%"PRI_stime"(%"PRI_stime")", svc->budget, svc->budget_quota); printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload, (svc->avgload * 100) >> prv->load_precision_shift); printk("\n"); } static inline void dump_pcpu(const struct scheduler *ops, int cpu) { const struct csched2_private *prv = csched2_priv(ops); const struct csched2_unit *svc; printk("CPU[%02d] runq=%d, sibling={%*pbl}, core={%*pbl}\n", cpu, c2r(cpu), CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); /* current UNIT (nothing to say if that's the idle unit) */ svc = csched2_unit(curr_on_cpu(cpu)); if ( svc && !is_idle_unit(svc->unit) ) { printk("\trun: "); csched2_dump_unit(prv, svc); } } static void cf_check csched2_dump(const struct scheduler *ops) { struct list_head *iter_sdom; struct csched2_private *prv = csched2_priv(ops); unsigned long flags; unsigned int j, loop; struct csched2_runqueue_data *rqd; /* * We need the private scheduler lock as we access global * scheduler data and (below) the list of active domains. */ read_lock_irqsave(&prv->lock, flags); printk("Active queues: %d\n" "\tdefault-weight = %d\n", prv->active_queues, CSCHED2_DEFAULT_WEIGHT); list_for_each_entry ( rqd, &prv->rql, rql ) { s_time_t fraction; fraction = (rqd->avgload * 100) >> prv->load_precision_shift; printk("Runqueue %d:\n" "\tncpus = %u\n" "\tcpus = %*pbl\n" "\tmax_weight = %u\n" "\tpick_bias = %u\n" "\tinstload = %d\n" "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n", rqd->id, rqd->nr_cpus, CPUMASK_PR(&rqd->active), rqd->max_weight, rqd->pick_bias, rqd->load, rqd->avgload, fraction); printk("\tidlers: %*pb\n" "\ttickled: %*pb\n" "\tfully idle cores: %*pb\n", CPUMASK_PR(&rqd->idle), CPUMASK_PR(&rqd->tickled), CPUMASK_PR(&rqd->smt_idle)); } printk("Domain info:\n"); loop = 0; list_for_each( iter_sdom, &prv->sdom ) { const struct csched2_dom *sdom; const struct sched_unit *unit; sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem); printk("\tDomain: %d w %d c %u v %d\n", sdom->dom->domain_id, sdom->weight, sdom->cap, sdom->nr_units); for_each_sched_unit ( sdom->dom, unit ) { struct csched2_unit * const svc = csched2_unit(unit); spinlock_t *lock; lock = unit_schedule_lock(unit); printk("\t%3d: ", ++loop); csched2_dump_unit(prv, svc); unit_schedule_unlock(lock, unit); } } list_for_each_entry ( rqd, &prv->rql, rql ) { struct list_head *iter, *runq = &rqd->runq; int loop = 0; /* We need the lock to scan the runqueue. 
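         *
         * (A plain spin_lock() is enough here: interrupts are already
         * disabled, since we took prv->lock with read_lock_irqsave()
         * above.)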
*/ spin_lock(&rqd->lock); printk("Runqueue %d:\n", rqd->id); for_each_cpu(j, &rqd->active) dump_pcpu(ops, j); printk("RUNQ:\n"); list_for_each( iter, runq ) { const struct csched2_unit *svc = runq_elem(iter); if ( svc ) { printk("\t%3d: ", loop++); csched2_dump_unit(prv, svc); } } spin_unlock(&rqd->lock); } read_unlock_irqrestore(&prv->lock, flags); } static void *cf_check csched2_alloc_pdata(const struct scheduler *ops, int cpu) { struct csched2_pcpu *spc; struct csched2_runqueue_data *rqd; BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID); spc = xzalloc(struct csched2_pcpu); if ( spc == NULL ) return ERR_PTR(-ENOMEM); rqd = cpu_add_to_runqueue(ops, cpu); if ( IS_ERR(rqd) ) { xfree(spc); return rqd; } spc->rqd = rqd; return spc; } /* * Do what's necessary to add cpu to the rqd (including activating the * runqueue, if this is the first CPU we put in it). */ static void init_cpu_runqueue(struct csched2_private *prv, struct csched2_pcpu *spc, unsigned int cpu, struct csched2_runqueue_data *rqd) { unsigned int rcpu; ASSERT(rqd && !cpumask_test_cpu(cpu, &spc->rqd->active)); printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqd->id); if ( !rqd->nr_cpus ) { printk(XENLOG_INFO " First cpu on runqueue, activating\n"); BUG_ON(!cpumask_empty(&rqd->active)); rqd->max_weight = 1; INIT_LIST_HEAD(&rqd->svc); INIT_LIST_HEAD(&rqd->runq); spin_lock_init(&rqd->lock); prv->active_queues++; } __cpumask_set_cpu(cpu, &spc->sibling_mask); if ( rqd->nr_cpus > 0 ) for_each_cpu ( rcpu, per_cpu(cpu_sibling_mask, cpu) ) if ( cpumask_test_cpu(rcpu, &rqd->active) ) { __cpumask_set_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); __cpumask_set_cpu(rcpu, &spc->sibling_mask); } __cpumask_set_cpu(cpu, &rqd->idle); __cpumask_set_cpu(cpu, &rqd->active); __cpumask_set_cpu(cpu, &prv->initialized); __cpumask_set_cpu(cpu, &rqd->smt_idle); rqd->nr_cpus++; ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); if ( rqd->nr_cpus == 1 ) rqd->pick_bias = cpu; } /* Returns a pointer to the runqueue the cpu is assigned to. */ static struct csched2_runqueue_data * init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc, unsigned int cpu) { struct csched2_runqueue_data *rqd; ASSERT(rw_is_write_locked(&prv->lock)); ASSERT(!cpumask_test_cpu(cpu, &prv->initialized)); /* CPU data needs to be allocated, but still uninitialized. */ ASSERT(spc); rqd = spc->rqd; init_cpu_runqueue(prv, spc, cpu, rqd); return rqd; } /* Change the scheduler of cpu to us (Credit2). */ static spinlock_t *cf_check csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu, void *pdata, void *vdata) { struct csched2_private *prv = csched2_priv(new_ops); struct csched2_unit *svc = vdata; struct csched2_runqueue_data *rqd; ASSERT(pdata && svc && is_idle_unit(svc->unit)); /* * We own one runqueue lock already (from schedule_cpu_switch()). This * looks like it violates this scheduler's locking rules, but it does * not, as what we own is the lock of another scheduler, that hence has * no particular (ordering) relationship with our private global lock. * And owning exactly that one (the lock of the old scheduler of this * cpu) is what is necessary to prevent races. */ ASSERT(!local_irq_is_enabled()); write_lock(&prv->lock); sched_idle_unit(cpu)->priv = vdata; rqd = init_pdata(prv, pdata, cpu); /* * Now that we know what runqueue we'll go in, double check what's said * above: the lock we already hold is not the one of this runqueue of * this scheduler, and so it's safe to have taken it /before/ our * private global lock. 
*/ ASSERT(get_sched_res(cpu)->schedule_lock != &rqd->lock); write_unlock(&prv->lock); return &rqd->lock; } static void cf_check csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) { unsigned long flags; struct csched2_private *prv = csched2_priv(ops); struct csched2_runqueue_data *rqd; struct csched2_pcpu *spc = pcpu; unsigned int rcpu; write_lock_irqsave(&prv->lock, flags); /* * Scheduler specific data for this pCPU must still be there and and be * valid. In fact, if we are here: * 1. alloc_pdata must have been called for this cpu, and free_pdata * must not have been called on it before us, * 2. init_pdata must have been called on this cpu, and deinit_pdata * (us!) must not have been called on it already. */ ASSERT(spc && spc->rqd); ASSERT(cpumask_test_cpu(cpu, &prv->initialized)); /* Find the old runqueue and remove this cpu from it */ rqd = spc->rqd; /* No need to save IRQs here, they're already disabled */ spin_lock(&rqd->lock); printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqd->id); __cpumask_clear_cpu(cpu, &rqd->idle); __cpumask_clear_cpu(cpu, &rqd->smt_idle); __cpumask_clear_cpu(cpu, &rqd->active); __cpumask_clear_cpu(cpu, &rqd->tickled); for_each_cpu ( rcpu, &rqd->active ) __cpumask_clear_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); rqd->nr_cpus--; ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); if ( rqd->nr_cpus == 0 ) { printk(XENLOG_INFO " No cpus left on runqueue, disabling\n"); BUG_ON(!cpumask_empty(&rqd->active)); prv->active_queues--; } else if ( rqd->pick_bias == cpu ) rqd->pick_bias = cpumask_first(&rqd->active); spin_unlock(&rqd->lock); __cpumask_clear_cpu(cpu, &prv->initialized); write_unlock_irqrestore(&prv->lock, flags); return; } static void cf_check csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) { struct csched2_private *prv = csched2_priv(ops); struct csched2_pcpu *spc = pcpu; struct csched2_runqueue_data *rqd; unsigned long flags; if ( !spc ) return; write_lock_irqsave(&prv->lock, flags); rqd = spc->rqd; ASSERT(rqd && rqd->refcnt); ASSERT(!cpumask_test_cpu(cpu, &prv->initialized)); rqd->refcnt--; if ( !rqd->refcnt ) list_del(&rqd->rql); else rqd = NULL; write_unlock_irqrestore(&prv->lock, flags); xfree(rqd); xfree(pcpu); } static int __init cf_check csched2_global_init(void) { if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN ) { printk("WARNING: %s: opt_load_precision_shift %u below min %d, resetting\n", __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN); opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN; } if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT ) { printk("WARNING: %s: opt_load_window_shift %u too short, resetting\n", __func__, opt_load_window_shift); opt_load_window_shift = LOADAVG_WINDOW_SHIFT; } if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER ) { printk("WARNING: %s: opt_cap_period %u too small, resetting\n", __func__, opt_cap_period); opt_cap_period = 10; /* ms */ } return 0; } static int cf_check csched2_init(struct scheduler *ops) { struct csched2_private *prv; printk("Initializing Credit2 scheduler\n"); printk(XENLOG_INFO " load_precision_shift: %d\n" XENLOG_INFO " load_window_shift: %d\n" XENLOG_INFO " underload_balance_tolerance: %d\n" XENLOG_INFO " overload_balance_tolerance: %d\n" XENLOG_INFO " runqueues arrangement: %s\n" XENLOG_INFO " cap enforcement granularity: %dms\n", opt_load_precision_shift, opt_load_window_shift, opt_underload_balance_tolerance, opt_overload_balance_tolerance, opt_runqueue_str[opt_runqueue], opt_cap_period); 
    printk(XENLOG_INFO "load tracking window length %llu ns\n",
           1ULL << opt_load_window_shift);

    /*
     * Basically no CPU information is available at this point; just
     * set up basic structures, and a callback when the CPU info is
     * available.
     */

    prv = xzalloc(struct csched2_private);
    if ( prv == NULL )
        return -ENOMEM;
    ops->sched_data = prv;

    rwlock_init(&prv->lock);
    INIT_LIST_HEAD(&prv->rql);
    INIT_LIST_HEAD(&prv->sdom);

    /* initialize ratelimit */
    prv->ratelimit_us = sched_ratelimit_us;

    prv->load_precision_shift = opt_load_precision_shift;
    prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT;
    ASSERT(opt_load_window_shift > 0);

    return 0;
}

static void cf_check csched2_deinit(struct scheduler *ops)
{
    struct csched2_private *prv;

    prv = csched2_priv(ops);
    ops->sched_data = NULL;
    xfree(prv);
}

static const struct scheduler sched_credit2_def = {
    .name           = "SMP Credit Scheduler rev2",
    .opt_name       = "credit2",
    .sched_id       = XEN_SCHEDULER_CREDIT2,
    .sched_data     = NULL,

    .global_init    = csched2_global_init,

    .insert_unit    = csched2_unit_insert,
    .remove_unit    = csched2_unit_remove,

    .sleep          = csched2_unit_sleep,
    .wake           = csched2_unit_wake,
    .yield          = csched2_unit_yield,

    .adjust         = csched2_dom_cntl,
    .adjust_affinity = csched2_aff_cntl,
    .adjust_global  = csched2_sys_cntl,

    .pick_resource  = csched2_res_pick,
    .migrate        = csched2_unit_migrate,
    .do_schedule    = csched2_schedule,
    .context_saved  = csched2_context_saved,

    .dump_settings  = csched2_dump,
    .init           = csched2_init,
    .deinit         = csched2_deinit,
    .alloc_udata    = csched2_alloc_udata,
    .free_udata     = csched2_free_udata,
    .alloc_pdata    = csched2_alloc_pdata,
    .deinit_pdata   = csched2_deinit_pdata,
    .free_pdata     = csched2_free_pdata,
    .switch_sched   = csched2_switch_sched,
    .alloc_domdata  = csched2_alloc_domdata,
    .free_domdata   = csched2_free_domdata,
};

REGISTER_SCHEDULER(sched_credit2_def);
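
/*
 * Typical usage (illustrative only -- refer to the Xen command line and
 * xl documentation for the authoritative syntax):
 *
 *  - boot the hypervisor with "sched=credit2" to make this scheduler the
 *    one used by the default cpupool (see .opt_name above);
 *  - per-domain weight and cap go through the domctl/adjust path handled
 *    by csched2_dom_cntl(), e.g. via something like
 *    "xl sched-credit2 -d <domain> -w <weight> -c <cap>";
 *  - the context switch ratelimit is a scheduler-wide knob, handled by
 *    csched2_sys_cntl() via the sysctl/adjust_global path.
 */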