/******************************************************************************
 * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
 * EDF scheduling is a real-time scheduling algorithm used in the embedded
 * field.
 *
 * by Sisu Xi, 2013, Washington University in Saint Louis
 *    Meng Xu, 2014-2016, University of Pennsylvania
 *
 * Conversion toward event driven model by Tianyang Chen
 * and Dagaen Golomb, 2016, University of Pennsylvania
 *
 * Based on the code of the Credit scheduler.
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>
#include <xen/trace.h>
#include <xen/cpu.h>
#include <xen/keyhandler.h>
#include <xen/err.h>
#include <xen/guest_access.h>

#include "private.h"

/*
 * TODO:
 *
 * Migration compensation and resistance, like credit2, to make better use
 * of the caches;
 * Lock holder problem: use yield?
 * Self-switch problem: UNITs of the same domain may preempt each other;
 */

/*
 * Design:
 *
 * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
 * theory from the real-time field.
 * At any scheduling point, the UNIT with the earlier deadline has the higher
 * priority. The scheduler always picks the highest-priority UNIT to run on a
 * feasible PCPU.
 * A PCPU is feasible if the UNIT can run on this PCPU and (the PCPU is idle
 * or has a lower-priority UNIT running on it).
 *
 * Each UNIT has a dedicated period, a budget and an extratime flag.
 * The deadline of a UNIT is at the end of each period;
 * A UNIT has its budget replenished at the beginning of each period;
 * While scheduled, a UNIT burns its budget.
 * The UNIT needs to finish its budget before its deadline in each period;
 * The UNIT discards its unused budget at the end of each period.
 * When a UNIT runs out of budget in a period, if its extratime flag is set,
 * the UNIT increases its priority_level by 1 and refills its budget;
 * otherwise, it has to wait until the next period.
 *
 * Each UNIT is implemented as a deferrable server.
 * When a UNIT has a task running on it, its budget is continuously burned;
 * When a UNIT has no task but still has budget left, its budget is preserved.
 *
 * Queue scheme:
 * There is a global runqueue and a global depletedqueue per CPU pool.
 * The runqueue holds all runnable UNITs with budget,
 * sorted by priority_level and deadline;
 * The depletedqueue holds all UNITs without budget, unsorted;
 *
 * Note: cpumask and cpupool are supported.
 */

/*
 * Locking:
 * A global system lock is used to protect the RunQ and DepletedQ.
 * The global lock is referenced by sched_res->schedule_lock
 * from all physical cpus.
 *
 * The lock is already grabbed when calling the wake/sleep/schedule functions
 * in schedule.c.
 *
 * The functions that involve the RunQ and need to grab the lock are:
 *    unit_insert, unit_remove, context_saved, runq_insert
 */

/*
 * Default parameters:
 * The default period and budget are 10 ms and 4 ms, respectively.
 */
#define RTDS_DEFAULT_PERIOD     (MICROSECS(10000))
#define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))

/*
 * Max period: max delta of the time type, because the period is added to the
 * time a unit activates, so this must not overflow.
 * Min period: 10 us, considering the scheduling overhead (when the period is
 * too low, scheduling is invoked too frequently, causing high overhead).
 */
#define RTDS_MAX_PERIOD     (STIME_DELTA_MAX)
#define RTDS_MIN_PERIOD     (MICROSECS(10))

/*
 * Min budget: 10 us, considering the scheduling overhead (when the budget is
 * consumed too fast, scheduling is invoked too frequently, causing
 * high overhead).
 */
#define RTDS_MIN_BUDGET     (MICROSECS(10))
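
/*
 * Illustrative example (added for clarity, not part of the original
 * comments): with the default parameters above, a UNIT gets 4 ms of budget
 * every 10 ms. Assume it becomes active at t = 0 ms:
 *
 *   t = 0 ms   deadline = 10 ms, cur_budget = 4 ms
 *   t = 4 ms   budget exhausted; without the extratime flag the UNIT waits
 *              for the next replenishment, with it the UNIT keeps running,
 *              refilled, at priority_level 1
 *   t = 10 ms  replenishment: deadline = 20 ms, cur_budget = 4 ms,
 *              priority_level reset to 0
 *
 * The numbers are hypothetical and only meant to show how period, budget,
 * deadline and priority_level interact.
 */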
/*
 * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
 * the next deadline, performing addition could be faster if the difference
 * between cur_deadline and now is small. If the difference is bigger than
 * 1024 * period, use multiplication.
 */
#define UPDATE_LIMIT_SHIFT      10

/*
 * Flags
 */
/*
 * RTDS_scheduled: Is this unit either running on, or context-switching off,
 * a physical cpu?
 * + Accessed only with the global lock held.
 * + Set when chosen as next in rt_schedule().
 * + Cleared after the context switch has been saved in rt_context_saved().
 * + Checked in unit_wake to see if we can add the unit to the RunQueue, or
 *   if we should set RTDS_delayed_runq_add.
 * + Checked to be false in runq_insert.
 */
#define __RTDS_scheduled            1
#define RTDS_scheduled (1<<__RTDS_scheduled)
/*
 * RTDS_delayed_runq_add: Do we need to add this unit to the RunQ/DepletedQ
 * once it's done being context switched out?
 * + Set when scheduling out in rt_schedule() if prev is runnable.
 * + Set in rt_unit_wake if it finds RTDS_scheduled set.
 * + Read in rt_context_saved(). If set, it adds prev to the RunQ/DepletedQ
 *   and clears the bit.
 */
#define __RTDS_delayed_runq_add     2
#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)

/*
 * RTDS_depleted: Has this unit run out of budget?
 * This flag is
 * + set in burn_budget() if a unit has zero budget left;
 * + cleared and checked in the replenishment handler,
 *   for the units that are being replenished.
 */
#define __RTDS_depleted     3
#define RTDS_depleted (1<<__RTDS_depleted)

/*
 * RTDS_extratime: Can the unit run in the time that is
 * not part of any real-time reservation, and would therefore
 * be otherwise left idle?
 */
#define __RTDS_extratime    4
#define RTDS_extratime (1<<__RTDS_extratime)

/*
 * rt tracing events ("only" 512 available!). Check
 * include/public/trace.h for more details.
 */
#define TRC_RTDS_TICKLE           TRC_SCHED_CLASS_EVT(RTDS, 1)
#define TRC_RTDS_RUNQ_PICK        TRC_SCHED_CLASS_EVT(RTDS, 2)
#define TRC_RTDS_BUDGET_BURN      TRC_SCHED_CLASS_EVT(RTDS, 3)
#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
#define TRC_RTDS_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(RTDS, 5)
#define TRC_RTDS_SCHEDULE         TRC_SCHED_CLASS_EVT(RTDS, 6)

static void cf_check repl_timer_handler(void *data);

/*
 * System-wide private data, including the global RunQueue/DepletedQ.
 * The global lock is referenced by sched_res->schedule_lock from all
 * physical cpus. It can be grabbed via unit_schedule_lock_irq().
 */
struct rt_private {
    spinlock_t lock;            /* the global coarse-grained lock */
    struct list_head sdom;      /* list of available domains, used for dump */
    struct list_head runq;      /* ordered list of runnable units */
    struct list_head depletedq; /* unordered list of depleted units */

    struct timer repl_timer;    /* replenishment timer */
    struct list_head replq;     /* ordered list of units needing replenishment */

    cpumask_t tickled;          /* cpus that have been tickled */
};

/*
 * Scheduling unit (virtual CPU)
 */
struct rt_unit {
    struct list_head q_elem;     /* on the runq/depletedq list */
    struct list_head replq_elem; /* on the replenishment events list */

    /* UNIT parameters, in nanoseconds */
    s_time_t period;
    s_time_t budget;

    /* UNIT current information, in nanoseconds */
    s_time_t cur_budget;   /* current budget */
    s_time_t last_start;   /* last start time */
    s_time_t cur_deadline; /* current deadline for EDF */

    /* Up-pointers */
    struct rt_dom *sdom;
    struct sched_unit *unit;

    unsigned priority_level;

    unsigned flags;        /* mark __RTDS_scheduled, etc. */
};
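
/*
 * Illustrative timeline (added for clarity): a running unit has
 * RTDS_scheduled set. It blocks, rt_schedule() switches to another unit,
 * but RTDS_scheduled stays set until its context switch is saved. If the
 * unit wakes up in that window, rt_unit_wake() finds RTDS_scheduled set
 * and only sets RTDS_delayed_runq_add. Later, rt_context_saved() clears
 * RTDS_scheduled, notices RTDS_delayed_runq_add, clears it, and inserts
 * the unit back into the RunQ/DepletedQ. This is a sketch of the flag
 * protocol documented above; the functions themselves are authoritative.
 */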
/*
 * Domain
 */
struct rt_dom {
    struct list_head sdom_elem; /* link on rt_private's sdom list */
    struct domain *dom;         /* pointer to upper domain */
};

/*
 * Useful inline functions
 */
static inline struct rt_private *rt_priv(const struct scheduler *ops)
{
    return ops->sched_data;
}

static inline struct rt_unit *rt_unit(const struct sched_unit *unit)
{
    return unit->priv;
}

static inline struct list_head *rt_runq(const struct scheduler *ops)
{
    return &rt_priv(ops)->runq;
}

static inline struct list_head *rt_depletedq(const struct scheduler *ops)
{
    return &rt_priv(ops)->depletedq;
}

static inline struct list_head *rt_replq(const struct scheduler *ops)
{
    return &rt_priv(ops)->replq;
}

static inline bool has_extratime(const struct rt_unit *svc)
{
    return svc->flags & RTDS_extratime;
}

/*
 * Helper functions for manipulating the runqueue, the depleted queue,
 * and the replenishment events queue.
 */
static int unit_on_q(const struct rt_unit *svc)
{
    return !list_empty(&svc->q_elem);
}

static struct rt_unit *cf_check q_elem(struct list_head *elem)
{
    return list_entry(elem, struct rt_unit, q_elem);
}

static struct rt_unit *cf_check replq_elem(struct list_head *elem)
{
    return list_entry(elem, struct rt_unit, replq_elem);
}

static int unit_on_replq(const struct rt_unit *svc)
{
    return !list_empty(&svc->replq_elem);
}

/*
 * If v1's priority is higher than v2's, the return value is > 0;
 * if it is lower, the return value is < 0.
 */
static s_time_t
compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2)
{
    int prio = v2->priority_level - v1->priority_level;

    if ( prio == 0 )
        return v2->cur_deadline - v1->cur_deadline;

    return prio;
}
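
/*
 * Illustrative example (added for clarity): take two units at the same
 * priority_level, one with cur_deadline = 20 ms and one with
 * cur_deadline = 30 ms. compare_unit_priority() returns 30 ms - 20 ms > 0
 * for the first unit, i.e. the earlier deadline wins, as EDF requires.
 * A unit that has exhausted its budget and runs on extratime has
 * priority_level 1 and therefore always loses against a unit still inside
 * its reservation (priority_level 0), regardless of deadlines.
 * The numbers are hypothetical.
 */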
/*
 * Debug related code: dump unit/cpu information.
 */
static void rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc)
{
    cpumask_t *cpupool_mask, *mask;

    ASSERT(svc != NULL);
    /* idle unit */
    if ( svc->sdom == NULL )
    {
        printk("\n");
        return;
    }

    /*
     * We can't just use 'cpumask_scratch' because the dumping can
     * happen from a pCPU outside of this scheduler's cpupool, and
     * hence it's not right to use its pCPU's scratch mask.
     * On the other hand, it is safe to use sched_unit_master(svc->unit)'s
     * own scratch space, since we hold the runqueue lock.
     */
    mask = cpumask_scratch_cpu(sched_unit_master(svc->unit));

    cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain);
    cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity);
    printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
           " \t\t priority_level=%d has_extratime=%d\n"
           " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n",
           svc->unit->domain->domain_id,
           svc->unit->unit_id,
           sched_unit_master(svc->unit),
           svc->period,
           svc->budget,
           svc->cur_budget,
           svc->cur_deadline,
           svc->last_start,
           svc->priority_level,
           has_extratime(svc),
           unit_on_q(svc),
           unit_runnable(svc->unit),
           svc->flags, CPUMASK_PR(mask));
}

static void cf_check rt_dump_pcpu(const struct scheduler *ops, int cpu)
{
    struct rt_private *prv = rt_priv(ops);
    const struct rt_unit *svc;
    unsigned long flags;

    spin_lock_irqsave(&prv->lock, flags);
    printk("CPU[%02d]\n", cpu);
    /* current UNIT (nothing to say if that's the idle unit) */
    svc = rt_unit(curr_on_cpu(cpu));
    if ( svc && !is_idle_unit(svc->unit) )
    {
        rt_dump_unit(ops, svc);
    }
    spin_unlock_irqrestore(&prv->lock, flags);
}

static void cf_check rt_dump(const struct scheduler *ops)
{
    struct list_head *runq, *depletedq, *replq, *iter;
    struct rt_private *prv = rt_priv(ops);
    const struct rt_unit *svc;
    const struct rt_dom *sdom;
    unsigned long flags;

    spin_lock_irqsave(&prv->lock, flags);

    if ( list_empty(&prv->sdom) )
        goto out;

    runq = rt_runq(ops);
    depletedq = rt_depletedq(ops);
    replq = rt_replq(ops);

    printk("Global RunQueue info:\n");
    list_for_each ( iter, runq )
    {
        svc = q_elem(iter);
        rt_dump_unit(ops, svc);
    }

    printk("Global DepletedQueue info:\n");
    list_for_each ( iter, depletedq )
    {
        svc = q_elem(iter);
        rt_dump_unit(ops, svc);
    }

    printk("Global Replenishment Events info:\n");
    list_for_each ( iter, replq )
    {
        svc = replq_elem(iter);
        rt_dump_unit(ops, svc);
    }

    printk("Domain info:\n");
    list_for_each ( iter, &prv->sdom )
    {
        const struct sched_unit *unit;

        sdom = list_entry(iter, struct rt_dom, sdom_elem);
        printk("\tdomain: %d\n", sdom->dom->domain_id);

        for_each_sched_unit ( sdom->dom, unit )
        {
            svc = rt_unit(unit);
            rt_dump_unit(ops, svc);
        }
    }

 out:
    spin_unlock_irqrestore(&prv->lock, flags);
}

/*
 * Update the deadline and budget when now >= cur_deadline:
 * the deadline needs to be updated to the deadline of the current period.
 */
static void
rt_update_deadline(s_time_t now, struct rt_unit *svc)
{
    ASSERT(now >= svc->cur_deadline);
    ASSERT(svc->period != 0);

    if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
    {
        do
            svc->cur_deadline += svc->period;
        while ( svc->cur_deadline <= now );
    }
    else
    {
        long count = ((now - svc->cur_deadline) / svc->period) + 1;
        svc->cur_deadline += count * svc->period;
    }

    /*
     * svc may be scheduled to run immediately after it misses its deadline.
     * Then rt_update_deadline is called before rt_schedule, which should
     * only deduct the time spent in the current period from the budget.
     */
    svc->last_start = now;
    svc->cur_budget = svc->budget;
    svc->priority_level = 0;

    /* TRACE */
    {
        struct __packed {
            unsigned unit:16, dom:16;
            unsigned priority_level;
            uint64_t cur_deadline, cur_budget;
        } d;

        d.dom = svc->unit->domain->domain_id;
        d.unit = svc->unit->unit_id;
        d.priority_level = svc->priority_level;
        d.cur_deadline = (uint64_t) svc->cur_deadline;
        d.cur_budget = (uint64_t) svc->cur_budget;
        trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
                  sizeof(d),
                  (unsigned char *) &d);
    }

    return;
}
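
/*
 * Worked example (added for clarity, numbers hypothetical): with a period
 * of 10 ms, the threshold above is 10 ms << 10 = 10.24 s. If a unit has
 * been asleep for 50 ms past its old deadline, the fast path simply adds
 * the period a handful of times. If it has been asleep for an hour, the
 * difference exceeds the threshold and the slow path computes the number
 * of whole periods with one division instead of ~360000 additions.
 */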
/*
 * Helpers for removing and inserting a unit in a queue
 * that is being kept ordered by the units' deadlines (as EDF
 * mandates).
 *
 * For callers' convenience, the unit removing helper returns
 * true if the unit removed was the one at the front of the
 * queue; similarly, the inserting helper returns true if the
 * inserted unit ended up at the front of the queue (i.e., in both
 * cases, if the unit with the earliest deadline is what we
 * are dealing with).
 */
static inline bool
deadline_queue_remove(struct list_head *queue, struct list_head *elem)
{
    bool first = false;

    if ( queue->next != elem )
        first = true;

    list_del_init(elem);

    return !first;
}

static inline bool
deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *),
                      struct rt_unit *svc, struct list_head *elem,
                      struct list_head *queue)
{
    struct list_head *iter;
    bool first = true;

    list_for_each ( iter, queue )
    {
        const struct rt_unit *iter_svc = (*qelem)(iter);

        if ( compare_unit_priority(svc, iter_svc) > 0 )
            break;
        first = false;
    }

    list_add_tail(elem, iter);

    return first;
}
#define deadline_runq_insert(...) \
  deadline_queue_insert(&q_elem, ##__VA_ARGS__)
#define deadline_replq_insert(...) \
  deadline_queue_insert(&replq_elem, ##__VA_ARGS__)

static inline void
q_remove(struct rt_unit *svc)
{
    ASSERT( unit_on_q(svc) );
    list_del_init(&svc->q_elem);
}

static inline void
replq_remove(const struct scheduler *ops, struct rt_unit *svc)
{
    struct rt_private *prv = rt_priv(ops);
    struct list_head *replq = rt_replq(ops);

    ASSERT( unit_on_replq(svc) );

    if ( deadline_queue_remove(replq, &svc->replq_elem) )
    {
        /*
         * The replenishment timer needs to be set to fire when a
         * replenishment for the unit at the front of the replenishment
         * queue is due. If it is the unit we just removed, we may need
         * to reprogram the timer.
         */
        if ( !list_empty(replq) )
        {
            const struct rt_unit *svc_next = replq_elem(replq->next);

            set_timer(&prv->repl_timer, svc_next->cur_deadline);
        }
        else
            stop_timer(&prv->repl_timer);
    }
}

/*
 * Insert svc with budget into the RunQ according to EDF:
 * units with smaller deadlines go first.
 * Insert svc without budget into the DepletedQ, unsorted.
 */
static void
runq_insert(const struct scheduler *ops, struct rt_unit *svc)
{
    struct rt_private *prv = rt_priv(ops);
    struct list_head *runq = rt_runq(ops);

    ASSERT( spin_is_locked(&prv->lock) );
    ASSERT( !unit_on_q(svc) );
    ASSERT( unit_on_replq(svc) );

    /* add svc to runq if svc still has budget or its extratime flag is set */
    if ( svc->cur_budget > 0 || has_extratime(svc) )
        deadline_runq_insert(svc, &svc->q_elem, runq);
    else
        list_add(&svc->q_elem, &prv->depletedq);
}

static void
replq_insert(const struct scheduler *ops, struct rt_unit *svc)
{
    struct list_head *replq = rt_replq(ops);
    struct rt_private *prv = rt_priv(ops);

    ASSERT( !unit_on_replq(svc) );

    /*
     * The timer may be re-programmed if svc is inserted
     * at the front of the event list.
     */
    if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
        set_timer(&prv->repl_timer, svc->cur_deadline);
}

/*
 * Removes and re-inserts an event in the replenishment queue.
 * The aim is to update its position inside the queue, as its
 * deadline (and hence its replenishment time) could have
 * changed.
 */
static void
replq_reinsert(const struct scheduler *ops, struct rt_unit *svc)
{
    struct list_head *replq = rt_replq(ops);
    const struct rt_unit *rearm_svc = svc;
    bool rearm = false;

    ASSERT( unit_on_replq(svc) );

    /*
     * If svc was at the front of the replenishment queue, we certainly
     * need to re-program the timer, and we want to use the deadline of
     * the unit which is now at the front of the queue (which may still
     * be svc or not).
     *
     * We may also need to re-program, if svc has been put at the front
     * of the replenishment queue when being re-inserted.
     */
    if ( deadline_queue_remove(replq, &svc->replq_elem) )
    {
        deadline_replq_insert(svc, &svc->replq_elem, replq);
        rearm_svc = replq_elem(replq->next);
        rearm = true;
    }
    else
        rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);

    if ( rearm )
        set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
}
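
/*
 * Illustrative example (added for clarity): if the replenishment queue
 * holds events at t = 12 ms, 20 ms and 25 ms, the timer is armed for
 * 12 ms. Removing the 12 ms event re-arms the timer for 20 ms; inserting
 * a new event at 8 ms re-arms it for 8 ms; inserting one at 30 ms leaves
 * the timer untouched. The numbers are hypothetical.
 */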
/*
 * Pick a valid resource for the unit.
 * A valid resource of a unit is the intersection of the unit's affinity
 * and the available resources.
 */
static struct sched_resource *cf_check
rt_res_pick_locked(const struct sched_unit *unit, unsigned int locked_cpu)
{
    cpumask_t *cpus = cpumask_scratch_cpu(locked_cpu);
    const cpumask_t *online;
    int cpu;

    online = cpupool_domain_master_cpumask(unit->domain);
    cpumask_and(cpus, online, unit->cpu_hard_affinity);

    cpu = cpumask_test_cpu(sched_unit_master(unit), cpus)
            ? sched_unit_master(unit)
            : cpumask_cycle(sched_unit_master(unit), cpus);
    ASSERT( !cpumask_empty(cpus) && cpumask_test_cpu(cpu, cpus) );

    return get_sched_res(cpu);
}

/*
 * Pick a valid resource for the unit.
 * A valid resource of a unit is the intersection of the unit's affinity
 * and the available resources.
 */
static struct sched_resource *cf_check
rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
{
    struct sched_resource *res;

    res = rt_res_pick_locked(unit, unit->res->master_cpu);

    return res;
}
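
/*
 * Illustrative example (added for clarity): if a unit's hard affinity is
 * {2,3,5} and the cpupool currently owns pcpus {0-3}, the valid set is
 * {2,3}. If the unit's current master cpu (say 3) is in that set it is
 * kept, otherwise cpumask_cycle() picks the next valid pcpu after it.
 * The numbers are hypothetical.
 */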
/*
 * Init/Free related code
 */
static int cf_check
rt_init(struct scheduler *ops)
{
    int rc = -ENOMEM;
    struct rt_private *prv = xzalloc(struct rt_private);

    printk("Initializing RTDS scheduler\n"
           "WARNING: This is experimental software in development.\n"
           "Use at your own risk.\n");

    if ( prv == NULL )
        goto err;

    spin_lock_init(&prv->lock);
    INIT_LIST_HEAD(&prv->sdom);
    INIT_LIST_HEAD(&prv->runq);
    INIT_LIST_HEAD(&prv->depletedq);
    INIT_LIST_HEAD(&prv->replq);

    ops->sched_data = prv;
    rc = 0;

 err:
    if ( rc )
        xfree(prv);

    return rc;
}

static void cf_check
rt_deinit(struct scheduler *ops)
{
    struct rt_private *prv = rt_priv(ops);

    ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid ||
           prv->repl_timer.status == TIMER_STATUS_killed);

    ops->sched_data = NULL;
    xfree(prv);
}

/* Change the scheduler of cpu to us (RTDS). */
static spinlock_t *cf_check
rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
                void *pdata, void *vdata)
{
    struct rt_private *prv = rt_priv(new_ops);
    struct rt_unit *svc = vdata;

    ASSERT(!pdata && svc && is_idle_unit(svc->unit));

    /*
     * We are holding the runqueue lock already (it's been taken in
     * schedule_cpu_switch()). It's actually the runqueue lock of
     * another scheduler, but that is how things need to be, for
     * preventing races.
     */
    ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock);

    /*
     * If we are the absolute first cpu being switched toward this
     * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
     * first one that is added back to the cpupool that had all its cpus
     * removed (in which case we'll see TIMER_STATUS_killed), it's our
     * job to (re)initialize the timer.
     */
    if ( prv->repl_timer.status == TIMER_STATUS_invalid ||
         prv->repl_timer.status == TIMER_STATUS_killed )
    {
        init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu);
        dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
    }

    sched_idle_unit(cpu)->priv = vdata;

    return &prv->lock;
}

static void move_repl_timer(struct rt_private *prv, unsigned int old_cpu)
{
    cpumask_t *online = get_sched_res(old_cpu)->cpupool->res_valid;
    unsigned int new_cpu = cpumask_cycle(old_cpu, online);

    /*
     * Make sure the timer runs on one of the cpus that are still available
     * to this scheduler. If there aren't any left, it means it's time to
     * just kill it.
     */
    if ( new_cpu >= nr_cpu_ids )
    {
        kill_timer(&prv->repl_timer);
        dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", old_cpu);
    }
    else
    {
        migrate_timer(&prv->repl_timer, new_cpu);
    }
}

static void cf_check
rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    unsigned long flags;
    struct rt_private *prv = rt_priv(ops);

    spin_lock_irqsave(&prv->lock, flags);

    if ( prv->repl_timer.cpu == cpu )
        move_repl_timer(prv, cpu);

    spin_unlock_irqrestore(&prv->lock, flags);
}

static void cf_check
rt_move_timers(const struct scheduler *ops, struct sched_resource *sr)
{
    unsigned long flags;
    struct rt_private *prv = rt_priv(ops);
    unsigned int old_cpu;

    spin_lock_irqsave(&prv->lock, flags);

    old_cpu = prv->repl_timer.cpu;
    if ( prv->repl_timer.status != TIMER_STATUS_invalid &&
         prv->repl_timer.status != TIMER_STATUS_killed &&
         !cpumask_test_cpu(old_cpu, sr->cpupool->res_valid) )
        move_repl_timer(prv, old_cpu);

    spin_unlock_irqrestore(&prv->lock, flags);
}

static void *cf_check
rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
{
    unsigned long flags;
    struct rt_dom *sdom;
    struct rt_private *prv = rt_priv(ops);

    sdom = xzalloc(struct rt_dom);
    if ( sdom == NULL )
        return ERR_PTR(-ENOMEM);

    INIT_LIST_HEAD(&sdom->sdom_elem);
    sdom->dom = dom;

    /* spinlock here to insert the dom */
    spin_lock_irqsave(&prv->lock, flags);
    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
    spin_unlock_irqrestore(&prv->lock, flags);

    return sdom;
}

static void cf_check
rt_free_domdata(const struct scheduler *ops, void *data)
{
    struct rt_dom *sdom = data;
    struct rt_private *prv = rt_priv(ops);

    if ( sdom )
    {
        unsigned long flags;

        spin_lock_irqsave(&prv->lock, flags);
        list_del_init(&sdom->sdom_elem);
        spin_unlock_irqrestore(&prv->lock, flags);

        xfree(sdom);
    }
}

static void * cf_check
rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd)
{
    struct rt_unit *svc;

    /* Allocate per-UNIT info */
    svc = xzalloc(struct rt_unit);
    if ( svc == NULL )
        return NULL;

    INIT_LIST_HEAD(&svc->q_elem);
    INIT_LIST_HEAD(&svc->replq_elem);
    svc->flags = 0U;
    svc->sdom = dd;
    svc->unit = unit;
    svc->last_start = 0;

    __set_bit(__RTDS_extratime, &svc->flags);
    svc->priority_level = 0;
    svc->period = RTDS_DEFAULT_PERIOD;
    if ( !is_idle_unit(unit) )
        svc->budget = RTDS_DEFAULT_BUDGET;

    SCHED_STAT_CRANK(unit_alloc);

    return svc;
}

static void cf_check
rt_free_udata(const struct scheduler *ops, void *priv)
{
    struct rt_unit *svc = priv;

    xfree(svc);
}

/*
 * Called from sched_move_domain() and sched_init_vcpu() in schedule.c.
 * When moving a domain to a new cpupool, it inserts the units of the
 * moving domain into the destination cpupool scheduler's RunQ.
 */
static void cf_check
rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
{
    struct rt_unit *svc = rt_unit(unit);
    s_time_t now;
    spinlock_t *lock;
    unsigned int cpu = smp_processor_id();

    BUG_ON( is_idle_unit(unit) );

    /* This is safe because unit isn't yet being scheduled */
    lock = pcpu_schedule_lock_irq(cpu);
    sched_set_res(unit, rt_res_pick_locked(unit, cpu));
    pcpu_schedule_unlock_irq(lock, cpu);

    lock = unit_schedule_lock_irq(unit);

    now = NOW();
    if ( now >= svc->cur_deadline )
        rt_update_deadline(now, svc);

    if ( !unit_on_q(svc) && unit_runnable(unit) )
    {
        replq_insert(ops, svc);

        if ( !unit->is_running )
            runq_insert(ops, svc);
    }
    unit_schedule_unlock_irq(lock, unit);

    SCHED_STAT_CRANK(unit_insert);
}
/*
 * Remove rt_unit svc from the old scheduler in the source cpupool.
 */
static void cf_check
rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
{
    struct rt_unit * const svc = rt_unit(unit);
    struct rt_dom * const sdom = svc->sdom;
    spinlock_t *lock;

    SCHED_STAT_CRANK(unit_remove);

    BUG_ON( sdom == NULL );

    lock = unit_schedule_lock_irq(unit);
    if ( unit_on_q(svc) )
        q_remove(svc);

    if ( unit_on_replq(svc) )
        replq_remove(ops, svc);

    unit_schedule_unlock_irq(lock, unit);
}

/*
 * Burn budget with nanosecond granularity.
 */
static void
burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now)
{
    s_time_t delta;

    /* don't burn budget for the idle UNIT */
    if ( is_idle_unit(svc->unit) )
        return;

    /* burn at nanosecond level */
    delta = now - svc->last_start;

    /*
     * delta < 0 only happens in nested virtualization;
     * TODO: how should we handle delta < 0 in a better way?
     */
    if ( delta < 0 )
    {
        printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
               __func__, delta);
        svc->last_start = now;
        return;
    }

    svc->cur_budget -= delta;
    svc->last_start = now;

    if ( svc->cur_budget <= 0 )
    {
        if ( has_extratime(svc) )
        {
            svc->priority_level++;
            svc->cur_budget = svc->budget;
        }
        else
        {
            svc->cur_budget = 0;
            __set_bit(__RTDS_depleted, &svc->flags);
        }
    }

    /* TRACE */
    {
        struct __packed {
            uint16_t unit, dom;
            uint64_t cur_budget;
            uint32_t delta;
            uint32_t priority_level;
            uint32_t has_extratime;
        } d = {
            .unit = svc->unit->unit_id,
            .dom = svc->unit->domain->domain_id,
            .cur_budget = svc->cur_budget,
            .delta = delta, /* TODO: truncation? */
            .priority_level = svc->priority_level,
            .has_extratime = !!(svc->flags & RTDS_extratime),
        };

        trace_var(TRC_RTDS_BUDGET_BURN, 1,
                  sizeof(d),
                  (unsigned char *) &d);
    }
}

/*
 * The RunQ is sorted. Pick the first unit within the cpumask;
 * if there is none, return NULL.
 * The lock is grabbed before calling this function.
 */
static struct rt_unit *
runq_pick(const struct scheduler *ops, const cpumask_t *mask, unsigned int cpu)
{
    struct list_head *runq = rt_runq(ops);
    struct list_head *iter;
    struct rt_unit *svc = NULL;
    struct rt_unit *iter_svc = NULL;
    cpumask_t *cpu_common = cpumask_scratch_cpu(cpu);
    const cpumask_t *online;

    list_for_each ( iter, runq )
    {
        iter_svc = q_elem(iter);

        /* mask cpu_hard_affinity & cpupool & mask */
        online = cpupool_domain_master_cpumask(iter_svc->unit->domain);
        cpumask_and(cpu_common, online, iter_svc->unit->cpu_hard_affinity);
        cpumask_and(cpu_common, mask, cpu_common);
        if ( cpumask_empty(cpu_common) )
            continue;

        ASSERT( iter_svc->cur_budget > 0 );

        svc = iter_svc;
        break;
    }

    /* TRACE */
    {
        if ( svc != NULL )
        {
            struct __packed {
                unsigned unit:16, dom:16;
                uint64_t cur_deadline, cur_budget;
            } d;

            d.dom = svc->unit->domain->domain_id;
            d.unit = svc->unit->unit_id;
            d.cur_deadline = (uint64_t) svc->cur_deadline;
            d.cur_budget = (uint64_t) svc->cur_budget;
            trace_var(TRC_RTDS_RUNQ_PICK, 1,
                      sizeof(d),
                      (unsigned char *) &d);
        }
    }

    return svc;
}
/*
 * The scheduling function of the rt scheduler.
 * The lock is already grabbed in schedule.c, no need to lock here.
 */
static void cf_check
rt_schedule(const struct scheduler *ops, struct sched_unit *currunit,
            s_time_t now, bool tasklet_work_scheduled)
{
    const unsigned int cur_cpu = smp_processor_id();
    const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
    struct rt_private *prv = rt_priv(ops);
    struct rt_unit *const scurr = rt_unit(currunit);
    struct rt_unit *snext = NULL;
    bool migrated = false;

    /* TRACE */
    {
        struct __packed {
            unsigned cpu:16, tasklet:8, tickled:4, idle:4;
        } d;

        d.cpu = cur_cpu;
        d.tasklet = tasklet_work_scheduled;
        d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled);
        d.idle = is_idle_unit(currunit);
        trace_var(TRC_RTDS_SCHEDULE, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }

    /* clear the tickled bit now that we've been scheduled */
    cpumask_clear_cpu(sched_cpu, &prv->tickled);

    /* burn_budget() returns right away for the idle UNIT */
    burn_budget(ops, scurr, now);

    if ( tasklet_work_scheduled )
    {
        trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL);
        snext = rt_unit(sched_idle_unit(sched_cpu));
    }
    else
    {
        while ( true )
        {
            snext = runq_pick(ops, cpumask_of(sched_cpu), cur_cpu);

            if ( snext == NULL )
            {
                snext = rt_unit(sched_idle_unit(sched_cpu));
                break;
            }
            if ( unit_runnable_state(snext->unit) )
                break;

            q_remove(snext);
            replq_remove(ops, snext);
        }

        /* if scurr has higher priority and budget, still pick scurr */
        if ( !is_idle_unit(currunit) &&
             unit_runnable_state(currunit) &&
             scurr->cur_budget > 0 &&
             ( is_idle_unit(snext->unit) ||
               compare_unit_priority(scurr, snext) > 0 ) )
            snext = scurr;
    }

    if ( snext != scurr &&
         !is_idle_unit(currunit) &&
         unit_runnable(currunit) )
        __set_bit(__RTDS_delayed_runq_add, &scurr->flags);

    snext->last_start = now;
    currunit->next_time = -1; /* if an idle unit is picked */
    if ( !is_idle_unit(snext->unit) )
    {
        if ( snext != scurr )
        {
            q_remove(snext);
            __set_bit(__RTDS_scheduled, &snext->flags);
        }
        if ( sched_unit_master(snext->unit) != sched_cpu )
        {
            sched_set_res(snext->unit, get_sched_res(sched_cpu));
            migrated = true;
        }
        /* Invoke the scheduler next time. */
        currunit->next_time = snext->cur_budget;
    }
    currunit->next_task = snext->unit;
    snext->unit->migrated = migrated;
}
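
/*
 * Worked example (added for clarity, numbers hypothetical): the current
 * unit has 1 ms of budget left and deadline 15 ms; the head of the RunQ
 * has deadline 12 ms. compare_unit_priority(scurr, snext) < 0, so the
 * head of the RunQ preempts. Had its deadline been 18 ms instead, scurr
 * would have kept the pcpu until its budget ran out or an earlier-deadline
 * unit woke up.
 */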
/*
 * Remove the UNIT from the RunQ.
 * The lock is already grabbed in schedule.c, no need to lock here.
 */
static void cf_check
rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
{
    struct rt_unit * const svc = rt_unit(unit);

    BUG_ON( is_idle_unit(unit) );
    SCHED_STAT_CRANK(unit_sleep);

    if ( curr_on_cpu(sched_unit_master(unit)) == unit )
        cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
    else if ( unit_on_q(svc) )
    {
        q_remove(svc);
        replq_remove(ops, svc);
    }
    else if ( svc->flags & RTDS_delayed_runq_add )
        __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
}

/*
 * Pick a cpu where to run a unit,
 * possibly kicking out the unit running there.
 * Called by wake() and context_saved().
 * We have a running candidate here; the kick logic, among all the cpus
 * that are within the cpu affinity, is:
 * 1) if there are any idle CPUs, kick one (for cache benefit,
 *    we check new->cpu first);
 * 2) if all pcpus are busy, among all the running units pick the one
 *    with the lowest priority; if the candidate has higher priority,
 *    kick that unit out.
 *
 * TODO:
 * 1) what if these two units belong to the same domain?
 *    Replacing a unit belonging to the same domain introduces more overhead.
 *
 * The lock is grabbed before calling this function.
 */
static void
runq_tickle(const struct scheduler *ops, const struct rt_unit *new)
{
    struct rt_private *prv = rt_priv(ops);
    const struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */
    const struct rt_unit *iter_svc;
    const struct sched_unit *iter_unit;
    int cpu = 0, cpu_to_tickle = 0;
    cpumask_t *not_tickled = cpumask_scratch_cpu(smp_processor_id());
    const cpumask_t *online;

    if ( new == NULL || is_idle_unit(new->unit) )
        return;

    online = cpupool_domain_master_cpumask(new->unit->domain);
    cpumask_and(not_tickled, online, new->unit->cpu_hard_affinity);
    cpumask_andnot(not_tickled, not_tickled, &prv->tickled);

    /*
     * 1) If there are any idle CPUs, kick one.
     *    For cache benefit, we first search new->cpu.
     *    The same loop also finds the one with the lowest priority.
     */
    cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), not_tickled);
    while ( cpu != nr_cpu_ids )
    {
        iter_unit = curr_on_cpu(cpu);
        if ( is_idle_unit(iter_unit) )
        {
            SCHED_STAT_CRANK(tickled_idle_cpu);
            cpu_to_tickle = cpu;
            goto out;
        }
        iter_svc = rt_unit(iter_unit);
        if ( latest_deadline_unit == NULL ||
             compare_unit_priority(iter_svc, latest_deadline_unit) < 0 )
            latest_deadline_unit = iter_svc;

        cpumask_clear_cpu(cpu, not_tickled);
        cpu = cpumask_cycle(cpu, not_tickled);
    }

    /* 2) the candidate has higher priority, kick out the lowest-priority unit */
    if ( latest_deadline_unit != NULL &&
         compare_unit_priority(latest_deadline_unit, new) < 0 )
    {
        SCHED_STAT_CRANK(tickled_busy_cpu);
        cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit);
        goto out;
    }

    /* didn't tickle any cpu */
    SCHED_STAT_CRANK(tickled_no_cpu);
    return;
 out:
    /* TRACE */
    {
        struct {
            unsigned cpu:16, pad:16;
        } d;

        d.cpu = cpu_to_tickle;
        d.pad = 0;
        trace_var(TRC_RTDS_TICKLE, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }

    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
    return;
}
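
/*
 * Illustrative example (added for clarity): a unit with deadline 12 ms
 * wakes up; its affinity covers pcpus {0,1,2}. If pcpu 1 is idle, it is
 * tickled. If all three are busy running units with deadlines 10 ms,
 * 25 ms and 14 ms, the 25 ms one has the lowest priority; since
 * 12 ms < 25 ms that pcpu is tickled. If the busy deadlines were all
 * earlier than 12 ms, no pcpu would be tickled at all.
 * The numbers are hypothetical.
 */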
/*
 * Should always wake up a runnable unit and put it back on the RunQ.
 * Check the priority to see whether we should raise an interrupt.
 * The lock is already grabbed in schedule.c, no need to lock here.
 * TODO: what if these two units belong to the same domain?
 */
static void cf_check
rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
{
    struct rt_unit * const svc = rt_unit(unit);
    s_time_t now;
    bool missed;

    BUG_ON( is_idle_unit(unit) );

    if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
    {
        SCHED_STAT_CRANK(unit_wake_running);
        return;
    }

    /* on the RunQ/DepletedQ, just updating the info is enough */
    if ( unlikely(unit_on_q(svc)) )
    {
        SCHED_STAT_CRANK(unit_wake_onrunq);
        return;
    }

    if ( likely(unit_runnable(unit)) )
        SCHED_STAT_CRANK(unit_wake_runnable);
    else
        SCHED_STAT_CRANK(unit_wake_not_runnable);

    /*
     * If a deadline passed while svc was asleep/blocked, we need new
     * scheduling parameters (a new deadline and full budget).
     */
    now = NOW();

    missed = ( now >= svc->cur_deadline );
    if ( missed )
        rt_update_deadline(now, svc);

    /*
     * If the context hasn't been saved for this unit yet, we can't put it
     * on the run-queue/depleted-queue. Instead, we set the appropriate
     * flag, and the unit will be put back on the queue after the context
     * has been saved (in rt_context_saved()).
     */
    if ( unlikely(svc->flags & RTDS_scheduled) )
    {
        __set_bit(__RTDS_delayed_runq_add, &svc->flags);
        /*
         * The unit is waking up already, and we didn't even have the time
         * to remove its next replenishment event from the replenishment
         * queue when it blocked! No big deal. If we did not miss the
         * deadline in the meantime, let's just leave it there. If we did,
         * let's remove it and queue a new one (to occur at our new
         * deadline).
         */
        if ( missed )
           replq_reinsert(ops, svc);

        return;
    }

    /* The replenishment event got cancelled when we blocked. Add it back. */
    replq_insert(ops, svc);
    /* insert svc into the runq/depletedq because svc is not on a queue now */
    runq_insert(ops, svc);

    runq_tickle(ops, svc);
}

/*
 * scurr has finished its context switch; insert it back into the RunQ,
 * and then pick the highest-priority unit from the runq to run.
 */
static void cf_check
rt_context_saved(const struct scheduler *ops, struct sched_unit *unit)
{
    struct rt_unit *svc = rt_unit(unit);
    spinlock_t *lock = unit_schedule_lock_irq(unit);

    __clear_bit(__RTDS_scheduled, &svc->flags);
    /* do not insert the idle unit into the runq */
    if ( is_idle_unit(unit) )
        goto out;

    if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
         likely(unit_runnable(unit)) )
    {
        runq_insert(ops, svc);
        runq_tickle(ops, svc);
    }
    else
        replq_remove(ops, svc);

 out:
    unit_schedule_unlock_irq(lock, unit);
}

/*
 * Set/get the scheduling parameters of each unit of a domain.
 */
static int cf_check
rt_dom_cntl(
    const struct scheduler *ops,
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct rt_private *prv = rt_priv(ops);
    struct rt_unit *svc;
    const struct sched_unit *unit;
    unsigned long flags;
    int rc = 0;
    struct xen_domctl_schedparam_vcpu local_sched;
    s_time_t period, budget;
    uint32_t index = 0;

    switch ( op->cmd )
    {
    case XEN_DOMCTL_SCHEDOP_getinfo:
        /* Return the default parameters. */
        op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
        op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
        break;
    case XEN_DOMCTL_SCHEDOP_putinfo:
        if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
        {
            rc = -EINVAL;
            break;
        }
        spin_lock_irqsave(&prv->lock, flags);
        for_each_sched_unit ( d, unit )
        {
            svc = rt_unit(unit);
            svc->period = MICROSECS(op->u.rtds.period); /* convert to nanoseconds */
            svc->budget = MICROSECS(op->u.rtds.budget);
        }
        spin_unlock_irqrestore(&prv->lock, flags);
        break;
    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
    case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
        while ( index < op->u.v.nr_vcpus )
        {
            if ( copy_from_guest_offset(&local_sched,
                                        op->u.v.vcpus, index, 1) )
            {
                rc = -EFAULT;
                break;
            }
            if ( local_sched.vcpuid >= d->max_vcpus ||
                 d->vcpu[local_sched.vcpuid] == NULL )
            {
                rc = -EINVAL;
                break;
            }

            if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
            {
                spin_lock_irqsave(&prv->lock, flags);
                svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
                local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
                local_sched.u.rtds.period = svc->period / MICROSECS(1);
                if ( has_extratime(svc) )
                    local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
                else
                    local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
                spin_unlock_irqrestore(&prv->lock, flags);

                if ( copy_to_guest_offset(op->u.v.vcpus, index,
                                          &local_sched, 1) )
                {
                    rc = -EFAULT;
                    break;
                }
            }
            else
            {
                period = MICROSECS(local_sched.u.rtds.period);
                budget = MICROSECS(local_sched.u.rtds.budget);
                if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
                     budget > period || period < RTDS_MIN_PERIOD )
                {
                    rc = -EINVAL;
                    break;
                }

                spin_lock_irqsave(&prv->lock, flags);
                svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
                svc->period = period;
                svc->budget = budget;
                if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
                    __set_bit(__RTDS_extratime, &svc->flags);
                else
                    __clear_bit(__RTDS_extratime, &svc->flags);
                spin_unlock_irqrestore(&prv->lock, flags);
            }

            /* Process at most 64 vCPUs without checking for preemptions. */
            if ( (++index > 63) && hypercall_preempt_check() )
                break;
        }
        if ( !rc )
            /* notify the upper caller how many units have been processed */
            op->u.v.nr_vcpus = index;
        break;
    }

    return rc;
}
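
/*
 * Usage sketch (added for clarity; check the xl documentation for the
 * authoritative syntax): the hypercall above is what the toolstack ends up
 * issuing for something like
 *
 *   xl sched-rtds -d vm1 -v 0 -p 10000 -b 2500
 *
 * i.e. period and budget are passed in microseconds and converted to
 * nanoseconds here, and they must satisfy
 * RTDS_MIN_PERIOD <= period <= RTDS_MAX_PERIOD and
 * RTDS_MIN_BUDGET <= budget <= period.
 */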
/*
 * The replenishment timer handler picks units
 * from the replq and does the actual replenishment.
 */
static void cf_check repl_timer_handler(void *data)
{
    s_time_t now;
    const struct scheduler *ops = data;
    struct rt_private *prv = rt_priv(ops);
    struct list_head *replq = rt_replq(ops);
    struct list_head *runq = rt_runq(ops);
    struct list_head *iter, *tmp;
    struct rt_unit *svc;
    LIST_HEAD(tmp_replq);

    spin_lock_irq(&prv->lock);

    now = NOW();

    /*
     * Do the replenishment and move replenished units
     * to the temporary list to tickle.
     * If svc is on the run queue, we need to put it at
     * the correct place since its deadline changes.
     */
    list_for_each_safe ( iter, tmp, replq )
    {
        svc = replq_elem(iter);

        if ( now < svc->cur_deadline )
            break;

        list_del(&svc->replq_elem);
        rt_update_deadline(now, svc);
        list_add(&svc->replq_elem, &tmp_replq);

        if ( unit_on_q(svc) )
        {
            q_remove(svc);
            runq_insert(ops, svc);
        }
    }

    /*
     * Iterate through the list of updated units.
     * If an updated unit is running, tickle the head of the
     * runqueue if it has a higher priority.
     * If an updated unit was depleted and on the runqueue, tickle it.
     * Finally, reinsert the units back into the replenishment events list.
     */
    list_for_each_safe ( iter, tmp, &tmp_replq )
    {
        svc = replq_elem(iter);

        if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit &&
             !list_empty(runq) )
        {
            struct rt_unit *next_on_runq = q_elem(runq->next);

            if ( compare_unit_priority(svc, next_on_runq) < 0 )
                runq_tickle(ops, next_on_runq);
        }
        else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
                  unit_on_q(svc) )
            runq_tickle(ops, svc);

        list_del(&svc->replq_elem);
        deadline_replq_insert(svc, &svc->replq_elem, replq);
    }

    /*
     * If there are units left in the replenishment event list,
     * set the next replenishment to happen at the deadline of
     * the one at the front.
     */
    if ( !list_empty(replq) )
        set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline);

    spin_unlock_irq(&prv->lock);
}

static const struct scheduler sched_rtds_def = {
    .name           = "SMP RTDS Scheduler",
    .opt_name       = "rtds",
    .sched_id       = XEN_SCHEDULER_RTDS,
    .sched_data     = NULL,

    .dump_cpu_state = rt_dump_pcpu,
    .dump_settings  = rt_dump,
    .init           = rt_init,
    .deinit         = rt_deinit,
    .switch_sched   = rt_switch_sched,
    .deinit_pdata   = rt_deinit_pdata,
    .alloc_domdata  = rt_alloc_domdata,
    .free_domdata   = rt_free_domdata,
    .alloc_udata    = rt_alloc_udata,
    .free_udata     = rt_free_udata,
    .insert_unit    = rt_unit_insert,
    .remove_unit    = rt_unit_remove,

    .adjust         = rt_dom_cntl,

    .pick_resource  = rt_res_pick,
    .do_schedule    = rt_schedule,
    .sleep          = rt_unit_sleep,
    .wake           = rt_unit_wake,
    .context_saved  = rt_context_saved,
    .move_timers    = rt_move_timers,
};

REGISTER_SCHEDULER(sched_rtds_def);