From c0a80c0c27e5e65b180a25e6c4c2f7ef9e386cd3 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Fri, 9 Jan 2015 13:06:33 +0100
Subject: ftrace: allow architectures to specify ftrace compile options

If the kernel is compiled with function tracer support, the -pg compile
option is passed to gcc so that it generates extra code in the prologue
of each function.

This patch replaces the "open-coded" -pg compile flag with a
CC_FLAGS_FTRACE makefile variable which architectures can override if a
different option should be used for code generation.

Acked-by: Steven Rostedt
Signed-off-by: Heiko Carstens
Signed-off-by: Martin Schwidefsky
---
 kernel/sched/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
 ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = -pg
+CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
 endif
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
--
cgit v1.2.1


From 545a2bf742fb41f17d03486dd8a8c74ad511dec2 Mon Sep 17 00:00:00 2001
From: Cyril Bur
Date: Thu, 12 Feb 2015 15:01:24 -0800
Subject: kernel/sched/clock.c: add another clock for use with the soft lockup
 watchdog

When the hypervisor pauses a virtualised kernel, the kernel observes a
jump in timebase; this can cause spurious messages from the softlockup
detector.

Whilst these messages are harmless, they are accompanied by a stack
trace which causes undue concern and, more problematically, the stack
trace in the guest has nothing to do with the observed problem and can
only be misleading.  Furthermore, on POWER8 this is completely avoidable
with the introduction of the Virtual Time Base (VTB) register.

This patch (of 2):

This permits the use of arch-specific clocks through which virtualised
kernels can use their notion of 'running' time, not the elapsed wall
time, which will include host execution time.

Signed-off-by: Cyril Bur
Cc: Michael Ellerman
Cc: Andrew Jones
Acked-by: Don Zickus
Cc: Ingo Molnar
Cc: Ulrich Obergfell
Cc: chai wen
Cc: Fabian Frederick
Cc: Aaron Tomlin
Cc: Ben Zhang
Cc: Martin Schwidefsky
Cc: John Stultz
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched/clock.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
 
 EXPORT_SYMBOL_GPL(cpu_clock);
 EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+	return local_clock();
+}
--
cgit v1.2.1
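
Note: because the default above is declared __weak, an architecture can
supply its own running_clock() simply by defining a non-weak version of
the symbol, which replaces the default at link time. A hypothetical
sketch only (read_guest_runtime_ns() is a made-up placeholder, not an
API from this series or from the follow-up powerpc patch):

	/* Hypothetical override in arch code, e.g. arch/<arch>/kernel/time.c. */
	u64 running_clock(void)
	{
		/*
		 * Return the time this guest has actually spent running,
		 * e.g. from a virtual time base register that does not
		 * advance while the hypervisor has the guest preempted.
		 * read_guest_runtime_ns() is a placeholder, not a real
		 * kernel primitive.
		 */
		return read_guest_runtime_ns();
	}
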
Wysocki" Date: Thu, 12 Feb 2015 23:33:15 +0100 Subject: PM / sleep: Re-implement suspend-to-idle handling In preparation for adding support for quiescing timers in the final stage of suspend-to-idle transitions, rework the freeze_enter() function making the system wait on a wakeup event, the freeze_wake() function terminating the suspend-to-idle loop and the mechanism by which deep idle states are entered during suspend-to-idle. First of all, introduce a simple state machine for suspend-to-idle and make the code in question use it. Second, prevent freeze_enter() from losing wakeup events due to race conditions and ensure that the number of online CPUs won't change while it is being executed. In addition to that, make it force all of the CPUs re-enter the idle loop in case they are in idle states already (so they can enter deeper idle states if possible). Next, drop cpuidle_use_deepest_state() and replace use_deepest_state checks in cpuidle_select() and cpuidle_reflect() with a single suspend-to-idle state check in cpuidle_idle_call(). Finally, introduce cpuidle_enter_freeze() that will simply find the deepest idle state available to the given CPU and enter it using cpuidle_enter(). Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- kernel/sched/idle.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index aaf1c1d5cf5d..94b2d7b88a27 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -104,6 +105,21 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + /* + * Suspend-to-idle ("freeze") is a system state in which all user space + * has been frozen, all I/O devices have been suspended and the only + * activity happens here and in iterrupts (if any). In that case bypass + * the cpuidle governor and go stratight for the deepest idle state + * available. Possibly also suspend the local tick and the entire + * timekeeping to prevent timer interrupts from kicking us out of idle + * until a proper wakeup interrupt happens. + */ + if (idle_should_freeze()) { + cpuidle_enter_freeze(); + local_irq_enable(); + goto exit_idle; + } + /* * Ask the cpuidle framework to choose a convenient idle state. * Fall back to the default arch idle method on errors. -- cgit v1.2.1 From 333470ee46b6851477dc9392eb71ba8ffa4f3387 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:37:28 -0800 Subject: sched: use %*pb[l] to print bitmaps including cpumasks and nodemasks printk and friends can now format bitmaps using '%*pb[l]'. cpumask and nodemask also provide cpumask_pr_args() and nodemask_pr_args() respectively which can be used to generate the two printf arguments necessary to format the specified cpu/nodemask. 
From 333470ee46b6851477dc9392eb71ba8ffa4f3387 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 13 Feb 2015 14:37:28 -0800
Subject: sched: use %*pb[l] to print bitmaps including cpumasks and nodemasks

printk and friends can now format bitmaps using '%*pb[l]'.  cpumask
and nodemask also provide cpumask_pr_args() and nodemask_pr_args()
respectively, which can be used to generate the two printf arguments
necessary to format the specified cpu/nodemask.

Signed-off-by: Tejun Heo
Cc: Ingo Molnar
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched/core.c  | 10 ++++------
 kernel/sched/stats.c | 11 ++---------
 2 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1f37fe7f77a4..13049aac05a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5462,9 +5462,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
-	char str[256];
 
-	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
 	cpumask_clear(groupmask);
 
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5477,7 +5475,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s level %s\n", str, sd->name);
+	printk(KERN_CONT "span %*pbl level %s\n",
+	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
 	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5522,9 +5521,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
-		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
-		printk(KERN_CONT " %s", str);
+		printk(KERN_CONT " %*pbl",
+		       cpumask_pr_args(sched_group_cpus(group)));
 
 		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
 			printk(KERN_CONT " (cpu_capacity = %d)",
 				group->sgc->capacity);

diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
 {
 	int cpu;
-	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
-	char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
-	if (mask_str == NULL)
-		return -ENOMEM;
 
 	if (v == (void *)1) {
 		seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
 
-			cpumask_scnprintf(mask_str, mask_len,
-					  sched_domain_span(sd));
-			seq_printf(seq, "domain%d %s", dcount++, mask_str);
+			seq_printf(seq, "domain%d %*pb", dcount++,
+				   cpumask_pr_args(sched_domain_span(sd)));
 			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
 					itype++) {
 				seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		rcu_read_unlock();
 #endif
 	}
-	kfree(mask_str);
 	return 0;
 }
--
cgit v1.2.1
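
Note: for reference, the new specifiers are used as below (an assumed
illustration, not part of the patch). cpumask_pr_args(m) expands to the
field-width/pointer argument pair that %*pb and %*pbl expect; %*pb
prints the bitmap as hex words, %*pbl as a ranged list such as 0-3,5:

	static void print_online_cpus(void)
	{
		/* List form, e.g. "online CPUs: 0-3,5". */
		pr_info("online CPUs: %*pbl\n", cpumask_pr_args(cpu_online_mask));
		/* Hex bitmap form, e.g. "online mask: 2f". */
		pr_info("online mask: %*pb\n", cpumask_pr_args(cpu_online_mask));
	}
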
Wysocki Acked-by: Peter Zijlstra (Intel) --- kernel/sched/idle.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 94b2d7b88a27..f59198bda1bf 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -116,7 +116,6 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { cpuidle_enter_freeze(); - local_irq_enable(); goto exit_idle; } -- cgit v1.2.1 From dfcacc154fb38fdb2c243c3dbbdc1f26a64cedc8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 2 Mar 2015 22:25:37 +0100 Subject: cpuidle: Clean up fallback handling in cpuidle_idle_call() Move the fallback code path in cpuidle_idle_call() to the end of the function to avoid jumping to a label in an if () branch. Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f59198bda1bf..84b93b68482a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -124,20 +124,8 @@ static void cpuidle_idle_call(void) * Fall back to the default arch idle method on errors. */ next_state = cpuidle_select(drv, dev); - if (next_state < 0) { -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) - local_irq_enable(); - else - arch_cpu_idle(); - - goto exit_idle; - } - + if (next_state < 0) + goto use_default; /* * The idle task must be scheduled, it is pointless to @@ -195,6 +183,19 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); + return; + +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; } /* -- cgit v1.2.1 From ef2b22ac540c018bd574d1846ab95b9bfcf38702 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 2 Mar 2015 22:26:55 +0100 Subject: cpuidle / sleep: Use broadcast timer for states that stop local timer Commit 381063133246 (PM / sleep: Re-implement suspend-to-idle handling) overlooked the fact that entering some sufficiently deep idle states by CPUs may cause their local timers to stop and in those cases it is necessary to switch over to a broadcast timer prior to entering the idle state. If the cpuidle driver in use does not provide the new ->enter_freeze callback for any of the idle states, that problem affects suspend-to-idle too, but it is not taken into account after the changes made by commit 381063133246. Fix that by changing the definition of cpuidle_enter_freeze() and re-arranging of the code in cpuidle_idle_call(), so the former does not call cpuidle_enter() any more and the fallback case is handled by cpuidle_idle_call() directly. Fixes: 381063133246 (PM / sleep: Re-implement suspend-to-idle handling) Reported-and-tested-by: Lorenzo Pieralisi Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Zijlstra (Intel) --- kernel/sched/idle.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 84b93b68482a..80014a178342 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -82,6 +82,7 @@ static void cpuidle_idle_call(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; unsigned int broadcast; + bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -105,6 +106,9 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + if (cpuidle_not_available(drv, dev)) + goto use_default; + /* * Suspend-to-idle ("freeze") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only @@ -115,15 +119,22 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ if (idle_should_freeze()) { - cpuidle_enter_freeze(); - goto exit_idle; - } + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state >= 0) { + local_irq_enable(); + goto exit_idle; + } - /* - * Ask the cpuidle framework to choose a convenient idle state. - * Fall back to the default arch idle method on errors. - */ - next_state = cpuidle_select(drv, dev); + reflect = false; + next_state = cpuidle_find_deepest_state(drv, dev); + } else { + reflect = true; + /* + * Ask the cpuidle framework to choose a convenient idle state. + */ + next_state = cpuidle_select(drv, dev); + } + /* Fall back to the default arch idle method on errors. */ if (next_state < 0) goto use_default; @@ -170,7 +181,8 @@ static void cpuidle_idle_call(void) /* * Give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); + if (reflect) + cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); -- cgit v1.2.1 From 746db9443ea57fd9c059f62c4bfbf41cf224fe13 Mon Sep 17 00:00:00 2001 From: Brian Silverman Date: Wed, 18 Feb 2015 16:23:56 -0800 Subject: sched: Fix RLIMIT_RTTIME when PI-boosting to RT When non-realtime tasks get priority-inheritance boosted to a realtime scheduling class, RLIMIT_RTTIME starts to apply to them. However, the counter used for checking this (the same one used for SCHED_RR timeslices) was not getting reset. This meant that tasks running with a non-realtime scheduling class which are repeatedly boosted to a realtime one, but never block while they are running realtime, eventually hit the timeout without ever running for a time over the limit. This patch resets the realtime timeslice counter when un-PI-boosting from an RT to a non-RT scheduling class. I have some test code with two threads and a shared PTHREAD_PRIO_INHERIT mutex which induces priority boosting and spins while boosted that gets killed by a SIGXCPU on non-fixed kernels but doesn't with this patch applied. It happens much faster with a CONFIG_PREEMPT_RT kernel, and does happen eventually with PREEMPT_VOLUNTARY kernels. 
From 746db9443ea57fd9c059f62c4bfbf41cf224fe13 Mon Sep 17 00:00:00 2001
From: Brian Silverman
Date: Wed, 18 Feb 2015 16:23:56 -0800
Subject: sched: Fix RLIMIT_RTTIME when PI-boosting to RT

When non-realtime tasks get priority-inheritance boosted to a realtime
scheduling class, RLIMIT_RTTIME starts to apply to them.  However, the
counter used for checking this (the same one used for SCHED_RR
timeslices) was not getting reset.  This meant that tasks running with
a non-realtime scheduling class which are repeatedly boosted to a
realtime one, but never block while they are running realtime,
eventually hit the timeout without ever running for a contiguous
period over the limit.  This patch resets the realtime timeslice
counter when un-PI-boosting from an RT to a non-RT scheduling class.

I have some test code with two threads and a shared
PTHREAD_PRIO_INHERIT mutex which induces priority boosting and spins
while boosted; it gets killed by a SIGXCPU on non-fixed kernels but
doesn't with this patch applied.  It happens much faster with a
CONFIG_PREEMPT_RT kernel, and does happen eventually with
PREEMPT_VOLUNTARY kernels.

Signed-off-by: Brian Silverman
Signed-off-by: Peter Zijlstra (Intel)
Cc: austin@peloton-tech.com
Cc:
Link: http://lkml.kernel.org/r/1424305436-6716-1-git-send-email-brian@peloton-tech.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..62671f53202a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3034,6 +3034,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	} else {
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
+		if (rt_prio(oldprio))
+			p->rt.timeout = 0;
 		p->sched_class = &fair_sched_class;
 	}
 
--
cgit v1.2.1
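
Note: the reproducer described in the changelog can be approximated
along these lines (a hypothetical reconstruction, not the author's
actual test program; it needs RT scheduling privileges to run). A
SCHED_OTHER thread spins while holding a PTHREAD_PRIO_INHERIT mutex; a
SCHED_FIFO thread repeatedly blocks on the same mutex, PI-boosting the
holder. On an unfixed kernel the boosted time accumulates in
p->rt.timeout and eventually delivers SIGXCPU:

	/* Build with: gcc -O2 -pthread test.c */
	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>
	#include <sys/resource.h>
	#include <unistd.h>

	static pthread_mutex_t lock;

	static void *holder(void *arg)	/* SCHED_OTHER; PI-boosted while holding */
	{
		for (;;) {
			pthread_mutex_lock(&lock);
			for (volatile long i = 0; i < 1000000; i++)
				;	/* spin while (possibly) boosted */
			pthread_mutex_unlock(&lock);
		}
		return NULL;
	}

	static void *waiter(void *arg)	/* SCHED_FIFO; boosts the holder */
	{
		for (;;) {
			pthread_mutex_lock(&lock);
			pthread_mutex_unlock(&lock);
			usleep(1000);
		}
		return NULL;
	}

	int main(void)
	{
		struct rlimit rl = { .rlim_cur = 50000, .rlim_max = 50000 };
		struct sched_param sp = { .sched_priority = 10 };
		pthread_mutexattr_t ma;
		pthread_attr_t attr;
		pthread_t t1, t2;

		/* RLIMIT_RTTIME is accounted in microseconds: 50 ms here. */
		setrlimit(RLIMIT_RTTIME, &rl);

		pthread_mutexattr_init(&ma);
		pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
		pthread_mutex_init(&lock, &ma);

		pthread_create(&t1, NULL, holder, NULL);

		pthread_attr_init(&attr);
		pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
		pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
		pthread_attr_setschedparam(&attr, &sp);
		pthread_create(&t2, &attr, waiter, NULL);

		pthread_join(t1, NULL);
		return 0;
	}
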