From e6745a4da964f5f49b21cbcce5125456984ddc6b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 19 Dec 2022 13:31:07 -0500 Subject: tracing: Add a way to filter function addresses to function names There have been several times when an event recorded a function address in one of its fields and I needed to filter on that address by function name. It required looking up the function in kallsyms, finding its size, and doing a compare of "field >= function_start && field < function_end". But those addresses change from boot to boot, which makes such scripts unreliable. It is also useful to have this at boot, where the addresses are not yet known. For example, on the boot command line: trace_trigger="initcall_finish.traceoff if func.function == acpi_init" To implement this, add a ".function" suffix that checks that the field is of size long; the only operations allowed (so far) are "==" and "!=". Link: https://lkml.kernel.org/r/20221219183213.916833763@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Tom Zanussi Cc: Zheng Yejian Reviewed-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/events.rst | 12 +++++ kernel/trace/trace_events.c | 2 +- kernel/trace/trace_events_filter.c | 93 +++++++++++++++++++++++++++++++++++++- 3 files changed, 105 insertions(+), 2 deletions(-) diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index c47f381d0c00..d0fd5c7220b7 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -207,6 +207,18 @@ field name:: As the kernel will have to know how to retrieve the memory that the pointer is at from user space. +You can convert any long type to a function address and search by function name:: + + call_site.function == security_prepare_creds + +The above will match when the value of the field "call_site" falls within the +function "security_prepare_creds". That is, the filter compares the value of +"call_site" and returns true if it is greater than or equal to the start of +the function "security_prepare_creds" and less than the end of that function. + +The ".function" suffix can only be attached to values of size long, and can only +be compared with "==" or "!=". 
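In other words, the ".function" comparison is a half-open range check. A minimal sketch of the test it performs (mirroring filter_pred_function() in the patch below; the helper name here is illustrative, not part of the patch):

	/* Sketch: the check behind "field.function == name". */
	static bool addr_in_func(unsigned long addr,
				 unsigned long func_start, unsigned long func_end)
	{
		return addr >= func_start && addr < func_end; /* "!=" negates this */
	}
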
+ 5.2 Setting filters ------------------- diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 33e0b4f8ebe6..b07931ad97de 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2822,7 +2822,7 @@ static __init int setup_trace_triggers(char *str) if (!trigger) break; bootup_triggers[i].event = strsep(&trigger, "."); - bootup_triggers[i].trigger = strsep(&trigger, "."); + bootup_triggers[i].trigger = trigger; if (!bootup_triggers[i].trigger) break; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e095c3b3a50d..1dad64267878 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -64,6 +64,7 @@ enum filter_pred_fn { FILTER_PRED_FN_PCHAR_USER, FILTER_PRED_FN_PCHAR, FILTER_PRED_FN_CPU, + FILTER_PRED_FN_FUNCTION, FILTER_PRED_FN_, FILTER_PRED_TEST_VISITED, }; @@ -71,6 +72,7 @@ enum filter_pred_fn { struct filter_pred { enum filter_pred_fn fn_num; u64 val; + u64 val2; struct regex regex; unsigned short *ops; struct ftrace_event_field *field; @@ -103,6 +105,7 @@ struct filter_pred { C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ + C(NO_FUNCTION, "Function not found"), \ C(ERRNO, "Error"), \ C(NO_FILTER, "No filter found") @@ -876,6 +879,17 @@ static int filter_pred_comm(struct filter_pred *pred, void *event) return cmp ^ pred->not; } +/* Filter predicate for functions. */ +static int filter_pred_function(struct filter_pred *pred, void *event) +{ + unsigned long *addr = (unsigned long *)(event + pred->offset); + unsigned long start = (unsigned long)pred->val; + unsigned long end = (unsigned long)pred->val2; + int ret = *addr >= start && *addr < end; + + return pred->op == OP_EQ ? 
ret : !ret; +} + /* * regex_match_foo - Basic regex callbacks * @@ -1335,6 +1349,8 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) return filter_pred_pchar(pred, event); case FILTER_PRED_FN_CPU: return filter_pred_cpu(pred, event); + case FILTER_PRED_FN_FUNCTION: + return filter_pred_function(pred, event); case FILTER_PRED_TEST_VISITED: return test_pred_visited_fn(pred, event); default: @@ -1350,8 +1366,13 @@ static int parse_pred(const char *str, void *data, struct trace_event_call *call = data; struct ftrace_event_field *field; struct filter_pred *pred = NULL; + unsigned long offset; + unsigned long size; + unsigned long ip; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + char *name; + bool function = false; bool ustring = false; char q; u64 val; @@ -1393,6 +1414,12 @@ static int parse_pred(const char *str, void *data, i += len; } + /* See if the field is a kernel function name */ + if ((len = str_has_prefix(str + i, ".function"))) { + function = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1423,7 +1450,71 @@ static int parse_pred(const char *str, void *data, pred->offset = field->offset; pred->op = op; - if (ftrace_event_is_function(call)) { + if (function) { + /* The field must be the same size as long */ + if (field->size != sizeof(long)) { + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Function only works with '==' or '!=' and an unquoted string */ + switch (op) { + case OP_NE: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; + } + + if (isdigit(str[i])) { + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } + + strncpy(num_buf, str + s, len); + num_buf[len] = 0; + + ret = kstrtoul(num_buf, 0, &ip); + if (ret) { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } + } else { + s = i; + for (; str[i] && !isspace(str[i]); i++) + ; + + len = i - s; + name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!name) + goto err_mem; + ip = kallsyms_lookup_name(name); + kfree(name); + if (!ip) { + parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + } + + /* Now find the function start and end address */ + if (!kallsyms_lookup_size_offset(ip, &size, &offset)) { + parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + + pred->fn_num = FILTER_PRED_FN_FUNCTION; + pred->val = ip - offset; + pred->val2 = pred->val + size; + + } else if (ftrace_event_is_function(call)) { /* * Perf does things differently with function events. * It only allows an "ip" field, and expects a string. -- cgit v1.2.1 From 7f09d639b8c4959736ff112d0fb42780c37ff6db Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 19 Dec 2022 13:31:08 -0500 Subject: tracing/selftests: Add test for event filtering on function name With the new filter logic of passing in the name of a function to match an instruction pointer (or the address of the function), add a test to make sure that it is functional. This is also the first test to exercise plain event filtering. The filtering has been tested via the trigger logic, which uses the same code, but there was nothing that tested just the event filter, so this test is the first to add such a case. 
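For reference, the name-to-range resolution that both the filter and this test rely on follows the kallsyms pattern used in parse_pred() above. A simplified sketch, with error handling omitted ("exit_mmap" is simply the function the test below uses):

	/* Sketch: resolve a function name (or raw address) to [start, end). */
	unsigned long ip, size, offset, start, end;

	ip = kallsyms_lookup_name("exit_mmap");	/* returns 0 if not found */
	if (ip && kallsyms_lookup_size_offset(ip, &size, &offset)) {
		start = ip - offset;	/* start of the function */
		end = start + size;	/* one past its last byte */
	}
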
Link: https://lkml.kernel.org/r/20221219183214.075559302@goodmis.org Cc: Andrew Morton Cc: Tom Zanussi Cc: Zheng Yejian Cc: linux-kselftest@vger.kernel.org Suggested-by: Masami Hiramatsu (Google) Reviewed-by: Ross Zwisler Acked-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- .../ftrace/test.d/filter/event-filter-function.tc | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc new file mode 100644 index 000000000000..e2ff3bf4df80 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -0,0 +1,58 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event filter function - test event filtering on functions +# requires: set_event events/kmem/kmem_cache_free/filter +# flags: instance + +fail() { #msg + echo $1 + exit_fail +} + +echo "Test event filter function name" +echo 0 > tracing_on +echo 0 > events/enable +echo > trace +echo 'call_site.function == exit_mmap' > events/kmem/kmem_cache_free/filter +echo 1 > events/kmem/kmem_cache_free/enable +echo 1 > tracing_on +ls > /dev/null +echo 0 > events/kmem/kmem_cache_free/enable + +hitcnt=`grep kmem_cache_free trace| grep exit_mmap | wc -l` +misscnt=`grep kmem_cache_free trace| grep -v exit_mmap | wc -l` + +if [ $hitcnt -eq 0 ]; then + exit_fail +fi + +if [ $misscnt -gt 0 ]; then + exit_fail +fi + +address=`grep ' exit_mmap$' /proc/kallsyms | cut -d' ' -f1` + +echo "Test event filter function address" +echo 0 > tracing_on +echo 0 > events/enable +echo > trace +echo "call_site.function == 0x$address" > events/kmem/kmem_cache_free/filter +echo 1 > events/kmem/kmem_cache_free/enable +echo 1 > tracing_on +sleep 1 +echo 0 > events/kmem/kmem_cache_free/enable + +hitcnt=`grep kmem_cache_free trace| grep exit_mmap | wc -l` +misscnt=`grep kmem_cache_free trace| grep -v exit_mmap | wc -l` + +if [ $hitcnt -eq 0 ]; then + exit_fail +fi + +if [ $misscnt -gt 0 ]; then + exit_fail +fi + +reset_events_filter + +exit 0 -- cgit v1.2.1 From b56c68f705cad0cff61fbe132c66ced2c737c65c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 3 Jan 2023 12:49:12 +0000 Subject: ftrace: Add sample with custom ops When reworking core ftrace code or architectural ftrace code, it's often necessary to test/analyse/benchmark a number of ftrace_ops configurations. This patch adds a module which can be used to explore some of those configurations. I'm using this to benchmark various options for changing the way trampolines and handling of ftrace_ops work on arm64, and ensuring other architectures aren't adversely affected. 
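Every configuration the module explores boils down to the same two-step ftrace_ops setup that ops_alloc_init() below performs once per ops; a minimal sketch of that pattern (names are illustrative):

	/* Sketch: attach one ftrace_ops to a single function and register it. */
	static void my_tracer(unsigned long ip, unsigned long parent_ip,
			      struct ftrace_ops *op, struct ftrace_regs *fregs)
	{
		/* tracer body; the benchmark's nop tracer does nothing here */
	}

	static struct ftrace_ops my_ops = {
		.func = my_tracer,
		.flags = 0,	/* e.g. FTRACE_OPS_FL_SAVE_REGS */
	};

	static int __init my_sample_init(void)
	{
		/* filter to one call site, then register */
		WARN_ON_ONCE(ftrace_set_filter_ip(&my_ops, (unsigned long)tracee_relevant, 0, 0));
		WARN_ON_ONCE(register_ftrace_function(&my_ops));
		return 0;
	}
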
For example, in a QEMU+KVM VM running on a 2GHz Xeon E5-2660 workstation, loading the module in various configurations produces: | # insmod ftrace-ops.ko | ftrace_ops: registering: | relevant ops: 1 | tracee: tracee_relevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | irrelevant ops: 0 | tracee: tracee_irrelevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | saving registers: NO | assist recursion: NO | assist RCU: NO | ftrace_ops: Attempted 100000 calls to tracee_relevant [ftrace_ops] in 1681558ns (16ns / call) | # insmod ftrace-ops.ko nr_ops_irrelevant=5 | ftrace_ops: registering: | relevant ops: 1 | tracee: tracee_relevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | irrelevant ops: 5 | tracee: tracee_irrelevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | saving registers: NO | assist recursion: NO | assist RCU: NO | ftrace_ops: Attempted 100000 calls to tracee_relevant [ftrace_ops] in 1693042ns (16ns / call) | # insmod ftrace-ops.ko nr_ops_relevant=2 | ftrace_ops: registering: | relevant ops: 2 | tracee: tracee_relevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | irrelevant ops: 0 | tracee: tracee_irrelevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | saving registers: NO | assist recursion: NO | assist RCU: NO | ftrace_ops: Attempted 100000 calls to tracee_relevant [ftrace_ops] in 11965582ns (119ns / call) | # insmod ftrace-ops.ko save_regs=true | ftrace_ops: registering: | relevant ops: 1 | tracee: tracee_relevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | irrelevant ops: 0 | tracee: tracee_irrelevant [ftrace_ops] | tracer: ops_func_nop [ftrace_ops] | saving registers: YES | assist recursion: NO | assist RCU: NO | ftrace_ops: Attempted 100000 calls to tracee_relevant [ftrace_ops] in 4459624ns (44ns / call) Link: https://lkml.kernel.org/r/20230103124912.2948963-4-mark.rutland@arm.com Cc: Florent Revest Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- samples/Kconfig | 7 ++ samples/Makefile | 1 + samples/ftrace/Makefile | 1 + samples/ftrace/ftrace-ops.c | 252 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 261 insertions(+) create mode 100644 samples/ftrace/ftrace-ops.c diff --git a/samples/Kconfig b/samples/Kconfig index 0d81c00289ee..daf14c35f071 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -46,6 +46,13 @@ config SAMPLE_FTRACE_DIRECT_MULTI that hooks to wake_up_process and schedule, and prints the function addresses. +config SAMPLE_FTRACE_OPS + tristate "Build custom ftrace ops example" + depends on FUNCTION_TRACER + help + This builds an ftrace ops example that hooks two functions and + measures the time taken to invoke one function a number of times. 
+ config SAMPLE_TRACE_ARRAY tristate "Build sample module for kernel access to Ftrace instances" depends on EVENT_TRACING && m diff --git a/samples/Makefile b/samples/Makefile index 9832ef3f8fcb..7cb632ef88ee 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/ +obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace/ obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ subdir-$(CONFIG_SAMPLE_UHID) += uhid obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index faf8cdb79c5f..589baf2ec4e3 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi-modify.o +obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace-ops.o CFLAGS_sample-trace-array.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/ftrace-ops.c b/samples/ftrace/ftrace-ops.c new file mode 100644 index 000000000000..24deb51c7261 --- /dev/null +++ b/samples/ftrace/ftrace-ops.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ftrace.h> + +#include <asm/barrier.h> + +/* + * Arbitrary value chosen to be sufficiently large to minimize noise but + * sufficiently small to complete quickly. + */ +unsigned int nr_function_calls = 100000; +module_param(nr_function_calls, uint, 0); +MODULE_PARM_DESC(nr_function_calls, "How many times to call the relevant tracee"); + +/* + * The number of ops associated with a call site affects whether a tracer can + * be called directly or whether it's necessary to go via the list func, which + * can be significantly more expensive. + */ +unsigned int nr_ops_relevant = 1; +module_param(nr_ops_relevant, uint, 0); +MODULE_PARM_DESC(nr_ops_relevant, "How many ftrace_ops to associate with the relevant tracee"); + +/* + * On architectures where all call sites share the same trampoline, having + * tracers enabled for distinct functions can force the use of the list func + * and incur overhead for all call sites. + */ +unsigned int nr_ops_irrelevant = 0; +module_param(nr_ops_irrelevant, uint, 0); +MODULE_PARM_DESC(nr_ops_irrelevant, "How many ftrace_ops to associate with the irrelevant tracee"); + +/* + * On architectures with DYNAMIC_FTRACE_WITH_REGS, saving the full pt_regs can + * be more expensive than only saving the minimal necessary regs. + */ +bool save_regs = false; +module_param(save_regs, bool, 0); +MODULE_PARM_DESC(save_regs, "Register ops with FTRACE_OPS_FL_SAVE_REGS (save all registers in the trampoline)"); + +bool assist_recursion = false; +module_param(assist_recursion, bool, 0); +MODULE_PARM_DESC(assist_recursion, "Register ops with FTRACE_OPS_FL_RECURSION"); + +bool assist_rcu = false; +module_param(assist_rcu, bool, 0); +MODULE_PARM_DESC(assist_rcu, "Register ops with FTRACE_OPS_FL_RCU"); + +/* + * By default, a trivial tracer is used which immediately returns to minimize + * overhead. Sometimes a consistency check using a more expensive tracer is + * desirable. 
+ */ +bool check_count = false; +module_param(check_count, bool, 0); +MODULE_PARM_DESC(check_count, "Check that tracers are called the expected number of times"); + +/* + * Usually it's not interesting to leave the ops registered after the test + * runs, but sometimes it can be useful to leave them registered so that they + * can be inspected through the tracefs 'enabled_functions' file. + */ +bool persist = false; +module_param(persist, bool, 0); +MODULE_PARM_DESC(persist, "Successfully load module and leave ftrace ops registered after test completes"); + +/* + * Marked as noinline to ensure that an out-of-line traceable copy is + * generated by the compiler. + * + * The barrier() ensures the compiler won't elide calls by determining there + * are no side-effects. + */ +static noinline void tracee_relevant(void) +{ + barrier(); +} + +/* + * Marked as noinline to ensure that an out-of-line traceable copy is + * generated by the compiler. + * + * The barrier() ensures the compiler won't elide calls by determining there + * are no side-effects. + */ +static noinline void tracee_irrelevant(void) +{ + barrier(); +} + +struct sample_ops { + struct ftrace_ops ops; + unsigned int count; +}; + +static void ops_func_nop(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + /* do nothing */ +} + +static void ops_func_count(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct sample_ops *self; + + self = container_of(op, struct sample_ops, ops); + self->count++; +} + +struct sample_ops *ops_relevant; +struct sample_ops *ops_irrelevant; + +static struct sample_ops *ops_alloc_init(void *tracee, ftrace_func_t func, + unsigned long flags, int nr) +{ + struct sample_ops *ops; + + ops = kcalloc(nr, sizeof(*ops), GFP_KERNEL); + if (WARN_ON_ONCE(!ops)) + return NULL; + + for (unsigned int i = 0; i < nr; i++) { + ops[i].ops.func = func; + ops[i].ops.flags = flags; + WARN_ON_ONCE(ftrace_set_filter_ip(&ops[i].ops, (unsigned long)tracee, 0, 0)); + WARN_ON_ONCE(register_ftrace_function(&ops[i].ops)); + } + + return ops; +} + +static void ops_destroy(struct sample_ops *ops, int nr) +{ + if (!ops) + return; + + for (unsigned int i = 0; i < nr; i++) { + WARN_ON_ONCE(unregister_ftrace_function(&ops[i].ops)); + ftrace_free_filter(&ops[i].ops); + } + + kfree(ops); } + +static void ops_check(struct sample_ops *ops, int nr, + unsigned int expected_count) +{ + if (!ops || !check_count) + return; + + for (unsigned int i = 0; i < nr; i++) { + if (ops[i].count == expected_count) + continue; + pr_warn("Counter called %u times (expected %u)\n", + ops[i].count, expected_count); + } +} + +ftrace_func_t tracer_relevant = ops_func_nop; +ftrace_func_t tracer_irrelevant = ops_func_nop; + +static int __init ftrace_ops_sample_init(void) +{ + unsigned long flags = 0; + ktime_t start, end; + u64 period; + + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && save_regs) { + pr_info("this kernel does not support saving registers\n"); + save_regs = false; + } else if (save_regs) { + flags |= FTRACE_OPS_FL_SAVE_REGS; + } + + if (assist_recursion) + flags |= FTRACE_OPS_FL_RECURSION; + + if (assist_rcu) + flags |= FTRACE_OPS_FL_RCU; + + if (check_count) { + tracer_relevant = ops_func_count; + tracer_irrelevant = ops_func_count; + } + + pr_info("registering:\n" + " relevant ops: %u\n" + " tracee: %ps\n" + " tracer: %ps\n" + " irrelevant ops: %u\n" + " tracee: %ps\n" + " tracer: %ps\n" + " saving registers: %s\n" + " assist recursion: %s\n" 
+ " assist RCU: %s\n", + nr_ops_relevant, tracee_relevant, tracer_relevant, + nr_ops_irrelevant, tracee_irrelevant, tracer_irrelevant, + save_regs ? "YES" : "NO", + assist_recursion ? "YES" : "NO", + assist_rcu ? "YES" : "NO"); + + ops_relevant = ops_alloc_init(tracee_relevant, tracer_relevant, + flags, nr_ops_relevant); + ops_irrelevant = ops_alloc_init(tracee_irrelevant, tracer_irrelevant, + flags, nr_ops_irrelevant); + + start = ktime_get(); + for (unsigned int i = 0; i < nr_function_calls; i++) + tracee_relevant(); + end = ktime_get(); + + ops_check(ops_relevant, nr_ops_relevant, nr_function_calls); + ops_check(ops_irrelevant, nr_ops_irrelevant, 0); + + period = ktime_to_ns(ktime_sub(end, start)); + + pr_info("Attempted %u calls to %ps in %lluns (%lluns / call)\n", + nr_function_calls, tracee_relevant, + period, period / nr_function_calls); + + if (persist) + return 0; + + ops_destroy(ops_relevant, nr_ops_relevant); + ops_destroy(ops_irrelevant, nr_ops_irrelevant); + + /* + * The benchmark completed sucessfully, but there's no reason to keep + * the module around. Return an error do the user doesn't have to + * manually unload the module. + */ + return -EINVAL; +} +module_init(ftrace_ops_sample_init); + +static void __exit ftrace_ops_sample_exit(void) +{ + ops_destroy(ops_relevant, nr_ops_relevant); + ops_destroy(ops_irrelevant, nr_ops_irrelevant); +} +module_exit(ftrace_ops_sample_exit); + +MODULE_AUTHOR("Mark Rutland"); +MODULE_DESCRIPTION("Example of using custom ftrace_ops"); +MODULE_LICENSE("GPL"); -- cgit v1.2.1 From 3e4272b9954094907f16861199728f14002fcaf6 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 13 Jan 2023 20:55:01 +0800 Subject: tracing: Add NULL checks for buffer in ring_buffer_free_read_page() In a previous commit 7433632c9ff6, buffer, buffer->buffers and buffer->buffers[cpu] in ring_buffer_wake_waiters() can be NULL, and thus the related checks are added. However, in the same call stack, these variables are also used in ring_buffer_free_read_page(): tracing_buffers_release() ring_buffer_wake_waiters(iter->array_buffer->buffer) cpu_buffer = buffer->buffers[cpu] -> Add checks by previous commit ring_buffer_free_read_page(iter->array_buffer->buffer) cpu_buffer = buffer->buffers[cpu] -> No check Thus, to avod possible null-pointer derefernces, the related checks should be added. These results are reported by a static tool designed by myself. 
Link: https://lkml.kernel.org/r/20230113125501.760324-1-baijiaju1990@gmail.com Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c366a0a9ddba..45d4a23d6044 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5626,11 +5626,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); */ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data; struct page *page = virt_to_page(bpage); unsigned long flags; + if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) + return; + + cpu_buffer = buffer->buffers[cpu]; + /* If the page is still in use someplace else, we can't reuse it */ if (page_ref_count(page) > 1) goto out; -- cgit v1.2.1 From 19ff80496450f58a148effa2551350f3b89e4990 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 17 Jan 2023 10:21:26 -0500 Subject: tracing: Simplify calculating entry size using struct_size() When tracing a dynamic string field for a synthetic event, the offset calculation for where to write the next event can use struct_size() to find the current size of the structure. This simplifies the code and makes it less error prone (struct_size(entry, fields, event->n_u64) computes the size of the structure including event->n_u64 elements of its flexible "fields" array, with overflow checking). Link: https://lkml.kernel.org/r/20230117152235.698632147@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Tom Zanussi Cc: Ross Zwisler Cc: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 67592eed0be8..9f79cd689b79 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -416,8 +416,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, if (is_dynamic) { u32 data_offset; - data_offset = offsetof(typeof(*entry), fields); - data_offset += event->n_u64 * sizeof(u64); + data_offset = struct_size(entry, fields, event->n_u64); data_offset += data_size; len = kern_fetch_store_strlen((unsigned long)str_val); -- cgit v1.2.1 From 288709c9f3b0236000754824bcabc9a9ffaa3738 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 17 Jan 2023 10:21:27 -0500 Subject: tracing: Allow stacktraces to be saved as histogram variables Allow saving stacktraces into a histogram variable. This will be used by synthetic events to allow a stacktrace from one event to be passed and displayed by another event. The special keyword "stacktrace" is to be used to trigger a stack trace for the event that the histogram trigger is attached to. echo 'hist:keys=pid:st=stacktrace' > events/sched/sched_waking/trigger Currently nothing can get access to the "$st" variable above that contains the stack trace, but that will soon change. 
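The mechanics, visible in the hist diff that follows, reuse the per-element string storage: the trace is captured with stack_trace_save() and zero-terminated when it does not fill the buffer. A condensed sketch (constants as defined in the patch):

	/* Sketch: capture a stacktrace into a variable's string storage. */
	unsigned long stack[HIST_STACKTRACE_DEPTH];
	int e;

	e = stack_trace_save(stack, HIST_STACKTRACE_DEPTH, HIST_STACKTRACE_SKIP);
	if (e < HIST_STACKTRACE_DEPTH - 1)
		stack[e] = 0;	/* zero-terminate so readers can find the end */
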
Link: https://lkml.kernel.org/r/20230117152235.856323729@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Tom Zanussi Cc: Ross Zwisler Cc: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 52 ++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5edbf6b1da3f..f8f67e17898a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1360,6 +1360,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_STACKTRACE) + field_name = "stacktrace"; else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -1980,6 +1982,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_STACKTRACE) { hist_field->fn_num = HIST_FIELD_FN_NOP; + hist_field->size = HIST_STACKTRACE_SIZE; + hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -2351,6 +2357,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; + } else if (strcmp(field_name, "stacktrace") == 0) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; else if (strcmp(field_name, "hitcount") == 0) @@ -3119,13 +3127,24 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, var_val = hist_fn_call(val, elt, buffer, rbe, rec); var_idx = var->var.idx; - if (val->flags & HIST_FIELD_FL_STRING) { + if (val->flags & (HIST_FIELD_FL_STRING | + HIST_FIELD_FL_STACKTRACE)) { char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; unsigned int size; - size = min(val->size, STR_VAR_LEN_MAX); - strscpy(str, val_str, size); + if (val->flags & HIST_FIELD_FL_STRING) { + size = min(val->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + int e; + + e = stack_trace_save((void *)str, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)str)[e] = 0; + } var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -3824,7 +3843,8 @@ static void save_field_var(struct hist_trigger_data *hist_data, { hist_data->field_vars[hist_data->n_field_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) + /* Stack traces are saved in the string storage too */ + if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->n_field_var_str++; } @@ -4103,7 +4123,8 @@ static int action_create(struct hist_trigger_data *hist_data, } hist_data->save_vars[hist_data->n_save_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) + if (field_var->val->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->n_save_var_str++; kfree(param); } @@ -4351,7 +4372,8 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME) update_var_execname(hist_data->fields[val_idx]); - if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + if (!ret && hist_data->fields[val_idx]->flags & + (HIST_FIELD_FL_STRING | 
HIST_FIELD_FL_STACKTRACE)) hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; return ret; @@ -5092,7 +5114,8 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; - if (hist_field->flags & HIST_FIELD_FL_STRING) { + if (hist_field->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) { unsigned int str_start, var_str_idx, idx; char *str, *val_str; unsigned int size; @@ -5105,9 +5128,18 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - size = min(hist_field->size, STR_VAR_LEN_MAX); - strscpy(str, val_str, size); - + if (hist_field->flags & HIST_FIELD_FL_STRING) { + size = min(hist_field->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + int e; + + e = stack_trace_save((void *)str, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)str)[e] = 0; + } hist_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, hist_val); -- cgit v1.2.1 From 00cf3d672a9dd409418647e9f98784c339c3ff63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 17 Jan 2023 10:21:28 -0500 Subject: tracing: Allow synthetic events to pass around stacktraces Allow a stacktrace from one event to be displayed by the end event of a synthetic event. This is very useful when looking for the longest latency of a sleep or something blocked on I/O. # cd /sys/kernel/tracing/ # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 1||prev_state == 2' > events/sched/sched_switch/trigger # echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(block_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger The above creates a "block_lat" synthetic event that take the stacktrace of when a task schedules out in either the interruptible or uninterruptible states, and on a new per process max $delta (the time it was scheduled out), will print the process id and the stacktrace. # echo 1 > events/synthetic/block_lat/enable # cat trace # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | kworker/u16:0-767 [006] d..4. 560.645045: block_lat: pid=767 delta=66 stack=STACK: => __schedule => schedule => pipe_read => vfs_read => ksys_read => do_syscall_64 => 0x966000aa -0 [003] d..4. 561.132117: block_lat: pid=0 delta=413787 stack=STACK: => __schedule => schedule => schedule_hrtimeout_range_clock => do_sys_poll => __x64_sys_poll => do_syscall_64 => 0x966000aa <...>-153 [006] d..4. 
562.068407: block_lat: pid=153 delta=54 stack=STACK: => __schedule => schedule => io_schedule => rq_qos_wait => wbt_wait => __rq_qos_throttle => blk_mq_submit_bio => submit_bio_noacct_nocheck => ext4_bio_write_page => mpage_submit_page => mpage_process_page_bufs => mpage_prepare_extent_to_map => ext4_do_writepages => ext4_writepages => do_writepages => __writeback_single_inode Link: https://lkml.kernel.org/r/20230117152236.010941267@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Tom Zanussi Cc: Ross Zwisler Cc: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 4 ++ kernel/trace/trace_events_hist.c | 7 ++-- kernel/trace/trace_events_synth.c | 80 ++++++++++++++++++++++++++++++++++++++- kernel/trace/trace_synth.h | 1 + 4 files changed, 87 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4eb6d6b97a9f..d16929dd0f08 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -113,6 +113,10 @@ enum trace_type { #define MEM_FAIL(condition, fmt, ...) \ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f8f67e17898a..2c0837ff2299 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -480,10 +480,6 @@ DEFINE_HIST_FIELD_FN(u8); #define for_each_hist_key_field(i, hist_data) \ for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) -#define HIST_STACKTRACE_DEPTH 16 -#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) -#define HIST_STACKTRACE_SKIP 5 - #define HITCOUNT_IDX 0 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) @@ -3869,6 +3865,9 @@ static int check_synth_field(struct synth_event *event, && field->is_dynamic) return 0; + if (strstr(hist_field->type, "long[") && field->is_stack) + return 0; + if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || (!field->is_string && field->is_signed != hist_field->is_signed)) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 9f79cd689b79..adb630633f31 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -173,6 +173,14 @@ static int synth_field_is_string(char *type) return false; } +static int synth_field_is_stack(char *type) +{ + if (strstr(type, "long[") != NULL) + return true; + + return false; +} + static int synth_field_string_size(char *type) { char buf[4], *end, *start; @@ -248,6 +256,8 @@ static int synth_field_size(char *type) size = sizeof(gfp_t); else if (synth_field_is_string(type)) size = synth_field_string_size(type); + else if (synth_field_is_stack(type)) + size = 0; return size; } @@ -292,6 +302,8 @@ static const char *synth_field_fmt(char *type) fmt = "%x"; else if (synth_field_is_string(type)) fmt = "%.*s"; + else if (synth_field_is_stack(type)) + fmt = "%s"; return fmt; } @@ -371,6 +383,23 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, i == se->n_fields - 1 ? 
"" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } + } else if (se->fields[i]->is_stack) { + u32 offset, data_offset, len; + unsigned long *p, *end; + + offset = (u32)entry->fields[n_u64]; + data_offset = offset & 0xffff; + len = offset >> 16; + + p = (void *)entry + data_offset; + end = (void *)p + len - (sizeof(long) - 1); + + trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); + + for (; *p && p < end; p++) + trace_seq_printf(s, "=> %pS\n", (void *)*p); + n_u64++; + } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -446,6 +475,43 @@ static unsigned int trace_string(struct synth_trace_event *entry, return len; } +static unsigned int trace_stack(struct synth_trace_event *entry, + struct synth_event *event, + long *stack, + unsigned int data_size, + unsigned int *n_u64) +{ + unsigned int len; + u32 data_offset; + void *data_loc; + + data_offset = struct_size(entry, fields, event->n_u64); + data_offset += data_size; + + for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) { + if (!stack[len]) + break; + } + + /* Include the zero'd element if it fits */ + if (len < HIST_STACKTRACE_DEPTH) + len++; + + len *= sizeof(long); + + /* Find the dynamic section to copy the stack into. */ + data_loc = (void *)entry + data_offset; + memcpy(data_loc, stack, len); + + /* Fill in the field that holds the offset/len combo */ + data_offset |= len << 16; + *(u32 *)&entry->fields[*n_u64] = data_offset; + + (*n_u64)++; + + return len; +} + static notrace void trace_event_raw_event_synth(void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx) @@ -498,6 +564,12 @@ static notrace void trace_event_raw_event_synth(void *__data, event->fields[i]->is_dynamic, data_size, &n_u64); data_size += len; /* only dynamic string increments */ + } if (event->fields[i]->is_stack) { + long *stack = (long *)(long)var_ref_vals[val_idx]; + + len = trace_stack(entry, event, stack, + data_size, &n_u64); + data_size += len; } else { struct synth_field *field = event->fields[i]; u64 val = var_ref_vals[val_idx]; @@ -560,6 +632,9 @@ static int __set_synth_event_print_fmt(struct synth_event *event, event->fields[i]->is_dynamic) pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(%s)", event->fields[i]->name); + else if (event->fields[i]->is_stack) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_stacktrace(%s)", event->fields[i]->name); else pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", event->fields[i]->name); @@ -696,7 +771,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, ret = -EINVAL; goto free; } else if (size == 0) { - if (synth_field_is_string(field->type)) { + if (synth_field_is_string(field->type) || + synth_field_is_stack(field->type)) { char *type; len = sizeof("__data_loc ") + strlen(field->type) + 1; @@ -727,6 +803,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, if (synth_field_is_string(field->type)) field->is_string = true; + else if (synth_field_is_stack(field->type)) + field->is_stack = true; field->is_signed = synth_field_signed(field->type); out: diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index b29595fe3ac5..43f6fb6078db 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -18,6 +18,7 @@ struct synth_field { bool is_signed; bool is_string; bool is_dynamic; + bool is_stack; }; struct synth_event { -- cgit v1.2.1 From cc5fc8bfc961eeb99b7e8dffbeff7a3f6995d314 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 17 Jan 2023 10:21:29 -0500 Subject: 
tracing/histogram: Add stacktrace type Now that stacktraces can be part of synthetic events, allow a key to be typed as a stacktrace. # cd /sys/kernel/tracing # echo 's:block_lat u64 delta; unsigned long stack[];' >> dynamic_events # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger # echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,st2=$st:onmatch(sched.sched_switch).trace(block_lat,$delta,$st2)' >> events/sched/sched_switch/trigger # echo 'hist:keys=delta.buckets=100,stack.stacktrace:sort=delta' > events/synthetic/block_lat/trigger # cat events/synthetic/block_lat/hist # event histogram # # trigger info: hist:keys=delta.buckets=100,stacktrace:vals=hitcount:sort=delta.buckets=100:size=2048 [active] # { delta: ~ 0-99, stacktrace: event_hist_trigger+0x464/0x480 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x193/0x250 trace_event_raw_event_sched_switch+0xfc/0x150 __traceiter_sched_switch+0x41/0x60 __schedule+0x448/0x7b0 schedule_idle+0x26/0x40 cpu_startup_entry+0x19/0x20 start_secondary+0xed/0xf0 secondary_startup_64_no_verify+0xe0/0xeb } hitcount: 6 { delta: ~ 0-99, stacktrace: event_hist_trigger+0x464/0x480 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x193/0x250 trace_event_raw_event_sched_switch+0xfc/0x150 __traceiter_sched_switch+0x41/0x60 __schedule+0x448/0x7b0 schedule_idle+0x26/0x40 cpu_startup_entry+0x19/0x20 __pfx_kernel_init+0x0/0x10 arch_call_rest_init+0xa/0x24 start_kernel+0x964/0x98d secondary_startup_64_no_verify+0xe0/0xeb } hitcount: 3 { delta: ~ 0-99, stacktrace: event_hist_trigger+0x464/0x480 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x193/0x250 trace_event_raw_event_sched_switch+0xfc/0x150 __traceiter_sched_switch+0x41/0x60 __schedule+0x448/0x7b0 schedule+0x5a/0xb0 worker_thread+0xaf/0x380 kthread+0xe9/0x110 ret_from_fork+0x2c/0x50 } hitcount: 1 { delta: ~ 100-199, stacktrace: event_hist_trigger+0x464/0x480 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x193/0x250 trace_event_raw_event_sched_switch+0xfc/0x150 __traceiter_sched_switch+0x41/0x60 __schedule+0x448/0x7b0 schedule_idle+0x26/0x40 cpu_startup_entry+0x19/0x20 start_secondary+0xed/0xf0 secondary_startup_64_no_verify+0xe0/0xeb } hitcount: 15 [..] 
{ delta: ~ 8500-8599, stacktrace: event_hist_trigger+0x464/0x480 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x193/0x250 trace_event_raw_event_sched_switch+0xfc/0x150 __traceiter_sched_switch+0x41/0x60 __schedule+0x448/0x7b0 schedule_idle+0x26/0x40 cpu_startup_entry+0x19/0x20 start_secondary+0xed/0xf0 secondary_startup_64_no_verify+0xe0/0xeb } hitcount: 1 Totals: Hits: 89 Entries: 11 Dropped: 0 Link: https://lkml.kernel.org/r/20230117152236.167046397@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Tom Zanussi Cc: Ross Zwisler Cc: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 2c0837ff2299..7f3e6ca6f0fa 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1716,6 +1716,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "percent"; else if (hist_field->flags & HIST_FIELD_FL_GRAPH) flags_str = "graph"; + else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + flags_str = "stacktrace"; return flags_str; } @@ -2314,6 +2316,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_EXECNAME; else if (strcmp(modifier, "syscall") == 0) *flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(modifier, "stacktrace") == 0) + *flags |= HIST_FIELD_FL_STACKTRACE; else if (strcmp(modifier, "log2") == 0) *flags |= HIST_FIELD_FL_LOG2; else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; -- cgit v1.2.1 From 88238513bb26713ef6d4418bdf1af062fe608bcb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 17 Jan 2023 10:21:30 -0500 Subject: tracing/histogram: Document variable stacktrace Add a little documentation (and a useful example) of how a stacktrace can be used within a histogram variable and synthetic event. Link: https://lkml.kernel.org/r/20230117152236.320181354@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Tom Zanussi Cc: Ross Zwisler Cc: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/histogram.rst | 129 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index f95459aa984f..5c391328b9bb 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -81,6 +81,7 @@ Documentation written by Tom Zanussi .usecs display a common_timestamp in microseconds .percent display a number of percentage value .graph display a bar-graph of a value + .stacktrace display as a stacktrace (must be a long[] type) ============= ================================================= Note that in general the semantics of a given field aren't @@ -1786,6 +1787,8 @@ or assigned to a variable and referenced in a subsequent expression:: # echo 'hist:keys=next_pid:us_per_sec=1000000 ...' >> event/trigger # echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/$us_per_sec ...' >> event/trigger +Variables can even hold stacktraces, which are useful with synthetic events. + 2.2.2 Synthetic Events ---------------------- @@ -1940,6 +1943,132 @@ the ".buckets" modifier and specify a size (in this case groups of 10). Entries: 16 Dropped: 0 +To save stacktraces, create a synthetic event with a field of type "unsigned long[]" +or even just "long[]". 
For example, to see how long a task is blocked in an +uninterruptible state: + + # cd /sys/kernel/tracing + # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events + # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger + # echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(block_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger + # echo 1 > events/synthetic/block_lat/enable + # cat trace + + # tracer: nop + # + # entries-in-buffer/entries-written: 2/2 #P:8 + # + # _-----=> irqs-off/BH-disabled + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / _-=> migrate-disable + # |||| / delay + # TASK-PID CPU# ||||| TIMESTAMP FUNCTION + # | | | ||||| | | + -0 [005] d..4. 521.164922: block_lat: pid=0 delta=8322 stack=STACK: + => __schedule+0x448/0x7b0 + => schedule+0x5a/0xb0 + => io_schedule+0x42/0x70 + => bit_wait_io+0xd/0x60 + => __wait_on_bit+0x4b/0x140 + => out_of_line_wait_on_bit+0x91/0xb0 + => jbd2_journal_commit_transaction+0x1679/0x1a70 + => kjournald2+0xa9/0x280 + => kthread+0xe9/0x110 + => ret_from_fork+0x2c/0x50 + + <...>-2 [004] d..4. 525.184257: block_lat: pid=2 delta=76 stack=STACK: + => __schedule+0x448/0x7b0 + => schedule+0x5a/0xb0 + => schedule_timeout+0x11a/0x150 + => wait_for_completion_killable+0x144/0x1f0 + => __kthread_create_on_node+0xe7/0x1e0 + => kthread_create_on_node+0x51/0x70 + => create_worker+0xcc/0x1a0 + => worker_thread+0x2ad/0x380 + => kthread+0xe9/0x110 + => ret_from_fork+0x2c/0x50 + +A synthetic event that has a stacktrace field may use it as a key in histogram: + + # echo 'hist:delta.buckets=100,stack.stacktrace:sort=delta' > events/synthetic/block_lat/trigger + # cat events/synthetic/block_lat/hist + + # event histogram + # + # trigger info: hist:keys=delta.buckets=100,stacktrace:vals=hitcount:sort=delta.buckets=100:size=2048 [active] + # + + { delta: ~ 0-99, stacktrace: + event_hist_trigger+0x464/0x480 + event_triggers_call+0x52/0xe0 + trace_event_buffer_commit+0x193/0x250 + trace_event_raw_event_sched_switch+0xfc/0x150 + __traceiter_sched_switch+0x41/0x60 + __schedule+0x448/0x7b0 + schedule_idle+0x26/0x40 + cpu_startup_entry+0x19/0x20 + start_secondary+0xed/0xf0 + secondary_startup_64_no_verify+0xe0/0xeb + } hitcount: 6 + { delta: ~ 0-99, stacktrace: + event_hist_trigger+0x464/0x480 + event_triggers_call+0x52/0xe0 + trace_event_buffer_commit+0x193/0x250 + trace_event_raw_event_sched_switch+0xfc/0x150 + __traceiter_sched_switch+0x41/0x60 + __schedule+0x448/0x7b0 + schedule_idle+0x26/0x40 + cpu_startup_entry+0x19/0x20 + __pfx_kernel_init+0x0/0x10 + arch_call_rest_init+0xa/0x24 + start_kernel+0x964/0x98d + secondary_startup_64_no_verify+0xe0/0xeb + } hitcount: 3 + { delta: ~ 0-99, stacktrace: + event_hist_trigger+0x464/0x480 + event_triggers_call+0x52/0xe0 + trace_event_buffer_commit+0x193/0x250 + trace_event_raw_event_sched_switch+0xfc/0x150 + __traceiter_sched_switch+0x41/0x60 + __schedule+0x448/0x7b0 + schedule+0x5a/0xb0 + worker_thread+0xaf/0x380 + kthread+0xe9/0x110 + ret_from_fork+0x2c/0x50 + } hitcount: 1 + { delta: ~ 100-199, stacktrace: + event_hist_trigger+0x464/0x480 + event_triggers_call+0x52/0xe0 + trace_event_buffer_commit+0x193/0x250 + trace_event_raw_event_sched_switch+0xfc/0x150 + __traceiter_sched_switch+0x41/0x60 + __schedule+0x448/0x7b0 + schedule_idle+0x26/0x40 + cpu_startup_entry+0x19/0x20 + start_secondary+0xed/0xf0 + secondary_startup_64_no_verify+0xe0/0xeb 
+ } hitcount: 15 + [..] + { delta: ~ 8500-8599, stacktrace: + event_hist_trigger+0x464/0x480 + event_triggers_call+0x52/0xe0 + trace_event_buffer_commit+0x193/0x250 + trace_event_raw_event_sched_switch+0xfc/0x150 + __traceiter_sched_switch+0x41/0x60 + __schedule+0x448/0x7b0 + schedule_idle+0x26/0x40 + cpu_startup_entry+0x19/0x20 + start_secondary+0xed/0xf0 + secondary_startup_64_no_verify+0xe0/0xeb + } hitcount: 1 + + Totals: + Hits: 89 + Entries: 11 + Dropped: 0 + 2.2.3 Hist trigger 'handlers' and 'actions' ------------------------------------------- -- cgit v1.2.1 From b81a3a100cca1bd63d897efa339d047986615c9f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 17 Jan 2023 10:21:31 -0500 Subject: tracing/histogram: Add simple tests for stacktrace usage of synthetic events Update the selftests to include a test of passing a stacktrace between the events of a synthetic event. Link: https://lkml.kernel.org/r/20230117152236.475439286@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Tom Zanussi Cc: Ross Zwisler Cc: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- .../inter-event/trigger-synthetic-event-stack.tc | 24 ++++++++++++++++++++++ .../inter-event/trigger-synthetic-event-syntax.tc | 6 ++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78ed5f1baa8c..b90eecd27dfc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5757,7 +5757,7 @@ static const char readme_msg[] = #ifdef CONFIG_SYNTH_EVENTS " events/synthetic_events\t- Create/append/remove/show synthetic events\n" "\t Write into this file to define/undefine new synthetic events.\n" - "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" + "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" #endif #endif ; diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc new file mode 100644 index 000000000000..755dbe94ccf4 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc @@ -0,0 +1,24 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test inter-event histogram trigger trace action with stacktrace param +# requires: set_event synthetic_events events/sched/sched_process_exec/hist "long[]' >> synthetic_events":README + +fail() { #msg + echo $1 + exit_fail +} + +echo "Test create synthetic event with stack" + + +echo 's:wake_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events +echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 1||prev_state == 2' >> events/sched/sched_switch/trigger +echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(wake_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger +echo 1 > events/synthetic/wake_lat/enable +sleep 1 +if ! 
grep -q "=>.*sched" trace; then + fail "Failed to create synthetic event with stack" +fi + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc index 2968cdc7df30..366f1f3ad906 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc @@ -70,6 +70,12 @@ grep "myevent[[:space:]]unsigned long var" synthetic_events echo "myevent char var[10]" > synthetic_events grep "myevent[[:space:]]char\[10\] var" synthetic_events +if grep -q 'long\[\]' README; then + # test stacktrace type + echo "myevent unsigned long[] var" > synthetic_events + grep "myevent[[:space:]]unsigned long\[\] var" synthetic_events +fi + do_reset exit 0 -- cgit v1.2.1 From 92a22cea4c847b518af646a809cd662d55f9d8ac Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 24 Jan 2023 15:22:39 -0500 Subject: perf/tracing: Use stage6 of tracing to not duplicate macros The perf events are created by the same macro magic as tracefs trace events are. But to hook into perf, it has its own code. It duplicates many of the same macros as the tracefs macros and this is an issue because it misses bug fixes as well as any new enhancements that come with the other trace macros. As the trace macros have been put into their own staging files, have perf take advantage of this and use the tracefs stage 6 macros that the "fast assign" portion of the trace event macro uses. Link: https://lkml.kernel.org/r/20230124202515.716458410@goodmis.org Link: https://lore.kernel.org/lkml/1671181385-5719-1-git-send-email-quic_linyyuan@quicinc.com/ Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra (Intel) Reported-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- include/trace/perf.h | 46 +--------------------------- include/trace/stages/stage6_event_callback.h | 3 ++ 2 files changed, 4 insertions(+), 45 deletions(-) diff --git a/include/trace/perf.h b/include/trace/perf.h index 8f3bf1e17707..2c11181c82e0 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -4,51 +4,7 @@ #ifdef CONFIG_PERF_EVENTS -#undef __entry -#define __entry entry - -#undef __get_dynamic_array -#define __get_dynamic_array(field) \ - ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) - -#undef __get_dynamic_array_len -#define __get_dynamic_array_len(field) \ - ((__entry->__data_loc_##field >> 16) & 0xffff) - -#undef __get_str -#define __get_str(field) ((char *)__get_dynamic_array(field)) - -#undef __get_bitmask -#define __get_bitmask(field) (char *)__get_dynamic_array(field) - -#undef __get_cpumask -#define __get_cpumask(field) (char *)__get_dynamic_array(field) - -#undef __get_sockaddr -#define __get_sockaddr(field) ((struct sockaddr *)__get_dynamic_array(field)) - -#undef __get_rel_dynamic_array -#define __get_rel_dynamic_array(field) \ - ((void *)__entry + \ - offsetof(typeof(*__entry), __rel_loc_##field) + \ - sizeof(__entry->__rel_loc_##field) + \ - (__entry->__rel_loc_##field & 0xffff)) - -#undef __get_rel_dynamic_array_len -#define __get_rel_dynamic_array_len(field) \ - ((__entry->__rel_loc_##field >> 16) & 0xffff) - -#undef __get_rel_str -#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) - -#undef __get_rel_bitmask -#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) - -#undef 
__get_rel_cpumask -#define __get_rel_cpumask(field) (char *)__get_rel_dynamic_array(field) - -#undef __get_rel_sockaddr -#define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) +#include "stages/stage6_event_callback.h" #undef __perf_count #define __perf_count(c) (__count = (c)) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 49c32394b53f..919b1a4da980 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -2,6 +2,9 @@ /* Stage 6 definitions for creating trace events */ +/* Reuse some of the stage 3 macros */ +#include "stage3_trace_output.h" + #undef __entry #define __entry entry -- cgit v1.2.1 From dc513fd5321d4c15ac1f820c224b2e5220b6e14f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 24 Jan 2023 15:22:40 -0500 Subject: bpf/tracing: Use stage6 of tracing to not duplicate macros The bpf events are created by the same macro magic as tracefs trace events are. But to hook into bpf, it has its own code. It duplicates many of the same macros as the tracefs macros and this is an issue because it misses bug fixes as well as any new enhancements that come with the other trace macros. As the trace macros have been put into their own staging files, have bpf take advantage of this and use the tracefs stage 6 macros that the "fast assign" portion of the trace event macro uses. Link: https://lkml.kernel.org/r/20230124202515.873075730@goodmis.org Link: https://lore.kernel.org/lkml/1671181385-5719-1-git-send-email-quic_linyyuan@quicinc.com/ Cc: bpf@vger.kernel.org Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Alexei Starovoitov Reported-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- include/trace/bpf_probe.h | 45 +-------------------------------------------- 1 file changed, 1 insertion(+), 44 deletions(-) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 155c495b89ea..1f7fc1fc590c 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -4,50 +4,7 @@ #ifdef CONFIG_BPF_EVENTS -#undef __entry -#define __entry entry - -#undef __get_dynamic_array -#define __get_dynamic_array(field) \ - ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) - -#undef __get_dynamic_array_len -#define __get_dynamic_array_len(field) \ - ((__entry->__data_loc_##field >> 16) & 0xffff) - -#undef __get_str -#define __get_str(field) ((char *)__get_dynamic_array(field)) - -#undef __get_bitmask -#define __get_bitmask(field) (char *)__get_dynamic_array(field) - -#undef __get_cpumask -#define __get_cpumask(field) (char *)__get_dynamic_array(field) - -#undef __get_sockaddr -#define __get_sockaddr(field) ((struct sockaddr *)__get_dynamic_array(field)) - -#undef __get_rel_dynamic_array -#define __get_rel_dynamic_array(field) \ - ((void *)(&__entry->__rel_loc_##field) + \ - sizeof(__entry->__rel_loc_##field) + \ - (__entry->__rel_loc_##field & 0xffff)) - -#undef __get_rel_dynamic_array_len -#define __get_rel_dynamic_array_len(field) \ - ((__entry->__rel_loc_##field >> 16) & 0xffff) - -#undef __get_rel_str -#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) - -#undef __get_rel_bitmask -#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) - -#undef __get_rel_cpumask -#define __get_rel_cpumask(field) (char *)__get_rel_dynamic_array(field) - -#undef __get_rel_sockaddr -#define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field))
+#include "stages/stage6_event_callback.h" #undef __perf_count #define __perf_count(c) (c) -- cgit v1.2.1 From b18c58af29e465d21d4cd9e8c5008ae0e0147384 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 23 Jan 2023 15:46:49 -0800 Subject: tracing/osnoise: No need for schedule_hrtimeout range No slack time is being passed, just use schedule_hrtimeout(). Link: https://lore.kernel.org/linux-trace-kernel/20230123234649.17968-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 210e1f168392..04f0fdae19a1 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1539,7 +1539,7 @@ static void osnoise_sleep(void) wake_time = ktime_add_us(ktime_get(), interval); __set_current_state(TASK_INTERRUPTIBLE); - while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) { if (kthread_should_stop()) break; } -- cgit v1.2.1 From a2ff84a5d1e65c7d1178f24ecf39fc55283fbd14 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sun, 29 Jan 2023 10:14:02 +0700 Subject: tracing/histogram: Wrap remaining shell snippets in code blocks Most shell command snippets (echo/cat) and their output are already in literal code blocks. However a few still isn't wrapped, in which the htmldocs output is ugly. Wrap the remaining unwrapped snippets, while also fix recent kernel test robot warnings. Link: https://lore.kernel.org/linux-trace-kernel/20230129031402.47420-1-bagasdotme@gmail.com Cc: Masami Hiramatsu Cc: Jonathan Corbet Link: https://lore.kernel.org/linux-doc/202301290253.LU5yIxcJ-lkp@intel.com/ Fixes: 88238513bb2671 ("tracing/histogram: Document variable stacktrace") Reported-by: kernel test robot Signed-off-by: Bagas Sanjaya Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/histogram.rst | 93 ++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 5c391328b9bb..8e95295e39b6 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1864,7 +1864,7 @@ A histogram can now be defined for the new synthetic event:: The above shows the latency "lat" in a power of 2 grouping. Like any other event, once a histogram is enabled for the event, the -output can be displayed by reading the event's 'hist' file. +output can be displayed by reading the event's 'hist' file:: # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist @@ -1911,7 +1911,7 @@ output can be displayed by reading the event's 'hist' file. The latency values can also be grouped linearly by a given size with -the ".buckets" modifier and specify a size (in this case groups of 10). +the ".buckets" modifier and specify a size (in this case groups of 10):: # echo 'hist:keys=pid,prio,lat.buckets=10:sort=lat' >> \ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger @@ -1945,7 +1945,7 @@ the ".buckets" modifier and specify a size (in this case groups of 10). To save stacktraces, create a synthetic event with a field of type "unsigned long[]" or even just "long[]". 
For example, to see how long a task is blocked in an -uninterruptible state: +uninterruptible state:: # cd /sys/kernel/tracing # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events @@ -1990,7 +1990,8 @@ uninterruptible state: => kthread+0xe9/0x110 => ret_from_fork+0x2c/0x50 -A synthetic event that has a stacktrace field may use it as a key in histogram: +A synthetic event that has a stacktrace field may use it as a key in +histogram:: # echo 'hist:delta.buckets=100,stack.stacktrace:sort=delta' > events/synthetic/block_lat/trigger # cat events/synthetic/block_lat/hist @@ -2183,11 +2184,11 @@ The following commonly-used handler.action pairs are available: wakeup_new_test($testpid) if comm=="cyclictest"' >> \ /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger - Or, equivalently, using the 'trace' keyword syntax: + Or, equivalently, using the 'trace' keyword syntax:: - # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\ - trace(wakeup_new_test,$testpid) if comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger + # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\ + trace(wakeup_new_test,$testpid) if comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger Creating and displaying a histogram based on those events is now just a matter of using the fields and new synthetic event in the @@ -2320,48 +2321,48 @@ The following commonly-used handler.action pairs are available: resulting latency, stored in wakeup_lat, exceeds the current maximum latency, a snapshot is taken. As part of the setup, all the scheduler events are also enabled, which are the events that - will show up in the snapshot when it is taken at some point: + will show up in the snapshot when it is taken at some point:: - # echo 1 > /sys/kernel/debug/tracing/events/sched/enable + # echo 1 > /sys/kernel/debug/tracing/events/sched/enable - # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ - if comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_waking/trigger + # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ + if comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ - onmax($wakeup_lat).save(next_prio,next_comm,prev_pid,prev_prio, \ - prev_comm):onmax($wakeup_lat).snapshot() \ - if next_comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).save(next_prio,next_comm,prev_pid,prev_prio, \ + prev_comm):onmax($wakeup_lat).snapshot() \ + if next_comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger When the histogram is displayed, for each bucket the max value and the saved values corresponding to the max are displayed following the rest of the fields. 
If a snapshot was taken, there is also a message indicating that, - along with the value and event that triggered the global maximum: + along with the value and event that triggered the global maximum:: - # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist - { next_pid: 2101 } hitcount: 200 - max: 52 next_prio: 120 next_comm: cyclictest \ - prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 + # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist + { next_pid: 2101 } hitcount: 200 + max: 52 next_prio: 120 next_comm: cyclictest \ + prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 - { next_pid: 2103 } hitcount: 1326 - max: 572 next_prio: 19 next_comm: cyclictest \ - prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 + { next_pid: 2103 } hitcount: 1326 + max: 572 next_prio: 19 next_comm: cyclictest \ + prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 - { next_pid: 2102 } hitcount: 1982 \ - max: 74 next_prio: 19 next_comm: cyclictest \ - prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 + { next_pid: 2102 } hitcount: 1982 \ + max: 74 next_prio: 19 next_comm: cyclictest \ + prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 - Snapshot taken (see tracing/snapshot). Details: - triggering value { onmax($wakeup_lat) }: 572 \ - triggered by event with key: { next_pid: 2103 } + Snapshot taken (see tracing/snapshot). Details: + triggering value { onmax($wakeup_lat) }: 572 \ + triggered by event with key: { next_pid: 2103 } - Totals: - Hits: 3508 - Entries: 3 - Dropped: 0 + Totals: + Hits: 3508 + Entries: 3 + Dropped: 0 In the above case, the event that triggered the global maximum has the key with next_pid == 2103. If you look at the bucket that has @@ -2439,15 +2440,15 @@ The following commonly-used handler.action pairs are available: $cwnd variable. If the value has changed, a snapshot is taken. As part of the setup, all the scheduler and tcp events are also enabled, which are the events that will show up in the snapshot - when it is taken at some point: + when it is taken at some point:: - # echo 1 > /sys/kernel/debug/tracing/events/sched/enable - # echo 1 > /sys/kernel/debug/tracing/events/tcp/enable + # echo 1 > /sys/kernel/debug/tracing/events/sched/enable + # echo 1 > /sys/kernel/debug/tracing/events/tcp/enable - # echo 'hist:keys=dport:cwnd=snd_cwnd: \ - onchange($cwnd).save(snd_wnd,srtt,rcv_wnd): \ - onchange($cwnd).snapshot()' >> \ - /sys/kernel/debug/tracing/events/tcp/tcp_probe/trigger + # echo 'hist:keys=dport:cwnd=snd_cwnd: \ + onchange($cwnd).save(snd_wnd,srtt,rcv_wnd): \ + onchange($cwnd).snapshot()' >> \ + /sys/kernel/debug/tracing/events/tcp/tcp_probe/trigger When the histogram is displayed, for each bucket the tracked value and the saved values corresponding to that value are displayed @@ -2470,10 +2471,10 @@ The following commonly-used handler.action pairs are available: { dport: 443 } hitcount: 211 changed: 10 snd_wnd: 26960 srtt: 17379 rcv_wnd: 28800 - Snapshot taken (see tracing/snapshot). Details:: + Snapshot taken (see tracing/snapshot). 
Details: - triggering value { onchange($cwnd) }: 10 - triggered by event with key: { dport: 80 } + triggering value { onchange($cwnd) }: 10 + triggered by event with key: { dport: 80 } Totals: Hits: 414 -- cgit v1.2.1 From a9c4bdd505630469f93f5efedfc7a9ca254996c8 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 30 Jan 2023 15:54:09 +0800 Subject: tracing: Acquire buffer from temporary trace sequence There is one dwc3 trace event declared as below: DECLARE_EVENT_CLASS(dwc3_log_event, TP_PROTO(u32 event, struct dwc3 *dwc), TP_ARGS(event, dwc), TP_STRUCT__entry( __field(u32, event) __field(u32, ep0state) __dynamic_array(char, str, DWC3_MSG_MAX) ), TP_fast_assign( __entry->event = event; __entry->ep0state = dwc->ep0state; ), TP_printk("event (%08x): %s", __entry->event, dwc3_decode_event(__get_str(str), DWC3_MSG_MAX, __entry->event, __entry->ep0state)) ); The problem is that when the trace function is called, it will allocate up to DWC3_MSG_MAX bytes from the trace event buffer, but it never fills the buffer during the fast assignment; it only fills the buffer when the output function is called. This means that if the output function is never called, the buffer is never used. Add __get_buf(len), which acquires a buffer from iter->tmp_seq when the trace output function is called, allowing the user to write a string into the acquired buffer. The mentioned dwc3 trace event then changes as below: DECLARE_EVENT_CLASS(dwc3_log_event, TP_PROTO(u32 event, struct dwc3 *dwc), TP_ARGS(event, dwc), TP_STRUCT__entry( __field(u32, event) __field(u32, ep0state) ), TP_fast_assign( __entry->event = event; __entry->ep0state = dwc->ep0state; ), TP_printk("event (%08x): %s", __entry->event, dwc3_decode_event(__get_buf(DWC3_MSG_MAX), DWC3_MSG_MAX, __entry->event, __entry->ep0state)) ); Link: https://lore.kernel.org/linux-trace-kernel/1675065249-23368-1-git-send-email-quic_linyyuan@quicinc.com Cc: Masami Hiramatsu Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 5 +++++ include/trace/stages/stage3_trace_output.h | 3 +++ include/trace/stages/stage7_class_define.h | 1 + kernel/trace/trace_seq.c | 23 +++++++++++++++++++++++ 4 files changed, 32 insertions(+) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 0c4c7587d6c3..6be92bf559fe 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -95,6 +95,7 @@ extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); +char *trace_seq_acquire(struct trace_seq *s, unsigned int len); #else /* CONFIG_TRACING */ static inline __printf(2, 3) @@ -139,6 +140,10 @@ static inline int trace_seq_path(struct trace_seq *s, const struct path *path) { return 0; } +static inline char *trace_seq_acquire(struct trace_seq *s, unsigned int len) +{ + return NULL; +} #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_SEQ_H */ diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index 66374df61ed3..c1fb1355d309 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -139,3 +139,6 @@ u64 ____val = (u64)(value); \ (u32) do_div(____val, NSEC_PER_SEC); \ }) + +#undef __get_buf +#define __get_buf(len) trace_seq_acquire(p, (len)) diff --git a/include/trace/stages/stage7_class_define.h index 8795429f388b..bcb960d16fc0 100644 ---
a/include/trace/stages/stage7_class_define.h +++ b/include/trace/stages/stage7_class_define.h @@ -23,6 +23,7 @@ #undef __get_rel_sockaddr #undef __print_array #undef __print_hex_dump +#undef __get_buf /* * The below is not executed in the kernel. It is only what is diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 9c90b3a7dce2..e5e299260d0c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -403,3 +403,26 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, return 1; } EXPORT_SYMBOL(trace_seq_hex_dump); + +/* + * trace_seq_acquire - acquire seq buffer with size len + * @s: trace sequence descriptor + * @len: size of buffer to be acquired + * + * Acquire a buffer of size @len from the trace_seq for output usage; + * the user can fill a string into that buffer. + * + * Returns start address of acquired buffer. + * + * It allows multiple usages in one trace output function call. + */ +char *trace_seq_acquire(struct trace_seq *s, unsigned int len) +{ + char *ret = trace_seq_buffer_ptr(s); + + if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len)) + seq_buf_commit(&s->seq, len); + + return ret; +} +EXPORT_SYMBOL(trace_seq_acquire); -- cgit v1.2.1 From 01678fbce3dd2e3042077cf71b68157e733614a0 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Mon, 30 Jan 2023 16:59:54 +0800 Subject: samples: ftrace: Include the nospec-branch.h only for x86 When other architectures without the nospec functionality build the direct-call functions in samples/ftrace/*.c, the include of asm/nospec-branch.h must be handled with care to avoid a "header file not found" error in the build process. Commit ee3e2469b346 ("x86/ftrace: Make it call depth tracking aware") file-globally includes asm/nospec-branch.h, which provides CALL_DEPTH_ACCOUNT only for the x86 direct-call functions. It seems better to move the include under `#ifdef CONFIG_X86_64`.
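As a rough sketch, the shape of the change in each sample file looks like this (illustrative only; the surrounding include list is an assumption, not a verbatim copy of any of the five files):

	/* portable part of the sample: no x86-only headers up here */
	#include <linux/module.h>
	#include <linux/ftrace.h>

	#ifdef CONFIG_X86_64
	/* CALL_DEPTH_ACCOUNT is only provided (and only needed) on x86-64 */
	#include <asm/nospec-branch.h>

	/* ... the x86-64 inline asm trampolines follow here ... */
	#endif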
Link: https://lore.kernel.org/linux-trace-kernel/20230130085954.647845-1-suagrfillet@gmail.com Signed-off-by: Song Shuai Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- samples/ftrace/ftrace-direct-modify.c | 2 +- samples/ftrace/ftrace-direct-multi-modify.c | 2 +- samples/ftrace/ftrace-direct-multi.c | 2 +- samples/ftrace/ftrace-direct-too.c | 2 +- samples/ftrace/ftrace-direct.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index de5a0f67f320..d93abbcb1f4c 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -3,7 +3,6 @@ #include #include #include -#include <asm/nospec-branch.h> extern void my_direct_func1(void); extern void my_direct_func2(void); @@ -26,6 +25,7 @@ static unsigned long my_ip = (unsigned long)schedule; #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index a825dbd2c9cf..b58c594efb51 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -3,7 +3,6 @@ #include #include #include -#include <asm/nospec-branch.h> extern void my_direct_func1(unsigned long ip); extern void my_direct_func2(unsigned long ip); @@ -24,6 +23,7 @@ extern void my_tramp2(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index d955a2650605..c27cf130c319 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -5,7 +5,6 @@ #include #include #include -#include <asm/nospec-branch.h> extern void my_direct_func(unsigned long ip); @@ -19,6 +18,7 @@ extern void my_tramp(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index e13fb59a2b47..8139dce2a31c 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -4,7 +4,6 @@ #include <linux/mm.h> /* for handle_mm_fault() */ #include #include -#include <asm/nospec-branch.h> extern void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -21,6 +20,7 @@ extern void my_tramp(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index 1f769d0db20f..1d3d307ca33d 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -4,7 +4,6 @@ #include <linux/sched.h> /* for wake_up_process() */ #include #include -#include <asm/nospec-branch.h> extern void my_direct_func(struct task_struct *p); @@ -18,6 +17,7 @@ extern void my_tramp(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" -- cgit v1.2.1 From f94fe7048a352ff8914232a18e2e2f18f8a5ac81 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 30 Jan 2023 14:02:37 +0100 Subject: ftrace: sample: avoid open-coded 64-bit division Calculating the average period requires a 64-bit division that leads to a link failure on 32-bit architectures: x86_64-linux-ld: samples/ftrace/ftrace-ops.o: in function `ftrace_ops_sample_init': ftrace-ops.c:(.init.text+0x23b): undefined reference to `__udivdi3' Use the div_u64() helper to do this instead.
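For illustration, a minimal sketch of the pattern (hypothetical values, not the sample's exact code):

	#include <linux/math64.h>

	u64 period = 123456789ULL;	/* total runtime in ns */
	u32 calls = 100000;		/* number of tracee calls */

	/* "period / calls" is an open-coded 64-bit division; on 32-bit
	 * targets the compiler emits a call to the libgcc helper
	 * __udivdi3, which the kernel does not link against.
	 */
	u64 avg = div_u64(period, calls);	/* links on all architectures */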
Since this is an init function that is not called frequently, the runtime overhead is going to be acceptable. Link: https://lore.kernel.org/linux-trace-kernel/20230130130246.247537-1-arnd@kernel.org Cc: Masami Hiramatsu Fixes: b56c68f705ca ("ftrace: Add sample with custom ops") Signed-off-by: Arnd Bergmann Acked-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- samples/ftrace/ftrace-ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/ftrace/ftrace-ops.c b/samples/ftrace/ftrace-ops.c index 24deb51c7261..0c8da87ff5c3 100644 --- a/samples/ftrace/ftrace-ops.c +++ b/samples/ftrace/ftrace-ops.c @@ -223,7 +223,7 @@ static int __init ftrace_ops_sample_init(void) pr_info("Attempted %u calls to %ps in %lluns (%lluns / call)\n", nr_function_calls, tracee_relevant, - period, period / nr_function_calls); + period, div_u64(period, nr_function_calls)); if (persist) return 0; -- cgit v1.2.1 From aef70ebd624cf2959c531b652365da83f2b19352 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 30 Jan 2023 11:37:08 -0800 Subject: samples: ftrace: Make some global variables static smatch reports this representative issue: samples/ftrace/ftrace-ops.c:15:14: warning: symbol 'nr_function_calls' was not declared. Should it be static? The nr_function_calls and several other global variables are only used in ftrace-ops.c, so they should be static. Remove the instances of initializing static int to 0. Link: https://lore.kernel.org/linux-trace-kernel/20230130193708.1378108-1-trix@redhat.com Signed-off-by: Tom Rix Acked-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- samples/ftrace/ftrace-ops.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/samples/ftrace/ftrace-ops.c b/samples/ftrace/ftrace-ops.c index 0c8da87ff5c3..68d6685c80bd 100644 --- a/samples/ftrace/ftrace-ops.c +++ b/samples/ftrace/ftrace-ops.c @@ -12,7 +12,7 @@ * Arbitrary large value chosen to be sufficiently large to minimize noise but * sufficiently small to complete quickly. */ -unsigned int nr_function_calls = 100000; +static unsigned int nr_function_calls = 100000; module_param(nr_function_calls, uint, 0); MODULE_PARM_DESC(nr_function_calls, "How many times to call the relevant tracee"); @@ -21,7 +21,7 @@ MODULE_PARM_DESC(nr_function_calls, "How many times to call the relevant tracee" * be called directly or whether it's necessary to go via the list func, which * can be significantly more expensive. */ -unsigned int nr_ops_relevant = 1; +static unsigned int nr_ops_relevant = 1; module_param(nr_ops_relevant, uint, 0); MODULE_PARM_DESC(nr_ops_relevant, "How many ftrace_ops to associate with the relevant tracee"); @@ -30,7 +30,7 @@ MODULE_PARM_DESC(nr_ops_relevant, "How many ftrace_ops to associate with the rel * tracers enabled for distinct functions can force the use of the list func * and incur overhead for all call sites. */ -unsigned int nr_ops_irrelevant = 0; +static unsigned int nr_ops_irrelevant; module_param(nr_ops_irrelevant, uint, 0); MODULE_PARM_DESC(nr_ops_irrelevant, "How many ftrace_ops to associate with the irrelevant tracee"); @@ -38,15 +38,15 @@ MODULE_PARM_DESC(nr_ops_irrelevant, "How many ftrace_ops to associate with the i * On architectures with DYNAMIC_FTRACE_WITH_REGS, saving the full pt_regs can * be more expensive than only saving the minimal necessary regs.
*/ -bool save_regs = false; +static bool save_regs; module_param(save_regs, bool, 0); MODULE_PARM_DESC(save_regs, "Register ops with FTRACE_OPS_FL_SAVE_REGS (save all registers in the trampoline)"); -bool assist_recursion = false; +static bool assist_recursion; module_param(assist_recursion, bool, 0); MODULE_PARM_DESC(assist_reursion, "Register ops with FTRACE_OPS_FL_RECURSION"); -bool assist_rcu = false; +static bool assist_rcu; module_param(assist_rcu, bool, 0); MODULE_PARM_DESC(assist_reursion, "Register ops with FTRACE_OPS_FL_RCU"); @@ -55,7 +55,7 @@ MODULE_PARM_DESC(assist_reursion, "Register ops with FTRACE_OPS_FL_RCU"); * overhead. Sometimes a consistency check using a more expensive tracer is * desireable. */ -bool check_count = false; +static bool check_count; module_param(check_count, bool, 0); MODULE_PARM_DESC(check_count, "Check that tracers are called the expected number of times\n"); @@ -64,7 +64,7 @@ MODULE_PARM_DESC(check_count, "Check that tracers are called the expected number * runs, but sometimes it can be useful to leave them registered so that they * can be inspected through the tracefs 'enabled_functions' file. */ -bool persist = false; +static bool persist; module_param(persist, bool, 0); MODULE_PARM_DESC(persist, "Successfully load module and leave ftrace ops registered after test completes\n"); @@ -114,8 +114,8 @@ static void ops_func_count(unsigned long ip, unsigned long parent_ip, self->count++; } -struct sample_ops *ops_relevant; -struct sample_ops *ops_irrelevant; +static struct sample_ops *ops_relevant; +static struct sample_ops *ops_irrelevant; static struct sample_ops *ops_alloc_init(void *tracee, ftrace_func_t func, unsigned long flags, int nr) @@ -163,8 +163,8 @@ static void ops_check(struct sample_ops *ops, int nr, } } -ftrace_func_t tracer_relevant = ops_func_nop; -ftrace_func_t tracer_irrelevant = ops_func_nop; +static ftrace_func_t tracer_relevant = ops_func_nop; +static ftrace_func_t tracer_irrelevant = ops_func_nop; static int __init ftrace_ops_sample_init(void) { -- cgit v1.2.1 From 9971c3f944489ff7aacb9d25e0cde841a5f6018a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 31 Jan 2023 09:52:37 -0500 Subject: tracing: Fix trace_event_raw_event_synth() if else statement The test to check if the field is a stack should only be done if the field is not a string. But the code had: } if (event->fields[i]->is_stack) { and not } else if (event->fields[i]->is_stack) { which would cause it to always be tested. Worse yet, this also included an "else" statement that was only to be called if the field was neither a string nor a stack, but this code allowed it to be called if it was a string (and not a stack). Also fixed some whitespace issues.
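Roughly, the control-flow difference looks like this (a simplified sketch, not the function's actual code):

	if (is_string) {
		/* string handling */
	} if (is_stack) {		/* bug: a second, independent if */
		/* stack handling */
	} else {
		/* meant for fields that are neither strings nor stacks,
		 * but this branch also runs for a string (non-stack) field
		 */
	}

	if (is_string) {
		/* string handling */
	} else if (is_stack) {		/* fix: part of one if/else chain */
		/* stack handling */
	} else {
		/* now only reached when the field is neither */
	}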
Link: https://lore.kernel.org/all/202301302110.mEtNwkBD-lkp@intel.com/ Link: https://lore.kernel.org/linux-trace-kernel/20230131095237.63e3ca8d@gandalf.local.home Cc: Tom Zanussi Fixes: 00cf3d672a9d ("tracing: Allow synthetic events to pass around stacktraces") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) --- kernel/trace/trace_events_synth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index adb630633f31..306c89e0ce55 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -564,8 +564,8 @@ static notrace void trace_event_raw_event_synth(void *__data, event->fields[i]->is_dynamic, data_size, &n_u64); data_size += len; /* only dynamic string increments */ - } if (event->fields[i]->is_stack) { - long *stack = (long *)(long)var_ref_vals[val_idx]; + } else if (event->fields[i]->is_stack) { + long *stack = (long *)(long)var_ref_vals[val_idx]; len = trace_stack(entry, event, stack, data_size, &n_u64); -- cgit v1.2.1 From cb1f98c5e5742273bca39410d504430793ab701a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 7 Feb 2023 12:28:50 -0500 Subject: tracing: Add creation of instances at boot command line Add a kernel command line parameter to add tracing instances. This only creates the instances at boot but does not yet enable any events in them. Later changes will extend this command line to add enabling of events, filters, and triggers. As well as possibly redirecting trace_printk()! Link: https://lkml.kernel.org/r/20230207173026.186210158@goodmis.org Cc: Randy Dunlap Cc: Masami Hiramatsu Cc: Andrew Morton Reviewed-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- Documentation/admin-guide/kernel-parameters.txt | 6 +++ kernel/trace/trace.c | 50 +++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6cfa6e3996cf..9545da5ed849 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6272,6 +6272,12 @@ comma-separated list of trace events to enable. See also Documentation/trace/events.rst + trace_instance=[instance-info] + [FTRACE] Create a ring buffer instance early in boot up. + This will be listed in: + + /sys/kernel/tracing/instances + trace_options=[option-list] [FTRACE] Enable or disable tracer options at boot.
The option-list is a comma delimited list of options diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b90eecd27dfc..863716fcca49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,8 @@ #include #include +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ + #include "trace.h" #include "trace_output.h" @@ -186,6 +188,9 @@ static char *default_bootup_tracer; static bool allocate_snapshot; static bool snapshot_at_boot; +static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; +static int boot_instance_index; + static int __init set_cmdline_ftrace(char *str) { strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); @@ -239,6 +244,23 @@ static int __init boot_snapshot(char *str) __setup("ftrace_boot_snapshot", boot_snapshot); +static int __init boot_instance(char *str) +{ + char *slot = boot_instance_info + boot_instance_index; + int left = sizeof(boot_instance_info) - boot_instance_index; + int ret; + + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_instance_index += ret; + + return 1; +} +__setup("trace_instance=", boot_instance); + + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) @@ -10144,6 +10166,31 @@ out: return ret; } +__init static void enable_instances(void) +{ + struct trace_array *tr; + char *curr_str; + char *str; + char *tok; + + /* A tab is always appended */ + boot_instance_info[boot_instance_index - 1] = '\0'; + str = boot_instance_info; + + while ((curr_str = strsep(&str, "\t"))) { + + tok = strsep(&curr_str, ","); + + tr = trace_array_get_by_name(tok); + if (!tr) { + pr_warn("Failed to create instance buffer %s\n", curr_str); + continue; + } + /* Allow user space to delete it */ + trace_array_put(tr); + } +} + __init static int tracer_alloc_buffers(void) { int ring_buf_size; @@ -10302,6 +10349,9 @@ void __init early_trace_init(void) void __init trace_init(void) { trace_event_init(); + + if (boot_instance_index) + enable_instances(); } __init static void clear_boot_tracer(void) -- cgit v1.2.1 From c4846480831ea526bcab0ca0bf6364bfa6ed4227 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 7 Feb 2023 12:28:51 -0500 Subject: tracing: Add enabling of events to boot instances Add the format of: trace_instance=foo,sched:sched_switch,irq_handler_entry,initcall That will create the "foo" instance and enable the sched_switch event (where the "sched" system is explicitly specified), the irq_handler_entry event, and all events under the system initcall. Link: https://lkml.kernel.org/r/20230207173026.386114535@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Reviewed-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- Documentation/admin-guide/kernel-parameters.txt | 14 ++++++++++++++ kernel/trace/trace.c | 4 ++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 9 ++++----- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9545da5ed849..40304dc3d319 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6278,6 +6278,20 @@ /sys/kernel/tracing/instances + Events can be enabled at the time the instance is created + via: + + trace_instance=<name>,<system1>:<event1>,<system2>:<event2> + + Note, the "<system>:" portion is optional if the event is + unique.
+ + trace_instance=foo,sched:sched_switch,irq_handler_entry,initcall + + will enable the "sched_switch" event (note, the "sched:" is optional, and + the same thing would happen if it was left off). The irq_handler_entry + event, and all events under the "initcall" system. + trace_options=[option-list] [FTRACE] Enable or disable tracer options at boot. The option-list is a comma delimited list of options diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 863716fcca49..5b16af9e7d30 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10188,6 +10188,10 @@ __init static void enable_instances(void) } /* Allow user space to delete it */ trace_array_put(tr); + + while ((tok = strsep(&curr_str, ","))) { + early_enable_events(tr, tok, true); + } } } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d16929dd0f08..f10bf804dd2b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1334,6 +1334,8 @@ DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); void trace_buffered_event_enable(void); +void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b07931ad97de..cf3fd74fa675 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3750,10 +3750,9 @@ static __init int event_trace_memsetup(void) return 0; } -static __init void -early_enable_events(struct trace_array *tr, bool disable_first) +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first) { - char *buf = bootup_event_buf; char *token; int ret; @@ -3806,7 +3805,7 @@ static __init int event_trace_enable(void) */ __trace_early_add_events(tr); - early_enable_events(tr, false); + early_enable_events(tr, bootup_event_buf, false); trace_printk_start_comm(); @@ -3834,7 +3833,7 @@ static __init int event_trace_enable_again(void) if (!tr) return -ENODEV; - early_enable_events(tr, true); + early_enable_events(tr, bootup_event_buf, true); return 0; } -- cgit v1.2.1 From d503b8f7474fe7ac616518f7fc49773cbab49f36 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 7 Feb 2023 12:28:52 -0500 Subject: tracing: Add trace_array_puts() to write into instance Add a generic trace_array_puts() that can be used to "trace_puts()" into an allocated trace_array instance. This is just another variant of trace_array_printk(). Link: https://lkml.kernel.org/r/20230207173026.584717290@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Reviewed-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- include/linux/trace.h | 12 ++++++++++++ kernel/trace/trace.c | 27 +++++++++++++++++---------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/include/linux/trace.h b/include/linux/trace.h index 80ffda871749..2a70a447184c 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -33,6 +33,18 @@ struct trace_array; int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); +/** + * trace_array_puts - write a constant string into the trace buffer. + * @tr: The trace array to write to + * @str: The constant string to write + */ +#define trace_array_puts(tr, str) \ + ({ \ + str ? 
__trace_array_puts(tr, _THIS_IP_, str, strlen(str)) : -1; \ + }) +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size); + void trace_printk_init_buffers(void); __printf(3, 4) int trace_array_printk(struct trace_array *tr, unsigned long ip, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5b16af9e7d30..9918bf7441ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1023,13 +1023,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev ring_buffer_unlock_commit(buffer); } -/** - * __trace_puts - write a constant string into the trace buffer. - * @ip: The address of the caller - * @str: The constant string to write - * @size: The size of the string. - */ -int __trace_puts(unsigned long ip, const char *str, int size) +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size) { struct ring_buffer_event *event; struct trace_buffer *buffer; @@ -1037,7 +1032,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) unsigned int trace_ctx; int alloc; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1046,7 +1041,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) alloc = sizeof(*entry) + size + 2; /* possible \n added */ trace_ctx = tracing_gen_ctx(); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); @@ -1068,11 +1063,23 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); out: ring_buffer_nest_end(buffer); return size; } +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + return __trace_array_puts(&global_trace, ip, str, size); +} EXPORT_SYMBOL_GPL(__trace_puts); /** -- cgit v1.2.1 From 9c1c251d670bc107b9ce600a2c20adb08b4849e9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 7 Feb 2023 12:28:53 -0500 Subject: tracing: Allow boot instances to have snapshot buffers Add an "=<instance>" suffix to ftrace_boot_snapshot, where the named instance will get a snapshot buffer, and a snapshot will be taken at the end of boot (which will save the boot traces). Link: https://lkml.kernel.org/r/20230207173026.792774721@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Reviewed-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- Documentation/admin-guide/kernel-parameters.txt | 9 +++ kernel/trace/trace.c | 79 ++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 40304dc3d319..7fa1ee4ba910 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1532,6 +1532,15 @@ boot up that is likely to be overridden by user space start up functionality.
+ Optionally, the snapshot can also be defined for a tracing + instance that was created by the trace_instance= command + line parameter. + + trace_instance=foo,sched_switch ftrace_boot_snapshot=foo + + The above will cause the "foo" tracing instance to trigger + a snapshot at the end of boot up. + ftrace_dump_on_oops[=orig_cpu] [FTRACE] will dump the trace buffers on oops. If no parameter is passed, ftrace will dump diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9918bf7441ed..97c88711f270 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -191,6 +191,9 @@ static bool snapshot_at_boot; static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; static int boot_instance_index; +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; + static int __init set_cmdline_ftrace(char *str) { strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); @@ -227,9 +230,22 @@ __setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { - allocate_snapshot = true; - /* We also need the main ring buffer expanded */ - ring_buffer_expanded = true; + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + } return 1; } __setup("alloc_snapshot", boot_alloc_snapshot); @@ -9254,10 +9270,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) } tr->allocated_snapshot = allocate_snapshot; - /* - * Only the top level trace array gets its snapshot allocated - * from the kernel command line. - */ allocate_snapshot = false; #endif @@ -10173,6 +10185,47 @@ out: return ret; } +#ifdef CONFIG_TRACER_MAX_TRACE +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init static void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will also clear + * this flag.
+ */ + allocate_snapshot = true; +} +#else +static inline void do_allocate_snapshot(const char *name) { } +#endif + __init static void enable_instances(void) { struct trace_array *tr; @@ -10188,6 +10241,9 @@ __init static void enable_instances(void) tok = strsep(&curr_str, ","); + if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + do_allocate_snapshot(tok); + tr = trace_array_get_by_name(tok); if (!tr) { pr_warn("Failed to create instance buffer %s\n", curr_str); @@ -10335,10 +10391,19 @@ out: void __init ftrace_boot_snapshot(void) { + struct trace_array *tr; + if (snapshot_at_boot) { tracing_snapshot(); internal_trace_puts("** Boot snapshot taken **\n"); } + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == &global_trace) + continue; + trace_array_puts(tr, "** Boot snapshot taken **\n"); + tracing_snapshot_instance(tr); + } } void __init early_trace_init(void) -- cgit v1.2.1 From fc1a9dc101292403babe0c5c2f99f4748580ed98 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 10 Feb 2023 15:33:03 -0600 Subject: tracing/histogram: Don't use strlen to find length of stacktrace variables Because stacktraces are saved in dynamic strings, trace_event_raw_event_synth() uses strlen to determine the length of the stack. Stacktraces may contain 0-bytes, though, in the saved addresses, so the length found and passed to reserve() will be too small. Fix this by using the first unsigned long in the stack variables to store the actual number of elements in the stack and have trace_event_raw_event_synth() use that to determine the length of the stack. Link: https://lkml.kernel.org/r/1ed6906cd9d6477ef2bd8e63c61de20a9ffe64d7.1676063532.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 12 ++++++++---- kernel/trace/trace_events_synth.c | 7 ++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7f3e6ca6f0fa..f21e42ddba69 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3137,13 +3137,15 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, size = min(val->size, STR_VAR_LEN_MAX); strscpy(str, val_str, size); } else { + char *stack_start = str + sizeof(unsigned long); int e; - e = stack_trace_save((void *)str, + e = stack_trace_save((void *)stack_start, HIST_STACKTRACE_DEPTH, HIST_STACKTRACE_SKIP); if (e < HIST_STACKTRACE_DEPTH - 1) - ((unsigned long *)str)[e] = 0; + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; } var_val = (u64)(uintptr_t)str; } @@ -5135,13 +5137,15 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, size = min(hist_field->size, STR_VAR_LEN_MAX); strscpy(str, val_str, size); } else { + char *stack_start = str + sizeof(unsigned long); int e; - e = stack_trace_save((void *)str, + e = stack_trace_save((void *)stack_start, HIST_STACKTRACE_DEPTH, HIST_STACKTRACE_SKIP); if (e < HIST_STACKTRACE_DEPTH - 1) - ((unsigned long *)str)[e] = 0; + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; } hist_val = (u64)(uintptr_t)str; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 306c89e0ce55..70bddb25d9c0 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -538,7 +538,12 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; - len = 
kern_fetch_store_strlen((unsigned long)str_val); + if (event->dynamic_fields[i]->is_stack) { + len = *((unsigned long *)str_val); + len *= sizeof(unsigned long); + } else { + len = kern_fetch_store_strlen((unsigned long)str_val); + } fields_size += len; } -- cgit v1.2.1 From 8261ef2eb35fce689a82d346b25e945e16bcb9d3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 15 Feb 2023 20:25:38 -0500 Subject: tracing: Add BUILD_BUG() to make sure stacktrace fits in strings The max string length for a histogram variable is 256 bytes. The max depth of a stacktrace is 16. With 8byte words, that's 16 * 8 = 128. Which can easily fit in the string variable. The histogram stacktrace is being stored in the string value (with the given max length), with the assumption it will fit. To make sure that this is always the case (in the case that the stack trace depth increases), add a BUILD_BUG_ON() to test this. Link: https://lore.kernel.org/linux-trace-kernel/20230214002418.0103b9e765d3e5c374d2aa7d@kernel.org/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f21e42ddba69..6cef1def1da5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3119,6 +3119,9 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, unsigned int i, j, var_idx; u64 var_val; + /* Make sure stacktrace can fit in the string variable length */ + BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX); + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { struct field_var *field_var = field_vars[i]; struct hist_field *var = field_var->var; -- cgit v1.2.1 From 2bacfd9f7e5ac18ae40ecd7b29c63580d0e5b329 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 10 Feb 2023 15:33:04 -0600 Subject: tracing/histogram: Fix a few problems with stacktrace variable printing Currently, there are a few problems when printing hist triggers and trace output when using stacktrace variables. 
This fixes the problems seen below: # echo 'hist:keys=delta.buckets=100,stack.stacktrace:sort=delta' > /sys/kernel/debug/tracing/events/synthetic/block_lat/trigger # cat /sys/kernel/debug/tracing/events/synthetic/block_lat/trigger hist:keys=delta.buckets=100,stacktrace:vals=hitcount:sort=delta.buckets=100:size=2048 [active] # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 2' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # cat /sys/kernel/debug/tracing/events/sched/sched_switch/trigger hist:keys=next_pid:vals=hitcount:ts=common_timestamp.usecs,st=stacktrace.stacktrace:sort=hitcount:size=2048:clock=global if prev_state == 2 [active] and also in the trace output (should be stack.stacktrace): { delta: ~ 100-199, stacktrace __schedule+0xa19/0x1520 Link: https://lkml.kernel.org/r/60bebd4e546728e012a7a2bcbf58716d48ba6edb.1676063532.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6cef1def1da5..c4f1fe985f6f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1356,9 +1356,12 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; - else if (field->flags & HIST_FIELD_FL_STACKTRACE) - field_name = "stacktrace"; - else if (field->flags & HIST_FIELD_FL_HITCOUNT) + else if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + field_name = field->field->name; + else + field_name = "stacktrace"; + } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; if (field_name == NULL) @@ -5339,7 +5342,10 @@ static void hist_trigger_print_key(struct seq_file *m, seq_printf(m, "%s: %-30s[%3llu]", field_name, syscall_name, uval); } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { - seq_puts(m, "stacktrace:\n"); + if (key_field->field) + seq_printf(m, "%s.stacktrace", key_field->field->name); + else + seq_puts(m, "stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ -5884,7 +5890,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags) { if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && - !(hist_field->flags & HIST_FIELD_FL_EXPR)) { + !(hist_field->flags & HIST_FIELD_FL_EXPR) && + !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) { const char *flags = get_hist_field_flags(hist_field); if (flags) @@ -5917,9 +5924,12 @@ static int event_hist_trigger_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ","); - if (field->flags & HIST_FIELD_FL_STACKTRACE) - seq_puts(m, "stacktrace"); - else + if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + seq_printf(m, "%s.stacktrace", field->field->name); + else + seq_puts(m, "stacktrace"); + } else hist_field_print(m, field); } -- cgit v1.2.1 From f5914b301a17575a4cbcb85a0169a3148b958064 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 10 Feb 2023 15:33:05 -0600 Subject: tracing/histogram: Fix stacktrace key The current code will always use the current stacktrace as a key even if a stacktrace contained in a specific event field was specified. 
For example, we expect to use the 'unsigned long[] stack' field in the below event in the histogram: # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > /sys/kernel/debug/tracing/dynamic_events # echo 'hist:keys=delta.buckets=100,stack.stacktrace:sort=delta' > /sys/kernel/debug/tracing/events/synthetic/block_lat/trigger But in fact, when we type out the trigger, we see that it's using the plain old global 'stacktrace' as the key, which is just the stacktrace when the event was hit and not the stacktrace contained in the event, which is what we want: # cat /sys/kernel/debug/tracing/events/synthetic/block_lat/trigger hist:keys=delta.buckets=100,stacktrace:vals=hitcount:sort=delta.buckets=100:size=2048 [active] And in fact, there's no code to actually retrieve it from the event, so we need to add HIST_FIELD_FN_STACK and hist_field_stack() to get it and hook it into the trigger code. For now, since the stack is just using dynamic strings, this could just use the dynamic string function, but it seems cleaner to have a dedicated function and be able to tweak it independently as necessary. Link: https://lkml.kernel.org/r/11aa614c82976adbfa4ea763dbe885b5fb01d59c.1676063532.git.zanussi@kernel.org Signed-off-by: Tom Zanussi [ Fixed 32bit build warning reported by kernel test robot ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 34 +++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c4f1fe985f6f..89877a18f933 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -135,6 +135,7 @@ enum hist_field_fn { HIST_FIELD_FN_DIV_NOT_POWER2, HIST_FIELD_FN_DIV_MULT_SHIFT, HIST_FIELD_FN_EXECNAME, + HIST_FIELD_FN_STACK, }; /* @@ -1982,7 +1983,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_STACKTRACE) { - hist_field->fn_num = HIST_FIELD_FN_NOP; + if (field) + hist_field->fn_num = HIST_FIELD_FN_STACK; + else + hist_field->fn_num = HIST_FIELD_FN_NOP; hist_field->size = HIST_STACKTRACE_SIZE; hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL); if (!hist_field->type) @@ -4274,6 +4278,19 @@ static u64 hist_field_execname(struct hist_field *hist_field, return (u64)(unsigned long)(elt_data->comm); } +static u64 hist_field_stack(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + static u64 hist_fn_call(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -4337,6 +4354,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_EXECNAME: return hist_field_execname(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STACK: + return hist_field_stack(hist_field, elt, buffer, rbe, event); default: return 0; } @@ -5238,8 +5257,17 @@ static void event_hist_trigger(struct event_trigger_data *data, if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { memset(entries, 0, HIST_STACKTRACE_SIZE); - stack_trace_save(entries, HIST_STACKTRACE_DEPTH, - HIST_STACKTRACE_SKIP); + if (key_field->field) { + unsigned long *stack, n_entries; + + field_contents =
hist_fn_call(key_field, elt, buffer, rbe, rec); + stack = (unsigned long *)(long)field_contents; + n_entries = *stack; + memcpy(entries, ++stack, n_entries * sizeof(unsigned long)); + } else { + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + } key = entries; } else { field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); -- cgit v1.2.1 From d8f0ae3ebed416728077fe94698983c30d409241 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 10 Feb 2023 15:33:06 -0600 Subject: tracing/histogram: Fix stacktrace histogram Documentation Fix a small problem with the histogram specification in the Documentation, and change the example to show output using a stacktrace field rather than the global stacktrace. Link: https://lkml.kernel.org/r/f75f807dd4998249e513515f703a2ff7407605f4.1676063532.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/histogram.rst | 156 ++++++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 66 deletions(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 8e95295e39b6..c5cfc6e7baea 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1993,82 +1993,106 @@ uninterruptible state:: A synthetic event that has a stacktrace field may use it as a key in histogram:: - # echo 'hist:delta.buckets=100,stack.stacktrace:sort=delta' > events/synthetic/block_lat/trigger + # echo 'hist:keys=delta.buckets=100,stack.stacktrace:sort=delta' > events/synthetic/block_lat/trigger # cat events/synthetic/block_lat/hist # event histogram # - # trigger info: hist:keys=delta.buckets=100,stacktrace:vals=hitcount:sort=delta.buckets=100:size=2048 [active] + # trigger info: hist:keys=delta.buckets=100,stack.stacktrace:vals=hitcount:sort=delta.buckets=100:size=2048 [active] # - - { delta: ~ 0-99, stacktrace: - event_hist_trigger+0x464/0x480 - event_triggers_call+0x52/0xe0 - trace_event_buffer_commit+0x193/0x250 - trace_event_raw_event_sched_switch+0xfc/0x150 - __traceiter_sched_switch+0x41/0x60 - __schedule+0x448/0x7b0 - schedule_idle+0x26/0x40 - cpu_startup_entry+0x19/0x20 - start_secondary+0xed/0xf0 - secondary_startup_64_no_verify+0xe0/0xeb - } hitcount: 6 - { delta: ~ 0-99, stacktrace: - event_hist_trigger+0x464/0x480 - event_triggers_call+0x52/0xe0 - trace_event_buffer_commit+0x193/0x250 - trace_event_raw_event_sched_switch+0xfc/0x150 - __traceiter_sched_switch+0x41/0x60 - __schedule+0x448/0x7b0 - schedule_idle+0x26/0x40 - cpu_startup_entry+0x19/0x20 - __pfx_kernel_init+0x0/0x10 - arch_call_rest_init+0xa/0x24 - start_kernel+0x964/0x98d - secondary_startup_64_no_verify+0xe0/0xeb + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + io_schedule+0x46/0x80 + bit_wait_io+0x11/0x80 + __wait_on_bit+0x4e/0x120 + out_of_line_wait_on_bit+0x8d/0xb0 + __wait_on_buffer+0x33/0x40 + jbd2_journal_commit_transaction+0x155a/0x19b0 + kjournald2+0xab/0x270 + kthread+0xfa/0x130 + ret_from_fork+0x29/0x50 + } hitcount: 1 + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + io_schedule+0x46/0x80 + rq_qos_wait+0xd0/0x170 + wbt_wait+0x9e/0xf0 + __rq_qos_throttle+0x25/0x40 + blk_mq_submit_bio+0x2c3/0x5b0 + __submit_bio+0xff/0x190 + submit_bio_noacct_nocheck+0x25b/0x2b0 + submit_bio_noacct+0x20b/0x600 + submit_bio+0x28/0x90 + ext4_bio_write_page+0x1e0/0x8c0 + mpage_submit_page+0x60/0x80 + mpage_process_page_bufs+0x16c/0x180 + mpage_prepare_extent_to_map+0x23f/0x530 + } hitcount:
1 + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_hrtimeout_range_clock+0x97/0x110 + schedule_hrtimeout_range+0x13/0x20 + usleep_range_state+0x65/0x90 + __intel_wait_for_register+0x1c1/0x230 [i915] + intel_psr_wait_for_idle_locked+0x171/0x2a0 [i915] + intel_pipe_update_start+0x169/0x360 [i915] + intel_update_crtc+0x112/0x490 [i915] + skl_commit_modeset_enables+0x199/0x600 [i915] + intel_atomic_commit_tail+0x7c4/0x1080 [i915] + intel_atomic_commit_work+0x12/0x20 [i915] + process_one_work+0x21c/0x3f0 + worker_thread+0x50/0x3e0 + kthread+0xfa/0x130 } hitcount: 3 - { delta: ~ 0-99, stacktrace: - event_hist_trigger+0x464/0x480 - event_triggers_call+0x52/0xe0 - trace_event_buffer_commit+0x193/0x250 - trace_event_raw_event_sched_switch+0xfc/0x150 - __traceiter_sched_switch+0x41/0x60 - __schedule+0x448/0x7b0 - schedule+0x5a/0xb0 - worker_thread+0xaf/0x380 - kthread+0xe9/0x110 - ret_from_fork+0x2c/0x50 + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_timeout+0x11e/0x160 + __wait_for_common+0x8f/0x190 + wait_for_completion+0x24/0x30 + __flush_work.isra.0+0x1cc/0x360 + flush_work+0xe/0x20 + drm_mode_rmfb+0x18b/0x1d0 [drm] + drm_mode_rmfb_ioctl+0x10/0x20 [drm] + drm_ioctl_kernel+0xb8/0x150 [drm] + drm_ioctl+0x243/0x560 [drm] + __x64_sys_ioctl+0x92/0xd0 + do_syscall_64+0x59/0x90 + entry_SYSCALL_64_after_hwframe+0x72/0xdc } hitcount: 1 - { delta: ~ 100-199, stacktrace: - event_hist_trigger+0x464/0x480 - event_triggers_call+0x52/0xe0 - trace_event_buffer_commit+0x193/0x250 - trace_event_raw_event_sched_switch+0xfc/0x150 - __traceiter_sched_switch+0x41/0x60 - __schedule+0x448/0x7b0 - schedule_idle+0x26/0x40 - cpu_startup_entry+0x19/0x20 - start_secondary+0xed/0xf0 - secondary_startup_64_no_verify+0xe0/0xeb - } hitcount: 15 - [..] 
- { delta: ~ 8500-8599, stacktrace: - event_hist_trigger+0x464/0x480 - event_triggers_call+0x52/0xe0 - trace_event_buffer_commit+0x193/0x250 - trace_event_raw_event_sched_switch+0xfc/0x150 - __traceiter_sched_switch+0x41/0x60 - __schedule+0x448/0x7b0 - schedule_idle+0x26/0x40 - cpu_startup_entry+0x19/0x20 - start_secondary+0xed/0xf0 - secondary_startup_64_no_verify+0xe0/0xeb + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_timeout+0x87/0x160 + __wait_for_common+0x8f/0x190 + wait_for_completion_timeout+0x1d/0x30 + drm_atomic_helper_wait_for_flip_done+0x57/0x90 [drm_kms_helper] + intel_atomic_commit_tail+0x8ce/0x1080 [i915] + intel_atomic_commit_work+0x12/0x20 [i915] + process_one_work+0x21c/0x3f0 + worker_thread+0x50/0x3e0 + kthread+0xfa/0x130 + ret_from_fork+0x29/0x50 + } hitcount: 1 + { delta: ~ 100-199, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_hrtimeout_range_clock+0x97/0x110 + schedule_hrtimeout_range+0x13/0x20 + usleep_range_state+0x65/0x90 + pci_set_low_power_state+0x17f/0x1f0 + pci_set_power_state+0x49/0x250 + pci_finish_runtime_suspend+0x4a/0x90 + pci_pm_runtime_suspend+0xcb/0x1b0 + __rpm_callback+0x48/0x120 + rpm_callback+0x67/0x70 + rpm_suspend+0x167/0x780 + rpm_idle+0x25a/0x380 + pm_runtime_work+0x93/0xc0 + process_one_work+0x21c/0x3f0 } hitcount: 1 Totals: - Hits: 89 - Entries: 11 - Dropped: 0 + Hits: 10 + Entries: 7 + Dropped: 0 2.2.3 Hist trigger 'handlers' and 'actions' ------------------------------------------- -- cgit v1.2.1 From 2455f0e124d317dd08d337a7550a78a224d4ba41 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 15 Feb 2023 15:33:45 -0700 Subject: tracing: Always use canonical ftrace path The canonical location for the tracefs filesystem is at /sys/kernel/tracing. But, from Documentation/trace/ftrace.rst: Before 4.1, all ftrace tracing control files were within the debugfs file system, which is typically located at /sys/kernel/debug/tracing. For backward compatibility, when mounting the debugfs file system, the tracefs file system will be automatically mounted at: /sys/kernel/debug/tracing Many comments and Kconfig help messages in the tracing code still refer to this older debugfs path, so let's update them to avoid confusion. Link: https://lore.kernel.org/linux-trace-kernel/20230215223350.2658616-2-zwisler@google.com Acked-by: Masami Hiramatsu (Google) Reviewed-by: Mukesh Ojha Signed-off-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- include/linux/kernel.h | 2 +- include/linux/tracepoint.h | 4 ++-- kernel/trace/Kconfig | 20 ++++++++++---------- kernel/trace/kprobe_event_gen_test.c | 2 +- kernel/trace/ring_buffer.c | 2 +- kernel/trace/synth_event_gen_test.c | 2 +- kernel/trace/trace.c | 2 +- samples/user_events/example.c | 4 ++-- scripts/tracing/draw_functrace.py | 6 +++--- tools/lib/api/fs/tracing_path.c | 4 ++-- tools/tracing/latency/latency-collector.c | 2 +- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fe6efb24d151..40bce7495af8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -297,7 +297,7 @@ bool mac_pton(const char *s, u8 *mac); * * Use tracing_on/tracing_off when you want to quickly turn on or off * tracing. It simply enables or disables the recording of the trace events. 
- * This also corresponds to the user space /sys/kernel/debug/tracing/tracing_on + * This also corresponds to the user space /sys/kernel/tracing/tracing_on * file, which gives a means for the kernel and userspace to interact. * Place a tracing_off() in the kernel where you want tracing to end. * From user space, examine the trace, and then echo 1 > tracing_on diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 4b33b95eb8be..fa1004fcf810 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -471,7 +471,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * This is how the trace record is structured and will * * be saved into the ring buffer. These are the fields * * that will be exposed to user-space in - * * /sys/kernel/debug/tracing/events/<*>/format. + * * /sys/kernel/tracing/events/<*>/format. * * * * The declared 'local variable' is called '__entry' * * @@ -531,7 +531,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * tracepoint callback (this is used by programmatic plugins and * can also by used by generic instrumentation like SystemTap), and * it is also used to expose a structured trace record in - * /sys/kernel/debug/tracing/events/. + * /sys/kernel/tracing/events/. * * A set of (un)registration functions can be passed to the variant * TRACE_EVENT_FN to perform any (un)registration work. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d7043043f59c..5f5e64f9e715 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -239,7 +239,7 @@ config DYNAMIC_FTRACE enabled, and the functions not enabled will not affect performance of the system. - See the files in /sys/kernel/debug/tracing: + See the files in /sys/kernel/tracing: available_filter_functions set_ftrace_filter set_ftrace_notrace @@ -299,7 +299,7 @@ config STACK_TRACER select KALLSYMS help This special tracer records the maximum stack footprint of the - kernel and displays it in /sys/kernel/debug/tracing/stack_trace. + kernel and displays it in /sys/kernel/tracing/stack_trace. This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and @@ -339,7 +339,7 @@ config IRQSOFF_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + echo 0 > /sys/kernel/tracing/tracing_max_latency (Note that kernel size and overhead increase with this option enabled. This option and the preempt-off timing option can be @@ -363,7 +363,7 @@ config PREEMPT_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + echo 0 > /sys/kernel/tracing/tracing_max_latency (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be @@ -515,7 +515,7 @@ config TRACER_SNAPSHOT Allow tracing users to take snapshot of the current buffer using the ftrace interface, e.g.: - echo 1 > /sys/kernel/debug/tracing/snapshot + echo 1 > /sys/kernel/tracing/snapshot cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP @@ -527,7 +527,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP full swap (all buffers). If this is set, then the following is allowed: - echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot After which, only the tracing buffer for CPU 2 was swapped with the main tracing buffer, and the other CPU buffers remain the same. 
@@ -574,7 +574,7 @@ config PROFILE_ANNOTATED_BRANCHES This tracer profiles all likely and unlikely macros in the kernel. It will display the results in: - /sys/kernel/debug/tracing/trace_stat/branch_annotated + /sys/kernel/tracing/trace_stat/branch_annotated Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. @@ -587,7 +587,7 @@ config PROFILE_ALL_BRANCHES taken in the kernel is recorded whether it hit or miss. The results will be displayed in: - /sys/kernel/debug/tracing/trace_stat/branch_all + /sys/kernel/tracing/trace_stat/branch_all This option also enables the likely/unlikely profiler. @@ -638,8 +638,8 @@ config BLK_DEV_IO_TRACE Tracing also is possible using the ftrace interface, e.g.: echo 1 > /sys/block/sda/sda1/trace/enable - echo blk > /sys/kernel/debug/tracing/current_tracer - cat /sys/kernel/debug/tracing/trace_pipe + echo blk > /sys/kernel/tracing/current_tracer + cat /sys/kernel/tracing/trace_pipe If unsure, say N. diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index c736487fc0e4..4850fdfe27f1 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -21,7 +21,7 @@ * Then: * * # insmod kernel/trace/kprobe_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace * * You should see many instances of the "gen_kprobe_test" and * "gen_kretprobe_test" events in the trace buffer. diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 45d4a23d6044..071184324d18 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2886,7 +2886,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n" + " echo global > /sys/kernel/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); } diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 8d77526892f4..8dfe85499d4a 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -22,7 +22,7 @@ * Then: * * # insmod kernel/trace/synth_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace * * You should see several events in the trace buffer - * "create_synth_test", "empty_synth_test", and several instances of diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 97c88711f270..fbb602a8b64b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1187,7 +1187,7 @@ void tracing_snapshot_instance(struct trace_array *tr) * * Note, make sure to allocate the snapshot with either * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * with: echo 1 > /sys/kernel/tracing/snapshot * * If the snapshot buffer is not allocated, it will stop tracing. * Basically making a permanent snapshot. 
diff --git a/samples/user_events/example.c b/samples/user_events/example.c
index d06dc24156ec..18e34c9d708e 100644
--- a/samples/user_events/example.c
+++ b/samples/user_events/example.c
@@ -23,8 +23,8 @@
 #endif

 /* Assumes debugfs is mounted */
-const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
-const char *status_file = "/sys/kernel/debug/tracing/user_events_status";
+const char *data_file = "/sys/kernel/tracing/user_events_data";
+const char *status_file = "/sys/kernel/tracing/user_events_status";

 static int event_status(long **status)
 {
diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py
index 438516bdfb3c..42fa87300941 100755
--- a/scripts/tracing/draw_functrace.py
+++ b/scripts/tracing/draw_functrace.py
@@ -12,9 +12,9 @@ calls. Only the functions's names and the call time are provided.

 Usage:
 	Be sure that you have CONFIG_FUNCTION_TRACER
-	# mount -t debugfs nodev /sys/kernel/debug
-	# echo function > /sys/kernel/debug/tracing/current_tracer
-	$ cat /sys/kernel/debug/tracing/trace_pipe > ~/raw_trace_func
+	# mount -t tracefs nodev /sys/kernel/tracing
+	# echo function > /sys/kernel/tracing/current_tracer
+	$ cat /sys/kernel/tracing/trace_pipe > ~/raw_trace_func
 	Wait some times but not too much, the script is a bit slow.
 	Break the pipe (Ctrl + Z)
 	$ scripts/tracing/draw_functrace.py < ~/raw_trace_func > draw_functrace
diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c
index b8e457c841ab..7ba3e81274e8 100644
--- a/tools/lib/api/fs/tracing_path.c
+++ b/tools/lib/api/fs/tracing_path.c
@@ -14,8 +14,8 @@
 #include "tracing_path.h"

 static char tracing_mnt[PATH_MAX] = "/sys/kernel/debug";
-static char tracing_path[PATH_MAX] = "/sys/kernel/debug/tracing";
-static char tracing_events_path[PATH_MAX] = "/sys/kernel/debug/tracing/events";
+static char tracing_path[PATH_MAX] = "/sys/kernel/tracing";
+static char tracing_events_path[PATH_MAX] = "/sys/kernel/tracing/events";

 static void __tracing_path_set(const char *tracing, const char *mountpoint)
 {
diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c
index 59a7f2346eab..0fd9c747d396 100644
--- a/tools/tracing/latency/latency-collector.c
+++ b/tools/tracing/latency/latency-collector.c
@@ -1584,7 +1584,7 @@ static void *do_printloop(void *arg)
 	/*
 	 * Toss a coin to decide if we want to sleep before printing
 	 * out the backtrace. The reason for this is that opening
-	 * /sys/kernel/debug/tracing/trace will cause a blackout of
+	 * /sys/kernel/tracing/trace will cause a blackout of
 	 * hundreds of ms, where no latencies will be noted by the
 	 * latency tracer. Thus by randomly sleeping we try to avoid
 	 * missing traces systematically due to this. With this option
--
cgit v1.2.1


From e7bb66f79a7b19a47b3eff745ea9f7ba1ae76032 Mon Sep 17 00:00:00 2001
From: Jianlin Lv
Date: Wed, 2 Nov 2022 16:02:36 +0000
Subject: tracepoint: Allow livepatch modules to add trace events

When the system must be kept running, the preferred method for tracing
the kernel is dynamic tracing (kprobes), but the drawback of this method
is that events can be lost, especially when tracing packets in the
network stack.

Livepatching provides a potential solution: reimplement the function you
want to trace and insert a static tracepoint into the replacement. In
this way, custom, stable static tracepoints can be added without
rebooting the system.
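To make this concrete, here is a minimal, hypothetical sketch of such a
livepatch module, modeled on samples/livepatch/livepatch-sample.c. The
patched function name (do_work) and the tracepoint call are illustrative
placeholders only, not part of this patch; a real module would also need
the usual TRACE_EVENT() header boilerplate to define the tracepoint:

    #include <linux/module.h>
    #include <linux/livepatch.h>

    /* Reimplementation of the (hypothetical) vmlinux function "do_work",
     * with a static tracepoint added. trace_my_subsys_work() stands in
     * for a tracepoint defined via TRACE_EVENT(), omitted for brevity. */
    static void livepatch_do_work(void)
    {
    	/* trace_my_subsys_work(arg);  -- fire the new static tracepoint */
    	/* ... original do_work() logic, reimplemented here ... */
    }

    static struct klp_func funcs[] = {
    	{
    		.old_name = "do_work",
    		.new_func = livepatch_do_work,
    	}, { }
    };

    static struct klp_object objs[] = {
    	{
    		/* name left NULL: the patched function lives in vmlinux */
    		.funcs = funcs,
    	}, { }
    };

    static struct klp_patch patch = {
    	.mod = THIS_MODULE,
    	.objs = objs,
    };

    static int livepatch_init(void)
    {
    	return klp_enable_patch(&patch);
    }

    static void livepatch_exit(void)
    {
    }

    module_init(livepatch_init);
    module_exit(livepatch_exit);
    MODULE_LICENSE("GPL");
    MODULE_INFO(livepatch, "Y");

Loading such a module sets TAINT_LIVEPATCH, which is why the change below
adds that taint to the set trace_module_has_bad_taint() tolerates;
without it, the module's trace events would be rejected at module load.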
Link: https://lkml.kernel.org/r/20221102160236.11696-1-iecedge@gmail.com

Signed-off-by: Jianlin Lv
Signed-off-by: Steven Rostedt (Google)
---
 kernel/tracepoint.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f23144af5743..8d1507dd0724 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -571,8 +571,8 @@ static void for_each_tracepoint_range(
 bool trace_module_has_bad_taint(struct module *mod)
 {
 	return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
-			       (1 << TAINT_UNSIGNED_MODULE) |
-			       (1 << TAINT_TEST));
+			       (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) |
+			       (1 << TAINT_LIVEPATCH));
 }

 static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
--
cgit v1.2.1


From 7568a21e52f60930ba8ae7897c2521bdab3ef5a4 Mon Sep 17 00:00:00 2001
From: Wang ShaoBo
Date: Wed, 23 Nov 2022 14:51:24 +0800
Subject: tracing: Remove unnecessary NULL assignment

Remove an unnecessary NULL assignment in create_new_subsystem().

Link: https://lkml.kernel.org/r/20221123065124.3982439-1-bobo.shaobowang@huawei.com

Signed-off-by: Wang ShaoBo
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cf3fd74fa675..b52c4d79eb78 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2261,8 +2261,6 @@ create_new_subsystem(const char *name)
 	if (!system->name)
 		goto out_free;

-	system->filter = NULL;
-
 	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
 	if (!system->filter)
 		goto out_free;
--
cgit v1.2.1
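For context on this last patch: the removed line was a dead store, since
the pointer is reassigned on the very next line, and kzalloc() returns
zeroed memory anyway, so no separate initialization is needed. A minimal
userspace rendering of the pattern, with calloc() standing in for
kzalloc() and a trimmed-down, illustrative struct layout:

    #include <stdio.h>
    #include <stdlib.h>

    struct event_filter { char *filter_string; };
    struct event_subsystem { struct event_filter *filter; };

    int main(void)
    {
    	struct event_subsystem sys;

    	/* Redundant form: the NULL store is dead, because the calloc()
    	 * result overwrites it immediately (kzalloc() plays the same
    	 * role in create_new_subsystem()). */
    	sys.filter = NULL;
    	sys.filter = calloc(1, sizeof(*sys.filter));
    	if (!sys.filter)
    		return 1;

    	/* calloc(), like kzalloc(), zero-fills: filter_string is NULL. */
    	printf("filter_string = %p\n", (void *)sys.filter->filter_string);
    	free(sys.filter);
    	return 0;
    }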