summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSven Hassler <sven_hassler@mentor.com>2015-12-10 13:22:27 +0100
committerLutz Helwing <lutz_helwing@mentor.com>2015-12-16 17:20:39 +0100
commitba46036d828fcd5f79ab7609025007b3dba46aee (patch)
tree0dd678966a0a3d2a4b5da8129f1c495ba99fba7e
parent77400754611a8e7982a7ee3c4816ee5f070c0dca (diff)
downloadDLT-daemon-ba46036d828fcd5f79ab7609025007b3dba46aee.tar.gz
Added documentation, bugfixes, improvements
- Added dlt-kpi.txt as asciidoc document. - Fixed a bug where extremely high CPU- and IO-Wait times were calculated. - CPU- and IO-Wait time are now being divided by CPU count.
-rw-r--r--doc/CMakeLists.txt1
-rw-r--r--doc/dlt-kpi.txt145
-rw-r--r--src/kpi/dlt-kpi-common.c48
-rw-r--r--src/kpi/dlt-kpi-common.h1
-rw-r--r--src/kpi/dlt-kpi-process.c26
5 files changed, 213 insertions, 8 deletions
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index b8f0ae8..87fb339 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -45,6 +45,7 @@ if(WITH_DOC)
COMMAND asciidoc -a TOC1 -o ${CMAKE_BINARY_DIR}/doc/manuals/dlt_book.html ${CMAKE_SOURCE_DIR}/doc/dlt_book.txt
COMMAND asciidoc -a TOC1 -o ${CMAKE_BINARY_DIR}/doc/manuals/dlt_extended_network_trace.html ${CMAKE_SOURCE_DIR}/doc/dlt_extended_network_trace.txt
COMMAND asciidoc -a TOC1 -o ${CMAKE_BINARY_DIR}/doc/manuals/dlt_howto_debug.html ${CMAKE_SOURCE_DIR}/doc/dlt_howto_debug.txt
+ COMMAND asciidoc -a TOC1 -o ${CMAKE_BINARY_DIR}/doc/manuals/dlt-kpi.html ${CMAKE_SOURCE_DIR}/doc/dlt-kpi.txt
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/doc
)
diff --git a/doc/dlt-kpi.txt b/doc/dlt-kpi.txt
new file mode 100644
index 0000000..384ba7b
--- /dev/null
+++ b/doc/dlt-kpi.txt
@@ -0,0 +1,145 @@
+////
+# SPDX license identifier: MPL-2.0
+#
+# Copyright (C) 2011-2015, BMW AG
+#
+# This file is part of GENIVI Project DLT - Diagnostic Log and Trace.
+#
+# This Source Code Form is subject to the terms of the
+# Mozilla Public License (MPL), v. 2.0.
+# If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# For further information see http://www.genivi.org/.
+////
+
+DLT KPI
+=======
+Sven Hassler <Sven_Hassler@mentor.com>
+0.0.1, 14th December 2015: Initial version
+
+image::images/genivi_chrome_1_transparent.png[width=128]
+
+Overview
+--------
+_DLT KPI_ is a tool to provide log messages about **K**ey **P**erformance **I**ndicators to the DLT Daemon. The log message format is designed to be both readable by humans and to be parsed by DLT Viewer plugins.
+The information source for the dlt-kpi tool is the /proc file system.
+
+
+Message format
+--------------
+_DLT KPI_ logs its messages as human readable ASCII messages, divided in multiple arguments. The tool will log messages in user defined intervals, which can be set in the configuration file dlt-kpi.conf.
+
+Identifiers and their datasets:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The logged messages always start with a three character long identifier as first argument. After this identifier, they can contain multiple datasets separated in the remaining arguments. The datasets contain information separated by semicolons. The order and meaning of those information chunks is defined below. +
+The following will explain the meaning to each three-character-identifier and each information chunk of the datasets associated with this identifier. The example messages all contain only one dataset - in real use, many messages will contain multiple datasets (one per argument).
+
+[NOTE]
+Arguments are delimited by spaces when shown in ASCII, but dlt-viewer plugins can easily access each argument separately by certain methods, which makes arguments useful for parsing.
+
+* *NEW*
++
+This identifies a message that contains datasets describing newly created processes.
++
+The datasets in these messages have the following form:
++
+----
+[PID];[Parent PID];[Commandline]
+----
++
+====
+Example message:
+----
+NEW 21226;1;/usr/libexec/nm-dispatcher
+----
+====
+
+* *STP*
++
+This identifies a message that contains datasets describing processes that have ended since the last interval.
++
+The datasets in these messages have the following form:
++
+----
+[PID]
+----
++
+====
+Example message:
+----
+STP 20541
+----
+====
+
+* *ACT*
++
+This identifies a message that contains datasets describing active processes. These are processes that have consumed CPU time since the last interval.
++
+The datasets in these messages have the following form:
++
+----
+[PID];[CPU time in milliseconds per second];[RSS bytes];[CTX-switches since last interval];[I/O bytes];[I/O wait time in milliseconds per second]
+----
++
+====
+Example message:
+----
+ACT 20503;10;389;3;1886649;0
+----
+====
++
+[NOTE]
+The _CPU time_ value is the active time of a process in milliseconds, divided by the number of CPU cores. So this value should never get greater than 1000ms, which would mean 100% CPU usage.
+
+* *CHK*
++
+This identifies a message that is logged for each process in a certain interval. These messages can be used to get a list of currently existing processes and to keep a plugin, that tracks running processes, up to date if messages were lost or if the commandlines have changed.
++
+The datasets in these messages have the following form:
++
+----
+[PID];[Commandline]
+----
++
+====
+Example message:
+----
+CHK 660;/sbin/audispd
+----
+====
+
+* *IRQ*
++
+This identifies a message that contains datasets describing the numbers of interrupts that occurred on each CPU.
++
+The datasets in these messages have the following form:
++
+----
+[IRQ name];cpu[CPU number];[Number of total interrupts];
+----
++
+====
+Example message:
+----
+IRQ 0;cpu0:133;cpu1:0; 1;cpu0:76827;cpu1:0;
+----
+====
+
+Synchronization messages:
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Because the messages can get too long for logging and segmented network messages don't allow for individually set arguments, the datasets can be splitted into multiple messages of the same type (i.e. they have the same identifier). This can make it difficult for an observer (human or machine) to keep track of currently valid information. For example, one can't be sure if a process is part of the list of currently active processes or not, or if this message was part of an older interval that simply arrived too late. So, to correctly associate these messages to each other, each group of potentially "segmented" messages is surrounded by two synchronization messages which start with the same identifier, followed by the codes _BEG_ (for the opening sync message) or _END_ (for the closing sync message). Synchronization messages do not contain datasets.
+
+Example (Messages have been shortened for simplicity):
+
+----
+ACT BEG
+ACT 21768;10;417;3;672075;0 19284;20;15857;303654;22932174;0 1826;20;39781;4404293;154392870;0
+ACT 1635;10;10696;8412557;375710810;0 990;10;22027;1176631;0;0
+ACT END
+----
+
+Only processes that are part of this group are active at this moment. _ACT_ messages that came before this message-group are invalid now. +
+It can also happen that, between a _BEG_ and an _END_ sync message, there are messages of other types. So, plugins should not expect these message groups to always be a "solid block", but react on each message individually and dynamically, and store the logged information until the closing _END_ message arrives.
diff --git a/src/kpi/dlt-kpi-common.c b/src/kpi/dlt-kpi-common.c
index c0c7767..26c4e28 100644
--- a/src/kpi/dlt-kpi-common.c
+++ b/src/kpi/dlt-kpi-common.c
@@ -26,6 +26,8 @@
#include "dlt-kpi-common.h"
+static int dlt_kpi_cpu_count = -1;
+
DltReturnValue dlt_kpi_read_file_compact(char *filename, char **target)
{
char buffer[BUFFER_SIZE];
@@ -66,3 +68,49 @@ DltReturnValue dlt_kpi_read_file(char* filename, char* buffer, uint maxLength)
return DLT_RETURN_OK;
}
+
+int dlt_kpi_read_cpu_count()
+{
+ char buffer[BUFFER_SIZE];
+ int ret = dlt_kpi_read_file("/proc/cpuinfo", buffer, sizeof(buffer));
+ if(ret != 0)
+ {
+ fprintf(stderr, "dlt_kpi_get_cpu_count(): Could not read /proc/cpuinfo\n");
+ return -1;
+ }
+
+ char* delim = "[] \t\n";
+ char* tok = strtok(buffer, delim);
+ if(tok == NULL)
+ {
+ fprintf(stderr, "dlt_kpi_get_cpu_count(): Could not extract token\n");
+ return -1;
+ }
+
+ int num = 0;
+ do
+ {
+ if(strcmp(tok, "processor") == 0)
+ num++;
+
+ tok = strtok(NULL, delim);
+ }
+ while(tok != NULL);
+
+ return num;
+}
+
+int dlt_kpi_get_cpu_count()
+{
+ if(dlt_kpi_cpu_count <= 0)
+ {
+ dlt_kpi_cpu_count = dlt_kpi_read_cpu_count();
+ if(dlt_kpi_cpu_count <= 0)
+ {
+ fprintf(stderr, "Could not get CPU count\n");
+ dlt_kpi_cpu_count = -1;
+ }
+ }
+
+ return dlt_kpi_cpu_count;
+}
diff --git a/src/kpi/dlt-kpi-common.h b/src/kpi/dlt-kpi-common.h
index 4e220a8..4777d6f 100644
--- a/src/kpi/dlt-kpi-common.h
+++ b/src/kpi/dlt-kpi-common.h
@@ -36,5 +36,6 @@
DltReturnValue dlt_kpi_read_file(char* filename, char* buffer, uint maxLength);
DltReturnValue dlt_kpi_read_file_compact(char *filename, char **target);
+int dlt_kpi_get_cpu_count();
#endif /* SRC_KPI_DLT_KPI_COMMON_H_ */
diff --git a/src/kpi/dlt-kpi-process.c b/src/kpi/dlt-kpi-process.c
index d126551..168adf7 100644
--- a/src/kpi/dlt-kpi-process.c
+++ b/src/kpi/dlt-kpi-process.c
@@ -43,9 +43,11 @@ DltReturnValue dlt_kpi_process_update_io_wait(DltKpiProcess *process, unsigned l
unsigned long int total_io_wait = dlt_kpi_read_process_stat_to_ulong(process->pid, 42);
+ int cpu_count = dlt_kpi_get_cpu_count();
+
process->io_wait = (total_io_wait - process->last_io_wait) * 1000 / sysconf(_SC_CLK_TCK); // busy milliseconds since last update
- if(time_dif_ms > 0)
- process->io_wait = process->io_wait * 1000 / time_dif_ms; // busy milliseconds per second
+ if(time_dif_ms > 0 && cpu_count > 0)
+ process->io_wait = process->io_wait * 1000 / time_dif_ms / cpu_count; // busy milliseconds per second per CPU
process->last_io_wait = total_io_wait;
@@ -65,9 +67,16 @@ DltReturnValue dlt_kpi_process_update_cpu_time(DltKpiProcess *process, unsigned
unsigned long total_cpu_time = utime + stime;
- process->cpu_time = (total_cpu_time - process->last_cpu_time) * 1000 / sysconf(_SC_CLK_TCK); // busy milliseconds since last update
- if(time_dif_ms > 0)
- process->cpu_time = process->cpu_time * 1000 / time_dif_ms; // busy milliseconds per second
+ if(process->last_cpu_time > 0 && process->last_cpu_time <= total_cpu_time)
+ {
+ int cpu_count = dlt_kpi_get_cpu_count();
+
+ process->cpu_time = (total_cpu_time - process->last_cpu_time) * 1000 / sysconf(_SC_CLK_TCK); // busy milliseconds since last update
+ if(time_dif_ms > 0 && cpu_count > 0)
+ process->cpu_time = process->cpu_time * 1000 / time_dif_ms / cpu_count; // busy milliseconds per second per CPU
+ }
+ else
+ process->cpu_time = 0;
process->last_cpu_time = total_cpu_time;
@@ -319,13 +328,14 @@ unsigned long int dlt_kpi_read_process_stat_to_ulong(pid_t pid, unsigned int ind
}
char *buffer = NULL;
- DltReturnValue tmp = dlt_kpi_read_process_file_to_str(pid, &buffer, "stat");
- if(tmp < DLT_RETURN_OK)
+ if(dlt_kpi_read_process_file_to_str(pid, &buffer, "stat") < DLT_RETURN_OK)
{
+ // fprintf(stderr, "dlt_kpi_read_process_stat_to_ulong(): Error while reading process stat file. Pid: %d. Requested index: %u\n", pid, index); // can happen if process closed shortly before
+
if(buffer != NULL)
free(buffer);
- return tmp;
+ return 0;
}
char *tok = strtok(buffer, " \t\n");