summaryrefslogtreecommitdiff
path: root/lib/xray
diff options
context:
space:
mode:
authorDean Michael Berris <dberris@google.com>2018-11-05 05:43:22 +0000
committerDean Michael Berris <dberris@google.com>2018-11-05 05:43:22 +0000
commit2279b0f516b147ecafcef3751e0c789c08e889e4 (patch)
tree6be3da93baf2ef1390c972ead8dfd6bd544a08d8 /lib/xray
parent3cbd30261ab7ce78cf97b8844ee79b943e67411f (diff)
downloadcompiler-rt-2279b0f516b147ecafcef3751e0c789c08e889e4.tar.gz
[XRay] Update TSC math to handle wraparound
Summary: Prior to this change, we can run into situations where the TSC we're getting when exiting a function is less than the TSC we got when entering it. This would sometimes cause the counter for cumulative call times to overflow; that counter was also erroneously being stored as a signed 64-bit integer. This change addresses both issues while adding provisions for tracking CPU migrations. We do this because on some architectures the timestamp counters are not guaranteed to be synchronised across CPUs, so moving from one CPU to another can yield inconsistent readings. For the moment, we leave the provisions in place until we can update the data format to include the count of CPU migrations we catch. We also update the necessary tests, ensuring that our expectations for the cycle accounting are met in case of counter wraparound. Reviewers: mboerger Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D54088 git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@346116 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/xray')
-rw-r--r--lib/xray/tests/unit/function_call_trie_test.cc138
-rw-r--r--lib/xray/tests/unit/profile_collector_test.cc16
-rw-r--r--lib/xray/xray_function_call_trie.h55
-rw-r--r--lib/xray/xray_profiling.cc4
4 files changed, 132 insertions, 81 deletions
diff --git a/lib/xray/tests/unit/function_call_trie_test.cc b/lib/xray/tests/unit/function_call_trie_test.cc
index 049ecfb07..9b0f21090 100644
--- a/lib/xray/tests/unit/function_call_trie_test.cc
+++ b/lib/xray/tests/unit/function_call_trie_test.cc
@@ -10,9 +10,9 @@
// This file is a part of XRay, a function call tracing system.
//
//===----------------------------------------------------------------------===//
-#include "gtest/gtest.h"
-
#include "xray_function_call_trie.h"
+#include "gtest/gtest.h"
+#include <cstdint>
namespace __xray {
@@ -29,26 +29,54 @@ TEST(FunctionCallTrieTest, EnterAndExitFunction) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 1);
- Trie.exitFunction(1, 2);
+ uint64_t TSC = 1;
+ uint16_t CPU = 0;
+ Trie.enterFunction(1, TSC++, CPU++);
+ Trie.exitFunction(1, TSC++, CPU++);
+ const auto &R = Trie.getRoots();
- // We need a way to pull the data out. At this point, until we get a data
- // collection service implemented, we're going to export the data as a list of
- // roots, and manually walk through the structure ourselves.
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, HandleTSCOverflow) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.enterFunction(1, std::numeric_limits<uint64_t>::max(), 0);
+ Trie.exitFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_EQ(R.size(), 1u);
ASSERT_EQ(R.front()->FId, 1);
- ASSERT_EQ(R.front()->CallCount, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
}
+TEST(FunctionCallTrieTest, MaximalCumulativeTime) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ Trie.enterFunction(1, 1, 0);
+ Trie.exitFunction(1, 0, 0);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1u);
+ ASSERT_EQ(R.front()->CumulativeLocalTime,
+ std::numeric_limits<uint64_t>::max() - 1);
+}
+
TEST(FunctionCallTrieTest, MissingFunctionEntry) {
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.exitFunction(1, 1);
+ Trie.exitFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_TRUE(R.empty());
@@ -58,9 +86,9 @@ TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(2, 1);
- Trie.enterFunction(3, 3);
- Trie.exitFunction(1, 5);
+ Trie.enterFunction(2, 1, 0);
+ Trie.enterFunction(3, 3, 0);
+ Trie.exitFunction(1, 5, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
@@ -71,7 +99,7 @@ TEST(FunctionCallTrieTest, MissingFunctionExit) {
profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 1);
+ Trie.enterFunction(1, 1, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
@@ -84,12 +112,12 @@ TEST(FunctionCallTrieTest, MultipleRoots) {
FunctionCallTrie Trie(A);
// Enter and exit FId = 1.
- Trie.enterFunction(1, 1);
- Trie.exitFunction(1, 2);
+ Trie.enterFunction(1, 1, 0);
+ Trie.exitFunction(1, 2, 0);
// Enter and exit FId = 2.
- Trie.enterFunction(2, 3);
- Trie.exitFunction(2, 4);
+ Trie.enterFunction(2, 3, 0);
+ Trie.exitFunction(2, 4, 0);
const auto &R = Trie.getRoots();
ASSERT_FALSE(R.empty());
@@ -126,11 +154,11 @@ TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 0);
- Trie.enterFunction(2, 100);
- Trie.enterFunction(3, 200);
- Trie.exitFunction(3, 300);
- Trie.exitFunction(1, 400);
+ Trie.enterFunction(1, 0, 0);
+ Trie.enterFunction(2, 100, 0);
+ Trie.enterFunction(3, 200, 0);
+ Trie.exitFunction(3, 300, 0);
+ Trie.exitFunction(1, 400, 0);
// What we should see at this point is all the functions in the trie in a
// specific order (1 -> 2 -> 3) with the appropriate count(s) and local
@@ -153,12 +181,12 @@ TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
// Now that we've established the preconditions, we check for specific aspects
// of the nodes.
- EXPECT_EQ(F3.CallCount, 1);
- EXPECT_EQ(F2.CallCount, 1);
- EXPECT_EQ(F1.CallCount, 1);
- EXPECT_EQ(F3.CumulativeLocalTime, 100);
- EXPECT_EQ(F2.CumulativeLocalTime, 300);
- EXPECT_EQ(F1.CumulativeLocalTime, 100);
+ EXPECT_EQ(F3.CallCount, 1u);
+ EXPECT_EQ(F2.CallCount, 1u);
+ EXPECT_EQ(F1.CallCount, 1u);
+ EXPECT_EQ(F3.CumulativeLocalTime, 100u);
+ EXPECT_EQ(F2.CumulativeLocalTime, 300u);
+ EXPECT_EQ(F1.CumulativeLocalTime, 100u);
}
TEST(FunctionCallTrieTest, DeepCallStack) {
@@ -168,8 +196,8 @@ TEST(FunctionCallTrieTest, DeepCallStack) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
for (int i = 0; i < 32; ++i)
- Trie.enterFunction(i + 1, i);
- Trie.exitFunction(1, 33);
+ Trie.enterFunction(i + 1, i, 0);
+ Trie.exitFunction(1, 33, 0);
// Here, validate that we have a 32-level deep function call path from the
// root (1) down to the leaf (33).
@@ -178,7 +206,7 @@ TEST(FunctionCallTrieTest, DeepCallStack) {
auto F = R[0];
for (int i = 0; i < 32; ++i) {
EXPECT_EQ(F->FId, i + 1);
- EXPECT_EQ(F->CallCount, 1);
+ EXPECT_EQ(F->CallCount, 1u);
if (F->Callees.empty() && i != 31)
FAIL() << "Empty callees for FId " << F->FId;
if (i != 31)
@@ -193,12 +221,12 @@ TEST(FunctionCallTrieTest, DeepCopy) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
- Trie.enterFunction(1, 0);
- Trie.enterFunction(2, 1);
- Trie.exitFunction(2, 2);
- Trie.enterFunction(3, 3);
- Trie.exitFunction(3, 4);
- Trie.exitFunction(1, 5);
+ Trie.enterFunction(1, 0, 0);
+ Trie.enterFunction(2, 1, 0);
+ Trie.exitFunction(2, 2, 0);
+ Trie.enterFunction(3, 3, 0);
+ Trie.exitFunction(3, 4, 0);
+ Trie.exitFunction(1, 5, 0);
// We want to make a deep copy and compare notes.
auto B = FunctionCallTrie::InitAllocators();
@@ -236,20 +264,20 @@ TEST(FunctionCallTrieTest, MergeInto) {
FunctionCallTrie T1(A);
// 1 -> 2 -> 3
- T0.enterFunction(1, 0);
- T0.enterFunction(2, 1);
- T0.enterFunction(3, 2);
- T0.exitFunction(3, 3);
- T0.exitFunction(2, 4);
- T0.exitFunction(1, 5);
+ T0.enterFunction(1, 0, 0);
+ T0.enterFunction(2, 1, 0);
+ T0.enterFunction(3, 2, 0);
+ T0.exitFunction(3, 3, 0);
+ T0.exitFunction(2, 4, 0);
+ T0.exitFunction(1, 5, 0);
// 1 -> 2 -> 3
- T1.enterFunction(1, 0);
- T1.enterFunction(2, 1);
- T1.enterFunction(3, 2);
- T1.exitFunction(3, 3);
- T1.exitFunction(2, 4);
- T1.exitFunction(1, 5);
+ T1.enterFunction(1, 0, 0);
+ T1.enterFunction(2, 1, 0);
+ T1.enterFunction(3, 2, 0);
+ T1.exitFunction(3, 3, 0);
+ T1.exitFunction(2, 4, 0);
+ T1.exitFunction(1, 5, 0);
// We use a different allocator here to make sure that we're able to transfer
// data into a FunctionCallTrie which uses a different allocator. This
@@ -264,20 +292,20 @@ TEST(FunctionCallTrieTest, MergeInto) {
ASSERT_EQ(Merged.getRoots().size(), 1u);
const auto &R0 = *Merged.getRoots()[0];
EXPECT_EQ(R0.FId, 1);
- EXPECT_EQ(R0.CallCount, 2);
- EXPECT_EQ(R0.CumulativeLocalTime, 10);
+ EXPECT_EQ(R0.CallCount, 2u);
+ EXPECT_EQ(R0.CumulativeLocalTime, 10u);
EXPECT_EQ(R0.Callees.size(), 1u);
const auto &F1 = *R0.Callees[0].NodePtr;
EXPECT_EQ(F1.FId, 2);
- EXPECT_EQ(F1.CallCount, 2);
- EXPECT_EQ(F1.CumulativeLocalTime, 6);
+ EXPECT_EQ(F1.CallCount, 2u);
+ EXPECT_EQ(F1.CumulativeLocalTime, 6u);
EXPECT_EQ(F1.Callees.size(), 1u);
const auto &F2 = *F1.Callees[0].NodePtr;
EXPECT_EQ(F2.FId, 3);
- EXPECT_EQ(F2.CallCount, 2);
- EXPECT_EQ(F2.CumulativeLocalTime, 2);
+ EXPECT_EQ(F2.CallCount, 2u);
+ EXPECT_EQ(F2.CumulativeLocalTime, 2u);
EXPECT_EQ(F2.Callees.size(), 0u);
}
diff --git a/lib/xray/tests/unit/profile_collector_test.cc b/lib/xray/tests/unit/profile_collector_test.cc
index 67049af2c..f06b7027e 100644
--- a/lib/xray/tests/unit/profile_collector_test.cc
+++ b/lib/xray/tests/unit/profile_collector_test.cc
@@ -121,10 +121,10 @@ TEST(profileCollectorServiceTest, PostSerializeCollect) {
FunctionCallTrie T(Allocators);
// Then, we populate the trie with some data.
- T.enterFunction(1, 1);
- T.enterFunction(2, 2);
- T.exitFunction(2, 3);
- T.exitFunction(1, 4);
+ T.enterFunction(1, 1, 0);
+ T.enterFunction(2, 2, 0);
+ T.exitFunction(2, 3, 0);
+ T.exitFunction(1, 4, 0);
// Then we post the data to the global profile collector service.
profileCollectorService::post(T, 1);
@@ -177,10 +177,10 @@ void threadProcessing() {
thread_local auto Allocators = FunctionCallTrie::InitAllocators();
FunctionCallTrie T(Allocators);
- T.enterFunction(1, 1);
- T.enterFunction(2, 2);
- T.exitFunction(2, 3);
- T.exitFunction(1, 4);
+ T.enterFunction(1, 1, 0);
+ T.enterFunction(2, 2, 0);
+ T.exitFunction(2, 3, 0);
+ T.exitFunction(1, 4, 0);
profileCollectorService::post(T, GetTid());
}
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
index f4c2fc335..4a6a94ca9 100644
--- a/lib/xray/xray_function_call_trie.h
+++ b/lib/xray/xray_function_call_trie.h
@@ -18,6 +18,7 @@
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
+#include <limits>
#include <memory> // For placement new.
#include <utility>
@@ -113,13 +114,13 @@ public:
struct Node {
Node *Parent;
NodeIdPairArray Callees;
- int64_t CallCount;
- int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+ uint64_t CallCount;
+ uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
int32_t FId;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+ Node(Node *P, NodeIdPairAllocatorType &A, uint64_t CC, uint64_t CLT,
int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P),
Callees(A),
CallCount(CC),
@@ -133,11 +134,14 @@ private:
struct ShadowStackEntry {
uint64_t EntryTSC;
Node *NodePtr;
+ uint16_t EntryCPU;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- ShadowStackEntry(uint64_t T, Node *N) XRAY_NEVER_INSTRUMENT : EntryTSC{T},
- NodePtr{N} {}
+ ShadowStackEntry(uint64_t T, Node *N, uint16_t C) XRAY_NEVER_INSTRUMENT
+ : EntryTSC{T},
+ NodePtr{N},
+ EntryCPU{C} {}
};
using NodeArray = Array<Node>;
@@ -262,17 +266,18 @@ public:
ShadowStack(*A.ShadowStackAllocator),
NodeIdPairAllocator(A.NodeIdPairAllocator) {}
- void enterFunction(const int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ void enterFunction(const int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
DCHECK_NE(FId, 0);
// This function primarily deals with ensuring that the ShadowStack is
// consistent and ready for when an exit event is encountered.
if (UNLIKELY(ShadowStack.empty())) {
auto NewRoot =
- Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+ Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0u, 0u, FId);
if (UNLIKELY(NewRoot == nullptr))
return;
Roots.Append(NewRoot);
- ShadowStack.AppendEmplace(TSC, NewRoot);
+ ShadowStack.AppendEmplace(TSC, NewRoot, CPU);
return;
}
@@ -286,23 +291,24 @@ public:
[FId](const NodeIdPair &NR) { return NR.FId == FId; });
if (Callee != nullptr) {
CHECK_NE(Callee->NodePtr, nullptr);
- ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
+ ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU);
return;
}
// This means we've never seen this stack before, create a new node here.
auto NewNode =
- Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0u, 0u, FId);
if (UNLIKELY(NewNode == nullptr))
return;
DCHECK_NE(NewNode, nullptr);
TopNode->Callees.AppendEmplace(NewNode, FId);
- ShadowStack.AppendEmplace(TSC, NewNode);
+ ShadowStack.AppendEmplace(TSC, NewNode, CPU);
DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
return;
}
- void exitFunction(int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ void exitFunction(int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
// When we exit a function, we look up the ShadowStack to see whether we've
// entered this function before. We do as little processing here as we can,
// since most of the hard work would have already been done at function
@@ -312,7 +318,24 @@ public:
const auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
DCHECK_NE(TopNode, nullptr);
- auto LocalTime = TSC - Top.EntryTSC;
+
+ // We may encounter overflow on the TSC we're provided, which may end up
+ // being less than the TSC when we first entered the function.
+ //
+ // To get the accurate measurement of cycles, we need to check whether
+ // we've overflowed (TSC < Top.EntryTSC) and then account the difference
+ // between the entry TSC and the max for the TSC counter (max of uint64_t)
+ // then add the value of TSC. We can prove that the maximum delta we will
+ // get is at most the 64-bit unsigned value, since the difference between
+ // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max()
+ // - 1) + 1.
+ //
+ // NOTE: This assumes that TSCs are synchronised across CPUs.
+ // TODO: Count the number of times we've seen CPU migrations.
+ uint64_t LocalTime =
+ Top.EntryTSC > TSC
+ ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
+ : TSC - Top.EntryTSC;
TopNode->CallCount++;
TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
CumulativeTreeTime += LocalTime;
@@ -410,8 +433,8 @@ public:
auto R = O.Roots.find_element(
[&](const Node *Node) { return Node->FId == Root->FId; });
if (R == nullptr) {
- TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
- 0, Root->FId);
+ TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0u,
+ 0u, Root->FId);
if (UNLIKELY(TargetRoot == nullptr))
return;
@@ -436,7 +459,7 @@ public:
});
if (TargetCallee == nullptr) {
auto NewTargetNode = O.Nodes.AppendEmplace(
- NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+ NT.TargetNode, *O.NodeIdPairAllocator, 0u, 0u, Callee.FId);
if (UNLIKELY(NewTargetNode == nullptr))
return;
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
index 50dbd7f12..4b9222a13 100644
--- a/lib/xray/xray_profiling.cc
+++ b/lib/xray/xray_profiling.cc
@@ -197,11 +197,11 @@ void profilingHandleArg0(int32_t FuncId,
switch (Entry) {
case XRayEntryType::ENTRY:
case XRayEntryType::LOG_ARGS_ENTRY:
- TLD.FCT->enterFunction(FuncId, TSC);
+ TLD.FCT->enterFunction(FuncId, TSC, CPU);
break;
case XRayEntryType::EXIT:
case XRayEntryType::TAIL:
- TLD.FCT->exitFunction(FuncId, TSC);
+ TLD.FCT->exitFunction(FuncId, TSC, CPU);
break;
default:
// FIXME: Handle bugs.