//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// A single header library providing a utility class to break up an array of
// bytes (supposedly provided by a fuzzing engine) for multiple consumers.
// Whenever run on the same input, provides the same output, as long as its
// methods are called in the same order, with the same arguments.
//===----------------------------------------------------------------------===//
//
// Provenance (taken from the patch this header was distributed in; the patch
// created lib/fuzzer/utils/FuzzedDataProvider.h as a new file):
//
//   Commit:  dd1051d9cd25f713c3b050c157bfe1ddf44ffb84
//   From:    Max Moroz
//   Date:    Tue, 11 Jun 2019 14:30:18 +0000
//   Subject: Add FuzzedDataProvider helper class / single header library.
//
//   Summary: This class is useful for writing fuzz targets that have multiple
//   inputs. Current CL imports the existing `FuzzedDataProvider` from Chromium
//   without any modifications. Feel free to review it thoroughly, if you're
//   interested, but I'd prefer changing the class in a follow up CL.
//
//   The CL also introduces an exhaustive test for the library, as the behavior
//   of `FuzzedDataProvider` must not change over time.
//
//   In follow up CLs I'm planning on changing some implementation details (I
//   can share a doc with some comments to be addressed). After that, we will
//   document how `FuzzedDataProvider` should be used.
//
//   I have tested this on Linux, Windows and Mac platforms.
//
//   Reviewers: morehouse, metzman, kcc
//   Reviewed By: morehouse
//   Subscribers: metzman, thakis, rnk, mgorny, ormris, delcypher, #sanitizers,
//                llvm-commits
//   Tags: #llvm, #sanitizers
//   Differential Revision: https://reviews.llvm.org/D62733
//   git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@363071
//               91177308-0d34-0410-b5e6-96231b3b80d8
//
// NOTE(review): the include targets and every template argument list had been
// lost from the extracted patch text. They are reconstructed here from the
// names the code uses; confirm against the upstream revision before relying on
// an exact match.
//===----------------------------------------------------------------------===//

#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_

#include <limits.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// Splits one fuzzer-provided byte buffer into typed values for several
// consumers. Bulk data (|ConsumeBytes|, strings) is read from the front of the
// buffer, while integral values are read from the back, so that lengths and
// payload do not interleave. The provider never owns the buffer.
class FuzzedDataProvider {
 public:
  using data_type = uint8_t;

  // |data| is an array of length |size| that the FuzzedDataProvider wraps to
  // provide more granular access. |data| must outlive the FuzzedDataProvider.
  FuzzedDataProvider(const uint8_t* data, size_t size)
      : data_ptr_(data), remaining_bytes_(size) {}
  ~FuzzedDataProvider() = default;

  // Returns a std::vector containing |num_bytes| of input data. If fewer than
  // |num_bytes| of data remain, returns a shorter std::vector containing all
  // of the data that's left. |T| must be a one-byte type.
  template <typename T>
  std::vector<T> ConsumeBytes(size_t num_bytes) {
    static_assert(sizeof(T) == sizeof(data_type), "Incompatible data type.");

    num_bytes = std::min(num_bytes, remaining_bytes_);

    // The point of using the size-based constructor below is to increase the
    // odds of having a vector object with capacity being equal to the length.
    // That part is always implementation specific, but at least both libc++ and
    // libstdc++ allocate the requested number of bytes in that constructor,
    // which seems to be a natural choice for other implementations as well.
    // To increase the odds even more, we also call |shrink_to_fit| below.
    std::vector<T> result(num_bytes);

    // Skip the copy for an empty request: memcpy requires valid pointers even
    // when the count is zero, and |data_ptr_| may be null for an empty input.
    if (num_bytes != 0)
      std::memcpy(result.data(), data_ptr_, num_bytes);
    Advance(num_bytes);

    // Even though |shrink_to_fit| is also implementation specific, we expect it
    // to provide an additional assurance in case vector's constructor allocated
    // a buffer which is larger than the actual amount of data we put inside it.
    result.shrink_to_fit();
    return result;
  }

  // Prefer using |ConsumeBytes| unless you actually need a std::string object.
  // Returns a std::string containing |num_bytes| of input data. If fewer than
  // |num_bytes| of data remain, returns a shorter std::string containing all
  // of the data that's left.
  std::string ConsumeBytesAsString(size_t num_bytes) {
    static_assert(sizeof(std::string::value_type) == sizeof(data_type),
                  "ConsumeBytesAsString cannot convert the data to a string.");

    num_bytes = std::min(num_bytes, remaining_bytes_);

    // Nothing to copy; also avoids handing a possibly-null pointer to the
    // std::string constructor.
    if (num_bytes == 0)
      return std::string();

    std::string result(
        reinterpret_cast<const std::string::value_type*>(data_ptr_), num_bytes);
    Advance(num_bytes);
    return result;
  }

  // Returns a number in the range [min, max] by consuming bytes from the input
  // data. The value might not be uniformly distributed in the given range. If
  // there's no input data left, always returns |min|. |min| must be less than
  // or equal to |max|; the process is aborted otherwise.
  template <typename T>
  T ConsumeIntegralInRange(T min, T max) {
    static_assert(std::is_integral<T>::value, "An integral type is required.");
    static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");

    if (min > max)
      std::abort();

    // Use the biggest type possible to hold the range and the result. The
    // subtraction is done in unsigned arithmetic so that it wraps to the right
    // width of the range even for signed |T|.
    const uint64_t range =
        static_cast<uint64_t>(max) - static_cast<uint64_t>(min);
    uint64_t result = 0;
    size_t offset = 0;

    // Consume only as many bytes as are needed to cover |range|, and never
    // more than sizeof(T) of them.
    while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 &&
           remaining_bytes_ != 0) {
      // Pull bytes off the end of the seed data. Experimentally, this seems to
      // allow the fuzzer to more easily explore the input space. This makes
      // sense, since it works by modifying inputs that caused new code to run,
      // and this data is often used to encode length of data read by
      // |ConsumeBytes|. Separating out read lengths makes it easier to modify
      // the contents of the data that is actually read.
      --remaining_bytes_;
      result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
      offset += CHAR_BIT;
    }

    // Avoid division by 0, in the case |range + 1| results in overflow.
    if (range != std::numeric_limits<decltype(range)>::max())
      result = result % (range + 1);

    return static_cast<T>(static_cast<uint64_t>(min) + result);
  }

  // Returns a std::string of length from 0 to |max_length|. When it runs out of
  // input data, returns what remains of the input. Designed to be more stable
  // with respect to a fuzzer inserting characters than just picking a random
  // length and then consuming that many bytes with |ConsumeBytes|.
  std::string ConsumeRandomLengthString(size_t max_length) {
    // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\"
    // followed by anything else to the end of the string. As a result of this
    // logic, a fuzzer can insert characters into the string, and the string
    // will be lengthened to include those new characters, resulting in a more
    // stable fuzzer than picking the length of a string independently from
    // picking its contents.
    std::string result;
    for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
      char next = static_cast<char>(data_ptr_[0]);
      Advance(1);
      if (next == '\\' && remaining_bytes_ != 0) {
        next = static_cast<char>(data_ptr_[0]);
        Advance(1);
        // A backslash followed by anything but a backslash ends the string;
        // both bytes have been consumed and neither is part of the result.
        if (next != '\\')
          break;
      }
      result += next;
    }

    // Applied on every exit path, including the terminator case above.
    result.shrink_to_fit();
    return result;
  }

  // Returns a std::vector containing all remaining bytes of the input data.
  template <typename T>
  std::vector<T> ConsumeRemainingBytes() {
    return ConsumeBytes<T>(remaining_bytes_);
  }

  // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string
  // object.
  // Returns a std::string containing all remaining bytes of the input data.
  std::string ConsumeRemainingBytesAsString() {
    return ConsumeBytesAsString(remaining_bytes_);
  }

  // Returns a number in the range [Type's min, Type's max]. The value might
  // not be uniformly distributed in the given range. If there's no input data
  // left, always returns |min|.
  template <typename T>
  T ConsumeIntegral() {
    return ConsumeIntegralInRange(std::numeric_limits<T>::min(),
                                  std::numeric_limits<T>::max());
  }

  // Reads one byte and returns a bool, or false when no data remains.
  bool ConsumeBool() { return 1 & ConsumeIntegral<uint8_t>(); }

  // Returns a value from |array|, consuming as many bytes as needed to do so.
  // |array| must be a fixed-size, non-empty array.
  template <typename T, size_t size>
  T PickValueInArray(T (&array)[size]) {
    static_assert(size > 0, "The array must be non empty.");
    return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
  }

  // Return an enum value. The enum must start at 0 and be contiguous. It must
  // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as:
  // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
  template <typename T>
  T ConsumeEnum() {
    static_assert(std::is_enum<T>::value, "|T| must be an enum type.");
    return static_cast<T>(ConsumeIntegralInRange<uint32_t>(
        0, static_cast<uint32_t>(T::kMaxValue)));
  }

  // Reports the remaining bytes available for fuzzed input.
  size_t remaining_bytes() const { return remaining_bytes_; }

 private:
  FuzzedDataProvider(const FuzzedDataProvider&) = delete;
  FuzzedDataProvider& operator=(const FuzzedDataProvider&) = delete;

  // Drops |num_bytes| from the front of the unread data. Aborts if asked to
  // skip more than what is left.
  void Advance(size_t num_bytes) {
    if (num_bytes > remaining_bytes_)
      std::abort();

    data_ptr_ += num_bytes;
    remaining_bytes_ -= num_bytes;
  }

  // First unread byte; integral reads take bytes from the other end, at
  // |data_ptr_[remaining_bytes_ - 1]|.
  const data_type* data_ptr_;
  // Number of unread bytes starting at |data_ptr_|.
  size_t remaining_bytes_;
};

#endif  // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_