Diffstat (limited to 'lib/scudo/scudo_allocator.cpp')
-rw-r--r-- | lib/scudo/scudo_allocator.cpp | 703
1 file changed, 358 insertions, 345 deletions
diff --git a/lib/scudo/scudo_allocator.cpp b/lib/scudo/scudo_allocator.cpp
index dab6abedc..472017dff 100644
--- a/lib/scudo/scudo_allocator.cpp
+++ b/lib/scudo/scudo_allocator.cpp
@@ -15,56 +15,20 @@
 //===----------------------------------------------------------------------===//
 #include "scudo_allocator.h"
+#include "scudo_crc32.h"
+#include "scudo_flags.h"
+#include "scudo_tsd.h"
 #include "scudo_utils.h"
+#include "sanitizer_common/sanitizer_allocator_checks.h"
 #include "sanitizer_common/sanitizer_allocator_interface.h"
 #include "sanitizer_common/sanitizer_quarantine.h"
-#include <limits.h>
-#include <pthread.h>
-
-#include <cstring>
+#include <errno.h>
+#include <string.h>
 namespace __scudo {
-#if SANITIZER_CAN_USE_ALLOCATOR64
-const uptr AllocatorSpace = ~0ULL;
-const uptr AllocatorSize = 0x40000000000ULL;
-typedef DefaultSizeClassMap SizeClassMap;
-struct AP {
-  static const uptr kSpaceBeg = AllocatorSpace;
-  static const uptr kSpaceSize = AllocatorSize;
-  static const uptr kMetadataSize = 0;
-  typedef __scudo::SizeClassMap SizeClassMap;
-  typedef NoOpMapUnmapCallback MapUnmapCallback;
-  static const uptr kFlags =
-      SizeClassAllocator64FlagMasks::kRandomShuffleChunks;
-};
-typedef SizeClassAllocator64<AP> PrimaryAllocator;
-#else
-// Currently, the 32-bit Sanitizer allocator has not yet benefited from all the
-// security improvements brought to the 64-bit one. This makes the 32-bit
-// version of Scudo slightly less toughened.
-static const uptr RegionSizeLog = 20;
-static const uptr NumRegions = SANITIZER_MMAP_RANGE_SIZE >> RegionSizeLog;
-# if SANITIZER_WORDSIZE == 32
-typedef FlatByteMap<NumRegions> ByteMap;
-# elif SANITIZER_WORDSIZE == 64
-typedef TwoLevelByteMap<(NumRegions >> 12), 1 << 12> ByteMap;
-# endif // SANITIZER_WORDSIZE
-typedef DefaultSizeClassMap SizeClassMap;
-typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap,
-    RegionSizeLog, ByteMap> PrimaryAllocator;
-#endif // SANITIZER_CAN_USE_ALLOCATOR64
-
-typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
-typedef ScudoLargeMmapAllocator SecondaryAllocator;
-typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
-    ScudoAllocator;
-
-static ScudoAllocator &getAllocator();
-
-static thread_local Xorshift128Plus Prng;
 // Global static cookie, initialized at start-up.
 static uptr Cookie;
@@ -72,23 +36,32 @@ static uptr Cookie;
 // at compilation or at runtime.
 static atomic_uint8_t HashAlgorithm = { CRC32Software };
-SANITIZER_WEAK_ATTRIBUTE u32 computeHardwareCRC32(u32 Crc, uptr Data);
-
-INLINE u32 computeCRC32(u32 Crc, uptr Data, u8 HashType) {
-  // If SSE4.2 is defined here, it was enabled everywhere, as opposed to only
-  // for scudo_crc32.cpp. This means that other SSE instructions were likely
-  // emitted at other places, and as a result there is no reason to not use
-  // the hardware version of the CRC32.
+INLINE u32 computeCRC32(u32 Crc, uptr Value, uptr *Array, uptr ArraySize) {
+  // If the hardware CRC32 feature is defined here, it was enabled everywhere,
+  // as opposed to only for scudo_crc32.cpp. This means that other hardware
+  // specific instructions were likely emitted at other places, and as a
+  // result there is no reason to not use it here.
 #if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
-  return computeHardwareCRC32(Crc, Data);
+  Crc = CRC32_INTRINSIC(Crc, Value);
+  for (uptr i = 0; i < ArraySize; i++)
+    Crc = CRC32_INTRINSIC(Crc, Array[i]);
+  return Crc;
 #else
-  if (computeHardwareCRC32 && HashType == CRC32Hardware)
-    return computeHardwareCRC32(Crc, Data);
-  else
-    return computeSoftwareCRC32(Crc, Data);
-#endif // defined(__SSE4_2__)
+  if (atomic_load_relaxed(&HashAlgorithm) == CRC32Hardware) {
+    Crc = computeHardwareCRC32(Crc, Value);
+    for (uptr i = 0; i < ArraySize; i++)
+      Crc = computeHardwareCRC32(Crc, Array[i]);
+    return Crc;
+  }
+  Crc = computeSoftwareCRC32(Crc, Value);
+  for (uptr i = 0; i < ArraySize; i++)
+    Crc = computeSoftwareCRC32(Crc, Array[i]);
+  return Crc;
+#endif // defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
 }
+static ScudoBackendAllocator &getBackendAllocator();
+
 struct ScudoChunk : UnpackedHeader {
   // We can't use the offset member of the chunk itself, as we would double
   // fetch it without any warranty that it wouldn't have been tampered. To
@@ -101,9 +74,11 @@ struct ScudoChunk : UnpackedHeader {
   // Returns the usable size for a chunk, meaning the amount of bytes from the
   // beginning of the user data to the end of the backend allocated chunk.
   uptr getUsableSize(UnpackedHeader *Header) {
-    uptr Size = getAllocator().GetActuallyAllocatedSize(getAllocBeg(Header));
+    uptr Size =
+        getBackendAllocator().getActuallyAllocatedSize(getAllocBeg(Header),
+                                                       Header->FromPrimary);
     if (Size == 0)
-      return Size;
+      return 0;
     return Size - AlignedChunkHeaderSize - (Header->Offset << MinAlignmentLog);
   }
@@ -113,14 +88,14 @@ struct ScudoChunk : UnpackedHeader {
     ZeroChecksumHeader.Checksum = 0;
     uptr HeaderHolder[sizeof(UnpackedHeader) / sizeof(uptr)];
     memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));
-    u8 HashType = atomic_load_relaxed(&HashAlgorithm);
-    u32 Crc = computeCRC32(Cookie, reinterpret_cast<uptr>(this), HashType);
-    for (uptr i = 0; i < ARRAY_SIZE(HeaderHolder); i++)
-      Crc = computeCRC32(Crc, HeaderHolder[i], HashType);
+    u32 Crc = computeCRC32(static_cast<u32>(Cookie),
+                           reinterpret_cast<uptr>(this), HeaderHolder,
+                           ARRAY_SIZE(HeaderHolder));
     return static_cast<u16>(Crc);
   }
-  // Checks the validity of a chunk by verifying its checksum.
+  // Checks the validity of a chunk by verifying its checksum. It doesn't
+  // incur termination in the event of an invalid chunk.
   bool isValid() {
     UnpackedHeader NewUnpackedHeader;
     const AtomicPackedHeader *AtomicHeader =
@@ -130,13 +105,27 @@ struct ScudoChunk : UnpackedHeader {
     return (NewUnpackedHeader.Checksum == computeChecksum(&NewUnpackedHeader));
   }
+  // Nulls out a chunk header. When returning the chunk to the backend, there
+  // is no need to store a valid ChunkAvailable header, as this would be
+  // computationally expensive. Zeroing out serves the same purpose by making
+  // the header invalid. In the extremely rare event where 0 would be a valid
+  // checksum for the chunk, the state of the chunk is ChunkAvailable anyway.
+  COMPILER_CHECK(ChunkAvailable == 0);
+  void eraseHeader() {
+    PackedHeader NullPackedHeader = 0;
+    AtomicPackedHeader *AtomicHeader =
+        reinterpret_cast<AtomicPackedHeader *>(this);
+    atomic_store_relaxed(AtomicHeader, NullPackedHeader);
+  }
+
   // Loads and unpacks the header, verifying the checksum in the process.
   void loadHeader(UnpackedHeader *NewUnpackedHeader) const {
     const AtomicPackedHeader *AtomicHeader =
         reinterpret_cast<const AtomicPackedHeader *>(this);
     PackedHeader NewPackedHeader = atomic_load_relaxed(AtomicHeader);
     *NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
-    if (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader)) {
+    if (UNLIKELY(NewUnpackedHeader->Checksum !=
+                 computeChecksum(NewUnpackedHeader))) {
       dieWithMessage("ERROR: corrupted chunk header at address %p\n", this);
     }
   }
@@ -160,155 +149,134 @@ struct ScudoChunk : UnpackedHeader {
     PackedHeader OldPackedHeader = bit_cast<PackedHeader>(*OldUnpackedHeader);
     AtomicPackedHeader *AtomicHeader =
         reinterpret_cast<AtomicPackedHeader *>(this);
-    if (!atomic_compare_exchange_strong(AtomicHeader,
-                                        &OldPackedHeader,
-                                        NewPackedHeader,
-                                        memory_order_relaxed)) {
+    if (UNLIKELY(!atomic_compare_exchange_strong(AtomicHeader,
+                                                 &OldPackedHeader,
+                                                 NewPackedHeader,
+                                                 memory_order_relaxed))) {
       dieWithMessage("ERROR: race on chunk header at address %p\n", this);
     }
   }
 };
-static bool ScudoInitIsRunning = false;
+ScudoChunk *getScudoChunk(uptr UserBeg) {
+  return reinterpret_cast<ScudoChunk *>(UserBeg - AlignedChunkHeaderSize);
+}
-static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT;
-static pthread_key_t PThreadKey;
-
-static thread_local bool ThreadInited = false;
-static thread_local bool ThreadTornDown = false;
-static thread_local AllocatorCache Cache;
-
-static void teardownThread(void *p) {
-  uptr v = reinterpret_cast<uptr>(p);
-  // The glibc POSIX thread-local-storage deallocation routine calls user
-  // provided destructors in a loop of PTHREAD_DESTRUCTOR_ITERATIONS.
-  // We want to be called last since other destructors might call free and the
-  // like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the
-  // quarantine and swallowing the cache.
-  if (v < PTHREAD_DESTRUCTOR_ITERATIONS) {
-    pthread_setspecific(PThreadKey, reinterpret_cast<void *>(v + 1));
-    return;
-  }
-  drainQuarantine();
-  getAllocator().DestroyCache(&Cache);
-  ThreadTornDown = true;
+struct AllocatorOptions {
+  u32 QuarantineSizeKb;
+  u32 ThreadLocalQuarantineSizeKb;
+  u32 QuarantineChunksUpToSize;
+  bool MayReturnNull;
+  s32 ReleaseToOSIntervalMs;
+  bool DeallocationTypeMismatch;
+  bool DeleteSizeMismatch;
+  bool ZeroContents;
+
+  void setFrom(const Flags *f, const CommonFlags *cf);
+};
+
+void AllocatorOptions::setFrom(const Flags *f, const CommonFlags *cf) {
+  MayReturnNull = cf->allocator_may_return_null;
+  ReleaseToOSIntervalMs = cf->allocator_release_to_os_interval_ms;
+  QuarantineSizeKb = f->QuarantineSizeKb;
+  ThreadLocalQuarantineSizeKb = f->ThreadLocalQuarantineSizeKb;
+  QuarantineChunksUpToSize = f->QuarantineChunksUpToSize;
+  DeallocationTypeMismatch = f->DeallocationTypeMismatch;
+  DeleteSizeMismatch = f->DeleteSizeMismatch;
+  ZeroContents = f->ZeroContents;
 }
-static void initInternal() {
+static void initScudoInternal(const AllocatorOptions &Options);
+
+static bool ScudoInitIsRunning = false;
+
+void initScudo() {
   SanitizerToolName = "Scudo";
   CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");
   ScudoInitIsRunning = true;
-  // Check is SSE4.2 is supported, if so, opt for the CRC32 hardware version.
-  if (testCPUFeature(CRC32CPUFeature)) {
+  // Check if hardware CRC32 is supported in the binary and by the platform, if
+  // so, opt for the CRC32 hardware version of the checksum.
+  if (computeHardwareCRC32 && testCPUFeature(CRC32CPUFeature))
     atomic_store_relaxed(&HashAlgorithm, CRC32Hardware);
-  }
   initFlags();
   AllocatorOptions Options;
   Options.setFrom(getFlags(), common_flags());
-  initAllocator(Options);
+  initScudoInternal(Options);
-  MaybeStartBackgroudThread();
+  // TODO(kostyak): determine if MaybeStartBackgroudThread could be of some use.
   ScudoInitIsRunning = false;
 }
-static void initGlobal() {
-  pthread_key_create(&PThreadKey, teardownThread);
-  initInternal();
-}
-
-static void NOINLINE initThread() {
-  pthread_once(&GlobalInited, initGlobal);
-  pthread_setspecific(PThreadKey, reinterpret_cast<void *>(1));
-  getAllocator().InitCache(&Cache);
-  ThreadInited = true;
-}
-
 struct QuarantineCallback {
   explicit QuarantineCallback(AllocatorCache *Cache)
     : Cache_(Cache) {}
-  // Chunk recycling function, returns a quarantined chunk to the backend.
+  // Chunk recycling function, returns a quarantined chunk to the backend,
+  // first making sure it hasn't been tampered with.
   void Recycle(ScudoChunk *Chunk) {
     UnpackedHeader Header;
     Chunk->loadHeader(&Header);
-    if (Header.State != ChunkQuarantine) {
+    if (UNLIKELY(Header.State != ChunkQuarantine)) {
       dieWithMessage("ERROR: invalid chunk state when recycling address %p\n",
                      Chunk);
     }
+    Chunk->eraseHeader();
     void *Ptr = Chunk->getAllocBeg(&Header);
-    getAllocator().Deallocate(Cache_, Ptr);
+    if (Header.FromPrimary)
+      getBackendAllocator().deallocatePrimary(Cache_, Ptr);
+    else
+      getBackendAllocator().deallocateSecondary(Ptr);
   }
-  /// Internal quarantine allocation and deallocation functions.
+  // Internal quarantine allocation and deallocation functions. We first check
+  // that the batches are indeed serviced by the Primary.
+  // TODO(kostyak): figure out the best way to protect the batches.
+  COMPILER_CHECK(sizeof(QuarantineBatch) < SizeClassMap::kMaxSize);
   void *Allocate(uptr Size) {
-    // The internal quarantine memory cannot be protected by us. But the only
-    // structures allocated are QuarantineBatch, that are 8KB for x64. So we
-    // will use mmap for those, and given that Deallocate doesn't pass a size
-    // in, we enforce the size of the allocation to be sizeof(QuarantineBatch).
-    // TODO(kostyak): switching to mmap impacts greatly performances, we have
-    // to find another solution
-    // CHECK_EQ(Size, sizeof(QuarantineBatch));
-    // return MmapOrDie(Size, "QuarantineBatch");
-    return getAllocator().Allocate(Cache_, Size, 1, false);
+    return getBackendAllocator().allocatePrimary(Cache_, Size);
   }
   void Deallocate(void *Ptr) {
-    // UnmapOrDie(Ptr, sizeof(QuarantineBatch));
-    getAllocator().Deallocate(Cache_, Ptr);
+    getBackendAllocator().deallocatePrimary(Cache_, Ptr);
   }
   AllocatorCache *Cache_;
 };
 typedef Quarantine<QuarantineCallback, ScudoChunk> ScudoQuarantine;
-typedef ScudoQuarantine::Cache QuarantineCache;
-static thread_local QuarantineCache ThreadQuarantineCache;
-
-void AllocatorOptions::setFrom(const Flags *f, const CommonFlags *cf) {
-  MayReturnNull = cf->allocator_may_return_null;
-  ReleaseToOSIntervalMs = cf->allocator_release_to_os_interval_ms;
-  QuarantineSizeMb = f->QuarantineSizeMb;
-  ThreadLocalQuarantineSizeKb = f->ThreadLocalQuarantineSizeKb;
-  DeallocationTypeMismatch = f->DeallocationTypeMismatch;
-  DeleteSizeMismatch = f->DeleteSizeMismatch;
-  ZeroContents = f->ZeroContents;
-}
+typedef ScudoQuarantine::Cache ScudoQuarantineCache;
+COMPILER_CHECK(sizeof(ScudoQuarantineCache) <=
               sizeof(ScudoTSD::QuarantineCachePlaceHolder));
-void AllocatorOptions::copyTo(Flags *f, CommonFlags *cf) const {
-  cf->allocator_may_return_null = MayReturnNull;
-  cf->allocator_release_to_os_interval_ms = ReleaseToOSIntervalMs;
-  f->QuarantineSizeMb = QuarantineSizeMb;
-  f->ThreadLocalQuarantineSizeKb = ThreadLocalQuarantineSizeKb;
-  f->DeallocationTypeMismatch = DeallocationTypeMismatch;
-  f->DeleteSizeMismatch = DeleteSizeMismatch;
-  f->ZeroContents = ZeroContents;
+ScudoQuarantineCache *getQuarantineCache(ScudoTSD *TSD) {
+  return reinterpret_cast<ScudoQuarantineCache *>(
+      TSD->QuarantineCachePlaceHolder);
 }
-struct Allocator {
+struct ScudoAllocator {
   static const uptr MaxAllowedMallocSize =
       FIRST_32_SECOND_64(2UL << 30, 1ULL << 40);
-  ScudoAllocator BackendAllocator;
+  typedef ReturnNullOrDieOnFailure FailureHandler;
+
+  ScudoBackendAllocator BackendAllocator;
   ScudoQuarantine AllocatorQuarantine;
-  // The fallback caches are used when the thread local caches have been
-  // 'detroyed' on thread tear-down. They are protected by a Mutex as they can
-  // be accessed by different threads.
-  StaticSpinMutex FallbackMutex;
-  AllocatorCache FallbackAllocatorCache;
-  QuarantineCache FallbackQuarantineCache;
+  StaticSpinMutex GlobalPrngMutex;
+  ScudoPrng GlobalPrng;
+
+  u32 QuarantineChunksUpToSize;
   bool DeallocationTypeMismatch;
   bool ZeroContents;
   bool DeleteSizeMismatch;
-  explicit Allocator(LinkerInitialized)
-    : AllocatorQuarantine(LINKER_INITIALIZED),
-      FallbackQuarantineCache(LINKER_INITIALIZED) {}
+  explicit ScudoAllocator(LinkerInitialized)
+    : AllocatorQuarantine(LINKER_INITIALIZED) {}
   void init(const AllocatorOptions &Options) {
     // Verify that the header offset field can hold the maximum offset. In the
@@ -320,23 +288,23 @@ struct Allocator {
     // result, the maximum offset will be at most the maximum alignment for the
     // last size class minus the header size, in multiples of MinAlignment.
     UnpackedHeader Header = {};
-    uptr MaxPrimaryAlignment = 1 << MostSignificantSetBitIndex(
-        SizeClassMap::kMaxSize - MinAlignment);
-    uptr MaxOffset = (MaxPrimaryAlignment - AlignedChunkHeaderSize) >>
-        MinAlignmentLog;
+    uptr MaxPrimaryAlignment =
+        1 << MostSignificantSetBitIndex(SizeClassMap::kMaxSize - MinAlignment);
+    uptr MaxOffset =
+        (MaxPrimaryAlignment - AlignedChunkHeaderSize) >> MinAlignmentLog;
     Header.Offset = MaxOffset;
     if (Header.Offset != MaxOffset) {
       dieWithMessage("ERROR: the maximum possible offset doesn't fit in the "
                      "header\n");
     }
-    // Verify that we can fit the maximum amount of unused bytes in the header.
-    // Given that the Secondary fits the allocation to a page, the worst case
-    // scenario happens in the Primary. It will depend on the second to last
-    // and last class sizes, as well as the dynamic base for the Primary. The
-    // following is an over-approximation that works for our needs.
-    uptr MaxUnusedBytes = SizeClassMap::kMaxSize - 1 - AlignedChunkHeaderSize;
-    Header.UnusedBytes = MaxUnusedBytes;
-    if (Header.UnusedBytes != MaxUnusedBytes) {
+    // Verify that we can fit the maximum size or amount of unused bytes in the
+    // header. Given that the Secondary fits the allocation to a page, the worst
+    // case scenario happens in the Primary. It will depend on the second to
+    // last and last class sizes, as well as the dynamic base for the Primary.
+    // The following is an over-approximation that works for our needs.
+    uptr MaxSizeOrUnusedBytes = SizeClassMap::kMaxSize - 1;
+    Header.SizeOrUnusedBytes = MaxSizeOrUnusedBytes;
+    if (Header.SizeOrUnusedBytes != MaxSizeOrUnusedBytes) {
       dieWithMessage("ERROR: the maximum possible unused bytes doesn't fit in "
                      "the header\n");
     }
@@ -344,184 +312,221 @@ struct Allocator {
     DeallocationTypeMismatch = Options.DeallocationTypeMismatch;
     DeleteSizeMismatch = Options.DeleteSizeMismatch;
     ZeroContents = Options.ZeroContents;
-    BackendAllocator.Init(Options.MayReturnNull, Options.ReleaseToOSIntervalMs);
+    SetAllocatorMayReturnNull(Options.MayReturnNull);
+    BackendAllocator.init(Options.ReleaseToOSIntervalMs);
     AllocatorQuarantine.Init(
-        static_cast<uptr>(Options.QuarantineSizeMb) << 20,
+        static_cast<uptr>(Options.QuarantineSizeKb) << 10,
        static_cast<uptr>(Options.ThreadLocalQuarantineSizeKb) << 10);
-    BackendAllocator.InitCache(&FallbackAllocatorCache);
-    Cookie = Prng.Next();
+    QuarantineChunksUpToSize = Options.QuarantineChunksUpToSize;
+    GlobalPrng.init();
+    Cookie = GlobalPrng.getU64();
   }
-  // Helper function that checks for a valid Scudo chunk.
+  // Helper function that checks for a valid Scudo chunk. nullptr isn't.
   bool isValidPointer(const void *UserPtr) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr);
-    if (!IsAligned(ChunkBeg, MinAlignment)) {
+    initThreadMaybe();
+    if (UNLIKELY(!UserPtr))
       return false;
-    }
-    ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
-    return Chunk->isValid();
+    uptr UserBeg = reinterpret_cast<uptr>(UserPtr);
+    if (!IsAligned(UserBeg, MinAlignment))
+      return false;
+    return getScudoChunk(UserBeg)->isValid();
   }
   // Allocates a chunk.
-  void *allocate(uptr Size, uptr Alignment, AllocType Type) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    if (!IsPowerOfTwo(Alignment)) {
-      dieWithMessage("ERROR: alignment is not a power of 2\n");
-    }
-    if (Alignment > MaxAlignment)
-      return BackendAllocator.ReturnNullOrDieOnBadRequest();
-    if (Alignment < MinAlignment)
+  void *allocate(uptr Size, uptr Alignment, AllocType Type,
+                 bool ForceZeroContents = false) {
+    initThreadMaybe();
+    if (UNLIKELY(Alignment > MaxAlignment))
+      return FailureHandler::OnBadRequest();
+    if (UNLIKELY(Alignment < MinAlignment))
       Alignment = MinAlignment;
-    if (Size == 0)
+    if (UNLIKELY(Size >= MaxAllowedMallocSize))
+      return FailureHandler::OnBadRequest();
+    if (UNLIKELY(Size == 0))
      Size = 1;
-    if (Size >= MaxAllowedMallocSize)
-      return BackendAllocator.ReturnNullOrDieOnBadRequest();
     uptr NeededSize = RoundUpTo(Size, MinAlignment) + AlignedChunkHeaderSize;
-    if (Alignment > MinAlignment)
-      NeededSize += Alignment;
-    if (NeededSize >= MaxAllowedMallocSize)
-      return BackendAllocator.ReturnNullOrDieOnBadRequest();
-
-    // Primary backed and Secondary backed allocations have a different
-    // treatment. We deal with alignment requirements of Primary serviced
-    // allocations here, but the Secondary will take care of its own alignment
-    // needs, which means we also have to work around some limitations of the
-    // combined allocator to accommodate the situation.
-    bool FromPrimary = PrimaryAllocator::CanAllocate(NeededSize, MinAlignment);
+    uptr AlignedSize = (Alignment > MinAlignment) ?
+        NeededSize + (Alignment - AlignedChunkHeaderSize) : NeededSize;
+    if (UNLIKELY(AlignedSize >= MaxAllowedMallocSize))
+      return FailureHandler::OnBadRequest();
+
+    // Primary and Secondary backed allocations have a different treatment. We
+    // deal with alignment requirements of Primary serviced allocations here,
+    // but the Secondary will take care of its own alignment needs.
+    bool FromPrimary = PrimaryAllocator::CanAllocate(AlignedSize, MinAlignment);
     void *Ptr;
-    if (LIKELY(!ThreadTornDown)) {
-      Ptr = BackendAllocator.Allocate(&Cache, NeededSize,
-                                      FromPrimary ? MinAlignment : Alignment);
+    u8 Salt;
+    uptr AllocSize;
+    if (FromPrimary) {
+      AllocSize = AlignedSize;
+      ScudoTSD *TSD = getTSDAndLock();
+      Salt = TSD->Prng.getU8();
+      Ptr = BackendAllocator.allocatePrimary(&TSD->Cache, AllocSize);
+      TSD->unlock();
     } else {
-      SpinMutexLock l(&FallbackMutex);
-      Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, NeededSize,
-                                      FromPrimary ? MinAlignment : Alignment);
-    }
-    if (!Ptr)
-      return BackendAllocator.ReturnNullOrDieOnOOM();
-
-    uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
-    // If the allocation was serviced by the secondary, the returned pointer
-    // accounts for ChunkHeaderSize to pass the alignment check of the combined
-    // allocator. Adjust it here.
-    if (!FromPrimary) {
-      AllocBeg -= AlignedChunkHeaderSize;
-      if (Alignment > MinAlignment)
-        NeededSize -= Alignment;
+      {
+        SpinMutexLock l(&GlobalPrngMutex);
+        Salt = GlobalPrng.getU8();
+      }
+      AllocSize = NeededSize;
+      Ptr = BackendAllocator.allocateSecondary(AllocSize, Alignment);
     }
+    if (UNLIKELY(!Ptr))
+      return FailureHandler::OnOOM();
-    uptr ActuallyAllocatedSize = BackendAllocator.GetActuallyAllocatedSize(
-        reinterpret_cast<void *>(AllocBeg));
     // If requested, we will zero out the entire contents of the returned chunk.
-    if (ZeroContents && FromPrimary)
-      memset(Ptr, 0, ActuallyAllocatedSize);
-
-    uptr ChunkBeg = AllocBeg + AlignedChunkHeaderSize;
-    if (!IsAligned(ChunkBeg, Alignment))
-      ChunkBeg = RoundUpTo(ChunkBeg, Alignment);
-    CHECK_LE(ChunkBeg + Size, AllocBeg + NeededSize);
-    ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    if ((ForceZeroContents || ZeroContents) && FromPrimary)
+      memset(Ptr, 0, BackendAllocator.getActuallyAllocatedSize(
+          Ptr, /*FromPrimary=*/true));
+    UnpackedHeader Header = {};
+    uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
+    uptr UserBeg = AllocBeg + AlignedChunkHeaderSize;
+    if (UNLIKELY(!IsAligned(UserBeg, Alignment))) {
+      // Since the Secondary takes care of alignment, a non-aligned pointer
+      // means it is from the Primary. It is also the only case where the offset
+      // field of the header would be non-zero.
+      CHECK(FromPrimary);
+      UserBeg = RoundUpTo(UserBeg, Alignment);
+      uptr Offset = UserBeg - AlignedChunkHeaderSize - AllocBeg;
+      Header.Offset = Offset >> MinAlignmentLog;
+    }
+    CHECK_LE(UserBeg + Size, AllocBeg + AllocSize);
     Header.State = ChunkAllocated;
-    uptr Offset = ChunkBeg - AlignedChunkHeaderSize - AllocBeg;
-    Header.Offset = Offset >> MinAlignmentLog;
     Header.AllocType = Type;
-    Header.UnusedBytes = ActuallyAllocatedSize - Offset -
-                         AlignedChunkHeaderSize - Size;
-    Header.Salt = static_cast<u8>(Prng.Next());
-    Chunk->storeHeader(&Header);
-    void *UserPtr = reinterpret_cast<void *>(ChunkBeg);
-    // TODO(kostyak): hooks sound like a terrible idea security wise but might
-    // be needed for things to work properly?
+    if (FromPrimary) {
+      Header.FromPrimary = 1;
+      Header.SizeOrUnusedBytes = Size;
+    } else {
+      // The secondary fits the allocations to a page, so the amount of unused
+      // bytes is the difference between the end of the user allocation and the
+      // next page boundary.
+      uptr PageSize = GetPageSizeCached();
+      uptr TrailingBytes = (UserBeg + Size) & (PageSize - 1);
+      if (TrailingBytes)
+        Header.SizeOrUnusedBytes = PageSize - TrailingBytes;
+    }
+    Header.Salt = Salt;
+    getScudoChunk(UserBeg)->storeHeader(&Header);
+    void *UserPtr = reinterpret_cast<void *>(UserBeg);
     // if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(UserPtr, Size);
     return UserPtr;
   }
+  // Place a chunk in the quarantine or directly deallocate it in the event of
+  // a zero-sized quarantine, or if the size of the chunk is greater than the
+  // quarantine chunk size threshold.
+  void quarantineOrDeallocateChunk(ScudoChunk *Chunk, UnpackedHeader *Header,
+                                   uptr Size) {
+    const bool BypassQuarantine = (AllocatorQuarantine.GetCacheSize() == 0) ||
+        (Size > QuarantineChunksUpToSize);
+    if (BypassQuarantine) {
+      Chunk->eraseHeader();
+      void *Ptr = Chunk->getAllocBeg(Header);
+      if (Header->FromPrimary) {
+        ScudoTSD *TSD = getTSDAndLock();
+        getBackendAllocator().deallocatePrimary(&TSD->Cache, Ptr);
+        TSD->unlock();
+      } else {
+        getBackendAllocator().deallocateSecondary(Ptr);
+      }
+    } else {
+      // If a small memory amount was allocated with a larger alignment, we want
+      // to take that into account. Otherwise the Quarantine would be filled
+      // with tiny chunks, taking a lot of VA memory. This is an approximation
+      // of the usable size, that allows us to not call
+      // GetActuallyAllocatedSize.
+      uptr EstimatedSize = Size + (Header->Offset << MinAlignmentLog);
+      UnpackedHeader NewHeader = *Header;
+      NewHeader.State = ChunkQuarantine;
+      Chunk->compareExchangeHeader(&NewHeader, Header);
+      ScudoTSD *TSD = getTSDAndLock();
+      AllocatorQuarantine.Put(getQuarantineCache(TSD),
+                              QuarantineCallback(&TSD->Cache),
+                              Chunk, EstimatedSize);
+      TSD->unlock();
+    }
+  }
+
   // Deallocates a Chunk, which means adding it to the delayed free list (or
   // Quarantine).
   void deallocate(void *UserPtr, uptr DeleteSize, AllocType Type) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    // TODO(kostyak): see hook comment above
+    // For a deallocation, we only ensure minimal initialization, meaning thread
+    // local data will be left uninitialized for now (when using ELF TLS). The
+    // fallback cache will be used instead. This is a workaround for a situation
+    // where the only heap operation performed in a thread would be a free past
+    // the TLS destructors, ending up in initialized thread specific data never
+    // being destroyed properly. Any other heap operation will do a full init.
+    initThreadMaybe(/*MinimalInit=*/true);
     // if (&__sanitizer_free_hook) __sanitizer_free_hook(UserPtr);
-    if (!UserPtr)
+    if (UNLIKELY(!UserPtr))
       return;
-    uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr);
-    if (!IsAligned(ChunkBeg, MinAlignment)) {
+    uptr UserBeg = reinterpret_cast<uptr>(UserPtr);
+    if (UNLIKELY(!IsAligned(UserBeg, MinAlignment))) {
       dieWithMessage("ERROR: attempted to deallocate a chunk not properly "
                      "aligned at address %p\n", UserPtr);
     }
-    ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
-    UnpackedHeader OldHeader;
-    Chunk->loadHeader(&OldHeader);
-    if (OldHeader.State != ChunkAllocated) {
+    ScudoChunk *Chunk = getScudoChunk(UserBeg);
+    UnpackedHeader Header;
+    Chunk->loadHeader(&Header);
+    if (UNLIKELY(Header.State != ChunkAllocated)) {
       dieWithMessage("ERROR: invalid chunk state when deallocating address "
                      "%p\n", UserPtr);
     }
-    uptr UsableSize = Chunk->getUsableSize(&OldHeader);
-    UnpackedHeader NewHeader = OldHeader;
-    NewHeader.State = ChunkQuarantine;
-    Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
     if (DeallocationTypeMismatch) {
       // The deallocation type has to match the allocation one.
-      if (NewHeader.AllocType != Type) {
+      if (Header.AllocType != Type) {
         // With the exception of memalign'd Chunks, that can be still be free'd.
-        if (NewHeader.AllocType != FromMemalign || Type != FromMalloc) {
-          dieWithMessage("ERROR: allocation type mismatch on address %p\n",
-                         Chunk);
+        if (Header.AllocType != FromMemalign || Type != FromMalloc) {
+          dieWithMessage("ERROR: allocation type mismatch when deallocating "
                          "address %p\n", UserPtr);
         }
       }
     }
-    uptr Size = UsableSize - OldHeader.UnusedBytes;
+    uptr Size = Header.FromPrimary ? Header.SizeOrUnusedBytes :
        Chunk->getUsableSize(&Header) - Header.SizeOrUnusedBytes;
     if (DeleteSizeMismatch) {
       if (DeleteSize && DeleteSize != Size) {
         dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n",
-                       Chunk);
+                       UserPtr);
       }
     }
-
-    if (LIKELY(!ThreadTornDown)) {
-      AllocatorQuarantine.Put(&ThreadQuarantineCache,
-                              QuarantineCallback(&Cache), Chunk, UsableSize);
-    } else {
-      SpinMutexLock l(&FallbackMutex);
-      AllocatorQuarantine.Put(&FallbackQuarantineCache,
-                              QuarantineCallback(&FallbackAllocatorCache),
-                              Chunk, UsableSize);
-    }
+    quarantineOrDeallocateChunk(Chunk, &Header, Size);
   }
   // Reallocates a chunk. We can save on a new allocation if the new requested
   // size still fits in the chunk.
   void *reallocate(void *OldPtr, uptr NewSize) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    uptr ChunkBeg = reinterpret_cast<uptr>(OldPtr);
-    ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    initThreadMaybe();
+    uptr UserBeg = reinterpret_cast<uptr>(OldPtr);
+    if (UNLIKELY(!IsAligned(UserBeg, MinAlignment))) {
+      dieWithMessage("ERROR: attempted to reallocate a chunk not properly "
+                     "aligned at address %p\n", OldPtr);
+    }
+    ScudoChunk *Chunk = getScudoChunk(UserBeg);
     UnpackedHeader OldHeader;
     Chunk->loadHeader(&OldHeader);
-    if (OldHeader.State != ChunkAllocated) {
+    if (UNLIKELY(OldHeader.State != ChunkAllocated)) {
       dieWithMessage("ERROR: invalid chunk state when reallocating address "
                      "%p\n", OldPtr);
     }
-    uptr Size = Chunk->getUsableSize(&OldHeader);
-    if (OldHeader.AllocType != FromMalloc) {
-      dieWithMessage("ERROR: invalid chunk type when reallocating address %p\n",
                     Chunk);
+    if (DeallocationTypeMismatch) {
+      if (UNLIKELY(OldHeader.AllocType != FromMalloc)) {
+        dieWithMessage("ERROR: allocation type mismatch when reallocating "
+                       "address %p\n", OldPtr);
+      }
     }
-    UnpackedHeader NewHeader = OldHeader;
-    // The new size still fits in the current chunk.
-    if (NewSize <= Size) {
-      NewHeader.UnusedBytes = Size - NewSize;
+    uptr UsableSize = Chunk->getUsableSize(&OldHeader);
+    // The new size still fits in the current chunk, and the size difference
+    // is reasonable.
+    if (NewSize <= UsableSize &&
+        (UsableSize - NewSize) < (SizeClassMap::kMaxSize / 2)) {
+      UnpackedHeader NewHeader = OldHeader;
+      NewHeader.SizeOrUnusedBytes =
          OldHeader.FromPrimary ? NewSize : UsableSize - NewSize;
       Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
       return OldPtr;
     }
@@ -529,36 +534,25 @@ struct Allocator {
     // old one.
     void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc);
     if (NewPtr) {
-      uptr OldSize = Size - OldHeader.UnusedBytes;
-      memcpy(NewPtr, OldPtr, Min(NewSize, OldSize));
-      NewHeader.State = ChunkQuarantine;
-      Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
-      if (LIKELY(!ThreadTornDown)) {
-        AllocatorQuarantine.Put(&ThreadQuarantineCache,
-                                QuarantineCallback(&Cache), Chunk, Size);
-      } else {
-        SpinMutexLock l(&FallbackMutex);
-        AllocatorQuarantine.Put(&FallbackQuarantineCache,
-                                QuarantineCallback(&FallbackAllocatorCache),
-                                Chunk, Size);
-      }
+      uptr OldSize = OldHeader.FromPrimary ? OldHeader.SizeOrUnusedBytes :
          UsableSize - OldHeader.SizeOrUnusedBytes;
+      memcpy(NewPtr, OldPtr, Min(NewSize, UsableSize));
+      quarantineOrDeallocateChunk(Chunk, &OldHeader, OldSize);
     }
     return NewPtr;
   }
   // Helper function that returns the actual usable size of a chunk.
   uptr getUsableSize(const void *Ptr) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    if (!Ptr)
+    initThreadMaybe();
+    if (UNLIKELY(!Ptr))
       return 0;
-    uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
-    ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    uptr UserBeg = reinterpret_cast<uptr>(Ptr);
+    ScudoChunk *Chunk = getScudoChunk(UserBeg);
     UnpackedHeader Header;
     Chunk->loadHeader(&Header);
     // Getting the usable size of a chunk only makes sense if it's allocated.
-    if (Header.State != ChunkAllocated) {
+    if (UNLIKELY(Header.State != ChunkAllocated)) {
       dieWithMessage("ERROR: invalid chunk state when sizing address %p\n",
                      Ptr);
     }
@@ -566,48 +560,54 @@ struct Allocator {
   }
   void *calloc(uptr NMemB, uptr Size) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    uptr Total = NMemB * Size;
-    if (Size != 0 && Total / Size != NMemB) // Overflow check
-      return BackendAllocator.ReturnNullOrDieOnBadRequest();
-    void *Ptr = allocate(Total, MinAlignment, FromMalloc);
-    // If ZeroContents, the content of the chunk has already been zero'd out.
-    if (!ZeroContents && Ptr && BackendAllocator.FromPrimary(Ptr))
-      memset(Ptr, 0, getUsableSize(Ptr));
-    return Ptr;
+    initThreadMaybe();
+    if (UNLIKELY(CheckForCallocOverflow(NMemB, Size)))
+      return FailureHandler::OnBadRequest();
+    return allocate(NMemB * Size, MinAlignment, FromMalloc, true);
   }
-  void drainQuarantine() {
-    AllocatorQuarantine.Drain(&ThreadQuarantineCache,
-                              QuarantineCallback(&Cache));
+  void commitBack(ScudoTSD *TSD) {
+    AllocatorQuarantine.Drain(getQuarantineCache(TSD),
+                              QuarantineCallback(&TSD->Cache));
+    BackendAllocator.destroyCache(&TSD->Cache);
   }
   uptr getStats(AllocatorStat StatType) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
+    initThreadMaybe();
     uptr stats[AllocatorStatCount];
-    BackendAllocator.GetStats(stats);
+    BackendAllocator.getStats(stats);
     return stats[StatType];
   }
+
+  void *handleBadRequest() {
+    initThreadMaybe();
+    return FailureHandler::OnBadRequest();
+  }
 };
-static Allocator Instance(LINKER_INITIALIZED);
+static ScudoAllocator Instance(LINKER_INITIALIZED);
-static ScudoAllocator &getAllocator() {
+static ScudoBackendAllocator &getBackendAllocator() {
   return Instance.BackendAllocator;
 }
-void initAllocator(const AllocatorOptions &Options) {
+static void initScudoInternal(const AllocatorOptions &Options) {
   Instance.init(Options);
 }
-void drainQuarantine() {
-  Instance.drainQuarantine();
+void ScudoTSD::init(bool Shared) {
+  UnlockRequired = Shared;
+  getBackendAllocator().initCache(&Cache);
+  Prng.init();
+  memset(QuarantineCachePlaceHolder, 0, sizeof(QuarantineCachePlaceHolder));
+}
+
+void ScudoTSD::commitBack() {
+  Instance.commitBack(this);
 }
 void *scudoMalloc(uptr Size, AllocType Type) {
-  return Instance.allocate(Size, MinAlignment, Type);
+  return SetErrnoOnNull(Instance.allocate(Size, MinAlignment, Type));
 }
 void scudoFree(void *Ptr, AllocType Type) {
@@ -620,47 +620,60 @@ void scudoSizedFree(void *Ptr, uptr Size, AllocType Type) {
 void *scudoRealloc(void *Ptr, uptr Size) {
   if (!Ptr)
-    return Instance.allocate(Size, MinAlignment, FromMalloc);
+    return SetErrnoOnNull(Instance.allocate(Size, MinAlignment, FromMalloc));
   if (Size == 0) {
     Instance.deallocate(Ptr, 0, FromMalloc);
     return nullptr;
   }
-  return Instance.reallocate(Ptr, Size);
+  return SetErrnoOnNull(Instance.reallocate(Ptr, Size));
 }
 void *scudoCalloc(uptr NMemB, uptr Size) {
-  return Instance.calloc(NMemB, Size);
+  return SetErrnoOnNull(Instance.calloc(NMemB, Size));
 }
 void *scudoValloc(uptr Size) {
-  return Instance.allocate(Size, GetPageSizeCached(), FromMemalign);
-}
-
-void *scudoMemalign(uptr Alignment, uptr Size) {
-  return Instance.allocate(Size, Alignment, FromMemalign);
+  return SetErrnoOnNull(
+      Instance.allocate(Size, GetPageSizeCached(), FromMemalign));
 }
 void *scudoPvalloc(uptr Size) {
   uptr PageSize = GetPageSizeCached();
-  Size = RoundUpTo(Size, PageSize);
-  if (Size == 0) {
-    // pvalloc(0) should allocate one page.
-    Size = PageSize;
+  if (UNLIKELY(CheckForPvallocOverflow(Size, PageSize))) {
+    errno = ENOMEM;
+    return Instance.handleBadRequest();
   }
-  return Instance.allocate(Size, PageSize, FromMemalign);
+  // pvalloc(0) should allocate one page.
+  Size = Size ? RoundUpTo(Size, PageSize) : PageSize;
+  return SetErrnoOnNull(Instance.allocate(Size, PageSize, FromMemalign));
+}
+
+void *scudoMemalign(uptr Alignment, uptr Size) {
+  if (UNLIKELY(!IsPowerOfTwo(Alignment))) {
+    errno = EINVAL;
+    return Instance.handleBadRequest();
+  }
+  return SetErrnoOnNull(Instance.allocate(Size, Alignment, FromMemalign));
 }
 int scudoPosixMemalign(void **MemPtr, uptr Alignment, uptr Size) {
-  *MemPtr = Instance.allocate(Size, Alignment, FromMemalign);
+  if (UNLIKELY(!CheckPosixMemalignAlignment(Alignment))) {
+    Instance.handleBadRequest();
+    return EINVAL;
+  }
+  void *Ptr = Instance.allocate(Size, Alignment, FromMemalign);
+  if (UNLIKELY(!Ptr))
+    return ENOMEM;
+  *MemPtr = Ptr;
   return 0;
 }
 void *scudoAlignedAlloc(uptr Alignment, uptr Size) {
-  // size must be a multiple of the alignment. To avoid a division, we first
-  // make sure that alignment is a power of 2.
-  CHECK(IsPowerOfTwo(Alignment));
-  CHECK_EQ((Size & (Alignment - 1)), 0);
-  return Instance.allocate(Size, Alignment, FromMalloc);
+  if (UNLIKELY(!CheckAlignedAllocAlignmentAndSize(Alignment, Size))) {
+    errno = EINVAL;
+    return Instance.handleBadRequest();
+  }
+  return SetErrnoOnNull(Instance.allocate(Size, Alignment, FromMalloc));
 }
 uptr scudoMallocUsableSize(void *Ptr) {