From 73d1e17b3afc7d5e56184f90bf819dc64956448a Mon Sep 17 00:00:00 2001
From: Mathieu Chartier
Date: Fri, 11 Apr 2014 17:53:48 -0700
Subject: Enable reading page map without lock in RosAlloc::BulkFree

Enabling this flag greatly reduces how much time is spent in the GC. It was
not done previously since it was regressing MemAllocTest. With these RosAlloc
changes, the benchmark score no longer regresses after we enable the flag.

Changed Run::AllocSlot to only have one mode of allocation. The new mode is
finding the first free bit in the bitmap. This was previously the slow path
but is now the fast path. Some optimizations which enabled this include
always having the alloc bitmap bits which correspond to invalid slots set
to 1. This prevents us from needing a bounds check since we will never end
up allocating there.

Changed revoking a thread local buffer to point to an invalid run. The
invalid run is just a run which always has all the allocation bits set to 1.
When a thread attempts to do a thread local allocation from here, it will
always fail and go down the slow path. This eliminates the need for a null
check for revoked runs.

Changed zeroing of memory to happen during free; AllocPages should always
return zeroed memory.

Added prefetching which happens when we allocate a run.

Some refactoring to reduce duplicated code.

Ergonomics changes: Changed kStickyGcThroughputAdjustment to 1.0; this helps
reduce GC time.

Measurements (3 samples per benchmark):
Before: MemAllocTest scores: 3463, 3445, 3431
EvaluateAndApplyChanges score | total GC time
Iter 1: 3485, 23.602436s
Iter 2: 3434, 22.499882s
Iter 3: 3483, 23.253274s

After: MemAllocTest scores: 3495, 3417, 3409
EvaluateAndApplyChanges score | total GC time:
Iter 1: 3375, 17.463462s
Iter 2: 3358, 16.185188s
Iter 3: 3367, 15.822312s

Bug: 8788501
Bug: 11790317
Bug: 9986565

Change-Id: Ifd273a054824028dabed27c07c081dde1816f93c
---
 runtime/base/mutex.cc                 |  22 +-
 runtime/gc/accounting/space_bitmap.cc |  14 +-
 runtime/gc/accounting/space_bitmap.h  |   3 +
 runtime/gc/allocator/rosalloc-inl.h   |   2 +-
 runtime/gc/allocator/rosalloc.cc      | 651 ++++++++++++++++++----------------
 runtime/gc/allocator/rosalloc.h       |  68 +++-
 runtime/gc/heap.cc                    |  41 ++-
 runtime/gc/heap.h                     |  10 +-
 runtime/gc/space/rosalloc_space.cc    |   2 +-
 runtime/gc/space/space_test.h         |  17 +-
 runtime/thread.cc                     |   3 +-
 11 files changed, 451 insertions(+), 382 deletions(-)

diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc index fdd0249..2bc17bf 100644 --- a/runtime/base/mutex.cc +++ b/runtime/base/mutex.cc @@ -206,16 +206,16 @@ void BaseMutex::DumpContention(std::ostream& os) const { os << "never contended"; } else { os << "contended " << contention_count - << " times, average wait of contender " << PrettyDuration(wait_time / contention_count); + << " total wait of contender " << PrettyDuration(wait_time) + << " average " << PrettyDuration(wait_time / contention_count); SafeMap most_common_blocker; SafeMap most_common_blocked; - typedef SafeMap::const_iterator It; for (size_t i = 0; i < kContentionLogSize; ++i) { uint64_t blocked_tid = log[i].blocked_tid; uint64_t owner_tid = log[i].owner_tid; uint32_t count = log[i].count; if (count > 0) { - It it = most_common_blocked.find(blocked_tid); + auto it = most_common_blocked.find(blocked_tid); if (it != most_common_blocked.end()) { most_common_blocked.Overwrite(blocked_tid, it->second + count); } else { @@ -231,10 +231,10 @@ void BaseMutex::DumpContention(std::ostream& os) const { } uint64_t max_tid = 0; size_t max_tid_count = 0; - for (It it = 
most_common_blocked.begin(); it != most_common_blocked.end(); ++it) { - if (it->second > max_tid_count) { - max_tid = it->first; - max_tid_count = it->second; + for (const auto& pair : most_common_blocked) { + if (pair.second > max_tid_count) { + max_tid = pair.first; + max_tid_count = pair.second; } } if (max_tid != 0) { @@ -242,10 +242,10 @@ void BaseMutex::DumpContention(std::ostream& os) const { } max_tid = 0; max_tid_count = 0; - for (It it = most_common_blocker.begin(); it != most_common_blocker.end(); ++it) { - if (it->second > max_tid_count) { - max_tid = it->first; - max_tid_count = it->second; + for (const auto& pair : most_common_blocker) { + if (pair.second > max_tid_count) { + max_tid = pair.first; + max_tid_count = pair.second; } } if (max_tid != 0) { diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc index 31a1537..66f9a3a 100644 --- a/runtime/gc/accounting/space_bitmap.cc +++ b/runtime/gc/accounting/space_bitmap.cc @@ -21,13 +21,17 @@ namespace gc { namespace accounting { template +size_t SpaceBitmap::ComputeBitmapSize(uint64_t capacity) { + const uint64_t kBytesCoveredPerWord = kAlignment * kBitsPerWord; + return (RoundUp(capacity, kBytesCoveredPerWord) / kBytesCoveredPerWord) * kWordSize; +} + +template SpaceBitmap* SpaceBitmap::CreateFromMemMap( const std::string& name, MemMap* mem_map, byte* heap_begin, size_t heap_capacity) { CHECK(mem_map != nullptr); uword* bitmap_begin = reinterpret_cast(mem_map->Begin()); - const uint64_t kBytesCoveredPerWord = kAlignment * kBitsPerWord; - size_t bitmap_size = (RoundUp(static_cast(heap_capacity), kBytesCoveredPerWord) / - kBytesCoveredPerWord) * kWordSize; + const size_t bitmap_size = ComputeBitmapSize(heap_capacity); return new SpaceBitmap(name, mem_map, bitmap_begin, bitmap_size, heap_begin); } @@ -45,9 +49,7 @@ template SpaceBitmap* SpaceBitmap::Create( const std::string& name, byte* heap_begin, size_t heap_capacity) { // Round up since heap_capacity is not necessarily a multiple of kAlignment * kBitsPerWord. - const uint64_t kBytesCoveredPerWord = kAlignment * kBitsPerWord; - size_t bitmap_size = (RoundUp(static_cast(heap_capacity), kBytesCoveredPerWord) / - kBytesCoveredPerWord) * kWordSize; + const size_t bitmap_size = ComputeBitmapSize(heap_capacity); std::string error_msg; UniquePtr mem_map(MemMap::MapAnonymous(name.c_str(), nullptr, bitmap_size, PROT_READ | PROT_WRITE, false, &error_msg)); diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h index df3fd37..5c7cce2 100644 --- a/runtime/gc/accounting/space_bitmap.h +++ b/runtime/gc/accounting/space_bitmap.h @@ -200,6 +200,9 @@ class SpaceBitmap { SpaceBitmap(const std::string& name, MemMap* mem_map, uword* bitmap_begin, size_t bitmap_size, const void* heap_begin); + // Helper function for computing bitmap size based on a 64 bit capacity. + static size_t ComputeBitmapSize(uint64_t capacity); + template bool Modify(const mirror::Object* obj); diff --git a/runtime/gc/allocator/rosalloc-inl.h b/runtime/gc/allocator/rosalloc-inl.h index f395314..ac0f67b 100644 --- a/runtime/gc/allocator/rosalloc-inl.h +++ b/runtime/gc/allocator/rosalloc-inl.h @@ -29,7 +29,7 @@ inline ALWAYS_INLINE void* RosAlloc::Alloc(Thread* self, size_t size, size_t* by } void* m = AllocFromRun(self, size, bytes_allocated); // Check if the returned memory is really all zero. 
- if (kCheckZeroMemory && m != NULL) { + if (kCheckZeroMemory && m != nullptr) { byte* bytes = reinterpret_cast(m); for (size_t i = 0; i < size; ++i) { DCHECK_EQ(bytes[i], 0); diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc index 0f2d6a9..821aa2d 100644 --- a/runtime/gc/allocator/rosalloc.cc +++ b/runtime/gc/allocator/rosalloc.cc @@ -32,6 +32,10 @@ namespace allocator { extern "C" void* art_heap_rosalloc_morecore(RosAlloc* rosalloc, intptr_t increment); +static constexpr bool kUsePrefetchDuringAllocRun = true; +static constexpr bool kPrefetchNewRunDataByZeroing = false; +static constexpr size_t kPrefetchStride = 64; + size_t RosAlloc::bracketSizes[kNumOfSizeBrackets]; size_t RosAlloc::numOfPages[kNumOfSizeBrackets]; size_t RosAlloc::numOfSlots[kNumOfSizeBrackets]; @@ -39,6 +43,9 @@ size_t RosAlloc::headerSizes[kNumOfSizeBrackets]; size_t RosAlloc::bulkFreeBitMapOffsets[kNumOfSizeBrackets]; size_t RosAlloc::threadLocalFreeBitMapOffsets[kNumOfSizeBrackets]; bool RosAlloc::initialized_ = false; +size_t RosAlloc::dedicated_full_run_storage_[kPageSize / sizeof(size_t)] = { 0 }; +RosAlloc::Run* RosAlloc::dedicated_full_run_ = + reinterpret_cast(dedicated_full_run_storage_); RosAlloc::RosAlloc(void* base, size_t capacity, size_t max_capacity, PageReleaseMode page_release_mode, size_t page_release_size_threshold) @@ -62,8 +69,9 @@ RosAlloc::RosAlloc(void* base, size_t capacity, size_t max_capacity, << ", max_capacity=" << std::dec << max_capacity_; memset(current_runs_, 0, sizeof(current_runs_)); for (size_t i = 0; i < kNumOfSizeBrackets; i++) { - size_bracket_locks_[i] = new Mutex("an rosalloc size bracket lock", - kRosAllocBracketLock); + size_bracket_lock_names[i] = + StringPrintf("an rosalloc size bracket %d lock", static_cast(i)); + size_bracket_locks_[i] = new Mutex(size_bracket_lock_names[i].c_str(), kRosAllocBracketLock); } DCHECK_EQ(footprint_, capacity_); size_t num_of_pages = footprint_ / kPageSize; @@ -71,7 +79,7 @@ RosAlloc::RosAlloc(void* base, size_t capacity, size_t max_capacity, std::string error_msg; page_map_mem_map_.reset(MemMap::MapAnonymous("rosalloc page map", NULL, RoundUp(max_num_of_pages, kPageSize), PROT_READ | PROT_WRITE, false, &error_msg)); - CHECK(page_map_mem_map_.get() != NULL) << "Couldn't allocate the page map : " << error_msg; + CHECK(page_map_mem_map_.get() != nullptr) << "Couldn't allocate the page map : " << error_msg; page_map_ = page_map_mem_map_->Begin(); page_map_size_ = num_of_pages; max_page_map_size_ = max_num_of_pages; @@ -103,7 +111,7 @@ void* RosAlloc::AllocPages(Thread* self, size_t num_pages, byte page_map_type) { lock_.AssertHeld(self); DCHECK(page_map_type == kPageMapRun || page_map_type == kPageMapLargeObject); FreePageRun* res = NULL; - size_t req_byte_size = num_pages * kPageSize; + const size_t req_byte_size = num_pages * kPageSize; // Find the lowest address free page run that's large enough. for (auto it = free_page_runs_.begin(); it != free_page_runs_.end(); ) { FreePageRun* fpr = *it; @@ -260,8 +268,7 @@ void* RosAlloc::AllocPages(Thread* self, size_t num_pages, byte page_map_type) { break; } if (kIsDebugBuild) { - // Clear the first page which isn't madvised away in the debug - // build for the magic number. + // Clear the first page since it is not madvised due to the magic number. 
memset(res, 0, kPageSize); } if (kTraceRosAlloc) { @@ -279,7 +286,7 @@ void* RosAlloc::AllocPages(Thread* self, size_t num_pages, byte page_map_type) { return nullptr; } -size_t RosAlloc::FreePages(Thread* self, void* ptr) { +size_t RosAlloc::FreePages(Thread* self, void* ptr, bool already_zero) { lock_.AssertHeld(self); size_t pm_idx = ToPageMapIndex(ptr); DCHECK_LT(pm_idx, page_map_size_); @@ -310,10 +317,21 @@ size_t RosAlloc::FreePages(Thread* self, void* ptr) { num_pages++; idx++; } + const size_t byte_size = num_pages * kPageSize; + if (already_zero) { + if (kCheckZeroMemory) { + const uword* word_ptr = reinterpret_cast(ptr); + for (size_t i = 0; i < byte_size / sizeof(uword); ++i) { + CHECK_EQ(word_ptr[i], 0U) << "words don't match at index " << i; + } + } + } else if (!DoesReleaseAllPages()) { + memset(ptr, 0, byte_size); + } if (kTraceRosAlloc) { LOG(INFO) << "RosAlloc::FreePages() : 0x" << std::hex << reinterpret_cast(ptr) - << "-0x" << (reinterpret_cast(ptr) + num_pages * kPageSize) + << "-0x" << (reinterpret_cast(ptr) + byte_size) << "(" << std::dec << (num_pages * kPageSize) << ")"; } @@ -322,8 +340,8 @@ size_t RosAlloc::FreePages(Thread* self, void* ptr) { if (kIsDebugBuild) { fpr->magic_num_ = kMagicNumFree; } - fpr->SetByteSize(this, num_pages * kPageSize); - DCHECK_EQ(fpr->ByteSize(this) % kPageSize, static_cast(0)); + fpr->SetByteSize(this, byte_size); + DCHECK(IsAligned(fpr->ByteSize(this))); DCHECK(free_page_runs_.find(fpr) == free_page_runs_.end()); if (!free_page_runs_.empty()) { @@ -349,6 +367,10 @@ size_t RosAlloc::FreePages(Thread* self, void* ptr) { if (kTraceRosAlloc) { LOG(INFO) << "Success"; } + // Clear magic num since this is no longer the start of a free page run. + if (kIsDebugBuild) { + h->magic_num_ = 0; + } free_page_runs_.erase(it++); if (kTraceRosAlloc) { LOG(INFO) << "RosAlloc::FreePages() : (coalesce) Erased run 0x" << std::hex @@ -395,6 +417,10 @@ size_t RosAlloc::FreePages(Thread* self, void* ptr) { } l->SetByteSize(this, l->ByteSize(this) + fpr->ByteSize(this)); DCHECK_EQ(l->ByteSize(this) % kPageSize, static_cast(0)); + // Clear magic num since this is no longer the start of a free page run. + if (kIsDebugBuild) { + fpr->magic_num_ = 0; + } fpr = l; } else { // Not adjacent. Stop. @@ -422,7 +448,7 @@ size_t RosAlloc::FreePages(Thread* self, void* ptr) { LOG(INFO) << "RosAlloc::FreePages() : Inserted run 0x" << std::hex << reinterpret_cast(fpr) << " into free_page_runs_"; } - return num_pages; + return byte_size; } void* RosAlloc::AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated) { @@ -439,23 +465,19 @@ void* RosAlloc::AllocLargeObject(Thread* self, size_t size, size_t* bytes_alloca } return nullptr; } - if (bytes_allocated != NULL) { - *bytes_allocated = num_pages * kPageSize; - } + const size_t total_bytes = num_pages * kPageSize; + *bytes_allocated = total_bytes; if (kTraceRosAlloc) { LOG(INFO) << "RosAlloc::AllocLargeObject() : 0x" << std::hex << reinterpret_cast(r) << "-0x" << (reinterpret_cast(r) + num_pages * kPageSize) << "(" << std::dec << (num_pages * kPageSize) << ")"; } - if (!DoesReleaseAllPages()) { - // If it does not release all pages, pages may not be zeroed out. - memset(r, 0, size); - } // Check if the returned memory is really all zero. 
if (kCheckZeroMemory) { - byte* bytes = reinterpret_cast(r); - for (size_t i = 0; i < size; ++i) { - DCHECK_EQ(bytes[i], 0); + CHECK_EQ(total_bytes % sizeof(uword), 0U); + const uword* words = reinterpret_cast(r); + for (size_t i = 0; i < total_bytes / sizeof(uword); ++i) { + CHECK_EQ(words[i], 0U); } } return r; @@ -479,7 +501,7 @@ size_t RosAlloc::FreeInternal(Thread* self, void* ptr) { LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx]; return 0; case kPageMapLargeObject: - return FreePages(self, ptr) * kPageSize; + return FreePages(self, ptr, false); case kPageMapLargeObjectPart: LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx]; return 0; @@ -503,9 +525,7 @@ size_t RosAlloc::FreeInternal(Thread* self, void* ptr) { } } DCHECK(run != nullptr); - const size_t size = IndexToBracketSize(run->size_bracket_idx_); - FreeFromRun(self, ptr, run); - return size; + return FreeFromRun(self, ptr, run); } size_t RosAlloc::Free(Thread* self, void* ptr) { @@ -513,42 +533,57 @@ size_t RosAlloc::Free(Thread* self, void* ptr) { return FreeInternal(self, ptr); } -RosAlloc::Run* RosAlloc::RefillRun(Thread* self, size_t idx) { - Run* new_run; - size_t num_pages = numOfPages[idx]; - // Get the lowest address non-full run from the binary tree. - Run* temp = NULL; - std::set* bt = &non_full_runs_[idx]; - std::set::iterator found = bt->lower_bound(temp); - if (found != bt->end()) { - // If there's one, use it as the current run. - Run* non_full_run = *found; - DCHECK(non_full_run != NULL); - new_run = non_full_run; - DCHECK_EQ(new_run->is_thread_local_, 0); - bt->erase(found); - DCHECK_EQ(non_full_run->is_thread_local_, 0); - } else { - // If there's none, allocate a new run and use it as the - // current run. - { - MutexLock mu(self, lock_); - new_run = reinterpret_cast(AllocPages(self, num_pages, kPageMapRun)); - } - if (new_run == NULL) { - return NULL; - } +RosAlloc::Run* RosAlloc::AllocRun(Thread* self, size_t idx) { + RosAlloc::Run* new_run = nullptr; + { + MutexLock mu(self, lock_); + new_run = reinterpret_cast(AllocPages(self, numOfPages[idx], kPageMapRun)); + } + if (LIKELY(new_run != nullptr)) { if (kIsDebugBuild) { new_run->magic_num_ = kMagicNum; } new_run->size_bracket_idx_ = idx; - new_run->top_slot_idx_ = 0; - new_run->ClearBitMaps(); - new_run->to_be_bulk_freed_ = false; + new_run->SetAllocBitMapBitsForInvalidSlots(); + DCHECK(!new_run->IsThreadLocal()); + DCHECK_EQ(new_run->first_search_vec_idx_, 0U); + DCHECK(!new_run->to_be_bulk_freed_); + if (kUsePrefetchDuringAllocRun && idx <= kMaxThreadLocalSizeBracketIdx) { + // Take ownership of the cache lines if we are likely to be thread local run. + if (kPrefetchNewRunDataByZeroing) { + // Zeroing the data is sometimes faster than prefetching but it increases memory usage + // since we end up dirtying zero pages which may have been madvised. + new_run->ZeroData(); + } else { + const size_t num_of_slots = numOfSlots[idx]; + const size_t bracket_size = bracketSizes[idx]; + const size_t num_of_bytes = num_of_slots * bracket_size; + byte* begin = reinterpret_cast(new_run) + headerSizes[idx]; + for (size_t i = 0; i < num_of_bytes; i += kPrefetchStride) { + __builtin_prefetch(begin + i); + } + } + } } return new_run; } +RosAlloc::Run* RosAlloc::RefillRun(Thread* self, size_t idx) { + // Get the lowest address non-full run from the binary tree. + std::set* const bt = &non_full_runs_[idx]; + if (!bt->empty()) { + // If there's one, use it as the current run. 
+ auto it = bt->begin(); + Run* non_full_run = *it; + DCHECK(non_full_run != nullptr); + DCHECK(!non_full_run->IsThreadLocal()); + bt->erase(it); + return non_full_run; + } + // If there's none, allocate a new run and use it as the current run. + return AllocRun(self, idx); +} + void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) { DCHECK_LE(size, kLargeSizeThreshold); size_t bracket_size; @@ -564,66 +599,62 @@ void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) if (LIKELY(idx <= kMaxThreadLocalSizeBracketIdx)) { // Use a thread-local run. Run* thread_local_run = reinterpret_cast(self->GetRosAllocRun(idx)); - if (UNLIKELY(thread_local_run == NULL)) { - MutexLock mu(self, *size_bracket_locks_[idx]); - thread_local_run = RefillRun(self, idx); - if (UNLIKELY(thread_local_run == NULL)) { - return NULL; - } - DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end()); - DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end()); - thread_local_run->is_thread_local_ = 1; - self->SetRosAllocRun(idx, thread_local_run); - DCHECK(!thread_local_run->IsFull()); - } - - DCHECK(thread_local_run != NULL); - DCHECK_NE(thread_local_run->is_thread_local_, 0); + // Allow invalid since this will always fail the allocation. + DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end()); + DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end()); + DCHECK(thread_local_run != nullptr); + DCHECK(thread_local_run->IsThreadLocal() || thread_local_run == dedicated_full_run_); slot_addr = thread_local_run->AllocSlot(); - - if (UNLIKELY(slot_addr == NULL)) { + // The allocation must fail if the run is invalid. + DCHECK(thread_local_run != dedicated_full_run_ || slot_addr == nullptr) + << "allocated from an invalid run"; + if (UNLIKELY(slot_addr == nullptr)) { // The run got full. Try to free slots. DCHECK(thread_local_run->IsFull()); MutexLock mu(self, *size_bracket_locks_[idx]); bool is_all_free_after_merge; + // This is safe to do for the dedicated_full_run_ since the bitmaps are empty. if (thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&is_all_free_after_merge)) { + DCHECK_NE(thread_local_run, dedicated_full_run_); // Some slot got freed. Keep it. DCHECK(!thread_local_run->IsFull()); DCHECK_EQ(is_all_free_after_merge, thread_local_run->IsAllFree()); if (is_all_free_after_merge) { - // Reinstate the bump index mode if it's all free. - DCHECK_EQ(thread_local_run->top_slot_idx_, numOfSlots[idx]); - thread_local_run->top_slot_idx_ = 0; + // Check that the bitmap idx is back at 0 if it's all free. + DCHECK_EQ(thread_local_run->first_search_vec_idx_, 0U); } } else { // No slots got freed. Try to refill the thread-local run. 
DCHECK(thread_local_run->IsFull()); - self->SetRosAllocRun(idx, nullptr); - thread_local_run->is_thread_local_ = 0; - if (kIsDebugBuild) { - full_runs_[idx].insert(thread_local_run); - if (kTraceRosAlloc) { - LOG(INFO) << "RosAlloc::AllocFromRun() : Inserted run 0x" << std::hex - << reinterpret_cast(thread_local_run) - << " into full_runs_[" << std::dec << idx << "]"; + if (thread_local_run != dedicated_full_run_) { + self->SetRosAllocRun(idx, dedicated_full_run_); + thread_local_run->SetIsThreadLocal(false); + if (kIsDebugBuild) { + full_runs_[idx].insert(thread_local_run); + if (kTraceRosAlloc) { + LOG(INFO) << "RosAlloc::AllocFromRun() : Inserted run 0x" << std::hex + << reinterpret_cast(thread_local_run) + << " into full_runs_[" << std::dec << idx << "]"; + } } + DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end()); + DCHECK(full_runs_[idx].find(thread_local_run) != full_runs_[idx].end()); } - DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end()); - DCHECK(full_runs_[idx].find(thread_local_run) != full_runs_[idx].end()); + thread_local_run = RefillRun(self, idx); if (UNLIKELY(thread_local_run == NULL)) { return NULL; } DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end()); DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end()); - thread_local_run->is_thread_local_ = 1; + thread_local_run->SetIsThreadLocal(true); self->SetRosAllocRun(idx, thread_local_run); DCHECK(!thread_local_run->IsFull()); } DCHECK(thread_local_run != NULL); DCHECK(!thread_local_run->IsFull()); - DCHECK_NE(thread_local_run->is_thread_local_, 0); + DCHECK(thread_local_run->IsThreadLocal()); slot_addr = thread_local_run->AllocSlot(); // Must succeed now with a new run. DCHECK(slot_addr != NULL); @@ -644,7 +675,7 @@ void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) } DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end()); DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end()); - current_run->is_thread_local_ = 0; + current_run->SetIsThreadLocal(false); current_runs_[idx] = current_run; DCHECK(!current_run->IsFull()); } @@ -671,7 +702,7 @@ void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) DCHECK(current_run != NULL); DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end()); DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end()); - current_run->is_thread_local_ = 0; + current_run->SetIsThreadLocal(false); current_runs_[idx] = current_run; DCHECK(!current_run->IsFull()); slot_addr = current_run->AllocSlot(); @@ -684,27 +715,27 @@ void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) << "(" << std::dec << (bracket_size) << ")"; } } - if (LIKELY(bytes_allocated != NULL)) { - *bytes_allocated = bracket_size; - } - memset(slot_addr, 0, size); + DCHECK(bytes_allocated != nullptr); + *bytes_allocated = bracket_size; + // Caller verifies that it is all 0. 
return slot_addr; } -void RosAlloc::FreeFromRun(Thread* self, void* ptr, Run* run) { +size_t RosAlloc::FreeFromRun(Thread* self, void* ptr, Run* run) { DCHECK_EQ(run->magic_num_, kMagicNum); DCHECK_LT(run, ptr); DCHECK_LT(ptr, run->End()); - size_t idx = run->size_bracket_idx_; - MutexLock mu(self, *size_bracket_locks_[idx]); + const size_t idx = run->size_bracket_idx_; + const size_t bracket_size = bracketSizes[idx]; bool run_was_full = false; + MutexLock mu(self, *size_bracket_locks_[idx]); if (kIsDebugBuild) { run_was_full = run->IsFull(); } if (kTraceRosAlloc) { LOG(INFO) << "RosAlloc::FreeFromRun() : 0x" << std::hex << reinterpret_cast(ptr); } - if (LIKELY(run->is_thread_local_ != 0)) { + if (LIKELY(run->IsThreadLocal())) { // It's a thread-local run. Just mark the thread-local free bit map and return. DCHECK_LE(run->size_bracket_idx_, kMaxThreadLocalSizeBracketIdx); DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end()); @@ -715,7 +746,7 @@ void RosAlloc::FreeFromRun(Thread* self, void* ptr, Run* run) { << reinterpret_cast(run); } // A thread local run will be kept as a thread local even if it's become all free. - return; + return bracket_size; } // Free the slot in the run. run->FreeSlot(ptr); @@ -735,9 +766,10 @@ void RosAlloc::FreeFromRun(Thread* self, void* ptr, Run* run) { } DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end()); DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end()); + run->ZeroHeader(); { MutexLock mu(self, lock_); - FreePages(self, run); + FreePages(self, run, true); } } else { // It is not completely free. If it wasn't the current run or @@ -767,6 +799,7 @@ void RosAlloc::FreeFromRun(Thread* self, void* ptr, Run* run) { } } } + return bracket_size; } std::string RosAlloc::Run::BitMapToStr(uint32_t* bit_map_base, size_t num_vec) { @@ -792,7 +825,7 @@ std::string RosAlloc::Run::Dump() { << " size_bracket_idx=" << idx << " is_thread_local=" << static_cast(is_thread_local_) << " to_be_bulk_freed=" << static_cast(to_be_bulk_freed_) - << " top_slot_idx=" << top_slot_idx_ + << " first_search_vec_idx=" << first_search_vec_idx_ << " alloc_bit_map=" << BitMapToStr(alloc_bit_map_, num_vec) << " bulk_free_bit_map=" << BitMapToStr(BulkFreeBitMap(), num_vec) << " thread_local_bit_map=" << BitMapToStr(ThreadLocalFreeBitMap(), num_vec) @@ -800,64 +833,52 @@ std::string RosAlloc::Run::Dump() { return stream.str(); } -void* RosAlloc::Run::AllocSlot() { - size_t idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - DCHECK_LE(top_slot_idx_, num_slots); - if (LIKELY(top_slot_idx_ < num_slots)) { - // If it's in bump index mode, grab the top slot and increment the top index. - size_t slot_idx = top_slot_idx_; - byte* slot_addr = reinterpret_cast(this) + headerSizes[idx] + slot_idx * bracketSizes[idx]; - if (kTraceRosAlloc) { - LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex << reinterpret_cast(slot_addr) - << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx; - } - top_slot_idx_++; - size_t vec_idx = slot_idx / 32; - size_t vec_off = slot_idx % 32; - uint32_t* vec = &alloc_bit_map_[vec_idx]; - DCHECK_EQ((*vec & (1 << vec_off)), static_cast(0)); - *vec |= 1 << vec_off; - DCHECK_NE((*vec & (1 << vec_off)), static_cast(0)); - return slot_addr; - } - // Not in bump index mode. Search the alloc bit map for an empty slot. 
- size_t num_vec = RoundUp(num_slots, 32) / 32; - size_t slot_idx = 0; - bool found_slot = false; - for (size_t v = 0; v < num_vec; v++) { - uint32_t *vecp = &alloc_bit_map_[v]; - uint32_t ffz1 = __builtin_ffs(~*vecp); - uint32_t ffz; - // TODO: Use LIKELY or UNLIKELY here? - if (LIKELY(ffz1 > 0 && (ffz = ffz1 - 1) + v * 32 < num_slots)) { +inline void* RosAlloc::Run::AllocSlot() { + const size_t idx = size_bracket_idx_; + while (true) { + if (kIsDebugBuild) { + // Make sure that no slots leaked, the bitmap should be full for all previous vectors. + for (size_t i = 0; i < first_search_vec_idx_; ++i) { + CHECK_EQ(~alloc_bit_map_[i], 0U); + } + } + uint32_t* const alloc_bitmap_ptr = &alloc_bit_map_[first_search_vec_idx_]; + uint32_t ffz1 = __builtin_ffs(~*alloc_bitmap_ptr); + if (LIKELY(ffz1 != 0)) { + const uint32_t ffz = ffz1 - 1; + const uint32_t slot_idx = ffz + first_search_vec_idx_ * sizeof(*alloc_bitmap_ptr) * kBitsPerByte; + const uint32_t mask = 1U << ffz; + DCHECK_LT(slot_idx, numOfSlots[idx]) << "out of range"; // Found an empty slot. Set the bit. - DCHECK_EQ((*vecp & (1 << ffz)), static_cast(0)); - *vecp |= (1 << ffz); - DCHECK_NE((*vecp & (1 << ffz)), static_cast(0)); - slot_idx = ffz + v * 32; - found_slot = true; - break; + DCHECK_EQ(*alloc_bitmap_ptr & mask, 0U); + *alloc_bitmap_ptr |= mask; + DCHECK_NE(*alloc_bitmap_ptr & mask, 0U); + byte* slot_addr = reinterpret_cast(this) + headerSizes[idx] + slot_idx * bracketSizes[idx]; + if (kTraceRosAlloc) { + LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex << reinterpret_cast(slot_addr) + << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx; + } + return slot_addr; } - } - if (LIKELY(found_slot)) { - byte* slot_addr = reinterpret_cast(this) + headerSizes[idx] + slot_idx * bracketSizes[idx]; - if (kTraceRosAlloc) { - LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex << reinterpret_cast(slot_addr) - << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx; + const size_t num_words = RoundUp(numOfSlots[idx], 32) / 32; + if (first_search_vec_idx_ + 1 >= num_words) { + DCHECK(IsFull()); + // Already at the last word, return null. + return nullptr; } - return slot_addr; + // Increase the index to the next word and try again. 
+ ++first_search_vec_idx_; } - return NULL; } -inline void RosAlloc::Run::FreeSlot(void* ptr) { - DCHECK_EQ(is_thread_local_, 0); - byte idx = size_bracket_idx_; - size_t offset_from_slot_base = reinterpret_cast(ptr) +void RosAlloc::Run::FreeSlot(void* ptr) { + DCHECK(!IsThreadLocal()); + const byte idx = size_bracket_idx_; + const size_t bracket_size = bracketSizes[idx]; + const size_t offset_from_slot_base = reinterpret_cast(ptr) - (reinterpret_cast(this) + headerSizes[idx]); - DCHECK_EQ(offset_from_slot_base % bracketSizes[idx], static_cast(0)); - size_t slot_idx = offset_from_slot_base / bracketSizes[idx]; + DCHECK_EQ(offset_from_slot_base % bracket_size, static_cast(0)); + size_t slot_idx = offset_from_slot_base / bracket_size; DCHECK_LT(slot_idx, numOfSlots[idx]); size_t vec_idx = slot_idx / 32; if (kIsDebugBuild) { @@ -866,9 +887,14 @@ inline void RosAlloc::Run::FreeSlot(void* ptr) { } size_t vec_off = slot_idx % 32; uint32_t* vec = &alloc_bit_map_[vec_idx]; - DCHECK_NE((*vec & (1 << vec_off)), static_cast(0)); - *vec &= ~(1 << vec_off); - DCHECK_EQ((*vec & (1 << vec_off)), static_cast(0)); + first_search_vec_idx_ = std::min(first_search_vec_idx_, static_cast(vec_idx)); + const uint32_t mask = 1U << vec_off; + DCHECK_NE(*vec & mask, 0U); + *vec &= ~mask; + DCHECK_EQ(*vec & mask, 0U); + // Zero out the memory. + // TODO: Investigate alternate memset since ptr is guaranteed to be aligned to 16. + memset(ptr, 0, bracket_size); if (kTraceRosAlloc) { LOG(INFO) << "RosAlloc::Run::FreeSlot() : 0x" << std::hex << reinterpret_cast(ptr) << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx; @@ -876,11 +902,11 @@ inline void RosAlloc::Run::FreeSlot(void* ptr) { } inline bool RosAlloc::Run::MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_free_after_out) { - DCHECK_NE(is_thread_local_, 0); + DCHECK(IsThreadLocal()); // Free slots in the alloc bit map based on the thread local free bit map. - byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; + const size_t idx = size_bracket_idx_; + const size_t num_of_slots = numOfSlots[idx]; + const size_t num_vec = RoundUp(num_of_slots, 32) / 32; bool changed = false; uint32_t* vecp = &alloc_bit_map_[0]; uint32_t* tl_free_vecp = &ThreadLocalFreeBitMap()[0]; @@ -890,6 +916,7 @@ inline bool RosAlloc::Run::MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_ uint32_t vec_before = *vecp; uint32_t vec_after; if (tl_free_vec != 0) { + first_search_vec_idx_ = std::min(first_search_vec_idx_, static_cast(v)); vec_after = vec_before & ~tl_free_vec; *vecp = vec_after; changed = true; @@ -898,7 +925,13 @@ inline bool RosAlloc::Run::MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_ vec_after = vec_before; } if (vec_after != 0) { - is_all_free_after = false; + if (v == num_vec - 1) { + // Only not all free if a bit other than the mask bits are set. + is_all_free_after = + is_all_free_after && GetBitmapLastVectorMask(num_of_slots, num_vec) == vec_after; + } else { + is_all_free_after = false; + } } DCHECK_EQ(*tl_free_vecp, static_cast(0)); } @@ -909,16 +942,15 @@ inline bool RosAlloc::Run::MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_ } inline void RosAlloc::Run::MergeBulkFreeBitMapIntoAllocBitMap() { - DCHECK_EQ(is_thread_local_, 0); + DCHECK(!IsThreadLocal()); // Free slots in the alloc bit map based on the bulk free bit map. 
- byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; + const size_t num_vec = NumberOfBitmapVectors(); uint32_t* vecp = &alloc_bit_map_[0]; uint32_t* free_vecp = &BulkFreeBitMap()[0]; for (size_t v = 0; v < num_vec; v++, vecp++, free_vecp++) { uint32_t free_vec = *free_vecp; if (free_vec != 0) { + first_search_vec_idx_ = std::min(first_search_vec_idx_, static_cast(v)); *vecp &= ~free_vec; *free_vecp = 0; // clear the bulk free bit map. } @@ -927,11 +959,9 @@ inline void RosAlloc::Run::MergeBulkFreeBitMapIntoAllocBitMap() { } inline void RosAlloc::Run::UnionBulkFreeBitMapToThreadLocalFreeBitMap() { - DCHECK_NE(is_thread_local_, 0); + DCHECK(IsThreadLocal()); // Union the thread local bit map with the bulk free bit map. - byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; + size_t num_vec = NumberOfBitmapVectors(); uint32_t* to_vecp = &ThreadLocalFreeBitMap()[0]; uint32_t* from_vecp = &BulkFreeBitMap()[0]; for (size_t v = 0; v < num_vec; v++, to_vecp++, from_vecp++) { @@ -945,66 +975,71 @@ inline void RosAlloc::Run::UnionBulkFreeBitMapToThreadLocalFreeBitMap() { } inline void RosAlloc::Run::MarkThreadLocalFreeBitMap(void* ptr) { - DCHECK_NE(is_thread_local_, 0); + DCHECK(IsThreadLocal()); MarkFreeBitMapShared(ptr, ThreadLocalFreeBitMap(), "MarkThreadLocalFreeBitMap"); } -inline void RosAlloc::Run::MarkBulkFreeBitMap(void* ptr) { - MarkFreeBitMapShared(ptr, BulkFreeBitMap(), "MarkFreeBitMap"); +inline size_t RosAlloc::Run::MarkBulkFreeBitMap(void* ptr) { + return MarkFreeBitMapShared(ptr, BulkFreeBitMap(), "MarkFreeBitMap"); } -inline void RosAlloc::Run::MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base, - const char* caller_name) { - byte idx = size_bracket_idx_; - size_t offset_from_slot_base = reinterpret_cast(ptr) +inline size_t RosAlloc::Run::MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base, + const char* caller_name) { + const byte idx = size_bracket_idx_; + const size_t offset_from_slot_base = reinterpret_cast(ptr) - (reinterpret_cast(this) + headerSizes[idx]); - DCHECK_EQ(offset_from_slot_base % bracketSizes[idx], static_cast(0)); - size_t slot_idx = offset_from_slot_base / bracketSizes[idx]; + const size_t bracket_size = bracketSizes[idx]; + memset(ptr, 0, bracket_size); + DCHECK_EQ(offset_from_slot_base % bracket_size, static_cast(0)); + size_t slot_idx = offset_from_slot_base / bracket_size; DCHECK_LT(slot_idx, numOfSlots[idx]); size_t vec_idx = slot_idx / 32; if (kIsDebugBuild) { - size_t num_vec = RoundUp(numOfSlots[idx], 32) / 32; + size_t num_vec = NumberOfBitmapVectors(); DCHECK_LT(vec_idx, num_vec); } size_t vec_off = slot_idx % 32; uint32_t* vec = &free_bit_map_base[vec_idx]; - DCHECK_EQ((*vec & (1 << vec_off)), static_cast(0)); - *vec |= 1 << vec_off; - DCHECK_NE((*vec & (1 << vec_off)), static_cast(0)); + const uint32_t mask = 1U << vec_off; + DCHECK_EQ(*vec & mask, 0U); + *vec |= mask; + DCHECK_NE(*vec & mask, 0U); if (kTraceRosAlloc) { LOG(INFO) << "RosAlloc::Run::" << caller_name << "() : 0x" << std::hex << reinterpret_cast(ptr) << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx; } + return bracket_size; +} + +inline uint32_t RosAlloc::Run::GetBitmapLastVectorMask(size_t num_slots, size_t num_vec) { + const size_t kBitsPerVec = 32; + DCHECK_GE(num_slots * kBitsPerVec, num_vec); + size_t remain = num_vec * kBitsPerVec - num_slots; + DCHECK_NE(remain, kBitsPerVec); + return ((1U << remain) - 
1) << (kBitsPerVec - remain); } inline bool RosAlloc::Run::IsAllFree() { - byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; - for (size_t v = 0; v < num_vec; v++) { + const byte idx = size_bracket_idx_; + const size_t num_slots = numOfSlots[idx]; + const size_t num_vec = NumberOfBitmapVectors(); + DCHECK_NE(num_vec, 0U); + // Check the last vector after the loop since it uses a special case for the masked bits. + for (size_t v = 0; v < num_vec - 1; v++) { uint32_t vec = alloc_bit_map_[v]; if (vec != 0) { return false; } } - return true; + // Make sure the last word is equal to the mask, all other bits must be 0. + return alloc_bit_map_[num_vec - 1] == GetBitmapLastVectorMask(num_slots, num_vec); } inline bool RosAlloc::Run::IsFull() { - byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; - size_t slots = 0; - for (size_t v = 0; v < num_vec; v++, slots += 32) { - DCHECK_GE(num_slots, slots); - uint32_t vec = alloc_bit_map_[v]; - uint32_t mask = (num_slots - slots >= 32) ? static_cast(-1) - : (1 << (num_slots - slots)) - 1; - if ((num_slots - slots) >= 32) { - DCHECK_EQ(mask, static_cast(-1)); - } - if (vec != mask) { + const size_t num_vec = NumberOfBitmapVectors(); + for (size_t v = 0; v < num_vec; ++v) { + if (~alloc_bit_map_[v] != 0) { return false; } } @@ -1012,9 +1047,7 @@ inline bool RosAlloc::Run::IsFull() { } inline bool RosAlloc::Run::IsBulkFreeBitmapClean() { - byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; + const size_t num_vec = NumberOfBitmapVectors(); for (size_t v = 0; v < num_vec; v++) { uint32_t vec = BulkFreeBitMap()[v]; if (vec != 0) { @@ -1025,9 +1058,7 @@ inline bool RosAlloc::Run::IsBulkFreeBitmapClean() { } inline bool RosAlloc::Run::IsThreadLocalFreeBitmapClean() { - byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; + const size_t num_vec = NumberOfBitmapVectors(); for (size_t v = 0; v < num_vec; v++) { uint32_t vec = ThreadLocalFreeBitMap()[v]; if (vec != 0) { @@ -1037,11 +1068,31 @@ inline bool RosAlloc::Run::IsThreadLocalFreeBitmapClean() { return true; } -inline void RosAlloc::Run::ClearBitMaps() { - byte idx = size_bracket_idx_; - size_t num_slots = numOfSlots[idx]; - size_t num_vec = RoundUp(num_slots, 32) / 32; - memset(alloc_bit_map_, 0, sizeof(uint32_t) * num_vec * 3); +inline void RosAlloc::Run::SetAllocBitMapBitsForInvalidSlots() { + const size_t idx = size_bracket_idx_; + const size_t num_slots = numOfSlots[idx]; + const size_t num_vec = RoundUp(num_slots, 32) / 32; + DCHECK_NE(num_vec, 0U); + // Make sure to set the bits at the end of the bitmap so that we don't allocate there since they + // don't represent valid slots. + alloc_bit_map_[num_vec - 1] |= GetBitmapLastVectorMask(num_slots, num_vec); +} + +inline void RosAlloc::Run::ZeroHeader() { + const byte idx = size_bracket_idx_; + memset(this, 0, headerSizes[idx]); +} + +inline void RosAlloc::Run::ZeroData() { + const byte idx = size_bracket_idx_; + byte* slot_begin = reinterpret_cast(this) + headerSizes[idx]; + memset(slot_begin, 0, numOfSlots[idx] * bracketSizes[idx]); +} + +inline void RosAlloc::Run::FillAllocBitMap() { + size_t num_vec = NumberOfBitmapVectors(); + memset(alloc_bit_map_, 0xFF, sizeof(uint32_t) * num_vec); + first_search_vec_idx_ = num_vec - 1; // No free bits in any of the bitmap words. 
} void RosAlloc::Run::InspectAllSlots(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg), @@ -1073,7 +1124,7 @@ void RosAlloc::Run::InspectAllSlots(void (*handler)(void* start, void* end, size // lock for better performance, assuming that the existence of an // allocated chunk/pointer being freed in BulkFree() guarantees that // the page map entry won't change. Disabled for now. -static constexpr bool kReadPageMapEntryWithoutLockInBulkFree = false; +static constexpr bool kReadPageMapEntryWithoutLockInBulkFree = true; size_t RosAlloc::BulkFree(Thread* self, void** ptrs, size_t num_ptrs) { size_t freed_bytes = 0; @@ -1096,11 +1147,10 @@ size_t RosAlloc::BulkFree(Thread* self, void** ptrs, size_t num_ptrs) { #endif for (size_t i = 0; i < num_ptrs; i++) { void* ptr = ptrs[i]; - ptrs[i] = NULL; DCHECK_LE(base_, ptr); DCHECK_LT(ptr, base_ + footprint_); size_t pm_idx = RoundDownToPageMapIndex(ptr); - Run* run = NULL; + Run* run = nullptr; if (kReadPageMapEntryWithoutLockInBulkFree) { // Read the page map entries without locking the lock. byte page_map_entry = page_map_[pm_idx]; @@ -1111,106 +1161,74 @@ size_t RosAlloc::BulkFree(Thread* self, void** ptrs, size_t num_ptrs) { } if (LIKELY(page_map_entry == kPageMapRun)) { run = reinterpret_cast(base_ + pm_idx * kPageSize); - DCHECK_EQ(run->magic_num_, kMagicNum); } else if (LIKELY(page_map_entry == kPageMapRunPart)) { size_t pi = pm_idx; - DCHECK(page_map_[pi] == kPageMapRun || page_map_[pi] == kPageMapRunPart); // Find the beginning of the run. - while (page_map_[pi] != kPageMapRun) { - pi--; + do { + --pi; DCHECK_LT(pi, capacity_ / kPageSize); - } - DCHECK_EQ(page_map_[pi], kPageMapRun); + } while (page_map_[pi] != kPageMapRun); run = reinterpret_cast(base_ + pi * kPageSize); - DCHECK_EQ(run->magic_num_, kMagicNum); } else if (page_map_entry == kPageMapLargeObject) { MutexLock mu(self, lock_); - freed_bytes += FreePages(self, ptr) * kPageSize; + freed_bytes += FreePages(self, ptr, false); continue; } else { LOG(FATAL) << "Unreachable - page map type: " << page_map_entry; } - DCHECK(run != nullptr); - // Set the bit in the bulk free bit map. - run->MarkBulkFreeBitMap(ptr); - freed_bytes += IndexToBracketSize(run->size_bracket_idx_); -#ifdef HAVE_ANDROID_OS - if (!run->to_be_bulk_freed_) { - run->to_be_bulk_freed_ = true; - runs.push_back(run); - } -#else - runs.insert(run); -#endif } else { // Read the page map entries with a lock. - bool free_from_run = false; - { - MutexLock mu(self, lock_); - DCHECK_LT(pm_idx, page_map_size_); - byte page_map_entry = page_map_[pm_idx]; - if (kTraceRosAlloc) { - LOG(INFO) << "RosAlloc::BulkFree() : " << std::hex << ptr << ", pm_idx=" - << std::dec << pm_idx - << ", page_map_entry=" << static_cast(page_map_entry); - } - if (LIKELY(page_map_entry == kPageMapRun)) { - free_from_run = true; - run = reinterpret_cast(base_ + pm_idx * kPageSize); - DCHECK_EQ(run->magic_num_, kMagicNum); - } else if (LIKELY(page_map_entry == kPageMapRunPart)) { - free_from_run = true; - size_t pi = pm_idx; - DCHECK(page_map_[pi] == kPageMapRun || page_map_[pi] == kPageMapRunPart); - // Find the beginning of the run. 
- while (page_map_[pi] != kPageMapRun) { - pi--; - DCHECK_LT(pi, capacity_ / kPageSize); - } - DCHECK_EQ(page_map_[pi], kPageMapRun); - run = reinterpret_cast(base_ + pi * kPageSize); - DCHECK_EQ(run->magic_num_, kMagicNum); - } else if (page_map_entry == kPageMapLargeObject) { - freed_bytes += FreePages(self, ptr) * kPageSize; - } else { - LOG(FATAL) << "Unreachable - page map type: " << page_map_entry; - } + MutexLock mu(self, lock_); + DCHECK_LT(pm_idx, page_map_size_); + byte page_map_entry = page_map_[pm_idx]; + if (kTraceRosAlloc) { + LOG(INFO) << "RosAlloc::BulkFree() : " << std::hex << ptr << ", pm_idx=" + << std::dec << pm_idx + << ", page_map_entry=" << static_cast(page_map_entry); + } + if (LIKELY(page_map_entry == kPageMapRun)) { + run = reinterpret_cast(base_ + pm_idx * kPageSize); + } else if (LIKELY(page_map_entry == kPageMapRunPart)) { + size_t pi = pm_idx; + // Find the beginning of the run. + do { + --pi; + DCHECK_LT(pi, capacity_ / kPageSize); + } while (page_map_[pi] != kPageMapRun); + run = reinterpret_cast(base_ + pi * kPageSize); + } else if (page_map_entry == kPageMapLargeObject) { + freed_bytes += FreePages(self, ptr, false); + continue; + } else { + LOG(FATAL) << "Unreachable - page map type: " << page_map_entry; } - if (LIKELY(free_from_run)) { - DCHECK(run != NULL); - // Set the bit in the bulk free bit map. - run->MarkBulkFreeBitMap(ptr); - freed_bytes += IndexToBracketSize(run->size_bracket_idx_); + } + DCHECK(run != nullptr); + DCHECK_EQ(run->magic_num_, kMagicNum); + // Set the bit in the bulk free bit map. + freed_bytes += run->MarkBulkFreeBitMap(ptr); #ifdef HAVE_ANDROID_OS - if (!run->to_be_bulk_freed_) { - run->to_be_bulk_freed_ = true; - runs.push_back(run); - } + if (!run->to_be_bulk_freed_) { + run->to_be_bulk_freed_ = true; + runs.push_back(run); + } #else - runs.insert(run); + runs.insert(run); #endif - } - } } // Now, iterate over the affected runs and update the alloc bit map // based on the bulk free bit map (for non-thread-local runs) and // union the bulk free bit map into the thread-local free bit map // (for thread-local runs.) -#ifdef HAVE_ANDROID_OS - typedef std::vector::iterator It; -#else - typedef hash_set::iterator It; -#endif - for (It it = runs.begin(); it != runs.end(); ++it) { - Run* run = *it; + for (Run* run : runs) { #ifdef HAVE_ANDROID_OS DCHECK(run->to_be_bulk_freed_); run->to_be_bulk_freed_ = false; #endif size_t idx = run->size_bracket_idx_; MutexLock mu(self, *size_bracket_locks_[idx]); - if (run->is_thread_local_ != 0) { + if (run->IsThreadLocal()) { DCHECK_LE(run->size_bracket_idx_, kMaxThreadLocalSizeBracketIdx); DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end()); DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end()); @@ -1219,7 +1237,7 @@ size_t RosAlloc::BulkFree(Thread* self, void** ptrs, size_t num_ptrs) { LOG(INFO) << "RosAlloc::BulkFree() : Freed slot(s) in a thread local run 0x" << std::hex << reinterpret_cast(run); } - DCHECK_NE(run->is_thread_local_, 0); + DCHECK(run->IsThreadLocal()); // A thread local run will be kept as a thread local even if // it's become all free. } else { @@ -1269,8 +1287,9 @@ size_t RosAlloc::BulkFree(Thread* self, void** ptrs, size_t num_ptrs) { DCHECK(non_full_runs->find(run) == non_full_runs->end()); } if (!run_was_current) { + run->ZeroHeader(); MutexLock mu(self, lock_); - FreePages(self, run); + FreePages(self, run, true); } } else { // It is not completely free. 
If it wasn't the current run or @@ -1381,7 +1400,7 @@ std::string RosAlloc::DumpPageMap() { stream << "[" << i << "]=Run (start)" << " idx=" << idx << " numOfPages=" << numOfPages[idx] - << " thread_local=" << static_cast(run->is_thread_local_) + << " is_thread_local=" << run->is_thread_local_ << " is_all_free=" << (run->IsAllFree() ? 1 : 0) << std::endl; break; @@ -1556,6 +1575,8 @@ void RosAlloc::InspectAll(void (*handler)(void* start, void* end, size_t used_by // The start of a run. Run* run = reinterpret_cast(base_ + i * kPageSize); DCHECK_EQ(run->magic_num_, kMagicNum); + // The dedicated full run doesn't contain any real allocations, don't visit the slots in + // there. run->InspectAllSlots(handler, arg); size_t num_pages = numOfPages[run->size_bracket_idx_]; if (kIsDebugBuild) { @@ -1605,14 +1626,16 @@ void RosAlloc::RevokeThreadLocalRuns(Thread* thread) { for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) { MutexLock mu(self, *size_bracket_locks_[idx]); Run* thread_local_run = reinterpret_cast(thread->GetRosAllocRun(idx)); - if (thread_local_run != NULL) { + CHECK(thread_local_run != nullptr); + // Invalid means already revoked. + DCHECK(thread_local_run->IsThreadLocal()); + if (thread_local_run != dedicated_full_run_) { + thread->SetRosAllocRun(idx, dedicated_full_run_); DCHECK_EQ(thread_local_run->magic_num_, kMagicNum); - DCHECK_NE(thread_local_run->is_thread_local_, 0); - thread->SetRosAllocRun(idx, nullptr); // Note the thread local run may not be full here. bool dont_care; thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&dont_care); - thread_local_run->is_thread_local_ = 0; + thread_local_run->SetIsThreadLocal(false); thread_local_run->MergeBulkFreeBitMapIntoAllocBitMap(); DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end()); DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end()); @@ -1628,7 +1651,8 @@ void RosAlloc::RevokeThreadLocalRuns(Thread* thread) { } } else if (thread_local_run->IsAllFree()) { MutexLock mu(self, lock_); - FreePages(self, thread_local_run); + thread_local_run->ZeroHeader(); + FreePages(self, thread_local_run, true); } else { non_full_runs_[idx].insert(thread_local_run); DCHECK(non_full_runs_[idx].find(thread_local_run) != non_full_runs_[idx].end()); @@ -1648,9 +1672,8 @@ void RosAlloc::RevokeAllThreadLocalRuns() { MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_); MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_); std::list thread_list = Runtime::Current()->GetThreadList()->GetList(); - for (auto it = thread_list.begin(); it != thread_list.end(); ++it) { - Thread* t = *it; - RevokeThreadLocalRuns(t); + for (Thread* thread : thread_list) { + RevokeThreadLocalRuns(thread); } } @@ -1662,7 +1685,7 @@ void RosAlloc::AssertThreadLocalRunsAreRevoked(Thread* thread) { for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) { MutexLock mu(self, *size_bracket_locks_[idx]); Run* thread_local_run = reinterpret_cast(thread->GetRosAllocRun(idx)); - DCHECK(thread_local_run == nullptr); + DCHECK(thread_local_run == nullptr || thread_local_run == dedicated_full_run_); } } } @@ -1770,6 +1793,15 @@ void RosAlloc::Initialize() { << ", threadLocalFreeBitMapOffsets[" << i << "]=" << threadLocalFreeBitMapOffsets[i];; } } + // Fill the alloc bitmap so nobody can successfully allocate from it. 
+ if (kIsDebugBuild) { + dedicated_full_run_->magic_num_ = kMagicNum; + } + // It doesn't matter which size bracket we use since the main goal is to have the allocation + // fail 100% of the time you attempt to allocate into the dedicated full run. + dedicated_full_run_->size_bracket_idx_ = 0; + dedicated_full_run_->FillAllocBitMap(); + dedicated_full_run_->SetIsThreadLocal(true); } void RosAlloc::BytesAllocatedCallback(void* start, void* end, size_t used_bytes, void* arg) { @@ -1867,6 +1899,7 @@ void RosAlloc::Verify() { << " and the run size : page index range " << i << " to " << (i + num_pages) << std::endl << DumpPageMap(); } + // Don't verify the dedicated_full_run_ since it doesn't have any real allocations. runs.push_back(run); i += num_pages; CHECK_LE(i, pm_end) << "Page map index " << i << " out of range < " << pm_end @@ -1891,34 +1924,25 @@ void RosAlloc::Verify() { void RosAlloc::Run::Verify(Thread* self, RosAlloc* rosalloc) { DCHECK_EQ(magic_num_, kMagicNum) << "Bad magic number : " << Dump(); - size_t idx = size_bracket_idx_; + const size_t idx = size_bracket_idx_; CHECK_LT(idx, kNumOfSizeBrackets) << "Out of range size bracket index : " << Dump(); byte* slot_base = reinterpret_cast(this) + headerSizes[idx]; - size_t num_slots = numOfSlots[idx]; + const size_t num_slots = numOfSlots[idx]; + const size_t num_vec = RoundUp(num_slots, 32) / 32; + CHECK_GT(num_vec, 0U); size_t bracket_size = IndexToBracketSize(idx); CHECK_EQ(slot_base + num_slots * bracket_size, reinterpret_cast(this) + numOfPages[idx] * kPageSize) << "Mismatch in the end address of the run " << Dump(); // Check that the bulk free bitmap is clean. It's only used during BulkFree(). CHECK(IsBulkFreeBitmapClean()) << "The bulk free bit map isn't clean " << Dump(); - // Check the bump index mode, if it's on. - if (top_slot_idx_ < num_slots) { - // If the bump index mode is on (top_slot_idx_ < num_slots), then - // all of the slots after the top index must be free. - for (size_t i = top_slot_idx_; i < num_slots; ++i) { - size_t vec_idx = i / 32; - size_t vec_off = i % 32; - uint32_t vec = alloc_bit_map_[vec_idx]; - CHECK_EQ((vec & (1 << vec_off)), static_cast(0)) - << "A slot >= top_slot_idx_ isn't free " << Dump(); - } - } else { - CHECK_EQ(top_slot_idx_, num_slots) - << "If the bump index mode is off, the top index == the number of slots " - << Dump(); - } + uint32_t last_word_mask = GetBitmapLastVectorMask(num_slots, num_vec); + // Make sure all the bits at the end of the run are set so that we don't allocate there. + CHECK_EQ(alloc_bit_map_[num_vec - 1] & last_word_mask, last_word_mask); + // Ensure that the first bitmap index is valid. + CHECK_LT(first_search_vec_idx_, num_vec); // Check the thread local runs, the current runs, and the run sets. - if (is_thread_local_) { + if (IsThreadLocal()) { // If it's a thread local run, then it must be pointed to by an owner thread. bool owner_found = false; std::list thread_list = Runtime::Current()->GetThreadList()->GetList(); @@ -1980,7 +2004,6 @@ void RosAlloc::Run::Verify(Thread* self, RosAlloc* rosalloc) { } } // Check each slot. - size_t num_vec = RoundUp(num_slots, 32) / 32; size_t slots = 0; for (size_t v = 0; v < num_vec; v++, slots += 32) { DCHECK_GE(num_slots, slots) << "Out of bounds"; @@ -1991,7 +2014,7 @@ void RosAlloc::Run::Verify(Thread* self, RosAlloc* rosalloc) { bool is_allocated = ((vec >> i) & 0x1) != 0; // If a thread local run, slots may be marked freed in the // thread local free bitmap. 
- bool is_thread_local_freed = is_thread_local_ && ((thread_local_free_vec >> i) & 0x1) != 0; + bool is_thread_local_freed = IsThreadLocal() && ((thread_local_free_vec >> i) & 0x1) != 0; if (is_allocated && !is_thread_local_freed) { byte* slot_addr = slot_base + (slots + i) * bracket_size; mirror::Object* obj = reinterpret_cast(slot_addr); diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h index 0c508b7..714230b 100644 --- a/runtime/gc/allocator/rosalloc.h +++ b/runtime/gc/allocator/rosalloc.h @@ -155,7 +155,7 @@ class RosAlloc { // +-------------------+ // | to_be_bulk_freed | // +-------------------+ - // | top_slot_idx | + // | top_bitmap_idx | // +-------------------+ // | | // | alloc bit map | @@ -186,12 +186,12 @@ class RosAlloc { // class Run { public: - byte magic_num_; // The magic number used for debugging. - byte size_bracket_idx_; // The index of the size bracket of this run. - byte is_thread_local_; // True if this run is used as a thread-local run. - byte to_be_bulk_freed_; // Used within BulkFree() to flag a run that's involved with a bulk free. - uint32_t top_slot_idx_; // The top slot index when this run is in bump index mode. - uint32_t alloc_bit_map_[0]; // The bit map that allocates if each slot is in use. + byte magic_num_; // The magic number used for debugging. + byte size_bracket_idx_; // The index of the size bracket of this run. + byte is_thread_local_; // True if this run is used as a thread-local run. + byte to_be_bulk_freed_; // Used within BulkFree() to flag a run that's involved with a bulk free. + uint32_t first_search_vec_idx_; // The index of the first bitmap vector which may contain an available slot. + uint32_t alloc_bit_map_[0]; // The bit map that allocates if each slot is in use. // bulk_free_bit_map_[] : The bit map that is used for GC to // temporarily mark the slots to free without using a lock. After @@ -225,6 +225,16 @@ class RosAlloc { void* End() { return reinterpret_cast(this) + kPageSize * numOfPages[size_bracket_idx_]; } + // Returns the number of bitmap words per run. + size_t NumberOfBitmapVectors() const { + return RoundUp(numOfSlots[size_bracket_idx_], 32) / 32; + } + void SetIsThreadLocal(bool is_thread_local) { + is_thread_local_ = is_thread_local ? 1 : 0; + } + bool IsThreadLocal() const { + return is_thread_local_ != 0; + } // Frees slots in the allocation bit map with regard to the // thread-local free bit map. Used when a thread-local run becomes // full. @@ -243,10 +253,13 @@ class RosAlloc { void* AllocSlot(); // Frees a slot in a run. This is used in a non-bulk free. void FreeSlot(void* ptr); - // Marks the slots to free in the bulk free bit map. - void MarkBulkFreeBitMap(void* ptr); + // Marks the slots to free in the bulk free bit map. Returns the bracket size. + size_t MarkBulkFreeBitMap(void* ptr); // Marks the slots to free in the thread-local free bit map. void MarkThreadLocalFreeBitMap(void* ptr); + // Last word mask, all of the bits in the last word which aren't valid slots are set to + // optimize allocation path. + static size_t GetBitmapLastVectorMask(size_t num_slots, size_t num_vec); // Returns true if all the slots in the run are not in use. bool IsAllFree(); // Returns true if all the slots in the run are in use. @@ -255,8 +268,14 @@ class RosAlloc { bool IsBulkFreeBitmapClean(); // Returns true if the thread local free bit map is clean. bool IsThreadLocalFreeBitmapClean(); - // Clear all the bit maps. 
- void ClearBitMaps(); + // Set the alloc_bit_map_ bits for slots that are past the end of the run. + void SetAllocBitMapBitsForInvalidSlots(); + // Zero the run's data. + void ZeroData(); + // Zero the run's header. + void ZeroHeader(); + // Fill the alloc bitmap with 1s. + void FillAllocBitMap(); // Iterate over all the slots and apply the given function. void InspectAllSlots(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg), void* arg); // Dump the run metadata for debugging. @@ -267,8 +286,9 @@ class RosAlloc { EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_list_lock_); private: - // The common part of MarkFreeBitMap() and MarkThreadLocalFreeBitMap(). - void MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base, const char* caller_name); + // The common part of MarkFreeBitMap() and MarkThreadLocalFreeBitMap(). Returns the bracket + // size. + size_t MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base, const char* caller_name); // Turns the bit map into a string for debugging. static std::string BitMapToStr(uint32_t* bit_map_base, size_t num_vec); }; @@ -376,7 +396,7 @@ class RosAlloc { return byte_offset / kPageSize; } // Returns the page map index from an address with rounding. - size_t RoundDownToPageMapIndex(void* addr) { + size_t RoundDownToPageMapIndex(void* addr) const { DCHECK(base_ <= addr && addr < reinterpret_cast(base_) + capacity_); return (reinterpret_cast(addr) - reinterpret_cast(base_)) / kPageSize; } @@ -446,12 +466,19 @@ class RosAlloc { hash_set full_runs_[kNumOfSizeBrackets]; // The set of free pages. std::set free_page_runs_ GUARDED_BY(lock_); + // The dedicated full run, it is always full and shared by all threads when revoking happens. + // This is an optimization since enables us to avoid a null check for revoked runs. + static Run* dedicated_full_run_; + // Using size_t to ensure that it is at least word aligned. + static size_t dedicated_full_run_storage_[]; // The current runs where the allocations are first attempted for // the size brackes that do not use thread-local // runs. current_runs_[i] is guarded by size_bracket_locks_[i]. Run* current_runs_[kNumOfSizeBrackets]; // The mutexes, one per size bracket. Mutex* size_bracket_locks_[kNumOfSizeBrackets]; + // Bracket lock names (since locks only have char* names). + std::string size_bracket_lock_names[kNumOfSizeBrackets]; // The types of page map entries. enum { kPageMapEmpty = 0, // Not allocated. @@ -493,15 +520,19 @@ class RosAlloc { // Page-granularity alloc/free void* AllocPages(Thread* self, size_t num_pages, byte page_map_type) EXCLUSIVE_LOCKS_REQUIRED(lock_); - // Returns how many pages were freed. - size_t FreePages(Thread* self, void* ptr) EXCLUSIVE_LOCKS_REQUIRED(lock_); + // Returns how many bytes were freed. + size_t FreePages(Thread* self, void* ptr, bool already_zero) EXCLUSIVE_LOCKS_REQUIRED(lock_); // Allocate/free a run slot. void* AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) LOCKS_EXCLUDED(lock_); - void FreeFromRun(Thread* self, void* ptr, Run* run) + // Returns the bracket size. + size_t FreeFromRun(Thread* self, void* ptr, Run* run) LOCKS_EXCLUDED(lock_); + // Used to allocate a new thread local run for a size bracket. + Run* AllocRun(Thread* self, size_t idx) LOCKS_EXCLUDED(lock_); + // Used to acquire a new/reused run for a size bracket. Used when a // thread-local or current run gets full. 
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 517c748..5d72bc1 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -78,9 +78,9 @@ static constexpr size_t kGcAlotInterval = KB;
 static constexpr size_t kMinConcurrentRemainingBytes = 128 * KB;
 static constexpr size_t kMaxConcurrentRemainingBytes = 512 * KB;
 // Sticky GC throughput adjustment, divided by 4. Increasing this causes sticky GC to occur more
-// relative to partial/full GC. This is desirable since sticky GCs interfere less with mutator
+// relative to partial/full GC. This may be desirable since sticky GCs interfere less with mutator
 // threads (lower pauses, use less memory bandwidth).
-static constexpr double kStickyGcThroughputAdjustment = 1.25;
+static constexpr double kStickyGcThroughputAdjustment = 1.0;
 // Whether or not we use the free list large object space.
 static constexpr bool kUseFreeListSpaceForLOS = false;
 // Whtehr or not we compact the zygote in PreZygoteFork.
@@ -595,6 +595,11 @@ void Heap::AddSpace(space::Space* space, bool set_as_default) {
     if (continuous_space->IsDlMallocSpace()) {
       dlmalloc_space_ = continuous_space->AsDlMallocSpace();
     } else if (continuous_space->IsRosAllocSpace()) {
+      // Revoke before if we already have a rosalloc_space_ so that we don't end up with non-full
+      // runs from the previous one during the revoke after.
+      if (rosalloc_space_ != nullptr) {
+        rosalloc_space_->RevokeAllThreadLocalBuffers();
+      }
       rosalloc_space_ = continuous_space->AsRosAllocSpace();
     }
   }
@@ -615,7 +620,7 @@ void Heap::AddSpace(space::Space* space, bool set_as_default) {
   }
 }

-void Heap::RemoveSpace(space::Space* space) {
+void Heap::RemoveSpace(space::Space* space, bool unset_as_default) {
   DCHECK(space != nullptr);
   WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
   if (space->IsContinuousSpace()) {
@@ -632,17 +637,19 @@ void Heap::RemoveSpace(space::Space* space) {
     auto it = std::find(continuous_spaces_.begin(), continuous_spaces_.end(), continuous_space);
     DCHECK(it != continuous_spaces_.end());
     continuous_spaces_.erase(it);
-    if (continuous_space == dlmalloc_space_) {
-      dlmalloc_space_ = nullptr;
-    } else if (continuous_space == rosalloc_space_) {
-      rosalloc_space_ = nullptr;
-    }
-    if (continuous_space == main_space_) {
-      main_space_ = nullptr;
-    } else if (continuous_space == bump_pointer_space_) {
-      bump_pointer_space_ = nullptr;
-    } else if (continuous_space == temp_space_) {
-      temp_space_ = nullptr;
+    if (unset_as_default) {
+      if (continuous_space == dlmalloc_space_) {
+        dlmalloc_space_ = nullptr;
+      } else if (continuous_space == rosalloc_space_) {
+        rosalloc_space_ = nullptr;
+      }
+      if (continuous_space == main_space_) {
+        main_space_ = nullptr;
+      } else if (continuous_space == bump_pointer_space_) {
+        bump_pointer_space_ = nullptr;
+      } else if (continuous_space == temp_space_) {
+        temp_space_ = nullptr;
+      }
     }
   } else {
     DCHECK(space->IsDiscontinuousSpace());
@@ -725,6 +732,7 @@ void Heap::DumpGcPerformanceInfo(std::ostream& os) {
   os << "Total mutator paused time: " << PrettyDuration(total_paused_time) << "\n";
   os << "Total time waiting for GC to complete: " << PrettyDuration(total_wait_time_) << "\n";
   os << "Approximate GC data structures memory overhead: " << gc_memory_overhead_;
+  BaseMutex::DumpAll(os);
 }

 Heap::~Heap() {
@@ -1457,6 +1465,10 @@ void Heap::TransitionCollector(CollectorType collector_type) {
         // pointer space last transition it will be protected.
         bump_pointer_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
         Compact(bump_pointer_space_, main_space_);
+        // Remove the main space so that we don't try to trim it, this doesn't work for debug
+        // builds since RosAlloc attempts to read the magic number from a protected page.
+        // TODO: Clean this up by getting rid of the remove_as_default parameter.
+        RemoveSpace(main_space_, false);
       }
       break;
     }
@@ -1465,6 +1477,7 @@ void Heap::TransitionCollector(CollectorType collector_type) {
     case kCollectorTypeCMS: {
       if (IsMovingGc(collector_type_)) {
         // Compact to the main space from the bump pointer space, don't need to swap semispaces.
+        AddSpace(main_space_, false);
         main_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
         Compact(main_space_, bump_pointer_space_);
       }
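The ergonomics change above lowers kStickyGcThroughputAdjustment from 1.25 to 1.0, i.e. sticky GC no longer gets a 25% throughput bonus when the heap decides which collection type to run next. As a rough illustration of how such a factor can enter that decision (invented names and a deliberately simplified rule, not the actual Heap logic):

    enum class GcType { kSticky, kPartial };

    // Simplified sketch: keep choosing a sticky (generational) GC while its
    // adjusted throughput still beats the non-sticky collector's throughput.
    // With an adjustment of 1.0 instead of 1.25, sticky GC wins less often,
    // so more full reclamation happens and total GC time drops.
    GcType NextGcType(double sticky_bytes_freed_per_sec,
                      double nonsticky_bytes_freed_per_sec,
                      double sticky_adjustment) {
      const double adjusted_sticky = sticky_adjustment * sticky_bytes_freed_per_sec;
      return adjusted_sticky >= nonsticky_bytes_freed_per_sec ? GcType::kSticky
                                                              : GcType::kPartial;
    }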
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index ceba8b6..c37bb05 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -289,6 +289,12 @@ class Heap {
   void RegisterGCAllocation(size_t bytes);
   void RegisterGCDeAllocation(size_t bytes);

+  // Public due to usage by tests.
+  void AddSpace(space::Space* space, bool set_as_default = true)
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
+  void RemoveSpace(space::Space* space, bool unset_as_default = true)
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
+
   // Set target ideal heap utilization ratio, implements
   // dalvik.system.VMRuntime.setTargetHeapUtilization.
   void SetTargetHeapUtilization(float target);
@@ -684,10 +690,6 @@ class Heap {

   size_t GetPercentFree();

-  void AddSpace(space::Space* space, bool set_as_default = true)
-      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
-  void RemoveSpace(space::Space* space) LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
-
   static void VerificationCallback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index a5a6da0..f5c0e94 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -32,7 +32,7 @@ namespace art {
 namespace gc {
 namespace space {

-static constexpr bool kPrefetchDuringRosAllocFreeList = true;
+static constexpr bool kPrefetchDuringRosAllocFreeList = false;
 static constexpr size_t kPrefetchLookAhead = 8;
 // Use this only for verification, it is not safe to use since the class of the object may have
 // been freed.
diff --git a/runtime/gc/space/space_test.h b/runtime/gc/space/space_test.h
index 9896a48..28200df 100644
--- a/runtime/gc/space/space_test.h
+++ b/runtime/gc/space/space_test.h
@@ -39,10 +39,8 @@ class SpaceTest : public CommonRuntimeTest {
   }

   void AddSpace(ContinuousSpace* space) {
-    // For RosAlloc, revoke the thread local runs before moving onto a
-    // new alloc space.
-    Runtime::Current()->GetHeap()->RevokeAllThreadLocalBuffers();
-    Runtime::Current()->GetHeap()->AddSpace(space);
+    // By passing true, AddSpace() does the revoke.
+    Runtime::Current()->GetHeap()->AddSpace(space, true);
   }

   mirror::Class* GetByteArrayClass(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -349,11 +347,8 @@ void SpaceTest::AllocAndFreeListTestBody(CreateSpaceFn create_space) {
     EXPECT_EQ(usable_size, computed_usable_size);
   }

-  // Release memory and check pointers are nullptr.
+  // Release memory.
   space->FreeList(self, arraysize(lots_of_objects), lots_of_objects);
-  for (size_t i = 0; i < arraysize(lots_of_objects); i++) {
-    EXPECT_TRUE(lots_of_objects[i] == nullptr);
-  }

   // Succeeds, fits by adjusting the max allowed footprint.
   for (size_t i = 0; i < arraysize(lots_of_objects); i++) {
@@ -367,12 +362,8 @@ void SpaceTest::AllocAndFreeListTestBody(CreateSpaceFn create_space) {
     EXPECT_EQ(usable_size, computed_usable_size);
   }

-  // Release memory and check pointers are nullptr
-  // TODO: This isn't compaction safe, fix.
+  // Release memory.
   space->FreeList(self, arraysize(lots_of_objects), lots_of_objects);
-  for (size_t i = 0; i < arraysize(lots_of_objects); i++) {
-    EXPECT_TRUE(lots_of_objects[i] == nullptr);
-  }
 }

 void SpaceTest::SizeFootPrintGrowthLimitAndTrimBody(MallocSpace* space, intptr_t object_size,
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 095404f..771680b 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1018,7 +1018,8 @@ Thread::Thread(bool daemon) : tls32_(daemon), wait_monitor_(nullptr), interrupte
   tls32_.state_and_flags.as_struct.flags = 0;
   tls32_.state_and_flags.as_struct.state = kNative;
   memset(&tlsPtr_.held_mutexes[0], 0, sizeof(tlsPtr_.held_mutexes));
-  memset(tlsPtr_.rosalloc_runs, 0, sizeof(tlsPtr_.rosalloc_runs));
+  std::fill(tlsPtr_.rosalloc_runs, tlsPtr_.rosalloc_runs + kRosAllocNumOfSizeBrackets,
+            gc::allocator::RosAlloc::GetDedicatedFullRun());
   for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
     tlsPtr_.checkpoint_functions[i] = nullptr;
   }
-- 
cgit v1.1
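Taken together, the thread.cc change and the dedicated_full_run_ added to RosAlloc mean a thread's per-bracket run pointers are never null: revoked or never-initialized entries point at a run whose bitmap is permanently all 1s. A minimal sketch of why that removes the null check from the thread-local fast path (invented names, one 32-bit bitmap vector for brevity; not the ART code):

    #include <cstddef>
    #include <cstdint>

    struct ToyRun {
      uint32_t alloc_bits;  // 1 = slot in use.

      // Fast-path allocation: returns false when the run has no free slot.
      bool TryAllocSlot(size_t* out_slot) {
        if (alloc_bits == 0xFFFFFFFFu) {
          return false;  // Always taken for the dedicated full run.
        }
        const int bit = __builtin_ctz(~alloc_bits);  // Lowest clear bit.
        alloc_bits |= 1u << bit;
        *out_slot = static_cast<size_t>(bit);
        return true;
      }
    };

    // One shared, permanently full sentinel run.
    static ToyRun dedicated_full_run = {0xFFFFFFFFu};

    // Per-thread table initialized to the sentinel instead of nullptr,
    // mirroring the std::fill() in Thread::Thread() above.
    static ToyRun* tls_runs[4] = {&dedicated_full_run, &dedicated_full_run,
                                  &dedicated_full_run, &dedicated_full_run};

    // The fast path can dereference unconditionally: a revoked bracket simply
    // reports "full" and the caller falls back to the shared slow path.
    bool TryThreadLocalAlloc(size_t bracket_idx, size_t* out_slot) {
      return tls_runs[bracket_idx]->TryAllocSlot(out_slot);
    }

The sentinel trades one extra always-failing bitmap probe on revoked brackets for the removal of a branch from every thread-local allocation, which is the design choice the commit message describes.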