author     Hiroshi Yamauchi <yamauchi@google.com>    2014-02-11 17:02:22 -0800
committer  Hiroshi Yamauchi <yamauchi@google.com>    2014-02-13 13:43:43 -0800
commit     f5b0e20b5b31f5f5465784adcf2a204dcd69c7fd (patch)
tree       9e1b211414c1dcb34f19a7b509e72aee2153a3cf /runtime
parent     68bb649b128cd8760732524bd7ba58b49780d9d3 (diff)
Thread-local allocation stack.
With this change, Ritz MemAllocTest gets ~14% faster on N4.
Bug: 9986565
Change-Id: I2fb7d6f7c5daa63dd4fc73ba739e6ae4ed820617
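For orientation before the diff: the change gives each thread a private window [top, end) carved out of the shared allocation stack, so pushes within the window are plain stores and only refilling the window touches shared state. Below is a minimal standalone sketch of the thread-side fast path; it is illustrative C++ only, with simplified stand-in names and types, not ART's actual API.

#include <cassert>

// Simplified stand-in for a thread's view of its allocation-stack segment.
struct ThreadLocalAllocStack {
  void** top = nullptr;  // Next free slot in this thread's segment.
  void** end = nullptr;  // One past the last slot of the segment.

  // Fast path: a plain, non-atomic push into the private segment.
  // Returns false when the segment is exhausted (or not yet assigned),
  // in which case the caller reserves a fresh segment from the shared stack.
  bool Push(void* obj) {
    if (top < end) {
      assert(*top == nullptr);  // Slots are handed out zero-filled.
      *top++ = obj;
      return true;
    }
    return false;
  }

  // Revocation at a GC pause: forget the segment so the next push
  // falls back to the slow path.
  void Revoke() {
    top = nullptr;
    end = nullptr;
  }
};

int main() {
  ThreadLocalAllocStack tl;
  void* slots[4] = {};     // Pretend this segment was reserved from the shared stack.
  tl.top = &slots[0];
  tl.end = &slots[4];
  int obj;
  assert(tl.Push(&obj));   // Plain store, no atomics on the fast path.
  tl.Revoke();
  assert(!tl.Push(&obj));  // After revocation, the slow path must refill.
  return 0;
}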
Diffstat (limited to 'runtime')
-rw-r--r--  runtime/gc/accounting/atomic_stack.h  35
-rw-r--r--  runtime/gc/allocator/rosalloc.cc        3
-rw-r--r--  runtime/gc/collector/mark_sweep.cc     18
-rw-r--r--  runtime/gc/collector/semi_space.cc      5
-rw-r--r--  runtime/gc/heap-inl.h                  35
-rw-r--r--  runtime/gc/heap.cc                     48
-rw-r--r--  runtime/gc/heap.h                      11
-rw-r--r--  runtime/thread-inl.h                   36
-rw-r--r--  runtime/thread.cc                       4
-rw-r--r--  runtime/thread.h                       13
10 files changed, 186 insertions, 22 deletions
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index ea8f89c..d6f3228 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -73,6 +73,41 @@ class AtomicStack {
     return true;
   }
 
+  // Atomically bump the back index by the given number of
+  // slots. Returns false if we overflowed the stack.
+  bool AtomicBumpBack(size_t num_slots, T** start_address, T** end_address) {
+    if (kIsDebugBuild) {
+      debug_is_sorted_ = false;
+    }
+    int32_t index;
+    int32_t new_index;
+    do {
+      index = back_index_;
+      new_index = index + num_slots;
+      if (UNLIKELY(static_cast<size_t>(new_index) >= capacity_)) {
+        // Stack overflow.
+        return false;
+      }
+    } while (!back_index_.CompareAndSwap(index, new_index));
+    *start_address = &begin_[index];
+    *end_address = &begin_[new_index];
+    if (kIsDebugBuild) {
+      // Sanity check that the memory is zero.
+      for (int32_t i = index; i < new_index; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i << " index=" << index << " new_index=" << new_index;
+      }
+    }
+    return true;
+  }
+
+  void AssertAllZero() {
+    if (kIsDebugBuild) {
+      for (size_t i = 0; i < capacity_; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i;
+      }
+    }
+  }
+
   void PushBack(const T& value) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
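AtomicBumpBack() reserves num_slots contiguous entries with a single CAS on the back index, so concurrent threads carve out disjoint segments without locking. Here is a compact model of that loop using std::atomic in place of ART's Atomic type; TriggerGc() is a hypothetical stand-in for the sticky GC the heap runs on overflow (see Heap::PushOnAllocationStack later in the diff).

#include <atomic>
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<void*> begin_(1024, nullptr);  // Backing slots, zero-filled.
std::atomic<int32_t> back_index_{0};       // Shared bump pointer.

// Reserve num_slots contiguous slots; returns false on overflow.
bool AtomicBumpBack(size_t num_slots, void*** start, void*** end) {
  int32_t index;
  int32_t new_index;
  do {
    index = back_index_.load();
    new_index = index + static_cast<int32_t>(num_slots);
    if (static_cast<size_t>(new_index) >= begin_.size()) {
      return false;  // Overflow: the caller must collect and retry.
    }
  } while (!back_index_.compare_exchange_weak(index, new_index));
  *start = &begin_[index];
  *end = &begin_[new_index];
  return true;
}

void TriggerGc() { back_index_.store(0); }  // Hypothetical stand-in only.

int main() {
  void** start;
  void** end;
  // Mirrors the retry loop the heap uses around AtomicBumpBack().
  while (!AtomicBumpBack(128, &start, &end)) {
    TriggerGc();
  }
  assert(end - start == 128);
  return 0;
}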
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 65d4c44..d02b851 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1560,7 +1560,8 @@ void RosAlloc::RevokeThreadLocalRuns(Thread* thread) {
 void RosAlloc::RevokeAllThreadLocalRuns() {
   // This is called when a mutator thread won't allocate such as at
   // the Zygote creation time or during the GC pause.
-  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+  MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
   std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
   for (auto it = thread_list.begin(); it != thread_list.end(); ++it) {
     Thread* t = *it;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index de9f59e..dbbc115 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -206,6 +206,10 @@ bool MarkSweep::HandleDirtyObjectsPhase() {
     // This second sweep makes sure that we don't have any objects in the live stack which point to
     // freed objects. These cause problems since their references may be previously freed objects.
     SweepArray(GetHeap()->allocation_stack_.get(), false);
+    // Since SweepArray() above resets the (active) allocation
+    // stack, we need to revoke the thread-local allocation stacks
+    // that point into it.
+    GetHeap()->RevokeAllThreadLocalAllocationStacks(self);
   }
 
   timings_.StartSplit("PreSweepingGcVerification");
@@ -241,12 +245,15 @@ void MarkSweep::MarkingPhase() {
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
+  heap_->SwapStacks(self);
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
     // If we exclusively hold the mutator lock, all threads must be suspended.
     MarkRoots();
+    if (kUseThreadLocalAllocationStack) {
+      heap_->RevokeAllThreadLocalAllocationStacks(self);
+    }
   } else {
     MarkThreadRoots(self);
     // At this point the live stack should no longer have any mutators which push into it.
@@ -995,6 +1002,9 @@ class CheckpointMarkThreadRoots : public Closure {
         << thread->GetState() << " thread " << thread << " self " << self;
     thread->VisitRoots(MarkSweep::MarkRootParallelCallback, mark_sweep_);
     ATRACE_END();
+    if (kUseThreadLocalAllocationStack) {
+      thread->RevokeThreadLocalAllocationStack();
+    }
     mark_sweep_->GetBarrier().Pass(self);
   }
@@ -1062,6 +1072,9 @@ void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitma
   Object** out = objects;
   for (size_t i = 0; i < count; ++i) {
     Object* obj = objects[i];
+    if (kUseThreadLocalAllocationStack && obj == nullptr) {
+      continue;
+    }
     if (space->HasAddress(obj)) {
       // This object is in the space, remove it from the array and add it to the sweep buffer
       // if needed.
@@ -1100,6 +1113,9 @@ void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitma
   for (size_t i = 0; i < count; ++i) {
     Object* obj = objects[i];
     // Handle large objects.
+    if (kUseThreadLocalAllocationStack && obj == nullptr) {
+      continue;
+    }
     if (!large_mark_objects->Test(obj)) {
       ++freed_large_objects;
       freed_large_object_bytes += large_object_space->Free(self, obj);
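The obj == nullptr checks above are the cost of the scheme: when thread-local segments are revoked at the pause, any slots a thread never consumed remain null in the (swapped) stack, so every walker must skip them. A tiny illustration of the resulting scan pattern, in plain C++ rather than ART code:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Two revoked 4-slot segments, each only partially filled by its thread:
  // the unconsumed tail slots are still null and must be skipped by sweeps.
  int a, b, c;
  std::vector<void*> stack = {&a, &b, nullptr, nullptr,
                              &c, nullptr, nullptr, nullptr};
  std::size_t visited = 0;
  for (void* obj : stack) {
    if (obj == nullptr) {
      continue;  // Mirrors the kUseThreadLocalAllocationStack null check above.
    }
    ++visited;   // A real sweep would test liveness and free here.
  }
  std::cout << "visited " << visited << " of " << stack.size() << " slots\n";
  return 0;
}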
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index ac33cc7..b1122b9 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -210,7 +210,10 @@ void SemiSpace::MarkingPhase() {
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
+  if (kUseThreadLocalAllocationStack) {
+    heap_->RevokeAllThreadLocalAllocationStacks(self_);
+  }
+  heap_->SwapStacks(self_);
   WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_);
   MarkRoots();
   // Mark roots of immune spaces.
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 5e1136b..9c91b0e 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -82,11 +82,7 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas
     DCHECK(!Runtime::Current()->HasStatsEnabled());
   }
   if (AllocatorHasAllocationStack(allocator)) {
-    // This is safe to do since the GC will never free objects which are neither in the allocation
-    // stack or the live bitmap.
-    while (!allocation_stack_->AtomicPushBack(obj)) {
-      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
-    }
+    PushOnAllocationStack(self, obj);
   }
   if (kInstrumented) {
     if (Dbg::IsAllocTrackingEnabled()) {
@@ -111,6 +107,35 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas
   return obj;
 }
 
+// The size of a thread-local allocation stack in the number of references.
+static constexpr size_t kThreadLocalAllocationStackSize = 128;
+
+inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object* obj) {
+  if (kUseThreadLocalAllocationStack) {
+    bool success = self->PushOnThreadLocalAllocationStack(obj);
+    if (UNLIKELY(!success)) {
+      // Slow path. Allocate a new thread-local allocation stack.
+      mirror::Object** start_address;
+      mirror::Object** end_address;
+      while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize,
+                                                &start_address, &end_address)) {
+        CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+      }
+      self->SetThreadLocalAllocationStack(start_address, end_address);
+      // Retry on the new thread-local allocation stack.
+      success = self->PushOnThreadLocalAllocationStack(obj);
+      // Must succeed.
+      CHECK(success);
+    }
+  } else {
+    // This is safe to do since the GC will never free objects which are neither in the allocation
+    // stack or the live bitmap.
+    while (!allocation_stack_->AtomicPushBack(obj)) {
+      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+    }
+  }
+}
+
 template <bool kInstrumented, typename PreFenceVisitor>
 inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
                                               size_t byte_count,
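Two details of PushOnAllocationStack() worth noting: the CHECK(success) retry cannot fail because the freshly reserved 128-slot segment is empty, and the slack a thread can strand in the shared stack is bounded by one segment. A back-of-envelope footprint check (illustrative only, not part of the patch):

#include <cstddef>
#include <cstdio>

int main() {
  // kThreadLocalAllocationStackSize from the patch: 128 references.
  const std::size_t kSlots = 128;
  // One segment: 512 bytes on 32-bit targets, 1 KiB on 64-bit targets.
  std::printf("max per-thread slack: %zu bytes\n", kSlots * sizeof(void*));
  return 0;
}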
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 5c174f8..f1126ef 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -383,7 +383,8 @@ void Heap::VisitObjects(ObjectCallback callback, void* arg) {
     mirror::Object* obj = *it;
     if (obj != nullptr && obj->GetClass() != nullptr) {
       // Avoid the race condition caused by the object not yet being written into the allocation
-      // stack or the class not yet being written in the object.
+      // stack or the class not yet being written in the object. Or, if kUseThreadLocalAllocationStack,
+      // there can be nulls on the allocation stack.
       callback(obj, arg);
     }
   }
@@ -1533,13 +1534,14 @@ void Heap::MarkAllocStack(accounting::SpaceBitmap* bitmap1,
   mirror::Object** limit = stack->End();
   for (mirror::Object** it = stack->Begin(); it != limit; ++it) {
     const mirror::Object* obj = *it;
-    DCHECK(obj != nullptr);
-    if (bitmap1->HasAddress(obj)) {
-      bitmap1->Set(obj);
-    } else if (bitmap2->HasAddress(obj)) {
-      bitmap2->Set(obj);
-    } else {
-      large_objects->Set(obj);
+    if (!kUseThreadLocalAllocationStack || obj != nullptr) {
+      if (bitmap1->HasAddress(obj)) {
+        bitmap1->Set(obj);
+      } else if (bitmap2->HasAddress(obj)) {
+        bitmap2->Set(obj);
+      } else {
+        large_objects->Set(obj);
+      }
     }
   }
 }
@@ -2004,7 +2006,9 @@ bool Heap::VerifyMissingCardMarks() {
 
   // We can verify objects in the live stack since none of these should reference dead objects.
   for (mirror::Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
-    visitor(*it);
+    if (!kUseThreadLocalAllocationStack || *it != nullptr) {
+      visitor(*it);
+    }
   }
   if (visitor.Failed()) {
@@ -2014,10 +2018,30 @@ bool Heap::VerifyMissingCardMarks() {
   return true;
 }
 
-void Heap::SwapStacks() {
+void Heap::SwapStacks(Thread* self) {
+  if (kUseThreadLocalAllocationStack) {
+    live_stack_->AssertAllZero();
+  }
   allocation_stack_.swap(live_stack_);
 }
 
+void Heap::RevokeAllThreadLocalAllocationStacks(Thread* self) {
+  if (!Runtime::Current()->IsStarted()) {
+    // There's no thread list if the runtime hasn't started (eg
+    // dex2oat or a test). Just revoke for self.
+    self->RevokeThreadLocalAllocationStack();
+    return;
+  }
+  // This must be called only during the pause.
+  CHECK(Locks::mutator_lock_->IsExclusiveHeld(self));
+  MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(self, *Locks::thread_list_lock_);
+  std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+  for (Thread* t : thread_list) {
+    t->RevokeThreadLocalAllocationStack();
+  }
+}
+
 accounting::ModUnionTable* Heap::FindModUnionTableFromSpace(space::Space* space) {
   auto it = mod_union_tables_.find(space);
   if (it == mod_union_tables_.end()) {
@@ -2072,12 +2096,12 @@ void Heap::PreGcVerification(collector::GarbageCollector* gc) {
     thread_list->SuspendAll();
     {
       ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      SwapStacks();
+      SwapStacks(self);
       // Sort the live stack so that we can quickly binary search it later.
       if (!VerifyMissingCardMarks()) {
        LOG(FATAL) << "Pre " << gc->GetName() << " missing card mark verification failed";
       }
-      SwapStacks();
+      SwapStacks(self);
     }
     thread_list->ResumeAll();
   }
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index e416c0e..80a5a1a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -111,6 +111,9 @@ static constexpr HeapVerificationMode kDesiredHeapVerification = kNoHeapVerifica
 // If true, use rosalloc/RosAllocSpace instead of dlmalloc/DlMallocSpace
 static constexpr bool kUseRosAlloc = true;
 
+// If true, use thread-local allocation stack.
+static constexpr bool kUseThreadLocalAllocationStack = true;
+
 // The process state passed in from the activity manager, used to determine when to do trimming
 // and compaction.
 enum ProcessState {
@@ -665,11 +668,17 @@ class Heap {
       SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_);
 
   // Swap the allocation stack with the live stack.
-  void SwapStacks();
+  void SwapStacks(Thread* self);
+
+  // Revoke all the thread-local allocation stacks.
+  void RevokeAllThreadLocalAllocationStacks(Thread* self);
 
   // Clear cards and update the mod union table.
   void ProcessCards(TimingLogger& timings);
 
+  // Push an object onto the allocation stack.
+  void PushOnAllocationStack(Thread* self, mirror::Object* obj);
+
   // All-known continuous spaces, where objects lie within fixed bounds.
   std::vector<space::ContinuousSpace*> continuous_spaces_;
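RevokeAllThreadLocalAllocationStacks() above walks the thread list under the shutdown and thread-list locks while the world is stopped. A minimal sketch of that pattern with standard library types, where std::mutex stands in for ART's ordered locks and all names are hypothetical:

#include <list>
#include <mutex>

// Hypothetical stand-ins for ART's thread registry and ordered locks.
struct Thread {
  void** tl_alloc_stack_top = nullptr;
  void** tl_alloc_stack_end = nullptr;

  // Dropping the segment makes the next push take the slow path,
  // which reserves a fresh segment from the shared stack.
  void RevokeThreadLocalAllocationStack() {
    tl_alloc_stack_top = nullptr;
    tl_alloc_stack_end = nullptr;
  }
};

std::mutex runtime_shutdown_lock;  // Keeps the runtime alive while we walk threads.
std::mutex thread_list_lock;       // Keeps the thread list stable.
std::list<Thread*> thread_list;

void RevokeAllThreadLocalAllocationStacks() {
  // Same lock order as the patch: shutdown lock first, then thread list lock.
  std::lock_guard<std::mutex> mu(runtime_shutdown_lock);
  std::lock_guard<std::mutex> mu2(thread_list_lock);
  for (Thread* t : thread_list) {
    t->RevokeThreadLocalAllocationStack();
  }
}

int main() {
  Thread t;
  thread_list.push_back(&t);
  RevokeAllThreadLocalAllocationStacks();  // Called while the world is stopped.
  return 0;
}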
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 9420e7b..c0bf377 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -170,6 +170,42 @@ inline mirror::Object* Thread::AllocTlab(size_t bytes) {
   return ret;
 }
 
+inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
+  DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
+  if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+    // There's room.
+    DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
+              reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
+    DCHECK(*thread_local_alloc_stack_top_ == nullptr);
+    *thread_local_alloc_stack_top_ = obj;
+    ++thread_local_alloc_stack_top_;
+    return true;
+  }
+  return false;
+}
+
+inline void Thread::SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end) {
+  DCHECK(Thread::Current() == this) << "Should be called by self";
+  DCHECK(start != nullptr);
+  DCHECK(end != nullptr);
+  DCHECK_ALIGNED(start, sizeof(mirror::Object*));
+  DCHECK_ALIGNED(end, sizeof(mirror::Object*));
+  DCHECK_LT(start, end);
+  thread_local_alloc_stack_end_ = end;
+  thread_local_alloc_stack_top_ = start;
+}
+
+inline void Thread::RevokeThreadLocalAllocationStack() {
+  if (kIsDebugBuild) {
+    // Note: self is not necessarily equal to this thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
+        << GetState() << " thread " << this << " self " << self;
+  }
+  thread_local_alloc_stack_end_ = nullptr;
+  thread_local_alloc_stack_top_ = nullptr;
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_THREAD_INL_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 9797a48..3382811 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -963,7 +963,9 @@ Thread::Thread(bool daemon)
       thread_local_start_(nullptr),
       thread_local_pos_(nullptr),
       thread_local_end_(nullptr),
-      thread_local_objects_(0) {
+      thread_local_objects_(0),
+      thread_local_alloc_stack_top_(nullptr),
+      thread_local_alloc_stack_end_(nullptr) {
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
   state_and_flags_.as_struct.flags = 0;
   state_and_flags_.as_struct.state = kNative;
diff --git a/runtime/thread.h b/runtime/thread.h
index a3a77bb..6c072ba 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -829,6 +829,19 @@ class PACKED(4) Thread {
   static const size_t kRosAllocNumOfSizeBrackets = 34;
   void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
 
+  // Thread-local allocation stack data/routines.
+  mirror::Object** thread_local_alloc_stack_top_;
+  mirror::Object** thread_local_alloc_stack_end_;
+
+  // Push an object onto the allocation stack.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Set the thread local allocation pointers to the given pointers.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread local allocation pointers.
+  void RevokeThreadLocalAllocationStack();
+
  private:
   friend class Dbg;  // For SetStateUnsafe.
   friend class Monitor;