author     Hiroshi Yamauchi <yamauchi@google.com>   2014-02-11 17:02:22 -0800
committer  Hiroshi Yamauchi <yamauchi@google.com>   2014-02-13 13:43:43 -0800
commit     f5b0e20b5b31f5f5465784adcf2a204dcd69c7fd (patch)
tree       9e1b211414c1dcb34f19a7b509e72aee2153a3cf /runtime
parent     68bb649b128cd8760732524bd7ba58b49780d9d3 (diff)
Thread-local allocation stack.
With this change, Ritz MemAllocTest gets ~14% faster on N4.

Bug: 9986565
Change-Id: I2fb7d6f7c5daa63dd4fc73ba739e6ae4ed820617
Diffstat (limited to 'runtime')
-rw-r--r--  runtime/gc/accounting/atomic_stack.h   35
-rw-r--r--  runtime/gc/allocator/rosalloc.cc        3
-rw-r--r--  runtime/gc/collector/mark_sweep.cc     18
-rw-r--r--  runtime/gc/collector/semi_space.cc      5
-rw-r--r--  runtime/gc/heap-inl.h                  35
-rw-r--r--  runtime/gc/heap.cc                     48
-rw-r--r--  runtime/gc/heap.h                      11
-rw-r--r--  runtime/thread-inl.h                   36
-rw-r--r--  runtime/thread.cc                       4
-rw-r--r--  runtime/thread.h                       13
10 files changed, 186 insertions(+), 22 deletions(-)
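
The commit replaces per-allocation CAS pushes onto the shared allocation stack with a per-thread window: each thread reserves a block of slots from the shared stack once, then records newly allocated objects with a plain pointer store until the block is full. The following is a minimal sketch of that layout in isolation; ThreadLocalWindow, SharedStack and Object are illustrative names, not the ART types.

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    struct Object;  // stand-in for mirror::Object

    // Shared stack of object pointers; the backing array is pre-zeroed.
    struct SharedStack {
      Object** begin = nullptr;
      std::atomic<int32_t> back_index{0};
      size_t capacity = 0;
    };

    // Per-thread window [top, end) carved out of the shared stack.
    struct ThreadLocalWindow {
      Object** top = nullptr;
      Object** end = nullptr;

      // Fast path: no atomics, just a bounds check and a store.
      bool TryPush(Object* obj) {
        if (top < end) {
          *top++ = obj;
          return true;
        }
        return false;  // window exhausted (or revoked): take the slow path
      }
    };
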
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index ea8f89c..d6f3228 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -73,6 +73,41 @@ class AtomicStack {
return true;
}
+ // Atomically bump the back index by the given number of
+ // slots. Returns false if we overflowed the stack.
+ bool AtomicBumpBack(size_t num_slots, T** start_address, T** end_address) {
+ if (kIsDebugBuild) {
+ debug_is_sorted_ = false;
+ }
+ int32_t index;
+ int32_t new_index;
+ do {
+ index = back_index_;
+ new_index = index + num_slots;
+ if (UNLIKELY(static_cast<size_t>(new_index) >= capacity_)) {
+ // Stack overflow.
+ return false;
+ }
+ } while (!back_index_.CompareAndSwap(index, new_index));
+ *start_address = &begin_[index];
+ *end_address = &begin_[new_index];
+ if (kIsDebugBuild) {
+ // Sanity check that the memory is zero.
+ for (int32_t i = index; i < new_index; ++i) {
+ DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i << " index=" << index << " new_index=" << new_index;
+ }
+ }
+ return true;
+ }
+
+ void AssertAllZero() {
+ if (kIsDebugBuild) {
+ for (size_t i = 0; i < capacity_; ++i) {
+ DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i;
+ }
+ }
+ }
+
void PushBack(const T& value) {
if (kIsDebugBuild) {
debug_is_sorted_ = false;
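
AtomicBumpBack() above reserves a contiguous block of slots with one compare-and-swap loop, so a single CAS is amortized over an entire thread-local window of pushes. A standalone approximation of the same pattern using std::atomic (BumpReserve and its parameters are invented for the example, not ART's AtomicInteger API) could look like this:

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    struct Object;

    // Reserve num_slots contiguous slots by CAS-bumping a back index.
    // Returns false if the reservation would overflow the capacity.
    bool BumpReserve(Object** begin, std::atomic<int32_t>& back_index,
                     size_t capacity, size_t num_slots,
                     Object*** start_address, Object*** end_address) {
      int32_t index = back_index.load(std::memory_order_relaxed);
      int32_t new_index;
      do {
        new_index = index + static_cast<int32_t>(num_slots);
        if (static_cast<size_t>(new_index) >= capacity) {
          return false;  // stack overflow; the caller must make room first
        }
        // On failure, compare_exchange_weak reloads 'index' with the value
        // another thread installed, and the loop recomputes new_index.
      } while (!back_index.compare_exchange_weak(index, new_index));
      *start_address = &begin[index];
      *end_address = &begin[new_index];
      return true;
    }
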
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 65d4c44..d02b851 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1560,7 +1560,8 @@ void RosAlloc::RevokeThreadLocalRuns(Thread* thread) {
void RosAlloc::RevokeAllThreadLocalRuns() {
// This is called when a mutator thread won't allocate such as at
// the Zygote creation time or during the GC pause.
- MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+ MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
+ MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
for (auto it = thread_list.begin(); it != thread_list.end(); ++it) {
Thread* t = *it;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index de9f59e..dbbc115 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -206,6 +206,10 @@ bool MarkSweep::HandleDirtyObjectsPhase() {
// This second sweep makes sure that we don't have any objects in the live stack which point to
// freed objects. These cause problems since their references may be previously freed objects.
SweepArray(GetHeap()->allocation_stack_.get(), false);
+ // Since SweepArray() above resets the (active) allocation
+ // stack, we need to revoke the thread-local allocation stacks
+ // that point into it.
+ GetHeap()->RevokeAllThreadLocalAllocationStacks(self);
}
timings_.StartSplit("PreSweepingGcVerification");
@@ -241,12 +245,15 @@ void MarkSweep::MarkingPhase() {
// Need to do this before the checkpoint since we don't want any threads to add references to
// the live stack during the recursive mark.
timings_.NewSplit("SwapStacks");
- heap_->SwapStacks();
+ heap_->SwapStacks(self);
WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
// If we exclusively hold the mutator lock, all threads must be suspended.
MarkRoots();
+ if (kUseThreadLocalAllocationStack) {
+ heap_->RevokeAllThreadLocalAllocationStacks(self);
+ }
} else {
MarkThreadRoots(self);
// At this point the live stack should no longer have any mutators which push into it.
@@ -995,6 +1002,9 @@ class CheckpointMarkThreadRoots : public Closure {
<< thread->GetState() << " thread " << thread << " self " << self;
thread->VisitRoots(MarkSweep::MarkRootParallelCallback, mark_sweep_);
ATRACE_END();
+ if (kUseThreadLocalAllocationStack) {
+ thread->RevokeThreadLocalAllocationStack();
+ }
mark_sweep_->GetBarrier().Pass(self);
}
@@ -1062,6 +1072,9 @@ void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitma
Object** out = objects;
for (size_t i = 0; i < count; ++i) {
Object* obj = objects[i];
+ if (kUseThreadLocalAllocationStack && obj == nullptr) {
+ continue;
+ }
if (space->HasAddress(obj)) {
// This object is in the space, remove it from the array and add it to the sweep buffer
// if needed.
@@ -1100,6 +1113,9 @@ void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitma
for (size_t i = 0; i < count; ++i) {
Object* obj = objects[i];
// Handle large objects.
+ if (kUseThreadLocalAllocationStack && obj == nullptr) {
+ continue;
+ }
if (!large_mark_objects->Test(obj)) {
++freed_large_objects;
freed_large_object_bytes += large_object_space->Free(self, obj);
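
The nullptr checks added above exist because windows are reserved ahead of time: slots a thread never fills stay zero, so once the stacks are swapped the collector can encounter null entries and must skip them. A simplified walk over such a segment, assuming the illustrative Object type from the earlier sketch:

    // Walk an allocation-stack segment that may contain unfilled (null)
    // slots left over from thread-local windows, skipping them.
    void VisitStackSegment(Object** begin, Object** end, void (*visit)(Object*)) {
      for (Object** it = begin; it != end; ++it) {
        Object* obj = *it;
        if (obj == nullptr) {
          continue;  // reserved-but-unused slot
        }
        visit(obj);
      }
    }
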
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index ac33cc7..b1122b9 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -210,7 +210,10 @@ void SemiSpace::MarkingPhase() {
// Need to do this before the checkpoint since we don't want any threads to add references to
// the live stack during the recursive mark.
timings_.NewSplit("SwapStacks");
- heap_->SwapStacks();
+ if (kUseThreadLocalAllocationStack) {
+ heap_->RevokeAllThreadLocalAllocationStacks(self_);
+ }
+ heap_->SwapStacks(self_);
WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_);
MarkRoots();
// Mark roots of immune spaces.
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 5e1136b..9c91b0e 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -82,11 +82,7 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas
DCHECK(!Runtime::Current()->HasStatsEnabled());
}
if (AllocatorHasAllocationStack(allocator)) {
- // This is safe to do since the GC will never free objects which are neither in the allocation
- // stack or the live bitmap.
- while (!allocation_stack_->AtomicPushBack(obj)) {
- CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
- }
+ PushOnAllocationStack(self, obj);
}
if (kInstrumented) {
if (Dbg::IsAllocTrackingEnabled()) {
@@ -111,6 +107,35 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas
return obj;
}
+// The size of a thread-local allocation stack in the number of references.
+static constexpr size_t kThreadLocalAllocationStackSize = 128;
+
+inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object* obj) {
+ if (kUseThreadLocalAllocationStack) {
+ bool success = self->PushOnThreadLocalAllocationStack(obj);
+ if (UNLIKELY(!success)) {
+ // Slow path. Allocate a new thread-local allocation stack.
+ mirror::Object** start_address;
+ mirror::Object** end_address;
+ while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize,
+ &start_address, &end_address)) {
+ CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+ }
+ self->SetThreadLocalAllocationStack(start_address, end_address);
+ // Retry on the new thread-local allocation stack.
+ success = self->PushOnThreadLocalAllocationStack(obj);
+ // Must succeed.
+ CHECK(success);
+ }
+ } else {
+ // This is safe to do since the GC will never free objects which are neither in the allocation
+ // stack nor the live bitmap.
+ while (!allocation_stack_->AtomicPushBack(obj)) {
+ CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+ }
+ }
+}
+
template <bool kInstrumented, typename PreFenceVisitor>
inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
size_t byte_count,
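
Heap::PushOnAllocationStack() is the glue: try the thread-local window first, and only when it is exhausted carve a new window out of the shared stack (in the real code a sticky GC is triggered when even that fails). Below is a condensed sketch of the same control flow, reusing the illustrative ThreadLocalWindow, SharedStack and BumpReserve helpers from the sketches above and reducing the GC step to a comment.

    constexpr size_t kWindowSlots = 128;  // mirrors kThreadLocalAllocationStackSize

    void PushOnAllocationStack(ThreadLocalWindow& tl, SharedStack& shared, Object* obj) {
      if (tl.TryPush(obj)) {
        return;  // common case: plain store, no atomics
      }
      // Slow path: reserve a fresh window out of the shared stack.
      Object** start = nullptr;
      Object** end = nullptr;
      if (!BumpReserve(shared.begin, shared.back_index, shared.capacity,
                       kWindowSlots, &start, &end)) {
        // ART runs a sticky GC here and retries the reservation; omitted.
        return;
      }
      tl.top = start;
      tl.end = end;
      bool ok = tl.TryPush(obj);  // must succeed on a fresh, empty window
      (void)ok;
    }
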
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 5c174f8..f1126ef 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -383,7 +383,8 @@ void Heap::VisitObjects(ObjectCallback callback, void* arg) {
mirror::Object* obj = *it;
if (obj != nullptr && obj->GetClass() != nullptr) {
// Avoid the race condition caused by the object not yet being written into the allocation
- // stack or the class not yet being written in the object.
+ // stack or the class not yet being written in the object. Or, if kUseThreadLocalAllocationStack,
+ // there can be nulls on the allocation stack.
callback(obj, arg);
}
}
@@ -1533,13 +1534,14 @@ void Heap::MarkAllocStack(accounting::SpaceBitmap* bitmap1,
mirror::Object** limit = stack->End();
for (mirror::Object** it = stack->Begin(); it != limit; ++it) {
const mirror::Object* obj = *it;
- DCHECK(obj != nullptr);
- if (bitmap1->HasAddress(obj)) {
- bitmap1->Set(obj);
- } else if (bitmap2->HasAddress(obj)) {
- bitmap2->Set(obj);
- } else {
- large_objects->Set(obj);
+ if (!kUseThreadLocalAllocationStack || obj != nullptr) {
+ if (bitmap1->HasAddress(obj)) {
+ bitmap1->Set(obj);
+ } else if (bitmap2->HasAddress(obj)) {
+ bitmap2->Set(obj);
+ } else {
+ large_objects->Set(obj);
+ }
}
}
}
@@ -2004,7 +2006,9 @@ bool Heap::VerifyMissingCardMarks() {
// We can verify objects in the live stack since none of these should reference dead objects.
for (mirror::Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
- visitor(*it);
+ if (!kUseThreadLocalAllocationStack || *it != nullptr) {
+ visitor(*it);
+ }
}
if (visitor.Failed()) {
@@ -2014,10 +2018,30 @@ bool Heap::VerifyMissingCardMarks() {
return true;
}
-void Heap::SwapStacks() {
+void Heap::SwapStacks(Thread* self) {
+ if (kUseThreadLocalAllocationStack) {
+ live_stack_->AssertAllZero();
+ }
allocation_stack_.swap(live_stack_);
}
+void Heap::RevokeAllThreadLocalAllocationStacks(Thread* self) {
+ if (!Runtime::Current()->IsStarted()) {
+ // There's no thread list if the runtime hasn't started (e.g.,
+ // dex2oat or a test). Just revoke for self.
+ self->RevokeThreadLocalAllocationStack();
+ return;
+ }
+ // This must be called only during the pause.
+ CHECK(Locks::mutator_lock_->IsExclusiveHeld(self));
+ MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+ MutexLock mu2(self, *Locks::thread_list_lock_);
+ std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+ for (Thread* t : thread_list) {
+ t->RevokeThreadLocalAllocationStack();
+ }
+}
+
accounting::ModUnionTable* Heap::FindModUnionTableFromSpace(space::Space* space) {
auto it = mod_union_tables_.find(space);
if (it == mod_union_tables_.end()) {
@@ -2072,12 +2096,12 @@ void Heap::PreGcVerification(collector::GarbageCollector* gc) {
thread_list->SuspendAll();
{
ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
- SwapStacks();
+ SwapStacks(self);
// Sort the live stack so that we can quickly binary search it later.
if (!VerifyMissingCardMarks()) {
LOG(FATAL) << "Pre " << gc->GetName() << " missing card mark verification failed";
}
- SwapStacks();
+ SwapStacks(self);
}
thread_list->ResumeAll();
}
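
RevokeAllThreadLocalAllocationStacks() runs during the pause (or, before the runtime has started, only for the calling thread) and walks the thread list under the runtime-shutdown and thread-list locks. Revoking a window just clears its pointers, so the next fast-path push fails the bounds check and the thread reserves a new window. In terms of the illustrative types above, with the locking omitted:

    #include <vector>

    // Clearing top/end makes ThreadLocalWindow::TryPush() return false,
    // forcing the thread's next allocation onto the slow path.
    void RevokeAll(std::vector<ThreadLocalWindow*>& windows) {
      for (ThreadLocalWindow* w : windows) {
        w->top = nullptr;
        w->end = nullptr;
      }
    }

This is also why Thread::RevokeThreadLocalAllocationStack() in thread-inl.h below only resets the two pointers and never touches the shared stack itself.
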
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index e416c0e..80a5a1a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -111,6 +111,9 @@ static constexpr HeapVerificationMode kDesiredHeapVerification = kNoHeapVerifica
// If true, use rosalloc/RosAllocSpace instead of dlmalloc/DlMallocSpace
static constexpr bool kUseRosAlloc = true;
+// If true, use thread-local allocation stack.
+static constexpr bool kUseThreadLocalAllocationStack = true;
+
// The process state passed in from the activity manager, used to determine when to do trimming
// and compaction.
enum ProcessState {
@@ -665,11 +668,17 @@ class Heap {
SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_);
// Swap the allocation stack with the live stack.
- void SwapStacks();
+ void SwapStacks(Thread* self);
+
+ // Revoke all the thread-local allocation stacks.
+ void RevokeAllThreadLocalAllocationStacks(Thread* self);
// Clear cards and update the mod union table.
void ProcessCards(TimingLogger& timings);
+ // Push an object onto the allocation stack.
+ void PushOnAllocationStack(Thread* self, mirror::Object* obj);
+
// All-known continuous spaces, where objects lie within fixed bounds.
std::vector<space::ContinuousSpace*> continuous_spaces_;
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 9420e7b..c0bf377 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -170,6 +170,42 @@ inline mirror::Object* Thread::AllocTlab(size_t bytes) {
return ret;
}
+inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
+ DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
+ if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+ // There's room.
+ DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
+ reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
+ DCHECK(*thread_local_alloc_stack_top_ == nullptr);
+ *thread_local_alloc_stack_top_ = obj;
+ ++thread_local_alloc_stack_top_;
+ return true;
+ }
+ return false;
+}
+
+inline void Thread::SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end) {
+ DCHECK(Thread::Current() == this) << "Should be called by self";
+ DCHECK(start != nullptr);
+ DCHECK(end != nullptr);
+ DCHECK_ALIGNED(start, sizeof(mirror::Object*));
+ DCHECK_ALIGNED(end, sizeof(mirror::Object*));
+ DCHECK_LT(start, end);
+ thread_local_alloc_stack_end_ = end;
+ thread_local_alloc_stack_top_ = start;
+}
+
+inline void Thread::RevokeThreadLocalAllocationStack() {
+ if (kIsDebugBuild) {
+ // Note: self is not necessarily equal to this thread since thread may be suspended.
+ Thread* self = Thread::Current();
+ DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
+ << GetState() << " thread " << this << " self " << self;
+ }
+ thread_local_alloc_stack_end_ = nullptr;
+ thread_local_alloc_stack_top_ = nullptr;
+}
+
} // namespace art
#endif // ART_RUNTIME_THREAD_INL_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 9797a48..3382811 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -963,7 +963,9 @@ Thread::Thread(bool daemon)
thread_local_start_(nullptr),
thread_local_pos_(nullptr),
thread_local_end_(nullptr),
- thread_local_objects_(0) {
+ thread_local_objects_(0),
+ thread_local_alloc_stack_top_(nullptr),
+ thread_local_alloc_stack_end_(nullptr) {
CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
state_and_flags_.as_struct.flags = 0;
state_and_flags_.as_struct.state = kNative;
diff --git a/runtime/thread.h b/runtime/thread.h
index a3a77bb..6c072ba 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -829,6 +829,19 @@ class PACKED(4) Thread {
static const size_t kRosAllocNumOfSizeBrackets = 34;
void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
+ // Thread-local allocation stack data/routines.
+ mirror::Object** thread_local_alloc_stack_top_;
+ mirror::Object** thread_local_alloc_stack_end_;
+
+ // Push an object onto the allocation stack.
+ bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+ // Set the thread local allocation pointers to the given pointers.
+ void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+ // Resets the thread local allocation pointers.
+ void RevokeThreadLocalAllocationStack();
+
private:
friend class Dbg; // For SetStateUnsafe.
friend class Monitor;