author     Mathieu Chartier <mathieuc@google.com>   2013-11-29 17:24:40 -0800
committer  Mathieu Chartier <mathieuc@google.com>   2013-12-16 16:57:37 -0800
commit     692fafd9778141fa6ef0048c9569abd7ee0253bf (patch)
tree       63ce2c7d4be6af2524a5f442195c8c8b6f5cc955
parent     07dc96d370c4844c7a279c01cedf24a272b9f4f3 (diff)
Thread local bump pointer allocator.
Added a thread local allocator to the heap; each thread has three pointers which specify the thread local buffer: start, cur, and end. When the remaining space in the thread local buffer isn't large enough for the allocation, the allocator allocates a new thread local buffer using the bump pointer allocator.

The bump pointer space had to be modified to accommodate thread local buffers. These buffers are called "blocks", where a block is a buffer which contains a set of adjacent objects. Blocks aren't necessarily full and may have wasted memory towards the end. Blocks have an 8 byte header which specifies their size and is required for traversing bump pointer spaces.

Memory usage is in between full bump pointer and ROSAlloc since madvised memory limits wasted RAM to an average of 1/2 page per block.

Added a runtime option -XX:UseTLAB which specifies whether or not to use the thread local allocator. It's a NOP if the garbage collector is not the semispace collector.

TODO: Smarter block accounting to prevent us from reading objects until we either hit the end of the block or GetClass() == null, which signifies that the block isn't 100% full. This would provide a slight speedup to BumpPointerSpace::Walk.

Timings (-XX:HeapMinFree=4m -XX:HeapMaxFree=8m -Xmx48m, ritzperf memalloc):
  Dalvik -Xgc:concurrent:    11678
  Dalvik -Xgc:noconcurrent:   6697
  -Xgc:MS:                    5978
  -Xgc:SS:                    4271
  -Xgc:CMS:                   4150
  -Xgc:SS -XX:UseTLAB:        3255

Bug: 9986565
Bug: 12042213

Change-Id: Ib7e1d4b199a8199f3b1de94b0a7b6e1730689cad
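The scheme above can be summarized with a short, self-contained sketch (illustrative names only, not the ART implementation; the 8-byte alignment and 256 KB default buffer size mirror kAlignment and kDefaultTLABSize from the patch): each thread keeps start, pos, and end pointers into its current buffer, and new buffers are carved out of a shared bump-pointer region behind a small block header that records the block size.

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>

constexpr size_t kAlignment = 8;
inline size_t RoundUp8(size_t n) { return (n + kAlignment - 1) & ~(kAlignment - 1); }

struct BlockHeader {
  size_t size;     // payload bytes in the block, excluding the header
  size_t padding;  // keeps the header a multiple of kAlignment
};

struct BumpRegion {
  uint8_t* begin;
  std::atomic<uint8_t*> end;  // current bump pointer
  uint8_t* limit;             // capacity limit of the region

  // Carve out header + payload with a CAS loop; nullptr if the region is full.
  uint8_t* AllocBlock(size_t payload) {
    const size_t total = sizeof(BlockHeader) + RoundUp8(payload);
    uint8_t* old_end = end.load(std::memory_order_relaxed);
    do {
      if (old_end + total > limit) {
        return nullptr;
      }
    } while (!end.compare_exchange_weak(old_end, old_end + total));
    reinterpret_cast<BlockHeader*>(old_end)->size = RoundUp8(payload);
    return old_end + sizeof(BlockHeader);
  }
};

struct ThreadTLAB {
  uint8_t* start = nullptr;
  uint8_t* pos = nullptr;
  uint8_t* end = nullptr;

  // Fast path: bump pos. Slow path: request a fresh block sized for the
  // allocation plus the default buffer size from the shared region.
  void* Alloc(BumpRegion* region, size_t bytes, size_t default_tlab_size = 256 * 1024) {
    bytes = RoundUp8(bytes);
    if (pos == nullptr || static_cast<size_t>(end - pos) < bytes) {
      uint8_t* block = region->AllocBlock(bytes + default_tlab_size);
      if (block == nullptr) {
        return nullptr;  // shared region exhausted; the caller would fall back to GC
      }
      start = pos = block;
      end = block + bytes + default_tlab_size;
    }
    void* ret = pos;
    pos += bytes;  // no atomics needed: this buffer belongs to a single thread
    return ret;
  }
};
```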
-rw-r--r--  runtime/arch/quick_alloc_entrypoints.S | 2
-rw-r--r--  runtime/arch/quick_alloc_entrypoints.cc | 5
-rw-r--r--  runtime/entrypoints/quick/quick_alloc_entrypoints.cc | 1
-rw-r--r--  runtime/gc/collector/garbage_collector.cc | 2
-rw-r--r--  runtime/gc/collector/semi_space.cc | 22
-rw-r--r--  runtime/gc/heap-inl.h | 41
-rw-r--r--  runtime/gc/heap.cc | 88
-rw-r--r--  runtime/gc/heap.h | 42
-rw-r--r--  runtime/gc/space/bump_pointer_space-inl.h | 17
-rw-r--r--  runtime/gc/space/bump_pointer_space.cc | 152
-rw-r--r--  runtime/gc/space/bump_pointer_space.h | 58
-rw-r--r--  runtime/gc/space/space.h | 4
-rw-r--r--  runtime/mirror/array-inl.h | 5
-rw-r--r--  runtime/mirror/class-inl.h | 4
-rw-r--r--  runtime/mirror/class.cc | 10
-rw-r--r--  runtime/native/dalvik_system_VMDebug.cc | 1
-rw-r--r--  runtime/root_visitor.h | 3
-rw-r--r--  runtime/runtime.cc | 8
-rw-r--r--  runtime/runtime.h | 1
-rw-r--r--  runtime/thread-inl.h | 12
-rw-r--r--  runtime/thread.cc | 14
-rw-r--r--  runtime/thread.h | 29
22 files changed, 360 insertions(+), 161 deletions(-)
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index bdadc51..2aa6716 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -36,4 +36,6 @@ GENERATE_ALLOC_ENTRYPOINTS
GENERATE_ALLOC_ENTRYPOINTS _instrumented, Instrumented
GENERATE_ALLOC_ENTRYPOINTS _bump_pointer, BumpPointer
GENERATE_ALLOC_ENTRYPOINTS _bump_pointer_instrumented, BumpPointerInstrumented
+GENERATE_ALLOC_ENTRYPOINTS _tlab, TLAB
+GENERATE_ALLOC_ENTRYPOINTS _tlab_instrumented, TLABInstrumented
.endm
diff --git a/runtime/arch/quick_alloc_entrypoints.cc b/runtime/arch/quick_alloc_entrypoints.cc
index 192b124..4cdb3f2 100644
--- a/runtime/arch/quick_alloc_entrypoints.cc
+++ b/runtime/arch/quick_alloc_entrypoints.cc
@@ -53,6 +53,7 @@ namespace art {
// Generate the entrypoint functions.
GENERATE_ENTRYPOINTS();
GENERATE_ENTRYPOINTS(_bump_pointer);
+GENERATE_ENTRYPOINTS(_tlab);
static bool entry_points_instrumented = false;
static gc::AllocatorType entry_points_allocator = kMovingCollector ?
@@ -76,6 +77,10 @@ void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) {
SetQuickAllocEntryPoints_bump_pointer(qpoints, entry_points_instrumented);
break;
}
+ case gc::kAllocatorTypeTLAB: {
+ SetQuickAllocEntryPoints_tlab(qpoints, entry_points_instrumented);
+ break;
+ }
default: {
LOG(FATAL) << "Unimplemented";
}
diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
index 9155088..1ae39ab 100644
--- a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
@@ -81,5 +81,6 @@ extern "C" mirror::Array* artCheckAndAllocArrayFromCodeWithAccessCheck##suffix##
GENERATE_ENTRYPOINTS_FOR_ALLOCATOR(, gc::kAllocatorTypeFreeList)
GENERATE_ENTRYPOINTS_FOR_ALLOCATOR(BumpPointer, gc::kAllocatorTypeBumpPointer)
+GENERATE_ENTRYPOINTS_FOR_ALLOCATOR(TLAB, gc::kAllocatorTypeTLAB)
} // namespace art
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index cf301fe..6baee54 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -83,9 +83,9 @@ void GarbageCollector::Run(bool clear_soft_references) {
uint64_t pause_start = NanoTime();
ATRACE_BEGIN("Application threads suspended");
thread_list->SuspendAll();
+ GetHeap()->RevokeAllThreadLocalBuffers();
MarkingPhase();
ReclaimPhase();
- GetHeap()->RevokeAllThreadLocalBuffers();
thread_list->ResumeAll();
ATRACE_END();
RegisterPause(NanoTime() - pause_start);
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 923560e..f29eadb 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -14,22 +14,6 @@
* limitations under the License.
*/
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
#include "semi_space.h"
#include <functional>
@@ -337,7 +321,7 @@ Object* SemiSpace::MarkObject(Object* obj) {
if (forward_address == nullptr) {
// Otherwise, we need to move the object and add it to the markstack for processing.
size_t object_size = obj->SizeOf();
- size_t dummy = 0;
+ size_t bytes_allocated = 0;
if (kEnableSimplePromo && reinterpret_cast<byte*>(obj) < last_gc_to_space_end_) {
// If it's allocated before the last GC (older), move (pseudo-promote) it to
// the non-moving space (as sort of an old generation.)
@@ -346,7 +330,7 @@ Object* SemiSpace::MarkObject(Object* obj) {
forward_address = non_moving_space->Alloc(self_, object_size, &bytes_promoted);
if (forward_address == nullptr) {
// If out of space, fall back to the to-space.
- forward_address = to_space_->Alloc(self_, object_size, &dummy);
+ forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated);
} else {
GetHeap()->num_bytes_allocated_.fetch_add(bytes_promoted);
bytes_promoted_ += bytes_promoted;
@@ -364,7 +348,7 @@ Object* SemiSpace::MarkObject(Object* obj) {
DCHECK(forward_address != nullptr);
} else {
// If it's allocated after the last GC (younger), copy it to the to-space.
- forward_address = to_space_->Alloc(self_, object_size, &dummy);
+ forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated);
}
// Copy over the object and add it to the mark stack since we still need to update it's
// references.
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 99f084a..9fb5760 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -32,7 +32,7 @@
namespace art {
namespace gc {
-template <bool kInstrumented, typename PreFenceVisitor>
+template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor>
inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Class* klass,
size_t byte_count, AllocatorType allocator,
const PreFenceVisitor& pre_fence_visitor) {
@@ -43,13 +43,13 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas
self->AssertThreadSuspensionIsAllowable();
// Need to check that we arent the large object allocator since the large object allocation code
// path this function. If we didn't check we would have an infinite loop.
- if (allocator != kAllocatorTypeLOS && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
+ if (kCheckLargeObject && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
return AllocLargeObject<kInstrumented, PreFenceVisitor>(self, klass, byte_count,
pre_fence_visitor);
}
mirror::Object* obj;
- size_t bytes_allocated;
AllocationTimer alloc_timer(this, &obj);
+ size_t bytes_allocated;
obj = TryToAllocate<kInstrumented, false>(self, allocator, byte_count, &bytes_allocated);
if (UNLIKELY(obj == nullptr)) {
obj = AllocateInternalWithGc(self, allocator, byte_count, &bytes_allocated, &klass);
@@ -89,7 +89,11 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas
} else {
DCHECK(!Dbg::IsAllocTrackingEnabled());
}
- if (concurrent_gc_) {
+ // concurrent_gc_ isn't known at compile time so we can optimize by not checking it for
+ // the BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be
+ // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant since
+ // the allocator_type should be constant propagated.
+ if (AllocatorMayHaveConcurrentGC(allocator) && concurrent_gc_) {
CheckConcurrentGC(self, new_num_bytes_allocated, obj);
}
if (kIsDebugBuild) {
@@ -105,15 +109,15 @@ template <bool kInstrumented, typename PreFenceVisitor>
inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
size_t byte_count,
const PreFenceVisitor& pre_fence_visitor) {
- return AllocObjectWithAllocator<kInstrumented, PreFenceVisitor>(self, klass, byte_count,
- kAllocatorTypeLOS,
- pre_fence_visitor);
+ return AllocObjectWithAllocator<kInstrumented, false, PreFenceVisitor>(self, klass, byte_count,
+ kAllocatorTypeLOS,
+ pre_fence_visitor);
}
template <const bool kInstrumented, const bool kGrow>
inline mirror::Object* Heap::TryToAllocate(Thread* self, AllocatorType allocator_type,
size_t alloc_size, size_t* bytes_allocated) {
- if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(alloc_size))) {
+ if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
return nullptr;
}
if (kInstrumented) {
@@ -153,6 +157,21 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, AllocatorType allocator
DCHECK(ret == nullptr || large_object_space_->Contains(ret));
break;
}
+ case kAllocatorTypeTLAB: {
+ alloc_size = RoundUp(alloc_size, space::BumpPointerSpace::kAlignment);
+ if (UNLIKELY(self->TLABSize() < alloc_size)) {
+ // Try allocating a new thread local buffer; if the allocation fails, the space must be
+ // full, so return nullptr.
+ if (!bump_pointer_space_->AllocNewTLAB(self, alloc_size + kDefaultTLABSize)) {
+ return nullptr;
+ }
+ }
+ // The allocation can't fail.
+ ret = self->AllocTLAB(alloc_size);
+ DCHECK(ret != nullptr);
+ *bytes_allocated = alloc_size;
+ break;
+ }
default: {
LOG(FATAL) << "Invalid allocator type";
ret = nullptr;
@@ -194,14 +213,14 @@ inline bool Heap::ShouldAllocLargeObject(mirror::Class* c, size_t byte_count) co
return byte_count >= kLargeObjectThreshold && have_zygote_space_ && c->IsPrimitiveArray();
}
-template <const bool kGrow>
-inline bool Heap::IsOutOfMemoryOnAllocation(size_t alloc_size) {
+template <bool kGrow>
+inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) {
size_t new_footprint = num_bytes_allocated_ + alloc_size;
if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
if (UNLIKELY(new_footprint > growth_limit_)) {
return true;
}
- if (!concurrent_gc_) {
+ if (!AllocatorMayHaveConcurrentGC(allocator_type) || !concurrent_gc_) {
if (!kGrow) {
return true;
}
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 11acd33..76a8e79 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -77,7 +77,7 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max
double target_utilization, size_t capacity, const std::string& image_file_name,
CollectorType post_zygote_collector_type, size_t parallel_gc_threads,
size_t conc_gc_threads, bool low_memory_mode, size_t long_pause_log_threshold,
- size_t long_gc_log_threshold, bool ignore_max_footprint)
+ size_t long_gc_log_threshold, bool ignore_max_footprint, bool use_tlab)
: non_moving_space_(nullptr),
concurrent_gc_(false),
collector_type_(kCollectorTypeNone),
@@ -103,11 +103,6 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max
native_footprint_gc_watermark_(initial_size),
native_footprint_limit_(2 * initial_size),
native_need_to_run_finalization_(false),
- activity_thread_class_(NULL),
- application_thread_class_(NULL),
- activity_thread_(NULL),
- application_thread_(NULL),
- last_process_state_id_(NULL),
// Initially assume we perceive jank in case the process state is never updated.
process_state_(kProcessStateJankPerceptible),
concurrent_start_bytes_(std::numeric_limits<size_t>::max()),
@@ -148,7 +143,8 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max
total_allocation_time_(0),
verify_object_mode_(kHeapVerificationNotPermitted),
gc_disable_count_(0),
- running_on_valgrind_(RUNNING_ON_VALGRIND) {
+ running_on_valgrind_(RUNNING_ON_VALGRIND),
+ use_tlab_(use_tlab) {
if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
LOG(INFO) << "Heap() entering";
}
@@ -337,36 +333,21 @@ void Heap::CreateThreadPool() {
}
void Heap::VisitObjects(ObjectVisitorCallback callback, void* arg) {
- // Visit objects in bump pointer space.
Thread* self = Thread::Current();
- // TODO: Use reference block.
- std::vector<SirtRef<mirror::Object>*> saved_refs;
+ // GCs can move objects, so don't allow this.
+ const char* old_cause = self->StartAssertNoThreadSuspension("Visiting objects");
if (bump_pointer_space_ != nullptr) {
- // Need to put all these in sirts since the callback may trigger a GC. TODO: Use a better data
- // structure.
- mirror::Object* obj = reinterpret_cast<mirror::Object*>(bump_pointer_space_->Begin());
- const mirror::Object* end = reinterpret_cast<const mirror::Object*>(
- bump_pointer_space_->End());
- while (obj < end) {
- saved_refs.push_back(new SirtRef<mirror::Object>(self, obj));
- obj = space::BumpPointerSpace::GetNextObject(obj);
- }
+ // Visit objects in bump pointer space.
+ bump_pointer_space_->Walk(callback, arg);
}
// TODO: Switch to standard begin and end to use ranged a based loop.
for (mirror::Object** it = allocation_stack_->Begin(), **end = allocation_stack_->End();
it < end; ++it) {
mirror::Object* obj = *it;
- // Objects in the allocation stack might be in a movable space.
- saved_refs.push_back(new SirtRef<mirror::Object>(self, obj));
+ callback(obj, arg);
}
GetLiveBitmap()->Walk(callback, arg);
- for (const auto& ref : saved_refs) {
- callback(ref->get(), arg);
- }
- // Need to free the sirts in reverse order they were allocated.
- for (size_t i = saved_refs.size(); i != 0; --i) {
- delete saved_refs[i - 1];
- }
+ self->EndAssertNoThreadSuspension(old_cause);
}
void Heap::MarkAllocStackAsLive(accounting::ObjectStack* stack) {
@@ -471,8 +452,6 @@ void Heap::DumpGcPerformanceInfo(std::ostream& os) {
}
}
uint64_t allocation_time = static_cast<uint64_t>(total_allocation_time_) * kTimeAdjust;
- size_t total_objects_allocated = GetObjectsAllocatedEver();
- size_t total_bytes_allocated = GetBytesAllocatedEver();
if (total_duration != 0) {
const double total_seconds = static_cast<double>(total_duration / 1000) / 1000000.0;
os << "Total time spent in GC: " << PrettyDuration(total_duration) << "\n";
@@ -481,7 +460,9 @@ void Heap::DumpGcPerformanceInfo(std::ostream& os) {
os << "Mean GC object throughput: "
<< (GetObjectsFreedEver() / total_seconds) << " objects/s\n";
}
+ size_t total_objects_allocated = GetObjectsAllocatedEver();
os << "Total number of allocations: " << total_objects_allocated << "\n";
+ size_t total_bytes_allocated = GetBytesAllocatedEver();
os << "Total bytes allocated " << PrettySize(total_bytes_allocated) << "\n";
if (kMeasureAllocationTime) {
os << "Total time spent allocating: " << PrettyDuration(allocation_time) << "\n";
@@ -698,7 +679,7 @@ void Heap::Trim() {
}
}
total_alloc_space_allocated = GetBytesAllocated() - large_object_space_->GetBytesAllocated() -
- bump_pointer_space_->GetBytesAllocated();
+ bump_pointer_space_->Size();
const float managed_utilization = static_cast<float>(total_alloc_space_allocated) /
static_cast<float>(total_alloc_space_size);
uint64_t gc_heap_end_ns = NanoTime();
@@ -867,12 +848,10 @@ void Heap::VerifyHeap() {
void Heap::RecordFree(size_t freed_objects, size_t freed_bytes) {
DCHECK_LE(freed_bytes, static_cast<size_t>(num_bytes_allocated_));
num_bytes_allocated_.fetch_sub(freed_bytes);
-
if (Runtime::Current()->HasStatsEnabled()) {
RuntimeStats* thread_stats = Thread::Current()->GetStats();
thread_stats->freed_objects += freed_objects;
thread_stats->freed_bytes += freed_bytes;
-
// TODO: Do this concurrently.
RuntimeStats* global_stats = Runtime::Current()->GetStats();
global_stats->freed_objects += freed_objects;
@@ -945,19 +924,11 @@ size_t Heap::GetObjectsAllocated() const {
}
size_t Heap::GetObjectsAllocatedEver() const {
- size_t total = 0;
- for (space::AllocSpace* space : alloc_spaces_) {
- total += space->GetTotalObjectsAllocated();
- }
- return total;
+ return GetObjectsFreedEver() + GetObjectsAllocated();
}
size_t Heap::GetBytesAllocatedEver() const {
- size_t total = 0;
- for (space::AllocSpace* space : alloc_spaces_) {
- total += space->GetTotalBytesAllocated();
- }
- return total;
+ return GetBytesFreedEver() + GetBytesAllocated();
}
class InstanceCounter {
@@ -1102,7 +1073,11 @@ void Heap::ChangeCollector(CollectorType collector_type) {
case kCollectorTypeSS: {
concurrent_gc_ = false;
gc_plan_.push_back(collector::kGcTypeFull);
- ChangeAllocator(kAllocatorTypeBumpPointer);
+ if (use_tlab_) {
+ ChangeAllocator(kAllocatorTypeTLAB);
+ } else {
+ ChangeAllocator(kAllocatorTypeBumpPointer);
+ }
break;
}
case kCollectorTypeMS: {
@@ -1134,6 +1109,10 @@ void Heap::ChangeCollector(CollectorType collector_type) {
}
}
+static void MarkInBitmapCallback(mirror::Object* obj, void* arg) {
+ reinterpret_cast<accounting::SpaceBitmap*>(arg)->Set(obj);
+}
+
void Heap::PreZygoteFork() {
static Mutex zygote_creation_lock_("zygote creation lock", kZygoteCreationLock);
Thread* self = Thread::Current();
@@ -1158,7 +1137,7 @@ void Heap::PreZygoteFork() {
// Compact the bump pointer space to a new zygote bump pointer space.
temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
Compact(&target_space, bump_pointer_space_);
- CHECK_EQ(temp_space_->GetBytesAllocated(), 0U);
+ CHECK(temp_space_->IsEmpty());
total_objects_freed_ever_ += semi_space_collector_->GetFreedObjects();
total_bytes_freed_ever_ += semi_space_collector_->GetFreedBytes();
// Update the end and write out image.
@@ -1167,12 +1146,7 @@ void Heap::PreZygoteFork() {
accounting::SpaceBitmap* bitmap = non_moving_space_->GetLiveBitmap();
// Record the allocations in the bitmap.
VLOG(heap) << "Recording zygote allocations";
- mirror::Object* obj = reinterpret_cast<mirror::Object*>(target_space.Begin());
- const mirror::Object* end = reinterpret_cast<const mirror::Object*>(target_space.End());
- while (obj < end) {
- bitmap->Set(obj);
- obj = space::BumpPointerSpace::GetNextObject(obj);
- }
+ target_space.Walk(MarkInBitmapCallback, bitmap);
}
// Turn the current alloc space into a zygote space and obtain the new alloc space composed of
// the remaining available heap memory.
@@ -1305,9 +1279,11 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, GcCaus
collector::GarbageCollector* collector = nullptr;
// TODO: Clean this up.
- if (current_allocator_ == kAllocatorTypeBumpPointer) {
+ if (collector_type_ == kCollectorTypeSS) {
+ DCHECK(current_allocator_ == kAllocatorTypeBumpPointer ||
+ current_allocator_ == kAllocatorTypeTLAB);
gc_type = semi_space_collector_->GetGcType();
- CHECK_EQ(temp_space_->GetObjectsAllocated(), 0U);
+ CHECK(temp_space_->IsEmpty());
semi_space_collector_->SetFromSpace(bump_pointer_space_);
semi_space_collector_->SetToSpace(temp_space_);
mprotect(temp_space_->Begin(), temp_space_->Capacity(), PROT_READ | PROT_WRITE);
@@ -2070,10 +2046,16 @@ void Heap::RequestHeapTrim() {
void Heap::RevokeThreadLocalBuffers(Thread* thread) {
non_moving_space_->RevokeThreadLocalBuffers(thread);
+ if (bump_pointer_space_ != nullptr) {
+ bump_pointer_space_->RevokeThreadLocalBuffers(thread);
+ }
}
void Heap::RevokeAllThreadLocalBuffers() {
non_moving_space_->RevokeAllThreadLocalBuffers();
+ if (bump_pointer_space_ != nullptr) {
+ bump_pointer_space_->RevokeAllThreadLocalBuffers();
+ }
}
bool Heap::IsGCRequestPending() const {
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 9788064..832d5ec 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -91,6 +91,7 @@ class AgeCardVisitor {
// Different types of allocators.
enum AllocatorType {
kAllocatorTypeBumpPointer,
+ kAllocatorTypeTLAB,
kAllocatorTypeFreeList, // ROSAlloc / dlmalloc
kAllocatorTypeLOS, // Large object space.
};
@@ -139,6 +140,7 @@ class Heap {
static constexpr size_t kDefaultMinFree = kDefaultMaxFree / 4;
static constexpr size_t kDefaultLongPauseLogThreshold = MsToNs(5);
static constexpr size_t kDefaultLongGCLogThreshold = MsToNs(100);
+ static constexpr size_t kDefaultTLABSize = 256 * KB;
// Default target utilization.
static constexpr double kDefaultTargetUtilization = 0.5;
@@ -154,24 +156,25 @@ class Heap {
const std::string& original_image_file_name, CollectorType collector_type_,
size_t parallel_gc_threads, size_t conc_gc_threads, bool low_memory_mode,
size_t long_pause_threshold, size_t long_gc_threshold,
- bool ignore_max_footprint);
+ bool ignore_max_footprint, bool use_tlab);
~Heap();
// Allocates and initializes storage for an object instance.
- template <const bool kInstrumented>
+ template <bool kInstrumented>
inline mirror::Object* AllocObject(Thread* self, mirror::Class* klass, size_t num_bytes)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
- return AllocObjectWithAllocator<kInstrumented>(self, klass, num_bytes, GetCurrentAllocator());
+ return AllocObjectWithAllocator<kInstrumented, true>(self, klass, num_bytes,
+ GetCurrentAllocator());
}
- template <const bool kInstrumented>
+ template <bool kInstrumented>
inline mirror::Object* AllocNonMovableObject(Thread* self, mirror::Class* klass,
size_t num_bytes)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
- return AllocObjectWithAllocator<kInstrumented>(self, klass, num_bytes,
- GetCurrentNonMovingAllocator());
+ return AllocObjectWithAllocator<kInstrumented, true>(self, klass, num_bytes,
+ GetCurrentNonMovingAllocator());
}
- template <bool kInstrumented, typename PreFenceVisitor = VoidFunctor>
+ template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor = VoidFunctor>
ALWAYS_INLINE mirror::Object* AllocObjectWithAllocator(
Thread* self, mirror::Class* klass, size_t byte_count, AllocatorType allocator,
const PreFenceVisitor& pre_fence_visitor = VoidFunctor())
@@ -507,17 +510,19 @@ class Heap {
void Compact(space::ContinuousMemMapAllocSpace* target_space,
space::ContinuousMemMapAllocSpace* source_space);
- static bool AllocatorHasAllocationStack(AllocatorType allocator_type) {
- return allocator_type != kAllocatorTypeBumpPointer;
+ static ALWAYS_INLINE bool AllocatorHasAllocationStack(AllocatorType allocator_type) {
+ return
+ allocator_type != kAllocatorTypeBumpPointer &&
+ allocator_type != kAllocatorTypeTLAB;
}
- static bool AllocatorHasConcurrentGC(AllocatorType allocator_type) {
- return allocator_type != kAllocatorTypeBumpPointer;
+ static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) {
+ return AllocatorHasAllocationStack(allocator_type);
}
bool ShouldAllocLargeObject(mirror::Class* c, size_t byte_count) const;
ALWAYS_INLINE void CheckConcurrentGC(Thread* self, size_t new_num_bytes_allocated,
mirror::Object* obj);
- // We don't force this to be inline since it is a slow path.
+ // We don't force this to be inlined since it is a slow path.
template <bool kInstrumented, typename PreFenceVisitor>
mirror::Object* AllocLargeObject(Thread* self, mirror::Class* klass, size_t byte_count,
const PreFenceVisitor& pre_fence_visitor)
@@ -544,8 +549,9 @@ class Heap {
void ThrowOutOfMemoryError(Thread* self, size_t byte_count, bool large_object_allocation)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- template <const bool kGrow>
- bool IsOutOfMemoryOnAllocation(size_t alloc_size);
+
+ template <bool kGrow>
+ bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size);
// Pushes a list of cleared references out to the managed heap.
void SetReferenceReferent(mirror::Object* reference, mirror::Object* referent)
@@ -721,13 +727,6 @@ class Heap {
// Whether or not we need to run finalizers in the next native allocation.
bool native_need_to_run_finalization_;
- // Activity manager members.
- jclass activity_thread_class_;
- jclass application_thread_class_;
- jobject activity_thread_;
- jobject application_thread_;
- jfieldID last_process_state_id_;
-
// Whether or not we currently care about pause times.
ProcessState process_state_;
@@ -845,6 +844,7 @@ class Heap {
collector::SemiSpace* semi_space_collector_;
const bool running_on_valgrind_;
+ const bool use_tlab_;
friend class collector::MarkSweep;
friend class collector::SemiSpace;
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 85ef2f4..82e96a4 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -23,8 +23,8 @@ namespace art {
namespace gc {
namespace space {
-inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) {
- num_bytes = RoundUp(num_bytes, kAlignment);
+inline mirror::Object* BumpPointerSpace::AllocNonvirtualWithoutAccounting(size_t num_bytes) {
+ DCHECK(IsAligned<kAlignment>(num_bytes));
byte* old_end;
byte* new_end;
do {
@@ -38,13 +38,18 @@ inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) {
} while (android_atomic_cas(reinterpret_cast<int32_t>(old_end),
reinterpret_cast<int32_t>(new_end),
reinterpret_cast<volatile int32_t*>(&end_)) != 0);
- // TODO: Less statistics?
- total_bytes_allocated_.fetch_add(num_bytes);
- num_objects_allocated_.fetch_add(1);
- total_objects_allocated_.fetch_add(1);
return reinterpret_cast<mirror::Object*>(old_end);
}
+inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) {
+ mirror::Object* ret = AllocNonvirtualWithoutAccounting(num_bytes);
+ if (ret != nullptr) {
+ objects_allocated_.fetch_add(1);
+ bytes_allocated_.fetch_add(num_bytes);
+ }
+ return ret;
+}
+
} // namespace space
} // namespace gc
} // namespace art
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 06ba57e..7ea202c 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -18,6 +18,7 @@
#include "bump_pointer_space-inl.h"
#include "mirror/object-inl.h"
#include "mirror/class-inl.h"
+#include "thread_list.h"
namespace art {
namespace gc {
@@ -40,18 +41,27 @@ BumpPointerSpace* BumpPointerSpace::Create(const std::string& name, size_t capac
BumpPointerSpace::BumpPointerSpace(const std::string& name, byte* begin, byte* limit)
: ContinuousMemMapAllocSpace(name, nullptr, begin, begin, limit,
kGcRetentionPolicyAlwaysCollect),
- num_objects_allocated_(0), total_bytes_allocated_(0), total_objects_allocated_(0),
- growth_end_(limit) {
+ growth_end_(limit),
+ objects_allocated_(0), bytes_allocated_(0),
+ block_lock_("Block lock"),
+ num_blocks_(0) {
+ CHECK_GE(Capacity(), sizeof(BlockHeader));
+ end_ += sizeof(BlockHeader);
}
BumpPointerSpace::BumpPointerSpace(const std::string& name, MemMap* mem_map)
: ContinuousMemMapAllocSpace(name, mem_map, mem_map->Begin(), mem_map->Begin(), mem_map->End(),
kGcRetentionPolicyAlwaysCollect),
- num_objects_allocated_(0), total_bytes_allocated_(0), total_objects_allocated_(0),
- growth_end_(mem_map->End()) {
+ growth_end_(mem_map->End()),
+ objects_allocated_(0), bytes_allocated_(0),
+ block_lock_("Block lock"),
+ num_blocks_(0) {
+ CHECK_GE(Capacity(), sizeof(BlockHeader));
+ end_ += sizeof(BlockHeader);
}
mirror::Object* BumpPointerSpace::Alloc(Thread*, size_t num_bytes, size_t* bytes_allocated) {
+ num_bytes = RoundUp(num_bytes, kAlignment);
mirror::Object* ret = AllocNonvirtual(num_bytes);
if (LIKELY(ret != nullptr)) {
*bytes_allocated = num_bytes;
@@ -68,9 +78,14 @@ void BumpPointerSpace::Clear() {
CHECK_NE(madvise(Begin(), Limit() - Begin(), MADV_DONTNEED), -1) << "madvise failed";
// Reset the end of the space back to the beginning, we move the end forward as we allocate
// objects.
- SetEnd(Begin());
+ SetEnd(Begin() + sizeof(BlockHeader));
+ objects_allocated_ = 0;
+ bytes_allocated_ = 0;
growth_end_ = Limit();
- num_objects_allocated_ = 0;
+ {
+ MutexLock mu(Thread::Current(), block_lock_);
+ num_blocks_ = 0;
+ }
}
void BumpPointerSpace::Dump(std::ostream& os) const {
@@ -83,6 +98,131 @@ mirror::Object* BumpPointerSpace::GetNextObject(mirror::Object* obj) {
return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment));
}
+void BumpPointerSpace::RevokeThreadLocalBuffers(Thread* thread) {
+ MutexLock mu(Thread::Current(), block_lock_);
+ RevokeThreadLocalBuffersLocked(thread);
+}
+
+void BumpPointerSpace::RevokeAllThreadLocalBuffers() {
+ Thread* self = Thread::Current();
+ MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+ MutexLock mu2(self, *Locks::thread_list_lock_);
+ // TODO: Not do a copy of the thread list?
+ std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+ for (Thread* thread : thread_list) {
+ RevokeThreadLocalBuffers(thread);
+ }
+}
+
+void BumpPointerSpace::UpdateMainBlock() {
+ BlockHeader* header = reinterpret_cast<BlockHeader*>(Begin());
+ header->size_ = Size() - sizeof(BlockHeader);
+ DCHECK_EQ(num_blocks_, 0U);
+}
+
+// Returns the start of the storage.
+byte* BumpPointerSpace::AllocBlock(size_t bytes) {
+ bytes = RoundUp(bytes, kAlignment);
+ if (!num_blocks_) {
+ UpdateMainBlock();
+ }
+ byte* storage = reinterpret_cast<byte*>(
+ AllocNonvirtualWithoutAccounting(bytes + sizeof(BlockHeader)));
+ if (LIKELY(storage != nullptr)) {
+ BlockHeader* header = reinterpret_cast<BlockHeader*>(storage);
+ header->size_ = bytes; // Write out the block header.
+ storage += sizeof(BlockHeader);
+ ++num_blocks_;
+ }
+ return storage;
+}
+
+void BumpPointerSpace::Walk(ObjectVisitorCallback callback, void* arg) {
+ byte* pos = Begin();
+
+ {
+ MutexLock mu(Thread::Current(), block_lock_);
+ // If we have 0 blocks then we need to update the main header since we have bump pointer style
+ // allocation into an unbounded region (actually bounded by Capacity()).
+ if (num_blocks_ == 0) {
+ UpdateMainBlock();
+ }
+ }
+
+ while (pos < End()) {
+ BlockHeader* header = reinterpret_cast<BlockHeader*>(pos);
+ size_t block_size = header->size_;
+ pos += sizeof(BlockHeader); // Skip the header so that we know where the objects start.
+ mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
+ const mirror::Object* end = reinterpret_cast<const mirror::Object*>(pos + block_size);
+ CHECK_LE(reinterpret_cast<const byte*>(end), End());
+ // We don't know how many objects are allocated in the current block. When we hit a null class
+ // assume it's the end. TODO: Have a thread update the header when it flushes the block?
+ while (obj < end && obj->GetClass() != nullptr) {
+ callback(obj, arg);
+ obj = GetNextObject(obj);
+ }
+ pos += block_size;
+ }
+}
+
+bool BumpPointerSpace::IsEmpty() const {
+ return Size() == sizeof(BlockHeader);
+}
+
+uint64_t BumpPointerSpace::GetBytesAllocated() {
+ // Start out with the pre-determined amount (blocks which are not being allocated into).
+ uint64_t total = static_cast<uint64_t>(bytes_allocated_.load());
+ Thread* self = Thread::Current();
+ MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+ MutexLock mu2(self, *Locks::thread_list_lock_);
+ std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+ MutexLock mu3(Thread::Current(), block_lock_);
+ // If we don't have any blocks, we don't have any thread local buffers. This check is required
+ // since multiple bump pointer spaces can exist at the same time.
+ if (num_blocks_ > 0) {
+ for (Thread* thread : thread_list) {
+ total += thread->thread_local_pos_ - thread->thread_local_start_;
+ }
+ }
+ return total;
+}
+
+uint64_t BumpPointerSpace::GetObjectsAllocated() {
+ // Start out with the pre-determined amount (blocks which are not being allocated into).
+ uint64_t total = static_cast<uint64_t>(objects_allocated_.load());
+ Thread* self = Thread::Current();
+ MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+ MutexLock mu2(self, *Locks::thread_list_lock_);
+ std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+ MutexLock mu3(Thread::Current(), block_lock_);
+ // If we don't have any blocks, we don't have any thread local buffers. This check is required
+ // since multiple bump pointer spaces can exist at the same time.
+ if (num_blocks_ > 0) {
+ for (Thread* thread : thread_list) {
+ total += thread->thread_local_objects_;
+ }
+ }
+ return total;
+}
+
+void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
+ objects_allocated_.fetch_add(thread->thread_local_objects_);
+ bytes_allocated_.fetch_add(thread->thread_local_pos_ - thread->thread_local_start_);
+ thread->SetTLAB(nullptr, nullptr);
+}
+
+bool BumpPointerSpace::AllocNewTLAB(Thread* self, size_t bytes) {
+ MutexLock mu(Thread::Current(), block_lock_);
+ RevokeThreadLocalBuffersLocked(self);
+ byte* start = AllocBlock(bytes);
+ if (start == nullptr) {
+ return false;
+ }
+ self->SetTLAB(start, start + bytes);
+ return true;
+}
+
} // namespace space
} // namespace gc
} // namespace art
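Walk() above is the piece that depends on the 8-byte block headers described in the commit message. A simplified sketch of the traversal, under an assumed stand-in object model (the first word of an object is its class pointer, and a caller-supplied size function plays the role of SizeOf()/GetNextObject()); this is not ART code:

```cpp
#include <cstddef>
#include <cstdint>

struct BlockHeader {
  size_t size;     // payload bytes in the block, excluding the header
  size_t padding;  // keeps the header 8-byte aligned
};

using VisitFn = void (*)(void* obj, void* arg);
using SizeOfFn = size_t (*)(const void* obj);

void WalkBlocks(uint8_t* begin, uint8_t* end, SizeOfFn size_of, VisitFn visit, void* arg) {
  uint8_t* pos = begin;
  while (pos < end) {
    const BlockHeader* header = reinterpret_cast<const BlockHeader*>(pos);
    const size_t block_size = header->size;
    pos += sizeof(BlockHeader);            // skip to the first object in the block
    uint8_t* obj = pos;
    uint8_t* const block_end = pos + block_size;
    // A block may not be 100% full: a zero class word marks the unused
    // (madvised) tail, so stop at the first null class pointer.
    while (obj < block_end && *reinterpret_cast<const uintptr_t*>(obj) != 0) {
      visit(obj, arg);
      const size_t object_size = size_of(obj);
      obj += (object_size + 7) & ~static_cast<size_t>(7);  // next object, 8-byte aligned
    }
    pos = block_end;                       // jump to the next block header
  }
}
```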
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 2edd3e2..0a4be8a 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -17,6 +17,7 @@
#ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
#define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
+#include "root_visitor.h"
#include "space.h"
namespace art {
@@ -45,12 +46,13 @@ class BumpPointerSpace : public ContinuousMemMapAllocSpace {
// Allocate num_bytes, returns nullptr if the space is full.
virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated);
mirror::Object* AllocNonvirtual(size_t num_bytes);
+ mirror::Object* AllocNonvirtualWithoutAccounting(size_t num_bytes);
// Return the storage space required by obj.
virtual size_t AllocationSize(const mirror::Object* obj)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- // Nos unless we support free lists.
+ // NOPS unless we support free lists.
virtual size_t Free(Thread*, mirror::Object*) {
return 0;
}
@@ -92,21 +94,12 @@ class BumpPointerSpace : public ContinuousMemMapAllocSpace {
void Dump(std::ostream& os) const;
- uint64_t GetBytesAllocated() {
- return Size();
- }
-
- uint64_t GetObjectsAllocated() {
- return num_objects_allocated_;
- }
+ void RevokeThreadLocalBuffers(Thread* thread);
+ void RevokeAllThreadLocalBuffers();
- uint64_t GetTotalBytesAllocated() {
- return total_bytes_allocated_;
- }
-
- uint64_t GetTotalObjectsAllocated() {
- return total_objects_allocated_;
- }
+ uint64_t GetBytesAllocated() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ uint64_t GetObjectsAllocated() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ bool IsEmpty() const;
bool Contains(const mirror::Object* obj) const {
const byte* byte_obj = reinterpret_cast<const byte*>(obj);
@@ -120,28 +113,55 @@ class BumpPointerSpace : public ContinuousMemMapAllocSpace {
static mirror::Object* GetNextObject(mirror::Object* obj)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ // Allocate a new TLAB, returns false if the allocation failed.
+ bool AllocNewTLAB(Thread* self, size_t bytes);
+
virtual BumpPointerSpace* AsBumpPointerSpace() {
return this;
}
+ // Go through all of the blocks and visit the continuous objects.
+ void Walk(ObjectVisitorCallback callback, void* arg)
+ SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
// Object alignment within the space.
static constexpr size_t kAlignment = 8;
protected:
BumpPointerSpace(const std::string& name, MemMap* mem_map);
+ // Allocate a raw block of bytes.
+ byte* AllocBlock(size_t bytes) EXCLUSIVE_LOCKS_REQUIRED(block_lock_);
+ void RevokeThreadLocalBuffersLocked(Thread* thread) EXCLUSIVE_LOCKS_REQUIRED(block_lock_);
+
size_t InternalAllocationSize(const mirror::Object* obj);
mirror::Object* AllocWithoutGrowthLocked(size_t num_bytes, size_t* bytes_allocated)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
- // Approximate number of bytes which have been allocated into the space.
- AtomicInteger num_objects_allocated_;
- AtomicInteger total_bytes_allocated_;
- AtomicInteger total_objects_allocated_;
+ // The main block is an unbounded block where objects go when there are no other blocks. This
+ // enables us to maintain tightly packed objects when you are not using thread local buffers for
+ // allocation.
+ // The main block is also the block which starts at address 0.
+ void UpdateMainBlock() EXCLUSIVE_LOCKS_REQUIRED(block_lock_);
byte* growth_end_;
+ AtomicInteger objects_allocated_; // Accumulated from revoked thread local regions.
+ AtomicInteger bytes_allocated_; // Accumulated from revoked thread local regions.
+ Mutex block_lock_;
+
+ // The number of blocks in the space, if it is 0 then the space has one long continuous block
+ // which doesn't have an updated header.
+ size_t num_blocks_ GUARDED_BY(block_lock_);
private:
+ struct BlockHeader {
+ size_t size_; // Size of the block in bytes, does not include the header.
+ size_t unused_; // Ensures alignment of kAlignment.
+ };
+
+ COMPILE_ASSERT(sizeof(BlockHeader) % kAlignment == 0,
+ continuous_block_must_be_kAlignment_aligned);
+
friend class collector::MarkSweep;
DISALLOW_COPY_AND_ASSIGN(BumpPointerSpace);
};
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index ca39175..db3aca9 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -198,10 +198,6 @@ class AllocSpace {
virtual uint64_t GetBytesAllocated() = 0;
// Number of objects currently allocated.
virtual uint64_t GetObjectsAllocated() = 0;
- // Number of bytes allocated since the space was created.
- virtual uint64_t GetTotalBytesAllocated() = 0;
- // Number of objects allocated since the space was created.
- virtual uint64_t GetTotalObjectsAllocated() = 0;
// Allocate num_bytes without allowing growth. If the allocation
// succeeds, the output parameter bytes_allocated will be set to the
diff --git a/runtime/mirror/array-inl.h b/runtime/mirror/array-inl.h
index cf4b48c..bd81bd5 100644
--- a/runtime/mirror/array-inl.h
+++ b/runtime/mirror/array-inl.h
@@ -83,9 +83,10 @@ inline Array* Array::Alloc(Thread* self, Class* array_class, int32_t component_c
}
gc::Heap* heap = Runtime::Current()->GetHeap();
SetLengthVisitor visitor(component_count);
+ DCHECK(allocator_type != gc::kAllocatorTypeLOS);
return down_cast<Array*>(
- heap->AllocObjectWithAllocator<kIsInstrumented>(self, array_class, size, allocator_type,
- visitor));
+ heap->AllocObjectWithAllocator<kIsInstrumented, true>(self, array_class, size,
+ allocator_type, visitor));
}
template <bool kIsInstrumented>
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 3a28974..e0fab8c 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -361,8 +361,8 @@ template <bool kIsInstrumented>
inline Object* Class::Alloc(Thread* self, gc::AllocatorType allocator_type) {
CheckObjectAlloc();
gc::Heap* heap = Runtime::Current()->GetHeap();
- return heap->AllocObjectWithAllocator<kIsInstrumented>(self, this, this->object_size_,
- allocator_type);
+ return heap->AllocObjectWithAllocator<kIsInstrumented, false>(self, this, this->object_size_,
+ allocator_type);
}
inline Object* Class::AllocObject(Thread* self) {
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 2746e1e..bd965fa 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -139,9 +139,11 @@ void Class::SetClassSize(size_t new_class_size) {
// slashes (so "java.lang.String" but "[Ljava.lang.String;"). Madness.
String* Class::ComputeName() {
String* name = GetName();
- if (name != NULL) {
+ if (name != nullptr) {
return name;
}
+ Thread* self = Thread::Current();
+ SirtRef<mirror::Class> sirt_c(self, this);
std::string descriptor(ClassHelper(this).GetDescriptor());
if ((descriptor[0] != 'L') && (descriptor[0] != '[')) {
// The descriptor indicates that this is the class for
@@ -160,7 +162,7 @@ String* Class::ComputeName() {
default:
LOG(FATAL) << "Unknown primitive type: " << PrintableChar(descriptor[0]);
}
- name = String::AllocFromModifiedUtf8(Thread::Current(), c_name);
+ name = String::AllocFromModifiedUtf8(self, c_name);
} else {
// Convert the UTF-8 name to a java.lang.String. The name must use '.' to separate package
// components.
@@ -169,9 +171,9 @@ String* Class::ComputeName() {
descriptor.erase(descriptor.size() - 1);
}
std::replace(descriptor.begin(), descriptor.end(), '/', '.');
- name = String::AllocFromModifiedUtf8(Thread::Current(), descriptor.c_str());
+ name = String::AllocFromModifiedUtf8(self, descriptor.c_str());
}
- SetName(name);
+ sirt_c->SetName(name);
return name;
}
diff --git a/runtime/native/dalvik_system_VMDebug.cc b/runtime/native/dalvik_system_VMDebug.cc
index 67c4505..6a04c3a 100644
--- a/runtime/native/dalvik_system_VMDebug.cc
+++ b/runtime/native/dalvik_system_VMDebug.cc
@@ -272,6 +272,7 @@ static void VMDebug_getHeapSpaceStats(JNIEnv* env, jclass, jlongArray data) {
allocSize += malloc_space->GetFootprint();
allocUsed += malloc_space->GetBytesAllocated();
} else if (space->IsBumpPointerSpace()) {
+ ScopedObjectAccess soa(env);
gc::space::BumpPointerSpace* bump_pointer_space = space->AsBumpPointerSpace();
allocSize += bump_pointer_space->Size();
allocUsed += bump_pointer_space->GetBytesAllocated();
diff --git a/runtime/root_visitor.h b/runtime/root_visitor.h
index d52f351..78c30ff 100644
--- a/runtime/root_visitor.h
+++ b/runtime/root_visitor.h
@@ -17,6 +17,9 @@
#ifndef ART_RUNTIME_ROOT_VISITOR_H_
#define ART_RUNTIME_ROOT_VISITOR_H_
+// For size_t.
+#include <stdlib.h>
+
namespace art {
namespace mirror {
class Object;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index e1b4d7e..ff7b8f5 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -355,7 +355,7 @@ Runtime::ParsedOptions* Runtime::ParsedOptions::Create(const Options& options, b
parsed->heap_min_free_ = gc::Heap::kDefaultMinFree;
parsed->heap_max_free_ = gc::Heap::kDefaultMaxFree;
parsed->heap_target_utilization_ = gc::Heap::kDefaultTargetUtilization;
- parsed->heap_growth_limit_ = 0; // 0 means no growth limit.
+ parsed->heap_growth_limit_ = 0; // 0 means no growth limit .
// Default to number of processors minus one since the main GC thread also does work.
parsed->parallel_gc_threads_ = sysconf(_SC_NPROCESSORS_CONF) - 1;
// Only the main GC thread, no workers.
@@ -365,6 +365,7 @@ Runtime::ParsedOptions* Runtime::ParsedOptions::Create(const Options& options, b
parsed->stack_size_ = 0; // 0 means default.
parsed->max_spins_before_thin_lock_inflation_ = Monitor::kDefaultMaxSpinsBeforeThinLockInflation;
parsed->low_memory_mode_ = false;
+ parsed->use_tlab_ = false;
parsed->is_compiler_ = false;
parsed->is_zygote_ = false;
@@ -540,6 +541,8 @@ Runtime::ParsedOptions* Runtime::ParsedOptions::Create(const Options& options, b
parsed->ignore_max_footprint_ = true;
} else if (option == "-XX:LowMemoryMode") {
parsed->low_memory_mode_ = true;
+ } else if (option == "-XX:UseTLAB") {
+ parsed->use_tlab_ = true;
} else if (StartsWith(option, "-D")) {
parsed->properties_.push_back(option.substr(strlen("-D")));
} else if (StartsWith(option, "-Xjnitrace:")) {
@@ -925,7 +928,8 @@ bool Runtime::Init(const Options& raw_options, bool ignore_unrecognized) {
options->low_memory_mode_,
options->long_pause_log_threshold_,
options->long_gc_log_threshold_,
- options->ignore_max_footprint_);
+ options->ignore_max_footprint_,
+ options->use_tlab_);
dump_gc_performance_on_shutdown_ = options->dump_gc_performance_on_shutdown_;
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 01a605a..ce64510 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -104,6 +104,7 @@ class Runtime {
bool is_zygote_;
bool interpreter_only_;
bool is_explicit_gc_disabled_;
+ bool use_tlab_;
size_t long_pause_log_threshold_;
size_t long_gc_log_threshold_;
bool dump_gc_performance_on_shutdown_;
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index e47fd37..6f3c117 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -154,6 +154,18 @@ inline void Thread::VerifyStack() {
}
}
+inline size_t Thread::TLABSize() const {
+ return thread_local_end_ - thread_local_pos_;
+}
+
+inline mirror::Object* Thread::AllocTLAB(size_t bytes) {
+ DCHECK_GE(TLABSize(), bytes);
+ ++thread_local_objects_;
+ mirror::Object* ret = reinterpret_cast<mirror::Object*>(thread_local_pos_);
+ thread_local_pos_ += bytes;
+ return ret;
+}
+
} // namespace art
#endif // ART_RUNTIME_THREAD_INL_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 2861213..bc252de 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -928,7 +928,11 @@ Thread::Thread(bool daemon)
no_thread_suspension_(0),
last_no_thread_suspension_cause_(NULL),
checkpoint_function_(0),
- thread_exit_check_count_(0) {
+ thread_exit_check_count_(0),
+ thread_local_start_(nullptr),
+ thread_local_pos_(nullptr),
+ thread_local_end_(nullptr),
+ thread_local_objects_(0) {
CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
state_and_flags_.as_struct.flags = 0;
state_and_flags_.as_struct.state = kNative;
@@ -2179,6 +2183,14 @@ void Thread::SetStackEndForStackOverflow() {
stack_end_ = stack_begin_;
}
+void Thread::SetTLAB(byte* start, byte* end) {
+ DCHECK_LE(start, end);
+ thread_local_start_ = start;
+ thread_local_pos_ = thread_local_start_;
+ thread_local_end_ = end;
+ thread_local_objects_ = 0;
+}
+
std::ostream& operator<<(std::ostream& os, const Thread& thread) {
thread.ShortDump(os);
return os;
diff --git a/runtime/thread.h b/runtime/thread.h
index 44b2186..b01ec94 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -577,10 +577,8 @@ class PACKED(4) Thread {
~Thread() LOCKS_EXCLUDED(Locks::mutator_lock_,
Locks::thread_suspend_count_lock_);
void Destroy();
- friend class ThreadList; // For ~Thread and Destroy.
void CreatePeer(const char* name, bool as_daemon, jobject thread_group);
- friend class Runtime; // For CreatePeer.
// Avoid use, callers should use SetState. Used only by SignalCatcher::HandleSigQuit, ~Thread and
// Dbg::Disconnected.
@@ -589,8 +587,6 @@ class PACKED(4) Thread {
state_and_flags_.as_struct.state = new_state;
return old_state;
}
- friend class SignalCatcher; // For SetStateUnsafe.
- friend class Dbg; // F or SetStateUnsafe.
void VerifyStackImpl() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -731,9 +727,6 @@ class PACKED(4) Thread {
// If we're blocked in MonitorEnter, this is the object we're trying to lock.
mirror::Object* monitor_enter_object_;
- friend class Monitor;
- friend class MonitorInfo;
-
// Top of linked list of stack indirect reference tables or NULL for none
StackIndirectReferenceTable* top_sirt_;
@@ -799,13 +792,20 @@ class PACKED(4) Thread {
PortableEntryPoints portable_entrypoints_;
QuickEntryPoints quick_entrypoints_;
- private:
// How many times has our pthread key's destructor been called?
uint32_t thread_exit_check_count_;
- friend class ScopedThreadStateChange;
+ // Thread-local allocation pointer.
+ byte* thread_local_start_;
+ byte* thread_local_pos_;
+ byte* thread_local_end_;
+ size_t thread_local_objects_;
+ // Returns the remaining space in the TLAB.
+ size_t TLABSize() const;
+ // Doesn't check that there is room.
+ mirror::Object* AllocTLAB(size_t bytes);
+ void SetTLAB(byte* start, byte* end);
- public:
// Thread-local rosalloc runs. There are 34 size brackets in rosalloc
// runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
// RosAlloc class due to a header file circular dependency issue.
@@ -814,6 +814,15 @@ class PACKED(4) Thread {
static const size_t kRosAllocNumOfSizeBrackets = 34;
void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
+ private:
+ friend class Dbg; // For SetStateUnsafe.
+ friend class Monitor;
+ friend class MonitorInfo;
+ friend class Runtime; // For CreatePeer.
+ friend class ScopedThreadStateChange;
+ friend class SignalCatcher; // For SetStateUnsafe.
+ friend class ThreadList; // For ~Thread and Destroy.
+
DISALLOW_COPY_AND_ASSIGN(Thread);
};