31 files changed, 378 insertions, 407 deletions
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index bd74ada..5aa264c 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -66,8 +66,16 @@ ART_GTEST_elf_writer_test_TARGET_DEPS := $(TARGET_CORE_IMAGE_default_no-pic_64)
 ART_GTEST_proxy_test_HOST_DEPS := $(HOST_CORE_IMAGE_default_no-pic_64) $(HOST_CORE_IMAGE_default_no-pic_32)
 
 # The imgdiag test has dependencies on core.oat since it needs to load it during the test.
-ART_GTEST_imgdiag_test_HOST_DEPS := $(HOST_CORE_IMAGE_default_no-pic_64) $(HOST_CORE_IMAGE_default_no-pic_32)
-ART_GTEST_imgdiag_test_TARGET_DEPS := $(TARGET_CORE_IMAGE_default_no-pic_64) $(TARGET_CORE_IMAGE_default_no-pic_32)
+# For the host, also depend on the installed tool (the base size alone should suffice). For the
+# target, just the module is fine; the sync will happen late enough.
+ART_GTEST_imgdiag_test_HOST_DEPS := \
+  $(HOST_CORE_IMAGE_default_no-pic_64) \
+  $(HOST_CORE_IMAGE_default_no-pic_32) \
+  $(HOST_OUT_EXECUTABLES)/imgdiagd
+ART_GTEST_imgdiag_test_TARGET_DEPS := \
+  $(TARGET_CORE_IMAGE_default_no-pic_64) \
+  $(TARGET_CORE_IMAGE_default_no-pic_32) \
+  imgdiagd
 
 # The path for which all the source files are relative, not actually the current directory.
 LOCAL_PATH := art
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 99b2166..0713b7a 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -23,6 +23,7 @@
 #include "mirror/art_method.h"
 #include "mirror/object_array-inl.h"
 #include "entrypoints/quick/quick_entrypoints.h"
+#include "utils.h"
 
 namespace art {
 
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index fe1d126..03e0e92 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -23,6 +23,7 @@
 #include "dex/reg_storage_eq.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array-inl.h"
+#include "utils.h"
 
 namespace art {
 
@@ -567,21 +568,29 @@ bool ArmMir2Lir::SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div
 
 // Try to convert *lit to 1 RegRegRegShift/RegRegShift form.
 bool ArmMir2Lir::GetEasyMultiplyOp(int lit, ArmMir2Lir::EasyMultiplyOp* op) {
+  if (lit == 0) {
+    // Special case for *divide-by-zero*. The ops won't actually be used to generate code, as
+    // GenArithOpIntLit will directly generate exception-throwing code, and multiply-by-zero will
+    // have been optimized away earlier.
+    op->op = kOpInvalid;
+    return true;
+  }
+
   if (IsPowerOfTwo(lit)) {
     op->op = kOpLsl;
-    op->shift = LowestSetBit(lit);
+    op->shift = CTZ(lit);
     return true;
   }
 
   if (IsPowerOfTwo(lit - 1)) {
     op->op = kOpAdd;
-    op->shift = LowestSetBit(lit - 1);
+    op->shift = CTZ(lit - 1);
     return true;
   }
 
   if (IsPowerOfTwo(lit + 1)) {
     op->op = kOpRsub;
-    op->shift = LowestSetBit(lit + 1);
+    op->shift = CTZ(lit + 1);
     return true;
   }
 
@@ -599,7 +608,7 @@ bool ArmMir2Lir::GetEasyMultiplyTwoOps(int lit, EasyMultiplyOp* ops) {
   }
 
   int lit1 = lit;
-  uint32_t shift = LowestSetBit(lit1);
+  uint32_t shift = CTZ(lit1);
   if (GetEasyMultiplyOp(lit1 >> shift, &ops[0])) {
     ops[1].op = kOpLsl;
     ops[1].shift = shift;
@@ -607,7 +616,7 @@ bool ArmMir2Lir::GetEasyMultiplyTwoOps(int lit, EasyMultiplyOp* ops) {
   }
 
   lit1 = lit - 1;
-  shift = LowestSetBit(lit1);
+  shift = CTZ(lit1);
   if (GetEasyMultiplyOp(lit1 >> shift, &ops[0])) {
     ops[1].op = kOpAdd;
     ops[1].shift = shift;
@@ -615,7 +624,7 @@ bool ArmMir2Lir::GetEasyMultiplyTwoOps(int lit, EasyMultiplyOp* ops) {
   }
 
   lit1 = lit + 1;
-  shift = LowestSetBit(lit1);
+  shift = CTZ(lit1);
   if (GetEasyMultiplyOp(lit1 >> shift, &ops[0])) {
     ops[1].op = kOpRsub;
     ops[1].shift = shift;
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 5ac2aa0..88ab6f8 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -543,7 +543,7 @@ bool Arm64Mir2Lir::HandleEasyDivRem64(Instruction::Code dalvik_opcode, bool is_d
       return SmallLiteralDivRem(dalvik_opcode, is_div, rl_src, rl_dest, static_cast<int32_t>(lit));
     }
   }
-  int k = LowestSetBit(lit);
+  int k = CTZ(lit);
   if (k >= nbits - 2) {
     // Avoid special cases.
     return false;
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 67ea897..ae9b0f4 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1175,24 +1175,6 @@ void Mir2Lir::InsertLIRAfter(LIR* current_lir, LIR* new_lir) {
   new_lir->next->prev = new_lir;
 }
 
-bool Mir2Lir::IsPowerOfTwo(uint64_t x) {
-  return (x & (x - 1)) == 0;
-}
-
-// Returns the index of the lowest set bit in 'x'.
-int32_t Mir2Lir::LowestSetBit(uint64_t x) {
-  int bit_posn = 0;
-  while ((x & 0xf) == 0) {
-    bit_posn += 4;
-    x >>= 4;
-  }
-  while ((x & 1) == 0) {
-    bit_posn++;
-    x >>= 1;
-  }
-  return bit_posn;
-}
-
 bool Mir2Lir::PartiallyIntersects(RegLocation rl_src, RegLocation rl_dest) {
   DCHECK(rl_src.wide);
   DCHECK(rl_dest.wide);
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index e8adffb..3733507 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -13,6 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+#include <functional>
+
 #include "arch/arm/instruction_set_features_arm.h"
 #include "dex/compiler_ir.h"
 #include "dex/compiler_internals.h"
@@ -23,8 +26,8 @@
 #include "mirror/object_array-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_reference.h"
+#include "utils.h"
 #include "verifier/method_verifier.h"
-#include <functional>
 
 namespace art {
 
@@ -1733,7 +1736,7 @@ bool Mir2Lir::HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
   if ((cu_->instruction_set == kThumb2) && !IsPowerOfTwo(lit)) {
     return SmallLiteralDivRem(dalvik_opcode, is_div, rl_src, rl_dest, lit);
   }
-  int k = LowestSetBit(lit);
+  int k = CTZ(lit);
   if (k >= 30) {
     // Avoid special cases.
     return false;
@@ -1813,18 +1816,18 @@ bool Mir2Lir::HandleEasyMultiply(RegLocation rl_src, RegLocation rl_dest, int li
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (power_of_two) {
     // Shift.
-    OpRegRegImm(kOpLsl, rl_result.reg, rl_src.reg, LowestSetBit(lit));
+    OpRegRegImm(kOpLsl, rl_result.reg, rl_src.reg, CTZ(lit));
   } else if (pop_count_le2) {
     // Shift and add and shift.
-    int first_bit = LowestSetBit(lit);
-    int second_bit = LowestSetBit(lit ^ (1 << first_bit));
+    int first_bit = CTZ(lit);
+    int second_bit = CTZ(lit ^ (1 << first_bit));
     GenMultiplyByTwoBitMultiplier(rl_src, rl_result, lit, first_bit, second_bit);
   } else {
     // Reverse subtract: (src << (shift + 1)) - src.
     DCHECK(power_of_two_minus_one);
-    // TUNING: rsb dst, src, src lsl#LowestSetBit(lit + 1)
+    // TUNING: rsb dst, src, src lsl#CTZ(lit + 1)
     RegStorage t_reg = AllocTemp();
-    OpRegRegImm(kOpLsl, t_reg, rl_src.reg, LowestSetBit(lit + 1));
+    OpRegRegImm(kOpLsl, t_reg, rl_src.reg, CTZ(lit + 1));
     OpRegRegReg(kOpSub, rl_result.reg, t_reg, rl_src.reg);
   }
   StoreValue(rl_dest, rl_result);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index f102881..5f8a71c 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -1482,18 +1482,6 @@ class Mir2Lir : public Backend {
       return cu_;
     }
     /*
-     * @brief Returns the index of the lowest set bit in 'x'.
-     * @param x Value to be examined.
-     * @returns The bit number of the lowest bit set in the value.
-     */
-    int32_t LowestSetBit(uint64_t x);
-    /*
-     * @brief Is this value a power of two?
-     * @param x Value to be examined.
-     * @returns 'true' if only 1 bit is set in the value.
-     */
-    bool IsPowerOfTwo(uint64_t x);
-    /*
      * @brief Do these SRs overlap?
      * @param rl_op1 One RegLocation
      * @param rl_op2 The other RegLocation
diff --git a/compiler/dex/quick/resource_mask.cc b/compiler/dex/quick/resource_mask.cc
index 088bec8..ca68f95 100644
--- a/compiler/dex/quick/resource_mask.cc
+++ b/compiler/dex/quick/resource_mask.cc
@@ -19,6 +19,7 @@
 #include "resource_mask.h"
 
 #include "utils/arena_allocator.h"
+#include "utils.h"
 
 namespace art {
 
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index a79f299..ba9c611 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -21,6 +21,7 @@
 #include "dex/reg_storage_eq.h"
 #include "mirror/art_method.h"
 #include "mirror/array-inl.h"
+#include "utils.h"
 #include "x86_lir.h"
 
 namespace art {
@@ -656,7 +657,7 @@ RegLocation X86Mir2Lir::GenDivRemLit(RegLocation rl_dest, RegLocation rl_src,
     NewLIR3(kX86Lea32RM, rl_result.reg.GetReg(), rl_src.reg.GetReg(), std::abs(imm) - 1);
     NewLIR2(kX86Test32RR, rl_src.reg.GetReg(), rl_src.reg.GetReg());
     OpCondRegReg(kOpCmov, kCondPl, rl_result.reg, rl_src.reg);
-    int shift_amount = LowestSetBit(imm);
+    int shift_amount = CTZ(imm);
     OpRegImm(kOpAsr, rl_result.reg, shift_amount);
     if (imm < 0) {
       OpReg(kOpNeg, rl_result.reg);
@@ -1627,7 +1628,7 @@ bool X86Mir2Lir::GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64
     GenArithOpLong(Instruction::ADD_LONG, rl_dest, rl_src1, rl_src1, flags);
     return true;
   } else if (IsPowerOfTwo(val)) {
-    int shift_amount = LowestSetBit(val);
+    int shift_amount = CTZ(val);
     if (!PartiallyIntersects(rl_src1, rl_dest)) {
       rl_src1 = LoadValueWide(rl_src1, kCoreReg);
       RegLocation rl_result = GenShiftImmOpLong(Instruction::SHL_LONG, rl_dest, rl_src1,
@@ -2070,7 +2071,7 @@ void X86Mir2Lir::GenDivRemLongLit(RegLocation rl_dest, RegLocation rl_src,
     OpRegReg(kOpAdd, rl_result.reg, rl_src.reg);
     NewLIR2(kX86Test64RR, rl_src.reg.GetReg(), rl_src.reg.GetReg());
     OpCondRegReg(kOpCmov, kCondPl, rl_result.reg, rl_src.reg);
-    int shift_amount = LowestSetBit(imm);
+    int shift_amount = CTZ(imm);
     OpRegImm(kOpAsr, rl_result.reg, shift_amount);
     if (imm < 0) {
       OpReg(kOpNeg, rl_result.reg);
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 91426f3..4d8154e 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -327,11 +327,13 @@ bool CodeGenerator::GoesToNextBlock(HBasicBlock* current, HBasicBlock* next) con
 
 CodeGenerator* CodeGenerator::Create(ArenaAllocator* allocator,
                                      HGraph* graph,
-                                     InstructionSet instruction_set) {
+                                     InstructionSet instruction_set,
+                                     const InstructionSetFeatures& isa_features) {
   switch (instruction_set) {
     case kArm:
     case kThumb2: {
-      return new (allocator) arm::CodeGeneratorARM(graph);
+      return new (allocator) arm::CodeGeneratorARM(graph,
+          isa_features.AsArmInstructionSetFeatures());
     }
     case kArm64: {
       return new (allocator) arm64::CodeGeneratorARM64(graph);
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 2e7eca2..4205ebe 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_H_
 
 #include "arch/instruction_set.h"
+#include "arch/instruction_set_features.h"
 #include "base/bit_field.h"
 #include "globals.h"
 #include "locations.h"
@@ -84,7 +85,8 @@ class CodeGenerator : public ArenaObject<kArenaAllocMisc> {
   void CompileOptimized(CodeAllocator* allocator);
   static CodeGenerator* Create(ArenaAllocator* allocator,
                                HGraph* graph,
-                               InstructionSet instruction_set);
+                               InstructionSet instruction_set,
+                               const InstructionSetFeatures& isa_features);
 
   HGraph* GetGraph() const { return graph_; }
 
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 8c107f3..3b3fb64 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_arm.h"
 
+#include "arch/arm/instruction_set_features_arm.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "gc/accounting/card_table.h"
 #include "mirror/array-inl.h"
@@ -372,13 +373,15 @@ size_t CodeGeneratorARM::RestoreCoreRegister(size_t stack_index, uint32_t reg_id
   return kArmWordSize;
 }
 
-CodeGeneratorARM::CodeGeneratorARM(HGraph* graph)
+CodeGeneratorARM::CodeGeneratorARM(HGraph* graph,
+                                   const ArmInstructionSetFeatures* isa_features)
     : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters, kNumberOfRegisterPairs),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
-      assembler_(true) {}
+      assembler_(true),
+      isa_features_(isa_features) {}
 
 size_t CodeGeneratorARM::FrameEntrySpillSize() const {
   return kNumberOfPushedRegistersAtEntry * kArmWordSize;
@@ -2615,16 +2618,18 @@ void LocationsBuilderARM::HandleFieldSet(HInstruction* instruction, const FieldI
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
 
-  bool is_volatile = field_info.IsVolatile();
+
   Primitive::Type field_type = field_info.GetFieldType();
   bool is_wide = field_type == Primitive::kPrimLong || field_type == Primitive::kPrimDouble;
-
+  bool generate_volatile = field_info.IsVolatile()
+      && is_wide
+      && !codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
   // Temporary registers for the write barrier.
   // TODO: consider renaming StoreNeedsWriteBarrier to StoreNeedsGCMark.
   if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
     locations->AddTemp(Location::RequiresRegister());
     locations->AddTemp(Location::RequiresRegister());
-  } else if (is_volatile && is_wide) {
+  } else if (generate_volatile) {
     // Arm encoding have some additional constraints for ldrexd/strexd:
     // - registers need to be consecutive
     // - the first register should be even but not R14.
@@ -2651,6 +2656,7 @@ void InstructionCodeGeneratorARM::HandleFieldSet(HInstruction* instruction,
   Location value = locations->InAt(1);
 
   bool is_volatile = field_info.IsVolatile();
+  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
   Primitive::Type field_type = field_info.GetFieldType();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
@@ -2684,10 +2690,7 @@ void InstructionCodeGeneratorARM::HandleFieldSet(HInstruction* instruction,
     }
 
     case Primitive::kPrimLong: {
-      if (is_volatile) {
-        // TODO: We could use ldrd and strd that are atomic with Large Physical Address Extension
-        // support. This info is stored in the compiler driver (HasAtomicLdrdAndStrd) and we should
-        // pass it around to be able to optimize.
+      if (is_volatile && !atomic_ldrd_strd) {
         GenerateWideAtomicStore(base, offset,
                                 value.AsRegisterPairLow<Register>(),
                                 value.AsRegisterPairHigh<Register>(),
@@ -2706,7 +2709,7 @@ void InstructionCodeGeneratorARM::HandleFieldSet(HInstruction* instruction,
 
     case Primitive::kPrimDouble: {
       DRegister value_reg = FromLowSToD(value.AsFpuRegisterPairLow<SRegister>());
-      if (is_volatile) {
+      if (is_volatile && !atomic_ldrd_strd) {
         Register value_reg_lo = locations->GetTemp(0).AsRegister<Register>();
         Register value_reg_hi = locations->GetTemp(1).AsRegister<Register>();
 
@@ -2740,7 +2743,10 @@ void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldI
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 
-  if (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimDouble)) {
+  bool generate_volatile = field_info.IsVolatile()
+      && (field_info.GetFieldType() == Primitive::kPrimDouble)
+      && !codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
+  if (generate_volatile) {
     // Arm encoding have some additional constraints for ldrexd/strexd:
     // - registers need to be consecutive
     // - the first register should be even but not R14.
@@ -2760,6 +2766,7 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction,
   Register base = locations->InAt(0).AsRegister<Register>();
   Location out = locations->Out();
   bool is_volatile = field_info.IsVolatile();
+  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
   Primitive::Type field_type = field_info.GetFieldType();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
@@ -2791,7 +2798,7 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction,
     }
 
     case Primitive::kPrimLong: {
-      if (is_volatile) {
+      if (is_volatile && !atomic_ldrd_strd) {
         GenerateWideAtomicLoad(base, offset,
                                out.AsRegisterPairLow<Register>(),
                                out.AsRegisterPairHigh<Register>());
@@ -2808,7 +2815,7 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction,
 
     case Primitive::kPrimDouble: {
       DRegister out_reg = FromLowSToD(out.AsFpuRegisterPairLow<SRegister>());
-      if (is_volatile) {
+      if (is_volatile && !atomic_ldrd_strd) {
         Register lo = locations->GetTemp(0).AsRegister<Register>();
         Register hi = locations->GetTemp(1).AsRegister<Register>();
         GenerateWideAtomicLoad(base, offset, lo, hi);
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index b86670d..40f4edc 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -159,7 +159,7 @@ class InstructionCodeGeneratorARM : public HGraphVisitor {
 
 class CodeGeneratorARM : public CodeGenerator {
  public:
-  explicit CodeGeneratorARM(HGraph* graph);
+  CodeGeneratorARM(HGraph* graph, const ArmInstructionSetFeatures* isa_features);
   virtual ~CodeGeneratorARM() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -233,6 +233,10 @@ class CodeGeneratorARM : public CodeGenerator {
     block_labels_.SetSize(GetGraph()->GetBlocks().Size());
   }
 
+  const ArmInstructionSetFeatures* GetInstructionSetFeatures() const {
+    return isa_features_;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
@@ -240,6 +244,7 @@ class CodeGeneratorARM : public CodeGenerator {
   InstructionCodeGeneratorARM instruction_visitor_;
   ParallelMoveResolverARM move_resolver_;
   Thumb2Assembler assembler_;
+  const ArmInstructionSetFeatures* isa_features_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM);
 };
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 8b75cc7..18722f7 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -17,6 +17,7 @@
 #include <functional>
 
 #include "arch/instruction_set.h"
+#include "arch/arm/instruction_set_features_arm.h"
 #include "base/macros.h"
 #include "builder.h"
 #include "code_generator_arm.h"
@@ -87,7 +88,9 @@ static void RunCodeBaseline(HGraph* graph, bool has_result, Expected expected) {
     Run(allocator, codegenX86, has_result, expected);
   }
 
-  arm::CodeGeneratorARM codegenARM(graph);
+  std::unique_ptr<const ArmInstructionSetFeatures> features(
+      ArmInstructionSetFeatures::FromCppDefines());
+  arm::CodeGeneratorARM codegenARM(graph, features.get());
   codegenARM.CompileBaseline(&allocator, true);
   if (kRuntimeISA == kArm || kRuntimeISA == kThumb2) {
     Run(allocator, codegenARM, has_result, expected);
@@ -130,7 +133,11 @@ static void RunCodeOptimized(HGraph* graph,
                              bool has_result,
                              Expected expected) {
   if (kRuntimeISA == kArm || kRuntimeISA == kThumb2) {
-    arm::CodeGeneratorARM codegenARM(graph);
+    // The code generator only stores the features pointer, so own the object returned by
+    // FromCppDefines() here (as RunCodeBaseline does) instead of leaking it.
+    std::unique_ptr<const ArmInstructionSetFeatures> features(
+        ArmInstructionSetFeatures::FromCppDefines());
+    arm::CodeGeneratorARM codegenARM(graph, features.get());
     RunCodeOptimized(&codegenARM, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kArm64) {
     arm64::CodeGeneratorARM64 codegenARM64(graph);
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 87f2b90..1a27724 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -289,7 +289,9 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
     return nullptr;
   }
 
-  CodeGenerator* codegen = CodeGenerator::Create(&arena, graph, instruction_set);
+  CompilerDriver* compiler_driver = GetCompilerDriver();
+  CodeGenerator* codegen = CodeGenerator::Create(&arena, graph, instruction_set,
+      *compiler_driver->GetInstructionSetFeatures());
   if (codegen == nullptr) {
     CHECK(!shouldCompile) << "Could not find code generator for optimizing compiler";
     compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledNoCodegen);
@@ -315,7 +317,7 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
       return nullptr;
     }
     RunOptimizations(
-        graph, GetCompilerDriver(), &compilation_stats_, dex_compilation_unit, visualizer);
+        graph, compiler_driver, &compilation_stats_, dex_compilation_unit, visualizer);
 
     PrepareForRegisterAllocation(graph).Run();
     SsaLivenessAnalysis liveness(*graph, codegen);
@@ -333,7 +335,7 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
 
     compilation_stats_.RecordStat(MethodCompilationStat::kCompiledOptimized);
     return CompiledMethod::SwapAllocCompiledMethodStackMap(
-        GetCompilerDriver(),
+        compiler_driver,
         instruction_set,
         ArrayRef<const uint8_t>(allocator.GetMemory()),
         codegen->GetFrameSize(),
@@ -358,16 +360,15 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
 
     std::vector<uint8_t> mapping_table;
     DefaultSrcMap src_mapping_table;
-    codegen->BuildMappingTable(&mapping_table,
-            GetCompilerDriver()->GetCompilerOptions().GetIncludeDebugSymbols() ?
-                 &src_mapping_table : nullptr);
+    bool include_debug_symbol = compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols();
+    codegen->BuildMappingTable(&mapping_table, include_debug_symbol ? &src_mapping_table : nullptr);
     std::vector<uint8_t> vmap_table;
     codegen->BuildVMapTable(&vmap_table);
     std::vector<uint8_t> gc_map;
     codegen->BuildNativeGCMap(&gc_map, dex_compilation_unit);
 
     compilation_stats_.RecordStat(MethodCompilationStat::kCompiledBaseline);
-    return CompiledMethod::SwapAllocCompiledMethod(GetCompilerDriver(),
+    return CompiledMethod::SwapAllocCompiledMethod(compiler_driver,
                                                    instruction_set,
                                                    ArrayRef<const uint8_t>(allocator.GetMemory()),
                                                    codegen->GetFrameSize(),
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index f8c0043..83584a2 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1372,38 +1372,6 @@ void X86Assembler::LoadDoubleConstant(XmmRegister dst, double value) {
 }
 
 
-void X86Assembler::FloatNegate(XmmRegister f) {
-  static const struct {
-    uint32_t a;
-    uint32_t b;
-    uint32_t c;
-    uint32_t d;
-  } float_negate_constant __attribute__((aligned(16))) =
-      { 0x80000000, 0x00000000, 0x80000000, 0x00000000 };
-  xorps(f, Address::Absolute(reinterpret_cast<uintptr_t>(&float_negate_constant)));
-}
-
-
-void X86Assembler::DoubleNegate(XmmRegister d) {
-  static const struct {
-    uint64_t a;
-    uint64_t b;
-  } double_negate_constant __attribute__((aligned(16))) =
-      {0x8000000000000000LL, 0x8000000000000000LL};
-  xorpd(d, Address::Absolute(reinterpret_cast<uintptr_t>(&double_negate_constant)));
-}
-
-
-void X86Assembler::DoubleAbs(XmmRegister reg) {
-  static const struct {
-    uint64_t a;
-    uint64_t b;
-  } double_abs_constant __attribute__((aligned(16))) =
-      {0x7FFFFFFFFFFFFFFFLL, 0x7FFFFFFFFFFFFFFFLL};
-  andpd(reg, Address::Absolute(reinterpret_cast<uintptr_t>(&double_abs_constant)));
-}
-
-
 void X86Assembler::Align(int alignment, int offset) {
   CHECK(IsPowerOfTwo(alignment));
   // Emit nop instruction until the real position is aligned.
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 6c3d131..ad07067 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -447,11 +447,6 @@ class X86Assembler FINAL : public Assembler {
   void LoadLongConstant(XmmRegister dst, int64_t value);
   void LoadDoubleConstant(XmmRegister dst, double value);
 
-  void DoubleNegate(XmmRegister d);
-  void FloatNegate(XmmRegister f);
-
-  void DoubleAbs(XmmRegister reg);
-
   void LockCmpxchgl(const Address& address, Register reg) {
     lock()->cmpxchgl(address, reg);
   }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 2a6c58e..d843a72 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1768,38 +1768,6 @@ void X86_64Assembler::LoadDoubleConstant(XmmRegister dst, double value) {
 }
 
 
-void X86_64Assembler::FloatNegate(XmmRegister f) {
-  static const struct {
-    uint32_t a;
-    uint32_t b;
-    uint32_t c;
-    uint32_t d;
-  } float_negate_constant __attribute__((aligned(16))) =
-      { 0x80000000, 0x00000000, 0x80000000, 0x00000000 };
-  xorps(f, Address::Absolute(reinterpret_cast<uintptr_t>(&float_negate_constant)));
-}
-
-
-void X86_64Assembler::DoubleNegate(XmmRegister d) {
-  static const struct {
-    uint64_t a;
-    uint64_t b;
-  } double_negate_constant __attribute__((aligned(16))) =
-      {0x8000000000000000LL, 0x8000000000000000LL};
-  xorpd(d, Address::Absolute(reinterpret_cast<uintptr_t>(&double_negate_constant)));
-}
-
-
-void X86_64Assembler::DoubleAbs(XmmRegister reg) {
-  static const struct {
-    uint64_t a;
-    uint64_t b;
-  } double_abs_constant __attribute__((aligned(16))) =
-      {0x7FFFFFFFFFFFFFFFLL, 0x7FFFFFFFFFFFFFFFLL};
-  andpd(reg, Address::Absolute(reinterpret_cast<uintptr_t>(&double_abs_constant)));
-}
-
-
 void X86_64Assembler::Align(int alignment, int offset) {
   CHECK(IsPowerOfTwo(alignment));
   // Emit nop instruction until the real position is aligned.
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index abf2561..ac8bc9a 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -512,11 +512,6 @@ class X86_64Assembler FINAL : public Assembler {
 
   void LoadDoubleConstant(XmmRegister dst, double value);
 
-  void DoubleNegate(XmmRegister d);
-  void FloatNegate(XmmRegister f);
-
-  void DoubleAbs(XmmRegister reg);
-
   void LockCmpxchgl(const Address& address, CpuRegister reg) {
     lock()->cmpxchgl(address, reg);
   }
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 52fd736..31e653b 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -21,6 +21,7 @@
 #include <ostream>
 #include <sstream>
 
+#include "arch/arm/registers_arm.h"
 #include "base/logging.h"
 #include "base/stringprintf.h"
 #include "thread.h"
@@ -148,15 +149,15 @@ struct ThumbRegister : ArmRegister {
   ThumbRegister(uint16_t instruction, uint16_t at_bit) : ArmRegister((instruction >> at_bit) & 0x7) {}
 };
 
-struct Rm {
-  explicit Rm(uint32_t instruction) : shift((instruction >> 4) & 0xff), rm(instruction & 0xf) {}
-  uint32_t shift;
+struct RmLslImm2 {
+  explicit RmLslImm2(uint32_t instr) : imm2((instr >> 4) & 0x3), rm(instr & 0xf) {}
+  uint32_t imm2;
   ArmRegister rm;
 };
-std::ostream& operator<<(std::ostream& os, const Rm& r) {
+std::ostream& operator<<(std::ostream& os, const RmLslImm2& r) {
   os << r.rm;
-  if (r.shift != 0) {
-    os << "-shift-" << r.shift;  // TODO
+  if (r.imm2 != 0) {
+    os << ", lsl #" << r.imm2;
   }
   return os;
 }
@@ -397,7 +398,81 @@ static uint64_t VFPExpand64(uint32_t imm8) {
   uint64_t bit_a = (imm8 >> 7) & 1;
   uint64_t bit_b = (imm8 >> 6) & 1;
   uint64_t slice = imm8 & 0x3f;
-  return (bit_a << 31) | ((UINT64_C(1) << 62) - (bit_b << 54)) | (slice << 48);
+  return (bit_a << 63) | ((UINT64_C(1) << 62) - (bit_b << 54)) | (slice << 48);
+}
+
+// Selects the width, signedness and radix DumpThumb2Literal uses to print a literal.
+enum T2LitType {
+  kT2LitInvalid,
+  kT2LitUByte,
+  kT2LitSByte,
+  kT2LitUHalf,
+  kT2LitSHalf,
+  kT2LitUWord,
+  kT2LitSWord,
+  kT2LitHexWord,
+  kT2LitULong,
+  kT2LitSLong,
+  kT2LitHexLong,
+};
+// Prints the enumerator's integer value; needed by the LOG(FATAL) in DumpThumb2Literal.
+std::ostream& operator<<(std::ostream& os, T2LitType type) {
+  return os << static_cast<int>(type);
+}
+
+// Appends "  ; <value>" to `args`. The value is read from the literal address, which is
+// RoundDown(instr_ptr + 4, 4) plus imm32 when U is non-zero and minus imm32 otherwise, and is
+// formatted according to `type`. Any type without a case below ends in LOG(FATAL).
+void DumpThumb2Literal(std::ostream& args, const uint8_t* instr_ptr, uint32_t U, uint32_t imm32,
+                       T2LitType type) {
+  // Literal offsets (imm32) are not required to be aligned so we may need unaligned access.
+  typedef const int16_t unaligned_int16_t __attribute__ ((aligned (1)));
+  typedef const uint16_t unaligned_uint16_t __attribute__ ((aligned (1)));
+  typedef const int32_t unaligned_int32_t __attribute__ ((aligned (1)));
+  typedef const uint32_t unaligned_uint32_t __attribute__ ((aligned (1)));
+  typedef const int64_t unaligned_int64_t __attribute__ ((aligned (1)));
+  typedef const uint64_t unaligned_uint64_t __attribute__ ((aligned (1)));
+
+  uintptr_t pc = RoundDown(reinterpret_cast<intptr_t>(instr_ptr) + 4, 4);
+  uintptr_t lit_adr = U ? pc + imm32 : pc - imm32;
+  args << "  ; ";
+  switch (type) {
+    case kT2LitUByte:
+      // Widen first: streaming a uint8_t would print it as a character, not a number.
+      args << static_cast<uint32_t>(*reinterpret_cast<const uint8_t*>(lit_adr));
+      break;
+    case kT2LitSByte:
+      // Widen first: streaming an int8_t would print it as a character, not a number.
+      args << static_cast<int32_t>(*reinterpret_cast<const int8_t*>(lit_adr));
+      break;
+    case kT2LitUHalf:
+      args << *reinterpret_cast<const unaligned_uint16_t*>(lit_adr);
+      break;
+    case kT2LitSHalf:
+      args << *reinterpret_cast<const unaligned_int16_t*>(lit_adr);
+      break;
+    case kT2LitUWord:
+      args << *reinterpret_cast<const unaligned_uint32_t*>(lit_adr);
+      break;
+    case kT2LitSWord:
+      args << *reinterpret_cast<const unaligned_int32_t*>(lit_adr);
+      break;
+    case kT2LitHexWord:
+      args << StringPrintf("0x%08x", *reinterpret_cast<const unaligned_uint32_t*>(lit_adr));
+      break;
+    case kT2LitULong:
+      args << *reinterpret_cast<const unaligned_uint64_t*>(lit_adr);
+      break;
+    case kT2LitSLong:
+      args << *reinterpret_cast<const unaligned_int64_t*>(lit_adr);
+      break;
+    case kT2LitHexLong:
+      args << StringPrintf("0x%" PRIx64, *reinterpret_cast<unaligned_uint64_t*>(lit_adr));
+      break;
+    default:
+      LOG(FATAL) << "Invalid type: " << type;
+      break;
+  }
 }
 
 size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr) {
@@ -756,10 +824,7 @@ size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr)
                 args << d << ", [" << Rn << ", #" << ((U == 1) ? "" : "-")
                      << (imm8 << 2) << "]";
                 if (Rn.r == 15 && U == 1) {
-                  intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
-                  lit_adr = RoundDown(lit_adr, 4) + 4 + (imm8 << 2);
-                  typedef const int64_t unaligned_int64_t __attribute__ ((aligned (2)));
-                  args << StringPrintf("  ; 0x%" PRIx64, *reinterpret_cast<unaligned_int64_t*>(lit_adr));
+                  DumpThumb2Literal(args, instr_ptr, U, imm8 << 2, kT2LitHexLong);
                 }
               } else if (Rn.r == 13 && W == 1 && U == L) {  // VPUSH/VPOP
                 opcode << (L == 1 ? "vpop" : "vpush");
@@ -1227,164 +1292,141 @@ size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr)
       break;
     case 3:
       switch (op2) {
-        case 0x00: case 0x02: case 0x04: case 0x06:  // 000xxx0
-        case 0x08: case 0x09: case 0x0A: case 0x0C: case 0x0E: {
-          // Store single data item
-          // |111|11|100|000|0|0000|1111|110000|000000|
-          // |5 3|21|098|765|4|3  0|5  2|10   6|5    0|
-          // |---|--|---|---|-|----|----|------|------|
-          // |332|22|222|222|2|1111|1111|110000|000000|
-          // |1 9|87|654|321|0|9  6|5  2|10   6|5    0|
-          // |---|--|---|---|-|----|----|------|------|
-          // |111|11|000|op3|0|    |    |  op4 |      |
-          uint32_t op3 = (instr >> 21) & 7;
-          // uint32_t op4 = (instr >> 6) & 0x3F;
-          switch (op3) {
-            case 0x0: case 0x4: {
-              // {ST,LD}RB Rt,[Rn,#+/-imm12]    - 111 11 00 0 1 00 0 nnnn tttt 1 PUWii ii iiii
-              // {ST,LD}RB Rt,[Rn,#+/-imm8]     - 111 11 00 0 0 00 0 nnnn tttt 1 PUWii ii iiii
-              // {ST,LD}RB Rt,[Rn,Rm,lsl #imm2] - 111 11 00 0 0 00 0 nnnn tttt 0 00000 ii mmmm
-              ArmRegister Rn(instr, 16);
-              ArmRegister Rt(instr, 12);
-              opcode << (HasBitSet(instr, 20) ? "ldrb" : "strb");
-              if (HasBitSet(instr, 23)) {
-                uint32_t imm12 = instr & 0xFFF;
-                args << Rt << ", [" << Rn << ",#" << imm12 << "]";
-              } else if ((instr & 0x800) != 0) {
-                uint32_t imm8 = instr & 0xFF;
-                args << Rt << ", [" << Rn << ",#" << imm8 << "]";
-              } else {
-                uint32_t imm2 = (instr >> 4) & 3;
-                ArmRegister Rm(instr, 0);
-                args << Rt << ", [" << Rn << ", " << Rm;
-                if (imm2 != 0) {
-                  args << ", " << "lsl #" << imm2;
-                }
-                args << "]";
-              }
-              break;
-            }
-            case 0x1: case 0x5: {
-              // STRH Rt,[Rn,#+/-imm12]    - 111 11 00 0 1 01 0 nnnn tttt 1 PUWii ii iiii
-              // STRH Rt,[Rn,#+/-imm8]     - 111 11 00 0 0 01 0 nnnn tttt 1 PUWii ii iiii
-              // STRH Rt,[Rn,Rm,lsl #imm2] - 111 11 00 0 0 01 0 nnnn tttt 0 00000 ii mmmm
-              ArmRegister Rn(instr, 16);
-              ArmRegister Rt(instr, 12);
-              opcode << "strh";
-              if (HasBitSet(instr, 23)) {
-                uint32_t imm12 = instr & 0xFFF;
-                args << Rt << ", [" << Rn << ",#" << imm12 << "]";
-              } else if ((instr & 0x800) != 0) {
-                uint32_t imm8 = instr & 0xFF;
-                args << Rt << ", [" << Rn << ",#" << imm8 << "]";
-              } else {
-                uint32_t imm2 = (instr >> 4) & 3;
-                ArmRegister Rm(instr, 0);
-                args << Rt << ", [" << Rn << ", " << Rm;
-                if (imm2 != 0) {
-                  args << ", " << "lsl #" << imm2;
-                }
-                args << "]";
-              }
-              break;
-            }
-            case 0x2: case 0x6: {
-              ArmRegister Rn(instr, 16);
-              ArmRegister Rt(instr, 12);
-              if (op3 == 2) {
-                if ((instr & 0x800) != 0) {
-                  // STR Rt, [Rn, #imm8] - 111 11 000 010 0 nnnn tttt 1PUWiiiiiiii
-                  uint32_t P = (instr >> 10) & 1;
-                  uint32_t U = (instr >> 9) & 1;
-                  uint32_t W = (instr >> 8) & 1;
-                  uint32_t imm8 = instr & 0xFF;
-                  int32_t imm32 = (imm8 << 24) >> 24;  // sign-extend imm8
-                  if (Rn.r == 13 && P == 1 && U == 0 && W == 1 && imm32 == 4) {
-                    opcode << "push";
-                    args << "{" << Rt << "}";
-                  } else if (Rn.r == 15 || (P == 0 && W == 0)) {
-                    opcode << "UNDEFINED";
-                  } else {
-                    if (P == 1 && U == 1 && W == 0) {
-                      opcode << "strt";
-                    } else {
-                      opcode << "str";
-                    }
-                    args << Rt << ", [" << Rn;
-                    if (P == 0 && W == 1) {
-                      args << "], #" << imm32;
-                    } else {
-                      args << ", #" << imm32 << "]";
-                      if (W == 1) {
-                        args << "!";
-                      }
-                    }
-                  }
-                } else {
-                  // STR Rt, [Rn, Rm, LSL #imm2] - 111 11 000 010 0 nnnn tttt 000000iimmmm
-                  ArmRegister Rm(instr, 0);
-                  uint32_t imm2 = (instr >> 4) & 3;
-                  opcode << "str.w";
-                  args << Rt << ", [" << Rn << ", " << Rm;
-                  if (imm2 != 0) {
-                    args << ", lsl #" << imm2;
-                  }
-                  args << "]";
-                }
-              } else if (op3 == 6) {
-                // STR.W Rt, [Rn, #imm12] - 111 11 000 110 0 nnnn tttt iiiiiiiiiiii
-                uint32_t imm12 = instr & 0xFFF;
-                opcode << "str.w";
-                args << Rt << ", [" << Rn << ", #" << imm12 << "]";
-              }
-              break;
-            }
-          }
-
+        case 0x07: case 0x0F: case 0x17: case 0x1F: {  // Explicitly UNDEFINED, A6.3.
+          opcode << "UNDEFINED";
+          break;
+        }
+        case 0x06: case 0x0E: {  // "Store single data item" undefined opcodes, A6.3.10.
+          opcode << "UNDEFINED [store]";
+          break;
+        }
+        case 0x15: case 0x1D: {  // "Load word" undefined opcodes, A6.3.7.
+          opcode << "UNDEFINED [load]";
           break;
         }
-        case 0x03: case 0x0B: case 0x11: case 0x13: case 0x19: case 0x1B: {  // 00xx011
-          // Load byte/halfword
-          // |111|11|10|0 0|00|0|0000|1111|110000|000000|
-          // |5 3|21|09|8 7|65|4|3  0|5  2|10   6|5    0|
-          // |---|--|--|---|--|-|----|----|------|------|
-          // |332|22|22|2 2|22|2|1111|1111|110000|000000|
-          // |1 9|87|65|4 3|21|0|9  6|5  2|10   6|5    0|
-          // |---|--|--|---|--|-|----|----|------|------|
-          // |111|11|00|op3|01|1| Rn | Rt | op4  |      |
-          // |111|11| op2       |    |    | imm12       |
-          uint32_t op3 = (instr >> 23) & 3;
+        case 0x10: case 0x12: case 0x14: case 0x16: case 0x18: case 0x1A: case 0x1C: case 0x1E: {
+          opcode << "UNKNOWN " << op2 << " [SIMD]";
+          break;
+        }
+        case 0x01: case 0x00: case 0x09: case 0x08:   // {LD,ST}RB{,T}
+        case 0x03: case 0x02: case 0x0B: case 0x0A:   // {LD,ST}RH{,T}
+        case 0x05: case 0x04: case 0x0D: case 0x0C:   // {LD,ST}R{,T}
+        case 0x11:            case 0x19:              // LDRSB{,T} (no signed store)
+        case 0x13:            case 0x1B: {            // LDRSH{,T} (no signed store)
+          // Load:
+          // (Store is the same except that l==0 and always s==0 below.)
+          //                       00s.whl (sign, word, half, load)
+          // LDR{S}B  imm12: 11111|00s1001| Rn | Rt |imm12             (0x09)
+          // LDR{S}B   imm8: 11111|00s0001| Rn | Rt |1PUW|imm8         (0x01)
+          // LDR{S}BT  imm8: 11111|00s0001| Rn | Rt |1110|imm8         (0x01)
+          // LDR{S}B    lit: 11111|00sU001|1111| Rt |imm12             (0x01/0x09)
+          // LDR{S}B    reg: 11111|00s0001| Rn | Rt |000000|imm2| Rm   (0x01)
+          // LDR{S}H  imm12: 11111|00s1011| Rn | Rt |imm12             (0x0B)
+          // LDR{S}H   imm8: 11111|00s0011| Rn | Rt |1PUW|imm8         (0x03)
+          // LDR{S}HT  imm8: 11111|00s0011| Rn | Rt |1110|imm8         (0x03)
+          // LDR{S}H    lit: 11111|00sU011|1111| Rt |imm12             (0x03/0x0B)
+          // LDR{S}H    reg: 11111|00s0011| Rn | Rt |000000|imm2| Rm   (0x03)
+          // LDR      imm12: 11111|0001101| Rn | Rt |imm12             (0x0D)
+          // LDR       imm8: 11111|0000101| Rn | Rt |1PUW|imm8         (0x05)
+          // LDRT      imm8: 11111|0000101| Rn | Rt |1110|imm8         (0x05)
+          // LDR        lit: 11111|000U101|1111| Rt |imm12             (0x05/0x0D)
+          // LDR        reg: 11111|0000101| Rn | Rt |000000|imm2| Rm   (0x05)
+          //
+          // If Rt == 15, instead of load we have preload:
+          // PLD{W}   imm12: 11111|00010W1| Rn |1111|imm12             (0x09/0x0B)
+          // PLD{W}    imm8: 11111|00000W1| Rn |1111|1100|imm8         (0x01/0x03); -imm8
+          // PLD        lit: 11111|000U001|1111|1111|imm12             (0x01/0x09)
+          // PLD{W}     reg: 11111|00000W1| Rn |1111|000000|imm2| Rm   (0x01/0x03)
+          // PLI      imm12: 11111|0011001| Rn |1111|imm12             (0x19)
+          // PLI       imm8: 11111|0010001| Rn |1111|1100|imm8         (0x11); -imm8
+          // PLI        lit: 11111|001U001|1111|1111|imm12             (0x11/0x19)
+          // PLI        reg: 11111|0010001| Rn |1111|000000|imm2| Rm   (0x11)
+
+          bool is_load = HasBitSet(instr, 20);
+          bool is_half = HasBitSet(instr, 21);  // W for PLD/PLDW.
+          bool is_word = HasBitSet(instr, 22);
+          bool is_signed = HasBitSet(instr, 24);
           ArmRegister Rn(instr, 16);
           ArmRegister Rt(instr, 12);
-          if (Rt.r != 15) {
-            if (op3 == 1) {
-              // LDRH.W Rt, [Rn, #imm12]       - 111 11 00 01 011 nnnn tttt iiiiiiiiiiii
-              uint32_t imm12 = instr & 0xFFF;
-              opcode << "ldrh.w";
-              args << Rt << ", [" << Rn << ", #" << imm12 << "]";
-              if (Rn.r == 9) {
-                args << "  ; ";
-                Thread::DumpThreadOffset<4>(args, imm12);
-              } else if (Rn.r == 15) {
-                intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
-                lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
-                args << StringPrintf("  ; 0x%08x", *reinterpret_cast<int32_t*>(lit_adr));
-              }
-            } else if (op3 == 3) {
-              // LDRSH.W Rt, [Rn, #imm12]      - 111 11 00 11 011 nnnn tttt iiiiiiiiiiii
-              // LDRSB.W Rt, [Rn, #imm12]      - 111 11 00 11 001 nnnn tttt iiiiiiiiiiii
-              uint32_t imm12 = instr & 0xFFF;
-              opcode << (HasBitSet(instr, 20) ? "ldrsb.w" : "ldrsh.w");
-              args << Rt << ", [" << Rn << ", #" << imm12 << "]";
-              if (Rn.r == 9) {
-                args << "  ; ";
-                Thread::DumpThreadOffset<4>(args, imm12);
-              } else if (Rn.r == 15) {
-                intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
-                lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
-                args << StringPrintf("  ; 0x%08x", *reinterpret_cast<int32_t*>(lit_adr));
+          uint32_t imm12 = instr & 0xFFF;
+          uint32_t U = (instr >> 23) & 1;  // U for imm12
+          uint32_t imm8 = instr & 0xFF;
+          uint32_t op4 = (instr >> 8) & 0xF;  // 1PUW for imm8
+          if (Rt.r == PC && is_load && !is_word) {
+            // PLD, PLDW, PLI
+            const char* pld_pli = (is_signed ? "pli" : "pld");
+            const char* w = (is_half ? "w" : "");
+            if (is_signed && !is_half) {
+              opcode << "UNDEFINED [PLI+W]";
+            } else if (Rn.r == PC || U != 0u) {
+              opcode << pld_pli << w;
+              args << "[" << Rn << ", #" << (U != 0u ? "" : "-") << imm12 << "]";
+              if (Rn.r == PC && is_half) {
+                args << " (UNPREDICTABLE)";
               }
+            } else if ((instr & 0xFC0) == 0) {
+              opcode << pld_pli << w;
+              RmLslImm2 Rm(instr);
+              args << "[" << Rn << ", " << Rm << "]";
+            } else if (op4 == 0xC) {
+              opcode << pld_pli << w;
+              args << "[" << Rn << ", #-" << imm8 << "]";
+            } else {
+              opcode << "UNDEFINED [~" << pld_pli << "]";
             }
+            break;
+          }
+          const char* ldr_str = is_load ? "ldr" : "str";
+          const char* sign = is_signed ? "s" : "";
+          const char* type = is_word ? "" : is_half ? "h" : "b";
+          bool unpred = (Rt.r == SP && !is_word) || (Rt.r == PC && !is_load);
+          if (Rn.r == PC && !is_load) {
+            opcode << "UNDEFINED [STR-lit]";
+            unpred = false;
+          } else if (Rn.r == PC || U != 0u) {
+            // Load/store with imm12 (load literal if Rn.r == PC; there's no store literal).
+            opcode << ldr_str << sign << type << ".w";
+            args << Rt << ", [" << Rn << ", #" << (U != 0u ? "" : "-") << imm12 << "]";
+            if (Rn.r == TR && is_load) {
+              args << "  ; ";
+              Thread::DumpThreadOffset<4>(args, imm12);
+            } else if (Rn.r == PC) {
+              T2LitType lit_type[] = {
+                  kT2LitUByte, kT2LitUHalf, kT2LitHexWord, kT2LitInvalid,
+                  kT2LitUByte, kT2LitUHalf, kT2LitHexWord, kT2LitInvalid,
+                  kT2LitSByte, kT2LitSHalf, kT2LitInvalid, kT2LitInvalid,
+                  kT2LitSByte, kT2LitSHalf, kT2LitInvalid, kT2LitInvalid,
+              };
+              DCHECK_LT(op2 >> 1, arraysize(lit_type));
+              DCHECK_NE(lit_type[op2 >> 1], kT2LitInvalid);
+              DumpThumb2Literal(args, instr_ptr, U, imm12, lit_type[op2 >> 1]);
+            }
+          } else if ((instr & 0xFC0) == 0) {
+            opcode << ldr_str << sign << type << ".w";
+            RmLslImm2 Rm(instr);
+            args << Rt << ", [" << Rn << ", " << Rm << "]";
+            unpred = unpred || (Rm.rm.r == SP) || (Rm.rm.r == PC);
+          } else if (is_word && Rn.r == SP && imm8 == 4 && op4 == (is_load ? 0xB : 0xD)) {
+            opcode << (is_load ? "pop" : "push") << ".w";
+            args << Rn;
+            unpred = unpred || (Rn.r == SP);
+          } else if ((op4 & 5) == 0) {
+            opcode << "UNDEFINED [P = W = 0 for " << ldr_str << "]";
+            unpred = false;
+          } else {
+            uint32_t P = (instr >> 10) & 1;
+            U = (instr >> 9) & 1;
+            uint32_t W = (instr >> 8) & 1;
+            bool pre_index = (P != 0 && W == 1);
+            bool post_index = (P == 0 && W == 1);
+            const char* t = (P != 0 && U != 0 && W == 0) ? "t" : "";  // Unprivileged load/store?
+            opcode << ldr_str << sign << type << t << ".w";
+            args << Rt << ", [" << Rn << (post_index ? "]" : "") << ", #" << (U != 0 ? "" : "-")
+                << imm8 << (post_index ? "" : "]") << (pre_index ? "!" : "");
+            unpred = (W != 0 && Rn.r == Rt.r);
+          }
+          if (unpred) {
+            args << " (UNPREDICTABLE)";
           }
           break;
         }
@@ -1413,75 +1455,6 @@ size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr)
           }  // else unknown instruction
           break;
         }
-        case 0x05: case 0x0D: case 0x15: case 0x1D: {  // 00xx101
-          // Load word
-          // |111|11|10|0 0|00|0|0000|1111|110000|000000|
-          // |5 3|21|09|8 7|65|4|3  0|5  2|10   6|5    0|
-          // |---|--|--|---|--|-|----|----|------|------|
-          // |332|22|22|2 2|22|2|1111|1111|110000|000000|
-          // |1 9|87|65|4 3|21|0|9  6|5  2|10   6|5    0|
-          // |---|--|--|---|--|-|----|----|------|------|
-          // |111|11|00|op3|10|1| Rn | Rt | op4  |      |
-          // |111|11| op2       |    |    | imm12       |
-          uint32_t op3 = (instr >> 23) & 3;
-          uint32_t op4 = (instr >> 6) & 0x3F;
-          ArmRegister Rn(instr, 16);
-          ArmRegister Rt(instr, 12);
-          if (op3 == 1 || Rn.r == 15) {
-            // LDR.W Rt, [Rn, #imm12]          - 111 11 00 00 101 nnnn tttt iiiiiiiiiiii
-            // LDR.W Rt, [PC, #imm12]          - 111 11 00 0x 101 1111 tttt iiiiiiiiiiii
-            uint32_t imm12 = instr & 0xFFF;
-            opcode << "ldr.w";
-            args << Rt << ", [" << Rn << ", #" << imm12 << "]";
-            if (Rn.r == 9) {
-              args << "  ; ";
-              Thread::DumpThreadOffset<4>(args, imm12);
-            } else if (Rn.r == 15) {
-              intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
-              lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
-              args << StringPrintf("  ; 0x%08x", *reinterpret_cast<int32_t*>(lit_adr));
-            }
-          } else if (op4 == 0) {
-            // LDR.W Rt, [Rn, Rm{, LSL #imm2}] - 111 11 00 00 101 nnnn tttt 000000iimmmm
-            uint32_t imm2 = (instr >> 4) & 0xF;
-            ArmRegister rm(instr, 0);
-            opcode << "ldr.w";
-            args << Rt << ", [" << Rn << ", " << rm;
-            if (imm2 != 0) {
-              args << ", lsl #" << imm2;
-            }
-            args << "]";
-          } else {
-            bool p = (instr & (1 << 10)) != 0;
-            bool w = (instr & (1 << 8)) != 0;
-            bool u = (instr & (1 << 9)) != 0;
-            if (p && u && !w) {
-              // LDRT Rt, [Rn, #imm8]            - 111 11 00 00 101 nnnn tttt 1110iiiiiiii
-              uint32_t imm8 = instr & 0xFF;
-              opcode << "ldrt";
-              args << Rt << ", [" << Rn << ", #" << imm8 << "]";
-            } else if (Rn.r == 13 && !p && u && w && (instr & 0xff) == 4) {
-              // POP
-              opcode << "pop";
-              args << "{" << Rt << "}";
-           } else {
-              bool wback = !p || w;
-              uint32_t offset = (instr & 0xff);
-              opcode << "ldr.w";
-              args << Rt << ",";
-              if (p && !wback) {
-                args << "[" << Rn << ", #" << offset << "]";
-              } else if (p && wback) {
-                args << "[" << Rn << ", #" << offset << "]!";
-              } else if (!p && wback) {
-                args << "[" << Rn << "], #" << offset;
-              } else {
-                LOG(FATAL) << p << " " << w;
-              }
-            }
-          }
-          break;
-        }
       default:      // more formats
         if ((op2 >> 4) == 2) {      // 010xxxx
           // data processing (register)
@@ -1808,6 +1781,23 @@ size_t DisassemblerArm::DumpThumb16(std::ostream& os, const uint8_t* instr_ptr)
           DumpBranchTarget(args, instr_ptr + 4, imm32);
           break;
         }
+        case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27:
+        case 0x28: case 0x29: case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: case 0x2F: {
+          opcode << "push";
+          args << RegisterList((instr & 0xFF) | ((instr & 0x100) << 6));
+          break;
+        }
+        case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67:
+        case 0x68: case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F: {
+          opcode << "pop";
+          args << RegisterList((instr & 0xFF) | ((instr & 0x100) << 7));
+          break;
+        }
+        case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77: {
+          opcode << "bkpt";
+          args << "#" << (instr & 0xFF);
+          break;
+        }
         case 0x50: case 0x51:    // 101000x
         case 0x52: case 0x53:    // 101001x
         case 0x56: case 0x57: {  // 101011x
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 556f2f8..5f5d3f7 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -3192,7 +3192,7 @@ static bool IsMethodPossiblyInlined(Thread* self, mirror::ArtMethod* m)
   Handle<mirror::ArtMethod> method(hs.NewHandle(m));
   verifier::MethodVerifier verifier(self, dex_cache->GetDexFile(), dex_cache, class_loader,
                                     &m->GetClassDef(), code_item, m->GetDexMethodIndex(), method,
-                                    m->GetAccessFlags(), false, true, false);
+                                    m->GetAccessFlags(), false, true, false, true);
   // Note: we don't need to verify the method.
   return InlineMethodAnalyser::AnalyseMethodCode(&verifier, nullptr);
 }
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 1ef5221..ef63080 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -1001,14 +1001,9 @@ void Monitor::VisitLocks(StackVisitor* stack_visitor, void (*callback)(mirror::O
   // the locks held in this stack frame.
   std::vector<uint32_t> monitor_enter_dex_pcs;
   verifier::MethodVerifier::FindLocksAtDexPc(m, dex_pc, &monitor_enter_dex_pcs);
-  if (monitor_enter_dex_pcs.empty()) {
-    return;
-  }
-
-  for (size_t i = 0; i < monitor_enter_dex_pcs.size(); ++i) {
+  for (uint32_t monitor_dex_pc : monitor_enter_dex_pcs) {
     // The verifier works in terms of the dex pcs of the monitor-enter instructions.
     // We want the registers used by those instructions (so we can read the values out of them).
-    uint32_t monitor_dex_pc = monitor_enter_dex_pcs[i];
     uint16_t monitor_enter_instruction = code_item->insns_[monitor_dex_pc];
 
     // Quick sanity check.
@@ -1018,8 +1013,8 @@ void Monitor::VisitLocks(StackVisitor* stack_visitor, void (*callback)(mirror::O
     }
 
     uint16_t monitor_register = ((monitor_enter_instruction >> 8) & 0xff);
-    mirror::Object* o = reinterpret_cast<mirror::Object*>(stack_visitor->GetVReg(m, monitor_register,
-                                                                                 kReferenceVReg));
+    mirror::Object* o = reinterpret_cast<mirror::Object*>(
+        stack_visitor->GetVReg(m, monitor_register, kReferenceVReg));
     callback(o, callback_context);
   }
 }
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index f37312e..44c6d87 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -301,7 +301,10 @@ static jbyte IsDexOptNeededForFile(const std::string& oat_filename, const char*
                                                         nullptr,
                                                         false, &error_msg));
   if (oat_file.get() == nullptr) {
-    if (kReasonLogging) {
+    // Note that even though this is kDexoptNeeded, we use
+    // kVerboseLogging instead of the usual kReasonLogging since it is
+    // the common case on first boot and very spammy.
+    if (kVerboseLogging) {
       LOG(INFO) << "DexFile_isDexOptNeeded failed to open oat file '" << oat_filename
           << "' for file location '" << filename << "': " << error_msg;
     }
diff --git a/runtime/quick_exception_handler.cc b/runtime/quick_exception_handler.cc
index 90c9fe7..3517848 100644
--- a/runtime/quick_exception_handler.cc
+++ b/runtime/quick_exception_handler.cc
@@ -214,7 +214,7 @@ class DeoptimizeStackVisitor FINAL : public StackVisitor {
     Handle<mirror::ArtMethod> h_method(hs.NewHandle(m));
     verifier::MethodVerifier verifier(self_, h_dex_cache->GetDexFile(), h_dex_cache, h_class_loader,
                                       &m->GetClassDef(), code_item, m->GetDexMethodIndex(),
-                                      h_method, m->GetAccessFlags(), false, true, true);
+                                      h_method, m->GetAccessFlags(), false, true, true, true);
     verifier.Verify();
     const std::vector<int32_t> kinds(verifier.DescribeVRegs(dex_pc));
     for (uint16_t reg = 0; reg < num_regs; ++reg) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 5ff7490..d2d5be7 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -932,7 +932,10 @@ struct StackDumpVisitor : public StackVisitor {
         os << StringPrintf("<@addr=0x%" PRIxPTR "> (a %s)", reinterpret_cast<intptr_t>(o),
                            PrettyTypeOf(o).c_str());
       } else {
-        os << StringPrintf("<0x%08x> (a %s)", o->IdentityHashCode(), PrettyTypeOf(o).c_str());
+        // IdentityHashCode can cause thread suspension, which would invalidate o if it moved. So
+        // we get the pretty type before we call IdentityHashCode.
+        const std::string pretty_type(PrettyTypeOf(o));
+        os << StringPrintf("<0x%08x> (a %s)", o->IdentityHashCode(), pretty_type.c_str());
       }
     }
     os << "\n";
@@ -1339,7 +1342,6 @@ void Thread::HandleScopeVisitRoots(RootCallback* visitor, void* arg, uint32_t th
 }
 
 mirror::Object* Thread::DecodeJObject(jobject obj) const {
-  Locks::mutator_lock_->AssertSharedHeld(this);
   if (obj == nullptr) {
     return nullptr;
   }
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 66846b5..88944d7 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -286,7 +286,7 @@ MethodVerifier::FailureKind MethodVerifier::VerifyMethod(Thread* self, uint32_t
 
   MethodVerifier verifier(self, dex_file, dex_cache, class_loader, class_def, code_item,
                           method_idx, method, method_access_flags, true, allow_soft_failures,
-                          need_precise_constants);
+                          need_precise_constants, true);
   if (verifier.Verify()) {
     // Verification completed, however failures may be pending that didn't cause the verification
     // to hard fail.
@@ -352,7 +352,8 @@ MethodVerifier::MethodVerifier(Thread* self,
                                const DexFile::CodeItem* code_item, uint32_t dex_method_idx,
                                Handle<mirror::ArtMethod> method, uint32_t method_access_flags,
                                bool can_load_classes, bool allow_soft_failures,
-                               bool need_precise_constants, bool verify_to_dump)
+                               bool need_precise_constants, bool verify_to_dump,
+                               bool allow_thread_suspension)
     : self_(self),
       reg_types_(can_load_classes),
       work_insn_idx_(-1),
@@ -377,7 +378,8 @@ MethodVerifier::MethodVerifier(Thread* self,
       need_precise_constants_(need_precise_constants),
       has_check_casts_(false),
       has_virtual_or_interface_invokes_(false),
-      verify_to_dump_(verify_to_dump) {
+      verify_to_dump_(verify_to_dump),
+      allow_thread_suspension_(allow_thread_suspension) {
   Runtime::Current()->AddMethodVerifier(this);
   DCHECK(class_def != nullptr);
 }
@@ -396,7 +398,7 @@ void MethodVerifier::FindLocksAtDexPc(mirror::ArtMethod* m, uint32_t dex_pc,
   Handle<mirror::ArtMethod> method(hs.NewHandle(m));
   MethodVerifier verifier(self, m->GetDexFile(), dex_cache, class_loader, &m->GetClassDef(),
                           m->GetCodeItem(), m->GetDexMethodIndex(), method, m->GetAccessFlags(),
-                          false, true, false);
+                          false, true, false, false);
   verifier.interesting_dex_pc_ = dex_pc;
   verifier.monitor_enter_dex_pcs_ = monitor_enter_dex_pcs;
   verifier.FindLocksAtDexPc();
@@ -443,7 +445,7 @@ mirror::ArtField* MethodVerifier::FindAccessedFieldAtDexPc(mirror::ArtMethod* m,
   Handle<mirror::ArtMethod> method(hs.NewHandle(m));
   MethodVerifier verifier(self, m->GetDexFile(), dex_cache, class_loader, &m->GetClassDef(),
                           m->GetCodeItem(), m->GetDexMethodIndex(), method, m->GetAccessFlags(),
-                          true, true, false);
+                          true, true, false, true);
   return verifier.FindAccessedFieldAtDexPc(dex_pc);
 }
 
@@ -475,7 +477,7 @@ mirror::ArtMethod* MethodVerifier::FindInvokedMethodAtDexPc(mirror::ArtMethod* m
   Handle<mirror::ArtMethod> method(hs.NewHandle(m));
   MethodVerifier verifier(self, m->GetDexFile(), dex_cache, class_loader, &m->GetClassDef(),
                           m->GetCodeItem(), m->GetDexMethodIndex(), method, m->GetAccessFlags(),
-                          true, true, false);
+                          true, true, false, true);
   return verifier.FindInvokedMethodAtDexPc(dex_pc);
 }
 
@@ -1402,7 +1404,9 @@ bool MethodVerifier::CodeFlowVerifyMethod() {
 
   /* Continue until no instructions are marked "changed". */
   while (true) {
-    self_->AllowThreadSuspension();
+    if (allow_thread_suspension_) {
+      self_->AllowThreadSuspension();
+    }
     // Find the first marked one. Use "start_guess" as a way to find one quickly.
     uint32_t insn_idx = start_guess;
     for (; insn_idx < insns_size; insn_idx++) {
diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h
index 15a09c5..b83e647 100644
--- a/runtime/verifier/method_verifier.h
+++ b/runtime/verifier/method_verifier.h
@@ -207,10 +207,11 @@ class MethodVerifier {
                  const DexFile::CodeItem* code_item, uint32_t method_idx,
                  Handle<mirror::ArtMethod> method,
                  uint32_t access_flags, bool can_load_classes, bool allow_soft_failures,
-                 bool need_precise_constants) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+                 bool need_precise_constants, bool allow_thread_suspension)
+          SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       : MethodVerifier(self, dex_file, dex_cache, class_loader, class_def, code_item, method_idx,
                        method, access_flags, can_load_classes, allow_soft_failures,
-                       need_precise_constants, false) {}
+                       need_precise_constants, false, allow_thread_suspension) {}
 
   ~MethodVerifier();
 
@@ -260,7 +261,7 @@ class MethodVerifier {
                  const DexFile::CodeItem* code_item, uint32_t method_idx,
                  Handle<mirror::ArtMethod> method, uint32_t access_flags,
                  bool can_load_classes, bool allow_soft_failures, bool need_precise_constants,
-                 bool verify_to_dump)
+                 bool verify_to_dump, bool allow_thread_suspension)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Adds the given string to the beginning of the last failure message.
@@ -729,6 +730,11 @@ class MethodVerifier {
   // VerifyMethodAndDump.
   const bool verify_to_dump_;
 
+  // Whether or not we call AllowThreadSuspension periodically. We want a way to disable this for
+  // thread dumping checkpoints since we may get thread suspension at an inopportune time due to
+  // FindLocksAtDexPc, resulting in deadlocks.
+  const bool allow_thread_suspension_;
+
   DISALLOW_COPY_AND_ASSIGN(MethodVerifier);
 };
 std::ostream& operator<<(std::ostream& os, const MethodVerifier::FailureKind& rhs);
diff --git a/sigchainlib/sigchain.cc b/sigchainlib/sigchain.cc
index 601e321..2eb518c 100644
--- a/sigchainlib/sigchain.cc
+++ b/sigchainlib/sigchain.cc
@@ -170,12 +170,13 @@ extern "C" int sigaction(int signal, const struct sigaction* new_action, struct
   // Note that we check that the signal number is in range here.  An out of range signal
   // number should behave exactly as the libc sigaction.
   if (signal > 0 && signal < _NSIG && user_sigactions[signal].IsClaimed()) {
-    if (old_action != NULL) {
-      *old_action = user_sigactions[signal].GetAction();
-    }
+    struct sigaction saved_action = user_sigactions[signal].GetAction();
     if (new_action != NULL) {
       user_sigactions[signal].SetAction(*new_action, false);
     }
+    if (old_action != NULL) {
+      *old_action = saved_action;
+    }
     return 0;
   }
 
diff --git a/test/800-smali/expected.txt b/test/800-smali/expected.txt
index d7aede3..6cb08f4 100644
--- a/test/800-smali/expected.txt
+++ b/test/800-smali/expected.txt
@@ -12,4 +12,5 @@ FloatIntConstPassing
 b/18718277
 b/18800943 (1)
 b/18800943 (2)
+MoveExc
 Done!
diff --git a/test/800-smali/smali/move_exc.smali b/test/800-smali/smali/move_exc.smali
new file mode 100644
index 0000000..4ade4bc
--- /dev/null
+++ b/test/800-smali/smali/move_exc.smali
@@ -0,0 +1,29 @@
+.class public LMoveExc;
+.super Ljava/lang/Object;
+
+
+.method public constructor <init>()V
+.registers 1
+       invoke-direct {p0}, Ljava/lang/Object;-><init>()V
+       return-void
+.end method
+
+.method public static run()V
+.registers 6
+:Label1
+       const v1, 15
+       const v2, 0
+       div-int v0, v1, v2
+
+:Label2
+       goto :Label4
+
+:Label3
+       move-exception v3
+       throw v3
+
+:Label4
+       return-void
+
+.catchall {:Label1 .. :Label2} :Label3
+.end method
diff --git a/test/800-smali/src/Main.java b/test/800-smali/src/Main.java
index ea25da6..2eda850 100644
--- a/test/800-smali/src/Main.java
+++ b/test/800-smali/src/Main.java
@@ -68,6 +68,7 @@ public class Main {
         testCases.add(new TestCase("b/18718277", "B18718277", "getInt", null, null, 0));
         testCases.add(new TestCase("b/18800943 (1)", "B18800943_1", "n_a", null, new VerifyError(), 0));
         testCases.add(new TestCase("b/18800943 (2)", "B18800943_2", "n_a", null, new VerifyError(), 0));
+        testCases.add(new TestCase("MoveExc", "MoveExc", "run", null, new ArithmeticException(), null));
     }
 
     public void runTests() {