-rw-r--r--  Android.mk | 10
-rw-r--r--  build/Android.gtest.mk | 3
-rw-r--r--  build/Android.oat.mk | 4
-rw-r--r--  compiler/compilers.cc | 14
-rw-r--r--  compiler/compilers.h | 11
-rw-r--r--  compiler/dex/mir_graph.cc | 4
-rw-r--r--  compiler/dex/mir_graph.h | 11
-rw-r--r--  compiler/dex/quick/arm/call_arm.cc | 6
-rw-r--r--  compiler/dex/quick/arm/codegen_arm.h | 25
-rw-r--r--  compiler/dex/quick/arm/int_arm.cc | 45
-rw-r--r--  compiler/dex/quick/arm64/arm64_lir.h | 5
-rw-r--r--  compiler/dex/quick/arm64/assemble_arm64.cc | 22
-rw-r--r--  compiler/dex/quick/arm64/call_arm64.cc | 6
-rw-r--r--  compiler/dex/quick/arm64/codegen_arm64.h | 622
-rw-r--r--  compiler/dex/quick/arm64/fp_arm64.cc | 47
-rw-r--r--  compiler/dex/quick/arm64/int_arm64.cc | 103
-rw-r--r--  compiler/dex/quick/arm64/utility_arm64.cc | 210
-rw-r--r--  compiler/dex/quick/codegen_util.cc | 23
-rw-r--r--  compiler/dex/quick/dex_file_method_inliner.cc | 47
-rw-r--r--  compiler/dex/quick/dex_file_method_inliner.h | 6
-rw-r--r--  compiler/dex/quick/gen_common.cc | 143
-rwxr-xr-x  compiler/dex/quick/gen_invoke.cc | 65
-rw-r--r--  compiler/dex/quick/local_optimizations.cc | 4
-rw-r--r--  compiler/dex/quick/mips/call_mips.cc | 6
-rw-r--r--  compiler/dex/quick/mips/codegen_mips.h | 28
-rw-r--r--  compiler/dex/quick/mips/int_mips.cc | 47
-rw-r--r--  compiler/dex/quick/mir_to_lir.cc | 6
-rw-r--r--  compiler/dex/quick/mir_to_lir.h | 60
-rw-r--r--  compiler/dex/quick/ralloc_util.cc | 19
-rw-r--r--  compiler/dex/quick/x86/assemble_x86.cc | 22
-rw-r--r--  compiler/dex/quick/x86/call_x86.cc | 6
-rw-r--r--  compiler/dex/quick/x86/codegen_x86.h | 42
-rwxr-xr-x  compiler/dex/quick/x86/int_x86.cc | 241
-rwxr-xr-x  compiler/dex/quick/x86/target_x86.cc | 192
-rw-r--r--  compiler/dex/quick/x86/utility_x86.cc | 40
-rw-r--r--  compiler/driver/compiler_driver.cc | 116
-rw-r--r--  compiler/driver/compiler_driver.h | 34
-rw-r--r--  compiler/elf_patcher.cc | 27
-rw-r--r--  compiler/elf_writer_quick.cc | 222
-rw-r--r--  compiler/elf_writer_quick.h | 6
-rw-r--r--  compiler/image_test.cc | 2
-rw-r--r--  compiler/oat_writer.cc | 48
-rw-r--r--  compiler/oat_writer.h | 14
-rw-r--r--  compiler/utils/arena_allocator.h | 12
-rw-r--r--  compiler/utils/arena_containers.h | 205
-rw-r--r--  compiler/utils/scoped_arena_allocator.h | 123
-rw-r--r--  compiler/utils/scoped_arena_containers.h | 142
-rw-r--r--  compiler/utils/x86_64/assembler_x86_64.cc | 2
-rw-r--r--  compiler/utils/x86_64/assembler_x86_64.h | 4
-rw-r--r--  compiler/utils/x86_64/assembler_x86_64_test.cc | 18
-rw-r--r--  dex2oat/dex2oat.cc | 3
-rw-r--r--  disassembler/disassembler_x86.cc | 4
-rw-r--r--  runtime/Android.mk | 1
-rw-r--r--  runtime/arch/arm/quick_entrypoints_arm.S | 3
-rw-r--r--  runtime/arch/memcmp16.h | 2
-rw-r--r--  runtime/arch/x86/memcmp16_x86.S | 1812
-rwxr-xr-x  runtime/arch/x86_64/memcmp16_x86_64.S | 1210
-rw-r--r--  runtime/class_linker.cc | 106
-rw-r--r--  runtime/class_linker.h | 16
-rw-r--r--  runtime/common_runtime_test.cc | 27
-rw-r--r--  runtime/common_runtime_test.h | 8
-rw-r--r--  runtime/dex_file.cc | 33
-rw-r--r--  runtime/dex_file.h | 17
-rw-r--r--  runtime/dex_file_test.cc | 27
-rw-r--r--  runtime/elf_file.cc | 24
-rw-r--r--  runtime/entrypoints/quick/quick_trampoline_entrypoints.cc | 3
-rw-r--r--  runtime/gc/accounting/card_table-inl.h | 9
-rw-r--r--  runtime/gc/accounting/card_table.cc | 19
-rw-r--r--  runtime/gc/accounting/card_table.h | 8
-rw-r--r--  runtime/gc/accounting/card_table_test.cc | 143
-rw-r--r--  runtime/gc/space/image_space.cc | 3
-rw-r--r--  runtime/gc/space/image_space.h | 3
-rw-r--r--  runtime/globals.h | 2
-rw-r--r--  runtime/lock_word.h | 2
-rw-r--r--  runtime/mem_map.cc | 160
-rw-r--r--  runtime/mem_map.h | 13
-rw-r--r--  runtime/mirror/art_method.cc | 12
-rw-r--r--  runtime/monitor_pool.cc | 6
-rw-r--r--  runtime/native/dalvik_system_DexFile.cc | 278
-rw-r--r--  runtime/oat_file.cc | 21
-rw-r--r--  runtime/parsed_options.h | 2
-rw-r--r--  runtime/proxy_test.cc | 10
-rw-r--r--  runtime/quick/inline_method_analyser.h | 7
-rw-r--r--  runtime/utils.cc | 2
-rw-r--r--  runtime/utils_test.cc | 2
-rw-r--r--  runtime/verifier/method_verifier.cc | 2
-rw-r--r--  test/015-switch/expected.txt | 6
-rw-r--r--  test/015-switch/src/Main.java | 10
-rw-r--r--  test/082-inline-execute/src/Main.java | 206
-rw-r--r--  test/115-native-bridge/expected.txt | 62
-rw-r--r--  test/115-native-bridge/nativebridge.cc | 199
-rw-r--r--  test/Android.run-test.mk | 30
-rwxr-xr-x  test/run-test | 6
93 files changed, 5178 insertions(+), 2461 deletions(-)
diff --git a/Android.mk b/Android.mk
index e536a71..15e8308 100644
--- a/Android.mk
+++ b/Android.mk
@@ -67,8 +67,13 @@ ifdef TARGET_2ND_ARCH
rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/JAVA_LIBRARIES/*_intermediates/javalib.odex
rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/APPS/*_intermediates/*.odex
endif
+ifneq ($(TMPDIR),)
+ rm -rf $(TMPDIR)/$(USER)/test-*/dalvik-cache/*
+ rm -rf $(TMPDIR)/android-data/dalvik-cache/*
+else
rm -rf /tmp/$(USER)/test-*/dalvik-cache/*
rm -rf /tmp/android-data/dalvik-cache/*
+endif
.PHONY: clean-oat-target
clean-oat-target:
@@ -309,14 +314,15 @@ else
.PHONY: oat-target-$(1)
oat-target-$(1): $$(OUT_OAT_FILE)
-$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD)
+$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY)
@mkdir -p $$(dir $$@)
$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
--boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \
--dex-location=/$(1) --oat-file=$$@ \
--instruction-set=$(DEX2OAT_TARGET_ARCH) \
--instruction-set-features=$(DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES) \
- --android-root=$(PRODUCT_OUT)/system --include-patch-information
+ --android-root=$(PRODUCT_OUT)/system --include-patch-information \
+ --runtime-arg -Xnorelocate
endif
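
[Note: the clean-oat rules above now honor $(TMPDIR) instead of hard-coding /tmp. A minimal C++ sketch of the same fallback, assuming only what the ifneq check implies: an unset or empty TMPDIR means the /tmp default applies.]

#include <cstdlib>
#include <string>

// Temp root the cleanup rules operate under: $TMPDIR if set and non-empty,
// otherwise the historical /tmp default.
std::string TempRoot() {
  const char* tmpdir = std::getenv("TMPDIR");
  return (tmpdir != nullptr && *tmpdir != '\0') ? std::string(tmpdir) : "/tmp";
}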
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 6e27190..17c478c 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -91,6 +91,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \
runtime/entrypoints/quick/quick_trampoline_entrypoints_test.cc \
runtime/entrypoints_order_test.cc \
runtime/exception_test.cc \
+ runtime/gc/accounting/card_table_test.cc \
runtime/gc/accounting/space_bitmap_test.cc \
runtime/gc/heap_test.cc \
runtime/gc/space/dlmalloc_space_base_test.cc \
@@ -113,6 +114,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \
runtime/monitor_pool_test.cc \
runtime/monitor_test.cc \
runtime/parsed_options_test.cc \
+ runtime/proxy_test.cc \
runtime/reference_table_test.cc \
runtime/thread_pool_test.cc \
runtime/transaction_test.cc \
@@ -123,7 +125,6 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \
COMPILER_GTEST_COMMON_SRC_FILES := \
runtime/jni_internal_test.cc \
- runtime/proxy_test.cc \
runtime/reflection_test.cc \
compiler/dex/global_value_numbering_test.cc \
compiler/dex/local_value_numbering_test.cc \
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index cd6b13a..10936a4 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -26,7 +26,7 @@ include art/build/Android.common_path.mk
# Use dex2oat debug version for better error reporting
# $(1): 2ND_ or undefined, 2ND_ for 32-bit host builds.
define create-core-oat-host-rules
-$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD)
+$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
@echo "host dex2oat: $$@ ($$?)"
@mkdir -p $$(dir $$@)
$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_IMAGE_XMS) --runtime-arg -Xmx$(DEX2OAT_IMAGE_XMX) \
@@ -49,7 +49,7 @@ $(eval $(call create-core-oat-host-rules,2ND_))
endif
define create-core-oat-target-rules
-$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD)
+$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
@echo "target dex2oat: $$@ ($$?)"
@mkdir -p $$(dir $$@)
$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
diff --git a/compiler/compilers.cc b/compiler/compilers.cc
index 250924a..5cf846f 100644
--- a/compiler/compilers.cc
+++ b/compiler/compilers.cc
@@ -38,9 +38,6 @@ extern "C" art::CompiledMethod* ArtQuickJniCompileMethod(art::CompilerDriver* dr
uint32_t access_flags, uint32_t method_idx,
const art::DexFile& dex_file);
-// Hack for CFI CIE initialization
-extern std::vector<uint8_t>* X86CFIInitialization(bool is_x86_64);
-
void QuickCompiler::Init() const {
ArtInitQuickCompilerContext(GetCompilerDriver());
}
@@ -126,17 +123,6 @@ Backend* QuickCompiler::GetCodeGenerator(CompilationUnit* cu, void* compilation_
return mir_to_lir;
}
-std::vector<uint8_t>* QuickCompiler::GetCallFrameInformationInitialization(
- const CompilerDriver& driver) const {
- if (driver.GetInstructionSet() == kX86) {
- return X86CFIInitialization(false);
- }
- if (driver.GetInstructionSet() == kX86_64) {
- return X86CFIInitialization(true);
- }
- return nullptr;
-}
-
CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
uint32_t access_flags,
InvokeType invoke_type,
diff --git a/compiler/compilers.h b/compiler/compilers.h
index 2c231e1..151bf6f 100644
--- a/compiler/compilers.h
+++ b/compiler/compilers.h
@@ -56,17 +56,6 @@ class QuickCompiler : public Compiler {
void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE {}
- /*
- * @brief Generate and return Dwarf CFI initialization, if supported by the
- * backend.
- * @param driver CompilerDriver for this compile.
- * @returns nullptr if not supported by backend or a vector of bytes for CFI DWARF
- * information.
- * @note This is used for backtrace information in generated code.
- */
- std::vector<uint8_t>* GetCallFrameInformationInitialization(const CompilerDriver& driver) const
- OVERRIDE;
-
private:
DISALLOW_COPY_AND_ASSIGN(QuickCompiler);
};
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index 8d0a5a3..6aee563 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -69,6 +69,7 @@ const char* MIRGraph::extended_mir_op_names_[kMirOpLast - kMirOpFirst] = {
MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
: reg_location_(NULL),
+ block_id_map_(std::less<unsigned int>(), arena->Adapter()),
cu_(cu),
ssa_base_vregs_(NULL),
ssa_subscripts_(NULL),
@@ -101,11 +102,14 @@ MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
num_blocks_(0),
current_code_item_(NULL),
dex_pc_to_block_map_(arena, 0, kGrowableArrayMisc),
+ m_units_(arena->Adapter()),
+ method_stack_(arena->Adapter()),
current_method_(kInvalidEntry),
current_offset_(kInvalidEntry),
def_count_(0),
opcode_count_(NULL),
num_ssa_regs_(0),
+ extended_basic_blocks_(arena->Adapter()),
method_sreg_(0),
attributes_(METHOD_IS_LEAF), // Start with leaf assumption, change on encountering invoke.
checkstats_(NULL),
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 768ae21..491d72e 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -27,6 +27,7 @@
#include "mir_method_info.h"
#include "utils/arena_bit_vector.h"
#include "utils/growable_array.h"
+#include "utils/arena_containers.h"
#include "utils/scoped_arena_containers.h"
#include "reg_location.h"
#include "reg_storage.h"
@@ -1051,8 +1052,8 @@ class MIRGraph {
std::set<uint32_t> catches_;
// TODO: make these private.
- RegLocation* reg_location_; // Map SSA names to location.
- SafeMap<unsigned int, unsigned int> block_id_map_; // Block collapse lookup cache.
+ RegLocation* reg_location_; // Map SSA names to location.
+ ArenaSafeMap<unsigned int, unsigned int> block_id_map_; // Block collapse lookup cache.
static const char* extended_mir_op_names_[kMirOpLast - kMirOpFirst];
static const uint32_t analysis_attributes_[kMirOpLast];
@@ -1171,15 +1172,15 @@ class MIRGraph {
unsigned int num_blocks_;
const DexFile::CodeItem* current_code_item_;
GrowableArray<uint16_t> dex_pc_to_block_map_; // FindBlock lookup cache.
- std::vector<DexCompilationUnit*> m_units_; // List of methods included in this graph
+ ArenaVector<DexCompilationUnit*> m_units_; // List of methods included in this graph
typedef std::pair<int, int> MIRLocation; // Insert point, (m_unit_ index, offset)
- std::vector<MIRLocation> method_stack_; // Include stack
+ ArenaVector<MIRLocation> method_stack_; // Include stack
int current_method_;
DexOffset current_offset_; // Offset in code units
int def_count_; // Used to estimate size of ssa name storage.
int* opcode_count_; // Dex opcode coverage stats.
int num_ssa_regs_; // Number of names following SSA transformation.
- std::vector<BasicBlockId> extended_basic_blocks_; // Heads of block "traces".
+ ArenaVector<BasicBlockId> extended_basic_blocks_; // Heads of block "traces".
int method_sreg_;
unsigned int attributes_;
Checkstats* checkstats_;
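
[Note: the members above switch from std:: containers to arena-backed ArenaVector/ArenaSafeMap so their storage comes out of the compilation arena and is released in one shot with it. A toy sketch of the adapter idea behind arena->Adapter(); the class names and buffer size here are illustrative, not ART's.]

#include <cstddef>
#include <vector>

class Arena {  // stand-in for art::ArenaAllocator
 public:
  void* Alloc(size_t bytes) {
    pos_ = (pos_ + 7) & ~static_cast<size_t>(7);  // 8-byte align; no overflow check in this toy
    void* p = buffer_ + pos_;
    pos_ += bytes;
    return p;
  }
 private:
  alignas(8) char buffer_[1 << 16];
  size_t pos_ = 0;
};

template <typename T>
struct ArenaAdapter {  // STL-compatible allocator handed out by the arena
  using value_type = T;
  explicit ArenaAdapter(Arena* arena) : arena_(arena) {}
  template <typename U>
  ArenaAdapter(const ArenaAdapter<U>& other) : arena_(other.arena_) {}
  T* allocate(size_t n) { return static_cast<T*>(arena_->Alloc(n * sizeof(T))); }
  void deallocate(T*, size_t) {}  // arena memory is reclaimed wholesale, never per object
  Arena* arena_;
};

template <typename T, typename U>
bool operator==(const ArenaAdapter<T>& a, const ArenaAdapter<U>& b) { return a.arena_ == b.arena_; }
template <typename T, typename U>
bool operator!=(const ArenaAdapter<T>& a, const ArenaAdapter<U>& b) { return !(a == b); }

template <typename T>
using ArenaVector = std::vector<T, ArenaAdapter<T>>;

int main() {
  Arena arena;
  ArenaVector<int> v(ArenaAdapter<int>(&arena));  // cf. m_units_(arena->Adapter()) above
  v.push_back(42);
}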
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 5059c5f..b133991 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -43,8 +43,7 @@ namespace art {
* add rARM_PC, r_disp ; This is the branch from which we compute displacement
* cbnz r_idx, lp
*/
-void ArmMir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void ArmMir2Lir::GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -92,8 +91,7 @@ void ArmMir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
}
-void ArmMir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void ArmMir2Lir::GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index e0b8ec6..072acbe 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -84,6 +84,8 @@ class ArmMir2Lir FINAL : public Mir2Lir {
RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
// Required for target - Dalvik-level generators.
+ void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_src2);
void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array,
@@ -92,12 +94,6 @@ class ArmMir2Lir FINAL : public Mir2Lir {
RegLocation rl_src, int scale, bool card_mark);
void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_shift);
- void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenAndLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2);
void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
@@ -112,16 +108,6 @@ class ArmMir2Lir FINAL : public Mir2Lir {
bool GenInlinedSqrt(CallInfo* info);
bool GenInlinedPeek(CallInfo* info, OpSize size);
bool GenInlinedPoke(CallInfo* info, OpSize size);
- void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
- void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
- void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenSubLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2, bool is_div);
RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
@@ -144,8 +130,8 @@ class ArmMir2Lir FINAL : public Mir2Lir {
int first_bit, int second_bit);
void GenNegDouble(RegLocation rl_dest, RegLocation rl_src);
void GenNegFloat(RegLocation rl_dest, RegLocation rl_src);
- void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
- void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
// Required for target - single operation generators.
LIR* OpUnconditionalBranch(LIR* target);
@@ -201,6 +187,9 @@ class ArmMir2Lir FINAL : public Mir2Lir {
size_t GetInstructionOffset(LIR* lir);
private:
+ void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
+ void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2);
void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1, int64_t val,
ConditionCode ccode);
LIR* LoadFPConstantValue(int r_dest, int value);
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index dd14ed9..6711ab3 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -1039,15 +1039,6 @@ bool ArmMir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) {
#endif
}
-void ArmMir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
- LOG(FATAL) << "Unexpected use GenNotLong()";
-}
-
-void ArmMir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2, bool is_div) {
- LOG(FATAL) << "Unexpected use GenDivRemLong()";
-}
-
void ArmMir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
rl_src = LoadValueWide(rl_src, kCoreReg);
RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
@@ -1173,29 +1164,23 @@ void ArmMir2Lir::GenMulLong(Instruction::Code opcode, RegLocation rl_dest,
StoreValueWide(rl_dest, rl_result);
}
-void ArmMir2Lir::GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenAddLong for Arm";
-}
-
-void ArmMir2Lir::GenSubLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenSubLong for Arm";
-}
-
-void ArmMir2Lir::GenAndLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenAndLong for Arm";
-}
+void ArmMir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) {
+ switch (opcode) {
+ case Instruction::MUL_LONG:
+ case Instruction::MUL_LONG_2ADDR:
+ GenMulLong(opcode, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::NEG_LONG:
+ GenNegLong(rl_dest, rl_src2);
+ return;
-void ArmMir2Lir::GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenOrLong for Arm";
-}
+ default:
+ break;
+ }
-void ArmMir2Lir::GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of genXoLong for Arm";
+ // Fallback for all other ops.
+ Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
}
/*
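
[Note: with this change the Arm backend no longer carries a LOG(FATAL) stub per long opcode; one GenArithOpLong handles the two cases 32-bit ARM specializes and defers everything else to the shared Mir2Lir code. The dispatch-with-fallback shape, reduced to a compilable sketch with names simplified from the real hierarchy:]

enum LongOp { kMulLong, kNegLong, kAddLong, kSubLong };

struct Mir2LirBase {
  virtual ~Mir2LirBase() = default;
  virtual void GenArithOpLong(LongOp op) {
    // Portable lowering shared by all backends (body elided).
  }
};

struct ArmBackend : Mir2LirBase {
  void GenArithOpLong(LongOp op) override {
    switch (op) {
      case kMulLong: GenMulLong(); return;  // ARM-tuned sequence
      case kNegLong: GenNegLong(); return;  // ARM-tuned sequence
      default: break;                       // add, sub, and, or, xor, ...
    }
    Mir2LirBase::GenArithOpLong(op);        // fall back to the common code
  }
 private:
  void GenMulLong() {}
  void GenNegLong() {}
};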
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index 90cb156..a449cbd 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -267,6 +267,8 @@ enum ArmOpcode {
kA64Fcvtzs2xf, // fcvtzs [100111100s111000000000] rn[9-5] rd[4-0].
kA64Fcvt2Ss, // fcvt [0001111000100010110000] rn[9-5] rd[4-0].
kA64Fcvt2sS, // fcvt [0001111001100010010000] rn[9-5] rd[4-0].
+ kA64Fcvtms2ws, // fcvtms [0001111000110000000000] rn[9-5] rd[4-0].
+ kA64Fcvtms2xS, // fcvtms [1001111001110000000000] rn[9-5] rd[4-0].
kA64Fdiv3fff, // fdiv[000111100s1] rm[20-16] [000110] rn[9-5] rd[4-0].
kA64Fmax3fff, // fmax[000111100s1] rm[20-16] [010010] rn[9-5] rd[4-0].
kA64Fmin3fff, // fmin[000111100s1] rm[20-16] [010110] rn[9-5] rd[4-0].
@@ -278,6 +280,9 @@ enum ArmOpcode {
kA64Fmov2xS, // fmov[1001111001101111000000] rn[9-5] rd[4-0].
kA64Fmul3fff, // fmul[000111100s1] rm[20-16] [000010] rn[9-5] rd[4-0].
kA64Fneg2ff, // fneg[000111100s100001010000] rn[9-5] rd[4-0].
+ kA64Frintp2ff, // frintp [000111100s100100110000] rn[9-5] rd[4-0].
+ kA64Frintm2ff, // frintm [000111100s100101010000] rn[9-5] rd[4-0].
+ kA64Frintn2ff, // frintn [000111100s100100010000] rn[9-5] rd[4-0].
kA64Frintz2ff, // frintz [000111100s100101110000] rn[9-5] rd[4-0].
kA64Fsqrt2ff, // fsqrt[000111100s100001110000] rn[9-5] rd[4-0].
kA64Fsub3fff, // fsub[000111100s1] rm[20-16] [001110] rn[9-5] rd[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index c46be53..15c89f2 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -260,6 +260,14 @@ const ArmEncodingMap Arm64Mir2Lir::EncodingMap[kA64Last] = {
kFmtRegS, 4, 0, kFmtRegD, 9, 5, kFmtUnused, -1, -1,
kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
"fcvt", "!0s, !1S", kFixupNone),
+ ENCODING_MAP(kA64Fcvtms2ws, NO_VARIANTS(0x1e300000),
+ kFmtRegW, 4, 0, kFmtRegS, 9, 5, kFmtUnused, -1, -1,
+ kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
+ "fcvtms", "!0w, !1s", kFixupNone),
+ ENCODING_MAP(kA64Fcvtms2xS, NO_VARIANTS(0x9e700000),
+ kFmtRegX, 4, 0, kFmtRegD, 9, 5, kFmtUnused, -1, -1,
+ kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
+ "fcvtms", "!0x, !1S", kFixupNone),
ENCODING_MAP(FWIDE(kA64Fdiv3fff), FLOAT_VARIANTS(0x1e201800),
kFmtRegF, 4, 0, kFmtRegF, 9, 5, kFmtRegF, 20, 16,
kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
@@ -304,6 +312,18 @@ const ArmEncodingMap Arm64Mir2Lir::EncodingMap[kA64Last] = {
kFmtRegF, 4, 0, kFmtRegF, 9, 5, kFmtUnused, -1, -1,
kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
"fneg", "!0f, !1f", kFixupNone),
+ ENCODING_MAP(FWIDE(kA64Frintp2ff), FLOAT_VARIANTS(0x1e24c000),
+ kFmtRegF, 4, 0, kFmtRegF, 9, 5, kFmtUnused, -1, -1,
+ kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
+ "frintp", "!0f, !1f", kFixupNone),
+ ENCODING_MAP(FWIDE(kA64Frintm2ff), FLOAT_VARIANTS(0x1e254000),
+ kFmtRegF, 4, 0, kFmtRegF, 9, 5, kFmtUnused, -1, -1,
+ kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
+ "frintm", "!0f, !1f", kFixupNone),
+ ENCODING_MAP(FWIDE(kA64Frintn2ff), FLOAT_VARIANTS(0x1e244000),
+ kFmtRegF, 4, 0, kFmtRegF, 9, 5, kFmtUnused, -1, -1,
+ kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
+ "frintn", "!0f, !1f", kFixupNone),
ENCODING_MAP(FWIDE(kA64Frintz2ff), FLOAT_VARIANTS(0x1e25c000),
kFmtRegF, 4, 0, kFmtRegF, 9, 5, kFmtUnused, -1, -1,
kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
@@ -521,7 +541,7 @@ const ArmEncodingMap Arm64Mir2Lir::EncodingMap[kA64Last] = {
ENCODING_MAP(WIDE(kA64StpPre4ffXD), CUSTOM_VARIANTS(0x2d800000, 0x6d800000),
kFmtRegF, 4, 0, kFmtRegF, 14, 10, kFmtRegXOrSp, 9, 5,
kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
- "stp", "!0r, !1f, [!2X, #!3D]!!", kFixupNone),
+ "stp", "!0f, !1f, [!2X, #!3D]!!", kFixupNone),
ENCODING_MAP(WIDE(kA64StpPre4rrXD), CUSTOM_VARIANTS(0x29800000, 0xa9800000),
kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
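
[Note: the three new frint encodings select fixed IEEE rounding directions: frintp rounds toward +infinity, frintm toward -infinity, frintn to nearest with ties to even. That is standard AArch64 semantics stated as background, not taken from this diff. Their <cmath> counterparts, which the ceil/floor/rint intrinsics further down map onto, are roughly:]

#include <cfenv>
#include <cmath>

double Ceil(double x)  { return std::ceil(x); }   // frintp: round toward +infinity
double Floor(double x) { return std::floor(x); }  // frintm: round toward -infinity
double Rint(double x) {
  std::fesetround(FE_TONEAREST);  // ties-to-even, the mode frintn always uses
  return std::nearbyint(x);
}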
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 6fa8a4a..7c5c4fa 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -43,8 +43,7 @@ namespace art {
* br r_base
* quit:
*/
-void Arm64Mir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void Arm64Mir2Lir::GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -96,8 +95,7 @@ void Arm64Mir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
}
-void Arm64Mir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void Arm64Mir2Lir::GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 18f2a29..2cd24c6 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -59,332 +59,340 @@ class Arm64Mir2Lir FINAL : public Mir2Lir {
bool initialized_;
};
- public:
- Arm64Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena);
-
- // Required for target - codegen helpers.
- bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
- RegLocation rl_dest, int lit) OVERRIDE;
- bool SmallLiteralDivRem64(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
- RegLocation rl_dest, int64_t lit);
- bool HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
- RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
- bool HandleEasyDivRem64(Instruction::Code dalvik_opcode, bool is_div,
- RegLocation rl_src, RegLocation rl_dest, int64_t lit);
- bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
- LIR* CheckSuspendUsingLoad() OVERRIDE;
- RegStorage LoadHelper(QuickEntrypointEnum trampoline) OVERRIDE;
- LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
- OpSize size, VolatileKind is_volatile) OVERRIDE;
- LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest,
- VolatileKind is_volatile)
- OVERRIDE;
- LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
- OpSize size) OVERRIDE;
- LIR* LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale)
- OVERRIDE;
- LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
- LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
- LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
- OpSize size, VolatileKind is_volatile) OVERRIDE;
- LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src,
- VolatileKind is_volatile) OVERRIDE;
- LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
- OpSize size) OVERRIDE;
- LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale)
- OVERRIDE;
- void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) OVERRIDE;
- LIR* OpCmpMemImmBranch(ConditionCode cond, RegStorage temp_reg, RegStorage base_reg,
- int offset, int check_value, LIR* target, LIR** compare) OVERRIDE;
-
- // Required for target - register utilities.
- RegStorage TargetReg(SpecialTargetRegister reg) OVERRIDE;
- RegStorage TargetReg(SpecialTargetRegister symbolic_reg, WideKind wide_kind) OVERRIDE {
- if (wide_kind == kWide || wide_kind == kRef) {
- return As64BitReg(TargetReg(symbolic_reg));
- } else {
- return Check32BitReg(TargetReg(symbolic_reg));
- }
- }
- RegStorage TargetPtrReg(SpecialTargetRegister symbolic_reg) OVERRIDE {
+ public:
+ Arm64Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena);
+
+ // Required for target - codegen helpers.
+ bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
+ RegLocation rl_dest, int lit) OVERRIDE;
+ bool HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
+ RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
+ bool HandleEasyDivRem64(Instruction::Code dalvik_opcode, bool is_div,
+ RegLocation rl_src, RegLocation rl_dest, int64_t lit);
+ bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
+ LIR* CheckSuspendUsingLoad() OVERRIDE;
+ RegStorage LoadHelper(QuickEntrypointEnum trampoline) OVERRIDE;
+ LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
+ OpSize size, VolatileKind is_volatile) OVERRIDE;
+ LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest,
+ VolatileKind is_volatile) OVERRIDE;
+ LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
+ OpSize size) OVERRIDE;
+ LIR* LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale)
+ OVERRIDE;
+ LIR* LoadConstantNoClobber(RegStorage r_dest, int value) OVERRIDE;
+ LIR* LoadConstantWide(RegStorage r_dest, int64_t value) OVERRIDE;
+ LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, OpSize size,
+ VolatileKind is_volatile) OVERRIDE;
+ LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src, VolatileKind is_volatile)
+ OVERRIDE;
+ LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
+ OpSize size) OVERRIDE;
+ LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale) OVERRIDE;
+ void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) OVERRIDE;
+ LIR* OpCmpMemImmBranch(ConditionCode cond, RegStorage temp_reg, RegStorage base_reg,
+ int offset, int check_value, LIR* target, LIR** compare) OVERRIDE;
+
+ // Required for target - register utilities.
+ RegStorage TargetReg(SpecialTargetRegister reg) OVERRIDE;
+ RegStorage TargetReg(SpecialTargetRegister symbolic_reg, WideKind wide_kind) OVERRIDE {
+ if (wide_kind == kWide || wide_kind == kRef) {
return As64BitReg(TargetReg(symbolic_reg));
+ } else {
+ return Check32BitReg(TargetReg(symbolic_reg));
}
- RegStorage GetArgMappingToPhysicalReg(int arg_num);
- RegLocation GetReturnAlt();
- RegLocation GetReturnWideAlt();
- RegLocation LocCReturn();
- RegLocation LocCReturnRef();
- RegLocation LocCReturnDouble();
- RegLocation LocCReturnFloat();
- RegLocation LocCReturnWide();
- ResourceMask GetRegMaskCommon(const RegStorage& reg) const OVERRIDE;
- void AdjustSpillMask();
- void ClobberCallerSave();
- void FreeCallTemps();
- void LockCallTemps();
- void CompilerInitializeRegAlloc();
-
- // Required for target - miscellaneous.
- void AssembleLIR();
- uint32_t LinkFixupInsns(LIR* head_lir, LIR* tail_lir, CodeOffset offset);
- int AssignInsnOffsets();
- void AssignOffsets();
- uint8_t* EncodeLIRs(uint8_t* write_pos, LIR* lir);
- void DumpResourceMask(LIR* lir, const ResourceMask& mask, const char* prefix) OVERRIDE;
- void SetupTargetResourceMasks(LIR* lir, uint64_t flags,
- ResourceMask* use_mask, ResourceMask* def_mask) OVERRIDE;
- const char* GetTargetInstFmt(int opcode);
- const char* GetTargetInstName(int opcode);
- std::string BuildInsnString(const char* fmt, LIR* lir, unsigned char* base_addr);
- ResourceMask GetPCUseDefEncoding() const OVERRIDE;
- uint64_t GetTargetInstFlags(int opcode);
- size_t GetInsnSize(LIR* lir) OVERRIDE;
- bool IsUnconditionalBranch(LIR* lir);
-
- // Get the register class for load/store of a field.
- RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
-
- // Required for target - Dalvik-level generators.
- void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation lr_shift);
- void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2);
- void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array,
- RegLocation rl_index, RegLocation rl_dest, int scale);
- void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index,
- RegLocation rl_src, int scale, bool card_mark);
- void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_shift);
- void GenLongOp(OpKind op, RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
- void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenAndLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenCmpFP(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenConversion(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src);
- bool GenInlinedReverseBits(CallInfo* info, OpSize size);
- bool GenInlinedAbsFloat(CallInfo* info) OVERRIDE;
- bool GenInlinedAbsDouble(CallInfo* info) OVERRIDE;
- bool GenInlinedCas(CallInfo* info, bool is_long, bool is_object);
- bool GenInlinedMinMax(CallInfo* info, bool is_min, bool is_long);
- bool GenInlinedMinMaxFP(CallInfo* info, bool is_min, bool is_double);
- bool GenInlinedSqrt(CallInfo* info);
- bool GenInlinedPeek(CallInfo* info, OpSize size);
- bool GenInlinedPoke(CallInfo* info, OpSize size);
- bool GenInlinedAbsLong(CallInfo* info);
- void GenIntToLong(RegLocation rl_dest, RegLocation rl_src);
- void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
- void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
- void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenSubLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenDivRemLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2, bool is_div);
- RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
- RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
- void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
- void GenDivZeroCheckWide(RegStorage reg);
- void GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method);
- void GenExitSequence();
- void GenSpecialExitSequence();
- void GenFillArrayData(DexOffset table_offset, RegLocation rl_src);
- void GenFusedFPCmpBranch(BasicBlock* bb, MIR* mir, bool gt_bias, bool is_double);
- void GenFusedLongCmpBranch(BasicBlock* bb, MIR* mir);
- void GenSelect(BasicBlock* bb, MIR* mir) OVERRIDE;
- void GenSelectConst32(RegStorage left_op, RegStorage right_op, ConditionCode code,
- int32_t true_val, int32_t false_val, RegStorage rs_dest,
- int dest_reg_class) OVERRIDE;
- // Helper used in the above two.
- void GenSelect(int32_t left, int32_t right, ConditionCode code, RegStorage rs_dest,
- int result_reg_class);
-
- bool GenMemBarrier(MemBarrierKind barrier_kind);
- void GenMonitorEnter(int opt_flags, RegLocation rl_src);
- void GenMonitorExit(int opt_flags, RegLocation rl_src);
- void GenMoveException(RegLocation rl_dest);
- void GenMultiplyByTwoBitMultiplier(RegLocation rl_src, RegLocation rl_result, int lit,
- int first_bit, int second_bit);
- void GenNegDouble(RegLocation rl_dest, RegLocation rl_src);
- void GenNegFloat(RegLocation rl_dest, RegLocation rl_src);
- void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
- void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
-
- // Spill core and FP registers. Returns the SP difference: either spill size, or whole
- // frame size.
- int SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
-
- // Unspill core and FP registers.
- void UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
-
- // Required for target - single operation generators.
- LIR* OpUnconditionalBranch(LIR* target);
- LIR* OpCmpBranch(ConditionCode cond, RegStorage src1, RegStorage src2, LIR* target);
- LIR* OpCmpImmBranch(ConditionCode cond, RegStorage reg, int check_value, LIR* target);
- LIR* OpCondBranch(ConditionCode cc, LIR* target);
- LIR* OpDecAndBranch(ConditionCode c_code, RegStorage reg, LIR* target);
- LIR* OpFpRegCopy(RegStorage r_dest, RegStorage r_src);
- LIR* OpIT(ConditionCode cond, const char* guide);
- void OpEndIT(LIR* it);
- LIR* OpMem(OpKind op, RegStorage r_base, int disp);
- LIR* OpPcRelLoad(RegStorage reg, LIR* target);
- LIR* OpReg(OpKind op, RegStorage r_dest_src);
- void OpRegCopy(RegStorage r_dest, RegStorage r_src);
- LIR* OpRegCopyNoInsert(RegStorage r_dest, RegStorage r_src);
- LIR* OpRegImm64(OpKind op, RegStorage r_dest_src1, int64_t value);
- LIR* OpRegImm(OpKind op, RegStorage r_dest_src1, int value);
- LIR* OpRegReg(OpKind op, RegStorage r_dest_src1, RegStorage r_src2);
- LIR* OpMovRegMem(RegStorage r_dest, RegStorage r_base, int offset, MoveType move_type);
- LIR* OpMovMemReg(RegStorage r_base, int offset, RegStorage r_src, MoveType move_type);
- LIR* OpCondRegReg(OpKind op, ConditionCode cc, RegStorage r_dest, RegStorage r_src);
- LIR* OpRegRegImm64(OpKind op, RegStorage r_dest, RegStorage r_src1, int64_t value);
- LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
- LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
- LIR* OpTestSuspend(LIR* target);
- LIR* OpVldm(RegStorage r_base, int count);
- LIR* OpVstm(RegStorage r_base, int count);
- void OpRegCopyWide(RegStorage dest, RegStorage src);
-
- LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size);
- LIR* StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size);
- LIR* OpRegRegRegShift(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2,
- int shift);
- LIR* OpRegRegRegExtend(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2,
- A64RegExtEncodings ext, uint8_t amount);
- LIR* OpRegRegShift(OpKind op, RegStorage r_dest_src1, RegStorage r_src2, int shift);
- LIR* OpRegRegExtend(OpKind op, RegStorage r_dest_src1, RegStorage r_src2,
- A64RegExtEncodings ext, uint8_t amount);
- static const ArmEncodingMap EncodingMap[kA64Last];
- int EncodeShift(int code, int amount);
- int EncodeExtend(int extend_type, int amount);
- bool IsExtendEncoding(int encoded_value);
- int EncodeLogicalImmediate(bool is_wide, uint64_t value);
- uint64_t DecodeLogicalImmediate(bool is_wide, int value);
-
- ArmConditionCode ArmConditionEncoding(ConditionCode code);
- bool InexpensiveConstantInt(int32_t value);
- bool InexpensiveConstantFloat(int32_t value);
- bool InexpensiveConstantLong(int64_t value);
- bool InexpensiveConstantDouble(int64_t value);
-
- void FlushIns(RegLocation* ArgLocs, RegLocation rl_method);
-
- int GenDalvikArgsNoRange(CallInfo* info, int call_state, LIR** pcrLabel,
- NextCallInsn next_call_insn,
- const MethodReference& target_method,
- uint32_t vtable_idx,
- uintptr_t direct_code, uintptr_t direct_method, InvokeType type,
- bool skip_this);
-
- int GenDalvikArgsRange(CallInfo* info, int call_state, LIR** pcrLabel,
+ }
+ RegStorage TargetPtrReg(SpecialTargetRegister symbolic_reg) OVERRIDE {
+ return As64BitReg(TargetReg(symbolic_reg));
+ }
+ RegStorage GetArgMappingToPhysicalReg(int arg_num) OVERRIDE;
+ RegLocation GetReturnAlt() OVERRIDE;
+ RegLocation GetReturnWideAlt() OVERRIDE;
+ RegLocation LocCReturn() OVERRIDE;
+ RegLocation LocCReturnRef() OVERRIDE;
+ RegLocation LocCReturnDouble() OVERRIDE;
+ RegLocation LocCReturnFloat() OVERRIDE;
+ RegLocation LocCReturnWide() OVERRIDE;
+ ResourceMask GetRegMaskCommon(const RegStorage& reg) const OVERRIDE;
+ void AdjustSpillMask() OVERRIDE;
+ void ClobberCallerSave() OVERRIDE;
+ void FreeCallTemps() OVERRIDE;
+ void LockCallTemps() OVERRIDE;
+ void CompilerInitializeRegAlloc() OVERRIDE;
+
+ // Required for target - miscellaneous.
+ void AssembleLIR() OVERRIDE;
+ void DumpResourceMask(LIR* lir, const ResourceMask& mask, const char* prefix) OVERRIDE;
+ void SetupTargetResourceMasks(LIR* lir, uint64_t flags,
+ ResourceMask* use_mask, ResourceMask* def_mask) OVERRIDE;
+ const char* GetTargetInstFmt(int opcode) OVERRIDE;
+ const char* GetTargetInstName(int opcode) OVERRIDE;
+ std::string BuildInsnString(const char* fmt, LIR* lir, unsigned char* base_addr) OVERRIDE;
+ ResourceMask GetPCUseDefEncoding() const OVERRIDE;
+ uint64_t GetTargetInstFlags(int opcode) OVERRIDE;
+ size_t GetInsnSize(LIR* lir) OVERRIDE;
+ bool IsUnconditionalBranch(LIR* lir) OVERRIDE;
+
+ // Get the register class for load/store of a field.
+ RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
+ // Required for target - Dalvik-level generators.
+ void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation lr_shift) OVERRIDE;
+ void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
+ void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index,
+ RegLocation rl_dest, int scale) OVERRIDE;
+ void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index,
+ RegLocation rl_src, int scale, bool card_mark) OVERRIDE;
+ void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_shift) OVERRIDE;
+ void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
+ void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
+ void GenCmpFP(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
+ void GenConversion(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
+ bool GenInlinedReverseBits(CallInfo* info, OpSize size) OVERRIDE;
+ bool GenInlinedAbsFloat(CallInfo* info) OVERRIDE;
+ bool GenInlinedAbsDouble(CallInfo* info) OVERRIDE;
+ bool GenInlinedCas(CallInfo* info, bool is_long, bool is_object) OVERRIDE;
+ bool GenInlinedMinMax(CallInfo* info, bool is_min, bool is_long) OVERRIDE;
+ bool GenInlinedMinMaxFP(CallInfo* info, bool is_min, bool is_double) OVERRIDE;
+ bool GenInlinedSqrt(CallInfo* info) OVERRIDE;
+ bool GenInlinedCeil(CallInfo* info) OVERRIDE;
+ bool GenInlinedFloor(CallInfo* info) OVERRIDE;
+ bool GenInlinedRint(CallInfo* info) OVERRIDE;
+ bool GenInlinedRound(CallInfo* info, bool is_double) OVERRIDE;
+ bool GenInlinedPeek(CallInfo* info, OpSize size) OVERRIDE;
+ bool GenInlinedPoke(CallInfo* info, OpSize size) OVERRIDE;
+ bool GenInlinedAbsLong(CallInfo* info) OVERRIDE;
+ void GenIntToLong(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
+ void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
+ RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div)
+ OVERRIDE;
+ RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div)
+ OVERRIDE;
+ void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) OVERRIDE;
+ void GenDivZeroCheckWide(RegStorage reg) OVERRIDE;
+ void GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method) OVERRIDE;
+ void GenExitSequence() OVERRIDE;
+ void GenSpecialExitSequence() OVERRIDE;
+ void GenFillArrayData(DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenFusedFPCmpBranch(BasicBlock* bb, MIR* mir, bool gt_bias, bool is_double) OVERRIDE;
+ void GenFusedLongCmpBranch(BasicBlock* bb, MIR* mir) OVERRIDE;
+ void GenSelect(BasicBlock* bb, MIR* mir) OVERRIDE;
+ void GenSelectConst32(RegStorage left_op, RegStorage right_op, ConditionCode code,
+ int32_t true_val, int32_t false_val, RegStorage rs_dest,
+ int dest_reg_class) OVERRIDE;
+
+ bool GenMemBarrier(MemBarrierKind barrier_kind) OVERRIDE;
+ void GenMonitorEnter(int opt_flags, RegLocation rl_src) OVERRIDE;
+ void GenMonitorExit(int opt_flags, RegLocation rl_src) OVERRIDE;
+ void GenMoveException(RegLocation rl_dest) OVERRIDE;
+ void GenMultiplyByTwoBitMultiplier(RegLocation rl_src, RegLocation rl_result, int lit,
+ int first_bit, int second_bit) OVERRIDE;
+ void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
+ void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
+ void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+
+ // Required for target - single operation generators.
+ LIR* OpUnconditionalBranch(LIR* target) OVERRIDE;
+ LIR* OpCmpBranch(ConditionCode cond, RegStorage src1, RegStorage src2, LIR* target) OVERRIDE;
+ LIR* OpCmpImmBranch(ConditionCode cond, RegStorage reg, int check_value, LIR* target) OVERRIDE;
+ LIR* OpCondBranch(ConditionCode cc, LIR* target) OVERRIDE;
+ LIR* OpDecAndBranch(ConditionCode c_code, RegStorage reg, LIR* target) OVERRIDE;
+ LIR* OpFpRegCopy(RegStorage r_dest, RegStorage r_src) OVERRIDE;
+ LIR* OpIT(ConditionCode cond, const char* guide) OVERRIDE;
+ void OpEndIT(LIR* it) OVERRIDE;
+ LIR* OpMem(OpKind op, RegStorage r_base, int disp) OVERRIDE;
+ LIR* OpPcRelLoad(RegStorage reg, LIR* target) OVERRIDE;
+ LIR* OpReg(OpKind op, RegStorage r_dest_src) OVERRIDE;
+ void OpRegCopy(RegStorage r_dest, RegStorage r_src) OVERRIDE;
+ LIR* OpRegCopyNoInsert(RegStorage r_dest, RegStorage r_src) OVERRIDE;
+ LIR* OpRegImm(OpKind op, RegStorage r_dest_src1, int value) OVERRIDE;
+ LIR* OpRegReg(OpKind op, RegStorage r_dest_src1, RegStorage r_src2) OVERRIDE;
+ LIR* OpMovRegMem(RegStorage r_dest, RegStorage r_base, int offset, MoveType move_type) OVERRIDE;
+ LIR* OpMovMemReg(RegStorage r_base, int offset, RegStorage r_src, MoveType move_type) OVERRIDE;
+ LIR* OpCondRegReg(OpKind op, ConditionCode cc, RegStorage r_dest, RegStorage r_src) OVERRIDE;
+ LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value) OVERRIDE;
+ LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2) OVERRIDE;
+ LIR* OpTestSuspend(LIR* target) OVERRIDE;
+ LIR* OpVldm(RegStorage r_base, int count) OVERRIDE;
+ LIR* OpVstm(RegStorage r_base, int count) OVERRIDE;
+ void OpRegCopyWide(RegStorage dest, RegStorage src) OVERRIDE;
+
+ bool InexpensiveConstantInt(int32_t value) OVERRIDE;
+ bool InexpensiveConstantInt(int32_t value, Instruction::Code opcode) OVERRIDE;
+ bool InexpensiveConstantFloat(int32_t value) OVERRIDE;
+ bool InexpensiveConstantLong(int64_t value) OVERRIDE;
+ bool InexpensiveConstantDouble(int64_t value) OVERRIDE;
+
+ void FlushIns(RegLocation* ArgLocs, RegLocation rl_method) OVERRIDE;
+
+ int GenDalvikArgsNoRange(CallInfo* info, int call_state, LIR** pcrLabel,
NextCallInsn next_call_insn,
const MethodReference& target_method,
uint32_t vtable_idx,
uintptr_t direct_code, uintptr_t direct_method, InvokeType type,
- bool skip_this);
- InToRegStorageMapping in_to_reg_storage_mapping_;
+ bool skip_this) OVERRIDE;
- bool WideGPRsAreAliases() OVERRIDE {
- return true; // 64b architecture.
- }
- bool WideFPRsAreAliases() OVERRIDE {
- return true; // 64b architecture.
- }
- size_t GetInstructionOffset(LIR* lir);
-
- LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
-
- private:
- /**
- * @brief Given register xNN (dNN), returns register wNN (sNN).
- * @param reg #RegStorage containing a Solo64 input register (e.g. @c x1 or @c d2).
- * @return A Solo32 with the same register number as the @p reg (e.g. @c w1 or @c s2).
- * @see As64BitReg
- */
- RegStorage As32BitReg(RegStorage reg) {
- DCHECK(!reg.IsPair());
- if ((kFailOnSizeError || kReportSizeError) && !reg.Is64Bit()) {
- if (kFailOnSizeError) {
- LOG(FATAL) << "Expected 64b register";
- } else {
- LOG(WARNING) << "Expected 64b register";
- return reg;
- }
+ int GenDalvikArgsRange(CallInfo* info, int call_state, LIR** pcrLabel,
+ NextCallInsn next_call_insn,
+ const MethodReference& target_method,
+ uint32_t vtable_idx,
+ uintptr_t direct_code, uintptr_t direct_method, InvokeType type,
+ bool skip_this) OVERRIDE;
+
+ bool WideGPRsAreAliases() OVERRIDE {
+ return true; // 64b architecture.
+ }
+ bool WideFPRsAreAliases() OVERRIDE {
+ return true; // 64b architecture.
+ }
+
+ size_t GetInstructionOffset(LIR* lir) OVERRIDE;
+
+ LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
+
+ private:
+ /**
+ * @brief Given register xNN (dNN), returns register wNN (sNN).
+ * @param reg #RegStorage containing a Solo64 input register (e.g. @c x1 or @c d2).
+ * @return A Solo32 with the same register number as the @p reg (e.g. @c w1 or @c s2).
+ * @see As64BitReg
+ */
+ RegStorage As32BitReg(RegStorage reg) {
+ DCHECK(!reg.IsPair());
+ if ((kFailOnSizeError || kReportSizeError) && !reg.Is64Bit()) {
+ if (kFailOnSizeError) {
+ LOG(FATAL) << "Expected 64b register";
+ } else {
+ LOG(WARNING) << "Expected 64b register";
+ return reg;
}
- RegStorage ret_val = RegStorage(RegStorage::k32BitSolo,
- reg.GetRawBits() & RegStorage::kRegTypeMask);
- DCHECK_EQ(GetRegInfo(reg)->FindMatchingView(RegisterInfo::k32SoloStorageMask)
- ->GetReg().GetReg(),
- ret_val.GetReg());
- return ret_val;
}
+ RegStorage ret_val = RegStorage(RegStorage::k32BitSolo,
+ reg.GetRawBits() & RegStorage::kRegTypeMask);
+ DCHECK_EQ(GetRegInfo(reg)->FindMatchingView(RegisterInfo::k32SoloStorageMask)
+ ->GetReg().GetReg(),
+ ret_val.GetReg());
+ return ret_val;
+ }
- RegStorage Check32BitReg(RegStorage reg) {
- if ((kFailOnSizeError || kReportSizeError) && !reg.Is32Bit()) {
- if (kFailOnSizeError) {
- LOG(FATAL) << "Checked for 32b register";
- } else {
- LOG(WARNING) << "Checked for 32b register";
- return As32BitReg(reg);
- }
+ RegStorage Check32BitReg(RegStorage reg) {
+ if ((kFailOnSizeError || kReportSizeError) && !reg.Is32Bit()) {
+ if (kFailOnSizeError) {
+ LOG(FATAL) << "Checked for 32b register";
+ } else {
+ LOG(WARNING) << "Checked for 32b register";
+ return As32BitReg(reg);
}
- return reg;
}
+ return reg;
+ }
- /**
- * @brief Given register wNN (sNN), returns register xNN (dNN).
- * @param reg #RegStorage containing a Solo32 input register (e.g. @c w1 or @c s2).
- * @return A Solo64 with the same register number as the @p reg (e.g. @c x1 or @c d2).
- * @see As32BitReg
- */
- RegStorage As64BitReg(RegStorage reg) {
- DCHECK(!reg.IsPair());
- if ((kFailOnSizeError || kReportSizeError) && !reg.Is32Bit()) {
- if (kFailOnSizeError) {
- LOG(FATAL) << "Expected 32b register";
- } else {
- LOG(WARNING) << "Expected 32b register";
- return reg;
- }
+ /**
+ * @brief Given register wNN (sNN), returns register xNN (dNN).
+ * @param reg #RegStorage containing a Solo32 input register (e.g. @c w1 or @c s2).
+ * @return A Solo64 with the same register number as the @p reg (e.g. @c x1 or @c d2).
+ * @see As32BitReg
+ */
+ RegStorage As64BitReg(RegStorage reg) {
+ DCHECK(!reg.IsPair());
+ if ((kFailOnSizeError || kReportSizeError) && !reg.Is32Bit()) {
+ if (kFailOnSizeError) {
+ LOG(FATAL) << "Expected 32b register";
+ } else {
+ LOG(WARNING) << "Expected 32b register";
+ return reg;
}
- RegStorage ret_val = RegStorage(RegStorage::k64BitSolo,
- reg.GetRawBits() & RegStorage::kRegTypeMask);
- DCHECK_EQ(GetRegInfo(reg)->FindMatchingView(RegisterInfo::k64SoloStorageMask)
- ->GetReg().GetReg(),
- ret_val.GetReg());
- return ret_val;
}
+ RegStorage ret_val = RegStorage(RegStorage::k64BitSolo,
+ reg.GetRawBits() & RegStorage::kRegTypeMask);
+ DCHECK_EQ(GetRegInfo(reg)->FindMatchingView(RegisterInfo::k64SoloStorageMask)
+ ->GetReg().GetReg(),
+ ret_val.GetReg());
+ return ret_val;
+ }
- RegStorage Check64BitReg(RegStorage reg) {
- if ((kFailOnSizeError || kReportSizeError) && !reg.Is64Bit()) {
- if (kFailOnSizeError) {
- LOG(FATAL) << "Checked for 64b register";
- } else {
- LOG(WARNING) << "Checked for 64b register";
- return As64BitReg(reg);
- }
+ RegStorage Check64BitReg(RegStorage reg) {
+ if ((kFailOnSizeError || kReportSizeError) && !reg.Is64Bit()) {
+ if (kFailOnSizeError) {
+ LOG(FATAL) << "Checked for 64b register";
+ } else {
+ LOG(WARNING) << "Checked for 64b register";
+ return As64BitReg(reg);
}
- return reg;
}
+ return reg;
+ }
+
+ int32_t EncodeImmSingle(uint32_t bits);
+ int32_t EncodeImmDouble(uint64_t bits);
+ LIR* LoadFPConstantValue(RegStorage r_dest, int32_t value);
+ LIR* LoadFPConstantValueWide(RegStorage r_dest, int64_t value);
+ void ReplaceFixup(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
+ void InsertFixupBefore(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
+ void AssignDataOffsets();
+ RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
+ bool is_div, bool check_zero);
+ RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div);
+ size_t GetLoadStoreSize(LIR* lir);
+
+ bool SmallLiteralDivRem64(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
+ RegLocation rl_dest, int64_t lit);
+
+ uint32_t LinkFixupInsns(LIR* head_lir, LIR* tail_lir, CodeOffset offset);
+ int AssignInsnOffsets();
+ void AssignOffsets();
+ uint8_t* EncodeLIRs(uint8_t* write_pos, LIR* lir);
+
+ // Spill core and FP registers. Returns the SP difference: either spill size, or whole
+ // frame size.
+ int SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
+
+ // Unspill core and FP registers.
+ void UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
+
+ void GenLongOp(OpKind op, RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
+
+ LIR* OpRegImm64(OpKind op, RegStorage r_dest_src1, int64_t value);
+ LIR* OpRegRegImm64(OpKind op, RegStorage r_dest, RegStorage r_src1, int64_t value);
+
+ LIR* OpRegRegShift(OpKind op, RegStorage r_dest_src1, RegStorage r_src2, int shift);
+ LIR* OpRegRegRegShift(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2,
+ int shift);
+ int EncodeShift(int code, int amount);
+
+ LIR* OpRegRegExtend(OpKind op, RegStorage r_dest_src1, RegStorage r_src2,
+ A64RegExtEncodings ext, uint8_t amount);
+ LIR* OpRegRegRegExtend(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2,
+ A64RegExtEncodings ext, uint8_t amount);
+ int EncodeExtend(int extend_type, int amount);
+ bool IsExtendEncoding(int encoded_value);
+
+ LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size);
+ LIR* StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size);
+
+ int EncodeLogicalImmediate(bool is_wide, uint64_t value);
+ uint64_t DecodeLogicalImmediate(bool is_wide, int value);
+ ArmConditionCode ArmConditionEncoding(ConditionCode code);
+
+ // Helper used in the two GenSelect variants.
+ void GenSelect(int32_t left, int32_t right, ConditionCode code, RegStorage rs_dest,
+ int result_reg_class);
+
+ void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
+ void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
+ void GenDivRemLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2, bool is_div);
- LIR* LoadFPConstantValue(RegStorage r_dest, int32_t value);
- LIR* LoadFPConstantValueWide(RegStorage r_dest, int64_t value);
- void ReplaceFixup(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
- void InsertFixupBefore(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
- void AssignDataOffsets();
- RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
- bool is_div, bool check_zero);
- RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div);
- size_t GetLoadStoreSize(LIR* lir);
+ InToRegStorageMapping in_to_reg_storage_mapping_;
+ static const ArmEncodingMap EncodingMap[kA64Last];
};
} // namespace art
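
[Note: beyond the re-indent, the bulk of this header change annotates every virtual that shadows a Mir2Lir method with OVERRIDE (ART's macro for C++11 override, per its usual base/macros.h definition) and moves target-private helpers below private:. A minimal illustration of what the annotation buys:]

struct Mir2Lir {
  virtual ~Mir2Lir() = default;
  virtual bool GenInlinedSqrt(void* info) { return false; }
};

struct Arm64Mir2Lir final : Mir2Lir {
  bool GenInlinedSqrt(void* info) override { return true; }  // OK: matches the base signature
  // bool GenInlinedSqrt(int* info) override;
  // ^ would fail to compile ("marked 'override' but does not override")
  //   instead of silently becoming an unrelated overload.
};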
diff --git a/compiler/dex/quick/arm64/fp_arm64.cc b/compiler/dex/quick/arm64/fp_arm64.cc
index ed13c04..d0b2636 100644
--- a/compiler/dex/quick/arm64/fp_arm64.cc
+++ b/compiler/dex/quick/arm64/fp_arm64.cc
@@ -17,6 +17,7 @@
#include "arm64_lir.h"
#include "codegen_arm64.h"
#include "dex/quick/mir_to_lir-inl.h"
+#include "utils.h"
namespace art {
@@ -386,6 +387,52 @@ bool Arm64Mir2Lir::GenInlinedSqrt(CallInfo* info) {
return true;
}
+bool Arm64Mir2Lir::GenInlinedCeil(CallInfo* info) {
+ RegLocation rl_src = info->args[0];
+ RegLocation rl_dest = InlineTargetWide(info);
+ rl_src = LoadValueWide(rl_src, kFPReg);
+ RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
+ NewLIR2(FWIDE(kA64Frintp2ff), rl_result.reg.GetReg(), rl_src.reg.GetReg());
+ StoreValueWide(rl_dest, rl_result);
+ return true;
+}
+
+bool Arm64Mir2Lir::GenInlinedFloor(CallInfo* info) {
+ RegLocation rl_src = info->args[0];
+ RegLocation rl_dest = InlineTargetWide(info);
+ rl_src = LoadValueWide(rl_src, kFPReg);
+ RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
+ NewLIR2(FWIDE(kA64Frintm2ff), rl_result.reg.GetReg(), rl_src.reg.GetReg());
+ StoreValueWide(rl_dest, rl_result);
+ return true;
+}
+
+bool Arm64Mir2Lir::GenInlinedRint(CallInfo* info) {
+ RegLocation rl_src = info->args[0];
+ RegLocation rl_dest = InlineTargetWide(info);
+ rl_src = LoadValueWide(rl_src, kFPReg);
+ RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
+ NewLIR2(FWIDE(kA64Frintn2ff), rl_result.reg.GetReg(), rl_src.reg.GetReg());
+ StoreValueWide(rl_dest, rl_result);
+ return true;
+}
+
+bool Arm64Mir2Lir::GenInlinedRound(CallInfo* info, bool is_double) {
+ int32_t encoded_imm = EncodeImmSingle(bit_cast<float, uint32_t>(0.5f));
+ ArmOpcode wide = (is_double) ? FWIDE(0) : FUNWIDE(0);
+ RegLocation rl_src = info->args[0];
+ RegLocation rl_dest = (is_double) ? InlineTargetWide(info) : InlineTarget(info);
+ rl_src = (is_double) ? LoadValueWide(rl_src, kFPReg) : LoadValue(rl_src, kFPReg);
+ RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+ RegStorage r_tmp = (is_double) ? AllocTempDouble() : AllocTempSingle();
+ // 0.5f and 0.5d are encoded in the same way.
+ NewLIR2(kA64Fmov2fI | wide, r_tmp.GetReg(), encoded_imm);
+ NewLIR3(kA64Fadd3fff | wide, rl_src.reg.GetReg(), rl_src.reg.GetReg(), r_tmp.GetReg());
+ NewLIR2((is_double) ? kA64Fcvtms2xS : kA64Fcvtms2ws, rl_result.reg.GetReg(), rl_src.reg.GetReg());
+ (is_double) ? StoreValueWide(rl_dest, rl_result) : StoreValue(rl_dest, rl_result);
+ return true;
+}
+
bool Arm64Mir2Lir::GenInlinedMinMaxFP(CallInfo* info, bool is_min, bool is_double) {
DCHECK_EQ(cu_->instruction_set, kArm64);
int op = (is_min) ? kA64Fmin3fff : kA64Fmax3fff;
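For context, the fadd/fcvtms pair emitted by GenInlinedRound() above computes floor(x + 0.5): FCVTMS converts a floating-point value to a signed integer rounding toward minus infinity. A minimal portable sketch of the same computation (leaving aside fcvtms's NaN and overflow saturation behavior):

    #include <cmath>
    #include <cstdint>

    // What the emitted fadd + fcvtms sequence computes for Math.round():
    // add 0.5, then convert rounding toward minus infinity (i.e. floor).
    int64_t RoundDouble(double x) {
      return static_cast<int64_t>(std::floor(x + 0.5));
    }

    int32_t RoundFloat(float x) {
      return static_cast<int32_t>(std::floor(x + 0.5f));
    }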
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 9403d5e..147fee8 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -931,34 +931,52 @@ void Arm64Mir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
StoreValueWide(rl_dest, rl_result);
}
-void Arm64Mir2Lir::GenMulLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- GenLongOp(kOpMul, rl_dest, rl_src1, rl_src2);
-}
-
-void Arm64Mir2Lir::GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- GenLongOp(kOpAdd, rl_dest, rl_src1, rl_src2);
-}
-
-void Arm64Mir2Lir::GenSubLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- GenLongOp(kOpSub, rl_dest, rl_src1, rl_src2);
-}
-
-void Arm64Mir2Lir::GenAndLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- GenLongOp(kOpAnd, rl_dest, rl_src1, rl_src2);
-}
-
-void Arm64Mir2Lir::GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- GenLongOp(kOpOr, rl_dest, rl_src1, rl_src2);
-}
-
-void Arm64Mir2Lir::GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- GenLongOp(kOpXor, rl_dest, rl_src1, rl_src2);
+void Arm64Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
+ RegLocation rl_src1, RegLocation rl_src2) {
+ switch (opcode) {
+ case Instruction::NOT_LONG:
+ GenNotLong(rl_dest, rl_src2);
+ return;
+ case Instruction::ADD_LONG:
+ case Instruction::ADD_LONG_2ADDR:
+ GenLongOp(kOpAdd, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::SUB_LONG:
+ case Instruction::SUB_LONG_2ADDR:
+ GenLongOp(kOpSub, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::MUL_LONG:
+ case Instruction::MUL_LONG_2ADDR:
+ GenLongOp(kOpMul, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::DIV_LONG:
+ case Instruction::DIV_LONG_2ADDR:
+ GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true);
+ return;
+ case Instruction::REM_LONG:
+ case Instruction::REM_LONG_2ADDR:
+ GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false);
+ return;
+ case Instruction::AND_LONG_2ADDR:
+ case Instruction::AND_LONG:
+ GenLongOp(kOpAnd, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::OR_LONG:
+ case Instruction::OR_LONG_2ADDR:
+ GenLongOp(kOpOr, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::XOR_LONG:
+ case Instruction::XOR_LONG_2ADDR:
+ GenLongOp(kOpXor, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::NEG_LONG: {
+ GenNegLong(rl_dest, rl_src2);
+ return;
+ }
+ default:
+ LOG(FATAL) << "Invalid long arith op";
+ return;
+ }
}
/*
@@ -1192,22 +1210,7 @@ void Arm64Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode,
void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_src2) {
- if ((opcode == Instruction::SUB_LONG) || (opcode == Instruction::SUB_LONG_2ADDR)) {
- if (!rl_src2.is_const) {
- return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
- }
- } else {
- // Associativity.
- if (!rl_src2.is_const) {
- DCHECK(rl_src1.is_const);
- std::swap(rl_src1, rl_src2);
- }
- }
- DCHECK(rl_src2.is_const);
-
OpKind op = kOpBkpt;
- int64_t val = mir_graph_->ConstantValueWide(rl_src2);
-
switch (opcode) {
case Instruction::ADD_LONG:
case Instruction::ADD_LONG_2ADDR:
@@ -1233,6 +1236,20 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de
LOG(FATAL) << "Unexpected opcode";
}
+ if (op == kOpSub) {
+ if (!rl_src2.is_const) {
+ return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
+ }
+ } else {
+      // Commutativity: move the constant into the second operand.
+ if (!rl_src2.is_const) {
+ DCHECK(rl_src1.is_const);
+ std::swap(rl_src1, rl_src2);
+ }
+ }
+ DCHECK(rl_src2.is_const);
+ int64_t val = mir_graph_->ConstantValueWide(rl_src2);
+
rl_src1 = LoadValueWide(rl_src1, kCoreReg);
RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
OpRegRegImm64(op, rl_result.reg, rl_src1.reg, val);
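The reordered logic above first maps the opcode to an OpKind, then canonicalizes the operands: for commutative operations a constant left operand may be swapped to the right, while subtraction must fall back to the register-register path. A hypothetical standalone restatement of the swap rule:

    #include <algorithm>
    #include <cstdint>

    struct Operand { bool is_const; int64_t value; };

    // For commutative ops (add/and/or/xor) a constant left operand can be
    // moved to the right; for sub this is invalid, since (c - x) != (x - c).
    bool CanonicalizeForImmediate(bool is_commutative, Operand* lhs, Operand* rhs) {
      if (rhs->is_const) return true;      // Already canonical.
      if (!is_commutative) return false;   // e.g. kOpSub: use the reg-reg path.
      std::swap(*lhs, *rhs);               // Commutativity swap.
      return rhs->is_const;
    }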
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index 5131bd8..5326e74 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -23,7 +23,7 @@ namespace art {
/* This file contains codegen for the A64 ISA. */
-static int32_t EncodeImmSingle(uint32_t bits) {
+int32_t Arm64Mir2Lir::EncodeImmSingle(uint32_t bits) {
/*
* Valid values will have the form:
*
@@ -55,7 +55,7 @@ static int32_t EncodeImmSingle(uint32_t bits) {
return (bit7 | bit6 | bit5_to_0);
}
-static int32_t EncodeImmDouble(uint64_t bits) {
+int32_t Arm64Mir2Lir::EncodeImmDouble(uint64_t bits) {
/*
* Valid values will have the form:
*
@@ -269,8 +269,47 @@ int Arm64Mir2Lir::EncodeLogicalImmediate(bool is_wide, uint64_t value) {
return (n << 12 | imm_r << 6 | imm_s);
}
+// Maximum number of instructions to use for encoding the immediate.
+static const int max_num_ops_per_const_load = 2;
+
+/**
+ * @brief Return the number of fast halfwords in the given uint64_t integer.
+ * @details The input integer is split into 4 halfwords (bits 0-15, 16-31, 32-47, 48-63). The
+ * number of fast halfwords (halfwords that are either 0 or 0xffff) is returned; see the return
+ * value description below for the exact encoding.
+ * @param value The input 64-bit integer.
+ * @return A value @c retval such that (retval & 0x7) is the maximum of n and m, where n is
+ * the number of halfwords with all bits unset (0) and m is the number of halfwords with all bits
+ * set (0xffff). Additionally, (retval & 0x8) is set when m > n.
+ */
+static int GetNumFastHalfWords(uint64_t value) {
+ unsigned int num_0000_halfwords = 0;
+ unsigned int num_ffff_halfwords = 0;
+ for (int shift = 0; shift < 64; shift += 16) {
+ uint16_t halfword = static_cast<uint16_t>(value >> shift);
+ if (halfword == 0)
+ num_0000_halfwords++;
+ else if (halfword == UINT16_C(0xffff))
+ num_ffff_halfwords++;
+ }
+ if (num_0000_halfwords >= num_ffff_halfwords) {
+ DCHECK_LE(num_0000_halfwords, 4U);
+ return num_0000_halfwords;
+ } else {
+ DCHECK_LE(num_ffff_halfwords, 4U);
+ return num_ffff_halfwords | 0x8;
+ }
+}
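A few worked examples of the encoding documented above, using a standalone copy of the helper:

    #include <cstdint>

    // Standalone copy of GetNumFastHalfWords() for illustration.
    static int GetNumFastHalfWords(uint64_t value) {
      unsigned int zeros = 0;
      unsigned int ones = 0;
      for (int shift = 0; shift < 64; shift += 16) {
        uint16_t halfword = static_cast<uint16_t>(value >> shift);
        if (halfword == 0) zeros++;
        else if (halfword == 0xffffu) ones++;
      }
      return (zeros >= ones) ? zeros : (ones | 0x8);
    }

    // GetNumFastHalfWords(0x0000ffff00001234) == 2    (two 0x0000 halfwords win)
    // GetNumFastHalfWords(0xffffffffffff1234) == 0xb  (three 0xffff halfwords, 0x8 flag)
    // GetNumFastHalfWords(0)                  == 4    (all four halfwords are 0x0000)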
+
+// The InexpensiveConstantXXX variants below are used in the promotion algorithm to determine how a
+// constant is considered for promotion. If the constant is "inexpensive" then the promotion
+// algorithm will give it a low priority for promotion, even when it is referenced many times in
+// the code.
+
bool Arm64Mir2Lir::InexpensiveConstantInt(int32_t value) {
- return false; // (ModifiedImmediate(value) >= 0) || (ModifiedImmediate(~value) >= 0);
+ // A 32-bit int can always be loaded with 2 instructions (and without using the literal pool).
+ // We therefore return true and give it a low priority for promotion.
+ return true;
}
bool Arm64Mir2Lir::InexpensiveConstantFloat(int32_t value) {
@@ -278,13 +317,70 @@ bool Arm64Mir2Lir::InexpensiveConstantFloat(int32_t value) {
}
bool Arm64Mir2Lir::InexpensiveConstantLong(int64_t value) {
- return InexpensiveConstantInt(High32Bits(value)) && InexpensiveConstantInt(Low32Bits(value));
+ int num_slow_halfwords = 4 - (GetNumFastHalfWords(value) & 0x7);
+ if (num_slow_halfwords <= max_num_ops_per_const_load) {
+ return true;
+ }
+ return (EncodeLogicalImmediate(/*is_wide=*/true, value) >= 0);
}
bool Arm64Mir2Lir::InexpensiveConstantDouble(int64_t value) {
return EncodeImmDouble(value) >= 0;
}
+// The InexpensiveConstantXXX variants below are used to determine which A64 instructions to use
+// when one of the operands is an immediate (e.g. register version or immediate version of add).
+
+bool Arm64Mir2Lir::InexpensiveConstantInt(int32_t value, Instruction::Code opcode) {
+ switch (opcode) {
+ case Instruction::IF_EQ:
+ case Instruction::IF_NE:
+ case Instruction::IF_LT:
+ case Instruction::IF_GE:
+ case Instruction::IF_GT:
+ case Instruction::IF_LE:
+ case Instruction::ADD_INT:
+ case Instruction::ADD_INT_2ADDR:
+ case Instruction::SUB_INT:
+ case Instruction::SUB_INT_2ADDR:
+ // The code below is consistent with the implementation of OpRegRegImm().
+ {
+ int32_t abs_value = std::abs(value);
+ if (abs_value < 0x1000) {
+ return true;
+ } else if ((abs_value & UINT64_C(0xfff)) == 0 && ((abs_value >> 12) < 0x1000)) {
+ return true;
+ }
+ return false;
+ }
+ case Instruction::SHL_INT:
+ case Instruction::SHL_INT_2ADDR:
+ case Instruction::SHR_INT:
+ case Instruction::SHR_INT_2ADDR:
+ case Instruction::USHR_INT:
+ case Instruction::USHR_INT_2ADDR:
+ return true;
+ case Instruction::AND_INT:
+ case Instruction::AND_INT_2ADDR:
+ case Instruction::AND_INT_LIT16:
+ case Instruction::AND_INT_LIT8:
+ case Instruction::OR_INT:
+ case Instruction::OR_INT_2ADDR:
+ case Instruction::OR_INT_LIT16:
+ case Instruction::OR_INT_LIT8:
+ case Instruction::XOR_INT:
+ case Instruction::XOR_INT_2ADDR:
+ case Instruction::XOR_INT_LIT16:
+ case Instruction::XOR_INT_LIT8:
+ if (value == 0 || value == INT32_C(-1)) {
+ return true;
+ }
+ return (EncodeLogicalImmediate(/*is_wide=*/false, value) >= 0);
+ default:
+ return false;
+ }
+}
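The add/sub cases above mirror the A64 arithmetic immediate format: a 12-bit unsigned immediate, optionally shifted left by 12 bits, with the sign of the constant selecting between add and sub. A sketch of just that encodability test:

    #include <cstdint>
    #include <cstdlib>

    // A64 add/sub immediate: imm12, or imm12 shifted left by 12.
    bool IsAddSubImmEncodable(int32_t value) {
      int32_t abs_value = std::abs(value);  // The sign picks add vs. sub.
      if (abs_value < 0x1000) return true;                            // imm12
      return (abs_value & 0xfff) == 0 && (abs_value >> 12) < 0x1000;  // imm12 << 12
    }

    // IsAddSubImmEncodable(4095)     -> true   (fits in 12 bits)
    // IsAddSubImmEncodable(0x123000) -> true   (0x123, lsl #12)
    // IsAddSubImmEncodable(0x123456) -> false  (needs a scratch register)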
+
/*
 * Load an immediate using a single instruction when possible; otherwise
* use a pair of movz and movk instructions.
@@ -358,9 +454,6 @@ LIR* Arm64Mir2Lir::LoadConstantNoClobber(RegStorage r_dest, int value) {
// TODO: clean up the names. LoadConstantWide() should really be LoadConstantNoClobberWide().
LIR* Arm64Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
- // Maximum number of instructions to use for encoding the immediate.
- const int max_num_ops = 2;
-
if (r_dest.IsFloat()) {
return LoadFPConstantValueWide(r_dest, value);
}
@@ -378,19 +471,12 @@ LIR* Arm64Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
}
// At least one in value's halfwords is not 0x0, nor 0xffff: find out how many.
- int num_0000_halfwords = 0;
- int num_ffff_halfwords = 0;
uint64_t uvalue = static_cast<uint64_t>(value);
- for (int shift = 0; shift < 64; shift += 16) {
- uint16_t halfword = static_cast<uint16_t>(uvalue >> shift);
- if (halfword == 0)
- num_0000_halfwords++;
- else if (halfword == UINT16_C(0xffff))
- num_ffff_halfwords++;
- }
- int num_fast_halfwords = std::max(num_0000_halfwords, num_ffff_halfwords);
+ int num_fast_halfwords = GetNumFastHalfWords(uvalue);
+ int num_slow_halfwords = 4 - (num_fast_halfwords & 0x7);
+ bool more_ffff_halfwords = (num_fast_halfwords & 0x8) != 0;
- if (num_fast_halfwords < 3) {
+ if (num_slow_halfwords > 1) {
// A single movz/movn is not enough. Try the logical immediate route.
int log_imm = EncodeLogicalImmediate(/*is_wide=*/true, value);
if (log_imm >= 0) {
@@ -398,19 +484,19 @@ LIR* Arm64Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
}
}
- if (num_fast_halfwords >= 4 - max_num_ops) {
+ if (num_slow_halfwords <= max_num_ops_per_const_load) {
// We can encode the number using a movz/movn followed by one or more movk.
ArmOpcode op;
uint16_t background;
LIR* res = nullptr;
// Decide whether to use a movz or a movn.
- if (num_0000_halfwords >= num_ffff_halfwords) {
- op = WIDE(kA64Movz3rdM);
- background = 0;
- } else {
+ if (more_ffff_halfwords) {
op = WIDE(kA64Movn3rdM);
background = 0xffff;
+ } else {
+ op = WIDE(kA64Movz3rdM);
+ background = 0;
}
// Emit the first instruction (movz, movn).
@@ -726,7 +812,7 @@ LIR* Arm64Mir2Lir::OpRegRegImm64(OpKind op, RegStorage r_dest, RegStorage r_src1
int64_t abs_value = (neg) ? -value : value;
ArmOpcode opcode = kA64Brk1d;
ArmOpcode alt_opcode = kA64Brk1d;
- int32_t log_imm = -1;
+ bool is_logical = false;
bool is_wide = r_dest.Is64Bit();
ArmOpcode wide = (is_wide) ? WIDE(0) : UNWIDE(0);
int info = 0;
@@ -761,65 +847,89 @@ LIR* Arm64Mir2Lir::OpRegRegImm64(OpKind op, RegStorage r_dest, RegStorage r_src1
opcode = (neg) ? kA64Add4RRdT : kA64Sub4RRdT;
return NewLIR4(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), abs_value >> 12, 1);
} else {
- log_imm = -1;
alt_opcode = (op == kOpAdd) ? kA64Add4RRre : kA64Sub4RRre;
info = EncodeExtend(is_wide ? kA64Uxtx : kA64Uxtw, 0);
}
break;
- // case kOpRsub:
- // opcode = kThumb2RsubRRI8M;
- // alt_opcode = kThumb2RsubRRR;
- // break;
case kOpAdc:
- log_imm = -1;
alt_opcode = kA64Adc3rrr;
break;
case kOpSbc:
- log_imm = -1;
alt_opcode = kA64Sbc3rrr;
break;
case kOpOr:
- log_imm = EncodeLogicalImmediate(is_wide, value);
+ is_logical = true;
opcode = kA64Orr3Rrl;
alt_opcode = kA64Orr4rrro;
break;
case kOpAnd:
- log_imm = EncodeLogicalImmediate(is_wide, value);
+ is_logical = true;
opcode = kA64And3Rrl;
alt_opcode = kA64And4rrro;
break;
case kOpXor:
- log_imm = EncodeLogicalImmediate(is_wide, value);
+ is_logical = true;
opcode = kA64Eor3Rrl;
alt_opcode = kA64Eor4rrro;
break;
case kOpMul:
// TUNING: power of 2, shift & add
- log_imm = -1;
alt_opcode = kA64Mul3rrr;
break;
default:
LOG(FATAL) << "Bad opcode: " << op;
}
- if (log_imm >= 0) {
- return NewLIR3(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), log_imm);
- } else {
- RegStorage r_scratch;
- if (is_wide) {
- r_scratch = AllocTempWide();
- LoadConstantWide(r_scratch, value);
+ if (is_logical) {
+ int log_imm = EncodeLogicalImmediate(is_wide, value);
+ if (log_imm >= 0) {
+ return NewLIR3(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), log_imm);
} else {
- r_scratch = AllocTemp();
- LoadConstant(r_scratch, value);
+ // When the immediate is either 0 or ~0, the logical operation can be trivially reduced
+      // to a (possibly negated) assignment.
+ if (value == 0) {
+ switch (op) {
+ case kOpOr:
+ case kOpXor:
+ // Or/Xor by zero reduces to an assignment.
+ return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+ default:
+ // And by zero reduces to a `mov rdest, xzr'.
+ DCHECK(op == kOpAnd);
+ return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), (is_wide) ? rxzr : rwzr);
+ }
+ } else if (value == INT64_C(-1)
+ || (!is_wide && static_cast<uint32_t>(value) == ~UINT32_C(0))) {
+ switch (op) {
+ case kOpAnd:
+ // And by -1 reduces to an assignment.
+ return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+ case kOpXor:
+ // Xor by -1 reduces to an `mvn rdest, rsrc'.
+ return NewLIR2(kA64Mvn2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+ default:
+ // Or by -1 reduces to a `mvn rdest, xzr'.
+ DCHECK(op == kOpOr);
+ return NewLIR2(kA64Mvn2rr | wide, r_dest.GetReg(), (is_wide) ? rxzr : rwzr);
+ }
+ }
}
- if (EncodingMap[alt_opcode].flags & IS_QUAD_OP)
- res = NewLIR4(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg(), info);
- else
- res = NewLIR3(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg());
- FreeTemp(r_scratch);
- return res;
}
+
+ RegStorage r_scratch;
+ if (is_wide) {
+ r_scratch = AllocTempWide();
+ LoadConstantWide(r_scratch, value);
+ } else {
+ r_scratch = AllocTemp();
+ LoadConstant(r_scratch, value);
+ }
+ if (EncodingMap[alt_opcode].flags & IS_QUAD_OP)
+ res = NewLIR4(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg(), info);
+ else
+ res = NewLIR3(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg());
+ FreeTemp(r_scratch);
+ return res;
}
LIR* Arm64Mir2Lir::OpRegImm(OpKind op, RegStorage r_dest_src1, int value) {
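The background selection in LoadConstantWide() above picks whichever of movz (all-zeros background) or movn (all-ones background) leaves fewer halfwords to patch with movk. A simulation of that selection, under the usual A64 mov-immediate semantics:

    #include <cstdint>

    // Fill the register with the cheaper background, then patch each
    // remaining halfword; the count matches num_slow_halfwords (minimum 1).
    int MaterializeWide(uint64_t value, uint64_t* reg) {
      int zeros = 0;
      int ones = 0;
      for (int s = 0; s < 64; s += 16) {
        uint16_t hw = static_cast<uint16_t>(value >> s);
        if (hw == 0) zeros++;
        else if (hw == 0xffffu) ones++;
      }
      bool ones_background = (ones > zeros);  // movn vs. movz.
      *reg = ones_background ? ~UINT64_C(0) : UINT64_C(0);
      int count = 0;
      for (int s = 0; s < 64; s += 16) {
        uint16_t hw = static_cast<uint16_t>(value >> s);
        if (hw == (ones_background ? 0xffffu : 0u)) continue;
        *reg = (*reg & ~(UINT64_C(0xffff) << s)) | (static_cast<uint64_t>(hw) << s);
        count++;
      }
      return (count == 0) ? 1 : count;  // An all-background value still needs one mov.
    }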
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 463f277..9f60427 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -394,6 +394,18 @@ LIR* Mir2Lir::ScanLiteralPoolMethod(LIR* data_target, const MethodReference& met
return nullptr;
}
+/* Search the existing constants in the literal pool for an exact class match */
+LIR* Mir2Lir::ScanLiteralPoolClass(LIR* data_target, const DexFile& dex_file, uint32_t type_idx) {
+ while (data_target) {
+ if (static_cast<uint32_t>(data_target->operands[0]) == type_idx &&
+ UnwrapPointer(data_target->operands[1]) == &dex_file) {
+ return data_target;
+ }
+ data_target = data_target->next;
+ }
+ return nullptr;
+}
+
/*
* The following are building blocks to insert constants into the pool or
* instruction streams.
@@ -492,10 +504,13 @@ void Mir2Lir::InstallLiteralPools() {
data_lir = class_literal_list_;
while (data_lir != NULL) {
uint32_t target_method_idx = data_lir->operands[0];
+ const DexFile* class_dex_file =
+ reinterpret_cast<const DexFile*>(UnwrapPointer(data_lir->operands[1]));
cu_->compiler_driver->AddClassPatch(cu_->dex_file,
cu_->class_def_idx,
cu_->method_idx,
target_method_idx,
+ class_dex_file,
code_buffer_.size());
const DexFile::TypeId& target_method_id = cu_->dex_file->GetTypeId(target_method_idx);
// unique value based on target to ensure code deduplication works
@@ -983,6 +998,8 @@ Mir2Lir::Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena
estimated_native_code_size_(0),
reg_pool_(NULL),
live_sreg_(0),
+ core_vmap_table_(mir_graph->GetArena()->Adapter()),
+ fp_vmap_table_(mir_graph->GetArena()->Adapter()),
num_core_spills_(0),
num_fp_spills_(0),
frame_size_(0),
@@ -1220,12 +1237,14 @@ void Mir2Lir::LoadMethodAddress(const MethodReference& target_method, InvokeType
DCHECK_NE(cu_->instruction_set, kMips) << reinterpret_cast<void*>(data_target);
}
-void Mir2Lir::LoadClassType(uint32_t type_idx, SpecialTargetRegister symbolic_reg) {
+void Mir2Lir::LoadClassType(const DexFile& dex_file, uint32_t type_idx,
+ SpecialTargetRegister symbolic_reg) {
// Use the literal pool and a PC-relative load from a data word.
- LIR* data_target = ScanLiteralPool(class_literal_list_, type_idx, 0);
+ LIR* data_target = ScanLiteralPoolClass(class_literal_list_, dex_file, type_idx);
if (data_target == nullptr) {
data_target = AddWordData(&class_literal_list_, type_idx);
}
+ data_target->operands[1] = WrapPointer(const_cast<DexFile*>(&dex_file));
// Loads a Class pointer, which is a reference as it lives in the heap.
LIR* load_pc_rel = OpPcRelLoad(TargetReg(symbolic_reg, kRef), data_target);
AppendLIR(load_pc_rel);
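A type index is only meaningful relative to its dex file, which is why the class literal is now keyed on the (dex_file, type_idx) pair: two dex files can assign the same index to different classes. A hypothetical standalone restatement of the matching rule from ScanLiteralPoolClass():

    #include <cstdint>

    struct DexFile;  // Opaque here.
    struct ClassLiteral {
      uint32_t type_idx;
      const DexFile* dex_file;
      ClassLiteral* next;
    };

    // A literal matches only if BOTH the type index and the dex file agree.
    ClassLiteral* FindClassLiteral(ClassLiteral* head, const DexFile& dex_file,
                                   uint32_t type_idx) {
      for (ClassLiteral* p = head; p != nullptr; p = p->next) {
        if (p->type_idx == type_idx && p->dex_file == &dex_file) {
          return p;
        }
      }
      return nullptr;
    }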
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index 0e46c96..dbceaff 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -48,7 +48,12 @@ static constexpr bool kIntrinsicIsStatic[] = {
true, // kIntrinsicMinMaxFloat
true, // kIntrinsicMinMaxDouble
true, // kIntrinsicSqrt
- false, // kIntrinsicGet
+ true, // kIntrinsicCeil
+ true, // kIntrinsicFloor
+ true, // kIntrinsicRint
+ true, // kIntrinsicRoundFloat
+ true, // kIntrinsicRoundDouble
+ false, // kIntrinsicReferenceGet
false, // kIntrinsicCharAt
false, // kIntrinsicCompareTo
false, // kIntrinsicIsEmptyOrLength
@@ -75,7 +80,12 @@ COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicMinMaxLong], MinMaxLong_must_be_stat
COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicMinMaxFloat], MinMaxFloat_must_be_static);
COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicMinMaxDouble], MinMaxDouble_must_be_static);
COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicSqrt], Sqrt_must_be_static);
-COMPILE_ASSERT(!kIntrinsicIsStatic[kIntrinsicGet], Get_must_not_be_static);
+COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicCeil], Ceil_must_be_static);
+COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicFloor], Floor_must_be_static);
+COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicRint], Rint_must_be_static);
+COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicRoundFloat], RoundFloat_must_be_static);
+COMPILE_ASSERT(kIntrinsicIsStatic[kIntrinsicRoundDouble], RoundDouble_must_be_static);
+COMPILE_ASSERT(!kIntrinsicIsStatic[kIntrinsicReferenceGet], Get_must_not_be_static);
COMPILE_ASSERT(!kIntrinsicIsStatic[kIntrinsicCharAt], CharAt_must_not_be_static);
COMPILE_ASSERT(!kIntrinsicIsStatic[kIntrinsicCompareTo], CompareTo_must_not_be_static);
COMPILE_ASSERT(!kIntrinsicIsStatic[kIntrinsicIsEmptyOrLength], IsEmptyOrLength_must_not_be_static);
@@ -155,7 +165,11 @@ const char* const DexFileMethodInliner::kNameCacheNames[] = {
"max", // kNameCacheMax
"min", // kNameCacheMin
"sqrt", // kNameCacheSqrt
- "get", // kNameCacheGet
+ "ceil", // kNameCacheCeil
+ "floor", // kNameCacheFloor
+ "rint", // kNameCacheRint
+ "round", // kNameCacheRound
+ "get", // kNameCacheReferenceGet
"charAt", // kNameCacheCharAt
"compareTo", // kNameCacheCompareTo
"isEmpty", // kNameCacheIsEmpty
@@ -314,7 +328,18 @@ const DexFileMethodInliner::IntrinsicDef DexFileMethodInliner::kIntrinsicMethods
INTRINSIC(JavaLangMath, Sqrt, D_D, kIntrinsicSqrt, 0),
INTRINSIC(JavaLangStrictMath, Sqrt, D_D, kIntrinsicSqrt, 0),
- INTRINSIC(JavaLangRefReference, Get, _Object, kIntrinsicGet, 0),
+ INTRINSIC(JavaLangMath, Ceil, D_D, kIntrinsicCeil, 0),
+ INTRINSIC(JavaLangStrictMath, Ceil, D_D, kIntrinsicCeil, 0),
+ INTRINSIC(JavaLangMath, Floor, D_D, kIntrinsicFloor, 0),
+ INTRINSIC(JavaLangStrictMath, Floor, D_D, kIntrinsicFloor, 0),
+ INTRINSIC(JavaLangMath, Rint, D_D, kIntrinsicRint, 0),
+ INTRINSIC(JavaLangStrictMath, Rint, D_D, kIntrinsicRint, 0),
+ INTRINSIC(JavaLangMath, Round, F_I, kIntrinsicRoundFloat, 0),
+ INTRINSIC(JavaLangStrictMath, Round, F_I, kIntrinsicRoundFloat, 0),
+ INTRINSIC(JavaLangMath, Round, D_J, kIntrinsicRoundDouble, 0),
+ INTRINSIC(JavaLangStrictMath, Round, D_J, kIntrinsicRoundDouble, 0),
+
+ INTRINSIC(JavaLangRefReference, ReferenceGet, _Object, kIntrinsicReferenceGet, 0),
INTRINSIC(JavaLangString, CharAt, I_C, kIntrinsicCharAt, 0),
INTRINSIC(JavaLangString, CompareTo, String_I, kIntrinsicCompareTo, 0),
@@ -436,8 +461,18 @@ bool DexFileMethodInliner::GenIntrinsic(Mir2Lir* backend, CallInfo* info) {
return backend->GenInlinedMinMaxFP(info, intrinsic.d.data & kIntrinsicFlagMin, true /* is_double */);
case kIntrinsicSqrt:
return backend->GenInlinedSqrt(info);
- case kIntrinsicGet:
- return backend->GenInlinedGet(info);
+ case kIntrinsicCeil:
+ return backend->GenInlinedCeil(info);
+ case kIntrinsicFloor:
+ return backend->GenInlinedFloor(info);
+ case kIntrinsicRint:
+ return backend->GenInlinedRint(info);
+ case kIntrinsicRoundFloat:
+ return backend->GenInlinedRound(info, false /* is_double */);
+ case kIntrinsicRoundDouble:
+ return backend->GenInlinedRound(info, true /* is_double */);
+ case kIntrinsicReferenceGet:
+ return backend->GenInlinedReferenceGet(info);
case kIntrinsicCharAt:
return backend->GenInlinedCharAt(info);
case kIntrinsicCompareTo:
diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h
index cb8c165..b875e2b 100644
--- a/compiler/dex/quick/dex_file_method_inliner.h
+++ b/compiler/dex/quick/dex_file_method_inliner.h
@@ -141,7 +141,11 @@ class DexFileMethodInliner {
kNameCacheMax,
kNameCacheMin,
kNameCacheSqrt,
- kNameCacheGet,
+ kNameCacheCeil,
+ kNameCacheFloor,
+ kNameCacheRint,
+ kNameCacheRound,
+ kNameCacheReferenceGet,
kNameCacheCharAt,
kNameCacheCompareTo,
kNameCacheIsEmpty,
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index aae9155..3f22913 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -256,7 +256,7 @@ void Mir2Lir::GenCompareAndBranch(Instruction::Code opcode, RegLocation rl_src1,
RegLocation rl_temp = UpdateLoc(rl_src2);
int32_t constant_value = mir_graph_->ConstantValue(rl_src2);
if ((rl_temp.location == kLocDalvikFrame) &&
- InexpensiveConstantInt(constant_value)) {
+ InexpensiveConstantInt(constant_value, opcode)) {
// OK - convert this to a compare immediate and branch
OpCmpImmBranch(cond, rl_src1.reg, mir_graph_->ConstantValue(rl_src2), taken);
return;
@@ -361,7 +361,7 @@ void Mir2Lir::GenNewArray(uint32_t type_idx, RegLocation rl_dest,
&direct_type_ptr, &is_finalizable)) {
// The fast path.
if (!use_direct_type_ptr) {
- LoadClassType(type_idx, kArg0);
+ LoadClassType(*dex_file, type_idx, kArg0);
CallRuntimeHelperRegMethodRegLocation(kQuickAllocArrayResolved, TargetReg(kArg0, kNotWide),
rl_src, true);
} else {
@@ -961,7 +961,7 @@ void Mir2Lir::GenNewInstance(uint32_t type_idx, RegLocation rl_dest) {
!is_finalizable) {
// The fast path.
if (!use_direct_type_ptr) {
- LoadClassType(type_idx, kArg0);
+ LoadClassType(*dex_file, type_idx, kArg0);
if (!is_type_initialized) {
CallRuntimeHelperRegMethod(kQuickAllocObjectResolved, TargetReg(kArg0, kRef), true);
} else {
@@ -1808,10 +1808,6 @@ void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
switch (opcode) {
case Instruction::NOT_LONG:
- if (cu_->instruction_set == kArm64 || cu_->instruction_set == kX86_64) {
- GenNotLong(rl_dest, rl_src2);
- return;
- }
rl_src2 = LoadValueWide(rl_src2, kCoreReg);
rl_result = EvalLoc(rl_dest, kCoreReg, true);
// Check for destructive overlap
@@ -1829,39 +1825,22 @@ void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
return;
case Instruction::ADD_LONG:
case Instruction::ADD_LONG_2ADDR:
- if (cu_->instruction_set != kThumb2) {
- GenAddLong(opcode, rl_dest, rl_src1, rl_src2);
- return;
- }
first_op = kOpAdd;
second_op = kOpAdc;
break;
case Instruction::SUB_LONG:
case Instruction::SUB_LONG_2ADDR:
- if (cu_->instruction_set != kThumb2) {
- GenSubLong(opcode, rl_dest, rl_src1, rl_src2);
- return;
- }
first_op = kOpSub;
second_op = kOpSbc;
break;
case Instruction::MUL_LONG:
case Instruction::MUL_LONG_2ADDR:
- if (cu_->instruction_set != kMips) {
- GenMulLong(opcode, rl_dest, rl_src1, rl_src2);
- return;
- } else {
- call_out = true;
- TargetReg(kRet0, kNotWide).GetReg();
- target = kQuickLmul;
- }
+ call_out = true;
+ ret_reg = TargetReg(kRet0, kNotWide).GetReg();
+ target = kQuickLmul;
break;
case Instruction::DIV_LONG:
case Instruction::DIV_LONG_2ADDR:
- if (cu_->instruction_set == kArm64 || cu_->instruction_set == kX86_64) {
- GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true);
- return;
- }
call_out = true;
check_zero = true;
ret_reg = TargetReg(kRet0, kNotWide).GetReg();
@@ -1869,10 +1848,6 @@ void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
break;
case Instruction::REM_LONG:
case Instruction::REM_LONG_2ADDR:
- if (cu_->instruction_set == kArm64 || cu_->instruction_set == kX86_64) {
- GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false);
- return;
- }
call_out = true;
check_zero = true;
target = kQuickLmod;
@@ -1882,37 +1857,19 @@ void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
break;
case Instruction::AND_LONG_2ADDR:
case Instruction::AND_LONG:
- if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64 ||
- cu_->instruction_set == kArm64) {
- return GenAndLong(opcode, rl_dest, rl_src1, rl_src2);
- }
first_op = kOpAnd;
second_op = kOpAnd;
break;
case Instruction::OR_LONG:
case Instruction::OR_LONG_2ADDR:
- if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64 ||
- cu_->instruction_set == kArm64) {
- GenOrLong(opcode, rl_dest, rl_src1, rl_src2);
- return;
- }
first_op = kOpOr;
second_op = kOpOr;
break;
case Instruction::XOR_LONG:
case Instruction::XOR_LONG_2ADDR:
- if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64 ||
- cu_->instruction_set == kArm64) {
- GenXorLong(opcode, rl_dest, rl_src1, rl_src2);
- return;
- }
first_op = kOpXor;
second_op = kOpXor;
break;
- case Instruction::NEG_LONG: {
- GenNegLong(rl_dest, rl_src2);
- return;
- }
default:
LOG(FATAL) << "Invalid long arith op";
}
@@ -2051,4 +2008,92 @@ void Mir2Lir::GenConstWide(RegLocation rl_dest, int64_t value) {
StoreValueWide(rl_dest, rl_result);
}
+void Mir2Lir::GenSmallPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ const uint16_t entries = table[1];
+ // Chained cmp-and-branch.
+ const int32_t* as_int32 = reinterpret_cast<const int32_t*>(&table[2]);
+ int32_t current_key = as_int32[0];
+ const int32_t* targets = &as_int32[1];
+ rl_src = LoadValue(rl_src, kCoreReg);
+ int i = 0;
+ for (; i < entries; i++, current_key++) {
+ if (!InexpensiveConstantInt(current_key, Instruction::Code::IF_EQ)) {
+ // Switch to using a temp and add.
+ break;
+ }
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpImmBranch(kCondEq, rl_src.reg, current_key, &block_label_list_[case_block->id]);
+ }
+ if (i < entries) {
+ // The rest do not seem to be inexpensive. Try to allocate a temp and use add.
+ RegStorage key_temp = AllocTypedTemp(false, kCoreReg, false);
+ if (key_temp.Valid()) {
+ LoadConstantNoClobber(key_temp, current_key);
+ for (; i < entries - 1; i++, current_key++) {
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]);
+ OpRegImm(kOpAdd, key_temp, 1); // Increment key.
+ }
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]);
+ } else {
+ // No free temp, just finish the old loop.
+ for (; i < entries; i++, current_key++) {
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpImmBranch(kCondEq, rl_src.reg, current_key, &block_label_list_[case_block->id]);
+ }
+ }
+ }
+}
+
+void Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ if (cu_->verbose) {
+    DumpPackedSwitchTable(table);
+ }
+
+ const uint16_t entries = table[1];
+ if (entries <= kSmallSwitchThreshold) {
+ GenSmallPackedSwitch(mir, table_offset, rl_src);
+ } else {
+ // Use the backend-specific implementation.
+ GenLargePackedSwitch(mir, table_offset, rl_src);
+ }
+}
+
+void Mir2Lir::GenSmallSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ const uint16_t entries = table[1];
+ // Chained cmp-and-branch.
+ const int32_t* keys = reinterpret_cast<const int32_t*>(&table[2]);
+ const int32_t* targets = &keys[entries];
+ rl_src = LoadValue(rl_src, kCoreReg);
+ for (int i = 0; i < entries; i++) {
+ int key = keys[i];
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpImmBranch(kCondEq, rl_src.reg, key, &block_label_list_[case_block->id]);
+ }
+}
+
+void Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ if (cu_->verbose) {
+ DumpSparseSwitchTable(table);
+ }
+
+ const uint16_t entries = table[1];
+ if (entries <= kSmallSwitchThreshold) {
+ GenSmallSparseSwitch(mir, table_offset, rl_src);
+ } else {
+ // Use the backend-specific implementation.
+ GenLargeSparseSwitch(mir, table_offset, rl_src);
+ }
+}
+
} // namespace art
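With kSmallSwitchThreshold set to 5 (see mir_to_lir.h below), a switch with at most five entries is lowered to the chained compare-and-branch form above instead of a jump table. Schematically, for a packed switch over keys 10..12:

    #include <cstdint>

    // Equivalent control flow of the chained form; each `if' stands in for
    // one OpCmpImmBranch(kCondEq, ...) against the corresponding case label.
    int Dispatch(int32_t v) {
      if (v == 10) return 0;  // case_0
      if (v == 11) return 1;  // case_1
      if (v == 12) return 2;  // case_2
      return -1;              // fall through to the default block
    }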
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 5fc6996..3cfc9a6 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -25,10 +25,8 @@
#include "mirror/class-inl.h"
#include "mirror/dex_cache.h"
#include "mirror/object_array-inl.h"
-#include "mirror/reference-inl.h"
#include "mirror/string.h"
#include "mir_to_lir-inl.h"
-#include "scoped_thread_state_change.h"
#include "x86/codegen_x86.h"
namespace art {
@@ -1129,57 +1127,32 @@ RegLocation Mir2Lir::InlineTargetWide(CallInfo* info) {
return res;
}
-bool Mir2Lir::GenInlinedGet(CallInfo* info) {
+bool Mir2Lir::GenInlinedReferenceGet(CallInfo* info) {
if (cu_->instruction_set == kMips) {
// TODO - add Mips implementation
return false;
}
- // the refrence class is stored in the image dex file which might not be the same as the cu's
- // dex file. Query the reference class for the image dex file then reset to starting dex file
- // in after loading class type.
- uint16_t type_idx = 0;
- const DexFile* ref_dex_file = nullptr;
- {
- ScopedObjectAccess soa(Thread::Current());
- type_idx = mirror::Reference::GetJavaLangRefReference()->GetDexTypeIndex();
- ref_dex_file = mirror::Reference::GetJavaLangRefReference()->GetDexCache()->GetDexFile();
- }
- CHECK(LIKELY(ref_dex_file != nullptr));
-
- // address is either static within the image file, or needs to be patched up after compilation.
- bool unused_type_initialized;
bool use_direct_type_ptr;
uintptr_t direct_type_ptr;
- bool is_finalizable;
- const DexFile* old_dex = cu_->dex_file;
- cu_->dex_file = ref_dex_file;
+ ClassReference ref;
+ if (!cu_->compiler_driver->CanEmbedReferenceTypeInCode(&ref,
+ &use_direct_type_ptr, &direct_type_ptr)) {
+ return false;
+ }
+
RegStorage reg_class = TargetReg(kArg1, kRef);
Clobber(reg_class);
LockTemp(reg_class);
- if (!cu_->compiler_driver->CanEmbedTypeInCode(*ref_dex_file, type_idx, &unused_type_initialized,
- &use_direct_type_ptr, &direct_type_ptr,
- &is_finalizable) || is_finalizable) {
- cu_->dex_file = old_dex;
- // address is not known and post-compile patch is not possible, cannot insert intrinsic.
- return false;
- }
if (use_direct_type_ptr) {
LoadConstant(reg_class, direct_type_ptr);
} else {
- LoadClassType(type_idx, kArg1);
+ uint16_t type_idx = ref.first->GetClassDef(ref.second).class_idx_;
+ LoadClassType(*ref.first, type_idx, kArg1);
}
- cu_->dex_file = old_dex;
- // get the offset for flags in reference class.
- uint32_t slow_path_flag_offset = 0;
- uint32_t disable_flag_offset = 0;
- {
- ScopedObjectAccess soa(Thread::Current());
- mirror::Class* reference_class = mirror::Reference::GetJavaLangRefReference();
- slow_path_flag_offset = reference_class->GetSlowPathFlagOffset().Uint32Value();
- disable_flag_offset = reference_class->GetDisableIntrinsicFlagOffset().Uint32Value();
- }
+ uint32_t slow_path_flag_offset = cu_->compiler_driver->GetReferenceSlowFlagOffset();
+ uint32_t disable_flag_offset = cu_->compiler_driver->GetReferenceDisableFlagOffset();
CHECK(slow_path_flag_offset && disable_flag_offset &&
(slow_path_flag_offset != disable_flag_offset));
@@ -1427,6 +1400,22 @@ bool Mir2Lir::GenInlinedMinMaxFP(CallInfo* info, bool is_min, bool is_double) {
return false;
}
+bool Mir2Lir::GenInlinedCeil(CallInfo* info) {
+ return false;
+}
+
+bool Mir2Lir::GenInlinedFloor(CallInfo* info) {
+ return false;
+}
+
+bool Mir2Lir::GenInlinedRint(CallInfo* info) {
+ return false;
+}
+
+bool Mir2Lir::GenInlinedRound(CallInfo* info, bool is_double) {
+ return false;
+}
+
bool Mir2Lir::GenInlinedFloatCvt(CallInfo* info) {
if (cu_->instruction_set == kMips) {
// TODO - add Mips implementation
diff --git a/compiler/dex/quick/local_optimizations.cc b/compiler/dex/quick/local_optimizations.cc
index eec2b32..e0f4691 100644
--- a/compiler/dex/quick/local_optimizations.cc
+++ b/compiler/dex/quick/local_optimizations.cc
@@ -200,7 +200,7 @@ void Mir2Lir::ApplyLoadStoreElimination(LIR* head_lir, LIR* tail_lir) {
/* Initialize alias list */
alias_list.clear();
ResourceMask alias_reg_list_mask = kEncodeNone;
- if (!this_mem_mask.Intersects(kEncodeLiteral)) {
+ if (!this_mem_mask.Intersects(kEncodeMem) && !this_mem_mask.Intersects(kEncodeLiteral)) {
alias_list.push_back(dest_reg_id);
SetupRegMask(&alias_reg_list_mask, dest_reg_id);
}
@@ -248,7 +248,7 @@ void Mir2Lir::ApplyLoadStoreElimination(LIR* head_lir, LIR* tail_lir) {
bool is_check_lir_load = check_flags & IS_LOAD;
bool reg_compatible = RegStorage::SameRegType(check_lir->operands[0], native_reg_id);
- if (alias_mem_mask.Equals(kEncodeLiteral)) {
+ if (!alias_mem_mask.Intersects(kEncodeMem) && alias_mem_mask.Equals(kEncodeLiteral)) {
DCHECK(check_flags & IS_LOAD);
/* Same value && same register type */
if (reg_compatible && (this_lir->target == check_lir->target)) {
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index 9adddf0..4577a4c 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -61,8 +61,7 @@ bool MipsMir2Lir::GenSpecialCase(BasicBlock* bb, MIR* mir,
* done:
*
*/
-void MipsMir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void MipsMir2Lir::GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -139,8 +138,7 @@ void MipsMir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
* jr rRA
* done:
*/
-void MipsMir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void MipsMir2Lir::GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 4bd2748..43cbde7 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -92,12 +92,6 @@ class MipsMir2Lir FINAL : public Mir2Lir {
RegLocation rl_index, RegLocation rl_src, int scale, bool card_mark);
void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_shift);
- void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenAndLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2);
void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
@@ -112,16 +106,8 @@ class MipsMir2Lir FINAL : public Mir2Lir {
bool GenInlinedSqrt(CallInfo* info);
bool GenInlinedPeek(CallInfo* info, OpSize size);
bool GenInlinedPoke(CallInfo* info, OpSize size);
- void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
- void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
- void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenSubLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2);
- void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2, bool is_div);
+ void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
@@ -142,8 +128,8 @@ class MipsMir2Lir FINAL : public Mir2Lir {
int first_bit, int second_bit);
void GenNegDouble(RegLocation rl_dest, RegLocation rl_src);
void GenNegFloat(RegLocation rl_dest, RegLocation rl_src);
- void GenPackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
- void GenSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
+ void GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
+ void GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
bool GenSpecialCase(BasicBlock* bb, MIR* mir, const InlineMethod& special);
// Required for target - single operation generators.
@@ -196,6 +182,12 @@ class MipsMir2Lir FINAL : public Mir2Lir {
LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
private:
+ void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
+ void GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2);
+ void GenSubLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2);
+
void ConvertShortToLongBranch(LIR* lir);
RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2, bool is_div, bool check_zero);
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index d727615..ea56989 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -392,11 +392,6 @@ void MipsMir2Lir::OpEndIT(LIR* it) {
}
-void MipsMir2Lir::GenMulLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenMulLong for Mips";
-}
-
void MipsMir2Lir::GenAddLong(Instruction::Code opcode, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_src2) {
rl_src1 = LoadValueWide(rl_src1, kCoreReg);
@@ -441,13 +436,27 @@ void MipsMir2Lir::GenSubLong(Instruction::Code opcode, RegLocation rl_dest,
StoreValueWide(rl_dest, rl_result);
}
-void MipsMir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
- LOG(FATAL) << "Unexpected use GenNotLong()";
-}
+void MipsMir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) {
+ switch (opcode) {
+ case Instruction::ADD_LONG:
+ case Instruction::ADD_LONG_2ADDR:
+ GenAddLong(opcode, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::SUB_LONG:
+ case Instruction::SUB_LONG_2ADDR:
+ GenSubLong(opcode, rl_dest, rl_src1, rl_src2);
+ return;
+ case Instruction::NEG_LONG:
+ GenNegLong(rl_dest, rl_src2);
+ return;
+
+ default:
+ break;
+ }
-void MipsMir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2, bool is_div) {
- LOG(FATAL) << "Unexpected use GenDivRemLong()";
+ // Fallback for all other ops.
+ Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
}
void MipsMir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
@@ -470,22 +479,6 @@ void MipsMir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
StoreValueWide(rl_dest, rl_result);
}
-void MipsMir2Lir::GenAndLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1,
- RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenAndLong for Mips";
-}
-
-void MipsMir2Lir::GenOrLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenOrLong for Mips";
-}
-
-void MipsMir2Lir::GenXorLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- LOG(FATAL) << "Unexpected use of GenXorLong for Mips";
-}
-
/*
* Generate array load
*/
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 4d8b91e..e519011 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -926,11 +926,11 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list
case Instruction::XOR_INT:
case Instruction::XOR_INT_2ADDR:
if (rl_src[0].is_const &&
- InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[0]))) {
+ InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[0]), opcode)) {
GenArithOpIntLit(opcode, rl_dest, rl_src[1],
mir_graph_->ConstantValue(rl_src[0].orig_sreg));
} else if (rl_src[1].is_const &&
- InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]))) {
+ InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]), opcode)) {
GenArithOpIntLit(opcode, rl_dest, rl_src[0],
mir_graph_->ConstantValue(rl_src[1].orig_sreg));
} else {
@@ -951,7 +951,7 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list
case Instruction::USHR_INT:
case Instruction::USHR_INT_2ADDR:
if (rl_src[1].is_const &&
- InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]))) {
+ InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]), opcode)) {
GenArithOpIntLit(opcode, rl_dest, rl_src[0], mir_graph_->ConstantValue(rl_src[1]));
} else {
GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[1]);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index b19942d..2221bb5 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -32,6 +32,7 @@
#include "safe_map.h"
#include "utils/array_ref.h"
#include "utils/arena_allocator.h"
+#include "utils/arena_containers.h"
#include "utils/growable_array.h"
#include "utils/stack_checks.h"
@@ -228,6 +229,9 @@ class Mir2Lir : public Backend {
static constexpr bool kFailOnSizeError = true && kIsDebugBuild;
static constexpr bool kReportSizeError = true && kIsDebugBuild;
+ // TODO: If necessary, this could be made target-dependent.
+ static constexpr uint16_t kSmallSwitchThreshold = 5;
+
/*
* Auxiliary information describing the location of data embedded in the Dalvik
* byte code stream.
@@ -681,6 +685,7 @@ class Mir2Lir : public Backend {
LIR* ScanLiteralPool(LIR* data_target, int value, unsigned int delta);
LIR* ScanLiteralPoolWide(LIR* data_target, int val_lo, int val_hi);
LIR* ScanLiteralPoolMethod(LIR* data_target, const MethodReference& method);
+ LIR* ScanLiteralPoolClass(LIR* data_target, const DexFile& dex_file, uint32_t type_idx);
LIR* AddWordData(LIR* *constant_list_p, int value);
LIR* AddWideData(LIR* *constant_list_p, int val_lo, int val_hi);
void ProcessSwitchTables();
@@ -867,8 +872,8 @@ class Mir2Lir : public Backend {
RegLocation rl_src1, RegLocation rl_shift);
void GenArithOpIntLit(Instruction::Code opcode, RegLocation rl_dest,
RegLocation rl_src, int lit);
- void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2);
+ virtual void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
+ RegLocation rl_src1, RegLocation rl_src2);
void GenConversionCall(QuickEntrypointEnum trampoline, RegLocation rl_dest, RegLocation rl_src);
virtual void GenSuspendTest(int opt_flags);
virtual void GenSuspendTestAndBranch(int opt_flags, LIR* target);
@@ -954,7 +959,7 @@ class Mir2Lir : public Backend {
*/
RegLocation InlineTargetWide(CallInfo* info);
- bool GenInlinedGet(CallInfo* info);
+ bool GenInlinedReferenceGet(CallInfo* info);
virtual bool GenInlinedCharAt(CallInfo* info);
bool GenInlinedStringIsEmptyOrLength(CallInfo* info, bool is_empty);
virtual bool GenInlinedReverseBits(CallInfo* info, OpSize size);
@@ -965,6 +970,10 @@ class Mir2Lir : public Backend {
virtual bool GenInlinedAbsDouble(CallInfo* info) = 0;
bool GenInlinedFloatCvt(CallInfo* info);
bool GenInlinedDoubleCvt(CallInfo* info);
+ virtual bool GenInlinedCeil(CallInfo* info);
+ virtual bool GenInlinedFloor(CallInfo* info);
+ virtual bool GenInlinedRint(CallInfo* info);
+ virtual bool GenInlinedRound(CallInfo* info, bool is_double);
virtual bool GenInlinedArrayCopyCharArray(CallInfo* info);
virtual bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
bool GenInlinedStringCompareTo(CallInfo* info);
@@ -1105,11 +1114,13 @@ class Mir2Lir : public Backend {
/*
* @brief Load the Class* of a Dex Class type into the register.
+   * @param dex_file DexFile that contains the class type.
   * @param type_idx Index of the class type within its dex file.
   * @param symbolic_reg Symbolic register that will contain the class address.
* @note register will be passed to TargetReg to get physical register.
*/
- virtual void LoadClassType(uint32_t type_idx, SpecialTargetRegister symbolic_reg);
+ virtual void LoadClassType(const DexFile& dex_file, uint32_t type_idx,
+ SpecialTargetRegister symbolic_reg);
// Routines that work for the generic case, but may be overriden by target.
/*
@@ -1246,15 +1257,6 @@ class Mir2Lir : public Backend {
// Required for target - Dalvik-level generators.
virtual void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_src2) = 0;
- virtual void GenMulLong(Instruction::Code,
- RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) = 0;
- virtual void GenAddLong(Instruction::Code,
- RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) = 0;
- virtual void GenAndLong(Instruction::Code,
- RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) = 0;
virtual void GenArithOpDouble(Instruction::Code opcode,
RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2) = 0;
@@ -1282,16 +1284,6 @@ class Mir2Lir : public Backend {
virtual bool GenInlinedSqrt(CallInfo* info) = 0;
virtual bool GenInlinedPeek(CallInfo* info, OpSize size) = 0;
virtual bool GenInlinedPoke(CallInfo* info, OpSize size) = 0;
- virtual void GenNotLong(RegLocation rl_dest, RegLocation rl_src) = 0;
- virtual void GenNegLong(RegLocation rl_dest, RegLocation rl_src) = 0;
- virtual void GenOrLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) = 0;
- virtual void GenSubLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) = 0;
- virtual void GenXorLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) = 0;
- virtual void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2, bool is_div) = 0;
virtual RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi,
bool is_div) = 0;
virtual RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit,
@@ -1369,8 +1361,19 @@ class Mir2Lir : public Backend {
int first_bit, int second_bit) = 0;
virtual void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) = 0;
virtual void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) = 0;
- virtual void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
- virtual void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
+
+  // Create code for switch statements. Will decide between the small and large versions below.
+ void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+
+ // Potentially backend-specific versions of switch instructions for shorter switch statements.
+ // The default implementation will create a chained compare-and-branch.
+ virtual void GenSmallPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ virtual void GenSmallSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ // Backend-specific versions of switch instructions for longer switch statements.
+ virtual void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
+ virtual void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
+
virtual void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array,
RegLocation rl_index, RegLocation rl_dest, int scale) = 0;
virtual void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array,
@@ -1441,6 +1444,9 @@ class Mir2Lir : public Backend {
virtual bool InexpensiveConstantFloat(int32_t value) = 0;
virtual bool InexpensiveConstantLong(int64_t value) = 0;
virtual bool InexpensiveConstantDouble(int64_t value) = 0;
+ virtual bool InexpensiveConstantInt(int32_t value, Instruction::Code opcode) {
+ return InexpensiveConstantInt(value);
+ }
// May be optimized by targets.
virtual void GenMonitorEnter(int opt_flags, RegLocation rl_src);
@@ -1711,8 +1717,8 @@ class Mir2Lir : public Backend {
CodeBuffer code_buffer_;
// The encoding mapping table data (dex -> pc offset and pc offset -> dex) with a size prefix.
std::vector<uint8_t> encoded_mapping_table_;
- std::vector<uint32_t> core_vmap_table_;
- std::vector<uint32_t> fp_vmap_table_;
+ ArenaVector<uint32_t> core_vmap_table_;
+ ArenaVector<uint32_t> fp_vmap_table_;
std::vector<uint8_t> native_gc_map_;
int num_core_spills_;
int num_fp_spills_;
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 45244e1..be966e1 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -1171,12 +1171,13 @@ void Mir2Lir::CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num
} else {
counts[p_map_idx].count += use_count;
}
- } else if (!IsInexpensiveConstant(loc)) {
+ } else {
if (loc.wide && WideGPRsAreAliases()) {
- // Longs and doubles can be counted together.
i++;
}
- counts[p_map_idx].count += use_count;
+ if (!IsInexpensiveConstant(loc)) {
+ counts[p_map_idx].count += use_count;
+ }
}
}
}
@@ -1185,9 +1186,10 @@ void Mir2Lir::CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num
static int SortCounts(const void *val1, const void *val2) {
const Mir2Lir::RefCounts* op1 = reinterpret_cast<const Mir2Lir::RefCounts*>(val1);
const Mir2Lir::RefCounts* op2 = reinterpret_cast<const Mir2Lir::RefCounts*>(val2);
- // Note that we fall back to sorting on reg so we get stable output
- // on differing qsort implementations (such as on host and target or
- // between local host and build servers).
+ // Note that we fall back to sorting on reg so we get stable output on differing qsort
+ // implementations (such as on host and target or between local host and build servers).
+ // Note also that if a wide val1 and a non-wide val2 have the same count, then val1 always
+  // "loses" (as STARTING_WIDE_SREG is or-ed into val1->s_reg).
return (op1->count == op2->count)
? (op1->s_reg - op2->s_reg)
: (op1->count < op2->count ? 1 : -1);
@@ -1230,8 +1232,8 @@ void Mir2Lir::DoPromotion() {
* TUNING: replace with linear scan once we have the ability
* to describe register live ranges for GC.
*/
- size_t core_reg_count_size = cu_->target64 ? num_regs * 2 : num_regs;
- size_t fp_reg_count_size = num_regs * 2;
+ size_t core_reg_count_size = WideGPRsAreAliases() ? num_regs : num_regs * 2;
+ size_t fp_reg_count_size = WideFPRsAreAliases() ? num_regs : num_regs * 2;
RefCounts *core_regs =
static_cast<RefCounts*>(arena_->Alloc(sizeof(RefCounts) * core_reg_count_size,
kArenaAllocRegAlloc));
@@ -1261,7 +1263,6 @@ void Mir2Lir::DoPromotion() {
// Sum use counts of SSA regs by original Dalvik vreg.
CountRefs(core_regs, fp_regs, num_regs);
-
// Sort the count arrays
qsort(core_regs, core_reg_count_size, sizeof(RefCounts), SortCounts);
qsort(fp_regs, fp_reg_count_size, sizeof(RefCounts), SortCounts);
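The count-array sizing change above follows the same aliasing rule: when wide and narrow values share physical registers (as with x0/w0 on arm64), one RefCounts slot per vreg suffices; otherwise each vreg also gets a STARTING_WIDE_SREG-tagged wide slot. A one-line sketch of the rule:

    #include <cstddef>

    size_t RefCountSlots(size_t num_regs, bool wide_regs_are_aliases) {
      return wide_regs_are_aliases ? num_regs : num_regs * 2;
    }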
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index efd9079..8ebe55c 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -35,16 +35,16 @@ const X86EncodingMap X86Mir2Lir::EncodingMap[kX86Last] = {
rm32_i32, rm32_i32_modrm, \
rm32_i8, rm32_i8_modrm) \
{ kX86 ## opname ## 8MR, kMemReg, mem_use | IS_TERTIARY_OP | REG_USE02 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_r8, 0, 0, 0, 0, 0, true }, #opname "8MR", "[!0r+!1d],!2r" }, \
-{ kX86 ## opname ## 8AR, kArrayReg, mem_use | IS_QUIN_OP | REG_USE014 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_r8, 0, 0, 0, 0, 0, true}, #opname "8AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
+{ kX86 ## opname ## 8AR, kArrayReg, mem_use | IS_QUIN_OP | REG_USE014 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_r8, 0, 0, 0, 0, 0, true }, #opname "8AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
{ kX86 ## opname ## 8TR, kThreadReg, mem_use | IS_BINARY_OP | REG_USE1 | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm8_r8, 0, 0, 0, 0, 0, true }, #opname "8TR", "fs:[!0d],!1r" }, \
{ kX86 ## opname ## 8RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE01 | SETS_CCODES | uses_ccodes, { 0, 0, r8_rm8, 0, 0, 0, 0, 0, true }, #opname "8RR", "!0r,!1r" }, \
{ kX86 ## opname ## 8RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE01 | SETS_CCODES | uses_ccodes, { 0, 0, r8_rm8, 0, 0, 0, 0, 0, true }, #opname "8RM", "!0r,[!1r+!2d]" }, \
{ kX86 ## opname ## 8RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE012 | SETS_CCODES | uses_ccodes, { 0, 0, r8_rm8, 0, 0, 0, 0, 0, true }, #opname "8RA", "!0r,[!1r+!2r<<!3d+!4d]" }, \
{ kX86 ## opname ## 8RT, kRegThread, IS_LOAD | IS_BINARY_OP | reg_def | REG_USE0 | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, r8_rm8, 0, 0, 0, 0, 0, true }, #opname "8RT", "!0r,fs:[!1d]" }, \
{ kX86 ## opname ## 8RI, kRegImm, IS_BINARY_OP | reg_def | REG_USE0 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_i8, 0, 0, rm8_i8_modrm, ax8_i8, 1, true }, #opname "8RI", "!0r,!1d" }, \
-{ kX86 ## opname ## 8MI, kMemImm, mem_use | IS_TERTIARY_OP | REG_USE0 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_i8, 0, 0, rm8_i8_modrm, 0, 1, true }, #opname "8MI", "[!0r+!1d],!2d" }, \
-{ kX86 ## opname ## 8AI, kArrayImm, mem_use | IS_QUIN_OP | REG_USE01 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_i8, 0, 0, rm8_i8_modrm, 0, 1, true }, #opname "8AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
-{ kX86 ## opname ## 8TI, kThreadImm, mem_use | IS_BINARY_OP | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm8_i8, 0, 0, rm8_i8_modrm, 0, 1, true }, #opname "8TI", "fs:[!0d],!1d" }, \
+{ kX86 ## opname ## 8MI, kMemImm, mem_use | IS_TERTIARY_OP | REG_USE0 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_i8, 0, 0, rm8_i8_modrm, 0, 1, false}, #opname "8MI", "[!0r+!1d],!2d" }, \
+{ kX86 ## opname ## 8AI, kArrayImm, mem_use | IS_QUIN_OP | REG_USE01 | SETS_CCODES | uses_ccodes, { 0, 0, rm8_i8, 0, 0, rm8_i8_modrm, 0, 1, false}, #opname "8AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
+{ kX86 ## opname ## 8TI, kThreadImm, mem_use | IS_BINARY_OP | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm8_i8, 0, 0, rm8_i8_modrm, 0, 1, false}, #opname "8TI", "fs:[!0d],!1d" }, \
\
{ kX86 ## opname ## 16MR, kMemReg, mem_use | IS_TERTIARY_OP | REG_USE02 | SETS_CCODES | uses_ccodes, { 0x66, 0, rm32_r32, 0, 0, 0, 0, 0, false }, #opname "16MR", "[!0r+!1d],!2r" }, \
{ kX86 ## opname ## 16AR, kArrayReg, mem_use | IS_QUIN_OP | REG_USE014 | SETS_CCODES | uses_ccodes, { 0x66, 0, rm32_r32, 0, 0, 0, 0, 0, false }, #opname "16AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
@@ -170,9 +170,9 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
{ kX86Mov8RA, kRegArray, IS_LOAD | IS_QUIN_OP | REG_DEF0_USE12, { 0, 0, 0x8A, 0, 0, 0, 0, 0, true }, "Mov8RA", "!0r,[!1r+!2r<<!3d+!4d]" },
{ kX86Mov8RT, kRegThread, IS_LOAD | IS_BINARY_OP | REG_DEF0, { THREAD_PREFIX, 0, 0x8A, 0, 0, 0, 0, 0, true }, "Mov8RT", "!0r,fs:[!1d]" },
{ kX86Mov8RI, kMovRegImm, IS_BINARY_OP | REG_DEF0, { 0, 0, 0xB0, 0, 0, 0, 0, 1, true }, "Mov8RI", "!0r,!1d" },
- { kX86Mov8MI, kMemImm, IS_STORE | IS_TERTIARY_OP | REG_USE0, { 0, 0, 0xC6, 0, 0, 0, 0, 1, true }, "Mov8MI", "[!0r+!1d],!2d" },
- { kX86Mov8AI, kArrayImm, IS_STORE | IS_QUIN_OP | REG_USE01, { 0, 0, 0xC6, 0, 0, 0, 0, 1, true }, "Mov8AI", "[!0r+!1r<<!2d+!3d],!4d" },
- { kX86Mov8TI, kThreadImm, IS_STORE | IS_BINARY_OP, { THREAD_PREFIX, 0, 0xC6, 0, 0, 0, 0, 1, true }, "Mov8TI", "fs:[!0d],!1d" },
+ { kX86Mov8MI, kMemImm, IS_STORE | IS_TERTIARY_OP | REG_USE0, { 0, 0, 0xC6, 0, 0, 0, 0, 1, false}, "Mov8MI", "[!0r+!1d],!2d" },
+ { kX86Mov8AI, kArrayImm, IS_STORE | IS_QUIN_OP | REG_USE01, { 0, 0, 0xC6, 0, 0, 0, 0, 1, false}, "Mov8AI", "[!0r+!1r<<!2d+!3d],!4d" },
+ { kX86Mov8TI, kThreadImm, IS_STORE | IS_BINARY_OP, { THREAD_PREFIX, 0, 0xC6, 0, 0, 0, 0, 1, false}, "Mov8TI", "fs:[!0d],!1d" },
{ kX86Mov16MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov16MR", "[!0r+!1d],!2r" },
{ kX86Mov16AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov16AR", "[!0r+!1r<<!2d+!3d],!4r" },
@@ -286,7 +286,7 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
{ kX86Test32RR, kRegReg, IS_BINARY_OP | REG_USE01 | SETS_CCODES, { 0, 0, 0x85, 0, 0, 0, 0, 0, false }, "Test32RR", "!0r,!1r" },
{ kX86Test64RR, kRegReg, IS_BINARY_OP | REG_USE01 | SETS_CCODES, { REX_W, 0, 0x85, 0, 0, 0, 0, 0, false }, "Test64RR", "!0r,!1r" },
- { kX86Test32RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | REG_USE0 | SETS_CCODES, { 0, 0, 0x85, 0, 0, 0, 0, 0, false }, "Test32RM", "!0r,[!1r+!1d]" },
+ { kX86Test32RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | REG_USE01 | SETS_CCODES, { 0, 0, 0x85, 0, 0, 0, 0, 0, false }, "Test32RM", "!0r,[!1r+!2d]" },
#define UNARY_ENCODING_MAP(opname, modrm, is_store, sets_ccodes, \
reg, reg_kind, reg_flags, \
@@ -407,9 +407,9 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
EXT_0F_ENCODING_MAP(Haddpd, 0x66, 0x7C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Haddps, 0xF2, 0x7C, REG_DEF0_USE0),
- { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" },
+ { kX86PextrbRRI, kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" },
{ kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" },
- { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
+ { kX86PextrdRRI, kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
{ kX86PextrbMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrbMRI", "[!0r+!1d],!2r,!3d" },
{ kX86PextrwMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrwMRI", "[!0r+!1d],!2r,!3d" },
{ kX86PextrdMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrdMRI", "[!0r+!1d],!2r,!3d" },
@@ -478,7 +478,7 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
{ kX86MovsxdRM, kRegMem, IS_LOAD | IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { REX_W, 0, 0x63, 0, 0, 0, 0, 0, false }, "MovsxdRM", "!0r,[!1r+!2d]" },
{ kX86MovsxdRA, kRegArray, IS_LOAD | IS_QUIN_OP | REG_DEF0 | REG_USE12, { REX_W, 0, 0x63, 0, 0, 0, 0, 0, false }, "MovsxdRA", "!0r,[!1r+!2r<<!3d+!4d]" },
- { kX86Set8R, kRegCond, IS_BINARY_OP | REG_DEF0 | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0, true }, "Set8R", "!1c !0r" },
+ { kX86Set8R, kRegCond, IS_BINARY_OP | REG_DEF0 | REG_USE0 | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0, true }, "Set8R", "!1c !0r" },
{ kX86Set8M, kMemCond, IS_STORE | IS_TERTIARY_OP | REG_USE0 | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0, false }, "Set8M", "!2c [!0r+!1d]" },
{ kX86Set8A, kArrayCond, IS_STORE | IS_QUIN_OP | REG_USE01 | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0, false }, "Set8A", "!4c [!0r+!1r<<!2d+!3d]" },
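
Throughout this table the trailing boolean appears to be an r8_form flag marking instructions that operate on a byte register; the 8MI/8AI/8TI rows flip it to false, which is consistent with their destinations being memory (or thread-local storage) rather than a byte register. A hedged sketch of why the distinction matters to an x86-64 emitter:

    // Sketch only, not the ART emitter: on x86-64 the byte registers
    // SPL/BPL/SIL/DIL (encodings 4-7) are reachable only with a REX prefix,
    // so a byte *register* form may force REX where a memory form does not.
    bool NeedsRexForByteReg(bool r8_form, int reg_num) {
      return r8_form && reg_num >= 4 && reg_num <= 7;
    }

The Test32RM fix in the same file is independent: a memory-form test reads its base register, so the flags must claim REG_USE01, and the disassembly template's displacement operand is !2d, not !1d.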
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 15aae9e..f5f8671 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -27,8 +27,7 @@ namespace art {
* The sparse table in the literal pool is an array of <key,displacement>
* pairs.
*/
-void X86Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void X86Mir2Lir::GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -61,8 +60,7 @@ void X86Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
* jmp r_start_of_method
* done:
*/
-void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void X86Mir2Lir::GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
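
The renamed methods still consume the standard Dex switch payloads. For reference, the sparse form the comment above describes is (per the Dalvik bytecode spec):

    // Sparse-switch payload layout:
    //   uint16_t ident;           // 0x0200
    //   uint16_t size;            // number of entries
    //   int32_t  keys[size];      // case keys, sorted ascending
    //   int32_t  targets[size];   // displacements relative to the switch op
    // The packed form (ident 0x0100) stores a single first_key plus
    // targets[size].

The Large prefix suggests these overrides now handle only the out-of-line, table-driven cases, with small switches lowered elsewhere as compare/branch chains.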
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 40621b1..d3ed48d 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -167,26 +167,12 @@ class X86Mir2Lir : public Mir2Lir {
bool GenInlinedCharAt(CallInfo* info) OVERRIDE;
// Long instructions.
+ void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) OVERRIDE;
void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2) OVERRIDE;
void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_shift) OVERRIDE;
- void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) OVERRIDE;
- void GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) OVERRIDE;
- void GenAndLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) OVERRIDE;
- void GenNotLong(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
- void GenNegLong(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
- void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) OVERRIDE;
- void GenSubLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) OVERRIDE;
- void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) OVERRIDE;
- void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2, bool is_div) OVERRIDE;
void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) OVERRIDE;
void GenIntToLong(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
@@ -260,8 +246,8 @@ class X86Mir2Lir : public Mir2Lir {
int first_bit, int second_bit) OVERRIDE;
void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
- void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
- void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
/**
* @brief Implement instanceof a final class with x86 specific code.
@@ -333,11 +319,13 @@ class X86Mir2Lir : public Mir2Lir {
/*
* @brief Load the Class* of a Dex Class type into the register.
+ * @param dex DexFile that contains the class type.
* @param type How the method will be invoked.
* @param register that will contain the code address.
* @note register will be passed to TargetReg to get physical register.
*/
- void LoadClassType(uint32_t type_idx, SpecialTargetRegister symbolic_reg) OVERRIDE;
+ void LoadClassType(const DexFile& dex_file, uint32_t type_idx,
+ SpecialTargetRegister symbolic_reg) OVERRIDE;
void FlushIns(RegLocation* ArgLocs, RegLocation rl_method) OVERRIDE;
@@ -369,12 +357,6 @@ class X86Mir2Lir : public Mir2Lir {
void InstallLiteralPools() OVERRIDE;
/*
- * @brief Generate the debug_frame CFI information.
- * @returns pointer to vector containing CFE information
- */
- static std::vector<uint8_t>* ReturnCommonCallFrameInformation(bool is_x86_64);
-
- /*
* @brief Generate the debug_frame FDE information.
* @returns pointer to vector containing CFE information
*/
@@ -827,6 +809,16 @@ class X86Mir2Lir : public Mir2Lir {
void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
+ // Try to do a long multiplication where rl_src2 is a constant. This simplified setup might fail,
+ // in which case false will be returned.
+ bool GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64_t val);
+ void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2);
+ void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
+ void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
+ void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2, bool is_div);
+
void SpillCoreRegs();
void UnSpillCoreRegs();
void UnSpillFPRegs();
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 057639c..fdc46e2 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -1283,91 +1283,113 @@ void X86Mir2Lir::GenImulMemImm(RegStorage dest, int sreg, int displacement, int
}
}
-void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
- RegLocation rl_src2) {
- // All memory accesses below reference dalvik regs.
- ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
-
- if (cu_->target64) {
- if (rl_src1.is_const) {
- std::swap(rl_src1, rl_src2);
- }
- // Are we multiplying by a constant?
- if (rl_src2.is_const) {
- int64_t val = mir_graph_->ConstantValueWide(rl_src2);
- if (val == 0) {
- RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
- OpRegReg(kOpXor, rl_result.reg, rl_result.reg);
- StoreValueWide(rl_dest, rl_result);
- return;
- } else if (val == 1) {
- StoreValueWide(rl_dest, rl_src1);
- return;
- } else if (val == 2) {
- GenAddLong(Instruction::ADD_LONG, rl_dest, rl_src1, rl_src1);
+void X86Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) {
+ if (!cu_->target64) {
+ // Some x86 32-bit ops use the generic Mir2Lir fallback.
+ switch (opcode) {
+ case Instruction::NOT_LONG:
+ case Instruction::DIV_LONG:
+ case Instruction::DIV_LONG_2ADDR:
+ case Instruction::REM_LONG:
+ case Instruction::REM_LONG_2ADDR:
+ Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
return;
- } else if (IsPowerOfTwo(val)) {
- int shift_amount = LowestSetBit(val);
- if (!BadOverlap(rl_src1, rl_dest)) {
- rl_src1 = LoadValueWide(rl_src1, kCoreReg);
- RegLocation rl_result = GenShiftImmOpLong(Instruction::SHL_LONG, rl_dest,
- rl_src1, shift_amount);
- StoreValueWide(rl_dest, rl_result);
- return;
- }
- }
- }
- rl_src1 = LoadValueWide(rl_src1, kCoreReg);
- rl_src2 = LoadValueWide(rl_src2, kCoreReg);
- RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
- if (rl_result.reg.GetReg() == rl_src1.reg.GetReg() &&
- rl_result.reg.GetReg() == rl_src2.reg.GetReg()) {
- NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_result.reg.GetReg());
- } else if (rl_result.reg.GetReg() != rl_src1.reg.GetReg() &&
- rl_result.reg.GetReg() == rl_src2.reg.GetReg()) {
- NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_src1.reg.GetReg());
- } else if (rl_result.reg.GetReg() == rl_src1.reg.GetReg() &&
- rl_result.reg.GetReg() != rl_src2.reg.GetReg()) {
- NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_src2.reg.GetReg());
- } else {
- OpRegCopy(rl_result.reg, rl_src1.reg);
- NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_src2.reg.GetReg());
+
+ default:
+ // Everything else we can handle.
+ break;
}
- StoreValueWide(rl_dest, rl_result);
- return;
}
- if (rl_src1.is_const) {
- std::swap(rl_src1, rl_src2);
+ switch (opcode) {
+ case Instruction::NOT_LONG:
+ GenNotLong(rl_dest, rl_src2);
+ return;
+
+ case Instruction::ADD_LONG:
+ case Instruction::ADD_LONG_2ADDR:
+ GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
+ return;
+
+ case Instruction::SUB_LONG:
+ case Instruction::SUB_LONG_2ADDR:
+ GenLongArith(rl_dest, rl_src1, rl_src2, opcode, false);
+ return;
+
+ case Instruction::MUL_LONG:
+ case Instruction::MUL_LONG_2ADDR:
+ GenMulLong(opcode, rl_dest, rl_src1, rl_src2);
+ return;
+
+ case Instruction::DIV_LONG:
+ case Instruction::DIV_LONG_2ADDR:
+ GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true);
+ return;
+
+ case Instruction::REM_LONG:
+ case Instruction::REM_LONG_2ADDR:
+ GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false);
+ return;
+
+ case Instruction::AND_LONG_2ADDR:
+ case Instruction::AND_LONG:
+ GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
+ return;
+
+ case Instruction::OR_LONG:
+ case Instruction::OR_LONG_2ADDR:
+ GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
+ return;
+
+ case Instruction::XOR_LONG:
+ case Instruction::XOR_LONG_2ADDR:
+ GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
+ return;
+
+ case Instruction::NEG_LONG:
+ GenNegLong(rl_dest, rl_src2);
+ return;
+
+ default:
+ LOG(FATAL) << "Invalid long arith op";
+ return;
}
- // Are we multiplying by a constant?
- if (rl_src2.is_const) {
- // Do special compare/branch against simple const operand
- int64_t val = mir_graph_->ConstantValueWide(rl_src2);
- if (val == 0) {
- RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+}
+
+bool X86Mir2Lir::GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64_t val) {
+ // All memory accesses below reference dalvik regs.
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+
+ if (val == 0) {
+ RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+ if (cu_->target64) {
+ OpRegReg(kOpXor, rl_result.reg, rl_result.reg);
+ } else {
OpRegReg(kOpXor, rl_result.reg.GetLow(), rl_result.reg.GetLow());
OpRegReg(kOpXor, rl_result.reg.GetHigh(), rl_result.reg.GetHigh());
+ }
+ StoreValueWide(rl_dest, rl_result);
+ return true;
+ } else if (val == 1) {
+ StoreValueWide(rl_dest, rl_src1);
+ return true;
+ } else if (val == 2) {
+ GenArithOpLong(Instruction::ADD_LONG, rl_dest, rl_src1, rl_src1);
+ return true;
+ } else if (IsPowerOfTwo(val)) {
+ int shift_amount = LowestSetBit(val);
+ if (!BadOverlap(rl_src1, rl_dest)) {
+ rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+ RegLocation rl_result = GenShiftImmOpLong(Instruction::SHL_LONG, rl_dest, rl_src1,
+ shift_amount);
StoreValueWide(rl_dest, rl_result);
- return;
- } else if (val == 1) {
- StoreValueWide(rl_dest, rl_src1);
- return;
- } else if (val == 2) {
- GenAddLong(Instruction::ADD_LONG, rl_dest, rl_src1, rl_src1);
- return;
- } else if (IsPowerOfTwo(val)) {
- int shift_amount = LowestSetBit(val);
- if (!BadOverlap(rl_src1, rl_dest)) {
- rl_src1 = LoadValueWide(rl_src1, kCoreReg);
- RegLocation rl_result = GenShiftImmOpLong(Instruction::SHL_LONG, rl_dest,
- rl_src1, shift_amount);
- StoreValueWide(rl_dest, rl_result);
- return;
- }
+ return true;
}
+ }
- // Okay, just bite the bullet and do it.
+ // Okay, on 32b just bite the bullet and do it, still better than the general case.
+ if (!cu_->target64) {
int32_t val_lo = Low32Bits(val);
int32_t val_hi = High32Bits(val);
FlushAllRegs();
@@ -1408,10 +1430,48 @@ void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation
RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1,
RegStorage::MakeRegPair(rs_r0, rs_r2), INVALID_SREG, INVALID_SREG};
StoreValueWide(rl_dest, rl_result);
+ return true;
+ }
+ return false;
+}
+
+void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+ RegLocation rl_src2) {
+ if (rl_src1.is_const) {
+ std::swap(rl_src1, rl_src2);
+ }
+
+ if (rl_src2.is_const) {
+ if (GenMulLongConst(rl_dest, rl_src1, mir_graph_->ConstantValueWide(rl_src2))) {
+ return;
+ }
+ }
+
+ // All memory accesses below reference dalvik regs.
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+
+ if (cu_->target64) {
+ rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+ rl_src2 = LoadValueWide(rl_src2, kCoreReg);
+ RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+ if (rl_result.reg.GetReg() == rl_src1.reg.GetReg() &&
+ rl_result.reg.GetReg() == rl_src2.reg.GetReg()) {
+ NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_result.reg.GetReg());
+ } else if (rl_result.reg.GetReg() != rl_src1.reg.GetReg() &&
+ rl_result.reg.GetReg() == rl_src2.reg.GetReg()) {
+ NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_src1.reg.GetReg());
+ } else if (rl_result.reg.GetReg() == rl_src1.reg.GetReg() &&
+ rl_result.reg.GetReg() != rl_src2.reg.GetReg()) {
+ NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_src2.reg.GetReg());
+ } else {
+ OpRegCopy(rl_result.reg, rl_src1.reg);
+ NewLIR2(kX86Imul64RR, rl_result.reg.GetReg(), rl_src2.reg.GetReg());
+ }
+ StoreValueWide(rl_dest, rl_result);
return;
}
- // Nope. Do it the hard way
+ // Not multiplying by a constant. Do it the hard way
// Check for V*V. We can eliminate a multiply in that case, as 2L*1H == 2H*1L.
bool is_square = mir_graph_->SRegToVReg(rl_src1.s_reg_low) ==
mir_graph_->SRegToVReg(rl_src2.s_reg_low);
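
GenMulLongConst peels off the constants that admit strength reduction before anything reaches the general multiply path:

    // Identities used above, for 64-bit x and constant val:
    //   x * 0   == 0        -> xor of the result register(s)
    //   x * 1   == x        -> plain StoreValueWide of the source
    //   x * 2   == x + x    -> one ADD_LONG
    //   x * 2^n == x << n   -> one SHL_LONG by LowestSetBit(val),
    //                          e.g. x * 8 == x << 3
    // Only on 32-bit does a remaining constant still pay off: the 64x64
    // multiply decomposes as lo1*lo2 + ((lo1*hi2 + hi1*lo2) << 32), and a
    // constant operand turns two of the 32-bit multiplies into
    // immediate forms.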
@@ -1681,31 +1741,6 @@ void X86Mir2Lir::GenLongArith(RegLocation rl_dest, RegLocation rl_src1,
StoreFinalValueWide(rl_dest, rl_src1);
}
-void X86Mir2Lir::GenAddLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
-}
-
-void X86Mir2Lir::GenSubLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- GenLongArith(rl_dest, rl_src1, rl_src2, opcode, false);
-}
-
-void X86Mir2Lir::GenAndLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
-}
-
-void X86Mir2Lir::GenOrLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
-}
-
-void X86Mir2Lir::GenXorLong(Instruction::Code opcode, RegLocation rl_dest,
- RegLocation rl_src1, RegLocation rl_src2) {
- GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
-}
-
void X86Mir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
if (cu_->target64) {
rl_src = LoadValueWide(rl_src, kCoreReg);
@@ -2214,7 +2249,7 @@ void X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest
} else if (shift_amount == 1 &&
(opcode == Instruction::SHL_LONG || opcode == Instruction::SHL_LONG_2ADDR)) {
// Need to handle this here to avoid calling StoreValueWide twice.
- GenAddLong(Instruction::ADD_LONG, rl_dest, rl_src, rl_src);
+ GenArithOpLong(Instruction::ADD_LONG, rl_dest, rl_src, rl_src);
return;
}
if (BadOverlap(rl_src, rl_dest)) {
@@ -2246,7 +2281,7 @@ void X86Mir2Lir::GenArithImmOpLong(Instruction::Code opcode,
if (rl_src2.is_const) {
isConstSuccess = GenLongLongImm(rl_dest, rl_src1, rl_src2, opcode);
} else {
- GenSubLong(opcode, rl_dest, rl_src1, rl_src2);
+ GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
isConstSuccess = true;
}
break;
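
The shift_amount == 1 special case above leans on the same identity as GenMulLongConst: x << 1 == x + x, so SHL_LONG by one reuses the ADD_LONG lowering and, per the comment, avoids calling StoreValueWide twice.

    // e.g. 0x0000000080000000 << 1
    //   == 0x0000000080000000 + 0x0000000080000000
    //   == 0x0000000100000000
    // The carry out of the low word is exactly what the pairwise
    // add-with-carry lowering produces on 32-bit.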
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index a72d94a..69f3e67 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -971,19 +971,21 @@ void X86Mir2Lir::LoadMethodAddress(const MethodReference& target_method, InvokeT
method_address_insns_.Insert(move);
}
-void X86Mir2Lir::LoadClassType(uint32_t type_idx, SpecialTargetRegister symbolic_reg) {
+void X86Mir2Lir::LoadClassType(const DexFile& dex_file, uint32_t type_idx,
+ SpecialTargetRegister symbolic_reg) {
/*
* For x86, just generate a 32 bit move immediate instruction, that will be filled
* in at 'link time'. For now, put a unique value based on target to ensure that
* code deduplication works.
*/
- const DexFile::TypeId& id = cu_->dex_file->GetTypeId(type_idx);
+ const DexFile::TypeId& id = dex_file.GetTypeId(type_idx);
uintptr_t ptr = reinterpret_cast<uintptr_t>(&id);
// Generate the move instruction with the unique pointer and save index and type.
LIR *move = RawLIR(current_dalvik_offset_, kX86Mov32RI,
TargetReg(symbolic_reg, kNotWide).GetReg(),
- static_cast<int>(ptr), type_idx);
+ static_cast<int>(ptr), type_idx,
+ WrapPointer(const_cast<DexFile*>(&dex_file)));
AppendLIR(move);
class_type_address_insns_.Insert(move);
}
@@ -1068,12 +1070,16 @@ void X86Mir2Lir::InstallLiteralPools() {
for (uint32_t i = 0; i < class_type_address_insns_.Size(); i++) {
LIR* p = class_type_address_insns_.Get(i);
DCHECK_EQ(p->opcode, kX86Mov32RI);
+
+ const DexFile* class_dex_file =
+ reinterpret_cast<const DexFile*>(UnwrapPointer(p->operands[3]));
uint32_t target_method_idx = p->operands[2];
// The offset to patch is the last 4 bytes of the instruction.
int patch_offset = p->offset + p->flags.size - 4;
cu_->compiler_driver->AddClassPatch(cu_->dex_file, cu_->class_def_idx,
- cu_->method_idx, target_method_idx, patch_offset);
+ cu_->method_idx, target_method_idx, class_dex_file,
+ patch_offset);
}
// And now the PC-relative calls to methods.
@@ -1098,11 +1104,6 @@ void X86Mir2Lir::InstallLiteralPools() {
}
bool X86Mir2Lir::GenInlinedArrayCopyCharArray(CallInfo* info) {
- if (cu_->target64) {
- // TODO: Implement ArrayCOpy intrinsic for x86_64
- return false;
- }
-
RegLocation rl_src = info->args[0];
RegLocation rl_srcPos = info->args[1];
RegLocation rl_dst = info->args[2];
@@ -1115,31 +1116,32 @@ bool X86Mir2Lir::GenInlinedArrayCopyCharArray(CallInfo* info) {
return false;
}
ClobberCallerSave();
- LockCallTemps(); // Using fixed registers
- LoadValueDirectFixed(rl_src , rs_rAX);
- LoadValueDirectFixed(rl_dst , rs_rCX);
- LIR* src_dst_same = OpCmpBranch(kCondEq, rs_rAX , rs_rCX, nullptr);
- LIR* src_null_branch = OpCmpImmBranch(kCondEq, rs_rAX , 0, nullptr);
- LIR* dst_null_branch = OpCmpImmBranch(kCondEq, rs_rCX , 0, nullptr);
- LoadValueDirectFixed(rl_length , rs_rDX);
- LIR* len_negative = OpCmpImmBranch(kCondLt, rs_rDX , 0, nullptr);
- LIR* len_too_big = OpCmpImmBranch(kCondGt, rs_rDX , 128, nullptr);
- LoadValueDirectFixed(rl_src , rs_rAX);
- LoadWordDisp(rs_rAX , mirror::Array::LengthOffset().Int32Value(), rs_rAX);
+ LockCallTemps(); // Using fixed registers.
+ RegStorage tmp_reg = cu_->target64 ? rs_r11 : rs_rBX;
+ LoadValueDirectFixed(rl_src, rs_rAX);
+ LoadValueDirectFixed(rl_dst, rs_rCX);
+ LIR* src_dst_same = OpCmpBranch(kCondEq, rs_rAX, rs_rCX, nullptr);
+ LIR* src_null_branch = OpCmpImmBranch(kCondEq, rs_rAX, 0, nullptr);
+ LIR* dst_null_branch = OpCmpImmBranch(kCondEq, rs_rCX, 0, nullptr);
+ LoadValueDirectFixed(rl_length, rs_rDX);
+ // If the length of the copy is > 128 characters (256 bytes) or negative, branch to the slow path.
+ LIR* len_too_big = OpCmpImmBranch(kCondHi, rs_rDX, 128, nullptr);
+ LoadValueDirectFixed(rl_src, rs_rAX);
+ LoadWordDisp(rs_rAX, mirror::Array::LengthOffset().Int32Value(), rs_rAX);
LIR* src_bad_len = nullptr;
LIR* srcPos_negative = nullptr;
if (!rl_srcPos.is_const) {
- LoadValueDirectFixed(rl_srcPos , rs_rBX);
- srcPos_negative = OpCmpImmBranch(kCondLt, rs_rBX , 0, nullptr);
- OpRegReg(kOpAdd, rs_rBX, rs_rDX);
- src_bad_len = OpCmpBranch(kCondLt, rs_rAX , rs_rBX, nullptr);
+ LoadValueDirectFixed(rl_srcPos, tmp_reg);
+ srcPos_negative = OpCmpImmBranch(kCondLt, tmp_reg, 0, nullptr);
+ OpRegReg(kOpAdd, tmp_reg, rs_rDX);
+ src_bad_len = OpCmpBranch(kCondLt, rs_rAX, tmp_reg, nullptr);
} else {
- int pos_val = mir_graph_->ConstantValue(rl_srcPos.orig_sreg);
+ int32_t pos_val = mir_graph_->ConstantValue(rl_srcPos.orig_sreg);
if (pos_val == 0) {
- src_bad_len = OpCmpBranch(kCondLt, rs_rAX , rs_rDX, nullptr);
+ src_bad_len = OpCmpBranch(kCondLt, rs_rAX, rs_rDX, nullptr);
} else {
- OpRegRegImm(kOpAdd, rs_rBX, rs_rDX, pos_val);
- src_bad_len = OpCmpBranch(kCondLt, rs_rAX , rs_rBX, nullptr);
+ OpRegRegImm(kOpAdd, tmp_reg, rs_rDX, pos_val);
+ src_bad_len = OpCmpBranch(kCondLt, rs_rAX, tmp_reg, nullptr);
}
}
LIR* dstPos_negative = nullptr;
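
Replacing the len_negative/len_too_big branch pair with a single kCondHi branch is the classic unsigned-range trick: reinterpreted as unsigned, every negative 32-bit length compares above 128, so one branch covers both bad cases. In plain C++:

    // One unsigned compare covers both rejection cases:
    bool NeedsSlowPath(int32_t len) {
      return static_cast<uint32_t>(len) > 128u;  // len < 0 || len > 128
    }

This is also why the jump-target wiring further down drops len_negative: that LIR no longer exists.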
@@ -1147,49 +1149,49 @@ bool X86Mir2Lir::GenInlinedArrayCopyCharArray(CallInfo* info) {
LoadValueDirectFixed(rl_dst, rs_rAX);
LoadWordDisp(rs_rAX, mirror::Array::LengthOffset().Int32Value(), rs_rAX);
if (!rl_dstPos.is_const) {
- LoadValueDirectFixed(rl_dstPos , rs_rBX);
- dstPos_negative = OpCmpImmBranch(kCondLt, rs_rBX , 0, nullptr);
- OpRegRegReg(kOpAdd, rs_rBX, rs_rBX, rs_rDX);
- dst_bad_len = OpCmpBranch(kCondLt, rs_rAX , rs_rBX, nullptr);
+ LoadValueDirectFixed(rl_dstPos, tmp_reg);
+ dstPos_negative = OpCmpImmBranch(kCondLt, tmp_reg, 0, nullptr);
+ OpRegRegReg(kOpAdd, tmp_reg, tmp_reg, rs_rDX);
+ dst_bad_len = OpCmpBranch(kCondLt, rs_rAX, tmp_reg, nullptr);
} else {
- int pos_val = mir_graph_->ConstantValue(rl_dstPos.orig_sreg);
+ int32_t pos_val = mir_graph_->ConstantValue(rl_dstPos.orig_sreg);
if (pos_val == 0) {
- dst_bad_len = OpCmpBranch(kCondLt, rs_rAX , rs_rDX, nullptr);
+ dst_bad_len = OpCmpBranch(kCondLt, rs_rAX, rs_rDX, nullptr);
} else {
- OpRegRegImm(kOpAdd, rs_rBX, rs_rDX, pos_val);
- dst_bad_len = OpCmpBranch(kCondLt, rs_rAX , rs_rBX, nullptr);
+ OpRegRegImm(kOpAdd, tmp_reg, rs_rDX, pos_val);
+ dst_bad_len = OpCmpBranch(kCondLt, rs_rAX, tmp_reg, nullptr);
}
}
- // everything is checked now
- LoadValueDirectFixed(rl_src , rs_rAX);
- LoadValueDirectFixed(rl_dst , rs_rBX);
- LoadValueDirectFixed(rl_srcPos , rs_rCX);
+ // Everything is checked now.
+ LoadValueDirectFixed(rl_src, rs_rAX);
+ LoadValueDirectFixed(rl_dst, tmp_reg);
+ LoadValueDirectFixed(rl_srcPos, rs_rCX);
NewLIR5(kX86Lea32RA, rs_rAX.GetReg(), rs_rAX.GetReg(),
- rs_rCX.GetReg() , 1, mirror::Array::DataOffset(2).Int32Value());
- // RAX now holds the address of the first src element to be copied
+ rs_rCX.GetReg(), 1, mirror::Array::DataOffset(2).Int32Value());
+ // RAX now holds the address of the first src element to be copied.
- LoadValueDirectFixed(rl_dstPos , rs_rCX);
- NewLIR5(kX86Lea32RA, rs_rBX.GetReg(), rs_rBX.GetReg(),
- rs_rCX.GetReg() , 1, mirror::Array::DataOffset(2).Int32Value() );
- // RBX now holds the address of the first dst element to be copied
+ LoadValueDirectFixed(rl_dstPos, rs_rCX);
+ NewLIR5(kX86Lea32RA, tmp_reg.GetReg(), tmp_reg.GetReg(),
+ rs_rCX.GetReg(), 1, mirror::Array::DataOffset(2).Int32Value() );
+ // tmp_reg (RBX on 32-bit, R11 on 64-bit) now holds the address of the first dst element to be copied.
- // check if the number of elements to be copied is odd or even. If odd
+ // Check if the number of elements to be copied is odd or even. If odd
// then copy the first element (so that the remaining number of elements
// is even).
- LoadValueDirectFixed(rl_length , rs_rCX);
+ LoadValueDirectFixed(rl_length, rs_rCX);
OpRegImm(kOpAnd, rs_rCX, 1);
LIR* jmp_to_begin_loop = OpCmpImmBranch(kCondEq, rs_rCX, 0, nullptr);
OpRegImm(kOpSub, rs_rDX, 1);
LoadBaseIndexedDisp(rs_rAX, rs_rDX, 1, 0, rs_rCX, kSignedHalf);
- StoreBaseIndexedDisp(rs_rBX, rs_rDX, 1, 0, rs_rCX, kSignedHalf);
+ StoreBaseIndexedDisp(tmp_reg, rs_rDX, 1, 0, rs_rCX, kSignedHalf);
- // since the remaining number of elements is even, we will copy by
+ // Since the remaining number of elements is even, we will copy by
// two elements at a time.
- LIR *beginLoop = NewLIR0(kPseudoTargetLabel);
- LIR* jmp_to_ret = OpCmpImmBranch(kCondEq, rs_rDX , 0, nullptr);
+ LIR* beginLoop = NewLIR0(kPseudoTargetLabel);
+ LIR* jmp_to_ret = OpCmpImmBranch(kCondEq, rs_rDX, 0, nullptr);
OpRegImm(kOpSub, rs_rDX, 2);
LoadBaseIndexedDisp(rs_rAX, rs_rDX, 1, 0, rs_rCX, kSingle);
- StoreBaseIndexedDisp(rs_rBX, rs_rDX, 1, 0, rs_rCX, kSingle);
+ StoreBaseIndexedDisp(tmp_reg, rs_rDX, 1, 0, rs_rCX, kSingle);
OpUnconditionalBranch(beginLoop);
LIR *check_failed = NewLIR0(kPseudoTargetLabel);
LIR* launchpad_branch = OpUnconditionalBranch(nullptr);
@@ -1197,7 +1199,6 @@ bool X86Mir2Lir::GenInlinedArrayCopyCharArray(CallInfo* info) {
jmp_to_ret->target = return_point;
jmp_to_begin_loop->target = beginLoop;
src_dst_same->target = check_failed;
- len_negative->target = check_failed;
len_too_big->target = check_failed;
src_null_branch->target = check_failed;
if (srcPos_negative != nullptr)
@@ -1442,11 +1443,6 @@ static void AdvanceLoc(std::vector<uint8_t>&buf, uint32_t increment) {
}
}
-
-std::vector<uint8_t>* X86CFIInitialization(bool is_x86_64) {
- return X86Mir2Lir::ReturnCommonCallFrameInformation(is_x86_64);
-}
-
static void EncodeUnsignedLeb128(std::vector<uint8_t>& buf, uint32_t value) {
uint8_t buffer[12];
uint8_t *ptr = EncodeUnsignedLeb128(buffer, value);
@@ -1463,84 +1459,6 @@ static void EncodeSignedLeb128(std::vector<uint8_t>& buf, int32_t value) {
}
}
-std::vector<uint8_t>* X86Mir2Lir::ReturnCommonCallFrameInformation(bool is_x86_64) {
- std::vector<uint8_t>*cfi_info = new std::vector<uint8_t>;
-
- // Length (will be filled in later in this routine).
- PushWord(*cfi_info, 0);
-
- // CIE id: always 0.
- PushWord(*cfi_info, 0);
-
- // Version: always 1.
- cfi_info->push_back(0x01);
-
- // Augmentation: 'zR\0'
- cfi_info->push_back(0x7a);
- cfi_info->push_back(0x52);
- cfi_info->push_back(0x0);
-
- // Code alignment: 1.
- EncodeUnsignedLeb128(*cfi_info, 1);
-
- // Data alignment.
- if (is_x86_64) {
- EncodeSignedLeb128(*cfi_info, -8);
- } else {
- EncodeSignedLeb128(*cfi_info, -4);
- }
-
- // Return address register.
- if (is_x86_64) {
- // R16(RIP)
- cfi_info->push_back(0x10);
- } else {
- // R8(EIP)
- cfi_info->push_back(0x08);
- }
-
- // Augmentation length: 1.
- cfi_info->push_back(1);
-
- // Augmentation data: 0x03 ((DW_EH_PE_absptr << 4) | DW_EH_PE_udata4).
- cfi_info->push_back(0x03);
-
- // Initial instructions.
- if (is_x86_64) {
- // DW_CFA_def_cfa R7(RSP) 8.
- cfi_info->push_back(0x0c);
- cfi_info->push_back(0x07);
- cfi_info->push_back(0x08);
-
- // DW_CFA_offset R16(RIP) 1 (* -8).
- cfi_info->push_back(0x90);
- cfi_info->push_back(0x01);
- } else {
- // DW_CFA_def_cfa R4(ESP) 4.
- cfi_info->push_back(0x0c);
- cfi_info->push_back(0x04);
- cfi_info->push_back(0x04);
-
- // DW_CFA_offset R8(EIP) 1 (* -4).
- cfi_info->push_back(0x88);
- cfi_info->push_back(0x01);
- }
-
- // Padding to a multiple of 4
- while ((cfi_info->size() & 3) != 0) {
- // DW_CFA_nop is encoded as 0.
- cfi_info->push_back(0);
- }
-
- // Set the length of the CIE inside the generated bytes.
- uint32_t length = cfi_info->size() - 4;
- (*cfi_info)[0] = length;
- (*cfi_info)[1] = length >> 8;
- (*cfi_info)[2] = length >> 16;
- (*cfi_info)[3] = length >> 24;
- return cfi_info;
-}
-
static bool ARTRegIDToDWARFRegID(bool is_x86_64, int art_reg_id, int* dwarf_reg_id) {
if (is_x86_64) {
switch (art_reg_id) {
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index a77d79e..a48613f 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -565,6 +565,7 @@ LIR* X86Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
bool is_fp = r_dest.IsFloat();
// TODO: clean this up once we fully recognize 64-bit storage containers.
if (is_fp) {
+ DCHECK(r_dest.IsDouble());
if (value == 0) {
return NewLIR2(kX86XorpsRR, low_reg_val, low_reg_val);
} else if (base_of_code_ != nullptr) {
@@ -594,16 +595,23 @@ LIR* X86Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
Clobber(rl_method.reg);
store_method_addr_used_ = true;
} else {
- if (val_lo == 0) {
- res = NewLIR2(kX86XorpsRR, low_reg_val, low_reg_val);
+ if (r_dest.IsPair()) {
+ if (val_lo == 0) {
+ res = NewLIR2(kX86XorpsRR, low_reg_val, low_reg_val);
+ } else {
+ res = LoadConstantNoClobber(RegStorage::FloatSolo32(low_reg_val), val_lo);
+ }
+ if (val_hi != 0) {
+ RegStorage r_dest_hi = AllocTempDouble();
+ LoadConstantNoClobber(r_dest_hi, val_hi);
+ NewLIR2(kX86PunpckldqRR, low_reg_val, r_dest_hi.GetReg());
+ FreeTemp(r_dest_hi);
+ }
} else {
- res = LoadConstantNoClobber(RegStorage::FloatSolo32(low_reg_val), val_lo);
- }
- if (val_hi != 0) {
- RegStorage r_dest_hi = AllocTempDouble();
- LoadConstantNoClobber(r_dest_hi, val_hi);
- NewLIR2(kX86PunpckldqRR, low_reg_val, r_dest_hi.GetReg());
- FreeTemp(r_dest_hi);
+ RegStorage r_temp = AllocTypedTempWide(false, kCoreReg);
+ res = LoadConstantWide(r_temp, value);
+ OpRegCopyWide(r_dest, r_temp);
+ FreeTemp(r_temp);
}
}
} else {
@@ -1008,8 +1016,8 @@ void X86Mir2Lir::AnalyzeFPInstruction(int opcode, BasicBlock * bb, MIR *mir) {
}
void X86Mir2Lir::AnalyzeDoubleUse(RegLocation use) {
- // If this is a double literal, we will want it in the literal pool.
- if (use.is_const) {
+ // If this is a double literal, we will want it in the literal pool on 32b platforms.
+ if (use.is_const && !cu_->target64) {
store_method_addr_ = true;
}
}
@@ -1043,12 +1051,18 @@ RegLocation X86Mir2Lir::UpdateLocWideTyped(RegLocation loc, int reg_class) {
}
void X86Mir2Lir::AnalyzeInvokeStatic(int opcode, BasicBlock * bb, MIR *mir) {
+ // For now this is only relevant for x86-32.
+ if (cu_->target64) {
+ return;
+ }
+
uint32_t index = mir->dalvikInsn.vB;
if (!(mir->optimization_flags & MIR_INLINED)) {
DCHECK(cu_->compiler_driver->GetMethodInlinerMap() != nullptr);
+ DexFileMethodInliner* method_inliner =
+ cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(cu_->dex_file);
InlineMethod method;
- if (cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(cu_->dex_file)
- ->IsIntrinsic(index, &method)) {
+ if (method_inliner->IsIntrinsic(index, &method)) {
switch (method.opcode) {
case kIntrinsicAbsDouble:
case kIntrinsicMinMaxDouble:
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index f85bc65..f40120e 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -353,7 +353,6 @@ CompilerDriver::CompilerDriver(const CompilerOptions* compiler_options,
compiler_enable_auto_elf_loading_(NULL),
compiler_get_method_code_addr_(NULL),
support_boot_image_fixup_(instruction_set != kMips),
- cfi_info_(nullptr),
dedupe_code_("dedupe code"),
dedupe_mapping_table_("dedupe mapping table"),
dedupe_vmap_table_("dedupe vmap table"),
@@ -376,11 +375,6 @@ CompilerDriver::CompilerDriver(const CompilerOptions* compiler_options,
CHECK(image_classes_.get() == nullptr);
}
- // Are we generating CFI information?
- if (compiler_options->GetGenerateGDBInformation()) {
- cfi_info_.reset(compiler_->GetCallFrameInformationInitialization(*this));
- }
-
// Read the profile file if one is provided.
if (!profile_file.empty()) {
profile_present_ = profile_file_.LoadFile(profile_file);
@@ -597,7 +591,7 @@ void CompilerDriver::Resolve(jobject class_loader, const std::vector<const DexFi
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != nullptr);
- ResolveDexFile(class_loader, *dex_file, thread_pool, timings);
+ ResolveDexFile(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -933,13 +927,13 @@ bool CompilerDriver::CanEmbedTypeInCode(const DexFile& dex_file, uint32_t type_i
}
*out_is_finalizable = resolved_class->IsFinalizable();
const bool compiling_boot = Runtime::Current()->GetHeap()->IsCompilingBoot();
+ const bool support_boot_image_fixup = GetSupportBootImageFixup();
if (compiling_boot) {
// boot -> boot class pointers.
// True if the class is in the image at boot compiling time.
const bool is_image_class = IsImage() && IsImageClass(
dex_file.StringDataByIdx(dex_file.GetTypeId(type_idx).descriptor_idx_));
// True if pc relative load works.
- const bool support_boot_image_fixup = GetSupportBootImageFixup();
if (is_image_class && support_boot_image_fixup) {
*is_type_initialized = resolved_class->IsInitialized();
*use_direct_type_ptr = false;
@@ -952,7 +946,7 @@ bool CompilerDriver::CanEmbedTypeInCode(const DexFile& dex_file, uint32_t type_i
// True if the class is in the image at app compiling time.
const bool class_in_image =
Runtime::Current()->GetHeap()->FindSpaceFromObject(resolved_class, false)->IsImageSpace();
- if (class_in_image) {
+ if (class_in_image && support_boot_image_fixup) {
// boot -> app class pointers.
*is_type_initialized = resolved_class->IsInitialized();
// TODO This is somewhat hacky. We should refactor all of this invoke codepath.
@@ -969,6 +963,43 @@ bool CompilerDriver::CanEmbedTypeInCode(const DexFile& dex_file, uint32_t type_i
}
}
+bool CompilerDriver::CanEmbedReferenceTypeInCode(ClassReference* ref,
+ bool* use_direct_ptr,
+ uintptr_t* direct_type_ptr) {
+ CHECK(ref != nullptr);
+ CHECK(use_direct_ptr != nullptr);
+ CHECK(direct_type_ptr != nullptr);
+
+ ScopedObjectAccess soa(Thread::Current());
+ mirror::Class* reference_class = mirror::Reference::GetJavaLangRefReference();
+ bool is_initialized;
+ bool unused_finalizable;
+ // Make sure we have a finished Reference class object before attempting to use it.
+ if (!CanEmbedTypeInCode(*reference_class->GetDexCache()->GetDexFile(),
+ reference_class->GetDexTypeIndex(), &is_initialized,
+ use_direct_ptr, direct_type_ptr, &unused_finalizable) ||
+ !is_initialized) {
+ return false;
+ }
+ ref->first = &reference_class->GetDexFile();
+ ref->second = reference_class->GetDexClassDefIndex();
+ return true;
+}
+
+uint32_t CompilerDriver::GetReferenceSlowFlagOffset() const {
+ ScopedObjectAccess soa(Thread::Current());
+ mirror::Class* klass = mirror::Reference::GetJavaLangRefReference();
+ DCHECK(klass->IsInitialized());
+ return klass->GetSlowPathFlagOffset().Uint32Value();
+}
+
+uint32_t CompilerDriver::GetReferenceDisableFlagOffset() const {
+ ScopedObjectAccess soa(Thread::Current());
+ mirror::Class* klass = mirror::Reference::GetJavaLangRefReference();
+ DCHECK(klass->IsInitialized());
+ return klass->GetDisableIntrinsicFlagOffset().Uint32Value();
+}
+
void CompilerDriver::ProcessedInstanceField(bool resolved) {
if (!resolved) {
stats_->UnresolvedInstanceField();
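
The three new members exist so a backend can inline java.lang.ref.Reference.get(): embed the Reference class, then guard the fast path on its slow-path flag. A hedged sketch of the intended consumption (the pseudo-assembly names are illustrative, not ART's codegen API):

    // Assumed usage when inlining Reference.get():
    //   tmp <- load32 [reference_class + GetReferenceSlowFlagOffset()]
    //   if (tmp != 0) goto slow_path          // call the runtime instead
    //   ret <- load [ref + referent_offset]   // fast path: plain field load
    // CanEmbedReferenceTypeInCode() additionally refuses unless the class
    // is resolved and initialized, so the flag load is always valid.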
@@ -1340,12 +1371,14 @@ void CompilerDriver::AddClassPatch(const DexFile* dex_file,
uint16_t referrer_class_def_idx,
uint32_t referrer_method_idx,
uint32_t target_type_idx,
+ const DexFile* target_type_dex_file,
size_t literal_offset) {
MutexLock mu(Thread::Current(), compiled_methods_lock_);
classes_to_patch_.push_back(new TypePatchInformation(dex_file,
referrer_class_def_idx,
referrer_method_idx,
target_type_idx,
+ target_type_dex_file,
literal_offset));
}
@@ -1357,12 +1390,14 @@ class ParallelCompilationManager {
jobject class_loader,
CompilerDriver* compiler,
const DexFile* dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool)
: index_(0),
class_linker_(class_linker),
class_loader_(class_loader),
compiler_(compiler),
dex_file_(dex_file),
+ dex_files_(dex_files),
thread_pool_(thread_pool) {}
ClassLinker* GetClassLinker() const {
@@ -1384,6 +1419,10 @@ class ParallelCompilationManager {
return dex_file_;
}
+ const std::vector<const DexFile*>& GetDexFiles() const {
+ return dex_files_;
+ }
+
void ForAll(size_t begin, size_t end, Callback callback, size_t work_units) {
Thread* self = Thread::Current();
self->AssertNoPendingException();
@@ -1441,11 +1480,24 @@ class ParallelCompilationManager {
const jobject class_loader_;
CompilerDriver* const compiler_;
const DexFile* const dex_file_;
+ const std::vector<const DexFile*>& dex_files_;
ThreadPool* const thread_pool_;
DISALLOW_COPY_AND_ASSIGN(ParallelCompilationManager);
};
+static bool SkipClassCheckClassPath(const char* descriptor, const DexFile& dex_file,
+ const std::vector<const DexFile*>& classpath) {
+ DexFile::ClassPathEntry pair = DexFile::FindInClassPath(descriptor, classpath);
+ CHECK(pair.second != NULL);
+ if (pair.first != &dex_file) {
+ LOG(WARNING) << "Skipping class " << descriptor << " from " << dex_file.GetLocation()
+ << " previously found in " << pair.first->GetLocation();
+ return true;
+ }
+ return false;
+}
+
// Return true if the class should be skipped during compilation.
//
// The first case where we skip is for redundant class definitions in
@@ -1454,20 +1506,23 @@ class ParallelCompilationManager {
// The second case where we skip is when an app bundles classes found
// in the boot classpath. Since at runtime we will select the class from
// the boot classpath, we ignore the one from the app.
+//
+// The third case is if the app itself has the class defined in multiple dex files. Then we skip
+// it if it is not the first occurrence.
static bool SkipClass(ClassLinker* class_linker, jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
const DexFile::ClassDef& class_def) {
const char* descriptor = dex_file.GetClassDescriptor(class_def);
+
if (class_loader == NULL) {
- DexFile::ClassPathEntry pair = DexFile::FindInClassPath(descriptor, class_linker->GetBootClassPath());
- CHECK(pair.second != NULL);
- if (pair.first != &dex_file) {
- LOG(WARNING) << "Skipping class " << descriptor << " from " << dex_file.GetLocation()
- << " previously found in " << pair.first->GetLocation();
- return true;
- }
- return false;
+ return SkipClassCheckClassPath(descriptor, dex_file, class_linker->GetBootClassPath());
+ }
+
+ if (class_linker->IsInBootClassPath(descriptor)) {
+ return true;
}
- return class_linker->IsInBootClassPath(descriptor);
+
+ return SkipClassCheckClassPath(descriptor, dex_file, dex_files);
}
// A fast version of SkipClass above if the class pointer is available
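
With the third case in place, an app that packages the same class in several of its own dex files compiles only the first definition, mirroring what the boot-classpath check already did. A worked example under that reading:

    // classes.dex and classes2.dex both define "Lcom/example/Dup;".
    // While compiling classes2.dex:
    //   SkipClass(..., classes2_dex, {classes_dex, classes2_dex}, def)
    //     -> not in the boot classpath
    //     -> SkipClassCheckClassPath(descriptor, classes2_dex, dex_files)
    //     -> FindInClassPath returns classes_dex (first match)
    //     -> pair.first != &classes2_dex: warn and return true (skip).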
@@ -1525,7 +1580,7 @@ static void ResolveClassFieldsAndMethods(const ParallelCompilationManager* manag
// definitions, since many of them many never be referenced by
// generated code.
const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index);
- if (!SkipClass(class_linker, jclass_loader, dex_file, class_def)) {
+ if (!SkipClass(class_linker, jclass_loader, dex_file, manager->GetDexFiles(), class_def)) {
ScopedObjectAccess soa(self);
StackHandleScope<2> hs(soa.Self());
Handle<mirror::ClassLoader> class_loader(
@@ -1632,13 +1687,15 @@ static void ResolveType(const ParallelCompilationManager* manager, size_t type_i
}
void CompilerDriver::ResolveDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
// TODO: we could resolve strings here, although the string table is largely filled with class
// and method names.
- ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, thread_pool);
+ ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, dex_files,
+ thread_pool);
if (IsImage()) {
// For images we resolve all types, such as array, whereas for applications just those with
// classdefs are resolved by ResolveClassFieldsAndMethods.
@@ -1655,7 +1712,7 @@ void CompilerDriver::Verify(jobject class_loader, const std::vector<const DexFil
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != NULL);
- VerifyDexFile(class_loader, *dex_file, thread_pool, timings);
+ VerifyDexFile(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -1707,10 +1764,12 @@ static void VerifyClass(const ParallelCompilationManager* manager, size_t class_
}
void CompilerDriver::VerifyDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
TimingLogger::ScopedTiming t("Verify Dex File", timings);
ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
- ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, thread_pool);
+ ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, dex_files,
+ thread_pool);
context.ForAll(0, dex_file.NumClassDefs(), VerifyClass, thread_count_);
}
@@ -1800,10 +1859,12 @@ static void InitializeClass(const ParallelCompilationManager* manager, size_t cl
}
void CompilerDriver::InitializeClasses(jobject jni_class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
TimingLogger::ScopedTiming t("InitializeNoClinit", timings);
ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
- ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, thread_pool);
+ ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, dex_files,
+ thread_pool);
size_t thread_count;
if (IsImage()) {
// TODO: remove this when transactional mode supports multithreading.
@@ -1824,7 +1885,7 @@ void CompilerDriver::InitializeClasses(jobject class_loader,
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != NULL);
- InitializeClasses(class_loader, *dex_file, thread_pool, timings);
+ InitializeClasses(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -1833,7 +1894,7 @@ void CompilerDriver::Compile(jobject class_loader, const std::vector<const DexFi
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != NULL);
- CompileDexFile(class_loader, *dex_file, thread_pool, timings);
+ CompileDexFile(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -1843,7 +1904,7 @@ void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, siz
const DexFile& dex_file = *manager->GetDexFile();
const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index);
ClassLinker* class_linker = manager->GetClassLinker();
- if (SkipClass(class_linker, jclass_loader, dex_file, class_def)) {
+ if (SkipClass(class_linker, jclass_loader, dex_file, manager->GetDexFiles(), class_def)) {
return;
}
ClassReference ref(&dex_file, class_def_index);
@@ -1912,10 +1973,11 @@ void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, siz
}
void CompilerDriver::CompileDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
TimingLogger::ScopedTiming t("Compile Dex File", timings);
ParallelCompilationManager context(Runtime::Current()->GetClassLinker(), class_loader, this,
- &dex_file, thread_pool);
+ &dex_file, dex_files, thread_pool);
context.ForAll(0, dex_file.NumClassDefs(), CompilerDriver::CompileClass, thread_count_);
}
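
Every phase now threads the full dex_files list into the manager so the per-class callbacks can consult sibling dex files; the call shape is uniform across Resolve/Verify/Initialize/Compile. A condensed sketch (callback name hypothetical):

    ParallelCompilationManager context(class_linker, class_loader, this,
                                       &dex_file, dex_files, thread_pool);
    context.ForAll(0, dex_file.NumClassDefs(), PerClassCallback, thread_count_);
    // Inside the callback, manager->GetDexFiles() exposes the whole list,
    // which is exactly what the extended SkipClass() needs.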
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 6dae398..2a5cdb9 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -213,6 +213,12 @@ class CompilerDriver {
bool* is_type_initialized, bool* use_direct_type_ptr,
uintptr_t* direct_type_ptr, bool* out_is_finalizable);
+ // Query methods for the java.lang.ref.Reference class.
+ bool CanEmbedReferenceTypeInCode(ClassReference* ref,
+ bool* use_direct_type_ptr, uintptr_t* direct_type_ptr);
+ uint32_t GetReferenceSlowFlagOffset() const;
+ uint32_t GetReferenceDisableFlagOffset() const;
+
// Get the DexCache for the
mirror::DexCache* GetDexCache(const DexCompilationUnit* mUnit)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -356,6 +362,7 @@ class CompilerDriver {
uint16_t referrer_class_def_idx,
uint32_t referrer_method_idx,
uint32_t target_method_idx,
+ const DexFile* target_dex_file,
size_t literal_offset)
LOCKS_EXCLUDED(compiled_methods_lock_);
@@ -402,10 +409,6 @@ class CompilerDriver {
return dump_passes_;
}
- bool DidIncludeDebugSymbols() const {
- return compiler_options_->GetIncludeDebugSymbols();
- }
-
CumulativeLogger* GetTimingsLogger() const {
return timings_logger_;
}
@@ -549,6 +552,10 @@ class CompilerDriver {
class TypePatchInformation : public PatchInformation {
public:
+ const DexFile& GetTargetTypeDexFile() const {
+ return *target_type_dex_file_;
+ }
+
uint32_t GetTargetTypeIdx() const {
return target_type_idx_;
}
@@ -565,13 +572,15 @@ class CompilerDriver {
uint16_t referrer_class_def_idx,
uint32_t referrer_method_idx,
uint32_t target_type_idx,
+ const DexFile* target_type_dex_file,
size_t literal_offset)
: PatchInformation(dex_file, referrer_class_def_idx,
referrer_method_idx, literal_offset),
- target_type_idx_(target_type_idx) {
+ target_type_idx_(target_type_idx), target_type_dex_file_(target_type_dex_file) {
}
const uint32_t target_type_idx_;
+ const DexFile* target_type_dex_file_;
friend class CompilerDriver;
DISALLOW_COPY_AND_ASSIGN(TypePatchInformation);
@@ -599,14 +608,6 @@ class CompilerDriver {
std::vector<uint8_t>* DeduplicateGCMap(const std::vector<uint8_t>& code);
std::vector<uint8_t>* DeduplicateCFIInfo(const std::vector<uint8_t>* cfi_info);
- /*
- * @brief return the pointer to the Call Frame Information.
- * @return pointer to call frame information for this compilation.
- */
- std::vector<uint8_t>* GetCallFrameInformation() const {
- return cfi_info_.get();
- }
-
ProfileFile profile_file_;
bool profile_present_;
@@ -658,12 +659,14 @@ class CompilerDriver {
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void ResolveDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void Verify(jobject class_loader, const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings);
void VerifyDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
@@ -671,6 +674,7 @@ class CompilerDriver {
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void InitializeClasses(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_, compiled_classes_lock_);
@@ -681,6 +685,7 @@ class CompilerDriver {
void Compile(jobject class_loader, const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings);
void CompileDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void CompileMethod(const DexFile::CodeItem* code_item, uint32_t access_flags,
@@ -766,9 +771,6 @@ class CompilerDriver {
bool support_boot_image_fixup_;
- // Call Frame Information, which might be generated to help stack tracebacks.
- std::unique_ptr<std::vector<uint8_t>> cfi_info_;
-
// DeDuplication data structures, these own the corresponding byte arrays.
class DedupeHashFunc {
public:
diff --git a/compiler/elf_patcher.cc b/compiler/elf_patcher.cc
index 6112fbb..9ae755d 100644
--- a/compiler/elf_patcher.cc
+++ b/compiler/elf_patcher.cc
@@ -99,11 +99,13 @@ mirror::ArtMethod* ElfPatcher::GetTargetMethod(const CompilerDriver::CallPatchIn
mirror::Class* ElfPatcher::GetTargetType(const CompilerDriver::TypePatchInformation* patch) {
ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
StackHandleScope<2> hs(Thread::Current());
- Handle<mirror::DexCache> dex_cache(hs.NewHandle(class_linker->FindDexCache(patch->GetDexFile())));
- mirror::Class* klass = class_linker->ResolveType(patch->GetDexFile(), patch->GetTargetTypeIdx(),
+ Handle<mirror::DexCache> dex_cache(hs.NewHandle(class_linker->FindDexCache(
+ patch->GetTargetTypeDexFile())));
+ mirror::Class* klass = class_linker->ResolveType(patch->GetTargetTypeDexFile(),
+ patch->GetTargetTypeIdx(),
dex_cache, NullHandle<mirror::ClassLoader>());
CHECK(klass != NULL)
- << patch->GetDexFile().GetLocation() << " " << patch->GetTargetTypeIdx();
+ << patch->GetTargetTypeDexFile().GetLocation() << " " << patch->GetTargetTypeIdx();
CHECK(dex_cache->GetResolvedTypes()->Get(patch->GetTargetTypeIdx()) == klass)
<< patch->GetDexFile().GetLocation() << " " << patch->GetReferrerMethodIdx() << " "
<< PrettyClass(dex_cache->GetResolvedTypes()->Get(patch->GetTargetTypeIdx())) << " "
@@ -120,6 +122,7 @@ void ElfPatcher::AddPatch(uintptr_t p) {
uint32_t* ElfPatcher::GetPatchLocation(uintptr_t patch_ptr) {
CHECK_GE(patch_ptr, reinterpret_cast<uintptr_t>(oat_file_->Begin()));
+ CHECK_LE(patch_ptr, reinterpret_cast<uintptr_t>(oat_file_->End()));
uintptr_t off = patch_ptr - reinterpret_cast<uintptr_t>(oat_file_->Begin());
uintptr_t ret = reinterpret_cast<uintptr_t>(oat_header_) + off;
@@ -144,20 +147,20 @@ void ElfPatcher::SetPatchLocation(const CompilerDriver::PatchInformation* patch,
cpatch->GetTargetDexFile()->GetMethodId(cpatch->GetTargetMethodIdx());
uint32_t expected = reinterpret_cast<uintptr_t>(&id) & 0xFFFFFFFF;
uint32_t actual = *patch_location;
- CHECK(actual == expected || actual == value) << std::hex
- << "actual=" << actual
- << "expected=" << expected
- << "value=" << value;
+ CHECK(actual == expected || actual == value) << "Patching call failed: " << std::hex
+ << " actual=" << actual
+ << " expected=" << expected
+ << " value=" << value;
}
if (patch->IsType()) {
const CompilerDriver::TypePatchInformation* tpatch = patch->AsType();
- const DexFile::TypeId& id = tpatch->GetDexFile().GetTypeId(tpatch->GetTargetTypeIdx());
+ const DexFile::TypeId& id = tpatch->GetTargetTypeDexFile().GetTypeId(tpatch->GetTargetTypeIdx());
uint32_t expected = reinterpret_cast<uintptr_t>(&id) & 0xFFFFFFFF;
uint32_t actual = *patch_location;
- CHECK(actual == expected || actual == value) << std::hex
- << "actual=" << actual
- << "expected=" << expected
- << "value=" << value;
+ CHECK(actual == expected || actual == value) << "Patching type failed: " << std::hex
+ << " actual=" << actual
+ << " expected=" << expected
+ << " value=" << value;
}
}
*patch_location = value;
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index 1fde12e..71f02d3 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -24,6 +24,7 @@
#include "elf_utils.h"
#include "file_output_stream.h"
#include "globals.h"
+#include "leb128.h"
#include "oat.h"
#include "oat_writer.h"
#include "utils.h"
@@ -38,6 +39,25 @@ static uint8_t MakeStInfo(uint8_t binding, uint8_t type) {
return ((binding) << 4) + ((type) & 0xf);
}
+static void UpdateWord(std::vector<uint8_t>* buf, int offset, int data) {
+ (*buf)[offset+0] = data;
+ (*buf)[offset+1] = data >> 8;
+ (*buf)[offset+2] = data >> 16;
+ (*buf)[offset+3] = data >> 24;
+}
+
+static void PushWord(std::vector<uint8_t>* buf, int data) {
+ buf->push_back(data & 0xff);
+ buf->push_back((data >> 8) & 0xff);
+ buf->push_back((data >> 16) & 0xff);
+ buf->push_back((data >> 24) & 0xff);
+}
+
+static void PushHalf(std::vector<uint8_t>* buf, int data) {
+ buf->push_back(data & 0xff);
+ buf->push_back((data >> 8) & 0xff);
+}
+
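
These helpers append values least-significant byte first, matching the little-endian targets handled here. A standalone check of the byte order (local copy of the helper, for illustration only):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Local copy of PushWord above, so the byte order can be checked in isolation.
    static void PushWord(std::vector<uint8_t>* buf, int data) {
      buf->push_back(data & 0xff);
      buf->push_back((data >> 8) & 0xff);
      buf->push_back((data >> 16) & 0xff);
      buf->push_back((data >> 24) & 0xff);
    }

    int main() {
      std::vector<uint8_t> buf;
      PushWord(&buf, 0x11223344);
      // Little-endian: least significant byte comes first.
      assert(buf[0] == 0x44 && buf[1] == 0x33 && buf[2] == 0x22 && buf[3] == 0x11);
      return 0;
    }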
bool ElfWriterQuick::ElfBuilder::Write() {
// The basic layout of the elf file. Order may be different in final output.
// +-------------------------+
@@ -822,37 +842,131 @@ void ElfWriterQuick::ReservePatchSpace(std::vector<uint8_t>* buffer, bool debug)
}
}
+static void EncodeUnsignedLeb128(uint32_t data, std::vector<uint8_t>* dst) {
+ size_t encoded_size = UnsignedLeb128Size(data);
+ size_t cur_index = dst->size();
+ dst->resize(dst->size() + encoded_size);
+ uint8_t* write_pos = &((*dst)[cur_index]);
+ uint8_t* write_pos_after = EncodeUnsignedLeb128(write_pos, data);
+ DCHECK_EQ(static_cast<size_t>(write_pos_after - write_pos), encoded_size);
+}
+
+static void EncodeSignedLeb128(int32_t data, std::vector<uint8_t>* dst) {
+ size_t encoded_size = SignedLeb128Size(data);
+ size_t cur_index = dst->size();
+ dst->resize(dst->size() + encoded_size);
+ uint8_t* write_pos = &((*dst)[cur_index]);
+ uint8_t* write_pos_after = EncodeSignedLeb128(write_pos, data);
+ DCHECK_EQ(static_cast<size_t>(write_pos_after - write_pos), encoded_size);
+}
+
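
LEB128 packs seven data bits per byte, with the high bit marking continuation; the signed form sign-extends from bit 6 of the last byte, which is why the data alignment factor -4 below encodes as the single byte 0x7c. A worked unsigned example, as a standalone sketch:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Standalone ULEB128 encoder for illustration: 7 data bits per byte,
    // high bit set on every byte except the last.
    static std::vector<uint8_t> EncodeUleb128(uint32_t value) {
      std::vector<uint8_t> out;
      do {
        uint8_t byte = value & 0x7f;
        value >>= 7;
        out.push_back(value != 0 ? (byte | 0x80) : byte);
      } while (value != 0);
      return out;
    }

    int main() {
      // 300 = 0b10'0101100 -> 0xac (low 7 bits + continuation bit), then 0x02.
      std::vector<uint8_t> enc = EncodeUleb128(300);
      assert(enc.size() == 2 && enc[0] == 0xac && enc[1] == 0x02);
      return 0;
    }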
+std::vector<uint8_t>* ConstructCIEFrameX86(bool is_x86_64) {
+ std::vector<uint8_t>* cfi_info = new std::vector<uint8_t>;
+
+ // Length (will be filled in later in this routine).
+ PushWord(cfi_info, 0);
+
+ // CIE id: always 0.
+ PushWord(cfi_info, 0);
+
+ // Version: always 1.
+ cfi_info->push_back(0x01);
+
+ // Augmentation: 'zR\0'
+ cfi_info->push_back(0x7a);
+ cfi_info->push_back(0x52);
+ cfi_info->push_back(0x0);
+
+ // Code alignment: 1.
+ EncodeUnsignedLeb128(1, cfi_info);
+
+ // Data alignment.
+ if (is_x86_64) {
+ EncodeSignedLeb128(-8, cfi_info);
+ } else {
+ EncodeSignedLeb128(-4, cfi_info);
+ }
+
+ // Return address register.
+ if (is_x86_64) {
+ // R16(RIP)
+ cfi_info->push_back(0x10);
+ } else {
+ // R8(EIP)
+ cfi_info->push_back(0x08);
+ }
+
+ // Augmentation length: 1.
+ cfi_info->push_back(1);
+
+ // Augmentation data: 0x03 ((DW_EH_PE_absptr << 4) | DW_EH_PE_udata4).
+ cfi_info->push_back(0x03);
+
+ // Initial instructions.
+ if (is_x86_64) {
+ // DW_CFA_def_cfa R7(RSP) 8.
+ cfi_info->push_back(0x0c);
+ cfi_info->push_back(0x07);
+ cfi_info->push_back(0x08);
+
+ // DW_CFA_offset R16(RIP) 1 (* -8).
+ cfi_info->push_back(0x90);
+ cfi_info->push_back(0x01);
+ } else {
+ // DW_CFA_def_cfa R4(ESP) 4.
+ cfi_info->push_back(0x0c);
+ cfi_info->push_back(0x04);
+ cfi_info->push_back(0x04);
+
+ // DW_CFA_offset R8(EIP) 1 (* -4).
+ cfi_info->push_back(0x88);
+ cfi_info->push_back(0x01);
+ }
+
+ // Pad to a multiple of 4 bytes.
+ while ((cfi_info->size() & 3) != 0) {
+ // DW_CFA_nop is encoded as 0.
+ cfi_info->push_back(0);
+ }
+
+ // Set the length of the CIE inside the generated bytes.
+ uint32_t length = cfi_info->size() - 4;
+ (*cfi_info)[0] = length;
+ (*cfi_info)[1] = length >> 8;
+ (*cfi_info)[2] = length >> 16;
+ (*cfi_info)[3] = length >> 24;
+ return cfi_info;
+}
+
+std::vector<uint8_t>* ConstructCIEFrame(InstructionSet isa) {
+ switch (isa) {
+ case kX86:
+ return ConstructCIEFrameX86(false);
+ case kX86_64:
+ return ConstructCIEFrameX86(true);
+
+ default:
+ // Not implemented.
+ return nullptr;
+ }
+}
+
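
Worked out by hand from the routine above, the 32-bit CIE totals 24 bytes. The byte-by-byte result, shown only as a cross-check of the pushes above:

    #include <cstdint>

    // Expected output of ConstructCIEFrameX86(false), annotated.
    const uint8_t kX86Cie[] = {
        0x14, 0x00, 0x00, 0x00,  // length = 20 (excludes the length field itself)
        0x00, 0x00, 0x00, 0x00,  // CIE id
        0x01,                    // version
        0x7a, 0x52, 0x00,        // augmentation "zR"
        0x01,                    // code alignment factor: 1
        0x7c,                    // data alignment factor: -4 (SLEB128)
        0x08,                    // return address register: R8 (EIP)
        0x01,                    // augmentation data length
        0x03,                    // (DW_EH_PE_absptr << 4) | DW_EH_PE_udata4
        0x0c, 0x04, 0x04,        // DW_CFA_def_cfa R4 (ESP), offset 4
        0x88, 0x01,              // DW_CFA_offset R8 (EIP), factored offset 1
        0x00, 0x00,              // DW_CFA_nop padding to a 4-byte multiple
    };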
bool ElfWriterQuick::Write(OatWriter* oat_writer,
const std::vector<const DexFile*>& dex_files_unused,
const std::string& android_root_unused,
bool is_host_unused) {
- const bool debug = false;
- const bool add_symbols = oat_writer->DidAddSymbols();
+ constexpr bool debug = false;
const OatHeader& oat_header = oat_writer->GetOatHeader();
Elf32_Word oat_data_size = oat_header.GetExecutableOffset();
uint32_t oat_exec_size = oat_writer->GetSize() - oat_data_size;
ElfBuilder builder(oat_writer, elf_file_, compiler_driver_->GetInstructionSet(), 0,
- oat_data_size, oat_data_size, oat_exec_size, add_symbols, debug);
+ oat_data_size, oat_data_size, oat_exec_size,
+ compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols(),
+ debug);
- if (add_symbols) {
- AddDebugSymbols(builder, oat_writer, debug);
- }
-
- bool generateDebugInformation = compiler_driver_->GetCallFrameInformation() != nullptr;
- if (generateDebugInformation) {
- ElfRawSectionBuilder debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
- ElfRawSectionBuilder debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
- ElfRawSectionBuilder debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
- ElfRawSectionBuilder eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0);
- eh_frame.SetBuffer(*compiler_driver_->GetCallFrameInformation());
-
- FillInCFIInformation(oat_writer, debug_info.GetBuffer(),
- debug_abbrev.GetBuffer(), debug_str.GetBuffer());
- builder.RegisterRawSection(debug_info);
- builder.RegisterRawSection(debug_abbrev);
- builder.RegisterRawSection(eh_frame);
- builder.RegisterRawSection(debug_str);
+ if (compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) {
+ WriteDebugSymbols(builder, oat_writer);
}
if (compiler_driver_->GetCompilerOptions().GetIncludePatchInformation()) {
@@ -865,32 +979,62 @@ bool ElfWriterQuick::Write(OatWriter* oat_writer,
return builder.Write();
}
-void ElfWriterQuick::AddDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer, bool debug) {
+void ElfWriterQuick::WriteDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer) {
+ std::unique_ptr<std::vector<uint8_t>> cfi_info(
+ ConstructCIEFrame(compiler_driver_->GetInstructionSet()));
+
+ // Iterate over the compiled methods.
const std::vector<OatWriter::DebugInfo>& method_info = oat_writer->GetCFIMethodInfo();
ElfSymtabBuilder* symtab = &builder.symtab_builder_;
for (auto it = method_info.begin(); it != method_info.end(); ++it) {
symtab->AddSymbol(it->method_name_, &builder.text_builder_, it->low_pc_, true,
it->high_pc_ - it->low_pc_, STB_GLOBAL, STT_FUNC);
- }
-}
-static void UpdateWord(std::vector<uint8_t>*buf, int offset, int data) {
- (*buf)[offset+0] = data;
- (*buf)[offset+1] = data >> 8;
- (*buf)[offset+2] = data >> 16;
- (*buf)[offset+3] = data >> 24;
-}
+ // Include CFI for compiled method, if possible.
+ if (cfi_info.get() != nullptr) {
+ DCHECK(it->compiled_method_ != nullptr);
+
+ // Copy in the FDE, if present.
+ const std::vector<uint8_t>* fde = it->compiled_method_->GetCFIInfo();
+ if (fde != nullptr) {
+ // Copy the information into cfi_info and then fix the address in the new copy.
+ int cur_offset = cfi_info->size();
+ cfi_info->insert(cfi_info->end(), fde->begin(), fde->end());
+
+ // Set the 'CIE_pointer' field to cur_offset+4.
+ uint32_t CIE_pointer = cur_offset + 4;
+ uint32_t offset_to_update = cur_offset + sizeof(uint32_t);
+ (*cfi_info)[offset_to_update+0] = CIE_pointer;
+ (*cfi_info)[offset_to_update+1] = CIE_pointer >> 8;
+ (*cfi_info)[offset_to_update+2] = CIE_pointer >> 16;
+ (*cfi_info)[offset_to_update+3] = CIE_pointer >> 24;
+
+ // Set the 'initial_location' field to address the start of the method.
+ offset_to_update = cur_offset + 2*sizeof(uint32_t);
+ const uint32_t quick_code_start = it->low_pc_;
+ (*cfi_info)[offset_to_update+0] = quick_code_start;
+ (*cfi_info)[offset_to_update+1] = quick_code_start >> 8;
+ (*cfi_info)[offset_to_update+2] = quick_code_start >> 16;
+ (*cfi_info)[offset_to_update+3] = quick_code_start >> 24;
+ }
+ }
+ }
-static void PushWord(std::vector<uint8_t>*buf, int data) {
- buf->push_back(data & 0xff);
- buf->push_back((data >> 8) & 0xff);
- buf->push_back((data >> 16) & 0xff);
- buf->push_back((data >> 24) & 0xff);
-}
+ if (cfi_info.get() != nullptr) {
+ // Now lay down the Elf sections.
+ ElfRawSectionBuilder debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+ ElfRawSectionBuilder debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+ ElfRawSectionBuilder debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+ ElfRawSectionBuilder eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0);
+ eh_frame.SetBuffer(std::move(*cfi_info));
-static void PushHalf(std::vector<uint8_t>*buf, int data) {
- buf->push_back(data & 0xff);
- buf->push_back((data >> 8) & 0xff);
+ FillInCFIInformation(oat_writer, debug_info.GetBuffer(), debug_abbrev.GetBuffer(),
+ debug_str.GetBuffer());
+ builder.RegisterRawSection(debug_info);
+ builder.RegisterRawSection(debug_abbrev);
+ builder.RegisterRawSection(eh_frame);
+ builder.RegisterRawSection(debug_str);
+ }
}
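
The in-place stores above target the two fields that follow the FDE's length word: the CIE pointer at cur_offset + 4 and the initial_location at cur_offset + 8. A compact sketch of the same fix-up, with a hypothetical little-endian store helper:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical helper: store a 32-bit value little-endian at an offset.
    static void StoreLE32(std::vector<uint8_t>* buf, size_t offset, uint32_t v) {
      for (int i = 0; i < 4; ++i) {
        (*buf)[offset + i] = static_cast<uint8_t>(v >> (8 * i));
      }
    }

    // Same fix-up WriteDebugSymbols performs after appending an FDE.
    static void FixUpFde(std::vector<uint8_t>* cfi_info, size_t cur_offset,
                         uint32_t quick_code_start) {
      StoreLE32(cfi_info, cur_offset + sizeof(uint32_t),
                static_cast<uint32_t>(cur_offset + 4));  // CIE_pointer
      StoreLE32(cfi_info, cur_offset + 2 * sizeof(uint32_t),
                quick_code_start);                       // initial_location
    }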
void ElfWriterQuick::FillInCFIInformation(OatWriter* oat_writer,
diff --git a/compiler/elf_writer_quick.h b/compiler/elf_writer_quick.h
index a0d36df..8cfe550 100644
--- a/compiler/elf_writer_quick.h
+++ b/compiler/elf_writer_quick.h
@@ -48,9 +48,7 @@ class ElfWriterQuick FINAL : public ElfWriter {
~ElfWriterQuick() {}
class ElfBuilder;
- void AddDebugSymbols(ElfBuilder& builder,
- OatWriter* oat_writer,
- bool debug);
+ void WriteDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer);
void ReservePatchSpace(std::vector<uint8_t>* buffer, bool debug);
class ElfSectionBuilder {
@@ -126,7 +124,7 @@ class ElfWriterQuick FINAL : public ElfWriter {
: ElfSectionBuilder(sec_name, type, flags, link, info, align, entsize) {}
~ElfRawSectionBuilder() {}
std::vector<uint8_t>* GetBuffer() { return &buf_; }
- void SetBuffer(std::vector<uint8_t> buf) { buf_ = buf; }
+ void SetBuffer(std::vector<uint8_t>&& buf) { buf_ = std::move(buf); }
protected:
std::vector<uint8_t> buf_;
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index 3005e56..6b23345 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -141,6 +141,8 @@ TEST_F(ImageTest, WriteRead) {
std::string image("-Ximage:");
image.append(image_location.GetFilename());
options.push_back(std::make_pair(image.c_str(), reinterpret_cast<void*>(NULL)));
+ // By default, the compiler this test creates will not include patch information.
+ options.push_back(std::make_pair("-Xnorelocate", nullptr));
if (!Runtime::Create(options, false)) {
LOG(FATAL) << "Failed to create runtime";
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 9da59ab..1ba5d32 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -357,7 +357,6 @@ class OatWriter::InitCodeMethodVisitor : public OatDexMethodVisitor {
uint32_t thumb_offset = compiled_method->CodeDelta();
quick_code_offset = offset_ + sizeof(OatQuickMethodHeader) + thumb_offset;
- bool force_debug_capture = false;
bool deduped = false;
// Deduplicate code arrays.
@@ -400,47 +399,22 @@ class OatWriter::InitCodeMethodVisitor : public OatDexMethodVisitor {
offset_ += code_size;
}
- uint32_t quick_code_start = quick_code_offset - writer_->oat_header_->GetExecutableOffset();
- std::vector<uint8_t>* cfi_info = writer_->compiler_driver_->GetCallFrameInformation();
- if (cfi_info != nullptr) {
- // Copy in the FDE, if present
- const std::vector<uint8_t>* fde = compiled_method->GetCFIInfo();
- if (fde != nullptr) {
- // Copy the information into cfi_info and then fix the address in the new copy.
- int cur_offset = cfi_info->size();
- cfi_info->insert(cfi_info->end(), fde->begin(), fde->end());
-
- // Set the 'CIE_pointer' field to cur_offset+4.
- uint32_t CIE_pointer = cur_offset + 4;
- uint32_t offset_to_update = cur_offset + sizeof(uint32_t);
- (*cfi_info)[offset_to_update+0] = CIE_pointer;
- (*cfi_info)[offset_to_update+1] = CIE_pointer >> 8;
- (*cfi_info)[offset_to_update+2] = CIE_pointer >> 16;
- (*cfi_info)[offset_to_update+3] = CIE_pointer >> 24;
-
- // Set the 'initial_location' field to address the start of the method.
- offset_to_update = cur_offset + 2*sizeof(uint32_t);
- (*cfi_info)[offset_to_update+0] = quick_code_start;
- (*cfi_info)[offset_to_update+1] = quick_code_start >> 8;
- (*cfi_info)[offset_to_update+2] = quick_code_start >> 16;
- (*cfi_info)[offset_to_update+3] = quick_code_start >> 24;
- force_debug_capture = true;
- }
- }
+ if (writer_->compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) {
+ // Record debug information for this function when debug symbols are requested.
-
- if (writer_->compiler_driver_->DidIncludeDebugSymbols() || force_debug_capture) {
- // Record debug information for this function if we are doing that or
- // we have CFI and so need it.
std::string name = PrettyMethod(it.GetMemberIndex(), *dex_file_, true);
if (deduped) {
- // TODO We should place the DEDUPED tag on the first instance of a
- // deduplicated symbol so that it will show up in a debuggerd crash
- // report.
+ // TODO We should place the DEDUPED tag on the first instance of a deduplicated symbol
+ // so that it will show up in a debuggerd crash report.
name += " [ DEDUPED ]";
}
- writer_->method_info_.push_back(DebugInfo(name, quick_code_start,
- quick_code_start + code_size));
+
+ const uint32_t quick_code_start = quick_code_offset -
+ writer_->oat_header_->GetExecutableOffset();
+ writer_->method_info_.push_back(DebugInfo(name,
+ quick_code_start,
+ quick_code_start + code_size,
+ compiled_method));
}
}
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 945048e..ef5fd6b 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -30,6 +30,7 @@
namespace art {
class BitVector;
+class CompiledMethod;
class OutputStream;
// OatHeader variable length with count of D OatDexFiles
@@ -97,22 +98,21 @@ class OatWriter {
~OatWriter();
struct DebugInfo {
- DebugInfo(const std::string& method_name, uint32_t low_pc, uint32_t high_pc)
- : method_name_(method_name), low_pc_(low_pc), high_pc_(high_pc) {
+ DebugInfo(const std::string& method_name, uint32_t low_pc, uint32_t high_pc,
+ CompiledMethod* compiled_method)
+ : method_name_(method_name), low_pc_(low_pc), high_pc_(high_pc),
+ compiled_method_(compiled_method) {
}
- std::string method_name_;
+ std::string method_name_; // Note: this is the pretty-printed method name.
uint32_t low_pc_;
uint32_t high_pc_;
+ CompiledMethod* compiled_method_;
};
const std::vector<DebugInfo>& GetCFIMethodInfo() const {
return method_info_;
}
- bool DidAddSymbols() const {
- return compiler_driver_->DidIncludeDebugSymbols();
- }
-
private:
// The DataAccess classes are helper classes that provide access to members related to
// a given map, i.e. GC map, mapping table or vmap table. By abstracting these away
diff --git a/compiler/utils/arena_allocator.h b/compiler/utils/arena_allocator.h
index f4bcb1d..7bfbb6f 100644
--- a/compiler/utils/arena_allocator.h
+++ b/compiler/utils/arena_allocator.h
@@ -24,6 +24,7 @@
#include "base/mutex.h"
#include "mem_map.h"
#include "utils.h"
+#include "utils/debug_stack.h"
namespace art {
@@ -34,6 +35,9 @@ class ArenaStack;
class ScopedArenaAllocator;
class MemStats;
+template <typename T>
+class ArenaAllocatorAdapter;
+
static constexpr bool kArenaAllocatorCountAllocations = false;
// Type of allocation for memory tuning.
@@ -147,11 +151,14 @@ class ArenaPool {
DISALLOW_COPY_AND_ASSIGN(ArenaPool);
};
-class ArenaAllocator : private ArenaAllocatorStats {
+class ArenaAllocator : private DebugStackRefCounter, private ArenaAllocatorStats {
public:
explicit ArenaAllocator(ArenaPool* pool);
~ArenaAllocator();
+ // Get adapter for use in STL containers. See arena_containers.h.
+ ArenaAllocatorAdapter<void> Adapter(ArenaAllocKind kind = kArenaAllocSTL);
+
// Returns zeroed memory.
void* Alloc(size_t bytes, ArenaAllocKind kind) ALWAYS_INLINE {
if (UNLIKELY(running_on_valgrind_)) {
@@ -190,6 +197,9 @@ class ArenaAllocator : private ArenaAllocatorStats {
Arena* arena_head_;
bool running_on_valgrind_;
+ template <typename U>
+ friend class ArenaAllocatorAdapter;
+
DISALLOW_COPY_AND_ASSIGN(ArenaAllocator);
}; // ArenaAllocator
diff --git a/compiler/utils/arena_containers.h b/compiler/utils/arena_containers.h
new file mode 100644
index 0000000..c48b0c8
--- /dev/null
+++ b/compiler/utils/arena_containers.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_ARENA_CONTAINERS_H_
+#define ART_COMPILER_UTILS_ARENA_CONTAINERS_H_
+
+#include <deque>
+#include <queue>
+#include <set>
+#include <vector>
+
+#include "utils/arena_allocator.h"
+#include "safe_map.h"
+
+namespace art {
+
+// Adapter for use of ArenaAllocator in STL containers.
+// Use ArenaAllocator::Adapter() to create an adapter to pass to container constructors.
+// For example,
+// struct Foo {
+// explicit Foo(ArenaAllocator* allocator)
+// : foo_vector(allocator->Adapter(kArenaAllocMisc)),
+// foo_map(std::less<int>(), allocator->Adapter()) {
+// }
+// ArenaVector<int> foo_vector;
+// ArenaSafeMap<int, int> foo_map;
+// };
+template <typename T>
+class ArenaAllocatorAdapter;
+
+template <typename T>
+using ArenaDeque = std::deque<T, ArenaAllocatorAdapter<T>>;
+
+template <typename T>
+using ArenaQueue = std::queue<T, ArenaDeque<T>>;
+
+template <typename T>
+using ArenaVector = std::vector<T, ArenaAllocatorAdapter<T>>;
+
+template <typename T, typename Comparator = std::less<T>>
+using ArenaSet = std::set<T, Comparator, ArenaAllocatorAdapter<T>>;
+
+template <typename K, typename V, typename Comparator = std::less<K>>
+using ArenaSafeMap =
+ SafeMap<K, V, Comparator, ArenaAllocatorAdapter<std::pair<const K, V>>>;
+
+// Implementation details below.
+
+template <bool kCount>
+class ArenaAllocatorAdapterKindImpl;
+
+template <>
+class ArenaAllocatorAdapterKindImpl<false> {
+ public:
+ // Not tracking allocations; ignore the supplied kind and arbitrarily provide kArenaAllocSTL.
+ explicit ArenaAllocatorAdapterKindImpl(ArenaAllocKind kind) { }
+ ArenaAllocatorAdapterKindImpl& operator=(const ArenaAllocatorAdapterKindImpl& other) = default;
+ ArenaAllocKind Kind() { return kArenaAllocSTL; }
+};
+
+template <bool kCount>
+class ArenaAllocatorAdapterKindImpl {
+ public:
+ explicit ArenaAllocatorAdapterKindImpl(ArenaAllocKind kind) : kind_(kind) { }
+ ArenaAllocatorAdapterKindImpl& operator=(const ArenaAllocatorAdapterKindImpl& other) = default;
+ ArenaAllocKind Kind() { return kind_; }
+
+ private:
+ ArenaAllocKind kind_;
+};
+
+typedef ArenaAllocatorAdapterKindImpl<kArenaAllocatorCountAllocations> ArenaAllocatorAdapterKind;
+
+template <>
+class ArenaAllocatorAdapter<void>
+ : private DebugStackReference, private ArenaAllocatorAdapterKind {
+ public:
+ typedef void value_type;
+ typedef void* pointer;
+ typedef const void* const_pointer;
+
+ template <typename U>
+ struct rebind {
+ typedef ArenaAllocatorAdapter<U> other;
+ };
+
+ explicit ArenaAllocatorAdapter(ArenaAllocator* arena_allocator,
+ ArenaAllocKind kind = kArenaAllocSTL)
+ : DebugStackReference(arena_allocator),
+ ArenaAllocatorAdapterKind(kind),
+ arena_allocator_(arena_allocator) {
+ }
+ template <typename U>
+ ArenaAllocatorAdapter(const ArenaAllocatorAdapter<U>& other)
+ : DebugStackReference(other),
+ ArenaAllocatorAdapterKind(other),
+ arena_allocator_(other.arena_allocator_) {
+ }
+ ArenaAllocatorAdapter(const ArenaAllocatorAdapter& other) = default;
+ ArenaAllocatorAdapter& operator=(const ArenaAllocatorAdapter& other) = default;
+ ~ArenaAllocatorAdapter() = default;
+
+ private:
+ ArenaAllocator* arena_allocator_;
+
+ template <typename U>
+ friend class ArenaAllocatorAdapter;
+};
+
+template <typename T>
+class ArenaAllocatorAdapter : private DebugStackReference, private ArenaAllocatorAdapterKind {
+ public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef T& reference;
+ typedef const T* const_pointer;
+ typedef const T& const_reference;
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+
+ template <typename U>
+ struct rebind {
+ typedef ArenaAllocatorAdapter<U> other;
+ };
+
+ explicit ArenaAllocatorAdapter(ArenaAllocator* arena_allocator, ArenaAllocKind kind)
+ : DebugStackReference(arena_allocator),
+ ArenaAllocatorAdapterKind(kind),
+ arena_allocator_(arena_allocator) {
+ }
+ template <typename U>
+ ArenaAllocatorAdapter(const ArenaAllocatorAdapter<U>& other)
+ : DebugStackReference(other),
+ ArenaAllocatorAdapterKind(other),
+ arena_allocator_(other.arena_allocator_) {
+ }
+ ArenaAllocatorAdapter(const ArenaAllocatorAdapter& other) = default;
+ ArenaAllocatorAdapter& operator=(const ArenaAllocatorAdapter& other) = default;
+ ~ArenaAllocatorAdapter() = default;
+
+ size_type max_size() const {
+ return static_cast<size_type>(-1) / sizeof(T);
+ }
+
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const { return &x; }
+
+ pointer allocate(size_type n, ArenaAllocatorAdapter<void>::pointer hint = nullptr) {
+ DCHECK_LE(n, max_size());
+ return reinterpret_cast<T*>(arena_allocator_->Alloc(n * sizeof(T),
+ ArenaAllocatorAdapterKind::Kind()));
+ }
+ void deallocate(pointer p, size_type n) {
+ }
+
+ void construct(pointer p, const_reference val) {
+ new (static_cast<void*>(p)) value_type(val);
+ }
+ void destroy(pointer p) {
+ p->~value_type();
+ }
+
+ private:
+ ArenaAllocator* arena_allocator_;
+
+ template <typename U>
+ friend class ArenaAllocatorAdapter;
+
+ template <typename U>
+ friend bool operator==(const ArenaAllocatorAdapter<U>& lhs,
+ const ArenaAllocatorAdapter<U>& rhs);
+};
+
+template <typename T>
+inline bool operator==(const ArenaAllocatorAdapter<T>& lhs,
+ const ArenaAllocatorAdapter<T>& rhs) {
+ return lhs.arena_allocator_ == rhs.arena_allocator_;
+}
+
+template <typename T>
+inline bool operator!=(const ArenaAllocatorAdapter<T>& lhs,
+ const ArenaAllocatorAdapter<T>& rhs) {
+ return !(lhs == rhs);
+}
+
+inline ArenaAllocatorAdapter<void> ArenaAllocator::Adapter(ArenaAllocKind kind) {
+ return ArenaAllocatorAdapter<void>(this, kind);
+}
+
+} // namespace art
+
+#endif // ART_COMPILER_UTILS_ARENA_CONTAINERS_H_
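
As a minimal sketch of the allocator-adapter pattern this header follows, with a simplified bump arena standing in for ArenaAllocator (all names below are hypothetical), showing why deallocate() can be a no-op:

    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <vector>

    // Simplified stand-in for ArenaAllocator: allocation only, no per-object free.
    class BumpArena {
     public:
      void* Alloc(size_t bytes) {
        blocks_.emplace_back(new uint8_t[bytes]);
        return blocks_.back().get();
      }

     private:
      std::vector<std::unique_ptr<uint8_t[]>> blocks_;  // Freed together at destruction.
    };

    template <typename T>
    class BumpAdapter {
     public:
      using value_type = T;
      explicit BumpAdapter(BumpArena* arena) : arena_(arena) {}
      template <typename U>
      BumpAdapter(const BumpAdapter<U>& other) : arena_(other.arena_) {}
      T* allocate(size_t n) { return static_cast<T*>(arena_->Alloc(n * sizeof(T))); }
      void deallocate(T*, size_t) {}  // No-op, like the adapters above.

      BumpArena* arena_;
    };

    template <typename T, typename U>
    bool operator==(const BumpAdapter<T>& a, const BumpAdapter<U>& b) {
      return a.arena_ == b.arena_;
    }
    template <typename T, typename U>
    bool operator!=(const BumpAdapter<T>& a, const BumpAdapter<U>& b) {
      return !(a == b);
    }

    int main() {
      BumpArena arena;
      std::vector<int, BumpAdapter<int>> v{BumpAdapter<int>(&arena)};
      for (int i = 0; i < 100; ++i) {
        v.push_back(i);  // Growth allocates new blocks; old ones stay in the arena.
      }
      return (v.back() == 99) ? 0 : 1;
    }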
diff --git a/compiler/utils/scoped_arena_allocator.h b/compiler/utils/scoped_arena_allocator.h
index 9f33f2d..62ea330 100644
--- a/compiler/utils/scoped_arena_allocator.h
+++ b/compiler/utils/scoped_arena_allocator.h
@@ -120,8 +120,8 @@ class ScopedArenaAllocator
return arena_stack_->Alloc(bytes, kind);
}
- // ScopedArenaAllocatorAdapter is incomplete here, we need to define this later.
- ScopedArenaAllocatorAdapter<void> Adapter();
+ // Get adapter for use in STL containers. See scoped_arena_containers.h.
+ ScopedArenaAllocatorAdapter<void> Adapter(ArenaAllocKind kind = kArenaAllocSTL);
// Allow a delete-expression to destroy but not deallocate allocators created by Create().
static void operator delete(void* ptr) { UNUSED(ptr); }
@@ -138,125 +138,6 @@ class ScopedArenaAllocator
DISALLOW_COPY_AND_ASSIGN(ScopedArenaAllocator);
};
-template <>
-class ScopedArenaAllocatorAdapter<void>
- : private DebugStackReference, private DebugStackIndirectTopRef {
- public:
- typedef void value_type;
- typedef void* pointer;
- typedef const void* const_pointer;
-
- template <typename U>
- struct rebind {
- typedef ScopedArenaAllocatorAdapter<U> other;
- };
-
- explicit ScopedArenaAllocatorAdapter(ScopedArenaAllocator* arena_allocator)
- : DebugStackReference(arena_allocator),
- DebugStackIndirectTopRef(arena_allocator),
- arena_stack_(arena_allocator->arena_stack_) {
- }
- template <typename U>
- ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter<U>& other)
- : DebugStackReference(other),
- DebugStackIndirectTopRef(other),
- arena_stack_(other.arena_stack_) {
- }
- ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter& other) = default;
- ScopedArenaAllocatorAdapter& operator=(const ScopedArenaAllocatorAdapter& other) = default;
- ~ScopedArenaAllocatorAdapter() = default;
-
- private:
- ArenaStack* arena_stack_;
-
- template <typename U>
- friend class ScopedArenaAllocatorAdapter;
-};
-
-// Adapter for use of ScopedArenaAllocator in STL containers.
-template <typename T>
-class ScopedArenaAllocatorAdapter : private DebugStackReference, private DebugStackIndirectTopRef {
- public:
- typedef T value_type;
- typedef T* pointer;
- typedef T& reference;
- typedef const T* const_pointer;
- typedef const T& const_reference;
- typedef size_t size_type;
- typedef ptrdiff_t difference_type;
-
- template <typename U>
- struct rebind {
- typedef ScopedArenaAllocatorAdapter<U> other;
- };
-
- explicit ScopedArenaAllocatorAdapter(ScopedArenaAllocator* arena_allocator)
- : DebugStackReference(arena_allocator),
- DebugStackIndirectTopRef(arena_allocator),
- arena_stack_(arena_allocator->arena_stack_) {
- }
- template <typename U>
- ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter<U>& other)
- : DebugStackReference(other),
- DebugStackIndirectTopRef(other),
- arena_stack_(other.arena_stack_) {
- }
- ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter& other) = default;
- ScopedArenaAllocatorAdapter& operator=(const ScopedArenaAllocatorAdapter& other) = default;
- ~ScopedArenaAllocatorAdapter() = default;
-
- size_type max_size() const {
- return static_cast<size_type>(-1) / sizeof(T);
- }
-
- pointer address(reference x) const { return &x; }
- const_pointer address(const_reference x) const { return &x; }
-
- pointer allocate(size_type n, ScopedArenaAllocatorAdapter<void>::pointer hint = nullptr) {
- DCHECK_LE(n, max_size());
- DebugStackIndirectTopRef::CheckTop();
- return reinterpret_cast<T*>(arena_stack_->Alloc(n * sizeof(T), kArenaAllocSTL));
- }
- void deallocate(pointer p, size_type n) {
- DebugStackIndirectTopRef::CheckTop();
- }
-
- void construct(pointer p, const_reference val) {
- // Don't CheckTop(), allow reusing existing capacity of a vector/deque below the top.
- new (static_cast<void*>(p)) value_type(val);
- }
- void destroy(pointer p) {
- // Don't CheckTop(), allow reusing existing capacity of a vector/deque below the top.
- p->~value_type();
- }
-
- private:
- ArenaStack* arena_stack_;
-
- template <typename U>
- friend class ScopedArenaAllocatorAdapter;
-
- template <typename U>
- friend bool operator==(const ScopedArenaAllocatorAdapter<U>& lhs,
- const ScopedArenaAllocatorAdapter<U>& rhs);
-};
-
-template <typename T>
-inline bool operator==(const ScopedArenaAllocatorAdapter<T>& lhs,
- const ScopedArenaAllocatorAdapter<T>& rhs) {
- return lhs.arena_stack_ == rhs.arena_stack_;
-}
-
-template <typename T>
-inline bool operator!=(const ScopedArenaAllocatorAdapter<T>& lhs,
- const ScopedArenaAllocatorAdapter<T>& rhs) {
- return !(lhs == rhs);
-}
-
-inline ScopedArenaAllocatorAdapter<void> ScopedArenaAllocator::Adapter() {
- return ScopedArenaAllocatorAdapter<void>(this);
-}
-
} // namespace art
#endif // ART_COMPILER_UTILS_SCOPED_ARENA_ALLOCATOR_H_
diff --git a/compiler/utils/scoped_arena_containers.h b/compiler/utils/scoped_arena_containers.h
index 6728565..0de7403 100644
--- a/compiler/utils/scoped_arena_containers.h
+++ b/compiler/utils/scoped_arena_containers.h
@@ -22,11 +22,23 @@
#include <set>
#include <vector>
+#include "utils/arena_containers.h" // For ArenaAllocatorAdapterKind.
#include "utils/scoped_arena_allocator.h"
#include "safe_map.h"
namespace art {
+// Adapter for use of ScopedArenaAllocator in STL containers.
+// Use ScopedArenaAllocator::Adapter() to create an adapter to pass to container constructors.
+// For example,
+// void foo(ScopedArenaAllocator* allocator) {
+// ScopedArenaVector<int> foo_vector(allocator->Adapter(kArenaAllocMisc));
+// ScopedArenaSafeMap<int, int> foo_map(std::less<int>(), allocator->Adapter());
+// // Use foo_vector and foo_map...
+// }
+template <typename T>
+class ScopedArenaAllocatorAdapter;
+
template <typename T>
using ScopedArenaDeque = std::deque<T, ScopedArenaAllocatorAdapter<T>>;
@@ -43,6 +55,136 @@ template <typename K, typename V, typename Comparator = std::less<K>>
using ScopedArenaSafeMap =
SafeMap<K, V, Comparator, ScopedArenaAllocatorAdapter<std::pair<const K, V>>>;
+// Implementation details below.
+
+template <>
+class ScopedArenaAllocatorAdapter<void>
+ : private DebugStackReference, private DebugStackIndirectTopRef,
+ private ArenaAllocatorAdapterKind {
+ public:
+ typedef void value_type;
+ typedef void* pointer;
+ typedef const void* const_pointer;
+
+ template <typename U>
+ struct rebind {
+ typedef ScopedArenaAllocatorAdapter<U> other;
+ };
+
+ explicit ScopedArenaAllocatorAdapter(ScopedArenaAllocator* arena_allocator,
+ ArenaAllocKind kind = kArenaAllocSTL)
+ : DebugStackReference(arena_allocator),
+ DebugStackIndirectTopRef(arena_allocator),
+ ArenaAllocatorAdapterKind(kind),
+ arena_stack_(arena_allocator->arena_stack_) {
+ }
+ template <typename U>
+ ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter<U>& other)
+ : DebugStackReference(other),
+ DebugStackIndirectTopRef(other),
+ ArenaAllocatorAdapterKind(other),
+ arena_stack_(other.arena_stack_) {
+ }
+ ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter& other) = default;
+ ScopedArenaAllocatorAdapter& operator=(const ScopedArenaAllocatorAdapter& other) = default;
+ ~ScopedArenaAllocatorAdapter() = default;
+
+ private:
+ ArenaStack* arena_stack_;
+
+ template <typename U>
+ friend class ScopedArenaAllocatorAdapter;
+};
+
+template <typename T>
+class ScopedArenaAllocatorAdapter
+ : private DebugStackReference, private DebugStackIndirectTopRef,
+ private ArenaAllocatorAdapterKind {
+ public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef T& reference;
+ typedef const T* const_pointer;
+ typedef const T& const_reference;
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+
+ template <typename U>
+ struct rebind {
+ typedef ScopedArenaAllocatorAdapter<U> other;
+ };
+
+ explicit ScopedArenaAllocatorAdapter(ScopedArenaAllocator* arena_allocator,
+ ArenaAllocKind kind = kArenaAllocSTL)
+ : DebugStackReference(arena_allocator),
+ DebugStackIndirectTopRef(arena_allocator),
+ ArenaAllocatorAdapterKind(kind),
+ arena_stack_(arena_allocator->arena_stack_) {
+ }
+ template <typename U>
+ ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter<U>& other)
+ : DebugStackReference(other),
+ DebugStackIndirectTopRef(other),
+ ArenaAllocatorAdapterKind(other),
+ arena_stack_(other.arena_stack_) {
+ }
+ ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter& other) = default;
+ ScopedArenaAllocatorAdapter& operator=(const ScopedArenaAllocatorAdapter& other) = default;
+ ~ScopedArenaAllocatorAdapter() = default;
+
+ size_type max_size() const {
+ return static_cast<size_type>(-1) / sizeof(T);
+ }
+
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const { return &x; }
+
+ pointer allocate(size_type n, ScopedArenaAllocatorAdapter<void>::pointer hint = nullptr) {
+ DCHECK_LE(n, max_size());
+ DebugStackIndirectTopRef::CheckTop();
+ return reinterpret_cast<T*>(arena_stack_->Alloc(n * sizeof(T),
+ ArenaAllocatorAdapterKind::Kind()));
+ }
+ void deallocate(pointer p, size_type n) {
+ DebugStackIndirectTopRef::CheckTop();
+ }
+
+ void construct(pointer p, const_reference val) {
+ // Don't CheckTop(), allow reusing existing capacity of a vector/deque below the top.
+ new (static_cast<void*>(p)) value_type(val);
+ }
+ void destroy(pointer p) {
+ // Don't CheckTop(), allow reusing existing capacity of a vector/deque below the top.
+ p->~value_type();
+ }
+
+ private:
+ ArenaStack* arena_stack_;
+
+ template <typename U>
+ friend class ScopedArenaAllocatorAdapter;
+
+ template <typename U>
+ friend bool operator==(const ScopedArenaAllocatorAdapter<U>& lhs,
+ const ScopedArenaAllocatorAdapter<U>& rhs);
+};
+
+template <typename T>
+inline bool operator==(const ScopedArenaAllocatorAdapter<T>& lhs,
+ const ScopedArenaAllocatorAdapter<T>& rhs) {
+ return lhs.arena_stack_ == rhs.arena_stack_;
+}
+
+template <typename T>
+inline bool operator!=(const ScopedArenaAllocatorAdapter<T>& lhs,
+ const ScopedArenaAllocatorAdapter<T>& rhs) {
+ return !(lhs == rhs);
+}
+
+inline ScopedArenaAllocatorAdapter<void> ScopedArenaAllocator::Adapter(ArenaAllocKind kind) {
+ return ScopedArenaAllocatorAdapter<void>(this, kind);
+}
+
} // namespace art
#endif // ART_COMPILER_UTILS_SCOPED_ARENA_CONTAINERS_H_
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 0d14376..7684271 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -283,8 +283,8 @@ void X86_64Assembler::movw(CpuRegister /*dst*/, const Address& /*src*/) {
void X86_64Assembler::movw(const Address& dst, CpuRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
- EmitOptionalRex32(src, dst);
EmitOperandSizeOverride();
+ EmitOptionalRex32(src, dst);
EmitUint8(0x89);
EmitOperand(src.LowBits(), dst);
}
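
The reordering matters because 0x66 is a legacy prefix while REX must immediately precede the opcode; a REX byte followed by another prefix is ignored by the processor, dropping the high-register bits. The expected encoding for the Movw test case further below (movw %r9w, 0(%rax)), worked out by hand:

    #include <cstdint>

    // movw %r9w, 0(%rax) -- legacy 0x66 prefix first, then REX.
    const uint8_t kMovwR9wToRax[] = {
        0x66,  // operand-size override (16-bit operand)
        0x44,  // REX.R: reg field selects r9
        0x89,  // MOV r/m16, r16
        0x08,  // ModRM: mod=00, reg=001 (r9 low bits), rm=000 (rax)
    };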
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 1d6655c..2f814df 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -124,8 +124,8 @@ class Operand {
if (index.NeedsRex()) {
rex_ |= 0x42; // REX.00X0
}
- encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.AsRegister()) << 3) |
- static_cast<uint8_t>(base.AsRegister());
+ encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.LowBits()) << 3) |
+ static_cast<uint8_t>(base.LowBits());
length_ = 2;
}
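
The SIB byte only has three bits each for index and base; the fourth bit of r8-r15 travels in REX.X and REX.B (set just above), so these fields must be masked to the low bits. A standalone check of the arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t r9 = 9;             // hardware register number
      const uint8_t low_bits = r9 & 7;  // LowBits(): 1, paired with REX.X
      const uint8_t scale = 2;          // log2(4), i.e. TIMES_4
      uint8_t sib = (scale << 6) | (low_bits << 3) | 7 /* base: rdi low bits */;
      assert(sib == 0x8f);
      // Using the full register number instead (9 << 3 == 0x48) would set
      // bit 6 and corrupt the scale field -- the bug fixed above.
      return 0;
    }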
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index dc1758f..4ed7b20 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -128,13 +128,29 @@ TEST_F(AssemblerX86_64Test, XorqImm) {
TEST_F(AssemblerX86_64Test, Movl) {
GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::CpuRegister(x86_64::R11));
GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::CpuRegister(x86_64::R11));
+ GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
+ x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+ GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
+ x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+ GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::Address(
+ x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
const char* expected =
"movl %R11d, %R8d\n"
- "movl %R11d, %EAX\n";
+ "movl %R11d, %EAX\n"
+ "movl 0xc(%RDI,%RBX,4), %EAX\n"
+ "movl 0xc(%RDI,%R9,4), %EAX\n"
+ "movl 0xc(%RDI,%R9,4), %R8d\n";
DriverStr(expected, "movl");
}
+TEST_F(AssemblerX86_64Test, Movw) {
+ GetAssembler()->movw(x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
+ x86_64::CpuRegister(x86_64::R9));
+ const char* expected = "movw %R9w, 0(%RAX)\n";
+ DriverStr(expected, "movw");
+}
+
std::string setcc_test_fn(x86_64::X86_64Assembler* assembler) {
// From Condition
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index a78d3f7..0437f30 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -430,8 +430,7 @@ class Dex2Oat {
t2.NewTiming("Patching ELF");
std::string error_msg;
if (!PatchOatCode(driver.get(), oat_file, oat_location, &error_msg)) {
- LOG(ERROR) << "Failed to fixup ELF file " << oat_file->GetPath();
- LOG(ERROR) << "Error was: " << error_msg;
+ LOG(ERROR) << "Failed to fixup ELF file " << oat_file->GetPath() << ": " << error_msg;
return nullptr;
}
}
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 101a55d..0ca8962 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -268,7 +268,7 @@ DISASSEMBLER_ENTRY(cmp,
target_specific = true;
break;
case 0x63:
- if (rex == 0x48) {
+ if ((rex & REX_W) != 0) {
opcode << "movsxd";
has_modrm = true;
load = true;
@@ -959,7 +959,7 @@ DISASSEMBLER_ENTRY(cmp,
byte_operand = true;
break;
case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF:
- if (rex == 0x48) {
+ if ((rex & REX_W) != 0) {
opcode << "movabsq";
immediate_bytes = 8;
reg_in_opcode = true;
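
Here rex holds the whole prefix byte (0x40-0x4f), so an equality test against 0x48 misses prefixes that combine W with other bits; for instance, movabsq into %r10 carries REX = 0x49 (W and B set). A small sketch, assuming REX_W is the 0x08 bit as in the disassembler:

    #include <cstdint>

    constexpr uint8_t REX_W = 0x08;  // W bit within a 0x4_ prefix byte (assumed).

    constexpr bool IsRexW(uint8_t rex) {
      return (rex & REX_W) != 0;
    }

    static_assert(IsRexW(0x48) && IsRexW(0x49), "mask accepts combined prefixes");
    static_assert(!IsRexW(0x41), "REX.B alone does not imply a 64-bit operand");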
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 8fc5e34..302e835 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -252,6 +252,7 @@ LIBART_SRC_FILES_x86_64 := \
arch/x86_64/context_x86_64.cc \
arch/x86_64/entrypoints_init_x86_64.cc \
arch/x86_64/jni_entrypoints_x86_64.S \
+ arch/x86_64/memcmp16_x86_64.S \
arch/x86_64/portable_entrypoints_x86_64.S \
arch/x86_64/quick_entrypoints_x86_64.S \
arch/x86_64/thread_x86_64.cc \
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 4939610..86cb16a 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -365,8 +365,9 @@ END art_quick_invoke_stub
ARM_ENTRY art_quick_do_long_jump
vldm r1, {s0-s31} @ load all fprs from argument fprs_
ldr r2, [r0, #60] @ r2 = r15 (PC from gprs_ 60=4*15)
+ ldr r14, [r0, #56] @ (LR from gprs_ 56=4*14)
add r0, r0, #12 @ increment r0 to skip gprs_[0..2] 12=4*3
- ldm r0, {r3-r14} @ load remaining gprs from argument gprs_
+ ldm r0, {r3-r13} @ load remaining gprs from argument gprs_
mov r0, #0 @ clear result registers r0 and r1
mov r1, #0
bx r2 @ do long jump
diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h
index 65d2f92..14dc1e3 100644
--- a/runtime/arch/memcmp16.h
+++ b/runtime/arch/memcmp16.h
@@ -30,7 +30,7 @@
//
// In both cases, MemCmp16 is declared.
-#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__)
+#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) || defined(__x86_64__)
extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count);
#define MemCmp16 __memcmp16
diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S
index 17662fa..a315a37 100644
--- a/runtime/arch/x86/memcmp16_x86.S
+++ b/runtime/arch/x86/memcmp16_x86.S
@@ -21,1018 +21,1018 @@
/* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */
#ifndef L
-# define L(label) .L##label
+# define L(label) .L##label
#endif
-#define CFI_PUSH(REG) \
- CFI_ADJUST_CFA_OFFSET(4); \
- CFI_REL_OFFSET(REG, 0)
+#define CFI_PUSH(REG) \
+ CFI_ADJUST_CFA_OFFSET(4); \
+ CFI_REL_OFFSET(REG, 0)
-#define CFI_POP(REG) \
- CFI_ADJUST_CFA_OFFSET(-4); \
- CFI_RESTORE(REG)
+#define CFI_POP(REG) \
+ CFI_ADJUST_CFA_OFFSET(-4); \
+ CFI_RESTORE(REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
+#define PARMS 4
+#define BLK1 PARMS
+#define BLK2 BLK1+4
+#define LEN BLK2+4
+#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
DEFINE_FUNCTION MEMCMP
- movl LEN(%esp), %ecx
+ movl LEN(%esp), %ecx
- shl $1, %ecx
- jz L(zero)
+ shl $1, %ecx
+ jz L(zero)
- movl BLK1(%esp), %eax
- cmp $48, %ecx
- movl BLK2(%esp), %edx
- jae L(48bytesormore)
+ movl BLK1(%esp), %eax
+ cmp $48, %ecx
+ movl BLK2(%esp), %edx
+ jae L(48bytesormore)
- PUSH (%ebx)
- add %ecx, %edx
- add %ecx, %eax
- jmp L(less48bytes)
+ PUSH (%ebx)
+ add %ecx, %edx
+ add %ecx, %eax
+ jmp L(less48bytes)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
- .p2align 4
+ .p2align 4
L(zero):
- xor %eax, %eax
- ret
+ xor %eax, %eax
+ ret
- .p2align 4
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
- CFI_REMEMBER_STATE
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
- movl %eax, %edi
- movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%edi), %edi
-
- sub $0xffff, %edx
- lea 16(%esi), %esi
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %edx, %edi
- sub %edx, %esi
- add %edx, %ecx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %edx, %esi
-
- cmp $0, %edx
- je L(shr_0)
- cmp $2, %edx
- je L(shr_2)
- cmp $4, %edx
- je L(shr_4)
- cmp $6, %edx
- je L(shr_6)
- cmp $8, %edx
- je L(shr_8)
- cmp $10, %edx
- je L(shr_10)
- cmp $12, %edx
- je L(shr_12)
- jmp L(shr_14)
-
- .p2align 4
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
+ CFI_REMEMBER_STATE
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
+ movl %eax, %edi
+ movl %edx, %esi
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%edi), %edi
+
+ sub $0xffff, %edx
+ lea 16(%esi), %esi
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %edx, %edi
+ sub %edx, %esi
+ add %edx, %ecx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %edx, %esi
+
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $6, %edx
+ je L(shr_6)
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $12, %edx
+ je L(shr_12)
+ jmp L(shr_14)
+
+ .p2align 4
L(shr_0):
- cmp $80, %ecx
- jae L(shr_0_gobble)
- lea -48(%ecx), %ecx
- xor %eax, %eax
- movaps (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
- movaps 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- add $32, %edi
- add $32, %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ jae L(shr_0_gobble)
+ lea -48(%ecx), %ecx
+ xor %eax, %eax
+ movaps (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+ movaps 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ add $32, %edi
+ add $32, %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_0_gobble):
- lea -48(%ecx), %ecx
- movdqa (%esi), %xmm0
- xor %eax, %eax
- pcmpeqb (%edi), %xmm0
- sub $32, %ecx
- movdqa 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
+ lea -48(%ecx), %ecx
+ movdqa (%esi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %ecx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%esi), %xmm0
- movdqa 48(%esi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%edi), %xmm0
- pcmpeqb 48(%edi), %xmm2
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %ecx
- jge L(shr_0_gobble_loop_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm2
+ sub $32, %ecx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%esi), %xmm0
+ movdqa 48(%esi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%edi), %xmm0
+ pcmpeqb 48(%edi), %xmm2
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ jz L(shr_0_gobble_loop)
+
+ pand %xmm0, %xmm2
+ cmp $0, %ecx
+ jge L(shr_0_gobble_loop_next)
+ inc %edx
+ add $32, %ecx
L(shr_0_gobble_loop_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_2):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $2,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $2,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_2_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $2,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $2,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $2,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $2,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $2,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $2,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $2,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $2,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $4,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $4,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $4,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $4,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $4,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $4,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $4,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $4,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $4,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $4,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $6,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $6,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $6,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $6,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $6,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $6,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $6,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $6,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $6,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $6,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $8,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $8,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $8,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $8,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $8,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $8,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $8,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $8,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $8,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $8,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $10,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $10,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $10, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $10, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $10, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $10, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $10,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $10,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $10,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $10,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $12, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $12, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $12, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $12, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $12,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $12,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $12,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $12,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $14, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $14, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $14, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $14, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $14,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $14,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $14,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $14,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(exit):
- pmovmskb %xmm1, %ebx
- sub $0xffff, %ebx
- jz L(first16bytes)
- lea -16(%esi), %esi
- lea -16(%edi), %edi
- mov %ebx, %edx
+ pmovmskb %xmm1, %ebx
+ sub $0xffff, %ebx
+ jz L(first16bytes)
+ lea -16(%esi), %esi
+ lea -16(%edi), %edi
+ mov %ebx, %edx
L(first16bytes):
- add %eax, %esi
+ add %eax, %esi
L(less16bytes):
- test %dl, %dl
- jz L(next_four_words)
- test $15, %dl
- jz L(second_two_words)
- test $3, %dl
- jz L(second_word)
- movzwl -16(%edi), %eax
- movzwl -16(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test %dl, %dl
+ jz L(next_four_words)
+ test $15, %dl
+ jz L(second_two_words)
+ test $3, %dl
+ jz L(second_word)
+ movzwl -16(%edi), %eax
+ movzwl -16(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(second_word):
- movzwl -14(%edi), %eax
- movzwl -14(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -14(%edi), %eax
+ movzwl -14(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(second_two_words):
- test $63, %dl
- jz L(fourth_word)
- movzwl -12(%edi), %eax
- movzwl -12(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test $63, %dl
+ jz L(fourth_word)
+ movzwl -12(%edi), %eax
+ movzwl -12(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(fourth_word):
- movzwl -10(%edi), %eax
- movzwl -10(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -10(%edi), %eax
+ movzwl -10(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(next_four_words):
- test $15, %dh
- jz L(fourth_two_words)
- test $3, %dh
- jz L(sixth_word)
- movzwl -8(%edi), %eax
- movzwl -8(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test $15, %dh
+ jz L(fourth_two_words)
+ test $3, %dh
+ jz L(sixth_word)
+ movzwl -8(%edi), %eax
+ movzwl -8(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(sixth_word):
- movzwl -6(%edi), %eax
- movzwl -6(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -6(%edi), %eax
+ movzwl -6(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(fourth_two_words):
- test $63, %dh
- jz L(eighth_word)
- movzwl -4(%edi), %eax
- movzwl -4(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test $63, %dh
+ jz L(eighth_word)
+ movzwl -4(%edi), %eax
+ movzwl -4(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(eighth_word):
- movzwl -2(%edi), %eax
- movzwl -2(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -2(%edi), %eax
+ movzwl -2(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
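
    The exit sequence above locates the first differing 16-bit unit by probing the
    comparison mask left in %edx: after the "sub $0xffff", the low bits of %edx stay
    zero up to the first mismatching byte, so the "test $3 / $15 / $63" checks on
    %dl and %dh select the right word handler. A minimal C++ sketch of the same
    idea, assuming a little-endian layout and GCC/Clang builtins (function and
    parameter names are illustrative, not part of this patch):

    // Sketch only: given the pmovmskb equality mask for one 16-byte block
    // (bit i set means byte i compared equal), return the signed difference
    // of the first differing 16-bit unit, as the handlers above do.
    #include <cstdint>
    #include <cstring>

    int32_t DiffIn16Bytes(const uint8_t* a, const uint8_t* b, uint16_t equal_mask) {
      uint16_t differ = static_cast<uint16_t>(~equal_mask);  // precondition: != 0
      int word = __builtin_ctz(differ) / 2;  // index of the differing uint16_t
      uint16_t lhs, rhs;
      std::memcpy(&lhs, a + 2 * word, sizeof(lhs));
      std::memcpy(&rhs, b + 2 * word, sizeof(rhs));
      return static_cast<int32_t>(lhs) - static_cast<int32_t>(rhs);
    }
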
- CFI_PUSH (%ebx)
+ CFI_PUSH (%ebx)
- .p2align 4
+ .p2align 4
L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $12, %ecx
- je L(12bytes)
- jmp L(14bytes)
-
- .p2align 4
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ jmp L(14bytes)
+
+ .p2align 4
L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $20, %ecx
- je L(20bytes)
- jmp L(22bytes)
-
- .p2align 4
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ jmp L(22bytes)
+
+ .p2align 4
L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $28, %ecx
- je L(28bytes)
- jmp L(30bytes)
-
- .p2align 4
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ jmp L(30bytes)
+
+ .p2align 4
L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $36, %ecx
- je L(36bytes)
- jmp L(38bytes)
-
- .p2align 4
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ jmp L(38bytes)
+
+ .p2align 4
L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $4, %ecx
- je L(4bytes)
- jmp L(6bytes)
-
- .p2align 4
+ cmp $8, %ecx
+ jae L(more8bytes)
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ jmp L(6bytes)
+
+ .p2align 4
L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $44, %ecx
- je L(44bytes)
- jmp L(46bytes)
-
- .p2align 4
+ cmp $40, %ecx
+ je L(40bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ jmp L(46bytes)
+
+ .p2align 4
L(46bytes):
- movzwl -46(%eax), %ecx
- movzwl -46(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -46(%eax), %ecx
+ movzwl -46(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(44bytes):
- movzwl -44(%eax), %ecx
- movzwl -44(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -44(%eax), %ecx
+ movzwl -44(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(42bytes):
- movzwl -42(%eax), %ecx
- movzwl -42(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -42(%eax), %ecx
+ movzwl -42(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(40bytes):
- movzwl -40(%eax), %ecx
- movzwl -40(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -40(%eax), %ecx
+ movzwl -40(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(38bytes):
- movzwl -38(%eax), %ecx
- movzwl -38(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -38(%eax), %ecx
+ movzwl -38(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(36bytes):
- movzwl -36(%eax), %ecx
- movzwl -36(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -36(%eax), %ecx
+ movzwl -36(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(34bytes):
- movzwl -34(%eax), %ecx
- movzwl -34(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -34(%eax), %ecx
+ movzwl -34(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(32bytes):
- movzwl -32(%eax), %ecx
- movzwl -32(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -32(%eax), %ecx
+ movzwl -32(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(30bytes):
- movzwl -30(%eax), %ecx
- movzwl -30(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -30(%eax), %ecx
+ movzwl -30(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(28bytes):
- movzwl -28(%eax), %ecx
- movzwl -28(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -28(%eax), %ecx
+ movzwl -28(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(26bytes):
- movzwl -26(%eax), %ecx
- movzwl -26(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -26(%eax), %ecx
+ movzwl -26(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(24bytes):
- movzwl -24(%eax), %ecx
- movzwl -24(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -24(%eax), %ecx
+ movzwl -24(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(22bytes):
- movzwl -22(%eax), %ecx
- movzwl -22(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -22(%eax), %ecx
+ movzwl -22(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(20bytes):
- movzwl -20(%eax), %ecx
- movzwl -20(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -20(%eax), %ecx
+ movzwl -20(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(18bytes):
- movzwl -18(%eax), %ecx
- movzwl -18(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -18(%eax), %ecx
+ movzwl -18(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(16bytes):
- movzwl -16(%eax), %ecx
- movzwl -16(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -16(%eax), %ecx
+ movzwl -16(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(14bytes):
- movzwl -14(%eax), %ecx
- movzwl -14(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -14(%eax), %ecx
+ movzwl -14(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(12bytes):
- movzwl -12(%eax), %ecx
- movzwl -12(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -12(%eax), %ecx
+ movzwl -12(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(10bytes):
- movzwl -10(%eax), %ecx
- movzwl -10(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -10(%eax), %ecx
+ movzwl -10(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(8bytes):
- movzwl -8(%eax), %ecx
- movzwl -8(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -8(%eax), %ecx
+ movzwl -8(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(6bytes):
- movzwl -6(%eax), %ecx
- movzwl -6(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -6(%eax), %ecx
+ movzwl -6(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(4bytes):
- movzwl -4(%eax), %ecx
- movzwl -4(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -4(%eax), %ecx
+ movzwl -4(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(2bytes):
- movzwl -2(%eax), %eax
- movzwl -2(%edx), %ebx
- subl %ebx, %eax
- POP (%ebx)
- ret
- CFI_PUSH (%ebx)
-
- .p2align 4
+ movzwl -2(%eax), %eax
+ movzwl -2(%edx), %ebx
+ subl %ebx, %eax
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
+ .p2align 4
L(memcmp16_exit):
- POP (%ebx)
- mov %ecx, %eax
- ret
+ POP (%ebx)
+ mov %ecx, %eax
+ ret
END_FUNCTION MEMCMP
diff --git a/runtime/arch/x86_64/memcmp16_x86_64.S b/runtime/arch/x86_64/memcmp16_x86_64.S
new file mode 100755
index 0000000..46e4ba3
--- /dev/null
+++ b/runtime/arch/x86_64/memcmp16_x86_64.S
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "asm_support_x86_64.S"
+
+#define MEMCMP __memcmp16
+
+/*
+ * Half of the Silvermont L1 data cache size
+ * (see the original cache.h in bionic/libc/arch-x86_64/).
+ * This value is used for a specific optimization on large lengths.
+ */
+#define DATA_CACHE_SIZE_HALF (12*1024)
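
    For context on how this constant is consumed: the L(512bytesormore) path below
    computes half plus half-of-half, i.e. three quarters of the L1 data cache, and
    switches to the non-temporal prefetch loop above that size. A hedged C++
    restatement of that threshold arithmetic (names are illustrative):

    // Sketch of the cutoff computed by "mov %r8,%r9; shr $1,%r8; add %r9,%r8".
    #include <cstddef>

    constexpr size_t kDataCacheSizeHalf = 12 * 1024;  // as defined above
    constexpr size_t kNonTemporalCutoff =
        kDataCacheSizeHalf + kDataCacheSizeHalf / 2;
    static_assert(kNonTemporalCutoff == 18 * 1024,
                  "three quarters of a 24 KiB L1 data cache");
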
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) (I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ add %r11, %rcx; \
+ jmp *%rcx; \
+ ud2
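
    BRANCH_TO_JMPTBL_ENTRY is a position-independent computed jump: the table at
    the end of the file stores each handler as a 32-bit offset from the table base
    (the JMPTBL(I, B) = I - B entries), and the macro sign-extends one entry and
    adds the base back before jumping. A rough C++ analogue using a function-pointer
    table (a sketch under assumed types, not this patch's code):

    #include <cstddef>
    #include <cstdint>

    using Handler = int32_t (*)(const uint8_t* a, const uint8_t* b);

    // rdx holds the remaining length in bytes (already doubled from the
    // 16-bit-unit count), so entry n/2 handles an n-byte tail; SCALE=2 in the
    // assembly achieves the same indexing over 4-byte table entries.
    int32_t Dispatch(const Handler table[], const uint8_t* a, const uint8_t* b,
                     size_t remaining_bytes) {
      return table[remaining_bytes / 2](a, b);
    }
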
+
+DEFINE_FUNCTION MEMCMP
+ pxor %xmm0, %xmm0
+ shl $1, %rdx
+ cmp $79, %rdx
+ ja L(79bytesormore)
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(79bytesormore):
+ movdqu (%rsi), %xmm1
+ movdqu (%rdi), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+ mov %rsi, %rcx
+ and $-16, %rsi
+ add $16, %rsi
+ sub %rsi, %rcx
+
+ sub %rcx, %rdi
+ add %rcx, %rdx
+ test $0xf, %rdi
+ jz L(2aligned)
+
+ cmp $128, %rdx
+ ja L(128bytesormore)
+L(less128bytes):
+ sub $64, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(128bytesormore):
+ cmp $512, %rdx
+ ja L(512bytesormore)
+ cmp $256, %rdx
+ ja L(less512bytes)
+L(less256bytes):
+ sub $128, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(less512bytes):
+ sub $256, %rdx
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqu 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqu 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqu 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqu 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqu 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqu 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqu 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqu 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytes)
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_unaligned)
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loop):
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(L2_L3_cache_unaligned):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_unaligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+/*
+ * This case is for machines which are sensitive to unaligned instructions.
+ */
+ ALIGN (4)
+L(2aligned):
+ cmp $128, %rdx
+ ja L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+ sub $64, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64in2aligned)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64in2aligned):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(128bytesormorein2aligned):
+ cmp $512, %rdx
+ ja L(512bytesormorein2aligned)
+ cmp $256, %rdx
+ ja L(256bytesormorein2aligned)
+L(less256bytesin2aligned):
+ sub $128, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128in2aligned)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(256bytesormorein2aligned):
+
+ sub $256, %rdx
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqa 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqa 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqa 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqa 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqa 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqa 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqa 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqa 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytesin2aligned)
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256in2aligned)
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_aligned)
+
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loopin2aligned):
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loopin2aligned)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+L(L2_L3_cache_aligned):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_aligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+
+ ALIGN (4)
+L(64bytesormore_loop_end):
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm2, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm3, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm4, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ jmp L(16bytes)
+
+L(256bytesin256):
+ add $256, %rdi
+ add $256, %rsi
+ jmp L(16bytes)
+L(240bytesin256):
+ add $240, %rdi
+ add $240, %rsi
+ jmp L(16bytes)
+L(224bytesin256):
+ add $224, %rdi
+ add $224, %rsi
+ jmp L(16bytes)
+L(208bytesin256):
+ add $208, %rdi
+ add $208, %rsi
+ jmp L(16bytes)
+L(192bytesin256):
+ add $192, %rdi
+ add $192, %rsi
+ jmp L(16bytes)
+L(176bytesin256):
+ add $176, %rdi
+ add $176, %rsi
+ jmp L(16bytes)
+L(160bytesin256):
+ add $160, %rdi
+ add $160, %rsi
+ jmp L(16bytes)
+L(144bytesin256):
+ add $144, %rdi
+ add $144, %rsi
+ jmp L(16bytes)
+L(128bytesin256):
+ add $128, %rdi
+ add $128, %rsi
+ jmp L(16bytes)
+L(112bytesin256):
+ add $112, %rdi
+ add $112, %rsi
+ jmp L(16bytes)
+L(96bytesin256):
+ add $96, %rdi
+ add $96, %rsi
+ jmp L(16bytes)
+L(80bytesin256):
+ add $80, %rdi
+ add $80, %rsi
+ jmp L(16bytes)
+L(64bytesin256):
+ add $64, %rdi
+ add $64, %rsi
+ jmp L(16bytes)
+L(48bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(32bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytes):
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(8bytes):
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(12bytes):
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(4bytes):
+ mov -4(%rsi), %ecx
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(0bytes):
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(66bytes):
+ movdqu -66(%rdi), %xmm1
+ movdqu -66(%rsi), %xmm2
+ mov $-66, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(50bytes):
+ movdqu -50(%rdi), %xmm1
+ movdqu -50(%rsi), %xmm2
+ mov $-50, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ movdqu -34(%rdi), %xmm1
+ movdqu -34(%rsi), %xmm2
+ mov $-34, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%rdi), %rax
+ mov -18(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(10bytes):
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(14bytes):
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(6bytes):
+ mov -6(%rdi), %eax
+ mov -6(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(2bytes):
+ movzwl -2(%rsi), %ecx
+ movzwl -2(%rdi), %eax
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(68bytes):
+ movdqu -68(%rdi), %xmm2
+ movdqu -68(%rsi), %xmm1
+ mov $-68, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(52bytes):
+ movdqu -52(%rdi), %xmm2
+ movdqu -52(%rsi), %xmm1
+ mov $-52, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%rdi), %xmm2
+ movdqu -36(%rsi), %xmm1
+ mov $-36, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%rdi), %xmm2
+ movdqu -20(%rsi), %xmm1
+ mov $-20, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(70bytes):
+ movdqu -70(%rsi), %xmm1
+ movdqu -70(%rdi), %xmm2
+ mov $-70, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(54bytes):
+ movdqu -54(%rsi), %xmm1
+ movdqu -54(%rdi), %xmm2
+ mov $-54, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ movdqu -38(%rsi), %xmm1
+ movdqu -38(%rdi), %xmm2
+ mov $-38, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ movdqu -22(%rsi), %xmm1
+ movdqu -22(%rdi), %xmm2
+ mov $-22, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(72bytes):
+ movdqu -72(%rsi), %xmm1
+ movdqu -72(%rdi), %xmm2
+ mov $-72, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(56bytes):
+ movdqu -56(%rdi), %xmm2
+ movdqu -56(%rsi), %xmm1
+ mov $-56, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ movdqu -40(%rdi), %xmm2
+ movdqu -40(%rsi), %xmm1
+ mov $-40, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ movdqu -24(%rdi), %xmm2
+ movdqu -24(%rsi), %xmm1
+ mov $-24, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(74bytes):
+ movdqu -74(%rsi), %xmm1
+ movdqu -74(%rdi), %xmm2
+ mov $-74, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(58bytes):
+ movdqu -58(%rdi), %xmm2
+ movdqu -58(%rsi), %xmm1
+ mov $-58, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ movdqu -42(%rdi), %xmm2
+ movdqu -42(%rsi), %xmm1
+ mov $-42, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ movdqu -26(%rdi), %xmm2
+ movdqu -26(%rsi), %xmm1
+ mov $-26, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ jmp L(end)
+
+ ALIGN (4)
+L(76bytes):
+ movdqu -76(%rsi), %xmm1
+ movdqu -76(%rdi), %xmm2
+ mov $-76, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(60bytes):
+ movdqu -60(%rdi), %xmm2
+ movdqu -60(%rsi), %xmm1
+ mov $-60, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ movdqu -44(%rdi), %xmm2
+ movdqu -44(%rsi), %xmm1
+ mov $-44, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ movdqu -28(%rdi), %xmm2
+ movdqu -28(%rsi), %xmm1
+ mov $-28, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(78bytes):
+ movdqu -78(%rsi), %xmm1
+ movdqu -78(%rdi), %xmm2
+ mov $-78, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(62bytes):
+ movdqu -62(%rdi), %xmm2
+ movdqu -62(%rsi), %xmm1
+ mov $-62, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ movdqu -46(%rdi), %xmm2
+ movdqu -46(%rsi), %xmm1
+ mov $-46, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ movdqu -30(%rdi), %xmm2
+ movdqu -30(%rsi), %xmm1
+ mov $-30, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(64bytes):
+ movdqu -64(%rdi), %xmm2
+ movdqu -64(%rsi), %xmm1
+ mov $-64, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%rdi), %xmm2
+ movdqu -48(%rsi), %xmm1
+ mov $-48, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%rdi), %xmm2
+ movdqu -32(%rsi), %xmm1
+ mov $-32, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+/*
+ * Align to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
+ */
+ ALIGN (3)
+L(less16bytes):
+ movsbq %dl, %rdx
+ mov (%rsi, %rdx), %rcx
+ mov (%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov 8(%rsi, %rdx), %rcx
+ mov 8(%rdi, %rdx), %rax
+L(diffin8bytes):
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ shr $32, %rcx
+ shr $32, %rax
+L(diffin4bytes):
+ cmp %cx, %ax
+ jne L(end)
+ shr $16, %ecx
+ shr $16, %eax
+ jmp L(end)
+
+ ALIGN (4)
+L(end):
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+END_FUNCTION MEMCMP
+
+ ALIGN (3)
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(66bytes), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(70bytes), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(74bytes), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(78bytes), L(table_64bytes))
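
    The contract everything above implements: compare a run of 16-bit units and
    return the signed difference of the first pair that differs, or zero if the
    runs are equal. A portable reference sketch of that contract (illustrative
    only, not this patch's code or the runtime's actual fallback):

    #include <cstddef>
    #include <cstdint>

    int32_t MemCmp16Reference(const uint16_t* lhs, const uint16_t* rhs,
                              size_t count) {
      for (size_t i = 0; i < count; ++i) {
        if (lhs[i] != rhs[i]) {
          return static_cast<int32_t>(lhs[i]) - static_cast<int32_t>(rhs[i]);
        }
      }
      return 0;
    }
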
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 12b7680..f0b1b95 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -615,6 +615,14 @@ bool ClassLinker::GenerateOatFile(const char* dex_filename,
argv.push_back("--compiler-filter=verify-none");
}
+ if (Runtime::Current()->MustRelocateIfPossible()) {
+ argv.push_back("--runtime-arg");
+ argv.push_back("-Xrelocate");
+ } else {
+ argv.push_back("--runtime-arg");
+ argv.push_back("-Xnorelocate");
+ }
+
if (!kIsTargetBuild) {
argv.push_back("--host");
}
@@ -680,14 +688,6 @@ const OatFile* ClassLinker::FindOpenedOatFile(const char* oat_location, const ch
return NULL;
}
-static std::string GetMultiDexClassesDexName(size_t number, const char* dex_location) {
- if (number == 0) {
- return dex_location;
- } else {
- return StringPrintf("%s" kMultiDexSeparatorString "classes%zu.dex", dex_location, number + 1);
- }
-}
-
static bool LoadMultiDexFilesFromOatFile(const OatFile* oat_file, const char* dex_location,
bool generated,
std::vector<std::string>* error_msgs,
@@ -700,7 +700,7 @@ static bool LoadMultiDexFilesFromOatFile(const OatFile* oat_file, const char* de
bool success = true;
for (size_t i = 0; success; ++i) {
- std::string next_name_str = GetMultiDexClassesDexName(i, dex_location);
+ std::string next_name_str = DexFile::GetMultiDexClassesDexName(i, dex_location);
const char* next_name = next_name_str.c_str();
uint32_t dex_location_checksum;
@@ -994,11 +994,25 @@ const OatFile* ClassLinker::CreateOatFileForDexLocation(const char* dex_location
return oat_file.release();
}
-bool ClassLinker::VerifyOatFileChecksums(const OatFile* oat_file,
- const char* dex_location,
- uint32_t dex_location_checksum,
- const InstructionSet instruction_set,
- std::string* error_msg) {
+bool ClassLinker::VerifyOatImageChecksum(const OatFile* oat_file,
+ const InstructionSet instruction_set) {
+ Runtime* runtime = Runtime::Current();
+ const gc::space::ImageSpace* image_space = runtime->GetHeap()->GetImageSpace();
+ uint32_t image_oat_checksum = 0;
+ if (instruction_set == kRuntimeISA) {
+ const ImageHeader& image_header = image_space->GetImageHeader();
+ image_oat_checksum = image_header.GetOatChecksum();
+ } else {
+ std::unique_ptr<ImageHeader> image_header(gc::space::ImageSpace::ReadImageHeaderOrDie(
+ image_space->GetImageLocation().c_str(), instruction_set));
+ image_oat_checksum = image_header->GetOatChecksum();
+ }
+ return oat_file->GetOatHeader().GetImageFileLocationOatChecksum() == image_oat_checksum;
+}
+
+bool ClassLinker::VerifyOatChecksums(const OatFile* oat_file,
+ const InstructionSet instruction_set,
+ std::string* error_msg) {
Runtime* runtime = Runtime::Current();
const gc::space::ImageSpace* image_space = runtime->GetHeap()->GetImageSpace();
@@ -1021,9 +1035,28 @@ bool ClassLinker::VerifyOatFileChecksums(const OatFile* oat_file,
image_patch_delta = image_header->GetPatchDelta();
}
const OatHeader& oat_header = oat_file->GetOatHeader();
- bool image_check = ((oat_header.GetImageFileLocationOatChecksum() == image_oat_checksum)
- && (oat_header.GetImageFileLocationOatDataBegin() == image_oat_data_begin)
- && (oat_header.GetImagePatchDelta() == image_patch_delta));
+ bool ret = ((oat_header.GetImageFileLocationOatChecksum() == image_oat_checksum)
+ && (oat_header.GetImagePatchDelta() == image_patch_delta)
+ && (oat_header.GetImageFileLocationOatDataBegin() == image_oat_data_begin));
+ if (!ret) {
+ *error_msg = StringPrintf("oat file '%s' mismatch (0x%x, %d, %d) with (0x%x, %" PRIdPTR ", %d)",
+ oat_file->GetLocation().c_str(),
+ oat_file->GetOatHeader().GetImageFileLocationOatChecksum(),
+ oat_file->GetOatHeader().GetImageFileLocationOatDataBegin(),
+ oat_file->GetOatHeader().GetImagePatchDelta(),
+ image_oat_checksum, image_oat_data_begin, image_patch_delta);
+ }
+ return ret;
+}
+
+bool ClassLinker::VerifyOatAndDexFileChecksums(const OatFile* oat_file,
+ const char* dex_location,
+ uint32_t dex_location_checksum,
+ const InstructionSet instruction_set,
+ std::string* error_msg) {
+ if (!VerifyOatChecksums(oat_file, instruction_set, error_msg)) {
+ return false;
+ }
const OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(dex_location,
&dex_location_checksum);
@@ -1039,27 +1072,15 @@ bool ClassLinker::VerifyOatFileChecksums(const OatFile* oat_file,
}
return false;
}
- bool dex_check = dex_location_checksum == oat_dex_file->GetDexFileLocationChecksum();
-
- if (image_check && dex_check) {
- return true;
- }
- if (!image_check) {
- ScopedObjectAccess soa(Thread::Current());
- *error_msg = StringPrintf("oat file '%s' mismatch (0x%x, %d) with (0x%x, %" PRIdPTR ")",
- oat_file->GetLocation().c_str(),
- oat_file->GetOatHeader().GetImageFileLocationOatChecksum(),
- oat_file->GetOatHeader().GetImageFileLocationOatDataBegin(),
- image_oat_checksum, image_oat_data_begin);
- }
- if (!dex_check) {
+ if (dex_location_checksum != oat_dex_file->GetDexFileLocationChecksum()) {
*error_msg = StringPrintf("oat file '%s' mismatch (0x%x) with '%s' (0x%x)",
oat_file->GetLocation().c_str(),
oat_dex_file->GetDexFileLocationChecksum(),
dex_location, dex_location_checksum);
+ return false;
}
- return false;
+ return true;
}
bool ClassLinker::VerifyOatWithDexFile(const OatFile* oat_file,
@@ -1082,8 +1103,8 @@ bool ClassLinker::VerifyOatWithDexFile(const OatFile* oat_file,
}
dex_file.reset(oat_dex_file->OpenDexFile(error_msg));
} else {
- bool verified = VerifyOatFileChecksums(oat_file, dex_location, dex_location_checksum,
- kRuntimeISA, error_msg);
+ bool verified = VerifyOatAndDexFileChecksums(oat_file, dex_location, dex_location_checksum,
+ kRuntimeISA, error_msg);
if (!verified) {
return false;
}
@@ -3495,14 +3516,19 @@ mirror::ArtMethod* ClassLinker::CreateProxyConstructor(Thread* self,
proxy_class->GetDirectMethods();
CHECK_EQ(proxy_direct_methods->GetLength(), 16);
mirror::ArtMethod* proxy_constructor = proxy_direct_methods->Get(2);
- // Clone the existing constructor of Proxy (our constructor would just invoke it so steal its
- // code_ too)
- mirror::ArtMethod* constructor =
- down_cast<mirror::ArtMethod*>(proxy_constructor->Clone(self));
- if (constructor == NULL) {
+ mirror::ArtMethod* constructor = down_cast<mirror::ArtMethod*>(proxy_constructor->Clone(self));
+ if (constructor == nullptr) {
CHECK(self->IsExceptionPending()); // OOME.
- return NULL;
+ return nullptr;
}
+ // Make the proxy constructor's code always point to the uninstrumented code. This avoids
+ // getting a method enter event for the proxy constructor as the proxy constructor doesn't
+ // have an activation.
+ bool have_portable_code;
+ constructor->SetEntryPointFromQuickCompiledCode(GetQuickOatCodeFor(proxy_constructor));
+ constructor->SetEntryPointFromPortableCompiledCode(GetPortableOatCodeFor(proxy_constructor,
+ &have_portable_code));
+
// Make this constructor public and fix the class to be our Proxy version
constructor->SetAccessFlags((constructor->GetAccessFlags() & ~kAccProtected) | kAccPublic);
constructor->SetDeclaringClass(klass.Get());
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 1bb1635..8c09042 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -274,12 +274,18 @@ class ClassLinker {
std::vector<const DexFile*>* dex_files)
LOCKS_EXCLUDED(dex_lock_, Locks::mutator_lock_);
+ // Returns true if the given oat file has the same image checksum as the image it is paired with.
+ static bool VerifyOatImageChecksum(const OatFile* oat_file, const InstructionSet instruction_set);
+ // Returns true if the oat file checksums match the image and the offsets are such that it
+ // could be loaded with it.
+ static bool VerifyOatChecksums(const OatFile* oat_file, const InstructionSet instruction_set,
+ std::string* error_msg);
// Returns true if oat file contains the dex file with the given location and checksum.
- static bool VerifyOatFileChecksums(const OatFile* oat_file,
- const char* dex_location,
- uint32_t dex_location_checksum,
- InstructionSet instruction_set,
- std::string* error_msg);
+ static bool VerifyOatAndDexFileChecksums(const OatFile* oat_file,
+ const char* dex_location,
+ uint32_t dex_location_checksum,
+ InstructionSet instruction_set,
+ std::string* error_msg);
// TODO: replace this with multiple methods that allocate the correct managed type.
template <class T>
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index 8e363c4..9972362 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -137,7 +137,17 @@ void CommonRuntimeTest::SetEnvironmentVariables(std::string& android_data) {
}
 // On target, we cannot use /mnt/sdcard because it is mounted noexec, so use a subdir of dalvik-cache
- android_data = (IsHost() ? "/tmp/art-data-XXXXXX" : "/data/dalvik-cache/art-data-XXXXXX");
+ if (IsHost()) {
+ const char* tmpdir = getenv("TMPDIR");
+ if (tmpdir != nullptr && tmpdir[0] != 0) {
+ android_data = tmpdir;
+ } else {
+ android_data = "/tmp";
+ }
+ } else {
+ android_data = "/data/dalvik-cache";
+ }
+ android_data += "/art-data-XXXXXX";
if (mkdtemp(&android_data[0]) == nullptr) {
PLOG(FATAL) << "mkdtemp(\"" << &android_data[0] << "\") failed";
}
@@ -212,7 +222,7 @@ void CommonRuntimeTest::ClearDirectory(const char* dirpath) {
if ((strcmp(e->d_name, ".") == 0) || (strcmp(e->d_name, "..") == 0)) {
continue;
}
- std::string filename(dalvik_cache_);
+ std::string filename(dirpath);
filename.push_back('/');
filename.append(e->d_name);
int stat_result = lstat(filename.c_str(), &s);
@@ -265,6 +275,19 @@ std::string CommonRuntimeTest::GetDexFileName(const std::string& jar_prefix) {
return StringPrintf("%s/framework/%s.jar", GetAndroidRoot(), jar_prefix.c_str());
}
+std::string CommonRuntimeTest::GetLibCoreOatFileName() {
+ return GetOatFileName("core");
+}
+
+std::string CommonRuntimeTest::GetOatFileName(const std::string& oat_prefix) {
+ if (IsHost()) {
+ const char* host_dir = getenv("ANDROID_HOST_OUT");
+ CHECK(host_dir != nullptr);
+ return StringPrintf("%s/framework/%s.art", host_dir, oat_prefix.c_str());
+ }
+ return StringPrintf("%s/framework/%s.art", GetAndroidRoot(), oat_prefix.c_str());
+}
+
std::string CommonRuntimeTest::GetTestAndroidRoot() {
if (IsHost()) {
const char* host_dir = getenv("ANDROID_HOST_OUT");
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index eb96352..363d8da 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -85,10 +85,18 @@ class CommonRuntimeTest : public testing::Test {
virtual void TearDown();
+ // Gets the path of the libcore dex file.
std::string GetLibCoreDexFileName();
+ // Gets the path of the specified dex file for host or target.
std::string GetDexFileName(const std::string& jar_prefix);
+ // Gets the path of the libcore oat file.
+ std::string GetLibCoreOatFileName();
+
+ // Gets the path of the specified oat file for host or target.
+ std::string GetOatFileName(const std::string& oat_prefix);
+
std::string GetTestAndroidRoot();
std::vector<const DexFile*> OpenTestDexFiles(const char* name)
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index e5bc7c8..e1a7771 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -951,6 +951,38 @@ std::pair<const char*, const char*> DexFile::SplitMultiDexLocation(
return std::make_pair(tmp, colon_ptr + 1);
}
+std::string DexFile::GetMultiDexClassesDexName(size_t number, const char* dex_location) {
+ if (number == 0) {
+ return dex_location;
+ } else {
+ return StringPrintf("%s" kMultiDexSeparatorString "classes%zu.dex", dex_location, number + 1);
+ }
+}
+
+std::string DexFile::GetDexCanonicalLocation(const char* dex_location) {
+ CHECK_NE(dex_location, static_cast<const char*>(nullptr));
+ char* path = nullptr;
+ if (!IsMultiDexLocation(dex_location)) {
+ path = realpath(dex_location, nullptr);
+ } else {
+ std::pair<const char*, const char*> pair = DexFile::SplitMultiDexLocation(dex_location);
+ const char* dex_real_location(realpath(pair.first, nullptr));
+ delete pair.first;
+ if (dex_real_location != nullptr) {
+ int length = strlen(dex_real_location) + strlen(pair.second) + strlen(kMultiDexSeparatorString) + 1;
+ char* multidex_canonical_location = reinterpret_cast<char*>(malloc(sizeof(char) * length));
+ snprintf(multidex_canonical_location, length, "%s" kMultiDexSeparatorString "%s", dex_real_location, pair.second);
+ free(const_cast<char*>(dex_real_location));
+ path = multidex_canonical_location;
+ }
+ }
+
+ // If realpath fails then we just copy the argument.
+ std::string result(path == nullptr ? dex_location : path);
+ free(path);
+ return result;
+}
+
std::ostream& operator<<(std::ostream& os, const DexFile& dex_file) {
os << StringPrintf("[DexFile: %s dex-checksum=%08x location-checksum=%08x %p-%p]",
dex_file.GetLocation().c_str(),
@@ -958,6 +990,7 @@ std::ostream& operator<<(std::ostream& os, const DexFile& dex_file) {
dex_file.Begin(), dex_file.Begin() + dex_file.Size());
return os;
}
+
std::string Signature::ToString() const {
if (dex_file_ == nullptr) {
CHECK(proto_id_ == nullptr);
diff --git a/runtime/dex_file.h b/runtime/dex_file.h
index d64a030..2794af6 100644
--- a/runtime/dex_file.h
+++ b/runtime/dex_file.h
@@ -841,6 +841,23 @@ class DexFile {
return size_;
}
+ static std::string GetMultiDexClassesDexName(size_t number, const char* dex_location);
+
+ // Returns the canonical form of the given dex location.
+ //
+ // There are different flavors of "dex locations" as follows:
+ // the file name of a dex file:
+ // The actual file path that the dex file has on disk.
+ // dex_location:
+ // This acts as a key for the class linker to know which dex file to load.
+ // It may correspond to either an old odex file or a particular dex file
+ // inside an oat file. In the first case it will also match the file name
+ // of the dex file. In the second case (oat) it will include the file name
+ // and possibly some multidex annotation to uniquely identify it.
+ // canonical_dex_location:
+ // The dex_location where its file name part has been made canonical.
+ static std::string GetDexCanonicalLocation(const char* dex_location);
+
private:
// Opens a .dex file
static const DexFile* OpenFile(int fd, const char* location, bool verify, std::string* error_msg);
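
    To make the flavors above concrete, a hedged usage sketch (the paths are
    invented; the ':' separator matches the expectations in the dex_file_test.cc
    changes below):

    // Illustrative only: canonicalization resolves the file-name part of the
    // location and preserves the multidex suffix.
    std::string loc = DexFile::GetMultiDexClassesDexName(1, "/symlink/app.jar");
    // loc == "/symlink/app.jar:classes2.dex"
    std::string canonical = DexFile::GetDexCanonicalLocation(loc.c_str());
    // canonical == "/real/app.jar:classes2.dex" when /symlink/app.jar resolves
    // to /real/app.jar; if realpath fails, the location is returned unchanged.
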
diff --git a/runtime/dex_file_test.cc b/runtime/dex_file_test.cc
index 284aa89..fa13290 100644
--- a/runtime/dex_file_test.cc
+++ b/runtime/dex_file_test.cc
@@ -345,4 +345,31 @@ TEST_F(DexFileTest, FindFieldId) {
}
}
+TEST_F(DexFileTest, GetMultiDexClassesDexName) {
+ std::string dex_location_str = "/system/app/framework.jar";
+ const char* dex_location = dex_location_str.c_str();
+ ASSERT_EQ("/system/app/framework.jar", DexFile::GetMultiDexClassesDexName(0, dex_location));
+ ASSERT_EQ("/system/app/framework.jar:classes2.dex", DexFile::GetMultiDexClassesDexName(1, dex_location));
+ ASSERT_EQ("/system/app/framework.jar:classes101.dex", DexFile::GetMultiDexClassesDexName(100, dex_location));
+}
+
+TEST_F(DexFileTest, GetDexCanonicalLocation) {
+ ScratchFile file;
+ std::string dex_location = file.GetFilename();
+
+ ASSERT_EQ(file.GetFilename(), DexFile::GetDexCanonicalLocation(dex_location.c_str()));
+ std::string multidex_location = DexFile::GetMultiDexClassesDexName(1, dex_location.c_str());
+ ASSERT_EQ(multidex_location, DexFile::GetDexCanonicalLocation(multidex_location.c_str()));
+
+ std::string dex_location_sym = dex_location + "symlink";
+ ASSERT_EQ(0, symlink(dex_location.c_str(), dex_location_sym.c_str()));
+
+ ASSERT_EQ(dex_location, DexFile::GetDexCanonicalLocation(dex_location_sym.c_str()));
+
+ std::string multidex_location_sym = DexFile::GetMultiDexClassesDexName(1, dex_location_sym.c_str());
+ ASSERT_EQ(multidex_location, DexFile::GetDexCanonicalLocation(multidex_location_sym.c_str()));
+
+ ASSERT_EQ(0, unlink(dex_location_sym.c_str()));
+}
+
} // namespace art
diff --git a/runtime/elf_file.cc b/runtime/elf_file.cc
index 594c65f..6179b5e 100644
--- a/runtime/elf_file.cc
+++ b/runtime/elf_file.cc
@@ -837,6 +837,7 @@ bool ElfFile::Load(bool executable, std::string* error_msg) {
}
}
+ bool reserved = false;
for (Elf32_Word i = 0; i < GetProgramHeaderNum(); i++) {
Elf32_Phdr& program_header = GetProgramHeader(i);
@@ -853,10 +854,8 @@ bool ElfFile::Load(bool executable, std::string* error_msg) {
// Found something to load.
- // If p_vaddr is zero, it must be the first loadable segment,
- // since they must be in order. Since it is zero, there isn't a
- // specific address requested, so first request a contiguous chunk
- // of required size for all segments, but with no
+ // Before loading the actual segments, reserve a contiguous chunk
+ // of required size and address for all segments, but with no
// permissions. We'll then carve that up with the proper
// permissions as we load the actual segments. If p_vaddr is
// non-zero, the segments require the specific address specified,
@@ -870,18 +869,24 @@ bool ElfFile::Load(bool executable, std::string* error_msg) {
return false;
}
size_t file_length = static_cast<size_t>(temp_file_length);
- if (program_header.p_vaddr == 0) {
+ if (!reserved) {
+ byte* reserve_base = ((program_header.p_vaddr != 0) ?
+ reinterpret_cast<byte*>(program_header.p_vaddr) : nullptr);
std::string reservation_name("ElfFile reservation for ");
reservation_name += file_->GetPath();
std::unique_ptr<MemMap> reserve(MemMap::MapAnonymous(reservation_name.c_str(),
- nullptr, GetLoadedSize(), PROT_NONE, false,
- error_msg));
+ reserve_base,
+ GetLoadedSize(), PROT_NONE, false,
+ error_msg));
if (reserve.get() == nullptr) {
*error_msg = StringPrintf("Failed to allocate %s: %s",
reservation_name.c_str(), error_msg->c_str());
return false;
}
- base_address_ = reserve->Begin();
+ reserved = true;
+ if (reserve_base == nullptr) {
+ base_address_ = reserve->Begin();
+ }
segments_.push_back(reserve.release());
}
// empty segment, nothing to map
@@ -1335,7 +1340,8 @@ void ElfFile::GdbJITSupport() {
const Elf32_Shdr* symtab_sec = all.FindSectionByName(".symtab");
Elf32_Shdr* text_sec = all.FindSectionByName(".text");
if (debug_info == nullptr || debug_abbrev == nullptr || eh_frame == nullptr ||
- debug_str == nullptr || text_sec == nullptr || strtab_sec == nullptr || symtab_sec == nullptr) {
+ debug_str == nullptr || text_sec == nullptr || strtab_sec == nullptr ||
+ symtab_sec == nullptr) {
return;
}
// We need to add in a strtab and symtab to the image.
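
The reserve-then-carve strategy the rewritten comments describe, reduced to raw mmap calls. ART routes this through MemMap, and the carved segments are file-backed rather than anonymous; this sketch only shows the address-space discipline.

    #include <sys/mman.h>
    #include <cstddef>
    #include <cstdint>

    // Reserve one PROT_NONE block covering the whole loaded size, then map each
    // segment into it at its offset with MAP_FIXED and its real permissions.
    static void* ReserveThenCarve(void* requested_base, size_t loaded_size,
                                  size_t seg_offset, size_t seg_size, int seg_prot) {
      void* base = mmap(requested_base, loaded_size, PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (base == MAP_FAILED) {
        return nullptr;
      }
      // MAP_FIXED is safe here: it only replaces pages inside our own reservation.
      void* seg = mmap(static_cast<uint8_t*>(base) + seg_offset, seg_size, seg_prot,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
      return (seg == MAP_FAILED) ? nullptr : base;
    }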
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 49bb65f..fa198d7 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -592,8 +592,7 @@ extern "C" uint64_t artQuickProxyInvokeHandler(mirror::ArtMethod* proxy_method,
const char* old_cause =
self->StartAssertNoThreadSuspension("Adding to IRT proxy object arguments");
// Register the top of the managed stack, making stack crawlable.
- DCHECK_EQ(sp->AsMirrorPtr(), proxy_method)
- << PrettyMethod(proxy_method);
+ DCHECK_EQ(sp->AsMirrorPtr(), proxy_method) << PrettyMethod(proxy_method);
self->SetTopOfStack(sp, 0);
DCHECK_EQ(proxy_method->GetFrameSizeInBytes(),
Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes())
diff --git a/runtime/gc/accounting/card_table-inl.h b/runtime/gc/accounting/card_table-inl.h
index 46b9363..217360f 100644
--- a/runtime/gc/accounting/card_table-inl.h
+++ b/runtime/gc/accounting/card_table-inl.h
@@ -37,12 +37,13 @@ static inline bool byte_cas(byte old_value, byte new_value, byte* address) {
// Align the address down.
address -= shift_in_bytes;
const size_t shift_in_bits = shift_in_bytes * kBitsPerByte;
- AtomicInteger* word_atomic = reinterpret_cast<AtomicInteger*>(address);
+ Atomic<uintptr_t>* word_atomic = reinterpret_cast<Atomic<uintptr_t>*>(address);
// Word with the byte we are trying to cas cleared.
- const int32_t cur_word = word_atomic->LoadRelaxed() & ~(0xFF << shift_in_bits);
- const int32_t old_word = cur_word | (static_cast<int32_t>(old_value) << shift_in_bits);
- const int32_t new_word = cur_word | (static_cast<int32_t>(new_value) << shift_in_bits);
+ const uintptr_t cur_word = word_atomic->LoadRelaxed() &
+ ~(static_cast<uintptr_t>(0xFF) << shift_in_bits);
+ const uintptr_t old_word = cur_word | (static_cast<uintptr_t>(old_value) << shift_in_bits);
+ const uintptr_t new_word = cur_word | (static_cast<uintptr_t>(new_value) << shift_in_bits);
return word_atomic->CompareExchangeWeakRelaxed(old_word, new_word);
#endif
}
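
What the widened byte_cas computes, as a self-contained sketch with std::atomic standing in for ART's Atomic<>. The point of the hunk above is the word type: on 64-bit targets the CAS must cover a uintptr_t-sized word, not an int32_t.

    #include <atomic>
    #include <cstdint>

    static bool ByteCasSketch(uint8_t old_value, uint8_t new_value, uint8_t* address) {
      const size_t shift_in_bytes = reinterpret_cast<uintptr_t>(address) % sizeof(uintptr_t);
      address -= shift_in_bytes;  // align the address down to a word boundary
      const size_t shift_in_bits = shift_in_bytes * 8;
      auto* word = reinterpret_cast<std::atomic<uintptr_t>*>(address);
      // Word with the target byte lane cleared.
      const uintptr_t cur = word->load(std::memory_order_relaxed) &
                            ~(static_cast<uintptr_t>(0xFF) << shift_in_bits);
      uintptr_t old_word = cur | (static_cast<uintptr_t>(old_value) << shift_in_bits);
      const uintptr_t new_word = cur | (static_cast<uintptr_t>(new_value) << shift_in_bits);
      return word->compare_exchange_weak(old_word, new_word, std::memory_order_relaxed);
    }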
diff --git a/runtime/gc/accounting/card_table.cc b/runtime/gc/accounting/card_table.cc
index ceb42e5..0498550 100644
--- a/runtime/gc/accounting/card_table.cc
+++ b/runtime/gc/accounting/card_table.cc
@@ -28,6 +28,11 @@ namespace art {
namespace gc {
namespace accounting {
+constexpr size_t CardTable::kCardShift;
+constexpr size_t CardTable::kCardSize;
+constexpr uint8_t CardTable::kCardClean;
+constexpr uint8_t CardTable::kCardDirty;
+
/*
* Maintain a card table from the write barrier. All writes of
* non-NULL values to heap addresses should go through an entry in
@@ -55,9 +60,9 @@ CardTable* CardTable::Create(const byte* heap_begin, size_t heap_capacity) {
size_t capacity = heap_capacity / kCardSize;
/* Allocate an extra 256 bytes to allow fixed low-byte of base */
std::string error_msg;
- std::unique_ptr<MemMap> mem_map(MemMap::MapAnonymous("card table", NULL,
- capacity + 256, PROT_READ | PROT_WRITE,
- false, &error_msg));
+ std::unique_ptr<MemMap> mem_map(
+ MemMap::MapAnonymous("card table", nullptr, capacity + 256, PROT_READ | PROT_WRITE,
+ false, &error_msg));
CHECK(mem_map.get() != NULL) << "couldn't allocate card table: " << error_msg;
// All zeros is the correct initial value; all clean. Anonymous mmaps are initialized to zero, we
// don't clear the card table to avoid unnecessary pages being allocated
@@ -67,17 +72,17 @@ CardTable* CardTable::Create(const byte* heap_begin, size_t heap_capacity) {
CHECK(cardtable_begin != NULL);
// We allocated up to a bytes worth of extra space to allow biased_begin's byte value to equal
- // GC_CARD_DIRTY, compute a offset value to make this the case
+ // kCardDirty, compute an offset value to make this the case
size_t offset = 0;
byte* biased_begin = reinterpret_cast<byte*>(reinterpret_cast<uintptr_t>(cardtable_begin) -
(reinterpret_cast<uintptr_t>(heap_begin) >> kCardShift));
- if (((uintptr_t)biased_begin & 0xff) != kCardDirty) {
- int delta = kCardDirty - (reinterpret_cast<uintptr_t>(biased_begin) & 0xff);
+ uintptr_t biased_byte = reinterpret_cast<uintptr_t>(biased_begin) & 0xff;
+ if (biased_byte != kCardDirty) {
+ int delta = kCardDirty - biased_byte;
offset = delta + (delta < 0 ? 0x100 : 0);
biased_begin += offset;
}
CHECK_EQ(reinterpret_cast<uintptr_t>(biased_begin) & 0xff, kCardDirty);
-
return new CardTable(mem_map.release(), biased_begin, offset);
}
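
A worked example of the bias adjustment above, with made-up addresses. The invariant being established: the low byte of biased_begin equals kCardDirty (0x70), so the write barrier can dirty a card by storing that single byte; the 256 extra bytes allocated earlier absorb the adjustment.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    int main() {
      constexpr size_t kCardShift = 7;
      constexpr uint8_t kCardDirty = 0x70;
      uintptr_t cardtable_begin = 0x74000000;  // hypothetical mmap result
      uintptr_t heap_begin = 0x12c00000;       // hypothetical heap base
      uintptr_t biased_begin = cardtable_begin - (heap_begin >> kCardShift);
      uintptr_t biased_byte = biased_begin & 0xff;   // here: 0x00
      if (biased_byte != kCardDirty) {
        int delta = kCardDirty - static_cast<int>(biased_byte);  // here: +0x70
        biased_begin += delta + (delta < 0 ? 0x100 : 0);
      }
      assert((biased_begin & 0xff) == kCardDirty);
      return 0;
    }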
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 7934974..fbeea85 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -46,10 +46,10 @@ template<size_t kAlignment> class SpaceBitmap;
// WriteBarrier, and from there to here.
class CardTable {
public:
- static const size_t kCardShift = 7;
- static const size_t kCardSize = (1 << kCardShift);
- static const uint8_t kCardClean = 0x0;
- static const uint8_t kCardDirty = 0x70;
+ static constexpr size_t kCardShift = 7;
+ static constexpr size_t kCardSize = 1 << kCardShift;
+ static constexpr uint8_t kCardClean = 0x0;
+ static constexpr uint8_t kCardDirty = 0x70;
static CardTable* Create(const byte* heap_begin, size_t heap_capacity);
diff --git a/runtime/gc/accounting/card_table_test.cc b/runtime/gc/accounting/card_table_test.cc
new file mode 100644
index 0000000..a88b2c9
--- /dev/null
+++ b/runtime/gc/accounting/card_table_test.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "card_table-inl.h"
+
+#include <string>
+
+#include "atomic.h"
+#include "common_runtime_test.h"
+#include "handle_scope-inl.h"
+#include "mirror/class-inl.h"
+#include "mirror/string-inl.h" // Strings are easiest to allocate
+#include "scoped_thread_state_change.h"
+#include "thread_pool.h"
+#include "utils.h"
+
+namespace art {
+
+namespace mirror {
+ class Object;
+} // namespace mirror
+
+class CardTableTest : public CommonRuntimeTest {
+ public:
+ std::unique_ptr<gc::accounting::CardTable> card_table_;
+ static constexpr size_t kCardSize = gc::accounting::CardTable::kCardSize;
+
+ void CommonSetup() {
+ if (card_table_.get() == nullptr) {
+ card_table_.reset(gc::accounting::CardTable::Create(heap_begin_, heap_size_));
+ EXPECT_TRUE(card_table_.get() != nullptr);
+ } else {
+ ClearCardTable();
+ }
+ }
+ // Default values for the test; not random, to avoid non-deterministic behaviour.
+ CardTableTest() : heap_begin_(reinterpret_cast<byte*>(0x2000000)), heap_size_(2 * MB) {
+ }
+ void ClearCardTable() {
+ card_table_->ClearCardTable();
+ }
+ byte* HeapBegin() const {
+ return heap_begin_;
+ }
+ byte* HeapLimit() const {
+ return HeapBegin() + heap_size_;
+ }
+ byte PRandCard(const byte* addr) const {
+ size_t offset = RoundDown(addr - heap_begin_, kCardSize);
+ return 1 + offset % 254;
+ }
+ void FillRandom() {
+ for (const byte* addr = HeapBegin(); addr != HeapLimit(); addr += kCardSize) {
+ EXPECT_TRUE(card_table_->AddrIsInCardTable(addr));
+ byte* card = card_table_->CardFromAddr(addr);
+ *card = PRandCard(addr);
+ }
+ }
+
+ private:
+ byte* const heap_begin_;
+ const size_t heap_size_;
+};
+
+TEST_F(CardTableTest, TestMarkCard) {
+ CommonSetup();
+ for (const byte* addr = HeapBegin(); addr < HeapLimit(); addr += kObjectAlignment) {
+ auto obj = reinterpret_cast<const mirror::Object*>(addr);
+ EXPECT_EQ(card_table_->GetCard(obj), gc::accounting::CardTable::kCardClean);
+ EXPECT_TRUE(!card_table_->IsDirty(obj));
+ card_table_->MarkCard(addr);
+ EXPECT_TRUE(card_table_->IsDirty(obj));
+ EXPECT_EQ(card_table_->GetCard(obj), gc::accounting::CardTable::kCardDirty);
+ byte* card_addr = card_table_->CardFromAddr(addr);
+ EXPECT_EQ(*card_addr, gc::accounting::CardTable::kCardDirty);
+ *card_addr = gc::accounting::CardTable::kCardClean;
+ EXPECT_EQ(*card_addr, gc::accounting::CardTable::kCardClean);
+ }
+}
+
+class UpdateVisitor {
+ public:
+ byte operator()(byte c) const {
+ return c * 93 + 123;
+ }
+ void operator()(byte* /*card*/, byte /*expected_value*/, byte /*new_value*/) const {
+ }
+};
+
+TEST_F(CardTableTest, TestModifyCardsAtomic) {
+ CommonSetup();
+ FillRandom();
+ const size_t delta = std::min(static_cast<size_t>(HeapLimit() - HeapBegin()), 8U * kCardSize);
+ UpdateVisitor visitor;
+ size_t start_offset = 0;
+ for (byte* cstart = HeapBegin(); cstart < HeapBegin() + delta; cstart += kCardSize) {
+ start_offset = (start_offset + kObjectAlignment) % kCardSize;
+ size_t end_offset = 0;
+ for (byte* cend = HeapLimit() - delta; cend < HeapLimit(); cend += kCardSize) {
+ // Don't always start at a card boundary.
+ byte* start = cstart + start_offset;
+ byte* end = cend - end_offset;
+ end_offset = (end_offset + kObjectAlignment) % kCardSize;
+ // Modify cards.
+ card_table_->ModifyCardsAtomic(start, end, visitor, visitor);
+ // Check adjacent cards not modified.
+ for (byte* cur = start - kCardSize; cur >= HeapBegin(); cur -= kCardSize) {
+ EXPECT_EQ(card_table_->GetCard(reinterpret_cast<mirror::Object*>(cur)), PRandCard(cur));
+ }
+ for (byte* cur = end + kCardSize; cur < HeapLimit(); cur += kCardSize) {
+ EXPECT_EQ(card_table_->GetCard(reinterpret_cast<mirror::Object*>(cur)), PRandCard(cur));
+ }
+ // Verify Range.
+ for (byte* cur = start; cur < AlignUp(end, kCardSize); cur += kCardSize) {
+ byte* card = card_table_->CardFromAddr(cur);
+ byte value = PRandCard(cur);
+ if (visitor(value) != *card) {
+ LOG(ERROR) << reinterpret_cast<void*>(start) << " " << reinterpret_cast<void*>(cur)
+            << " " << reinterpret_cast<void*>(end);
+ }
+ EXPECT_EQ(visitor(value), *card);
+ // Restore for next iteration.
+ *card = value;
+ }
+ }
+ }
+}
+
+// TODO: Add test for CardTable::Scan.
+
+} // namespace art
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index fc6d2ef..1d10af2 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -166,7 +166,8 @@ static bool ReadSpecificImageHeader(const char* filename, ImageHeader* image_hea
return true;
}
-bool ImageSpace::RelocateImage(const char* image_location, const char* dest_filename,
+// Relocate the image at image_location by a random amount, writing the result to dest_filename.
+static bool RelocateImage(const char* image_location, const char* dest_filename,
InstructionSet isa, std::string* error_msg) {
std::string patchoat(Runtime::Current()->GetPatchoatExecutable());
diff --git a/runtime/gc/space/image_space.h b/runtime/gc/space/image_space.h
index debca52..6be3b8f 100644
--- a/runtime/gc/space/image_space.h
+++ b/runtime/gc/space/image_space.h
@@ -124,9 +124,6 @@ class ImageSpace : public MemMapSpace {
bool validate_oat_file, std::string* error_msg)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- static bool RelocateImage(const char* image_location, const char* dest_filename,
- InstructionSet isa, std::string* error_msg);
-
OatFile* OpenOatFile(const char* image, std::string* error_msg) const
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/globals.h b/runtime/globals.h
index 1d9f22c..107e064 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -118,6 +118,8 @@ static constexpr TraceClockSource kDefaultTraceClockSource = kTraceClockSourceDu
static constexpr TraceClockSource kDefaultTraceClockSource = kTraceClockSourceWall;
#endif
+static constexpr bool kDefaultMustRelocate = true;
+
} // namespace art
#endif // ART_RUNTIME_GLOBALS_H_
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index ab86eaa..e585412 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -65,7 +65,7 @@ class LockWord {
kThinLockOwnerMask = (1 << kThinLockOwnerSize) - 1,
// Count in higher bits.
kThinLockCountShift = kThinLockOwnerSize + kThinLockOwnerShift,
- kThinLockCountMask = (1 << kThinLockCountShift) - 1,
+ kThinLockCountMask = (1 << kThinLockCountSize) - 1,
kThinLockMaxCount = kThinLockCountMask,
// State in the highest bits.
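
The one-line fix above in miniature: a field mask must be derived from the field's size, not from its shift. With the buggy mask the high state bits leak into the extracted count. Field widths below are illustrative, not necessarily ART's actual values.

    #include <cassert>
    #include <cstdint>

    int main() {
      constexpr uint32_t kOwnerSize = 16, kOwnerShift = 0;
      constexpr uint32_t kCountShift = kOwnerSize + kOwnerShift;   // 16
      constexpr uint32_t kCountSize = 14;                          // leaves 2 state bits
      constexpr uint32_t kBuggyMask = (1u << kCountShift) - 1;     // 0x0000ffff: too wide
      constexpr uint32_t kFixedMask = (1u << kCountSize) - 1;      // 0x00003fff: correct
      uint32_t word = (0x3u << 30) | (0x2au << kCountShift) | 0x1234u;  // state|count|owner
      assert(((word >> kCountShift) & kFixedMask) == 0x2au);   // fixed: count extracted
      assert(((word >> kCountShift) & kBuggyMask) != 0x2au);   // buggy: state bits leak in
      return 0;
    }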
diff --git a/runtime/mem_map.cc b/runtime/mem_map.cc
index 1074253..6c7ee5b 100644
--- a/runtime/mem_map.cc
+++ b/runtime/mem_map.cc
@@ -130,8 +130,67 @@ static uintptr_t GenerateNextMemPos() {
uintptr_t MemMap::next_mem_pos_ = GenerateNextMemPos();
#endif
+// Return true if the address range is contained in a single /proc/self/maps entry.
+static bool CheckOverlapping(uintptr_t begin,
+ uintptr_t end,
+ std::string* error_msg) {
+ std::unique_ptr<BacktraceMap> map(BacktraceMap::Create(getpid(), true));
+ if (!map->Build()) {
+ *error_msg = StringPrintf("Failed to build process map");
+ return false;
+ }
+ for (BacktraceMap::const_iterator it = map->begin(); it != map->end(); ++it) {
+ if ((begin >= it->start && begin < it->end) // start of new within old
+ && (end > it->start && end <= it->end)) { // end of new within old
+ return true;
+ }
+ }
+ std::string maps;
+ ReadFileToString("/proc/self/maps", &maps);
+ *error_msg = StringPrintf("Requested region 0x%08" PRIxPTR "-0x%08" PRIxPTR " is not contained "
+                           "within any single existing map:\n%s\n",
+                           begin, end, maps.c_str());
+ return false;
+}
+
+// Return true if the address range does not conflict with any /proc/self/maps entry.
+static bool CheckNonOverlapping(uintptr_t begin,
+ uintptr_t end,
+ std::string* error_msg) {
+ std::unique_ptr<BacktraceMap> map(BacktraceMap::Create(getpid(), true));
+ if (!map->Build()) {
+ *error_msg = StringPrintf("Failed to build process map");
+ return false;
+ }
+ for (BacktraceMap::const_iterator it = map->begin(); it != map->end(); ++it) {
+ if ((begin >= it->start && begin < it->end) // start of new within old
+ || (end > it->start && end <= it->end) // end of new within old
+ || (begin <= it->start && end > it->end)) { // start/end of new includes all of old
+ std::ostringstream map_info;
+ map_info << std::make_pair(it, map->end());
+ *error_msg = StringPrintf("Requested region 0x%08" PRIxPTR "-0x%08" PRIxPTR " overlaps with "
+ "existing map 0x%08" PRIxPTR "-0x%08" PRIxPTR " (%s)\n%s",
+ begin, end,
+ static_cast<uintptr_t>(it->start), static_cast<uintptr_t>(it->end),
+ it->name.c_str(),
+ map_info.str().c_str());
+ return false;
+ }
+ }
+ return true;
+}
+
+// CheckMapRequest validates a non-MAP_FAILED mmap result against the
+// expected address, calling munmap and reporting the reason in
+// error_msg if validation fails.
+//
+// If expected_ptr is nullptr, nothing is checked beyond the fact that
+// actual_ptr is not MAP_FAILED. However, if expected_ptr is non-null,
+// we check that actual_ptr == expected_ptr and, if not, report in
+// error_msg the conflicting mapping if one was found, or a generic
+// error otherwise.
static bool CheckMapRequest(byte* expected_ptr, void* actual_ptr, size_t byte_count,
- std::ostringstream* error_msg) {
+ std::string* error_msg) {
// Handled first by caller for more specific error messages.
CHECK(actual_ptr != MAP_FAILED);
@@ -139,6 +198,10 @@ static bool CheckMapRequest(byte* expected_ptr, void* actual_ptr, size_t byte_co
return true;
}
+ uintptr_t actual = reinterpret_cast<uintptr_t>(actual_ptr);
+ uintptr_t expected = reinterpret_cast<uintptr_t>(expected_ptr);
+ uintptr_t limit = expected + byte_count;
+
if (expected_ptr == actual_ptr) {
return true;
}
@@ -149,40 +212,19 @@ static bool CheckMapRequest(byte* expected_ptr, void* actual_ptr, size_t byte_co
PLOG(WARNING) << StringPrintf("munmap(%p, %zd) failed", actual_ptr, byte_count);
}
- uintptr_t actual = reinterpret_cast<uintptr_t>(actual_ptr);
- uintptr_t expected = reinterpret_cast<uintptr_t>(expected_ptr);
- uintptr_t limit = expected + byte_count;
-
- std::unique_ptr<BacktraceMap> map(BacktraceMap::Create(getpid()));
- if (!map->Build()) {
- *error_msg << StringPrintf("Failed to build process map to determine why mmap returned "
- "0x%08" PRIxPTR " instead of 0x%08" PRIxPTR, actual, expected);
-
+ if (!CheckNonOverlapping(expected, limit, error_msg)) {
return false;
}
- for (BacktraceMap::const_iterator it = map->begin(); it != map->end(); ++it) {
- if ((expected >= it->start && expected < it->end) // start of new within old
- || (limit > it->start && limit < it->end) // end of new within old
- || (expected <= it->start && limit > it->end)) { // start/end of new includes all of old
- *error_msg
- << StringPrintf("Requested region 0x%08" PRIxPTR "-0x%08" PRIxPTR " overlaps with "
- "existing map 0x%08" PRIxPTR "-0x%08" PRIxPTR " (%s)\n",
- expected, limit,
- static_cast<uintptr_t>(it->start), static_cast<uintptr_t>(it->end),
- it->name.c_str())
- << std::make_pair(it, map->end());
- return false;
- }
- }
- *error_msg << StringPrintf("Failed to mmap at expected address, mapped at "
- "0x%08" PRIxPTR " instead of 0x%08" PRIxPTR, actual, expected);
+
+ *error_msg = StringPrintf("Failed to mmap at expected address, mapped at "
+ "0x%08" PRIxPTR " instead of 0x%08" PRIxPTR, actual, expected);
return false;
}
-MemMap* MemMap::MapAnonymous(const char* name, byte* expected, size_t byte_count, int prot,
+MemMap* MemMap::MapAnonymous(const char* name, byte* expected_ptr, size_t byte_count, int prot,
bool low_4gb, std::string* error_msg) {
if (byte_count == 0) {
- return new MemMap(name, nullptr, 0, nullptr, 0, prot);
+ return new MemMap(name, nullptr, 0, nullptr, 0, prot, false);
}
size_t page_aligned_byte_count = RoundUp(byte_count, kPageSize);
@@ -222,11 +264,11 @@ MemMap* MemMap::MapAnonymous(const char* name, byte* expected, size_t byte_count
// 4GB.
if (low_4gb && (
// Start out of bounds.
- (reinterpret_cast<uintptr_t>(expected) >> 32) != 0 ||
+ (reinterpret_cast<uintptr_t>(expected_ptr) >> 32) != 0 ||
// End out of bounds. For simplicity, this will fail for the last page of memory.
- (reinterpret_cast<uintptr_t>(expected + page_aligned_byte_count) >> 32) != 0)) {
+ (reinterpret_cast<uintptr_t>(expected_ptr + page_aligned_byte_count) >> 32) != 0)) {
*error_msg = StringPrintf("The requested address space (%p, %p) cannot fit in low_4gb",
- expected, expected + page_aligned_byte_count);
+ expected_ptr, expected_ptr + page_aligned_byte_count);
return nullptr;
}
#endif
@@ -238,7 +280,7 @@ MemMap* MemMap::MapAnonymous(const char* name, byte* expected, size_t byte_count
#if USE_ART_LOW_4G_ALLOCATOR
// MAP_32BIT only available on x86_64.
void* actual = MAP_FAILED;
- if (low_4gb && expected == nullptr) {
+ if (low_4gb && expected_ptr == nullptr) {
bool first_run = true;
for (uintptr_t ptr = next_mem_pos_; ptr < 4 * GB; ptr += kPageSize) {
@@ -294,18 +336,18 @@ MemMap* MemMap::MapAnonymous(const char* name, byte* expected, size_t byte_count
saved_errno = ENOMEM;
}
} else {
- actual = mmap(expected, page_aligned_byte_count, prot, flags, fd.get(), 0);
+ actual = mmap(expected_ptr, page_aligned_byte_count, prot, flags, fd.get(), 0);
saved_errno = errno;
}
#else
#if defined(__LP64__)
- if (low_4gb && expected == nullptr) {
+ if (low_4gb && expected_ptr == nullptr) {
flags |= MAP_32BIT;
}
#endif
- void* actual = mmap(expected, page_aligned_byte_count, prot, flags, fd.get(), 0);
+ void* actual = mmap(expected_ptr, page_aligned_byte_count, prot, flags, fd.get(), 0);
saved_errno = errno;
#endif
@@ -314,44 +356,51 @@ MemMap* MemMap::MapAnonymous(const char* name, byte* expected, size_t byte_count
ReadFileToString("/proc/self/maps", &maps);
*error_msg = StringPrintf("Failed anonymous mmap(%p, %zd, 0x%x, 0x%x, %d, 0): %s\n%s",
- expected, page_aligned_byte_count, prot, flags, fd.get(),
+ expected_ptr, page_aligned_byte_count, prot, flags, fd.get(),
strerror(saved_errno), maps.c_str());
return nullptr;
}
- std::ostringstream check_map_request_error_msg;
- if (!CheckMapRequest(expected, actual, page_aligned_byte_count, &check_map_request_error_msg)) {
- *error_msg = check_map_request_error_msg.str();
+ if (!CheckMapRequest(expected_ptr, actual, page_aligned_byte_count, error_msg)) {
return nullptr;
}
return new MemMap(name, reinterpret_cast<byte*>(actual), byte_count, actual,
- page_aligned_byte_count, prot);
+ page_aligned_byte_count, prot, false);
}
-MemMap* MemMap::MapFileAtAddress(byte* expected, size_t byte_count, int prot, int flags, int fd,
+MemMap* MemMap::MapFileAtAddress(byte* expected_ptr, size_t byte_count, int prot, int flags, int fd,
off_t start, bool reuse, const char* filename,
std::string* error_msg) {
CHECK_NE(0, prot);
CHECK_NE(0, flags & (MAP_SHARED | MAP_PRIVATE));
+ uintptr_t expected = reinterpret_cast<uintptr_t>(expected_ptr);
+ uintptr_t limit = expected + byte_count;
if (reuse) {
// reuse means it is okay that it overlaps an existing page mapping.
// Only use this if you actually made the page reservation yourself.
- CHECK(expected != nullptr);
+ CHECK(expected_ptr != nullptr);
+ if (!CheckOverlapping(expected, limit, error_msg)) {
+ return nullptr;
+ }
flags |= MAP_FIXED;
} else {
CHECK_EQ(0, flags & MAP_FIXED);
+ if (expected_ptr != nullptr && !CheckNonOverlapping(expected, limit, error_msg)) {
+ return nullptr;
+ }
}
if (byte_count == 0) {
- return new MemMap(filename, nullptr, 0, nullptr, 0, prot);
+ return new MemMap(filename, nullptr, 0, nullptr, 0, prot, false);
}
// Adjust 'offset' to be page-aligned as required by mmap.
int page_offset = start % kPageSize;
off_t page_aligned_offset = start - page_offset;
// Adjust 'byte_count' to be page-aligned as we will map this anyway.
size_t page_aligned_byte_count = RoundUp(byte_count + page_offset, kPageSize);
- // The 'expected' is modified (if specified, ie non-null) to be page aligned to the file but not
- // necessarily to virtual memory. mmap will page align 'expected' for us.
- byte* page_aligned_expected = (expected == nullptr) ? nullptr : (expected - page_offset);
+ // The 'expected_ptr' is modified (if specified, i.e. non-null) to be page aligned to the file
+ // but not necessarily to virtual memory. mmap will page-align 'expected_ptr' for us.
+ byte* page_aligned_expected = (expected_ptr == nullptr) ? nullptr : (expected_ptr - page_offset);
byte* actual = reinterpret_cast<byte*>(mmap(page_aligned_expected,
page_aligned_byte_count,
@@ -373,21 +422,22 @@ MemMap* MemMap::MapFileAtAddress(byte* expected, size_t byte_count, int prot, in
return nullptr;
}
- std::ostringstream check_map_request_error_msg;
- if (!CheckMapRequest(expected, actual, page_aligned_byte_count, &check_map_request_error_msg)) {
- *error_msg = check_map_request_error_msg.str();
+ if (!CheckMapRequest(expected_ptr, actual, page_aligned_byte_count, error_msg)) {
return nullptr;
}
return new MemMap(filename, actual + page_offset, byte_count, actual, page_aligned_byte_count,
- prot);
+ prot, reuse);
}
MemMap::~MemMap() {
if (base_begin_ == nullptr && base_size_ == 0) {
return;
}
- int result = munmap(base_begin_, base_size_);
- if (result == -1) {
- PLOG(FATAL) << "munmap failed";
+ if (!reuse_) {
+ int result = munmap(base_begin_, base_size_);
+ if (result == -1) {
+ PLOG(FATAL) << "munmap failed";
+ }
}
// Remove it from maps_.
@@ -405,9 +455,9 @@ MemMap::~MemMap() {
}
MemMap::MemMap(const std::string& name, byte* begin, size_t size, void* base_begin,
- size_t base_size, int prot)
+ size_t base_size, int prot, bool reuse)
: name_(name), begin_(begin), size_(size), base_begin_(base_begin), base_size_(base_size),
- prot_(prot) {
+ prot_(prot), reuse_(reuse) {
if (size_ == 0) {
CHECK(begin_ == nullptr);
CHECK(base_begin_ == nullptr);
@@ -437,7 +487,7 @@ MemMap* MemMap::RemapAtEnd(byte* new_end, const char* tail_name, int tail_prot,
byte* new_base_end = new_end;
DCHECK_LE(new_base_end, old_base_end);
if (new_base_end == old_base_end) {
- return new MemMap(tail_name, nullptr, 0, nullptr, 0, tail_prot);
+ return new MemMap(tail_name, nullptr, 0, nullptr, 0, tail_prot, false);
}
size_ = new_end - reinterpret_cast<byte*>(begin_);
base_size_ = new_base_end - reinterpret_cast<byte*>(base_begin_);
@@ -489,7 +539,7 @@ MemMap* MemMap::RemapAtEnd(byte* new_end, const char* tail_name, int tail_prot,
maps.c_str());
return nullptr;
}
- return new MemMap(tail_name, actual, tail_size, actual, tail_base_size, tail_prot);
+ return new MemMap(tail_name, actual, tail_size, actual, tail_base_size, tail_prot, false);
}
void MemMap::MadviseDontNeedAndZero() {
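
The interval logic behind the new CheckOverlapping/CheckNonOverlapping helpers, reduced to two predicates over half-open ranges [begin, end). The three-case test in the diff is equivalent to the single intersection test below.

    #include <cassert>
    #include <cstdint>

    static bool Intersects(uintptr_t b1, uintptr_t e1, uintptr_t b2, uintptr_t e2) {
      return b1 < e2 && e1 > b2;    // what CheckNonOverlapping must reject
    }
    static bool ContainedIn(uintptr_t b1, uintptr_t e1, uintptr_t b2, uintptr_t e2) {
      return b1 >= b2 && e1 <= e2;  // what CheckOverlapping requires of one map entry
    }

    int main() {
      assert(Intersects(0x1000, 0x2000, 0x1800, 0x3000));
      assert(!Intersects(0x1000, 0x2000, 0x2000, 0x3000));  // touching, not overlapping
      assert(ContainedIn(0x1800, 0x1c00, 0x1000, 0x2000));
      return 0;
    }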
diff --git a/runtime/mem_map.h b/runtime/mem_map.h
index defa6a5..872c63b 100644
--- a/runtime/mem_map.h
+++ b/runtime/mem_map.h
@@ -73,7 +73,9 @@ class MemMap {
// Map part of a file, taking care of non-page aligned offsets. The
// "start" offset is absolute, not relative. This version allows
- // requesting a specific address for the base of the mapping.
+ // requesting a specific address for the base of the
+ // mapping. "reuse" allows us to create a view into an existing
+ // mapping where we do not take ownership of the memory.
//
// On success, returns a MemMap instance. On failure, returns NULL.
static MemMap* MapFileAtAddress(byte* addr, size_t byte_count, int prot, int flags, int fd,
@@ -134,7 +136,7 @@ class MemMap {
private:
MemMap(const std::string& name, byte* begin, size_t size, void* base_begin, size_t base_size,
- int prot) LOCKS_EXCLUDED(Locks::mem_maps_lock_);
+ int prot, bool reuse) LOCKS_EXCLUDED(Locks::mem_maps_lock_);
static void DumpMaps(std::ostream& os, const std::multimap<void*, MemMap*>& mem_maps)
LOCKS_EXCLUDED(Locks::mem_maps_lock_);
@@ -145,7 +147,7 @@ class MemMap {
static MemMap* GetLargestMemMapAt(void* address)
EXCLUSIVE_LOCKS_REQUIRED(Locks::mem_maps_lock_);
- std::string name_;
+ const std::string name_;
byte* const begin_; // Start of data.
size_t size_; // Length of data.
@@ -153,6 +155,11 @@ class MemMap {
size_t base_size_; // Length of mapping. May be changed by RemapAtEnd (ie Zygote).
int prot_; // Protection of the map.
+ // When reuse_ is true, this is just a view of an existing mapping; we do
+ // not take ownership and are not responsible for unmapping.
+ const bool reuse_;
+
#if USE_ART_LOW_4G_ALLOCATOR
static uintptr_t next_mem_pos_; // Next memory location to check for low_4g extent.
#endif
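
The ownership rule the new reuse_ flag encodes, as a minimal RAII sketch (names are illustrative, not ART's MemMap API): a reused mapping is a view into memory someone else reserved, so only the owner calls munmap.

    #include <sys/mman.h>
    #include <cstddef>

    class MappingSketch {
     public:
      MappingSketch(void* base, size_t size, bool owns)
          : base_(base), size_(size), owns_(owns) {}
      ~MappingSketch() {
        if (owns_ && base_ != nullptr) {
          munmap(base_, size_);  // views (owns_ == false) leave the memory alone
        }
      }
     private:
      void* const base_;
      const size_t size_;
      const bool owns_;  // inverse of MemMap::reuse_
    };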
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index 4882728..8eacb1c 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -157,12 +157,12 @@ ArtMethod* ArtMethod::FindOverriddenMethod() {
}
}
}
-#ifndef NDEBUG
- StackHandleScope<2> hs(Thread::Current());
- MethodHelper result_mh(hs.NewHandle(result));
- MethodHelper this_mh(hs.NewHandle(this));
- DCHECK(result == NULL || this_mh.HasSameNameAndSignature(&result_mh));
-#endif
+ if (kIsDebugBuild) {
+ StackHandleScope<2> hs(Thread::Current());
+ MethodHelper result_mh(hs.NewHandle(result));
+ MethodHelper this_mh(hs.NewHandle(this));
+ DCHECK(result == nullptr || this_mh.HasSameNameAndSignature(&result_mh));
+ }
return result;
}
diff --git a/runtime/monitor_pool.cc b/runtime/monitor_pool.cc
index 440a6be..4964aa0 100644
--- a/runtime/monitor_pool.cc
+++ b/runtime/monitor_pool.cc
@@ -52,7 +52,7 @@ void MonitorPool::AllocateChunk() {
monitor_chunks_.StoreRelaxed(new_backing);
capacity_ = new_capacity;
old_chunk_arrays_.push_back(old_backing);
- LOG(INFO) << "Resizing to capacity " << capacity_;
+ VLOG(monitor) << "Resizing to capacity " << capacity_;
}
}
@@ -64,7 +64,7 @@ void MonitorPool::AllocateChunk() {
CHECK_EQ(0U, reinterpret_cast<uintptr_t>(chunk) % kMonitorAlignment);
// Add the chunk.
- *(monitor_chunks_.LoadRelaxed()+num_chunks_) = reinterpret_cast<uintptr_t>(chunk);
+ *(monitor_chunks_.LoadRelaxed() + num_chunks_) = reinterpret_cast<uintptr_t>(chunk);
num_chunks_++;
// Set up the free list
@@ -96,7 +96,7 @@ Monitor* MonitorPool::CreateMonitorInPool(Thread* self, Thread* owner, mirror::O
// Enough space, or need to resize?
if (first_free_ == nullptr) {
- LOG(INFO) << "Allocating a new chunk.";
+ VLOG(monitor) << "Allocating a new chunk.";
AllocateChunk();
}
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index ac1a310..0af2c22 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -275,8 +275,96 @@ static void CopyProfileFile(const char* oldfile, const char* newfile) {
}
}
-static jboolean IsDexOptNeededInternal(JNIEnv* env, const char* filename,
+// Java: dalvik.system.DexFile.UP_TO_DATE
+static const jbyte kUpToDate = 0;
+// Java: dalvik.system.DexFile.PATCHOAT_NEEDED
+static const jbyte kPatchoatNeeded = 1;
+// Java: dalvik.system.DexFile.DEXOPT_NEEDED
+static const jbyte kDexoptNeeded = 2;
+
+template <const bool kVerboseLogging, const bool kReasonLogging>
+static jbyte IsDexOptNeededForFile(const std::string& oat_filename, const char* filename,
+ InstructionSet target_instruction_set) {
+ std::string error_msg;
+ std::unique_ptr<const OatFile> oat_file(OatFile::Open(oat_filename, oat_filename, nullptr,
+ false, &error_msg));
+ if (oat_file.get() == nullptr) {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded failed to open oat file '" << oat_filename
+ << "' for file location '" << filename << "': " << error_msg;
+ }
+ error_msg.clear();
+ return kDexoptNeeded;
+ }
+ bool should_relocate_if_possible = Runtime::Current()->ShouldRelocate();
+ uint32_t location_checksum = 0;
+ const art::OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(filename, nullptr,
+ kReasonLogging);
+ if (oat_dex_file != nullptr) {
+ // If it's not possible to read the classes.dex, assume up-to-date as we won't be able to
+ // compile it anyway.
+ if (!DexFile::GetChecksum(filename, &location_checksum, &error_msg)) {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded found precompiled stripped file: "
+ << filename << " for " << oat_filename << ": " << error_msg;
+ }
+ if (ClassLinker::VerifyOatChecksums(oat_file.get(), target_instruction_set, &error_msg)) {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded file " << oat_filename
+ << " is up-to-date for " << filename;
+ }
+ return kUpToDate;
+ } else if (should_relocate_if_possible &&
+ ClassLinker::VerifyOatImageChecksum(oat_file.get(), target_instruction_set)) {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded file " << oat_filename
+ << " needs to be relocated for " << filename;
+ }
+ return kPatchoatNeeded;
+ } else {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded file " << oat_filename
+ << " is out of date for " << filename;
+ }
+ return kDexoptNeeded;
+ }
+ // If we get here the file is out of date and we should use the system one to relocate.
+ } else {
+ if (ClassLinker::VerifyOatAndDexFileChecksums(oat_file.get(), filename, location_checksum,
+ target_instruction_set, &error_msg)) {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded file " << oat_filename
+ << " is up-to-date for " << filename;
+ }
+ return kUpToDate;
+ } else if (location_checksum == oat_dex_file->GetDexFileLocationChecksum()
+ && should_relocate_if_possible
+ && ClassLinker::VerifyOatImageChecksum(oat_file.get(), target_instruction_set)) {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded file " << oat_filename
+ << " needs to be relocated for " << filename;
+ }
+ return kPatchoatNeeded;
+ } else {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded file " << oat_filename
+ << " is out of date for " << filename;
+ }
+ return kDexoptNeeded;
+ }
+ }
+ } else {
+ if (kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeeded file " << oat_filename
+ << " does not contain " << filename;
+ }
+ return kDexoptNeeded;
+ }
+}
+
+static jbyte IsDexOptNeededInternal(JNIEnv* env, const char* filename,
const char* pkgname, const char* instruction_set, const jboolean defer) {
+ // TODO: Disable this logging.
const bool kVerboseLogging = false; // Spammy logging.
const bool kReasonLogging = true; // Logging of reason for returning JNI_TRUE.
@@ -285,7 +373,7 @@ static jboolean IsDexOptNeededInternal(JNIEnv* env, const char* filename,
ScopedLocalRef<jclass> fnfe(env, env->FindClass("java/io/FileNotFoundException"));
const char* message = (filename == nullptr) ? "<empty file name>" : filename;
env->ThrowNew(fnfe.get(), message);
- return JNI_FALSE;
+ return kUpToDate;
}
// Always treat elements of the bootclasspath as up-to-date. The
@@ -301,78 +389,45 @@ static jboolean IsDexOptNeededInternal(JNIEnv* env, const char* filename,
if (kVerboseLogging) {
LOG(INFO) << "DexFile_isDexOptNeeded ignoring boot class path file: " << filename;
}
- return JNI_FALSE;
+ return kUpToDate;
}
}
- const InstructionSet target_instruction_set = GetInstructionSetFromString(instruction_set);
-
- // Check if we have an odex file next to the dex file.
- std::string odex_filename(DexFilenameToOdexFilename(filename, kRuntimeISA));
- std::string error_msg;
- std::unique_ptr<const OatFile> oat_file(OatFile::Open(odex_filename, odex_filename, NULL, false,
- &error_msg));
- if (oat_file.get() == nullptr) {
- if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded failed to open oat file '" << filename
- << "': " << error_msg;
- }
- error_msg.clear();
- } else {
- const art::OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(filename, NULL,
- kReasonLogging);
- if (oat_dex_file != nullptr) {
- uint32_t location_checksum;
- // If its not possible to read the classes.dex assume up-to-date as we won't be able to
- // compile it anyway.
- if (!DexFile::GetChecksum(filename, &location_checksum, &error_msg)) {
- if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded ignoring precompiled stripped file: "
- << filename << ": " << error_msg;
- }
- return JNI_FALSE;
- }
- if (ClassLinker::VerifyOatFileChecksums(oat_file.get(), filename, location_checksum,
- target_instruction_set,
- &error_msg)) {
- if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded precompiled file " << odex_filename
- << " has an up-to-date checksum compared to " << filename;
- }
- return JNI_FALSE;
- } else {
- if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded found precompiled file " << odex_filename
- << " with an out-of-date checksum compared to " << filename
- << ": " << error_msg;
- }
- error_msg.clear();
- }
- }
- }
+ bool force_system_only = false;
+ bool require_system_version = false;
// Check the profile file. We need to rerun dex2oat if the profile has changed significantly
// since the last time, or it's new.
// If the 'defer' argument is true then this will be retried later. In this case we
// need to make sure that the profile file copy is not made so that we will get the
// same result second time.
+ std::string profile_file;
+ std::string prev_profile_file;
+ bool should_copy_profile = false;
if (Runtime::Current()->GetProfilerOptions().IsEnabled() && (pkgname != nullptr)) {
- const std::string profile_file = GetDalvikCacheOrDie("profiles", false /* create_if_absent */)
+ profile_file = GetDalvikCacheOrDie("profiles", false /* create_if_absent */)
+ std::string("/") + pkgname;
- const std::string prev_profile_file = profile_file + std::string("@old");
+ prev_profile_file = profile_file + std::string("@old");
struct stat profstat, prevstat;
int e1 = stat(profile_file.c_str(), &profstat);
+ int e1_errno = errno;
int e2 = stat(prev_profile_file.c_str(), &prevstat);
+ int e2_errno = errno;
if (e1 < 0) {
- // No profile file, need to run dex2oat
- if (kReasonLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded profile file " << profile_file << " doesn't exist";
+ if (e1_errno != EACCES) {
+ // No profile file, need to run dex2oat, unless we find a file in system
+ if (kReasonLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeededInternal profile file " << profile_file
+           << " doesn't exist. Will check odex to see if we can find a working version.";
+ }
+ // Force it to only accept system files/files with versions in system.
+ require_system_version = true;
+ } else {
+ LOG(INFO) << "DexFile_isDexOptNeededInternal received EACCES trying to stat profile file "
+ << profile_file;
}
- return JNI_TRUE;
- }
-
- if (e2 == 0) {
+ } else if (e2 == 0) {
// There is a previous profile file. Check if the profile has changed significantly.
// A change in profile is considered significant if X% (change_thr property) of the top K%
// (compile_thr property) samples has changed.
@@ -384,7 +439,7 @@ static jboolean IsDexOptNeededInternal(JNIEnv* env, const char* filename,
bool old_ok = old_profile.LoadFile(prev_profile_file);
if (!new_ok || !old_ok) {
if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded Ignoring invalid profiles: "
+ LOG(INFO) << "DexFile_isDexOptNeededInternal Ignoring invalid profiles: "
<< (new_ok ? "" : profile_file) << " " << (old_ok ? "" : prev_profile_file);
}
} else {
@@ -393,7 +448,7 @@ static jboolean IsDexOptNeededInternal(JNIEnv* env, const char* filename,
old_profile.GetTopKSamples(old_top_k, top_k_threshold);
if (new_top_k.empty()) {
if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded empty profile: " << profile_file;
+ LOG(INFO) << "DexFile_isDexOptNeededInternal empty profile: " << profile_file;
}
// If the new topK is empty we shouldn't optimize so we leave the change_percent at 0.0.
} else {
@@ -405,7 +460,7 @@ static jboolean IsDexOptNeededInternal(JNIEnv* env, const char* filename,
if (kVerboseLogging) {
std::set<std::string>::iterator end = diff.end();
for (std::set<std::string>::iterator it = diff.begin(); it != end; it++) {
- LOG(INFO) << "DexFile_isDexOptNeeded new in topK: " << *it;
+ LOG(INFO) << "DexFile_isDexOptNeededInternal new in topK: " << *it;
}
}
}
@@ -413,67 +468,84 @@ static jboolean IsDexOptNeededInternal(JNIEnv* env, const char* filename,
if (change_percent > change_threshold) {
if (kReasonLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded size of new profile file " << profile_file <<
+ LOG(INFO) << "DexFile_isDexOptNeededInternal size of new profile file " << profile_file <<
" is significantly different from old profile file " << prev_profile_file << " (top "
<< top_k_threshold << "% samples changed in proportion of " << change_percent << "%)";
}
- if (!defer) {
- CopyProfileFile(profile_file.c_str(), prev_profile_file.c_str());
- }
- return JNI_TRUE;
+ should_copy_profile = !defer;
+ // Force us to only accept system files.
+ force_system_only = true;
}
- } else {
+ } else if (e2_errno == ENOENT) {
// Previous profile does not exist. Make a copy of the current one.
if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded previous profile doesn't exist: " << prev_profile_file;
- }
- if (!defer) {
- CopyProfileFile(profile_file.c_str(), prev_profile_file.c_str());
+ LOG(INFO) << "DexFile_isDexOptNeededInternal previous profile doesn't exist: "
+           << prev_profile_file;
}
+ should_copy_profile = !defer;
+ } else {
+ PLOG(INFO) << "Unable to stat previous profile file " << prev_profile_file;
}
}
- // Check if we have an oat file in the cache
- const std::string cache_dir(GetDalvikCacheOrDie(instruction_set));
- const std::string cache_location(
- GetDalvikCacheFilenameOrDie(filename, cache_dir.c_str()));
- oat_file.reset(OatFile::Open(cache_location, filename, NULL, false, &error_msg));
- if (oat_file.get() == nullptr) {
- if (kReasonLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded cache file " << cache_location
- << " does not exist for " << filename << ": " << error_msg;
+ const InstructionSet target_instruction_set = GetInstructionSetFromString(instruction_set);
+
+ // Get the filename for odex file next to the dex file.
+ std::string odex_filename(DexFilenameToOdexFilename(filename, target_instruction_set));
+ // Get the filename for the dalvik-cache file
+ std::string cache_dir;
+ bool have_android_data = false;
+ bool dalvik_cache_exists = false;
+ GetDalvikCache(instruction_set, false, &cache_dir, &have_android_data, &dalvik_cache_exists);
+ std::string cache_filename; // was cache_location
+ bool have_cache_filename = false;
+ if (dalvik_cache_exists) {
+ std::string error_msg;
+ have_cache_filename = GetDalvikCacheFilename(filename, cache_dir.c_str(), &cache_filename,
+ &error_msg);
+ if (!have_cache_filename && kVerboseLogging) {
+ LOG(INFO) << "DexFile_isDexOptNeededInternal failed to find cache file for dex file "
+           << filename << ": " << error_msg;
}
- return JNI_TRUE;
}
- uint32_t location_checksum;
- if (!DexFile::GetChecksum(filename, &location_checksum, &error_msg)) {
- if (kReasonLogging) {
- LOG(ERROR) << "DexFile_isDexOptNeeded failed to compute checksum of " << filename
- << " (error " << error_msg << ")";
+ bool should_relocate_if_possible = Runtime::Current()->ShouldRelocate();
+
+ InstructionSet isa = Runtime::Current()->GetInstructionSet();
+ jbyte dalvik_cache_decision = -1;
+ // Let's try the cache first, since we want to load from there: that's where the
+ // relocated versions will be.
+ if (have_cache_filename && !force_system_only) {
+ // We can use the dalvik-cache if we find a good file.
+ dalvik_cache_decision =
+ IsDexOptNeededForFile<kVerboseLogging, kReasonLogging>(cache_filename, filename, isa);
+ // We will only return DexOptNeeded if both the cache and system return it.
+ if (dalvik_cache_decision != kDexoptNeeded && !require_system_version) {
+ CHECK(!(dalvik_cache_decision == kPatchoatNeeded && !should_relocate_if_possible))
+ << "May not return PatchoatNeeded when patching is disabled.";
+ return dalvik_cache_decision;
}
- return JNI_TRUE;
+ // We couldn't find a usable one there. We should now try the system.
}
- if (!ClassLinker::VerifyOatFileChecksums(oat_file.get(), filename, location_checksum,
- target_instruction_set, &error_msg)) {
- if (kReasonLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded cache file " << cache_location
- << " has out-of-date checksum compared to " << filename
- << " (error " << error_msg << ")";
- }
- return JNI_TRUE;
+ jbyte system_decision =
+ IsDexOptNeededForFile<kVerboseLogging, kReasonLogging>(odex_filename, filename, isa);
+ CHECK(!(system_decision == kPatchoatNeeded && !should_relocate_if_possible))
+ << "May not return PatchoatNeeded when patching is disabled.";
+
+ if (require_system_version && system_decision == kPatchoatNeeded
+ && dalvik_cache_decision == kUpToDate) {
+ // We have a version from system relocated to the cache. Return it.
+ return dalvik_cache_decision;
}
- if (kVerboseLogging) {
- LOG(INFO) << "DexFile_isDexOptNeeded cache file " << cache_location
- << " is up-to-date for " << filename;
+ if (should_copy_profile && system_decision == kDexoptNeeded) {
+ CopyProfileFile(profile_file.c_str(), prev_profile_file.c_str());
}
- CHECK(error_msg.empty()) << error_msg;
- return JNI_FALSE;
+
+ return system_decision;
}
-static jboolean DexFile_isDexOptNeededInternal(JNIEnv* env, jclass, jstring javaFilename,
+static jbyte DexFile_isDexOptNeededInternal(JNIEnv* env, jclass, jstring javaFilename,
jstring javaPkgname, jstring javaInstructionSet, jboolean defer) {
ScopedUtfChars filename(env, javaFilename);
NullableScopedUtfChars pkgname(env, javaPkgname);
@@ -487,8 +559,8 @@ static jboolean DexFile_isDexOptNeededInternal(JNIEnv* env, jclass, jstring java
static jboolean DexFile_isDexOptNeeded(JNIEnv* env, jclass, jstring javaFilename) {
const char* instruction_set = GetInstructionSetString(kRuntimeISA);
ScopedUtfChars filename(env, javaFilename);
- return IsDexOptNeededInternal(env, filename.c_str(), nullptr /* pkgname */,
- instruction_set, false /* defer */);
+ return kUpToDate != IsDexOptNeededInternal(env, filename.c_str(), nullptr /* pkgname */,
+ instruction_set, false /* defer */);
}
@@ -497,7 +569,7 @@ static JNINativeMethod gMethods[] = {
NATIVE_METHOD(DexFile, defineClassNative, "(Ljava/lang/String;Ljava/lang/ClassLoader;J)Ljava/lang/Class;"),
NATIVE_METHOD(DexFile, getClassNameList, "(J)[Ljava/lang/String;"),
NATIVE_METHOD(DexFile, isDexOptNeeded, "(Ljava/lang/String;)Z"),
- NATIVE_METHOD(DexFile, isDexOptNeededInternal, "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Z)Z"),
+ NATIVE_METHOD(DexFile, isDexOptNeededInternal, "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Z)B"),
NATIVE_METHOD(DexFile, openDexFile, "(Ljava/lang/String;Ljava/lang/String;I)J"),
};
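
The combination rule the rewritten IsDexOptNeededInternal implements, boiled down to the cache/system decisions (eliding force_system_only and the no-cache case): a usable dalvik-cache answer wins, except that a system file that merely needs relocation is satisfied by an already-relocated copy in the cache. A sketch, not the committed code.

    #include <cstdint>

    enum Decision : int8_t { kUpToDate = 0, kPatchoatNeeded = 1, kDexoptNeeded = 2 };

    static Decision Combine(Decision cache, Decision system, bool require_system_version) {
      if (cache != kDexoptNeeded && !require_system_version) {
        return cache;  // the cache has a usable (up-to-date or patchable) file
      }
      if (require_system_version && system == kPatchoatNeeded && cache == kUpToDate) {
        return cache;  // a relocated copy of the system file already sits in the cache
      }
      return system;
    }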
diff --git a/runtime/oat_file.cc b/runtime/oat_file.cc
index f9cc36a..c4c6b10 100644
--- a/runtime/oat_file.cc
+++ b/runtime/oat_file.cc
@@ -18,6 +18,7 @@
#include <dlfcn.h>
#include <sstream>
+#include <string.h>
#include "base/bit_vector.h"
#include "base/stl_util.h"
@@ -125,6 +126,9 @@ OatFile::OatFile(const std::string& location)
}
OatFile::~OatFile() {
+ for (auto it : oat_dex_files_) {
+ delete[] it.first.data();
+ }
STLDeleteValues(&oat_dex_files_);
if (dlopen_handle_ != NULL) {
dlclose(dlopen_handle_);
@@ -305,8 +309,14 @@ bool OatFile::Setup(std::string* error_msg) {
dex_file_checksum,
dex_file_pointer,
methods_offsets_pointer);
- // Use a StringPiece backed by the oat_dex_file's internal std::string as the key.
- StringPiece key(oat_dex_file->GetDexFileLocation());
+
+ std::string dex_canonical_location_str =
+     DexFile::GetDexCanonicalLocation(dex_file_location.c_str());
+ // Make a copy, since we need to persist it as a key in the object's field.
+ size_t location_size = dex_canonical_location_str.size() + 1;
+ char* dex_canonical_location = new char[location_size];
+ strncpy(dex_canonical_location, dex_canonical_location_str.c_str(), location_size);
+
+ StringPiece key(dex_canonical_location);
oat_dex_files_.Put(key, oat_dex_file);
}
return true;
@@ -329,7 +339,9 @@ const byte* OatFile::End() const {
const OatFile::OatDexFile* OatFile::GetOatDexFile(const char* dex_location,
const uint32_t* dex_location_checksum,
bool warn_if_not_found) const {
- Table::const_iterator it = oat_dex_files_.find(dex_location);
+ std::string dex_canonical_location = DexFile::GetDexCanonicalLocation(dex_location);
+
+ Table::const_iterator it = oat_dex_files_.find(dex_canonical_location);
if (it != oat_dex_files_.end()) {
const OatFile::OatDexFile* oat_dex_file = it->second;
if (dex_location_checksum == NULL ||
@@ -344,15 +356,18 @@ const OatFile::OatDexFile* OatFile::GetOatDexFile(const char* dex_location,
checksum = StringPrintf("0x%08x", *dex_location_checksum);
}
LOG(WARNING) << "Failed to find OatDexFile for DexFile " << dex_location
+ << " (canonical path " << dex_canonical_location << ")"
<< " with checksum " << checksum << " in OatFile " << GetLocation();
if (kIsDebugBuild) {
for (Table::const_iterator it = oat_dex_files_.begin(); it != oat_dex_files_.end(); ++it) {
LOG(WARNING) << "OatFile " << GetLocation()
<< " contains OatDexFile " << it->second->GetDexFileLocation()
+ << " (canonical path " << it->first << ")"
<< " with checksum 0x" << std::hex << it->second->GetDexFileLocationChecksum();
}
}
}
+
return NULL;
}
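
Why the destructor hunk pairs with the Setup() hunk: StringPiece does not own its characters, so a canonical-location key must be backed by storage that outlives the map, and new[] must be matched by delete[]. A simplified sketch of that own-copy-as-key pattern:

    #include <cstring>
    #include <map>

    int main() {
      std::map<const char*, int> table;  // stand-in for Table keyed by StringPiece
      const char* canonical = "/data/app/base.apk";
      size_t n = strlen(canonical) + 1;
      char* key = new char[n];           // owned copy persisted as the key...
      memcpy(key, canonical, n);
      table.emplace(key, 42);
      for (auto& entry : table) {
        delete[] entry.first;            // ...freed with delete[], matching new[]
      }
      return 0;
    }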
diff --git a/runtime/parsed_options.h b/runtime/parsed_options.h
index 668ed9e..3dbe26f 100644
--- a/runtime/parsed_options.h
+++ b/runtime/parsed_options.h
@@ -48,8 +48,6 @@ class ParsedOptions {
std::string native_bridge_library_string_;
CompilerCallbacks* compiler_callbacks_;
bool is_zygote_;
- // TODO Change this to true when we want it on by default.
- static constexpr bool kDefaultMustRelocate = false;
bool must_relocate_;
std::string patchoat_executable_;
bool interpreter_only_;
diff --git a/runtime/proxy_test.cc b/runtime/proxy_test.cc
index bd6656d..3081421 100644
--- a/runtime/proxy_test.cc
+++ b/runtime/proxy_test.cc
@@ -17,14 +17,14 @@
#include <jni.h>
#include <vector>
-#include "common_compiler_test.h"
+#include "common_runtime_test.h"
#include "field_helper.h"
#include "mirror/art_field-inl.h"
#include "scoped_thread_state_change.h"
namespace art {
-class ProxyTest : public CommonCompilerTest {
+class ProxyTest : public CommonRuntimeTest {
public:
// Generate a proxy class with the given name and interfaces. This is a simplification from what
// libcore does to fit to our test needs. We do not check for duplicated interfaces or methods and
@@ -103,6 +103,12 @@ class ProxyTest : public CommonCompilerTest {
soa.Self()->AssertNoPendingException();
return proxyClass;
}
+
+ protected:
+ void SetUpRuntimeOptions(RuntimeOptions *options) OVERRIDE {
+ options->push_back(std::make_pair(StringPrintf("-Ximage:%s", GetLibCoreOatFileName().c_str()),
+ nullptr));
+ }
};
// Creates a proxy class and check ClassHelper works correctly.
diff --git a/runtime/quick/inline_method_analyser.h b/runtime/quick/inline_method_analyser.h
index 982553d..c4d51cb 100644
--- a/runtime/quick/inline_method_analyser.h
+++ b/runtime/quick/inline_method_analyser.h
@@ -48,7 +48,12 @@ enum InlineMethodOpcode : uint16_t {
kIntrinsicMinMaxFloat,
kIntrinsicMinMaxDouble,
kIntrinsicSqrt,
- kIntrinsicGet,
+ kIntrinsicCeil,
+ kIntrinsicFloor,
+ kIntrinsicRint,
+ kIntrinsicRoundFloat,
+ kIntrinsicRoundDouble,
+ kIntrinsicReferenceGet,
kIntrinsicCharAt,
kIntrinsicCompareTo,
kIntrinsicIsEmptyOrLength,
diff --git a/runtime/utils.cc b/runtime/utils.cc
index 52cdcc1..4d49809 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -1236,7 +1236,7 @@ bool GetDalvikCacheFilename(const char* location, const char* cache_location,
return false;
}
std::string cache_file(&location[1]); // skip leading slash
- if (!EndsWith(location, ".dex") && !EndsWith(location, ".art")) {
+ if (!EndsWith(location, ".dex") && !EndsWith(location, ".art") && !EndsWith(location, ".oat")) {
cache_file += "/";
cache_file += DexFile::kClassesDex;
}
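
The flattening rule that ".oat" locations now join, sketched without the cache-directory prefix: drop the leading '/', turn the remaining '/'s into '@', and append a classes.dex component only for locations that are not .dex/.art/.oat files. Assumes an absolute location, as the real function checks.

    #include <string>

    static std::string CacheNameSketch(const std::string& location) {
      std::string name = location.substr(1);  // skip the leading slash
      for (char& c : name) {
        if (c == '/') c = '@';
      }
      auto ends_with = [&name](const std::string& suffix) {
        return name.size() >= suffix.size() &&
               name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
      };
      if (!ends_with(".dex") && !ends_with(".art") && !ends_with(".oat")) {
        name += "@classes.dex";
      }
      return name;
    }

    // CacheNameSketch("/system/framework/boot.oat") == "system@framework@boot.oat"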
diff --git a/runtime/utils_test.cc b/runtime/utils_test.cc
index 7cd5980..d6c90e1 100644
--- a/runtime/utils_test.cc
+++ b/runtime/utils_test.cc
@@ -350,6 +350,8 @@ TEST_F(UtilsTest, GetDalvikCacheFilenameOrDie) {
GetDalvikCacheFilenameOrDie("/system/framework/core.jar", "/foo").c_str());
EXPECT_STREQ("/foo/system@framework@boot.art",
GetDalvikCacheFilenameOrDie("/system/framework/boot.art", "/foo").c_str());
+ EXPECT_STREQ("/foo/system@framework@boot.oat",
+ GetDalvikCacheFilenameOrDie("/system/framework/boot.oat", "/foo").c_str());
}
TEST_F(UtilsTest, GetSystemImageFilename) {
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 18f7626..329b4dc 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -3111,7 +3111,7 @@ mirror::ArtMethod* MethodVerifier::VerifyInvocationArgsFromIterator(T* it, const
} else {
// Check whether the name of the called method is "<init>"
const uint32_t method_idx = (is_range) ? inst->VRegB_3rc() : inst->VRegB_35c();
- if (strcmp(dex_file_->GetMethodName(dex_file_->GetMethodId(method_idx)), "init") != 0) {
+ if (strcmp(dex_file_->GetMethodName(dex_file_->GetMethodId(method_idx)), "<init>") != 0) {
Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "'this' arg must be initialized";
return nullptr;
}
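
Why the old comparison was dead: constructor names in a dex file's string table include the angle brackets, so matching against "init" never fired and the hard verification failure was skipped.

    #include <cassert>
    #include <cstring>

    int main() {
      const char* method_name = "<init>";          // as stored in the dex string table
      assert(strcmp(method_name, "init") != 0);    // old check: never a match
      assert(strcmp(method_name, "<init>") == 0);  // fixed check
      return 0;
    }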
diff --git a/test/015-switch/expected.txt b/test/015-switch/expected.txt
index ca3b518..91b4714 100644
--- a/test/015-switch/expected.txt
+++ b/test/015-switch/expected.txt
@@ -8,3 +8,9 @@ CORRECT (not found)
CORRECT (default only)
CORRECT big sparse / first
CORRECT big sparse / last
+default
+254
+255
+256
+257
+default
diff --git a/test/015-switch/src/Main.java b/test/015-switch/src/Main.java
index 7198e2b..dd97a8c 100644
--- a/test/015-switch/src/Main.java
+++ b/test/015-switch/src/Main.java
@@ -101,5 +101,15 @@ public class Main {
case 100: System.out.print("CORRECT big sparse / last\n"); break;
default: System.out.print("blah!\n"); break;
}
+
+ for (a = 253; a <= 258; a++) {
+ switch (a) {
+ case 254: System.out.println("254"); break;
+ case 255: System.out.println("255"); break;
+ case 256: System.out.println("256"); break;
+ case 257: System.out.println("257"); break;
+ default: System.out.println("default"); break;
+ }
+ }
}
}
diff --git a/test/082-inline-execute/src/Main.java b/test/082-inline-execute/src/Main.java
index 9ecc0a0..56972ff 100644
--- a/test/082-inline-execute/src/Main.java
+++ b/test/082-inline-execute/src/Main.java
@@ -34,6 +34,11 @@ public class Main {
test_Math_max_F();
test_Math_min_D();
test_Math_max_D();
+ test_Math_ceil();
+ test_Math_floor();
+ test_Math_rint();
+ test_Math_round_D();
+ test_Math_round_F();
test_Short_reverseBytes();
test_Integer_reverseBytes();
test_Long_reverseBytes();
@@ -49,6 +54,11 @@ public class Main {
test_StrictMath_max_F();
test_StrictMath_min_D();
test_StrictMath_max_D();
+ test_StrictMath_ceil();
+ test_StrictMath_floor();
+ test_StrictMath_rint();
+ test_StrictMath_round_D();
+ test_StrictMath_round_F();
test_String_charAt();
test_String_compareTo();
test_String_indexOf();
@@ -376,6 +386,104 @@ public class Main {
Assert.assertEquals(Math.max(Double.MIN_VALUE, Double.MAX_VALUE), Double.MAX_VALUE);
}
+ public static void test_Math_ceil() {
+ Assert.assertEquals(Math.ceil(+0.0), +0.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-0.0), -0.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-0.9), -0.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-0.5), -0.0d, 0.0);
+ Assert.assertEquals(Math.ceil(0.0), -0.0d, 0.0);
+ Assert.assertEquals(Math.ceil(+2.0), +2.0d, 0.0);
+ Assert.assertEquals(Math.ceil(+2.1), +3.0d, 0.0);
+ Assert.assertEquals(Math.ceil(+2.5), +3.0d, 0.0);
+ Assert.assertEquals(Math.ceil(+2.9), +3.0d, 0.0);
+ Assert.assertEquals(Math.ceil(+3.0), +3.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-2.0), -2.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-2.1), -2.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-2.5), -2.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-2.9), -2.0d, 0.0);
+ Assert.assertEquals(Math.ceil(-3.0), -3.0d, 0.0);
+ Assert.assertEquals(Math.ceil(Double.NaN), Double.NaN, 0.0);
+ Assert.assertEquals(Math.ceil(Double.POSITIVE_INFINITY), Double.POSITIVE_INFINITY, 0.0);
+ Assert.assertEquals(Math.ceil(Double.NEGATIVE_INFINITY), Double.NEGATIVE_INFINITY, 0.0);
+ }
+
+ public static void test_Math_floor() {
+ Assert.assertEquals(Math.floor(+0.0), +0.0d, 0.0);
+ Assert.assertEquals(Math.floor(-0.0), -0.0d, 0.0);
+ Assert.assertEquals(Math.floor(+2.0), +2.0d, 0.0);
+ Assert.assertEquals(Math.floor(+2.1), +2.0d, 0.0);
+ Assert.assertEquals(Math.floor(+2.5), +2.0d, 0.0);
+ Assert.assertEquals(Math.floor(+2.9), +2.0d, 0.0);
+ Assert.assertEquals(Math.floor(+3.0), +3.0d, 0.0);
+ Assert.assertEquals(Math.floor(-2.0), -2.0d, 0.0);
+ Assert.assertEquals(Math.floor(-2.1), -3.0d, 0.0);
+ Assert.assertEquals(Math.floor(-2.5), -3.0d, 0.0);
+ Assert.assertEquals(Math.floor(-2.9), -3.0d, 0.0);
+ Assert.assertEquals(Math.floor(-3.0), -3.0d, 0.0);
+ Assert.assertEquals(Math.floor(Double.NaN), Double.NaN, 0.0);
+ Assert.assertEquals(Math.floor(Double.POSITIVE_INFINITY), Double.POSITIVE_INFINITY, 0.0);
+ Assert.assertEquals(Math.floor(Double.NEGATIVE_INFINITY), Double.NEGATIVE_INFINITY, 0.0);
+ }
+
+ public static void test_Math_rint() {
+ Assert.assertEquals(Math.rint(+0.0), +0.0d, 0.0);
+ Assert.assertEquals(Math.rint(-0.0), -0.0d, 0.0);
+ Assert.assertEquals(Math.rint(+2.0), +2.0d, 0.0);
+ Assert.assertEquals(Math.rint(+2.1), +2.0d, 0.0);
+ Assert.assertEquals(Math.rint(+2.5), +2.0d, 0.0);
+ Assert.assertEquals(Math.rint(+2.9), +3.0d, 0.0);
+ Assert.assertEquals(Math.rint(+3.0), +3.0d, 0.0);
+ Assert.assertEquals(Math.rint(-2.0), -2.0d, 0.0);
+ Assert.assertEquals(Math.rint(-2.1), -2.0d, 0.0);
+ Assert.assertEquals(Math.rint(-2.5), -2.0d, 0.0);
+ Assert.assertEquals(Math.rint(-2.9), -3.0d, 0.0);
+ Assert.assertEquals(Math.rint(-3.0), -3.0d, 0.0);
+ Assert.assertEquals(Math.rint(Double.NaN), Double.NaN, 0.0);
+ Assert.assertEquals(Math.rint(Double.POSITIVE_INFINITY), Double.POSITIVE_INFINITY, 0.0);
+ Assert.assertEquals(Math.rint(Double.NEGATIVE_INFINITY), Double.NEGATIVE_INFINITY, 0.0);
+ }
+
+ public static void test_Math_round_D() {
+ Assert.assertEquals(Math.round(+0.0d), 0L);
+ Assert.assertEquals(Math.round(-0.0d), 0L);
+ Assert.assertEquals(Math.round(2.0d), 2L);
+ Assert.assertEquals(Math.round(2.1d), 2L);
+ Assert.assertEquals(Math.round(2.5d), 3L);
+ Assert.assertEquals(Math.round(2.9d), 3L);
+ Assert.assertEquals(Math.round(3.0d), 3L);
+ Assert.assertEquals(Math.round(-2.0d), -2L);
+ Assert.assertEquals(Math.round(-2.1d), -2L);
+ Assert.assertEquals(Math.round(-2.5d), -2L);
+ Assert.assertEquals(Math.round(-2.9d), -3L);
+ Assert.assertEquals(Math.round(-3.0d), -3L);
+ // Largest double below 0.5: adding 0.5 yields exactly 1.0, so
+ // floor(x + 0.5) rounding produces 1.
+ Assert.assertEquals(Math.round(0.49999999999999994d), 1L);
+ Assert.assertEquals(Math.round(Double.NaN), 0L);
+ Assert.assertEquals(Math.round(Long.MAX_VALUE + 1.0d), Long.MAX_VALUE);
+ Assert.assertEquals(Math.round(Long.MIN_VALUE - 1.0d), Long.MIN_VALUE);
+ }
+
+ public static void test_Math_round_F() {
+ Assert.assertEquals(Math.round(+0.0f), 0);
+ Assert.assertEquals(Math.round(-0.0f), 0);
+ Assert.assertEquals(Math.round(2.0f), 2);
+ Assert.assertEquals(Math.round(2.1f), 2);
+ Assert.assertEquals(Math.round(2.5f), 3);
+ Assert.assertEquals(Math.round(2.9f), 3);
+ Assert.assertEquals(Math.round(3.0f), 3);
+ Assert.assertEquals(Math.round(-2.0f), -2);
+ Assert.assertEquals(Math.round(-2.1f), -2);
+ Assert.assertEquals(Math.round(-2.5f), -2);
+ Assert.assertEquals(Math.round(-2.9f), -3);
+ Assert.assertEquals(Math.round(-3.0f), -3);
+ Assert.assertEquals(Math.round(Float.NaN), 0);
+ Assert.assertEquals(Math.round(Integer.MAX_VALUE + 1.0f), Integer.MAX_VALUE);
+ Assert.assertEquals(Math.round(Integer.MIN_VALUE - 1.0f), Integer.MIN_VALUE);
+ Assert.assertEquals(Math.round(Float.POSITIVE_INFINITY), Integer.MAX_VALUE);
+ Assert.assertEquals(Math.round(Float.NEGATIVE_INFINITY), Integer.MIN_VALUE);
+ }
+
public static void test_StrictMath_abs_I() {
Assert.assertEquals(StrictMath.abs(0), 0);
Assert.assertEquals(StrictMath.abs(123), 123);
@@ -487,6 +595,104 @@ public class Main {
Assert.assertEquals(StrictMath.max(Double.MIN_VALUE, Double.MAX_VALUE), Double.MAX_VALUE);
}
+ public static void test_StrictMath_ceil() {
+ Assert.assertEquals(StrictMath.ceil(+0.0), +0.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-0.0), -0.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-0.9), -0.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-0.5), -0.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(0.0), +0.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(+2.0), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(+2.1), +3.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(+2.5), +3.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(+2.9), +3.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(+3.0), +3.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-2.0), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-2.1), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-2.5), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-2.9), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(-3.0), -3.0d, 0.0);
+ Assert.assertEquals(StrictMath.ceil(Double.NaN), Double.NaN, 0.0);
+ Assert.assertEquals(StrictMath.ceil(Double.POSITIVE_INFINITY), Double.POSITIVE_INFINITY, 0.0);
+ Assert.assertEquals(StrictMath.ceil(Double.NEGATIVE_INFINITY), Double.NEGATIVE_INFINITY, 0.0);
+ }
+
+ public static void test_StrictMath_floor() {
+ Assert.assertEquals(StrictMath.floor(+0.0), +0.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(-0.0), -0.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(+2.0), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(+2.1), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(+2.5), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(+2.9), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(+3.0), +3.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(-2.0), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(-2.1), -3.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(-2.5), -3.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(-2.9), -3.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(-3.0), -3.0d, 0.0);
+ Assert.assertEquals(StrictMath.floor(Double.NaN), Double.NaN, 0.0);
+ Assert.assertEquals(StrictMath.floor(Double.POSITIVE_INFINITY), Double.POSITIVE_INFINITY, 0.0);
+ Assert.assertEquals(StrictMath.floor(Double.NEGATIVE_INFINITY), Double.NEGATIVE_INFINITY, 0.0);
+ }
+
+ public static void test_StrictMath_rint() {
+ Assert.assertEquals(StrictMath.rint(+0.0), +0.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(-0.0), -0.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(+2.0), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(+2.1), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(+2.5), +2.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(+2.9), +3.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(+3.0), +3.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(-2.0), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(-2.1), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(-2.5), -2.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(-2.9), -3.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(-3.0), -3.0d, 0.0);
+ Assert.assertEquals(StrictMath.rint(Double.NaN), Double.NaN, 0.0);
+ Assert.assertEquals(StrictMath.rint(Double.POSITIVE_INFINITY), Double.POSITIVE_INFINITY, 0.0);
+ Assert.assertEquals(StrictMath.rint(Double.NEGATIVE_INFINITY), Double.NEGATIVE_INFINITY, 0.0);
+ }
+
+ public static void test_StrictMath_round_D() {
+ Assert.assertEquals(StrictMath.round(+0.0d), 0L);
+ Assert.assertEquals(StrictMath.round(-0.0d), 0L);
+ Assert.assertEquals(StrictMath.round(2.0d), 2L);
+ Assert.assertEquals(StrictMath.round(2.1d), 2L);
+ Assert.assertEquals(StrictMath.round(2.5d), 3L);
+ Assert.assertEquals(StrictMath.round(2.9d), 3L);
+ Assert.assertEquals(StrictMath.round(3.0d), 3L);
+ Assert.assertEquals(StrictMath.round(-2.0d), -2L);
+ Assert.assertEquals(StrictMath.round(-2.1d), -2L);
+ Assert.assertEquals(StrictMath.round(-2.5d), -2L);
+ Assert.assertEquals(StrictMath.round(-2.9d), -3L);
+ Assert.assertEquals(StrictMath.round(-3.0d), -3L);
+ // Largest double below 0.5: adding 0.5 yields exactly 1.0, so
+ // floor(x + 0.5) rounding produces 1.
+ Assert.assertEquals(StrictMath.round(0.49999999999999994d), 1L);
+ Assert.assertEquals(StrictMath.round(Double.NaN), 0L);
+ Assert.assertEquals(StrictMath.round(Long.MAX_VALUE + 1.0d), Long.MAX_VALUE);
+ Assert.assertEquals(StrictMath.round(Long.MIN_VALUE - 1.0d), Long.MIN_VALUE);
+ }
+
+ public static void test_StrictMath_round_F() {
+ Assert.assertEquals(StrictMath.round(+0.0f), 0);
+ Assert.assertEquals(StrictMath.round(-0.0f), 0);
+ Assert.assertEquals(StrictMath.round(2.0f), 2);
+ Assert.assertEquals(StrictMath.round(2.1f), 2);
+ Assert.assertEquals(StrictMath.round(2.5f), 3);
+ Assert.assertEquals(StrictMath.round(2.9f), 3);
+ Assert.assertEquals(StrictMath.round(3.0f), 3);
+ Assert.assertEquals(StrictMath.round(-2.0f), -2);
+ Assert.assertEquals(StrictMath.round(-2.1f), -2);
+ Assert.assertEquals(StrictMath.round(-2.5f), -2);
+ Assert.assertEquals(StrictMath.round(-2.9f), -3);
+ Assert.assertEquals(StrictMath.round(-3.0f), -3);
+ Assert.assertEquals(StrictMath.round(Float.NaN), 0);
+ Assert.assertEquals(StrictMath.round(Integer.MAX_VALUE + 1.0f), Integer.MAX_VALUE);
+ Assert.assertEquals(StrictMath.round(Integer.MIN_VALUE - 1.0f), Integer.MIN_VALUE);
+ Assert.assertEquals(StrictMath.round(Float.POSITIVE_INFINITY), Integer.MAX_VALUE);
+ Assert.assertEquals(StrictMath.round(Float.NEGATIVE_INFINITY), Integer.MIN_VALUE);
+ }
+
public static void test_Float_floatToRawIntBits() {
Assert.assertEquals(Float.floatToRawIntBits(-1.0f), 0xbf800000);
Assert.assertEquals(Float.floatToRawIntBits(0.0f), 0);
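A note on the round(double) assertions above: they pin down three edge behaviors of the round contract being intrinsified here: NaN maps to 0, ties round toward positive infinity, and out-of-range inputs saturate at the long bounds. A minimal C++ sketch of that contract (an illustration for readers, not ART's intrinsic code):

#include <cmath>
#include <cstdint>

// Illustrative model of the round(double) behavior the tests above encode.
int64_t RoundDouble(double x) {
  if (std::isnan(x)) {
    return 0;  // round(NaN) == 0
  }
  double biased = std::floor(x + 0.5);    // ties round toward +infinity
  if (biased >= 9223372036854775808.0) {  // >= 2^63: saturate at Long.MAX_VALUE
    return INT64_MAX;
  }
  if (biased < -9223372036854775808.0) {  // < -2^63: saturate at Long.MIN_VALUE
    return INT64_MIN;
  }
  return static_cast<int64_t>(biased);
}

The float variant is analogous with int bounds (2^31). This model also explains the 0.49999999999999994 case: it is the largest double below 0.5, and adding 0.5 to it in double arithmetic yields exactly 1.0, so the expected result is 1 rather than 0.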
diff --git a/test/115-native-bridge/expected.txt b/test/115-native-bridge/expected.txt
index f852620..5b41606 100644
--- a/test/115-native-bridge/expected.txt
+++ b/test/115-native-bridge/expected.txt
@@ -1,13 +1,55 @@
Ready for native bridge tests.
Native bridge initialized.
Checking for support.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
-Getting trampoline.
+Getting trampoline for JNI_OnLoad with shorty (null).
+Test ART callbacks: number of JNI functions is 9.
+ name:booleanMethod, signature:(ZZZZZZZZZZ)Z, shorty:ZZZZZZZZZZZ.
+ name:byteMethod, signature:(BBBBBBBBBB)B, shorty:BBBBBBBBBBB.
+ name:charMethod, signature:(CCCCCCCCCC)C, shorty:CCCCCCCCCCC.
+ name:shortMethod, signature:(SSSSSSSSSS)S, shorty:SSSSSSSSSSS.
+ name:testCallStaticVoidMethodOnSubClassNative, signature:()V, shorty:V.
+ name:testFindClassOnAttachedNativeThread, signature:()V, shorty:V.
+ name:testFindFieldOnAttachedNativeThreadNative, signature:()V, shorty:V.
+ name:testGetMirandaMethodNative, signature:()Ljava/lang/reflect/Method;, shorty:L.
+ name:testZeroLengthByteBuffers, signature:()V, shorty:V.
+trampoline_JNI_OnLoad called!
+Getting trampoline for Java_Main_testFindClassOnAttachedNativeThread with shorty V.
+trampoline_Java_Main_testFindClassOnAttachedNativeThread called!
+Getting trampoline for Java_Main_testFindFieldOnAttachedNativeThreadNative with shorty V.
+trampoline_Java_Main_testFindFieldOnAttachedNativeThreadNative called!
+Getting trampoline for Java_Main_testCallStaticVoidMethodOnSubClassNative with shorty V.
+trampoline_Java_Main_testCallStaticVoidMethodOnSubClassNative called!
+Getting trampoline for Java_Main_testGetMirandaMethodNative with shorty L.
+trampoline_Java_Main_testGetMirandaMethodNative called!
+Getting trampoline for Java_Main_testZeroLengthByteBuffers with shorty V.
+trampoline_Java_Main_testZeroLengthByteBuffers called!
+Getting trampoline for Java_Main_byteMethod with shorty BBBBBBBBBBB.
+trampoline_Java_Main_byteMethod called!
+trampoline_Java_Main_byteMethod called!
+trampoline_Java_Main_byteMethod called!
+trampoline_Java_Main_byteMethod called!
+trampoline_Java_Main_byteMethod called!
+trampoline_Java_Main_byteMethod called!
+trampoline_Java_Main_byteMethod called!
+Getting trampoline for Java_Main_shortMethod with shorty SSSSSSSSSSS.
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+trampoline_Java_Main_shortMethod called!
+Getting trampoline for Java_Main_booleanMethod with shorty ZZZZZZZZZZZ.
+trampoline_Java_Main_booleanMethod called!
+trampoline_Java_Main_booleanMethod called!
+Getting trampoline for Java_Main_charMethod with shorty CCCCCCCCCCC.
+trampoline_Java_Main_charMethod called!
+trampoline_Java_Main_charMethod called!
+trampoline_Java_Main_charMethod called!
+trampoline_Java_Main_charMethod called!
+trampoline_Java_Main_charMethod called!
+trampoline_Java_Main_charMethod called!
+trampoline_Java_Main_charMethod called!
+trampoline_Java_Main_charMethod called!
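The log lines above use the "shorty" form of a signature: the return type character first, then one character per argument, with every reference type (class or array) collapsed to 'L'. So (ZZZZZZZZZZ)Z becomes ZZZZZZZZZZZ, and ()Ljava/lang/reflect/Method; becomes just L. A hypothetical helper showing the mapping (the test itself obtains shorties from the runtime via getMethodShorty; this function is not part of the patch):

#include <string>

// Derive a shorty from a JNI signature string. Illustrative only.
std::string Shorty(const std::string& sig) {
  size_t close = sig.find(')');
  std::string shorty;
  char ret = sig[close + 1];
  shorty += (ret == 'L' || ret == '[') ? 'L' : ret;  // return type leads
  for (size_t i = 1; i < close; ++i) {
    char c = sig[i];
    if (c == '[') {                              // arrays are reference types
      while (sig[i] == '[') ++i;                 // skip dimensions
      if (sig[i] == 'L') i = sig.find(';', i);   // and any element class name
      shorty += 'L';
    } else if (c == 'L') {                       // class type: skip to the ';'
      i = sig.find(';', i);
      shorty += 'L';
    } else {
      shorty += c;                               // primitives map to themselves
    }
  }
  return shorty;
}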
diff --git a/test/115-native-bridge/nativebridge.cc b/test/115-native-bridge/nativebridge.cc
index bd3ae13..82211a5 100644
--- a/test/115-native-bridge/nativebridge.cc
+++ b/test/115-native-bridge/nativebridge.cc
@@ -44,13 +44,192 @@ struct NativeBridgeCallbacks {
bool (*isSupported)(const char* libpath);
};
+struct NativeBridgeMethod {
+ const char* name; // JNI method name, without the "Java_Main_" prefix.
+ const char* signature; // JNI signature, e.g. "(ZZZZZZZZZZ)Z".
+ bool static_method; // Look up with GetStaticMethodID rather than GetMethodID.
+ void* fnPtr; // Real implementation; filled in lazily via dlsym().
+ void* trampoline; // Logging wrapper handed back to the runtime.
+};
+
+static NativeBridgeMethod* find_native_bridge_method(const char* name);
+// ART callbacks saved in native_bridge_initialize(); used from JNI_OnLoad to
+// enumerate Main's native methods and query their shorties.
+static NativeBridgeArtCallbacks* gNativeBridgeArtCallbacks;
+
+static jint trampoline_JNI_OnLoad(JavaVM* vm, void* reserved) {
+ JNIEnv* env = nullptr;
+ typedef jint (*FnPtr_t)(JavaVM*, void*);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>(find_native_bridge_method("JNI_OnLoad")->fnPtr);
+
+ vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION_1_6);
+ if (env == nullptr) {
+ return 0;
+ }
+
+ jclass klass = env->FindClass("Main");
+ if (klass != nullptr) {
+ int i, count1, count2;
+ count1 = gNativeBridgeArtCallbacks->getNativeMethodCount(env, klass);
+ std::unique_ptr<JNINativeMethod[]> methods(new JNINativeMethod[count1]);
+ if (methods == nullptr) {
+ return 0;
+ }
+ count2 = gNativeBridgeArtCallbacks->getNativeMethods(env, klass, methods.get(), count1);
+ if (count1 == count2) {
+ printf("Test ART callbacks: all JNI function number is %d.\n", count1);
+ }
+
+ for (i = 0; i < count1; i++) {
+ NativeBridgeMethod* nb_method = find_native_bridge_method(methods[i].name);
+ if (nb_method != nullptr) {
+ jmethodID mid = nullptr;
+ if (nb_method->static_method) {
+ mid = env->GetStaticMethodID(klass, methods[i].name, nb_method->signature);
+ } else {
+ mid = env->GetMethodID(klass, methods[i].name, nb_method->signature);
+ }
+ if (mid != nullptr) {
+ const char* shorty = gNativeBridgeArtCallbacks->getMethodShorty(env, mid);
+ // getNativeMethods() reports each method's shorty in the signature field,
+ // so it should match the result of getMethodShorty().
+ if (strcmp(shorty, methods[i].signature) == 0) {
+ printf(" name:%s, signature:%s, shorty:%s.\n",
+ methods[i].name, nb_method->signature, shorty);
+ }
+ }
+ }
+ }
+ // The unique_ptr frees the methods array when it goes out of scope.
+ }
+
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(vm, reserved);
+}
+
+static void trampoline_Java_Main_testFindClassOnAttachedNativeThread(JNIEnv* env,
+ jclass klass) {
+ typedef void (*FnPtr_t)(JNIEnv*, jclass);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>
+ (find_native_bridge_method("testFindClassOnAttachedNativeThread")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass);
+}
+
+static void trampoline_Java_Main_testFindFieldOnAttachedNativeThreadNative(JNIEnv* env,
+ jclass klass) {
+ typedef void (*FnPtr_t)(JNIEnv*, jclass);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>
+ (find_native_bridge_method("testFindFieldOnAttachedNativeThreadNative")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass);
+}
+
+static void trampoline_Java_Main_testCallStaticVoidMethodOnSubClassNative(JNIEnv* env,
+ jclass klass) {
+ typedef void (*FnPtr_t)(JNIEnv*, jclass);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>
+ (find_native_bridge_method("testCallStaticVoidMethodOnSubClassNative")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass);
+}
+
+static jobject trampoline_Java_Main_testGetMirandaMethodNative(JNIEnv* env, jclass klass) {
+ typedef jobject (*FnPtr_t)(JNIEnv*, jclass);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>
+ (find_native_bridge_method("testGetMirandaMethodNative")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass);
+}
+
+static void trampoline_Java_Main_testZeroLengthByteBuffers(JNIEnv* env, jclass klass) {
+ typedef void (*FnPtr_t)(JNIEnv*, jclass);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>
+ (find_native_bridge_method("testZeroLengthByteBuffers")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass);
+}
+
+static jbyte trampoline_Java_Main_byteMethod(JNIEnv* env, jclass klass, jbyte b1, jbyte b2,
+ jbyte b3, jbyte b4, jbyte b5, jbyte b6,
+ jbyte b7, jbyte b8, jbyte b9, jbyte b10) {
+ typedef jbyte (*FnPtr_t)(JNIEnv*, jclass, jbyte, jbyte, jbyte, jbyte, jbyte,
+ jbyte, jbyte, jbyte, jbyte, jbyte);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>(find_native_bridge_method("byteMethod")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10);
+}
-static std::vector<void*> symbols;
+static jshort trampoline_Java_Main_shortMethod(JNIEnv* env, jclass klass, jshort s1, jshort s2,
+ jshort s3, jshort s4, jshort s5, jshort s6,
+ jshort s7, jshort s8, jshort s9, jshort s10) {
+ typedef jshort (*FnPtr_t)(JNIEnv*, jclass, jshort, jshort, jshort, jshort, jshort,
+ jshort, jshort, jshort, jshort, jshort);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>(find_native_bridge_method("shortMethod")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10);
+}
+
+static jboolean trampoline_Java_Main_booleanMethod(JNIEnv* env, jclass klass, jboolean b1,
+ jboolean b2, jboolean b3, jboolean b4,
+ jboolean b5, jboolean b6, jboolean b7,
+ jboolean b8, jboolean b9, jboolean b10) {
+ typedef jboolean (*FnPtr_t)(JNIEnv*, jclass, jboolean, jboolean, jboolean, jboolean, jboolean,
+ jboolean, jboolean, jboolean, jboolean, jboolean);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>(find_native_bridge_method("booleanMethod")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10);
+}
+
+static jchar trampoline_Java_Main_charMethod(JNIEnv* env, jclass klass, jchar c1, jchar c2,
+ jchar c3, jchar c4, jchar c5, jchar c6,
+ jchar c7, jchar c8, jchar c9, jchar c10) {
+ typedef jchar (*FnPtr_t)(JNIEnv*, jclass, jchar, jchar, jchar, jchar, jchar,
+ jchar, jchar, jchar, jchar, jchar);
+ FnPtr_t fnPtr = reinterpret_cast<FnPtr_t>(find_native_bridge_method("charMethod")->fnPtr);
+ printf("%s called!\n", __FUNCTION__);
+ return fnPtr(env, klass, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10);
+}
+
+// Table of supported methods; the fnPtr slots start null and are filled in by
+// native_bridge_getTrampoline() once the real symbols are resolved.
+NativeBridgeMethod gNativeBridgeMethods[] = {
+ { "JNI_OnLoad", "", true, nullptr,
+ reinterpret_cast<void*>(trampoline_JNI_OnLoad) },
+ { "booleanMethod", "(ZZZZZZZZZZ)Z", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_booleanMethod) },
+ { "byteMethod", "(BBBBBBBBBB)B", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_byteMethod) },
+ { "charMethod", "(CCCCCCCCCC)C", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_charMethod) },
+ { "shortMethod", "(SSSSSSSSSS)S", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_shortMethod) },
+ { "testCallStaticVoidMethodOnSubClassNative", "()V", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_testCallStaticVoidMethodOnSubClassNative) },
+ { "testFindClassOnAttachedNativeThread", "()V", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_testFindClassOnAttachedNativeThread) },
+ { "testFindFieldOnAttachedNativeThreadNative", "()V", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_testFindFieldOnAttachedNativeThreadNative) },
+ { "testGetMirandaMethodNative", "()Ljava/lang/reflect/Method;", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_testGetMirandaMethodNative) },
+ { "testZeroLengthByteBuffers", "()V", true, nullptr,
+ reinterpret_cast<void*>(trampoline_Java_Main_testZeroLengthByteBuffers) },
+};
+
+static NativeBridgeMethod* find_native_bridge_method(const char* name) {
+ // Strip the JNI "Java_Main_" prefix so lookups match the short names above.
+ const char* pname = name;
+ if (strncmp(name, "Java_Main_", 10) == 0) {
+ pname += 10;
+ }
+
+ for (size_t i = 0; i < sizeof(gNativeBridgeMethods) / sizeof(gNativeBridgeMethods[0]); i++) {
+ if (strcmp(pname, gNativeBridgeMethods[i].name) == 0) {
+ return &gNativeBridgeMethods[i];
+ }
+ }
+ return nullptr;
+}
// NativeBridgeCallbacks implementations
extern "C" bool native_bridge_initialize(NativeBridgeArtCallbacks* art_cbs) {
- printf("Native bridge initialized.\n");
+ if (art_cbs != nullptr) {
+ gNativeBridgeArtCallbacks = art_cbs;
+ printf("Native bridge initialized.\n");
+ }
return true;
}
@@ -80,17 +259,16 @@ extern "C" void* native_bridge_loadLibrary(const char* libpath, int flag) {
extern "C" void* native_bridge_getTrampoline(void* handle, const char* name, const char* shorty,
uint32_t len) {
- printf("Getting trampoline.\n");
+ printf("Getting trampoline for %s with shorty %s.\n", name, shorty);
// The name here is actually the JNI name, so we can directly do the lookup.
void* sym = dlsym(handle, name);
- if (sym != nullptr) {
- symbols.push_back(sym);
- }
+ NativeBridgeMethod* method = find_native_bridge_method(name);
+ if (method == nullptr) {
+ return nullptr;
+ }
+ method->fnPtr = sym;
- // As libarttest is the same arch as the host, we can actually directly use the code and do not
- // need to create a trampoline. :-)
- return sym;
+ return method->trampoline;
}
extern "C" bool native_bridge_isSupported(const char* libpath) {
@@ -109,6 +287,3 @@ NativeBridgeCallbacks NativeBridgeItf {
.getTrampoline = &native_bridge_getTrampoline,
.isSupported = &native_bridge_isSupported
};
-
-
-
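To make the flow concrete, here is a hypothetical driver for the interface defined above (not part of the patch; in the real test the runtime performs these steps when binding Main's native methods, and 'env' and 'klass' would come from the caller's JNI context, with <dlfcn.h> supplying RTLD_NOW):

// Sketch of how a runtime consumes the bridge. Illustrative only.
static void CallThroughBridge(JNIEnv* env, jclass klass) {
  void* handle = native_bridge_loadLibrary("libarttest.so", RTLD_NOW);
  // Returns the logging trampoline, not the dlsym()ed symbol; the real
  // implementation is stashed in the matching table entry's fnPtr slot.
  void* entry = native_bridge_getTrampoline(
      handle, "Java_Main_testZeroLengthByteBuffers", "V", 1u);  // 1u: shorty length (assumed meaning of 'len')
  if (entry != nullptr) {
    typedef void (*Fn)(JNIEnv*, jclass);
    reinterpret_cast<Fn>(entry)(env, klass);  // logs "... called!", then forwards
  }
}

This is why the expected output pairs one "Getting trampoline for ..." line with repeated "trampoline_... called!" lines: the trampoline is fetched once per method and then invoked on every call.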
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 5c1bc03..d7ee383 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -81,38 +81,10 @@ endif
# Tests that are broken in --trace mode.
TEST_ART_BROKEN_TRACE_RUN_TESTS := \
- 003-omnibus-opcodes \
- 004-InterfaceTest \
004-SignalTest \
- 004-ThreadStress \
- 005-annotations \
- 012-math \
018-stack-overflow \
- 023-many-interfaces \
- 027-arithmetic \
- 031-class-attributes \
- 037-inherit \
- 044-proxy \
- 046-reflect \
- 051-thread \
- 055-enum-performance \
- 062-character-encodings \
- 064-field-access \
- 074-gc-thrash \
- 078-polymorphic-virtual \
- 080-oom-throw \
- 082-inline-execute \
- 083-compiler-regressions \
- 093-serialization \
097-duplicate-method \
- 100-reflect2 \
- 102-concurrent-gc \
- 103-string-append \
- 107-int-math2 \
- 112-double-math \
- 114-ParallelGC \
- 700-LoadArgRegs \
- 701-easy-div-rem
+ 107-int-math2
ART_TEST_KNOWN_BROKEN += $(foreach test, $(TEST_ART_BROKEN_TRACE_RUN_TESTS), $(call all-run-test-names,$(test),-trace,-relocate))
ART_TEST_KNOWN_BROKEN += $(foreach test, $(TEST_ART_BROKEN_TRACE_RUN_TESTS), $(call all-run-test-names,$(test),-trace,-no-prebuild))
diff --git a/test/run-test b/test/run-test
index aef7c52..ca7e68c 100755
--- a/test/run-test
+++ b/test/run-test
@@ -33,7 +33,11 @@ cd "${progdir}"
progdir=`pwd`
prog="${progdir}"/`basename "${prog}"`
test_dir="test-$$"
-tmp_dir="/tmp/$USER/${test_dir}"
+if [ -z "$TMPDIR" ]; then
+ tmp_dir="/tmp/$USER/${test_dir}"
+else
+ tmp_dir="${TMPDIR}/$USER/${test_dir}"
+fi
export JAVA="java"
export JAVAC="javac -g"