-rw-r--r--  compiler/dex/compiler_enums.h                              |   8
-rw-r--r--  compiler/dex/quick/arm/arm_lir.h                           |  25
-rw-r--r--  compiler/dex/quick/arm/codegen_arm.h                       | 138
-rw-r--r--  compiler/dex/quick/arm/int_arm.cc                          |   9
-rw-r--r--  compiler/dex/quick/arm/target_arm.cc                       | 397
-rw-r--r--  compiler/dex/quick/arm/utility_arm.cc                      |   6
-rwxr-xr-x  compiler/dex/quick/gen_invoke.cc                           |   5
-rw-r--r--  compiler/dex/quick/mir_to_lir.h                            |   8
-rw-r--r--  compiler/dex/quick/quick_compiler.cc                       |  26
-rw-r--r--  compiler/jni/quick/arm/calling_convention_arm.cc           | 132
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc                 |  21
-rw-r--r--  compiler/utils/arm/assembler_arm.cc                        |  18
-rw-r--r--  runtime/Android.mk                                         |   1
-rw-r--r--  runtime/arch/arm/asm_support_arm.h                         |   4
-rw-r--r--  runtime/arch/arm/context_arm.cc                            |  17
-rw-r--r--  runtime/arch/arm/entrypoints_init_arm.cc                   |  47
-rw-r--r--  runtime/arch/arm/quick_entrypoints_arm.S                   | 245
-rw-r--r--  runtime/arch/arm/quick_entrypoints_cc_arm.cc               | 110
-rw-r--r--  runtime/arch/arm/quick_method_frame_info_arm.h             |  19
-rw-r--r--  runtime/arch/arm64/quick_method_frame_info_arm64.h         |   4
-rw-r--r--  runtime/entrypoints/quick/quick_trampoline_entrypoints.cc  | 148
-rw-r--r--  runtime/globals.h                                          |   2
-rw-r--r--  runtime/mirror/art_method.cc                               |   4
23 files changed, 1121 insertions(+), 273 deletions(-)
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index c6c5ca7..beeb3ad 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -60,6 +60,14 @@ enum SpecialTargetRegister {
kFArg5,
kFArg6,
kFArg7,
+ kFArg8,
+ kFArg9,
+ kFArg10,
+ kFArg11,
+ kFArg12,
+ kFArg13,
+ kFArg14,
+ kFArg15,
kRet0,
kRet1,
kInvokeTgt,
diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h
index d935bc3..36cb7a4 100644
--- a/compiler/dex/quick/arm/arm_lir.h
+++ b/compiler/dex/quick/arm/arm_lir.h
@@ -297,19 +297,20 @@ constexpr RegStorage rs_dr30(RegStorage::kValid | dr30);
constexpr RegStorage rs_dr31(RegStorage::kValid | dr31);
#endif
-// RegisterLocation templates return values (r0, or r0/r1).
-const RegLocation arm_loc_c_return
- {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1,
- RegStorage(RegStorage::k32BitSolo, r0), INVALID_SREG, INVALID_SREG};
-const RegLocation arm_loc_c_return_wide
+// RegisterLocation templates for return values (r0, r0/r1, s0, or d0).
+// Note: The return locations are shared between quick code and the quick helpers. They follow the
+// quick ABI; the quick helper assembly routines need to handle the ABI differences.
+const RegLocation arm_loc_c_return =
+ {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, rs_r0, INVALID_SREG, INVALID_SREG};
+const RegLocation arm_loc_c_return_wide =
{kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1,
- RegStorage(RegStorage::k64BitPair, r0, r1), INVALID_SREG, INVALID_SREG};
-const RegLocation arm_loc_c_return_float
- {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1,
- RegStorage(RegStorage::k32BitSolo, r0), INVALID_SREG, INVALID_SREG};
-const RegLocation arm_loc_c_return_double
- {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1,
- RegStorage(RegStorage::k64BitPair, r0, r1), INVALID_SREG, INVALID_SREG};
+ RegStorage::MakeRegPair(rs_r0, rs_r1), INVALID_SREG, INVALID_SREG};
+const RegLocation arm_loc_c_return_float = kArm32QuickCodeUseSoftFloat
+ ? arm_loc_c_return
+ : RegLocation({kLocPhysReg, 0, 0, 0, 1, 0, 0, 0, 1, rs_fr0, INVALID_SREG, INVALID_SREG});
+const RegLocation arm_loc_c_return_double = kArm32QuickCodeUseSoftFloat
+ ? arm_loc_c_return_wide
+ : RegLocation({kLocPhysReg, 1, 0, 0, 1, 0, 0, 0, 1, rs_dr0, INVALID_SREG, INVALID_SREG});
enum ArmShiftEncodings {
kArmLsl = 0x0,
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 6fd29f2..442c4fc 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -25,6 +25,64 @@
namespace art {
class ArmMir2Lir FINAL : public Mir2Lir {
+ protected:
+ // TODO: Consolidate hard float target support.
+ // InToRegStorageMapper and InToRegStorageMapping can be shared with all backends.
+ // Base class used to get RegStorage for next argument.
+ class InToRegStorageMapper {
+ public:
+ virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide) = 0;
+ virtual ~InToRegStorageMapper() {
+ }
+ };
+
+ // Inherited class for ARM backend.
+ class InToRegStorageArmMapper FINAL : public InToRegStorageMapper {
+ public:
+ InToRegStorageArmMapper()
+ : cur_core_reg_(0), cur_fp_reg_(0), cur_fp_double_reg_(0) {
+ }
+
+ virtual ~InToRegStorageArmMapper() {
+ }
+
+ RegStorage GetNextReg(bool is_double_or_float, bool is_wide) OVERRIDE;
+
+ private:
+ uint32_t cur_core_reg_;
+ uint32_t cur_fp_reg_;
+ uint32_t cur_fp_double_reg_;
+ };
+
+ // Class to map arguments to RegStorage. The mapping object is initialized by a mapper.
+ class InToRegStorageMapping FINAL {
+ public:
+ InToRegStorageMapping()
+ : max_mapped_in_(0), is_there_stack_mapped_(false), initialized_(false) {
+ }
+
+ int GetMaxMappedIn() const {
+ return max_mapped_in_;
+ }
+
+ bool IsThereStackMapped() const {
+ return is_there_stack_mapped_;
+ }
+
+ bool IsInitialized() const {
+ return initialized_;
+ }
+
+ void Initialize(RegLocation* arg_locs, int count, InToRegStorageMapper* mapper);
+ RegStorage Get(int in_position) const;
+
+ private:
+ std::map<int, RegStorage> mapping_;
+ int max_mapped_in_;
+ bool is_there_stack_mapped_;
+ bool initialized_;
+ };
+
public:
ArmMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena);
@@ -47,15 +105,30 @@ class ArmMir2Lir FINAL : public Mir2Lir {
void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg);
// Required for target - register utilities.
- RegStorage TargetReg(SpecialTargetRegister reg);
- RegStorage GetArgMappingToPhysicalReg(int arg_num);
- RegLocation GetReturnAlt();
- RegLocation GetReturnWideAlt();
- RegLocation LocCReturn();
- RegLocation LocCReturnRef();
- RegLocation LocCReturnDouble();
- RegLocation LocCReturnFloat();
- RegLocation LocCReturnWide();
+ RegStorage TargetReg(SpecialTargetRegister reg) OVERRIDE;
+ RegStorage TargetReg(SpecialTargetRegister reg, WideKind wide_kind) OVERRIDE {
+ if (wide_kind == kWide) {
+ DCHECK((kArg0 <= reg && reg < kArg3) || (kFArg0 <= reg && reg < kFArg15) || (kRet0 == reg));
+ RegStorage ret_reg = RegStorage::MakeRegPair(TargetReg(reg),
+ TargetReg(static_cast<SpecialTargetRegister>(reg + 1)));
+ if (ret_reg.IsFloat()) {
+ // Represent the pair as a 64-bit double register, consistent with register allocation.
+ ret_reg = As64BitFloatReg(ret_reg);
+ }
+ return ret_reg;
+ } else {
+ return TargetReg(reg);
+ }
+ }
+
+ RegStorage GetArgMappingToPhysicalReg(int arg_num) OVERRIDE;
+ RegLocation GetReturnAlt() OVERRIDE;
+ RegLocation GetReturnWideAlt() OVERRIDE;
+ RegLocation LocCReturn() OVERRIDE;
+ RegLocation LocCReturnRef() OVERRIDE;
+ RegLocation LocCReturnDouble() OVERRIDE;
+ RegLocation LocCReturnFloat() OVERRIDE;
+ RegLocation LocCReturnWide() OVERRIDE;
ResourceMask GetRegMaskCommon(const RegStorage& reg) const OVERRIDE;
void AdjustSpillMask();
void ClobberCallerSave();
@@ -210,6 +283,19 @@ class ArmMir2Lir FINAL : public Mir2Lir {
LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
size_t GetInstructionOffset(LIR* lir);
+ int GenDalvikArgsNoRange(CallInfo* info, int call_state, LIR** pcrLabel,
+ NextCallInsn next_call_insn,
+ const MethodReference& target_method,
+ uint32_t vtable_idx,
+ uintptr_t direct_code, uintptr_t direct_method, InvokeType type,
+ bool skip_this) OVERRIDE;
+ int GenDalvikArgsRange(CallInfo* info, int call_state, LIR** pcrLabel,
+ NextCallInsn next_call_insn,
+ const MethodReference& target_method,
+ uint32_t vtable_idx,
+ uintptr_t direct_code, uintptr_t direct_method, InvokeType type,
+ bool skip_this) OVERRIDE;
+
private:
void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
@@ -226,10 +312,10 @@ class ArmMir2Lir FINAL : public Mir2Lir {
RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
bool is_div, int flags) OVERRIDE;
RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div) OVERRIDE;
- typedef struct {
+ struct EasyMultiplyOp {
OpKind op;
uint32_t shift;
- } EasyMultiplyOp;
+ };
bool GetEasyMultiplyOp(int lit, EasyMultiplyOp* op);
bool GetEasyMultiplyTwoOps(int lit, EasyMultiplyOp* ops);
void GenEasyMultiplyTwoOps(RegStorage r_dest, RegStorage r_src, EasyMultiplyOp* ops);
@@ -239,6 +325,36 @@ class ArmMir2Lir FINAL : public Mir2Lir {
static constexpr ResourceMask EncodeArmRegFpcsList(int reg_list);
ArenaVector<LIR*> call_method_insns_;
+
+ /**
+ * @brief Given a float register pair, returns the Solo64 float register.
+ * @param reg #RegStorage containing a float register pair (e.g. @c s2 and @c s3).
+ * @return A Solo64 float register mapping to the register pair (e.g. @c d1).
+ */
+ static RegStorage As64BitFloatReg(RegStorage reg) {
+ DCHECK(reg.IsFloat());
+
+ RegStorage low = reg.GetLow();
+ RegStorage high = reg.GetHigh();
+ DCHECK((low.GetRegNum() % 2 == 0) && (low.GetRegNum() + 1 == high.GetRegNum()));
+
+ return RegStorage::FloatSolo64(low.GetRegNum() / 2);
+ }
+
+ /**
+ * @brief Given a Solo64 float register, returns the float register pair.
+ * @param reg #RegStorage containing a Solo64 float register (e.g. @c d1).
+ * @return A float register pair mapping to the Solo64 float register (e.g. @c s2 and @c s3).
+ */
+ static RegStorage As64BitFloatRegPair(RegStorage reg) {
+ DCHECK(reg.IsDouble() && reg.Is64BitSolo());
+
+ int reg_num = reg.GetRegNum();
+ return RegStorage::MakeRegPair(RegStorage::FloatSolo32(reg_num * 2),
+ RegStorage::FloatSolo32(reg_num * 2 + 1));
+ }
+
+ InToRegStorageMapping in_to_reg_storage_mapping_;
};
} // namespace art
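
The As64BitFloatReg()/As64BitFloatRegPair() helpers above encode the VFP aliasing rule: double register dN overlays the single-precision pair s2N/s2N+1. A minimal standalone sketch of that mapping (plain C++, not ART code; the function names here are illustrative only):

    #include <cassert>
    #include <utility>

    // dN aliases the even-aligned single-precision pair (s2N, s2N+1).
    int SinglePairToDouble(int s_low) {
      assert(s_low % 2 == 0);  // the pair must start on an even s-register
      return s_low / 2;
    }

    std::pair<int, int> DoubleToSinglePair(int d) {
      return {2 * d, 2 * d + 1};
    }

    int main() {
      assert(SinglePairToDouble(2) == 1);                     // s2/s3 alias d1
      assert(DoubleToSinglePair(1) == std::make_pair(2, 3));  // d1 -> s2/s3
      return 0;
    }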
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 9742243..8e08f5f 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -442,6 +442,15 @@ void ArmMir2Lir::OpRegCopyWide(RegStorage r_dest, RegStorage r_src) {
bool src_fp = r_src.IsFloat();
DCHECK(r_dest.Is64Bit());
DCHECK(r_src.Is64Bit());
+ // Note: If the register was allocated by the register allocator, it should never be a pair.
+ // But some functions in Mir2Lir assume 64-bit registers are 32-bit register pairs.
+ // TODO: Rework Mir2Lir::LoadArg() and Mir2Lir::LoadArgDirect().
+ if (dest_fp && r_dest.IsPair()) {
+ r_dest = As64BitFloatReg(r_dest);
+ }
+ if (src_fp && r_src.IsPair()) {
+ r_src = As64BitFloatReg(r_src);
+ }
if (dest_fp) {
if (src_fp) {
OpRegCopy(r_dest, r_src);
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index dd8f7fe..7100a28 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -89,7 +89,7 @@ RegLocation ArmMir2Lir::LocCReturnDouble() {
// Return a target-dependent special register.
RegStorage ArmMir2Lir::TargetReg(SpecialTargetRegister reg) {
- RegStorage res_reg = RegStorage::InvalidReg();
+ RegStorage res_reg;
switch (reg) {
case kSelf: res_reg = rs_rARM_SELF; break;
#ifdef ARM_R4_SUSPEND_FLAG
@@ -104,10 +104,22 @@ RegStorage ArmMir2Lir::TargetReg(SpecialTargetRegister reg) {
case kArg1: res_reg = rs_r1; break;
case kArg2: res_reg = rs_r2; break;
case kArg3: res_reg = rs_r3; break;
- case kFArg0: res_reg = rs_r0; break;
- case kFArg1: res_reg = rs_r1; break;
- case kFArg2: res_reg = rs_r2; break;
- case kFArg3: res_reg = rs_r3; break;
+ case kFArg0: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r0 : rs_fr0; break;
+ case kFArg1: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r1 : rs_fr1; break;
+ case kFArg2: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r2 : rs_fr2; break;
+ case kFArg3: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r3 : rs_fr3; break;
+ case kFArg4: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr4; break;
+ case kFArg5: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr5; break;
+ case kFArg6: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr6; break;
+ case kFArg7: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr7; break;
+ case kFArg8: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr8; break;
+ case kFArg9: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr9; break;
+ case kFArg10: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr10; break;
+ case kFArg11: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr11; break;
+ case kFArg12: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr12; break;
+ case kFArg13: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr13; break;
+ case kFArg14: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr14; break;
+ case kFArg15: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr15; break;
case kRet0: res_reg = rs_r0; break;
case kRet1: res_reg = rs_r1; break;
case kInvokeTgt: res_reg = rs_rARM_LR; break;
@@ -119,20 +131,6 @@ RegStorage ArmMir2Lir::TargetReg(SpecialTargetRegister reg) {
return res_reg;
}
-RegStorage ArmMir2Lir::GetArgMappingToPhysicalReg(int arg_num) {
- // For the 32-bit internal ABI, the first 3 arguments are passed in registers.
- switch (arg_num) {
- case 0:
- return rs_r1;
- case 1:
- return rs_r2;
- case 2:
- return rs_r3;
- default:
- return RegStorage::InvalidReg();
- }
-}
-
/*
* Decode the register id.
*/
@@ -718,6 +716,32 @@ void ArmMir2Lir::LockCallTemps() {
LockTemp(rs_r1);
LockTemp(rs_r2);
LockTemp(rs_r3);
+ if (!kArm32QuickCodeUseSoftFloat) {
+ LockTemp(rs_fr0);
+ LockTemp(rs_fr1);
+ LockTemp(rs_fr2);
+ LockTemp(rs_fr3);
+ LockTemp(rs_fr4);
+ LockTemp(rs_fr5);
+ LockTemp(rs_fr6);
+ LockTemp(rs_fr7);
+ LockTemp(rs_fr8);
+ LockTemp(rs_fr9);
+ LockTemp(rs_fr10);
+ LockTemp(rs_fr11);
+ LockTemp(rs_fr12);
+ LockTemp(rs_fr13);
+ LockTemp(rs_fr14);
+ LockTemp(rs_fr15);
+ LockTemp(rs_dr0);
+ LockTemp(rs_dr1);
+ LockTemp(rs_dr2);
+ LockTemp(rs_dr3);
+ LockTemp(rs_dr4);
+ LockTemp(rs_dr5);
+ LockTemp(rs_dr6);
+ LockTemp(rs_dr7);
+ }
}
/* To be used when explicitly managing register use */
@@ -726,6 +750,32 @@ void ArmMir2Lir::FreeCallTemps() {
FreeTemp(rs_r1);
FreeTemp(rs_r2);
FreeTemp(rs_r3);
+ if (!kArm32QuickCodeUseSoftFloat) {
+ FreeTemp(rs_fr0);
+ FreeTemp(rs_fr1);
+ FreeTemp(rs_fr2);
+ FreeTemp(rs_fr3);
+ FreeTemp(rs_fr4);
+ FreeTemp(rs_fr5);
+ FreeTemp(rs_fr6);
+ FreeTemp(rs_fr7);
+ FreeTemp(rs_fr8);
+ FreeTemp(rs_fr9);
+ FreeTemp(rs_fr10);
+ FreeTemp(rs_fr11);
+ FreeTemp(rs_fr12);
+ FreeTemp(rs_fr13);
+ FreeTemp(rs_fr14);
+ FreeTemp(rs_fr15);
+ FreeTemp(rs_dr0);
+ FreeTemp(rs_dr1);
+ FreeTemp(rs_dr2);
+ FreeTemp(rs_dr3);
+ FreeTemp(rs_dr4);
+ FreeTemp(rs_dr5);
+ FreeTemp(rs_dr6);
+ FreeTemp(rs_dr7);
+ }
}
RegStorage ArmMir2Lir::LoadHelper(QuickEntrypointEnum trampoline) {
@@ -847,4 +897,313 @@ void ArmMir2Lir::InstallLiteralPools() {
Mir2Lir::InstallLiteralPools();
}
+RegStorage ArmMir2Lir::InToRegStorageArmMapper::GetNextReg(bool is_double_or_float, bool is_wide) {
+ const RegStorage coreArgMappingToPhysicalReg[] =
+ {rs_r1, rs_r2, rs_r3};
+ const int coreArgMappingToPhysicalRegSize = arraysize(coreArgMappingToPhysicalReg);
+ const RegStorage fpArgMappingToPhysicalReg[] =
+ {rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7,
+ rs_fr8, rs_fr9, rs_fr10, rs_fr11, rs_fr12, rs_fr13, rs_fr14, rs_fr15};
+ const uint32_t fpArgMappingToPhysicalRegSize = arraysize(fpArgMappingToPhysicalReg);
+ COMPILE_ASSERT(fpArgMappingToPhysicalRegSize % 2 == 0, knum_of_fp_arg_regs_not_even);
+
+ if (kArm32QuickCodeUseSoftFloat) {
+ is_double_or_float = false; // Regard double as long, float as int.
+ is_wide = false; // Map long separately.
+ }
+
+ RegStorage result = RegStorage::InvalidReg();
+ if (is_double_or_float) {
+ // TODO: Remove "cur_fp_double_reg_ % 2 != 0" when we return double as double.
+ if (is_wide || cur_fp_double_reg_ % 2 != 0) {
+ cur_fp_double_reg_ = std::max(cur_fp_double_reg_, RoundUp(cur_fp_reg_, 2));
+ if (cur_fp_double_reg_ < fpArgMappingToPhysicalRegSize) {
+ // TODO: Replace with the following code in this branch once FlushIns() supports 64-bit registers.
+ // result = RegStorage::MakeRegPair(fpArgMappingToPhysicalReg[cur_fp_double_reg_],
+ // fpArgMappingToPhysicalReg[cur_fp_double_reg_ + 1]);
+ // result = As64BitFloatReg(result);
+ // cur_fp_double_reg_ += 2;
+ result = fpArgMappingToPhysicalReg[cur_fp_double_reg_];
+ cur_fp_double_reg_++;
+ }
+ } else {
+ // TODO: Remove the check when we return double as double.
+ DCHECK_EQ(cur_fp_double_reg_ % 2, 0U);
+ if (cur_fp_reg_ % 2 == 0) {
+ cur_fp_reg_ = std::max(cur_fp_double_reg_, cur_fp_reg_);
+ }
+ if (cur_fp_reg_ < fpArgMappingToPhysicalRegSize) {
+ result = fpArgMappingToPhysicalReg[cur_fp_reg_];
+ cur_fp_reg_++;
+ }
+ }
+ } else {
+ if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+ result = coreArgMappingToPhysicalReg[cur_core_reg_++];
+ // TODO: Enable the following code once FlushIns() supports 64-bit registers.
+ // if (is_wide && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+ // result = RegStorage::MakeRegPair(result, coreArgMappingToPhysicalReg[cur_core_reg_++]);
+ // }
+ }
+ }
+ return result;
+}
+
+RegStorage ArmMir2Lir::InToRegStorageMapping::Get(int in_position) const {
+ DCHECK(IsInitialized());
+ auto res = mapping_.find(in_position);
+ return res != mapping_.end() ? res->second : RegStorage::InvalidReg();
+}
+
+void ArmMir2Lir::InToRegStorageMapping::Initialize(RegLocation* arg_locs, int count,
+ InToRegStorageMapper* mapper) {
+ DCHECK(mapper != nullptr);
+ max_mapped_in_ = -1;
+ is_there_stack_mapped_ = false;
+ for (int in_position = 0; in_position < count; in_position++) {
+ RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp,
+ arg_locs[in_position].wide);
+ if (reg.Valid()) {
+ mapping_[in_position] = reg;
+ // TODO: Enable the following code once FlushIns() supports 64-bit argument registers.
+ // if (arg_locs[in_position].wide) {
+ // if (reg.Is32Bit()) {
+ // // As it is a split long, the hi-part is on stack.
+ // is_there_stack_mapped_ = true;
+ // }
+ // // We covered 2 v-registers, so skip the next one
+ // in_position++;
+ // }
+ max_mapped_in_ = std::max(max_mapped_in_, in_position);
+ } else {
+ is_there_stack_mapped_ = true;
+ }
+ }
+ initialized_ = true;
+}
+
+// TODO: Should be able to return long or double registers.
+// Need to check some common code, as this will break some assumptions.
+RegStorage ArmMir2Lir::GetArgMappingToPhysicalReg(int arg_num) {
+ if (!in_to_reg_storage_mapping_.IsInitialized()) {
+ int start_vreg = mir_graph_->GetFirstInVR();
+ RegLocation* arg_locs = &mir_graph_->reg_location_[start_vreg];
+
+ InToRegStorageArmMapper mapper;
+ in_to_reg_storage_mapping_.Initialize(arg_locs, mir_graph_->GetNumOfInVRs(), &mapper);
+ }
+ return in_to_reg_storage_mapping_.Get(arg_num);
+}
+
+int ArmMir2Lir::GenDalvikArgsNoRange(CallInfo* info,
+ int call_state, LIR** pcrLabel, NextCallInsn next_call_insn,
+ const MethodReference& target_method,
+ uint32_t vtable_idx, uintptr_t direct_code,
+ uintptr_t direct_method, InvokeType type, bool skip_this) {
+ if (kArm32QuickCodeUseSoftFloat) {
+ return Mir2Lir::GenDalvikArgsNoRange(info, call_state, pcrLabel, next_call_insn, target_method,
+ vtable_idx, direct_code, direct_method, type, skip_this);
+ } else {
+ return GenDalvikArgsRange(info, call_state, pcrLabel, next_call_insn, target_method, vtable_idx,
+ direct_code, direct_method, type, skip_this);
+ }
+}
+
+int ArmMir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state,
+ LIR** pcrLabel, NextCallInsn next_call_insn,
+ const MethodReference& target_method,
+ uint32_t vtable_idx, uintptr_t direct_code,
+ uintptr_t direct_method, InvokeType type, bool skip_this) {
+ if (kArm32QuickCodeUseSoftFloat) {
+ return Mir2Lir::GenDalvikArgsRange(info, call_state, pcrLabel, next_call_insn, target_method,
+ vtable_idx, direct_code, direct_method, type, skip_this);
+ }
+
+ // TODO: Rework the implementation when argument registers can be long or double.
+
+ /* If no arguments, just return */
+ if (info->num_arg_words == 0) {
+ return call_state;
+ }
+
+ const int start_index = skip_this ? 1 : 0;
+
+ InToRegStorageArmMapper mapper;
+ InToRegStorageMapping in_to_reg_storage_mapping;
+ in_to_reg_storage_mapping.Initialize(info->args, info->num_arg_words, &mapper);
+ const int last_mapped_in = in_to_reg_storage_mapping.GetMaxMappedIn();
+ int regs_left_to_pass_via_stack = info->num_arg_words - (last_mapped_in + 1);
+
+ // First of all, check whether it makes sense to use bulk copying.
+ // Bulk copying is done only for the range case.
+ // TODO: make a constant instead of 2
+ if (info->is_range && regs_left_to_pass_via_stack >= 2) {
+ // Scan the rest of the args - if in phys_reg flush to memory
+ for (int next_arg = last_mapped_in + 1; next_arg < info->num_arg_words;) {
+ RegLocation loc = info->args[next_arg];
+ if (loc.wide) {
+ // TODO: Only flush hi-part.
+ if (loc.high_word) {
+ loc = info->args[--next_arg];
+ }
+ loc = UpdateLocWide(loc);
+ if (loc.location == kLocPhysReg) {
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+ StoreBaseDisp(TargetPtrReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64, kNotVolatile);
+ }
+ next_arg += 2;
+ } else {
+ loc = UpdateLoc(loc);
+ if (loc.location == kLocPhysReg) {
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+ if (loc.ref) {
+ StoreRefDisp(TargetPtrReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, kNotVolatile);
+ } else {
+ StoreBaseDisp(TargetPtrReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32,
+ kNotVolatile);
+ }
+ }
+ next_arg++;
+ }
+ }
+
+ // The rest can be copied together
+ int start_offset = SRegOffset(info->args[last_mapped_in + 1].s_reg_low);
+ int outs_offset = StackVisitor::GetOutVROffset(last_mapped_in + 1,
+ cu_->instruction_set);
+
+ int current_src_offset = start_offset;
+ int current_dest_offset = outs_offset;
+
+ // Only dalvik regs are accessed in this loop; no next_call_insn() calls.
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+ while (regs_left_to_pass_via_stack > 0) {
+ /*
+ * TODO: Improve by adding block copy for large number of arguments. This
+ * should be done, if possible, as a target-dependent helper. For now, just
+ * copy a Dalvik vreg at a time.
+ */
+ // Moving 32 bits via a general purpose register.
+ size_t bytes_to_move = sizeof(uint32_t);
+
+ // Instead of allocating a new temp, simply reuse one of the registers being used
+ // for argument passing.
+ RegStorage temp = TargetReg(kArg3, kNotWide);
+
+ // Now load the argument VR and store to the outs.
+ Load32Disp(TargetPtrReg(kSp), current_src_offset, temp);
+ Store32Disp(TargetPtrReg(kSp), current_dest_offset, temp);
+
+ current_src_offset += bytes_to_move;
+ current_dest_offset += bytes_to_move;
+ regs_left_to_pass_via_stack -= (bytes_to_move >> 2);
+ }
+ DCHECK_EQ(regs_left_to_pass_via_stack, 0);
+ }
+
+ // Now handle the arguments that were not mapped to registers, if any.
+ if (in_to_reg_storage_mapping.IsThereStackMapped()) {
+ RegStorage regWide = TargetReg(kArg2, kWide);
+ for (int i = start_index; i <= last_mapped_in + regs_left_to_pass_via_stack; i++) {
+ RegLocation rl_arg = info->args[i];
+ rl_arg = UpdateRawLoc(rl_arg);
+ RegStorage reg = in_to_reg_storage_mapping.Get(i);
+ // TODO: Only pass split wide hi-part via stack.
+ if (!reg.Valid() || rl_arg.wide) {
+ int out_offset = StackVisitor::GetOutVROffset(i, cu_->instruction_set);
+
+ {
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+ if (rl_arg.wide) {
+ if (rl_arg.location == kLocPhysReg) {
+ StoreBaseDisp(TargetPtrReg(kSp), out_offset, rl_arg.reg, k64, kNotVolatile);
+ } else {
+ LoadValueDirectWideFixed(rl_arg, regWide);
+ StoreBaseDisp(TargetPtrReg(kSp), out_offset, regWide, k64, kNotVolatile);
+ }
+ } else {
+ if (rl_arg.location == kLocPhysReg) {
+ if (rl_arg.ref) {
+ StoreRefDisp(TargetPtrReg(kSp), out_offset, rl_arg.reg, kNotVolatile);
+ } else {
+ StoreBaseDisp(TargetPtrReg(kSp), out_offset, rl_arg.reg, k32, kNotVolatile);
+ }
+ } else {
+ if (rl_arg.ref) {
+ RegStorage regSingle = TargetReg(kArg2, kRef);
+ LoadValueDirectFixed(rl_arg, regSingle);
+ StoreRefDisp(TargetPtrReg(kSp), out_offset, regSingle, kNotVolatile);
+ } else {
+ RegStorage regSingle = TargetReg(kArg2, kNotWide);
+ LoadValueDirectFixed(rl_arg, regSingle);
+ StoreBaseDisp(TargetPtrReg(kSp), out_offset, regSingle, k32, kNotVolatile);
+ }
+ }
+ }
+ }
+
+ call_state = next_call_insn(cu_, info, call_state, target_method,
+ vtable_idx, direct_code, direct_method, type);
+ }
+ if (rl_arg.wide) {
+ i++;
+ }
+ }
+ }
+
+ // Finish with mapped registers
+ for (int i = start_index; i <= last_mapped_in; i++) {
+ RegLocation rl_arg = info->args[i];
+ rl_arg = UpdateRawLoc(rl_arg);
+ RegStorage reg = in_to_reg_storage_mapping.Get(i);
+ if (reg.Valid()) {
+ if (reg.Is64Bit()) {
+ LoadValueDirectWideFixed(rl_arg, reg);
+ } else {
+ // TODO: A split long should be the only case we need to care about.
+ if (rl_arg.wide) {
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+ int high_word = rl_arg.high_word ? 1 : 0;
+ rl_arg = high_word ? info->args[i - 1] : rl_arg;
+ if (rl_arg.location == kLocPhysReg) {
+ RegStorage rs_arg = rl_arg.reg;
+ if (rs_arg.IsDouble() && rs_arg.Is64BitSolo()) {
+ rs_arg = As64BitFloatRegPair(rs_arg);
+ }
+ RegStorage rs_arg_low = rs_arg.GetLow();
+ RegStorage rs_arg_high = rs_arg.GetHigh();
+ OpRegCopy(reg, high_word ? rs_arg_high : rs_arg_low);
+ } else {
+ Load32Disp(TargetPtrReg(kSp), SRegOffset(rl_arg.s_reg_low + high_word), reg);
+ }
+ } else {
+ LoadValueDirectFixed(rl_arg, reg);
+ }
+ }
+ call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
+ direct_code, direct_method, type);
+ }
+ if (reg.Is64Bit()) {
+ i++;
+ }
+ }
+
+ call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
+ direct_code, direct_method, type);
+ if (pcrLabel) {
+ if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitNullChecks()) {
+ *pcrLabel = GenExplicitNullCheck(TargetReg(kArg1, kRef), info->opt_flags);
+ } else {
+ *pcrLabel = nullptr;
+ // In lieu of generating a check for kArg1 being null, we need to
+ // perform a load when doing implicit checks.
+ RegStorage tmp = AllocTemp();
+ Load32Disp(TargetReg(kArg1, kRef), 0, tmp);
+ MarkPossibleNullPointerException(info->opt_flags);
+ FreeTemp(tmp);
+ }
+ }
+ return call_state;
+}
+
} // namespace art
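
The register assignment in InToRegStorageArmMapper::GetNextReg() follows the AAPCS-VFP-style back-fill rule: doubles claim even-aligned s-register pairs, floats may back-fill an odd slot a double left behind, and the two classes never overlap. A simplified standalone sketch of that policy (plain C++, not the patch code; it hands out whole d-registers for doubles, which GetNextReg() will only do once FlushIns() supports 64-bit registers):

    #include <algorithm>
    #include <cstdio>

    // Simplified hard-float argument mapper: r1-r3 for core args (r0 holds
    // the ArtMethod*), s0-s15 for floats, even s-pairs (d0-d7) for doubles.
    struct ArgMapper {
      unsigned core = 0, fp = 0, fp_double = 0;
      void Next(bool is_fp, bool is_wide) {
        if (!is_fp) {
          if (core < 3) std::printf("r%u\n", ++core);  // r1, r2, r3
          else std::printf("stack\n");
        } else if (is_wide) {
          // Doubles start on an even s-register and never overlap floats.
          fp_double = std::max(fp_double, (fp + 1) & ~1u);  // RoundUp(fp, 2)
          if (fp_double + 1 < 16) { std::printf("d%u\n", fp_double / 2); fp_double += 2; }
          else std::printf("stack\n");
        } else {
          // A float may back-fill the odd slot a double left behind.
          if (fp % 2 == 0) fp = std::max(fp, fp_double);
          if (fp < 16) std::printf("s%u\n", fp++);
          else std::printf("stack\n");
        }
      }
    };

    int main() {
      ArgMapper m;           // e.g. a (double, float, int) signature:
      m.Next(true, true);    // d0 (the s0/s1 pair)
      m.Next(true, false);   // s2
      m.Next(false, false);  // r1
      return 0;
    }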
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 09acf4c..ce2de65 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -1007,6 +1007,12 @@ LIR* ArmMir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegStora
// Intentional fall-through.
case k64:
if (r_src.IsFloat()) {
+ // Note: If the register was allocated by the register allocator, it should never be a pair.
+ // But some functions in Mir2Lir assume 64-bit registers are 32-bit register pairs.
+ // TODO: Rework Mir2Lir::LoadArg() and Mir2Lir::LoadArgDirect().
+ if (r_src.IsPair()) {
+ r_src = As64BitFloatReg(r_src);
+ }
DCHECK(!r_src.IsPair());
store = LoadStoreUsingInsnWithOffsetImm8Shl2(kThumb2Vstrd, r_base, displacement, r_src);
} else {
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 2bef7c5..bc4d00b 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -248,13 +248,13 @@ void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(QuickEntrypointEnum trampo
if (cu_->instruction_set == kMips) {
LoadValueDirectFixed(arg1, TargetReg(arg1.fp ? kFArg2 : kArg1, kNotWide));
} else {
- LoadValueDirectFixed(arg1, TargetReg(kArg1, kNotWide));
+ LoadValueDirectFixed(arg1, TargetReg(arg1.fp ? kFArg1 : kArg1, kNotWide));
}
} else {
if (cu_->instruction_set == kMips) {
LoadValueDirectWideFixed(arg1, TargetReg(arg1.fp ? kFArg2 : kArg2, kWide));
} else {
- LoadValueDirectWideFixed(arg1, TargetReg(kArg1, kWide));
+ LoadValueDirectWideFixed(arg1, TargetReg(arg1.fp ? kFArg1 : kArg1, kWide));
}
}
} else {
@@ -365,6 +365,7 @@ void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(
* ArgLocs is an array of location records describing the incoming arguments
* with one location record per word of argument.
*/
+// TODO: Support 64-bit argument registers.
void Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) {
/*
* Dummy up a RegLocation for the incoming StackReference<mirror::ArtMethod>
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 3e0844b..f4e6dfe 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -1191,13 +1191,17 @@ class Mir2Lir : public Backend {
*/
virtual RegStorage TargetReg(SpecialTargetRegister reg, WideKind wide_kind) {
if (wide_kind == kWide) {
- DCHECK((kArg0 <= reg && reg < kArg7) || (kFArg0 <= reg && reg < kFArg7) || (kRet0 == reg));
+ DCHECK((kArg0 <= reg && reg < kArg7) || (kFArg0 <= reg && reg < kFArg15) || (kRet0 == reg));
COMPILE_ASSERT((kArg1 == kArg0 + 1) && (kArg2 == kArg1 + 1) && (kArg3 == kArg2 + 1) &&
(kArg4 == kArg3 + 1) && (kArg5 == kArg4 + 1) && (kArg6 == kArg5 + 1) &&
(kArg7 == kArg6 + 1), kargs_range_unexpected);
COMPILE_ASSERT((kFArg1 == kFArg0 + 1) && (kFArg2 == kFArg1 + 1) && (kFArg3 == kFArg2 + 1) &&
(kFArg4 == kFArg3 + 1) && (kFArg5 == kFArg4 + 1) && (kFArg6 == kFArg5 + 1) &&
- (kFArg7 == kFArg6 + 1), kfargs_range_unexpected);
+ (kFArg7 == kFArg6 + 1) && (kFArg8 == kFArg7 + 1) && (kFArg9 == kFArg8 + 1) &&
+ (kFArg10 == kFArg9 + 1) && (kFArg11 == kFArg10 + 1) &&
+ (kFArg12 == kFArg11 + 1) && (kFArg13 == kFArg12 + 1) &&
+ (kFArg14 == kFArg13 + 1) && (kFArg15 == kFArg14 + 1),
+ kfargs_range_unexpected);
COMPILE_ASSERT(kRet1 == kRet0 + 1, kret_range_unexpected);
return RegStorage::MakeRegPair(TargetReg(reg),
TargetReg(static_cast<SpecialTargetRegister>(reg + 1)));
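
The COMPILE_ASSERTs above exist because the kWide path builds the high half of a pair as reg + 1, which is only sound when the enumerators are contiguous. A standalone illustration of the invariant (hypothetical enum, not ART's):

    // If the argument enumerators are contiguous, "low + 1" is the high half
    // of a wide value; the static_assert pins that layout down.
    enum ArgReg { kA0, kA1, kA2, kA3 };
    static_assert(kA1 == kA0 + 1 && kA2 == kA1 + 1 && kA3 == kA2 + 1,
                  "ArgReg enumerators must be contiguous");

    ArgReg HighHalf(ArgReg low) { return static_cast<ArgReg>(low + 1); }

    int main() { return HighHalf(kA0) == kA1 ? 0 : 1; }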
diff --git a/compiler/dex/quick/quick_compiler.cc b/compiler/dex/quick/quick_compiler.cc
index 6f2a647..8f7bd30 100644
--- a/compiler/dex/quick/quick_compiler.cc
+++ b/compiler/dex/quick/quick_compiler.cc
@@ -425,6 +425,21 @@ static int kAllOpcodes[] = {
kMirOpSelect,
};
+static int kInvokeOpcodes[] = {
+ Instruction::INVOKE_VIRTUAL,
+ Instruction::INVOKE_SUPER,
+ Instruction::INVOKE_DIRECT,
+ Instruction::INVOKE_STATIC,
+ Instruction::INVOKE_INTERFACE,
+ Instruction::INVOKE_VIRTUAL_RANGE,
+ Instruction::INVOKE_SUPER_RANGE,
+ Instruction::INVOKE_DIRECT_RANGE,
+ Instruction::INVOKE_STATIC_RANGE,
+ Instruction::INVOKE_INTERFACE_RANGE,
+ Instruction::INVOKE_VIRTUAL_QUICK,
+ Instruction::INVOKE_VIRTUAL_RANGE_QUICK,
+};
+
// Unsupported opcodes. nullptr can be used when everything is supported. Size of the lists is
// recorded below.
static const int* kUnsupportedOpcodes[] = {
@@ -523,8 +538,8 @@ bool QuickCompiler::CanCompileMethod(uint32_t method_idx, const DexFile& dex_fil
for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
int opcode = mir->dalvikInsn.opcode;
// Check if we support the byte code.
- if (std::find(unsupport_list, unsupport_list + unsupport_list_size,
- opcode) != unsupport_list + unsupport_list_size) {
+ if (std::find(unsupport_list, unsupport_list + unsupport_list_size, opcode)
+ != unsupport_list + unsupport_list_size) {
if (!MIR::DecodedInstruction::IsPseudoMirOp(opcode)) {
VLOG(compiler) << "Unsupported dalvik byte code : "
<< mir->dalvikInsn.opcode;
@@ -535,11 +550,8 @@ bool QuickCompiler::CanCompileMethod(uint32_t method_idx, const DexFile& dex_fil
return false;
}
// Check if it invokes a prototype that we cannot support.
- if (Instruction::INVOKE_VIRTUAL == opcode ||
- Instruction::INVOKE_SUPER == opcode ||
- Instruction::INVOKE_DIRECT == opcode ||
- Instruction::INVOKE_STATIC == opcode ||
- Instruction::INVOKE_INTERFACE == opcode) {
+ if (std::find(kInvokeOpcodes, kInvokeOpcodes + arraysize(kInvokeOpcodes), opcode)
+ != kInvokeOpcodes + arraysize(kInvokeOpcodes)) {
uint32_t invoke_method_idx = mir->dalvikInsn.vB;
const char* invoke_method_shorty = dex_file.GetMethodShorty(
dex_file.GetMethodId(invoke_method_idx));
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index f0c0ed7..9545896 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -21,6 +21,22 @@
namespace art {
namespace arm {
+// Used by hard float.
+static const Register kHFCoreArgumentRegisters[] = {
+ R0, R1, R2, R3
+};
+
+static const SRegister kHFSArgumentRegisters[] = {
+ S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
+};
+
+static const DRegister kHFDArgumentRegisters[] = {
+ D0, D1, D2, D3, D4, D5, D6, D7
+};
+
+COMPILE_ASSERT(arraysize(kHFDArgumentRegisters) * 2 == arraysize(kHFSArgumentRegisters),
+ ks_d_argument_registers_mismatch);
+
// Calling convention
ManagedRegister ArmManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
@@ -31,26 +47,43 @@ ManagedRegister ArmJniCallingConvention::InterproceduralScratchRegister() {
return ArmManagedRegister::FromCoreRegister(IP); // R12
}
-static ManagedRegister ReturnRegisterForShorty(const char* shorty) {
- if (shorty[0] == 'F') {
- return ArmManagedRegister::FromCoreRegister(R0);
- } else if (shorty[0] == 'D') {
- return ArmManagedRegister::FromRegisterPair(R0_R1);
- } else if (shorty[0] == 'J') {
- return ArmManagedRegister::FromRegisterPair(R0_R1);
- } else if (shorty[0] == 'V') {
- return ArmManagedRegister::NoRegister();
+ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() {
+ if (kArm32QuickCodeUseSoftFloat) {
+ switch (GetShorty()[0]) {
+ case 'V':
+ return ArmManagedRegister::NoRegister();
+ case 'D':
+ case 'J':
+ return ArmManagedRegister::FromRegisterPair(R0_R1);
+ default:
+ return ArmManagedRegister::FromCoreRegister(R0);
+ }
} else {
- return ArmManagedRegister::FromCoreRegister(R0);
+ switch (GetShorty()[0]) {
+ case 'V':
+ return ArmManagedRegister::NoRegister();
+ case 'D':
+ return ArmManagedRegister::FromDRegister(D0);
+ case 'F':
+ return ArmManagedRegister::FromSRegister(S0);
+ case 'J':
+ return ArmManagedRegister::FromRegisterPair(R0_R1);
+ default:
+ return ArmManagedRegister::FromCoreRegister(R0);
+ }
}
}
-ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() {
- return ReturnRegisterForShorty(GetShorty());
-}
-
ManagedRegister ArmJniCallingConvention::ReturnRegister() {
- return ReturnRegisterForShorty(GetShorty());
+ switch (GetShorty()[0]) {
+ case 'V':
+ return ArmManagedRegister::NoRegister();
+ case 'D':
+ case 'J':
+ return ArmManagedRegister::FromRegisterPair(R0_R1);
+ default:
+ return ArmManagedRegister::FromCoreRegister(R0);
+ }
}
ManagedRegister ArmJniCallingConvention::IntReturnRegister() {
@@ -88,15 +121,68 @@ FrameOffset ArmManagedRuntimeCallingConvention::CurrentParamStackOffset() {
const ManagedRegisterEntrySpills& ArmManagedRuntimeCallingConvention::EntrySpills() {
// We spill the argument registers on ARM to free them up for scratch use; we then assume
// all arguments are on the stack.
- if (entry_spills_.size() == 0) {
- size_t num_spills = NumArgs() + NumLongOrDoubleArgs();
- if (num_spills > 0) {
- entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R1));
- if (num_spills > 1) {
- entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R2));
- if (num_spills > 2) {
- entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R3));
+ if (kArm32QuickCodeUseSoftFloat) {
+ if (entry_spills_.size() == 0) {
+ size_t num_spills = NumArgs() + NumLongOrDoubleArgs();
+ if (num_spills > 0) {
+ entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R1));
+ if (num_spills > 1) {
+ entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R2));
+ if (num_spills > 2) {
+ entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R3));
+ }
+ }
+ }
+ }
+ } else {
+ if ((entry_spills_.size() == 0) && (NumArgs() > 0)) {
+ uint32_t gpr_index = 1; // R0 ~ R3. Reserve r0 for ArtMethod*.
+ uint32_t fpr_index = 0; // S0 ~ S15.
+ uint32_t fpr_double_index = 0; // D0 ~ D7.
+
+ ResetIterator(FrameOffset(0));
+ while (HasNext()) {
+ if (IsCurrentParamAFloatOrDouble()) {
+ if (IsCurrentParamADouble()) { // Double.
+ // Double should not overlap with float.
+ fpr_double_index = (std::max(fpr_double_index * 2, RoundUp(fpr_index, 2))) / 2;
+ if (fpr_double_index < arraysize(kHFDArgumentRegisters)) {
+ entry_spills_.push_back(
+ ArmManagedRegister::FromDRegister(kHFDArgumentRegisters[fpr_double_index++]));
+ } else {
+ entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
+ }
+ } else { // Float.
+ // Float should not overlap with double.
+ if (fpr_index % 2 == 0) {
+ fpr_index = std::max(fpr_double_index * 2, fpr_index);
+ }
+ if (fpr_index < arraysize(kHFSArgumentRegisters)) {
+ entry_spills_.push_back(
+ ArmManagedRegister::FromSRegister(kHFSArgumentRegisters[fpr_index++]));
+ } else {
+ entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
+ }
+ }
+ } else {
+ // FIXME: A pointer is reported as both a reference and a long here.
+ if (IsCurrentParamALong() && !IsCurrentParamAReference()) { // Long.
+ if (gpr_index < arraysize(kHFCoreArgumentRegisters)) {
+ entry_spills_.push_back(
+ ArmManagedRegister::FromCoreRegister(kHFCoreArgumentRegisters[gpr_index++]));
+ } else {
+ entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
+ }
+ }
+ // High part of long or 32-bit argument.
+ if (gpr_index < arraysize(kHFCoreArgumentRegisters)) {
+ entry_spills_.push_back(
+ ArmManagedRegister::FromCoreRegister(kHFCoreArgumentRegisters[gpr_index++]));
+ } else {
+ entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
+ }
}
+ Next();
}
}
}
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 80e9cdb..0555c00 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -233,23 +233,30 @@ CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_ite
bool shouldOptimize =
dex_compilation_unit.GetSymbol().find("00024reg_00024") != std::string::npos;
+ if (instruction_set == kThumb2 && !kArm32QuickCodeUseSoftFloat) {
+ uint32_t shorty_len;
+ const char* shorty = dex_compilation_unit.GetShorty(&shorty_len);
+ for (uint32_t i = 0; i < shorty_len; ++i) {
+ if (shorty[i] == 'D' || shorty[i] == 'F') {
+ CHECK(!shouldCompile) << "Hard float ARM32 parameters are not yet supported";
+ return nullptr;
+ }
+ }
+ }
+
ArenaPool pool;
ArenaAllocator arena(&pool);
HGraphBuilder builder(&arena, &dex_compilation_unit, &dex_file, GetCompilerDriver());
HGraph* graph = builder.BuildGraph(*code_item);
if (graph == nullptr) {
- if (shouldCompile) {
- LOG(FATAL) << "Could not build graph in optimizing compiler";
- }
+ CHECK(!shouldCompile) << "Could not build graph in optimizing compiler";
return nullptr;
}
CodeGenerator* codegen = CodeGenerator::Create(&arena, graph, instruction_set);
if (codegen == nullptr) {
- if (shouldCompile) {
- LOG(FATAL) << "Could not find code generator for optimizing compiler";
- }
+ CHECK(!shouldCompile) << "Could not find code generator for optimizing compiler";
return nullptr;
}
@@ -305,7 +312,7 @@ CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_ite
stack_map);
} else if (shouldOptimize && RegisterAllocator::Supports(instruction_set)) {
LOG(FATAL) << "Could not allocate registers in optimizing compiler";
- return nullptr;
+ UNREACHABLE();
} else {
unoptimized_compiled_methods_++;
codegen->CompileBaseline(&allocator);
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index b430c7e..75bab82 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -417,9 +417,23 @@ void ArmAssembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
StoreToOffset(kStoreWord, R0, SP, 0);
// Write out entry spills.
+ int32_t offset = frame_size + sizeof(StackReference<mirror::ArtMethod>);
for (size_t i = 0; i < entry_spills.size(); ++i) {
- Register reg = entry_spills.at(i).AsArm().AsCoreRegister();
- StoreToOffset(kStoreWord, reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize));
+ ArmManagedRegister reg = entry_spills.at(i).AsArm();
+ if (reg.IsNoRegister()) {
+ // Only increment the stack offset.
+ ManagedRegisterSpill spill = entry_spills.at(i);
+ offset += spill.getSize();
+ } else if (reg.IsCoreRegister()) {
+ StoreToOffset(kStoreWord, reg.AsCoreRegister(), SP, offset);
+ offset += 4;
+ } else if (reg.IsSRegister()) {
+ StoreSToOffset(reg.AsSRegister(), SP, offset);
+ offset += 4;
+ } else if (reg.IsDRegister()) {
+ StoreDToOffset(reg.AsDRegister(), SP, offset);
+ offset += 8;
+ }
}
}
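
The rewritten spill loop in ArmAssembler::BuildFrame() advances the stack offset by a stride that depends on the register class: 4 bytes for a core or S register, 8 for a D register, and, for a NoRegister placeholder, just the recorded spill size (the argument already lives on the stack). A standalone sketch of that offset walk (hypothetical Spill type, not ART's):

    #include <cstdio>
    #include <vector>

    enum class Kind { kNone, kCore, kS, kD };
    struct Spill { Kind kind; int size; };  // size is only used for kNone

    int main() {
      // e.g. int in r1, float in s0, double in d1, one arg left on the stack.
      std::vector<Spill> spills = {
          {Kind::kCore, 4}, {Kind::kS, 4}, {Kind::kD, 8}, {Kind::kNone, 4}};
      int offset = 0;  // ART starts at frame_size + sizeof(StackReference<ArtMethod>)
      for (const Spill& s : spills) {
        if (s.kind != Kind::kNone) {
          std::printf("store at offset %d\n", offset);  // NoRegister: skip, no store
        }
        offset += (s.kind == Kind::kD) ? 8 : (s.kind == Kind::kNone) ? s.size : 4;
      }
      return 0;
    }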
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 0ef0fef..6f6dcbc 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -222,6 +222,7 @@ LIBART_TARGET_SRC_FILES_arm := \
arch/arm/memcmp16_arm.S \
arch/arm/portable_entrypoints_arm.S \
arch/arm/quick_entrypoints_arm.S \
+ arch/arm/quick_entrypoints_cc_arm.cc \
arch/arm/thread_arm.cc \
arch/arm/fault_handler_arm.cc
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index 5388cc0..8cd2a27 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -19,9 +19,9 @@
#include "asm_support.h"
-#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 176
+#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 112
#define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 32
-#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 48
+#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 112
// Flag for enabling R4 optimization in arm runtime
#define ARM_R4_SUSPEND_FLAG
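
Both new frame sizes work out to 112 bytes, matching the compile-time checks added to quick_entrypoints_arm.S below: SAVE_ALL now spills 9 core words (36 bytes) plus s16-s31 (64 bytes) plus 3 words for alignment and the Method* slot (12 bytes), i.e. 36 + 64 + 12 = 112, while REFS_AND_ARGS spills 10 core words (40 bytes) plus the s0-s15 argument registers (64 bytes) plus 2 words (8 bytes), i.e. 40 + 64 + 8 = 112.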
diff --git a/runtime/arch/arm/context_arm.cc b/runtime/arch/arm/context_arm.cc
index 96ffc93..fd9c626 100644
--- a/runtime/arch/arm/context_arm.cc
+++ b/runtime/arch/arm/context_arm.cc
@@ -97,6 +97,23 @@ void ArmContext::SmashCallerSaves() {
gprs_[R1] = const_cast<uint32_t*>(&gZero);
gprs_[R2] = nullptr;
gprs_[R3] = nullptr;
+
+ fprs_[S0] = nullptr;
+ fprs_[S1] = nullptr;
+ fprs_[S2] = nullptr;
+ fprs_[S3] = nullptr;
+ fprs_[S4] = nullptr;
+ fprs_[S5] = nullptr;
+ fprs_[S6] = nullptr;
+ fprs_[S7] = nullptr;
+ fprs_[S8] = nullptr;
+ fprs_[S9] = nullptr;
+ fprs_[S10] = nullptr;
+ fprs_[S11] = nullptr;
+ fprs_[S12] = nullptr;
+ fprs_[S13] = nullptr;
+ fprs_[S14] = nullptr;
+ fprs_[S15] = nullptr;
}
extern "C" void art_quick_do_long_jump(uint32_t*, uint32_t*);
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index ff0eb4a..24e9b1d 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -77,23 +77,17 @@ extern "C" void art_quick_handle_fill_data(void*, void*);
extern "C" void art_quick_lock_object(void*);
extern "C" void art_quick_unlock_object(void*);
-// Math entrypoints.
-extern int32_t CmpgDouble(double a, double b);
-extern int32_t CmplDouble(double a, double b);
-extern int32_t CmpgFloat(float a, float b);
-extern int32_t CmplFloat(float a, float b);
-
-// Math conversions.
-extern "C" int32_t __aeabi_f2iz(float op1); // FLOAT_TO_INT
-extern "C" int32_t __aeabi_d2iz(double op1); // DOUBLE_TO_INT
-extern "C" float __aeabi_l2f(int64_t op1); // LONG_TO_FLOAT
-extern "C" double __aeabi_l2d(int64_t op1); // LONG_TO_DOUBLE
-
+// Used by soft float.
// Single-precision FP arithmetics.
-extern "C" float fmodf(float a, float b); // REM_FLOAT[_2ADDR]
-
+extern "C" float fmodf(float a, float b); // REM_FLOAT[_2ADDR]
// Double-precision FP arithmetics.
-extern "C" double fmod(double a, double b); // REM_DOUBLE[_2ADDR]
+extern "C" double fmod(double a, double b); // REM_DOUBLE[_2ADDR]
+
+// Used by hard float.
+extern "C" int64_t art_quick_f2l(float f); // FLOAT_TO_LONG
+extern "C" int64_t art_quick_d2l(double d); // DOUBLE_TO_LONG
+extern "C" float art_quick_fmodf(float a, float b); // REM_FLOAT[_2ADDR]
+extern "C" double art_quick_fmod(double a, double b); // REM_DOUBLE[_2ADDR]
// Integer arithmetics.
extern "C" int __aeabi_idivmod(int32_t, int32_t); // [DIV|REM]_INT[_2ADDR|_LIT8|_LIT16]
@@ -205,25 +199,24 @@ void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints,
qpoints->pUnlockObject = art_quick_unlock_object;
// Math
- qpoints->pCmpgDouble = CmpgDouble;
- qpoints->pCmpgFloat = CmpgFloat;
- qpoints->pCmplDouble = CmplDouble;
- qpoints->pCmplFloat = CmplFloat;
- qpoints->pFmod = fmod;
- qpoints->pL2d = __aeabi_l2d;
- qpoints->pFmodf = fmodf;
- qpoints->pL2f = __aeabi_l2f;
- qpoints->pD2iz = __aeabi_d2iz;
- qpoints->pF2iz = __aeabi_f2iz;
qpoints->pIdivmod = __aeabi_idivmod;
- qpoints->pD2l = art_d2l;
- qpoints->pF2l = art_f2l;
qpoints->pLdiv = __aeabi_ldivmod;
qpoints->pLmod = __aeabi_ldivmod; // result returned in r2:r3
qpoints->pLmul = art_quick_mul_long;
qpoints->pShlLong = art_quick_shl_long;
qpoints->pShrLong = art_quick_shr_long;
qpoints->pUshrLong = art_quick_ushr_long;
+ if (kArm32QuickCodeUseSoftFloat) {
+ qpoints->pFmod = fmod;
+ qpoints->pFmodf = fmodf;
+ qpoints->pD2l = art_d2l;
+ qpoints->pF2l = art_f2l;
+ } else {
+ qpoints->pFmod = art_quick_fmod;
+ qpoints->pFmodf = art_quick_fmodf;
+ qpoints->pD2l = art_quick_d2l;
+ qpoints->pF2l = art_quick_f2l;
+ }
// Intrinsics
qpoints->pIndexOf = art_quick_indexof;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index aae0c94..632b414 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -40,10 +40,10 @@
.cfi_rel_offset r10, 24
.cfi_rel_offset r11, 28
.cfi_rel_offset lr, 32
- vpush {s0-s31} @ 32 words (128 bytes) of floats.
- .pad #128
- .cfi_adjust_cfa_offset 128
- sub sp, #12 @ 3 words of space, bottom word will hold Method*.
+ vpush {s16-s31} @ 16 words (64 bytes) of floats.
+ .pad #64
+ .cfi_adjust_cfa_offset 64
+ sub sp, #12 @ 3 words of space, bottom word will hold Method*
.pad #12
.cfi_adjust_cfa_offset 12
RUNTIME_CURRENT1 \rTemp1, \rTemp2 @ Load Runtime::Current into rTemp1.
@@ -53,7 +53,7 @@
str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
// Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 36 + 128 + 12)
+#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 36 + 64 + 12)
#error "SAVE_ALL_CALLEE_SAVE_FRAME(ARM) size not as expected."
#endif
.endm
@@ -101,15 +101,7 @@
.endm
.macro RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
- add sp, #4 @ bottom word holds Method*
- pop {r5-r8, r10-r11, lr} @ 7 words of callee saves
- .cfi_restore r5
- .cfi_restore r6
- .cfi_restore r7
- .cfi_restore r8
- .cfi_restore r10
- .cfi_restore r11
- .cfi_adjust_cfa_offset -FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
bx lr @ return
.endm
@@ -117,9 +109,10 @@
* Macro that sets up the callee save frame to conform with
* Runtime::CreateCalleeSaveMethod(kRefsAndArgs).
*/
-.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME rTemp1, rTemp2
- push {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves
+.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
+ push {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves and args.
.save {r1-r3, r5-r8, r10-r11, lr}
+ .cfi_adjust_cfa_offset 40
.cfi_rel_offset r1, 0
.cfi_rel_offset r2, 4
.cfi_rel_offset r3, 8
@@ -130,47 +123,39 @@
.cfi_rel_offset r10, 28
.cfi_rel_offset r11, 32
.cfi_rel_offset lr, 36
- .cfi_adjust_cfa_offset 40
+ vpush {s0-s15} @ 16 words of float args.
+ .pad #64
+ .cfi_adjust_cfa_offset 64
sub sp, #8 @ 2 words of space, bottom word will hold Method*
.pad #8
.cfi_adjust_cfa_offset 8
+ // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 64 + 8)
+#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected."
+#endif
+.endm
+
+.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME rTemp1, rTemp2
+ SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
RUNTIME_CURRENT3 \rTemp1, \rTemp2 @ Load Runtime::Current into rTemp1.
THIS_LOAD_REQUIRES_READ_BARRIER
@ rTemp1 is kRefsAndArgs Method*.
ldr \rTemp1, [\rTemp1, #RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET]
str \rTemp1, [sp, #0] @ Place Method* at bottom of stack.
str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
-
- // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 8)
-#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected."
-#endif
.endm
.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_R0
- push {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves
- .save {r1-r3, r5-r8, r10-r11, lr}
- .cfi_rel_offset r1, 0
- .cfi_rel_offset r2, 4
- .cfi_rel_offset r3, 8
- .cfi_rel_offset r5, 12
- .cfi_rel_offset r6, 16
- .cfi_rel_offset r7, 20
- .cfi_rel_offset r8, 24
- .cfi_rel_offset r10, 28
- .cfi_rel_offset r11, 32
- .cfi_rel_offset lr, 36
- .cfi_adjust_cfa_offset 40
- sub sp, #8 @ 2 words of space, bottom word will hold Method*
- .pad #8
- .cfi_adjust_cfa_offset 8
-
+ SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
str r0, [sp, #0] @ Store ArtMethod* to bottom of stack.
str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
.endm
.macro RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
add sp, #8 @ rewind sp
+ .cfi_adjust_cfa_offset -8
+ vpop {s0-s15}
+ .cfi_adjust_cfa_offset -64
pop {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves
.cfi_restore r1
.cfi_restore r2
@@ -181,7 +166,7 @@
.cfi_restore r8
.cfi_restore r10
.cfi_restore r11
- .cfi_adjust_cfa_offset -48
+ .cfi_adjust_cfa_offset -40
.endm
@@ -373,60 +358,91 @@ INVOKE_TRAMPOLINE art_quick_invoke_super_trampoline_with_access_check, artInvoke
INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
/*
- * Quick invocation stub.
+ * Quick invocation stub internal.
* On entry:
* r0 = method pointer
* r1 = argument array or NULL for no argument methods
* r2 = size of argument array in bytes
* r3 = (managed) thread pointer
* [sp] = JValue* result
- * [sp + 4] = shorty
+ * [sp + 4] = result_in_float
+ * [sp + 8] = core register argument array
+ * [sp + 12] = fp register argument array
+ * +-------------------------+
+ * | uint32_t* fp_reg_args |
+ * | uint32_t* core_reg_args |
+ * | result_in_float | <- Caller frame
+ * | Jvalue* result |
+ * +-------------------------+
+ * | lr |
+ * | r11 |
+ * | r9 |
+ * | r4 | <- r11
+ * +-------------------------+
+ * | uint32_t out[n-1] |
+ * | : : | Outs
+ * | uint32_t out[0] |
+ * | StackRef<ArtMethod> | <- SP value=null
+ * +-------------------------+
*/
-ENTRY art_quick_invoke_stub
- push {r0, r4, r5, r9, r11, lr} @ spill regs
- .save {r0, r4, r5, r9, r11, lr}
- .pad #24
- .cfi_adjust_cfa_offset 24
- .cfi_rel_offset r0, 0
- .cfi_rel_offset r4, 4
- .cfi_rel_offset r5, 8
- .cfi_rel_offset r9, 12
- .cfi_rel_offset r11, 16
- .cfi_rel_offset lr, 20
+ENTRY art_quick_invoke_stub_internal
+ push {r4, r9, r11, lr} @ spill regs
+ .save {r4, r9, r11, lr}
+ .pad #16
+ .cfi_adjust_cfa_offset 16
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset r9, 4
+ .cfi_rel_offset r11, 8
+ .cfi_rel_offset lr, 12
mov r11, sp @ save the stack pointer
.cfi_def_cfa_register r11
+
mov r9, r3 @ move managed thread pointer into r9
-#ifdef ARM_R4_SUSPEND_FLAG
- mov r4, #SUSPEND_CHECK_INTERVAL @ reset r4 to suspend check interval
-#endif
- add r5, r2, #4 @ create space for method pointer in frame
- sub r5, sp, r5 @ reserve & align *stack* to 16 bytes: native calling
- and r5, #0xFFFFFFF0 @ convention only aligns to 8B, so we have to ensure ART
- mov sp, r5 @ 16B alignment ourselves.
+ add r4, r2, #4 @ create space for method pointer in frame
+ sub r4, sp, r4 @ reserve & align *stack* to 16 bytes: native calling
+ and r4, #0xFFFFFFF0 @ convention only aligns to 8B, so we have to ensure ART
+ mov sp, r4 @ 16B alignment ourselves.
+ mov r4, r0 @ save method*
add r0, sp, #4 @ pass stack pointer + method ptr as dest for memcpy
bl memcpy @ memcpy (dest, src, bytes)
- ldr r0, [r11] @ restore method*
- ldr r1, [sp, #4] @ copy arg value for r1
- ldr r2, [sp, #8] @ copy arg value for r2
- ldr r3, [sp, #12] @ copy arg value for r3
mov ip, #0 @ set ip to 0
str ip, [sp] @ store NULL for method* at bottom of frame
+
+ ldr ip, [r11, #28] @ load fp register argument array pointer
+ vldm ip, {s0-s15} @ copy s0 - s15
+
+ ldr ip, [r11, #24] @ load core register argument array pointer
+ mov r0, r4 @ restore method*
+ add ip, ip, #4 @ skip r0
+ ldm ip, {r1-r3} @ copy r1 - r3
+
+#ifdef ARM_R4_SUSPEND_FLAG
+ mov r4, #SUSPEND_CHECK_INTERVAL @ reset r4 to suspend check interval
+#endif
+
ldr ip, [r0, #MIRROR_ART_METHOD_QUICK_CODE_OFFSET] @ get pointer to the code
blx ip @ call the method
+
mov sp, r11 @ restore the stack pointer
- ldr ip, [sp, #24] @ load the result pointer
- strd r0, [ip] @ store r0/r1 into result pointer
- pop {r0, r4, r5, r9, r11, lr} @ restore spill regs
- .cfi_restore r0
+ .cfi_def_cfa_register sp
+
+ ldr r4, [sp, #20] @ load result_in_float
+ ldr r9, [sp, #16] @ load the result pointer
+ cmp r4, #0
+ ite eq
+ strdeq r0, [r9] @ store r0/r1 into result pointer
+ vstrne d0, [r9] @ store s0-s1/d0 into result pointer
+
+ pop {r4, r9, r11, lr} @ restore spill regs
.cfi_restore r4
- .cfi_restore r5
.cfi_restore r9
+ .cfi_restore r11
.cfi_restore lr
- .cfi_adjust_cfa_offset -24
+ .cfi_adjust_cfa_offset -16
bx lr
-END art_quick_invoke_stub
+END art_quick_invoke_stub_internal
/*
* On entry r0 is uint32_t* gprs_ and r1 is uint32_t* fprs_
@@ -869,13 +885,14 @@ ENTRY art_quick_proxy_invoke_handler
mov r3, sp @ pass SP
blx artQuickProxyInvokeHandler @ (Method* proxy method, receiver, Thread*, SP)
ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
- add sp, #16 @ skip r1-r3, 4 bytes padding.
- .cfi_adjust_cfa_offset -16
- cbnz r2, 1f @ success if no exception is pending
+ // Tear down the callee-save frame. Skip arg registers.
+ add sp, #(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
+ .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+ cbnz r2, 1f @ success if no exception is pending
+ vmov d0, r0, r1 @ store into fpr, for when it's a fpr return...
bx lr @ return on success
1:
- RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
DELIVER_PENDING_EXCEPTION
END art_quick_proxy_invoke_handler
@@ -977,20 +994,13 @@ ENTRY art_quick_generic_jni_trampoline
ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
cbnz r2, .Lexception_in_native
- // Tear down the callee-save frame.
- add sp, #12 @ rewind sp
- // Do not pop r0 and r1, they contain the return value.
- pop {r2-r3, r5-r8, r10-r11, lr} @ 9 words of callee saves
- .cfi_restore r2
- .cfi_restore r3
- .cfi_restore r5
- .cfi_restore r6
- .cfi_restore r7
- .cfi_restore r8
- .cfi_restore r10
- .cfi_restore r11
- .cfi_adjust_cfa_offset -48
+ // Tear down the callee-save frame. Skip arg registers.
+ add sp, #FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE-FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
+ .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE-FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+ // store into fpr, for when it's a fpr return...
+ vmov d0, r0, r1
bx lr // ret
.Lentry_error:
@@ -1010,11 +1020,13 @@ ENTRY art_quick_to_interpreter_bridge
mov r2, sp @ pass SP
blx artQuickToInterpreterBridge @ (Method* method, Thread*, SP)
ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
- add sp, #16 @ skip r1-r3, 4 bytes padding.
- .cfi_adjust_cfa_offset -16
+ // Tear down the callee-save frame. Skip arg registers.
+ add sp, #(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
+ .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
cbnz r2, 1f @ success if no exception is pending
- bx lr @ return on success
+ vmov d0, r0, r1 @ store into fpr, for when it's a fpr return...
+ bx lr @ return on success
1:
DELIVER_PENDING_EXCEPTION
END art_quick_to_interpreter_bridge
@@ -1435,3 +1447,54 @@ ENTRY art_quick_string_compareto
.Ldone:
pop {r4, r7-r12, pc}
END art_quick_string_compareto
+
+ /* Assembly routines used to handle ABI differences. */
+
+ /* double fmod(double a, double b) */
+ .extern fmod
+ENTRY art_quick_fmod
+ push {lr}
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset lr, 0
+ sub sp, #4
+ .cfi_adjust_cfa_offset 4
+ vmov r0, r1, d0
+ vmov r2, r3, d1
+ bl fmod
+ vmov d0, r0, r1
+ add sp, #4
+ .cfi_adjust_cfa_offset -4
+ pop {pc}
+ .cfi_adjust_cfa_offset -4
+END art_quick_fmod
+
+ /* float fmodf(float a, float b) */
+ .extern fmodf
+ENTRY art_quick_fmodf
+ push {lr}
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset lr, 0
+ sub sp, #4
+ .cfi_adjust_cfa_offset 4
+ vmov r0, r1, d0
+ bl fmodf
+ vmov s0, r0
+ add sp, #4
+ .cfi_adjust_cfa_offset -4
+ pop {pc}
+ .cfi_adjust_cfa_offset -4
+END art_quick_fmodf
+
+ /* int64_t art_d2l(double d) */
+ .extern art_d2l
+ENTRY art_quick_d2l
+ vmov r0, r1, d0
+ b art_d2l
+END art_quick_d2l
+
+ /* int64_t art_f2l(float f) */
+ .extern art_f2l
+ENTRY art_quick_f2l
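+ // Move the float into r0 and tail-call; art_f2l returns the long in r0/r1 directly.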
+ vmov r0, s0
+ b art_f2l
+END art_quick_f2l
diff --git a/runtime/arch/arm/quick_entrypoints_cc_arm.cc b/runtime/arch/arm/quick_entrypoints_cc_arm.cc
new file mode 100644
index 0000000..e21e6c1
--- /dev/null
+++ b/runtime/arch/arm/quick_entrypoints_cc_arm.cc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mirror/art_method.h"
+#include "utils.h" // For RoundUp().
+
+namespace art {
+
+// Assembly stub that does the final part of the up-call into Java.
+extern "C" void art_quick_invoke_stub_internal(mirror::ArtMethod*, uint32_t*, uint32_t,
+ Thread* self, JValue* result, uint32_t, uint32_t*,
+ uint32_t*);
+
+template <bool kIsStatic>
+static void quick_invoke_reg_setup(mirror::ArtMethod* method, uint32_t* args, uint32_t args_size,
+ Thread* self, JValue* result, const char* shorty) {
+ // Note: Quick code does not follow the AAPCS, in either the softfp or the hardfp configuration.
+ uint32_t core_reg_args[4]; // r0 ~ r3
+ uint32_t fp_reg_args[16]; // s0 ~ s15 (d0 ~ d7)
+ uint32_t gpr_index = 1; // Index into core registers. Reserve r0 for mirror::ArtMethod*.
+ uint32_t fpr_index = 0; // Index into float registers.
+ uint32_t fpr_double_index = 0; // Index into float registers for doubles.
+ uint32_t arg_index = 0; // Index into argument array.
+ const uint32_t result_in_float = kArm32QuickCodeUseSoftFloat ? 0 :
+ (shorty[0] == 'F' || shorty[0] == 'D') ? 1 : 0;
+
+ if (!kIsStatic) {
+ // Copy receiver for non-static methods.
+ core_reg_args[gpr_index++] = args[arg_index++];
+ }
+
+ for (uint32_t shorty_index = 1; shorty[shorty_index] != '\0'; ++shorty_index, ++arg_index) {
+ char arg_type = shorty[shorty_index];
+ if (kArm32QuickCodeUseSoftFloat) {
+ arg_type = (arg_type == 'D') ? 'J' : arg_type; // Regard double as long.
+ arg_type = (arg_type == 'F') ? 'I' : arg_type; // Regard float as int.
+ }
+ switch (arg_type) {
+ case 'D': {
+ // Copy double argument into fp_reg_args if there are still floating point reg arguments.
+ // Double should not overlap with float.
+ fpr_double_index = std::max(fpr_double_index, RoundUp(fpr_index, 2));
+ if (fpr_double_index < arraysize(fp_reg_args)) {
+ fp_reg_args[fpr_double_index++] = args[arg_index];
+ fp_reg_args[fpr_double_index++] = args[arg_index + 1];
+ }
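+ // A double occupies two words in args; together with the loop header this advances past both.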
+ ++arg_index;
+ break;
+ }
+ case 'F':
+ // Copy float argument into fp_reg_args if there are still floating point reg arguments.
+ // If fpr_index is odd then it's pointing at a hole next to an existing float argument. If we
+ // encounter a float argument then pick it up from that hole. In the case fpr_index is even,
+ // ensure that we don't pick up an argument that overlaps with a double from
+ // fpr_double_index. In either case, take care not to go beyond the maximum number of
+ // floating point arguments.
+ if (fpr_index % 2 == 0) {
+ fpr_index = std::max(fpr_double_index, fpr_index);
+ }
+ if (fpr_index < arraysize(fp_reg_args)) {
+ fp_reg_args[fpr_index++] = args[arg_index];
+ }
+ break;
+ case 'J':
+ if (gpr_index < arraysize(core_reg_args)) {
+ core_reg_args[gpr_index++] = args[arg_index];
+ }
+ ++arg_index;
+ FALLTHROUGH_INTENDED; // Fall-through to take care of the high part.
+ default:
+ if (gpr_index < arraysize(core_reg_args)) {
+ core_reg_args[gpr_index++] = args[arg_index];
+ }
+ break;
+ }
+ }
+
+ art_quick_invoke_stub_internal(method, args, args_size, self, result, result_in_float,
+ core_reg_args, fp_reg_args);
+}
+
+// Called by art::mirror::ArtMethod::Invoke to do entry into a non-static method.
+// TODO: migrate into an assembly implementation as with ARM64.
+extern "C" void art_quick_invoke_stub(mirror::ArtMethod* method, uint32_t* args, uint32_t args_size,
+ Thread* self, JValue* result, const char* shorty) {
+ quick_invoke_reg_setup<false>(method, args, args_size, self, result, shorty);
+}
+
+// Called by art::mirror::ArtMethod::Invoke to do entry into a static method.
+// TODO: migrate into an assembly implementation as with ARM64.
+extern "C" void art_quick_invoke_static_stub(mirror::ArtMethod* method, uint32_t* args,
+ uint32_t args_size, Thread* self, JValue* result,
+ const char* shorty) {
+ quick_invoke_reg_setup<true>(method, args, args_size, self, result, shorty);
+}
+
+} // namespace art
diff --git a/runtime/arch/arm/quick_method_frame_info_arm.h b/runtime/arch/arm/quick_method_frame_info_arm.h
index 7595e94..c1f3fc2 100644
--- a/runtime/arch/arm/quick_method_frame_info_arm.h
+++ b/runtime/arch/arm/quick_method_frame_info_arm.h
@@ -25,6 +25,8 @@
namespace art {
namespace arm {
+static constexpr uint32_t kArmCalleeSaveAlwaysSpills =
+ (1 << art::arm::LR);
static constexpr uint32_t kArmCalleeSaveRefSpills =
(1 << art::arm::R5) | (1 << art::arm::R6) | (1 << art::arm::R7) | (1 << art::arm::R8) |
(1 << art::arm::R10) | (1 << art::arm::R11);
@@ -32,23 +34,30 @@ static constexpr uint32_t kArmCalleeSaveArgSpills =
(1 << art::arm::R1) | (1 << art::arm::R2) | (1 << art::arm::R3);
static constexpr uint32_t kArmCalleeSaveAllSpills =
(1 << art::arm::R4) | (1 << art::arm::R9);
-static constexpr uint32_t kArmCalleeSaveFpAllSpills =
+
+static constexpr uint32_t kArmCalleeSaveFpAlwaysSpills = 0;
+static constexpr uint32_t kArmCalleeSaveFpRefSpills = 0;
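+// With hardfloat quick code, s0-s15 carry FP arguments, so RefsAndArgs frames spill them.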
+static constexpr uint32_t kArmCalleeSaveFpArgSpills =
(1 << art::arm::S0) | (1 << art::arm::S1) | (1 << art::arm::S2) | (1 << art::arm::S3) |
(1 << art::arm::S4) | (1 << art::arm::S5) | (1 << art::arm::S6) | (1 << art::arm::S7) |
(1 << art::arm::S8) | (1 << art::arm::S9) | (1 << art::arm::S10) | (1 << art::arm::S11) |
- (1 << art::arm::S12) | (1 << art::arm::S13) | (1 << art::arm::S14) | (1 << art::arm::S15) |
+ (1 << art::arm::S12) | (1 << art::arm::S13) | (1 << art::arm::S14) | (1 << art::arm::S15);
+static constexpr uint32_t kArmCalleeSaveFpAllSpills =
(1 << art::arm::S16) | (1 << art::arm::S17) | (1 << art::arm::S18) | (1 << art::arm::S19) |
(1 << art::arm::S20) | (1 << art::arm::S21) | (1 << art::arm::S22) | (1 << art::arm::S23) |
(1 << art::arm::S24) | (1 << art::arm::S25) | (1 << art::arm::S26) | (1 << art::arm::S27) |
(1 << art::arm::S28) | (1 << art::arm::S29) | (1 << art::arm::S30) | (1 << art::arm::S31);
constexpr uint32_t ArmCalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
- return kArmCalleeSaveRefSpills | (type == Runtime::kRefsAndArgs ? kArmCalleeSaveArgSpills : 0) |
- (type == Runtime::kSaveAll ? kArmCalleeSaveAllSpills : 0) | (1 << art::arm::LR);
+ return kArmCalleeSaveAlwaysSpills | kArmCalleeSaveRefSpills |
+ (type == Runtime::kRefsAndArgs ? kArmCalleeSaveArgSpills : 0) |
+ (type == Runtime::kSaveAll ? kArmCalleeSaveAllSpills : 0);
}
constexpr uint32_t ArmCalleeSaveFpSpills(Runtime::CalleeSaveType type) {
- return type == Runtime::kSaveAll ? kArmCalleeSaveFpAllSpills : 0;
+ return kArmCalleeSaveFpAlwaysSpills | kArmCalleeSaveFpRefSpills |
+ (type == Runtime::kRefsAndArgs ? kArmCalleeSaveFpArgSpills : 0) |
+ (type == Runtime::kSaveAll ? kArmCalleeSaveFpAllSpills : 0);
}
constexpr uint32_t ArmCalleeSaveFrameSize(Runtime::CalleeSaveType type) {
diff --git a/runtime/arch/arm64/quick_method_frame_info_arm64.h b/runtime/arch/arm64/quick_method_frame_info_arm64.h
index 15c6c07..0e1e32b 100644
--- a/runtime/arch/arm64/quick_method_frame_info_arm64.h
+++ b/runtime/arch/arm64/quick_method_frame_info_arm64.h
@@ -54,7 +54,7 @@ static constexpr uint32_t kArm64CalleeSaveFpArgSpills =
(1 << art::arm64::D0) | (1 << art::arm64::D1) | (1 << art::arm64::D2) |
(1 << art::arm64::D3) | (1 << art::arm64::D4) | (1 << art::arm64::D5) |
(1 << art::arm64::D6) | (1 << art::arm64::D7);
-static constexpr uint32_t kArm64FpAllSpills =
+static constexpr uint32_t kArm64CalleeSaveFpAllSpills =
(1 << art::arm64::D8) | (1 << art::arm64::D9) | (1 << art::arm64::D10) |
(1 << art::arm64::D11) | (1 << art::arm64::D12) | (1 << art::arm64::D13) |
(1 << art::arm64::D14) | (1 << art::arm64::D15);
@@ -68,7 +68,7 @@ constexpr uint32_t Arm64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
constexpr uint32_t Arm64CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
return kArm64CalleeSaveFpAlwaysSpills | kArm64CalleeSaveFpRefSpills |
(type == Runtime::kRefsAndArgs ? kArm64CalleeSaveFpArgSpills: 0) |
- (type == Runtime::kSaveAll ? kArm64FpAllSpills : 0);
+ (type == Runtime::kSaveAll ? kArm64CalleeSaveFpAllSpills : 0);
}
constexpr uint32_t Arm64CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index af341bb..93c47dc 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -50,15 +50,19 @@ class QuickArgumentVisitor {
// | arg1 spill | |
// | Method* | ---
// | LR |
- // | ... | callee saves
- // | R3 | arg3
- // | R2 | arg2
- // | R1 | arg1
- // | R0 | padding
+ // | ... | 4x6 bytes callee saves (r5-r8, r10-r11)
+ // | R3 |
+ // | R2 |
+ // | R1 |
+ // | S15 |
+ // | : |
+ // | S0 |
+ // | | 4x2 bytes padding
// | Method* | <- sp
- static constexpr bool kQuickSoftFloatAbi = true; // This is a soft float ABI.
- static constexpr size_t kNumQuickGprArgs = 3; // 3 arguments passed in GPRs.
- static constexpr size_t kNumQuickFprArgs = 0; // 0 arguments passed in FPRs.
+ static constexpr bool kQuickSoftFloatAbi = kArm32QuickCodeUseSoftFloat;
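+ // Doubles go into aligned (even) register pairs; later floats may back-fill the single-slot holes.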
+ static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = !kArm32QuickCodeUseSoftFloat;
+ static constexpr size_t kNumQuickGprArgs = 3; // 3 arguments passed in GPRs.
+ static constexpr size_t kNumQuickFprArgs = kArm32QuickCodeUseSoftFloat ? 0 : 16; // 0 or 16 FPR args.
static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset =
arm::ArmCalleeSaveFpr1Offset(Runtime::kRefsAndArgs); // Offset of first FPR arg.
static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset =
@@ -90,6 +94,7 @@ class QuickArgumentVisitor {
// | | padding
// | Method* | <- sp
static constexpr bool kQuickSoftFloatAbi = false; // This is a hard float ABI.
+ static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
static constexpr size_t kNumQuickGprArgs = 7; // 7 arguments passed in GPRs.
static constexpr size_t kNumQuickFprArgs = 8; // 8 arguments passed in FPRs.
static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset =
@@ -117,6 +122,7 @@ class QuickArgumentVisitor {
// | A1 | arg1
// | A0/Method* | <- sp
static constexpr bool kQuickSoftFloatAbi = true; // This is a soft float ABI.
+ static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
static constexpr size_t kNumQuickGprArgs = 3; // 3 arguments passed in GPRs.
static constexpr size_t kNumQuickFprArgs = 0; // 0 arguments passed in FPRs.
static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0; // Offset of first FPR arg.
@@ -141,6 +147,7 @@ class QuickArgumentVisitor {
// | ECX | arg1
// | EAX/Method* | <- sp
static constexpr bool kQuickSoftFloatAbi = true; // This is a soft float ABI.
+ static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
static constexpr size_t kNumQuickGprArgs = 3; // 3 arguments passed in GPRs.
static constexpr size_t kNumQuickFprArgs = 0; // 0 arguments passed in FPRs.
static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0; // Offset of first FPR arg.
@@ -178,6 +185,7 @@ class QuickArgumentVisitor {
// | Padding |
// | RDI/Method* | <- sp
static constexpr bool kQuickSoftFloatAbi = false; // This is a hard float ABI.
+ static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
static constexpr size_t kNumQuickGprArgs = 5; // 5 arguments passed in GPRs.
static constexpr size_t kNumQuickFprArgs = 8; // 8 arguments passed in FPRs.
static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16; // Offset of first FPR arg.
@@ -222,8 +230,16 @@ class QuickArgumentVisitor {
fpr_args_(reinterpret_cast<uint8_t*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset),
stack_args_(reinterpret_cast<uint8_t*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize
+ StackArgumentStartFromShorty(is_static, shorty, shorty_len)),
- gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid),
- is_split_long_or_double_(false) {}
+ gpr_index_(0), fpr_index_(0), fpr_double_index_(0), stack_index_(0),
+ cur_type_(Primitive::kPrimVoid), is_split_long_or_double_(false) {
+ COMPILE_ASSERT(kQuickSoftFloatAbi == (kNumQuickFprArgs == 0), knum_of_quick_fpr_arg_unexpected);
+ COMPILE_ASSERT(!(kQuickSoftFloatAbi && kQuickDoubleRegAlignedFloatBackFilled),
+ kdouble_align_unexpected);
+ // For register alignment, we want to assume that counters (fpr_double_index_) are even iff the
+ // next register is even.
+ COMPILE_ASSERT(!kQuickDoubleRegAlignedFloatBackFilled || kNumQuickFprArgs % 2 == 0,
+ knum_quick_fpr_args_not_even);
+ }
virtual ~QuickArgumentVisitor() {}
@@ -237,7 +253,11 @@ class QuickArgumentVisitor {
if (!kQuickSoftFloatAbi) {
Primitive::Type type = GetParamPrimitiveType();
if (UNLIKELY((type == Primitive::kPrimDouble) || (type == Primitive::kPrimFloat))) {
- if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
+ if (type == Primitive::kPrimDouble && kQuickDoubleRegAlignedFloatBackFilled) {
+ if (fpr_double_index_ + 2 < kNumQuickFprArgs + 1) {
+ return fpr_args_ + (fpr_double_index_ * GetBytesPerFprSpillLocation(kRuntimeISA));
+ }
+ } else if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
return fpr_args_ + (fpr_index_ * GetBytesPerFprSpillLocation(kRuntimeISA));
}
return stack_args_ + (stack_index_ * kBytesStackArgLocation);
@@ -268,28 +288,30 @@ class QuickArgumentVisitor {
uint64_t ReadSplitLongParam() const {
DCHECK(IsSplitLongOrDouble());
+ // Read low half from register.
uint64_t low_half = *reinterpret_cast<uint32_t*>(GetParamAddress());
- uint64_t high_half = *reinterpret_cast<uint32_t*>(stack_args_);
+ // Read the high half from the stack. As stack_index_ currently indexes the argument itself,
+ // the high half lives at (stack_index_ + 1).
+ uint64_t high_half = *reinterpret_cast<uint32_t*>(stack_args_
+ + (stack_index_ + 1) * kBytesStackArgLocation);
return (low_half & 0xffffffffULL) | (high_half << 32);
}
void VisitArguments() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
- // This implementation doesn't support reg-spill area for hard float
- // ABI targets such as x86_64 and aarch64. So, for those targets whose
- // 'kQuickSoftFloatAbi' is 'false':
- // (a) 'stack_args_' should point to the first method's argument
- // (b) whatever the argument type it is, the 'stack_index_' should
- // be moved forward along with every visiting.
+ // (a) 'stack_args_' should point to the method's first argument, and
+ // (b) whatever the argument type is, 'stack_index_' should be advanced
+ // on every visit.
gpr_index_ = 0;
fpr_index_ = 0;
+ if (kQuickDoubleRegAlignedFloatBackFilled) {
+ fpr_double_index_ = 0;
+ }
stack_index_ = 0;
if (!is_static_) { // Handle this.
cur_type_ = Primitive::kPrimNot;
is_split_long_or_double_ = false;
Visit();
- if (!kQuickSoftFloatAbi || kNumQuickGprArgs == 0) {
- stack_index_++;
- }
+ stack_index_++;
if (kNumQuickGprArgs > 0) {
gpr_index_++;
}
@@ -305,9 +327,7 @@ class QuickArgumentVisitor {
case Primitive::kPrimInt:
is_split_long_or_double_ = false;
Visit();
- if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) {
- stack_index_++;
- }
+ stack_index_++;
if (gpr_index_ < kNumQuickGprArgs) {
gpr_index_++;
}
@@ -315,17 +335,24 @@ class QuickArgumentVisitor {
case Primitive::kPrimFloat:
is_split_long_or_double_ = false;
Visit();
+ stack_index_++;
if (kQuickSoftFloatAbi) {
if (gpr_index_ < kNumQuickGprArgs) {
gpr_index_++;
- } else {
- stack_index_++;
}
} else {
- if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
+ if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
fpr_index_++;
+ if (kQuickDoubleRegAlignedFloatBackFilled) {
+ // Double should not overlap with float.
+ // For example, if fpr_index_ = 3, fpr_double_index_ should be at least 4.
+ fpr_double_index_ = std::max(fpr_double_index_, RoundUp(fpr_index_, 2));
+ // Float should not overlap with double.
+ if (fpr_index_ % 2 == 0) {
+ fpr_index_ = std::max(fpr_double_index_, fpr_index_);
+ }
+ }
}
- stack_index_++;
}
break;
case Primitive::kPrimDouble:
@@ -334,42 +361,46 @@ class QuickArgumentVisitor {
is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
((gpr_index_ + 1) == kNumQuickGprArgs);
Visit();
- if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) {
- if (kBytesStackArgLocation == 4) {
- stack_index_+= 2;
- } else {
- CHECK_EQ(kBytesStackArgLocation, 8U);
- stack_index_++;
- }
+ if (kBytesStackArgLocation == 4) {
+ stack_index_+= 2;
+ } else {
+ CHECK_EQ(kBytesStackArgLocation, 8U);
+ stack_index_++;
}
if (gpr_index_ < kNumQuickGprArgs) {
gpr_index_++;
if (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) {
if (gpr_index_ < kNumQuickGprArgs) {
gpr_index_++;
- } else if (kQuickSoftFloatAbi) {
- stack_index_++;
}
}
}
} else {
is_split_long_or_double_ = (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) &&
- ((fpr_index_ + 1) == kNumQuickFprArgs);
+ ((fpr_index_ + 1) == kNumQuickFprArgs) && !kQuickDoubleRegAlignedFloatBackFilled;
Visit();
- if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
- fpr_index_++;
- if (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) {
- if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
- fpr_index_++;
- }
- }
- }
if (kBytesStackArgLocation == 4) {
stack_index_+= 2;
} else {
CHECK_EQ(kBytesStackArgLocation, 8U);
stack_index_++;
}
+ if (kQuickDoubleRegAlignedFloatBackFilled) {
+ if (fpr_double_index_ + 2 < kNumQuickFprArgs + 1) {
+ fpr_double_index_ += 2;
+ // Float should not overlap with double.
+ if (fpr_index_ % 2 == 0) {
+ fpr_index_ = std::max(fpr_double_index_, fpr_index_);
+ }
+ }
+ } else if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
+ fpr_index_++;
+ if (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) {
+ if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
+ fpr_index_++;
+ }
+ }
+ }
}
break;
default:
@@ -381,16 +412,8 @@ class QuickArgumentVisitor {
private:
static size_t StackArgumentStartFromShorty(bool is_static, const char* shorty,
uint32_t shorty_len) {
- if (kQuickSoftFloatAbi) {
- CHECK_EQ(kNumQuickFprArgs, 0U);
- return (kNumQuickGprArgs * GetBytesPerGprSpillLocation(kRuntimeISA))
- + sizeof(StackReference<mirror::ArtMethod>) /* StackReference<ArtMethod> */;
- } else {
- // For now, there is no reg-spill area for the targets with
- // hard float ABI. So, the offset pointing to the first method's
- // parameter ('this' for non-static methods) should be returned.
- return sizeof(StackReference<mirror::ArtMethod>); // Skip StackReference<ArtMethod>.
- }
+ // 'stack_args_' points to the method's first argument.
+ return sizeof(StackReference<mirror::ArtMethod>); // Skip StackReference<ArtMethod>.
}
protected:
@@ -403,7 +426,14 @@ class QuickArgumentVisitor {
uint8_t* const fpr_args_; // Address of FPR arguments in callee save frame.
uint8_t* const stack_args_; // Address of stack arguments in caller's frame.
uint32_t gpr_index_; // Index into spilled GPRs.
- uint32_t fpr_index_; // Index into spilled FPRs.
+ // Index into spilled FPRs.
+ // In case kQuickDoubleRegAlignedFloatBackFilled, it may index a hole while fpr_double_index_
+ // holds a higher register number.
+ uint32_t fpr_index_;
+ // Index into spilled FPRs for aligned doubles.
+ // Only used when kQuickDoubleRegAlignedFloatBackFilled. It is the next available double
+ // register, indexed in terms of singles, and may be behind fpr_index_.
+ uint32_t fpr_double_index_;
uint32_t stack_index_; // Index into arguments on the stack.
// The current type of argument during VisitArguments.
Primitive::Type cur_type_;
@@ -943,8 +973,8 @@ template<class T> class BuildNativeCallFrameStateMachine {
delegate_(delegate) {
// For register alignment, we want to assume that counters (gpr_index_, fpr_index_) are even iff
// the next register is even; counting down is just to make the compiler happy...
- CHECK_EQ(kNumNativeGprArgs % 2, 0U);
- CHECK_EQ(kNumNativeFprArgs % 2, 0U);
+ COMPILE_ASSERT(kNumNativeGprArgs % 2 == 0U, knum_native_gpr_args_not_even);
+ COMPILE_ASSERT(kNumNativeFprArgs % 2 == 0U, knum_native_fpr_args_not_even);
}
virtual ~BuildNativeCallFrameStateMachine() {}
diff --git a/runtime/globals.h b/runtime/globals.h
index b7bd44d..4d33196 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -112,6 +112,8 @@ static constexpr TraceClockSource kDefaultTraceClockSource = kTraceClockSourceWa
static constexpr bool kDefaultMustRelocate = true;
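+// When false, ARM32 quick code passes and returns floats/doubles in VFP registers (hardfloat);
+// when true, it keeps the previous core-register (soft-float) quick conventions.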
+static constexpr bool kArm32QuickCodeUseSoftFloat = false;
+
} // namespace art
#endif // ART_RUNTIME_GLOBALS_H_
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index 9584d15..b219004 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -43,7 +43,7 @@ namespace mirror {
extern "C" void art_portable_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*, char);
extern "C" void art_quick_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
const char*);
-#ifdef __LP64__
+#if defined(__LP64__) || defined(__arm__)
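+// ARM32 now also provides a separate static stub (see arch/arm/quick_entrypoints_cc_arm.cc).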
extern "C" void art_quick_invoke_static_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
const char*);
#endif
@@ -396,7 +396,7 @@ void ArtMethod::Invoke(Thread* self, uint32_t* args, uint32_t args_size, JValue*
}
if (!IsPortableCompiled()) {
-#ifdef __LP64__
+#if defined(__LP64__) || defined(__arm__)
if (!IsStatic()) {
(*art_quick_invoke_stub)(this, args, args_size, self, result, shorty);
} else {