-rw-r--r--  compiler/dex/backend.h                 |   7
-rw-r--r--  compiler/dex/compiler_enums.h          |  33
-rw-r--r--  compiler/dex/mir_analysis.cc           |  48
-rw-r--r--  compiler/dex/mir_dataflow.cc           | 124
-rw-r--r--  compiler/dex/mir_graph.cc              |  14
-rw-r--r--  compiler/dex/quick/x86/assemble_x86.cc |  32
-rw-r--r--  compiler/dex/quick/x86/codegen_x86.h   |  48
-rwxr-xr-x  compiler/dex/quick/x86/target_x86.cc   | 658
-rw-r--r--  compiler/dex/quick/x86/utility_x86.cc  |  11
-rw-r--r--  compiler/dex/quick/x86/x86_lir.h       |  14
-rw-r--r--  disassembler/disassembler_x86.cc       |  93
11 files changed, 710 insertions, 372 deletions
diff --git a/compiler/dex/backend.h b/compiler/dex/backend.h index 1f24849..cab3427 100644 --- a/compiler/dex/backend.h +++ b/compiler/dex/backend.h @@ -38,14 +38,15 @@ class Backend { /* * Return the number of reservable vector registers supported - * @param fp_used ‘true’ if floating point computations will be - * executed while vector registers are reserved. + * @param long_or_fp ‘true’ if floating point computations will be + * executed or the operations will be long type while vector + * registers are reserved. * @return the number of vector registers that are available * @note The backend should ensure that sufficient vector registers * are held back to generate scalar code without exhausting vector * registers, if scalar code also uses the vector registers. */ - virtual int NumReservableVectorRegisters(bool fp_used) { return 0; } + virtual int NumReservableVectorRegisters(bool long_or_fp) { return 0; } protected: explicit Backend(ArenaAllocator* arena) : arena_(arena) {} diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h index 9c2a8ba..e4003bf 100644 --- a/compiler/dex/compiler_enums.h +++ b/compiler/dex/compiler_enums.h @@ -256,13 +256,16 @@ enum ExtendedMIROpcode { // vC: TypeSize kMirOpPackedSet, - // @brief Reserve N vector registers (named 0..N-1) - // vA: Number of registers + // @brief Reserve a range of vector registers. + // vA: Start vector register to reserve. + // vB: Inclusive end vector register to reserve. // @note: The backend may choose to map vector numbers used in vector opcodes. // Reserved registers are removed from the list of backend temporary pool. kMirOpReserveVectorRegisters, - // @brief Free Reserved vector registers + // @brief Free a range of reserved vector registers + // vA: Start vector register to unreserve. + // vB: Inclusive end vector register to unreserve. // @note: All currently reserved vector registers are returned to the temporary pool. kMirOpReturnVectorRegisters, @@ -270,6 +273,30 @@ enum ExtendedMIROpcode { // vA: a constant defined by enum MemBarrierKind. kMirOpMemBarrier, + // @brief Used to fill a vector register with array values. + // @details Just as with normal arrays, access on null object register must ensure NullPointerException + // and invalid index must ensure ArrayIndexOutOfBoundsException. Exception behavior must be the same + // as the aget it replaced and must happen at same index. Therefore, it is generally recommended that + // before using this MIR, it is proven that exception is guaranteed to not be thrown and marked with + // MIR_IGNORE_NULL_CHECK and MIR_IGNORE_RANGE_CHECK. + // vA: destination vector register + // vB: array register + // vC: index register + // arg[0]: TypeSize (most other vector opcodes have this in vC) + kMirOpPackedArrayGet, + + // @brief Used to store a vector register into array. + // @details Just as with normal arrays, access on null object register must ensure NullPointerException + // and invalid index must ensure ArrayIndexOutOfBoundsException. Exception behavior must be the same + // as the aget it replaced and must happen at same index. Therefore, it is generally recommended that + // before using this MIR, it is proven that exception is guaranteed to not be thrown and marked with + // MIR_IGNORE_NULL_CHECK and MIR_IGNORE_RANGE_CHECK. 
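The operand layout documented above for kMirOpPackedArrayGet is easiest to see filled in. A minimal sketch, assuming the ART compiler headers and a hypothetical vectorization pass that supplies the register numbers; the field names (dalvikInsn, arg[0], optimization_flags) are the ones this change uses elsewhere:

// Hedged sketch only -- not part of this change. Shows how a pass might shape
// a kMirOpPackedArrayGet MIR per the documentation above.
void FillPackedArrayGet(MIR* mir, uint32_t dest_vector_reg, uint32_t array_vreg,
                        uint32_t index_vreg, OpSize element_size) {
  mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpPackedArrayGet);
  mir->dalvikInsn.vA = dest_vector_reg;   // destination vector register
  mir->dalvikInsn.vB = array_vreg;        // array (object) register
  mir->dalvikInsn.vC = index_vreg;        // index register
  // TypeSize goes in arg[0] here (most other vector opcodes keep it in vC):
  // element OpSize in the high 16 bits, vector width in bits in the low 16.
  mir->dalvikInsn.arg[0] = (static_cast<uint32_t>(element_size) << 16) | 128u;
  // Only once the checks are proven redundant, as the note above recommends:
  mir->optimization_flags |= MIR_IGNORE_NULL_CHECK | MIR_IGNORE_RANGE_CHECK;
}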
+ // vA: source vector register + // vB: array register + // vC: index register + // arg[0]: TypeSize (most other vector opcodes have this in vC) + kMirOpPackedArrayPut, + kMirOpLast, }; diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc index a8af92c..b265ee7 100644 --- a/compiler/dex/mir_analysis.cc +++ b/compiler/dex/mir_analysis.cc @@ -830,68 +830,74 @@ const uint32_t MIRGraph::analysis_attributes_[kMirOpLast] = { // 109 MIR_RANGE_CHECK AN_NONE, - // 110 MIR_DIV_ZERO_CHECK + // 10A MIR_DIV_ZERO_CHECK AN_NONE, - // 111 MIR_CHECK + // 10B MIR_CHECK AN_NONE, - // 112 MIR_CHECKPART2 + // 10C MIR_CHECKPART2 AN_NONE, - // 113 MIR_SELECT + // 10D MIR_SELECT AN_NONE, - // 114 MirOpConstVector + // 10E MirOpConstVector AN_NONE, - // 115 MirOpMoveVector + // 10F MirOpMoveVector AN_NONE, - // 116 MirOpPackedMultiply + // 110 MirOpPackedMultiply AN_NONE, - // 117 MirOpPackedAddition + // 111 MirOpPackedAddition AN_NONE, - // 118 MirOpPackedSubtract + // 112 MirOpPackedSubtract AN_NONE, - // 119 MirOpPackedShiftLeft + // 113 MirOpPackedShiftLeft AN_NONE, - // 120 MirOpPackedSignedShiftRight + // 114 MirOpPackedSignedShiftRight AN_NONE, - // 121 MirOpPackedUnsignedShiftRight + // 115 MirOpPackedUnsignedShiftRight AN_NONE, - // 122 MirOpPackedAnd + // 116 MirOpPackedAnd AN_NONE, - // 123 MirOpPackedOr + // 117 MirOpPackedOr AN_NONE, - // 124 MirOpPackedXor + // 118 MirOpPackedXor AN_NONE, - // 125 MirOpPackedAddReduce + // 119 MirOpPackedAddReduce AN_NONE, - // 126 MirOpPackedReduce + // 11A MirOpPackedReduce AN_NONE, - // 127 MirOpPackedSet + // 11B MirOpPackedSet AN_NONE, - // 128 MirOpReserveVectorRegisters + // 11C MirOpReserveVectorRegisters AN_NONE, - // 129 MirOpReturnVectorRegisters + // 11D MirOpReturnVectorRegisters AN_NONE, - // 130 MirOpMemBarrier + // 11E MirOpMemBarrier AN_NONE, + + // 11F MirOpPackedArrayGet + AN_ARRAYOP, + + // 120 MirOpPackedArrayPut + AN_ARRAYOP, }; struct MethodStats { diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc index 4c906b0..d9531fb 100644 --- a/compiler/dex/mir_dataflow.cc +++ b/compiler/dex/mir_dataflow.cc @@ -829,68 +829,74 @@ const uint64_t MIRGraph::oat_data_flow_attributes_[kMirOpLast] = { // 109 MIR_RANGE_CHECK 0, - // 110 MIR_DIV_ZERO_CHECK + // 10A MIR_DIV_ZERO_CHECK 0, - // 111 MIR_CHECK + // 10B MIR_CHECK 0, - // 112 MIR_CHECKPART2 + // 10C MIR_CHECKPART2 0, - // 113 MIR_SELECT + // 10D MIR_SELECT DF_DA | DF_UB, - // 114 MirOpConstVector - DF_DA, + // 10E MirOpConstVector + 0, - // 115 MirOpMoveVector + // 10F MirOpMoveVector 0, - // 116 MirOpPackedMultiply + // 110 MirOpPackedMultiply 0, - // 117 MirOpPackedAddition + // 111 MirOpPackedAddition 0, - // 118 MirOpPackedSubtract + // 112 MirOpPackedSubtract 0, - // 119 MirOpPackedShiftLeft + // 113 MirOpPackedShiftLeft 0, - // 120 MirOpPackedSignedShiftRight + // 114 MirOpPackedSignedShiftRight 0, - // 121 MirOpPackedUnsignedShiftRight + // 115 MirOpPackedUnsignedShiftRight 0, - // 122 MirOpPackedAnd + // 116 MirOpPackedAnd 0, - // 123 MirOpPackedOr + // 117 MirOpPackedOr 0, - // 124 MirOpPackedXor + // 118 MirOpPackedXor 0, - // 125 MirOpPackedAddReduce - DF_DA | DF_UA, + // 119 MirOpPackedAddReduce + DF_FORMAT_EXTENDED, - // 126 MirOpPackedReduce - DF_DA, + // 11A MirOpPackedReduce + DF_FORMAT_EXTENDED, - // 127 MirOpPackedSet - DF_UB, + // 11B MirOpPackedSet + DF_FORMAT_EXTENDED, - // 128 MirOpReserveVectorRegisters + // 11C MirOpReserveVectorRegisters 0, - // 129 MirOpReturnVectorRegisters + // 11D MirOpReturnVectorRegisters 0, - // 130 
MirOpMemBarrier + // 11E MirOpMemBarrier 0, + + // 11F MirOpPackedArrayGet + DF_UB | DF_UC | DF_NULL_CHK_0 | DF_RANGE_CHK_1 | DF_REF_B | DF_CORE_C | DF_LVN, + + // 120 MirOpPackedArrayPut + DF_UB | DF_UC | DF_NULL_CHK_0 | DF_RANGE_CHK_1 | DF_REF_B | DF_CORE_C | DF_LVN, }; /* Return the base virtual register for a SSA name */ @@ -915,7 +921,36 @@ void MIRGraph::HandleDef(ArenaBitVector* def_v, int dalvik_reg_id) { void MIRGraph::HandleExtended(ArenaBitVector* use_v, ArenaBitVector* def_v, ArenaBitVector* live_in_v, const MIR::DecodedInstruction& d_insn) { + // For vector MIRs, vC contains type information + bool is_vector_type_wide = false; + int type_size = d_insn.vC >> 16; + if (type_size == k64 || type_size == kDouble) { + is_vector_type_wide = true; + } + switch (static_cast<int>(d_insn.opcode)) { + case kMirOpPackedAddReduce: + HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vA); + if (is_vector_type_wide == true) { + HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vA + 1); + } + HandleDef(def_v, d_insn.vA); + if (is_vector_type_wide == true) { + HandleDef(def_v, d_insn.vA + 1); + } + break; + case kMirOpPackedReduce: + HandleDef(def_v, d_insn.vA); + if (is_vector_type_wide == true) { + HandleDef(def_v, d_insn.vA + 1); + } + break; + case kMirOpPackedSet: + HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vB); + if (is_vector_type_wide == true) { + HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vB + 1); + } + break; default: LOG(ERROR) << "Unexpected Extended Opcode " << d_insn.opcode; break; @@ -1064,7 +1099,46 @@ void MIRGraph::DataFlowSSAFormat3RC(MIR* mir) { } void MIRGraph::DataFlowSSAFormatExtended(MIR* mir) { + const MIR::DecodedInstruction& d_insn = mir->dalvikInsn; + // For vector MIRs, vC contains type information + bool is_vector_type_wide = false; + int type_size = d_insn.vC >> 16; + if (type_size == k64 || type_size == kDouble) { + is_vector_type_wide = true; + } + switch (static_cast<int>(mir->dalvikInsn.opcode)) { + case kMirOpPackedAddReduce: + // We have one use, plus one more for wide + AllocateSSAUseData(mir, is_vector_type_wide ? 2 : 1); + HandleSSAUse(mir->ssa_rep->uses, d_insn.vA, 0); + if (is_vector_type_wide == true) { + HandleSSAUse(mir->ssa_rep->uses, d_insn.vA + 1, 1); + } + + // We have a def, plus one more for wide + AllocateSSADefData(mir, is_vector_type_wide ? 2 : 1); + HandleSSADef(mir->ssa_rep->defs, d_insn.vA, 0); + if (is_vector_type_wide == true) { + HandleSSADef(mir->ssa_rep->defs, d_insn.vA + 1, 1); + } + break; + case kMirOpPackedReduce: + // We have a def, plus one more for wide + AllocateSSADefData(mir, is_vector_type_wide ? 2 : 1); + HandleSSADef(mir->ssa_rep->defs, d_insn.vA, 0); + if (is_vector_type_wide == true) { + HandleSSADef(mir->ssa_rep->defs, d_insn.vA + 1, 1); + } + break; + case kMirOpPackedSet: + // We have one use, plus one more for wide + AllocateSSAUseData(mir, is_vector_type_wide ? 
2 : 1); + HandleSSAUse(mir->ssa_rep->uses, d_insn.vB, 0); + if (is_vector_type_wide == true) { + HandleSSAUse(mir->ssa_rep->uses, d_insn.vB + 1, 1); + } + break; default: LOG(ERROR) << "Missing case for extended MIR: " << mir->dalvikInsn.opcode; break; diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc index e77be5d..62a8f26 100644 --- a/compiler/dex/mir_graph.cc +++ b/compiler/dex/mir_graph.cc @@ -68,6 +68,8 @@ const char* MIRGraph::extended_mir_op_names_[kMirOpLast - kMirOpFirst] = { "ReserveVectorRegisters", "ReturnVectorRegisters", "MemBarrier", + "PackedArrayGet", + "PackedArrayPut", }; MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena) @@ -1386,6 +1388,18 @@ void MIRGraph::DisassembleExtendedInstr(const MIR* mir, std::string* decoded_mir decoded_mir->append(ss.str()); break; } + case kMirOpPackedArrayGet: + case kMirOpPackedArrayPut: + decoded_mir->append(StringPrintf(" vect%d", mir->dalvikInsn.vA)); + if (ssa_rep != nullptr) { + decoded_mir->append(StringPrintf(", %s[%s]", + GetSSANameWithConst(ssa_rep->uses[0], false).c_str(), + GetSSANameWithConst(ssa_rep->uses[1], false).c_str())); + } else { + decoded_mir->append(StringPrintf(", v%d[v%d]", mir->dalvikInsn.vB, mir->dalvikInsn.vC)); + } + FillTypeSizeString(mir->dalvikInsn.arg[0], decoded_mir); + break; default: break; } diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc index 46f5dd3..9935a22 100644 --- a/compiler/dex/quick/x86/assemble_x86.cc +++ b/compiler/dex/quick/x86/assemble_x86.cc @@ -16,6 +16,7 @@ #include "codegen_x86.h" #include "dex/quick/mir_to_lir-inl.h" +#include "oat.h" #include "x86_lir.h" namespace art { @@ -389,20 +390,27 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, EXT_0F_ENCODING_MAP(Subss, 0xF3, 0x5C, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Divsd, 0xF2, 0x5E, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Divss, 0xF3, 0x5E, REG_DEF0_USE0), + EXT_0F_ENCODING_MAP(Punpcklbw, 0x66, 0x60, REG_DEF0_USE0), + EXT_0F_ENCODING_MAP(Punpcklwd, 0x66, 0x61, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0_USE0), + EXT_0F_ENCODING_MAP(Punpcklqdq, 0x66, 0x6C, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Sqrtsd, 0xF2, 0x51, REG_DEF0_USE0), EXT_0F_ENCODING2_MAP(Pmulld, 0x66, 0x38, 0x40, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Pmullw, 0x66, 0xD5, REG_DEF0_USE0), + EXT_0F_ENCODING_MAP(Pmuludq, 0x66, 0xF4, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Mulps, 0x00, 0x59, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Mulpd, 0x66, 0x59, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Paddb, 0x66, 0xFC, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Paddw, 0x66, 0xFD, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Paddd, 0x66, 0xFE, REG_DEF0_USE0), + EXT_0F_ENCODING_MAP(Paddq, 0x66, 0xD4, REG_DEF0_USE0), + EXT_0F_ENCODING_MAP(Psadbw, 0x66, 0xF6, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Addps, 0x00, 0x58, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Addpd, 0xF2, 0x58, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Psubb, 0x66, 0xF8, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Psubw, 0x66, 0xF9, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Psubd, 0x66, 0xFA, REG_DEF0_USE0), + EXT_0F_ENCODING_MAP(Psubq, 0x66, 0xFB, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Subps, 0x00, 0x5C, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Subpd, 0x66, 0x5C, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Pand, 0x66, 0xDB, REG_DEF0_USE0), @@ -431,6 +439,7 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, { kX86PsrlwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 2, 0, 1, false }, "PsrlwRI", "!0r,!1d" }, { kX86PsrldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 2, 0, 1, false }, "PsrldRI", "!0r,!1d" 
}, { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1, false }, "PsrlqRI", "!0r,!1d" }, + { kX86PsrldqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 3, 0, 1, false }, "PsrldqRI", "!0r,!1d" }, { kX86PsllwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 6, 0, 1, false }, "PsllwRI", "!0r,!1d" }, { kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1, false }, "PslldRI", "!0r,!1d" }, { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1, false }, "PsllqRI", "!0r,!1d" }, @@ -447,9 +456,9 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, { kX86Fucompp, kNullary, NO_OPERAND | USE_FP_STACK, { 0xDA, 0, 0xE9, 0, 0, 0, 0, 0, false }, "Fucompp", "" }, { kX86Fstsw16R, kNullary, NO_OPERAND | REG_DEFA | USE_FP_STACK, { 0x9B, 0xDF, 0xE0, 0, 0, 0, 0, 0, false }, "Fstsw16R", "ax" }, - EXT_0F_ENCODING_MAP(Mova128, 0x66, 0x6F, REG_DEF0), - { kX86Mova128MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128MR", "[!0r+!1d],!2r" }, - { kX86Mova128AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128AR", "[!0r+!1r<<!2d+!3d],!4r" }, + EXT_0F_ENCODING_MAP(Movdqa, 0x66, 0x6F, REG_DEF0), + { kX86MovdqaMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "MovdqaMR", "[!0r+!1d],!2r" }, + { kX86MovdqaAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "MovdqaAR", "[!0r+!1r<<!2d+!3d],!4r" }, EXT_0F_ENCODING_MAP(Movups, 0x0, 0x10, REG_DEF0), @@ -1956,17 +1965,12 @@ void X86Mir2Lir::AssignOffsets() { int offset = AssignInsnOffsets(); if (const_vectors_ != nullptr) { - /* assign offsets to vector literals */ - - // First, get offset to 12 mod 16 to align to 16 byte boundary. - // This will ensure that the vector is 16 byte aligned, as the procedure is - // always aligned at at 4 mod 16. - int align_size = (16-4) - (offset & 0xF); - if (align_size < 0) { - align_size += 16; - } - - offset += align_size; + // Vector literals must be 16-byte aligned. The header that is placed + // in the code section causes misalignment so we take it into account. + // Otherwise, we are sure that for x86 method is aligned to 16. + DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u); + uint32_t bytes_to_fill = (0x10 - ((offset + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF; + offset += bytes_to_fill; // Now assign each literal the right offset. for (LIR *p = const_vectors_; p != nullptr; p = p->next) { diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index 7ad917d..a85e02f 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -125,7 +125,7 @@ class X86Mir2Lir : public Mir2Lir { void CompilerInitializeRegAlloc() OVERRIDE; int VectorRegisterSize() OVERRIDE; - int NumReservableVectorRegisters(bool fp_used) OVERRIDE; + int NumReservableVectorRegisters(bool long_or_fp) OVERRIDE; // Required for target - miscellaneous. 
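The padding computed in AssignOffsets() above (and again in InstallLiteralPools() further down) is plain modular arithmetic on the code offset plus the method header. A small standalone sketch of the same formula; the 24-byte header size is only an example, the real sizeof(OatQuickMethodHeader) depends on the build:

#include <cstdint>
#include <cstdio>

// Pad so that header_size + offset + padding lands on a 16-byte boundary.
static uint32_t BytesToFill(uint32_t offset, uint32_t header_size) {
  return (0x10u - ((offset + header_size) & 0xFu)) & 0xFu;
}

int main() {
  // Code ends at offset 100, header takes 24 bytes: 124 % 16 == 12,
  // so 4 bytes of padding make the vector literals start 16-byte aligned.
  std::printf("%u\n", BytesToFill(100, 24));  // prints 4
  return 0;
}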
void AssembleLIR() OVERRIDE; @@ -479,7 +479,8 @@ class X86Mir2Lir : public Mir2Lir { void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1, int64_t val, ConditionCode ccode); void GenConstWide(RegLocation rl_dest, int64_t value); - void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir); + void GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2); + void GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2); void GenShiftByteVector(BasicBlock *bb, MIR *mir); void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4); @@ -521,20 +522,18 @@ class X86Mir2Lir : public Mir2Lir { bool GenInlinedIndexOf(CallInfo* info, bool zero_based); /** - * @brief Reserve a fixed number of vector registers from the register pool - * @details The mir->dalvikInsn.vA specifies an N such that vector registers - * [0..N-1] are removed from the temporary pool. The caller must call - * ReturnVectorRegisters before calling ReserveVectorRegisters again. - * Also sets the num_reserved_vector_regs_ to the specified value - * @param mir whose vA specifies the number of registers to reserve + * @brief Used to reserve a range of vector registers. + * @see kMirOpReserveVectorRegisters + * @param mir The extended MIR for reservation. */ void ReserveVectorRegisters(MIR* mir); /** - * @brief Return all the reserved vector registers to the temp pool - * @details Returns [0..num_reserved_vector_regs_] + * @brief Used to return a range of vector registers. + * @see kMirOpReturnVectorRegisters + * @param mir The extended MIR for returning vector regs. */ - void ReturnVectorRegisters(); + void ReturnVectorRegisters(MIR* mir); /* * @brief Load 128 bit constant into vector register. @@ -684,6 +683,20 @@ class X86Mir2Lir : public Mir2Lir { */ void GenSetVector(BasicBlock *bb, MIR *mir); + /** + * @brief Used to generate code for kMirOpPackedArrayGet. + * @param bb The basic block of MIR. + * @param mir The mir whose opcode is kMirOpPackedArrayGet. + */ + void GenPackedArrayGet(BasicBlock *bb, MIR *mir); + + /** + * @brief Used to generate code for kMirOpPackedArrayPut. + * @param bb The basic block of MIR. + * @param mir The mir whose opcode is kMirOpPackedArrayPut. + */ + void GenPackedArrayPut(BasicBlock *bb, MIR *mir); + /* * @brief Generate code for a vector opcode. * @param bb The basic block in which the MIR is from. @@ -937,20 +950,20 @@ class X86Mir2Lir : public Mir2Lir { LIR* stack_increment_; // The list of const vector literals. - LIR *const_vectors_; + LIR* const_vectors_; /* * @brief Search for a matching vector literal - * @param mir A kMirOpConst128b MIR instruction to match. + * @param constants An array of size 4 which contains all of 32-bit constants. * @returns pointer to matching LIR constant, or nullptr if not found. */ - LIR *ScanVectorLiteral(MIR *mir); + LIR* ScanVectorLiteral(int32_t* constants); /* * @brief Add a constant vector literal - * @param mir A kMirOpConst128b MIR instruction to match. + * @param constants An array of size 4 which contains all of 32-bit constants. 
*/ - LIR *AddVectorLiteral(MIR *mir); + LIR* AddVectorLiteral(int32_t* constants); InToRegStorageMapping in_to_reg_storage_mapping_; @@ -970,9 +983,6 @@ class X86Mir2Lir : public Mir2Lir { static const X86EncodingMap EncodingMap[kX86Last]; private: - // The number of vector registers [0..N] reserved by a call to ReserveVectorRegisters - int num_reserved_vector_regs_; - void SwapBits(RegStorage result_reg, int shift, int32_t value); void SwapBits64(RegStorage result_reg, int shift, int64_t value); }; diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 68c1633..ffe6702 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -24,6 +24,7 @@ #include "dex/reg_storage_eq.h" #include "mirror/array.h" #include "mirror/string.h" +#include "oat.h" #include "x86_lir.h" #include "utils/dwarf_cfi.h" @@ -454,7 +455,7 @@ RegStorage X86Mir2Lir::AllocateByteRegister() { } RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) { - return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg(); + return GetRegInfo(reg)->Master()->GetReg(); } bool X86Mir2Lir::IsByteRegister(RegStorage reg) { @@ -689,8 +690,11 @@ int X86Mir2Lir::VectorRegisterSize() { return 128; } -int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) { - return fp_used ? 5 : 7; +int X86Mir2Lir::NumReservableVectorRegisters(bool long_or_fp) { + int num_vector_temps = cu_->target64 ? xp_temps_64.size() : xp_temps_32.size(); + + // Leave a few temps for use by backend as scratch. + return long_or_fp ? num_vector_temps - 2 : num_vector_temps - 1; } void X86Mir2Lir::SpillCoreRegs() { @@ -864,9 +868,6 @@ X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* rX86_RET1 = rDX; rX86_INVOKE_TGT = rAX; rX86_COUNT = rCX; - - // Initialize the number of reserved vector registers - num_reserved_vector_regs_ = -1; } Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph, @@ -1022,19 +1023,18 @@ void X86Mir2Lir::InstallLiteralPools() { DCHECK(method_literal_list_ == nullptr); DCHECK(class_literal_list_ == nullptr); - // Align to 16 byte boundary. We have implicit knowledge that the start of the method is - // on a 4 byte boundary. How can I check this if it changes (other than aligned loads - // will fail at runtime)? - if (const_vectors_ != nullptr) { - int align_size = (16-4) - (code_buffer_.size() & 0xF); - if (align_size < 0) { - align_size += 16; - } - while (align_size > 0) { + if (const_vectors_ != nullptr) { + // Vector literals must be 16-byte aligned. The header that is placed + // in the code section causes misalignment so we take it into account. + // Otherwise, we are sure that for x86 method is aligned to 16. 
+ DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u); + uint32_t bytes_to_fill = (0x10 - ((code_buffer_.size() + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF; + while (bytes_to_fill > 0) { code_buffer_.push_back(0); - align_size--; + bytes_to_fill--; } + for (LIR *p = const_vectors_; p != nullptr; p = p->next) { PushWord(&code_buffer_, p->operands[0]); PushWord(&code_buffer_, p->operands[1]); @@ -1489,7 +1489,7 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) { ReserveVectorRegisters(mir); break; case kMirOpReturnVectorRegisters: - ReturnVectorRegisters(); + ReturnVectorRegisters(mir); break; case kMirOpConstVector: GenConst128(bb, mir); @@ -1536,17 +1536,19 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) { case kMirOpMemBarrier: GenMemBarrier(static_cast<MemBarrierKind>(mir->dalvikInsn.vA)); break; + case kMirOpPackedArrayGet: + GenPackedArrayGet(bb, mir); + break; + case kMirOpPackedArrayPut: + GenPackedArrayPut(bb, mir); + break; default: break; } } void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) { - // We should not try to reserve twice without returning the registers - DCHECK_NE(num_reserved_vector_regs_, -1); - - int num_vector_reg = mir->dalvikInsn.vA; - for (int i = 0; i < num_vector_reg; i++) { + for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) { RegStorage xp_reg = RegStorage::Solo128(i); RegisterInfo *xp_reg_info = GetRegInfo(xp_reg); Clobber(xp_reg); @@ -1561,13 +1563,10 @@ void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) { } } } - - num_reserved_vector_regs_ = num_vector_reg; } -void X86Mir2Lir::ReturnVectorRegisters() { - // Return all the reserved registers - for (int i = 0; i < num_reserved_vector_regs_; i++) { +void X86Mir2Lir::ReturnVectorRegisters(MIR* mir) { + for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) { RegStorage xp_reg = RegStorage::Solo128(i); RegisterInfo *xp_reg_info = GetRegInfo(xp_reg); @@ -1581,17 +1580,12 @@ void X86Mir2Lir::ReturnVectorRegisters() { } } } - - // We don't have anymore reserved vector registers - num_reserved_vector_regs_ = -1; } void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) { - store_method_addr_used_ = true; - int type_size = mir->dalvikInsn.vB; - // We support 128 bit vectors. - DCHECK_EQ(type_size & 0xFFFF, 128); RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest); + uint32_t *args = mir->dalvikInsn.arg; int reg = rs_dest.GetReg(); // Check for all 0 case. @@ -1601,14 +1595,24 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) { } // Append the mov const vector to reg opcode. - AppendOpcodeWithConst(kX86MovupsRM, reg, mir); + AppendOpcodeWithConst(kX86MovdqaRM, reg, mir); } void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) { - // Okay, load it from the constant vector area. - LIR *data_target = ScanVectorLiteral(mir); + // The literal pool needs position independent logic. + store_method_addr_used_ = true; + + // To deal with correct memory ordering, reverse order of constants. + int32_t constants[4]; + constants[3] = mir->dalvikInsn.arg[0]; + constants[2] = mir->dalvikInsn.arg[1]; + constants[1] = mir->dalvikInsn.arg[2]; + constants[0] = mir->dalvikInsn.arg[3]; + + // Search if there is already a constant in pool with this value. + LIR *data_target = ScanVectorLiteral(constants); if (data_target == nullptr) { - data_target = AddVectorLiteral(mir); + data_target = AddVectorLiteral(constants); } // Address the start of the method. 
@@ -1624,7 +1628,7 @@ void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) { // 4 byte offset. We will fix this up in the assembler later to have the right // value. ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral); - LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg()); + LIR *load = NewLIR3(opcode, reg, rl_method.reg.GetReg(), 256 /* bogus */); load->flags.fixup = kFixupLoad; load->target = data_target; } @@ -1633,16 +1637,12 @@ void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest); RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB); - NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg()); + NewLIR2(kX86MovdqaRR, rs_dest.GetReg(), rs_src.GetReg()); } -void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) { - const int BYTE_SIZE = 8; - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide()); - +void X86Mir2Lir::GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2) { /* * Emulate the behavior of a kSignedByte by separating out the 16 values in the two XMM * and multiplying 8 at a time before recombining back into one XMM register. @@ -1660,29 +1660,100 @@ void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) { */ // Copy xmm1. - NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg()); + RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempDouble()); + RegStorage rs_dest_high_tmp = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_src1_high_tmp.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86MovdqaRR, rs_dest_high_tmp.GetReg(), rs_dest_src1.GetReg()); // Multiply low bits. + // x7 *= x3 NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); // xmm1 now has low bits. AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF); // Prepare high bits for multiplication. - NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE); - AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); + NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), 0x8); + AndMaskVectorRegister(rs_dest_high_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); // Multiply high bits and xmm2 now has high bits. - NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg()); + NewLIR2(kX86PmullwRR, rs_src1_high_tmp.GetReg(), rs_dest_high_tmp.GetReg()); // Combine back into dest XMM register. - NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src1_high_tmp.GetReg()); +} + +void X86Mir2Lir::GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2) { + /* + * We need to emulate the packed long multiply. + * For kMirOpPackedMultiply xmm1, xmm0: + * - xmm1 is src/dest + * - xmm0 is src + * - Get xmm2 and xmm3 as temp + * - Idea is to multiply the lower 32 of each operand with the higher 32 of the other. + * - Then add the two results. + * - Move it to the upper 32 of the destination + * - Then multiply the lower 32-bits of the operands and add the result to the destination. 
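The plan above is the usual 32x32 cross-product decomposition: with a = (a_hi << 32) | a_lo and b likewise, the low 64 bits of a*b are a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32); the a_hi*b_hi term falls entirely outside the 64-bit lane. A hedged scalar model (plain C++, independent of the SSE sequence listed next) that checks the identity:

#include <cstdint>
#include <cassert>

static uint64_t MulLongViaHalves(uint64_t a, uint64_t b) {
  uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  uint64_t cross = (a_lo * b_hi + a_hi * b_lo) << 32;  // pmuludq + paddq + psllq
  return a_lo * b_lo + cross;                          // final pmuludq + paddq
}

int main() {
  uint64_t a = 0x123456789ABCDEF0ull, b = 0x0FEDCBA987654321ull;
  assert(MulLongViaHalves(a, b) == a * b);  // equal modulo 2^64
  return 0;
}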
+ * + * (op dest src ) + * movdqa %xmm2, %xmm1 + * movdqa %xmm3, %xmm0 + * psrlq %xmm3, $0x20 + * pmuludq %xmm3, %xmm2 + * psrlq %xmm1, $0x20 + * pmuludq %xmm1, %xmm0 + * paddq %xmm1, %xmm3 + * psllq %xmm1, $0x20 + * pmuludq %xmm2, %xmm0 + * paddq %xmm1, %xmm2 + * + * When both the operands are the same, then we need to calculate the lower-32 * higher-32 + * calculation only once. Thus we don't need the xmm3 temp above. That sequence becomes: + * + * (op dest src ) + * movdqa %xmm2, %xmm1 + * psrlq %xmm1, $0x20 + * pmuludq %xmm1, %xmm0 + * paddq %xmm1, %xmm1 + * psllq %xmm1, $0x20 + * pmuludq %xmm2, %xmm0 + * paddq %xmm1, %xmm2 + * + */ + + bool both_operands_same = (rs_dest_src1.GetReg() == rs_src2.GetReg()); + + RegStorage rs_tmp_vector_1; + RegStorage rs_tmp_vector_2; + rs_tmp_vector_1 = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector_1.GetReg(), rs_dest_src1.GetReg()); + + if (both_operands_same == false) { + rs_tmp_vector_2 = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector_2.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86PsrlqRI, rs_tmp_vector_2.GetReg(), 0x20); + NewLIR2(kX86PmuludqRR, rs_tmp_vector_2.GetReg(), rs_tmp_vector_1.GetReg()); + } + + NewLIR2(kX86PsrlqRI, rs_dest_src1.GetReg(), 0x20); + NewLIR2(kX86PmuludqRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); + + if (both_operands_same == false) { + NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_2.GetReg()); + } else { + NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg()); + } + + NewLIR2(kX86PsllqRI, rs_dest_src1.GetReg(), 0x20); + NewLIR2(kX86PmuludqRR, rs_tmp_vector_1.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_1.GetReg()); } void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { @@ -1700,7 +1771,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) { break; case kSignedByte: // HW doesn't support 16x16 byte multiplication so emulate it. 
- GenMultiplyVectorSignedByte(bb, mir); + GenMultiplyVectorSignedByte(rs_dest_src1, rs_src2); + return; + case k64: + GenMultiplyVectorLong(rs_dest_src1, rs_src2); return; default: LOG(FATAL) << "Unsupported vector multiply " << opsize; @@ -1713,12 +1787,16 @@ void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { case k32: opcode = kX86PadddRR; break; + case k64: + opcode = kX86PaddqRR; + break; case kSignedHalf: case kUnsignedHalf: opcode = kX86PaddwRR; @@ -1744,12 +1822,16 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { case k32: opcode = kX86PsubdRR; break; + case k64: + opcode = kX86PsubqRR; + break; case kSignedHalf: case kUnsignedHalf: opcode = kX86PsubwRR; @@ -1772,58 +1854,54 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) { } void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) { + // Destination does not need clobbered because it has already been as part + // of the general packed shift handler (caller of this method). RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); - RegStorage rs_tmp = Get128BitRegister(AllocTempWide()); int opcode = 0; - int imm = mir->dalvikInsn.vB; - switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) { case kMirOpPackedShiftLeft: opcode = kX86PsllwRI; break; case kMirOpPackedSignedShiftRight: - opcode = kX86PsrawRI; - break; case kMirOpPackedUnsignedShiftRight: - opcode = kX86PsrlwRI; - break; + // TODO Add support for emulated byte shifts. default: LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode; break; } - /* - * xmm1 will have low bits - * xmm2 will have high bits - * - * xmm2 = xmm1 - * xmm1 = xmm1 .<< N - * xmm2 = xmm2 && 0xFF00FF00FF00FF00FF00FF00FF00FF00 - * xmm2 = xmm2 .<< N - * xmm1 = xmm1 | xmm2 - */ - - // Copy xmm1. - NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg()); + // Clear xmm register and return if shift more than byte length. + int imm = mir->dalvikInsn.vB; + if (imm >= 8) { + NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg()); + return; + } // Shift lower values. NewLIR2(opcode, rs_dest_src1.GetReg(), imm); - // Mask bottom bits. - AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); - - // Shift higher values. - NewLIR2(opcode, rs_tmp.GetReg(), imm); + /* + * The above shift will shift the whole word, but that means + * both the bytes will shift as well. To emulate a byte level + * shift, we can just throw away the lower (8 - N) bits of the + * upper byte, and we are done. + */ + uint8_t byte_mask = 0xFF << imm; + uint32_t int_mask = byte_mask; + int_mask = int_mask << 8 | byte_mask; + int_mask = int_mask << 8 | byte_mask; + int_mask = int_mask << 8 | byte_mask; - // Combine back into dest XMM register. 
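The mask built just above is applied with AndMaskVectorRegister immediately after this point. A hedged scalar model of why shifting whole 16-bit lanes and then masking yields a correct per-byte left shift (only the left-shift case is emulated here; the byte right shifts remain a TODO in this change):

#include <cstdint>
#include <cassert>

static uint16_t ShiftLeftBytesInWord(uint16_t word, int imm) {
  uint8_t byte_mask = static_cast<uint8_t>(0xFF << imm);           // bits that survive per byte
  uint16_t mask = static_cast<uint16_t>((byte_mask << 8) | byte_mask);
  return static_cast<uint16_t>((word << imm) & mask);              // psllw, then pand
}

int main() {
  // Bytes 0x80 and 0x01 packed into one word, shifted left by 1:
  // per-byte results should be 0x00 (bit shifted out) and 0x02.
  assert(ShiftLeftBytesInWord(0x8001, 1) == 0x0002);
  return 0;
}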
- NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg()); + // And the destination with the mask + AndMaskVectorRegister(rs_dest_src1, int_mask, int_mask, int_mask, int_mask); } void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { @@ -1852,6 +1930,7 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { @@ -1866,6 +1945,8 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) { case kUnsignedByte: GenShiftByteVector(bb, mir); return; + case k64: + // TODO Implement emulated shift algorithm. default: LOG(FATAL) << "Unsupported vector signed shift right " << opsize; break; @@ -1877,6 +1958,7 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { @@ -1905,6 +1987,7 @@ void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } @@ -1913,6 +1996,7 @@ void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } @@ -1921,6 +2005,7 @@ void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } @@ -1945,134 +2030,240 @@ void X86Mir2Lir::MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32 void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) { OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); - RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegLocation rl_dest = mir_graph_->GetDest(mir); - RegStorage rs_tmp; - - int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8; - int vec_unit_size = 0; - int opcode = 0; - int extr_opcode = 0; - RegLocation rl_result; + RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB); + bool is_wide = opsize == k64 || opsize == kDouble; + + // Get the location of the virtual register. Since this bytecode is overloaded + // for different types (and sizes), we need different logic for each path. + // The design of bytecode uses same VR for source and destination. 
+ RegLocation rl_src, rl_dest, rl_result; + if (is_wide) { + rl_src = mir_graph_->GetSrcWide(mir, 0); + rl_dest = mir_graph_->GetDestWide(mir); + } else { + rl_src = mir_graph_->GetSrc(mir, 0); + rl_dest = mir_graph_->GetDest(mir); + } - switch (opsize) { - case k32: - extr_opcode = kX86PextrdRRI; - opcode = kX86PhadddRR; - vec_unit_size = 4; - break; - case kSignedByte: - case kUnsignedByte: - extr_opcode = kX86PextrbRRI; - opcode = kX86PhaddwRR; - vec_unit_size = 2; - break; - case kSignedHalf: - case kUnsignedHalf: - extr_opcode = kX86PextrwRRI; - opcode = kX86PhaddwRR; - vec_unit_size = 2; - break; - case kSingle: - rl_result = EvalLoc(rl_dest, kFPReg, true); - vec_unit_size = 4; - for (int i = 0; i < 3; i++) { - NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg()); - NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39); - } - NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg()); - StoreValue(rl_dest, rl_result); + // We need a temp for byte and short values + RegStorage temp; - // For single-precision floats, we are done here - return; - default: - LOG(FATAL) << "Unsupported vector add reduce " << opsize; - break; - } + // There is a different path depending on type and size. + if (opsize == kSingle) { + // Handle float case. + // TODO Add support for fast math (not value safe) and do horizontal add in that case. - int elems = vec_bytes / vec_unit_size; + rl_src = LoadValue(rl_src, kFPReg); + rl_result = EvalLoc(rl_dest, kFPReg, true); - // Emulate horizontal add instruction by reducing 2 vectors with 8 values before adding them again - // TODO is overflow handled correctly? - if (opsize == kSignedByte || opsize == kUnsignedByte) { - rs_tmp = Get128BitRegister(AllocTempWide()); + // Since we are doing an add-reduce, we move the reg holding the VR + // into the result so we include it in result. + OpRegCopy(rl_result.reg, rl_src.reg); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); - // tmp = xmm1 .>> 8. - NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg()); - NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8); + // Since FP must keep order of operation for value safety, we shift to low + // 32-bits and add to result. + for (int i = 0; i < 3; i++) { + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + } - // Zero extend low bits in xmm1. - AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF); - } + StoreValue(rl_dest, rl_result); + } else if (opsize == kDouble) { + // Handle double case. + rl_src = LoadValueWide(rl_src, kFPReg); + rl_result = EvalLocWide(rl_dest, kFPReg, true); + LOG(FATAL) << "Unsupported vector add reduce for double."; + } else if (opsize == k64) { + /* + * Handle long case: + * 1) Reduce the vector register to lower half (with addition). + * 1-1) Get an xmm temp and fill it with vector register. + * 1-2) Shift the xmm temp by 8-bytes. + * 1-3) Add the xmm temp to vector register that is being reduced. + * 2) Allocate temp GP / GP pair. + * 2-1) In 64-bit case, use movq to move result to a 64-bit GP. + * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair. + * 3) Finish the add reduction by doing what add-long/2addr does, + * but instead of having a VR as one of the sources, we have our temp GP. 
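A hedged scalar model of steps 1) and 2) of the plan above: psrldq by 8 bytes brings the upper 64-bit lane down, paddq folds it into the lower lane, and the low lane is then moved to a GP register (movq on 64-bit, two movd on 32-bit) for the final add-long:

#include <cstdint>
#include <cassert>

static uint64_t ReduceTwoLanes(const uint64_t lanes[2]) {
  uint64_t upper = lanes[1];     // psrldq xmm_tmp, 8 exposes lane 1
  return lanes[0] + upper;       // paddq leaves the sum in lane 0
}

int main() {
  const uint64_t lanes[2] = {5, 7};
  uint64_t vr = 100;                          // the virtual register being accumulated
  assert(vr + ReduceTwoLanes(lanes) == 112);  // step 3: the ordinary add-long
  return 0;
}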
+ */ + RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg()); + NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8); + NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg()); + FreeTemp(rs_tmp_vector); + + // We would like to be able to reuse the add-long implementation, so set up a fake + // register location to pass it. + RegLocation temp_loc = mir_graph_->GetBadLoc(); + temp_loc.core = 1; + temp_loc.wide = 1; + temp_loc.location = kLocPhysReg; + temp_loc.reg = AllocTempWide(); + + if (cu_->target64) { + DCHECK(!temp_loc.reg.IsPair()); + NewLIR2(kX86MovqrxRR, temp_loc.reg.GetReg(), vector_src.GetReg()); + } else { + NewLIR2(kX86MovdrxRR, temp_loc.reg.GetLowReg(), vector_src.GetReg()); + NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20); + NewLIR2(kX86MovdrxRR, temp_loc.reg.GetHighReg(), vector_src.GetReg()); + } - while (elems > 1) { - if (opsize == kSignedByte || opsize == kUnsignedByte) { - NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg()); + GenArithOpLong(Instruction::ADD_LONG_2ADDR, rl_dest, temp_loc, temp_loc); + } else if (opsize == kSignedByte || opsize == kUnsignedByte) { + RegStorage rs_tmp = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86PxorRR, rs_tmp.GetReg(), rs_tmp.GetReg()); + NewLIR2(kX86PsadbwRR, vector_src.GetReg(), rs_tmp.GetReg()); + NewLIR3(kX86PshufdRRI, rs_tmp.GetReg(), vector_src.GetReg(), 0x4e); + NewLIR2(kX86PaddbRR, vector_src.GetReg(), rs_tmp.GetReg()); + // Move to a GPR + temp = AllocTemp(); + NewLIR2(kX86MovdrxRR, temp.GetReg(), vector_src.GetReg()); + } else { + // Handle and the int and short cases together + + // Initialize as if we were handling int case. Below we update + // the opcode if handling byte or short. + int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8; + int vec_unit_size; + int horizontal_add_opcode; + int extract_opcode; + + if (opsize == kSignedHalf || opsize == kUnsignedHalf) { + extract_opcode = kX86PextrwRRI; + horizontal_add_opcode = kX86PhaddwRR; + vec_unit_size = 2; + } else if (opsize == k32) { + vec_unit_size = 4; + horizontal_add_opcode = kX86PhadddRR; + extract_opcode = kX86PextrdRRI; + } else { + LOG(FATAL) << "Unsupported vector add reduce " << opsize; + return; } - NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg()); - elems >>= 1; - } - // Combine the results if we separated them. - if (opsize == kSignedByte || opsize == kUnsignedByte) { - NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg()); - } + int elems = vec_bytes / vec_unit_size; - // We need to extract to a GPR. - RegStorage temp = AllocTemp(); - NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0); + while (elems > 1) { + NewLIR2(horizontal_add_opcode, vector_src.GetReg(), vector_src.GetReg()); + elems >>= 1; + } - // Can we do this directly into memory? - rl_result = UpdateLocTyped(rl_dest, kCoreReg); - if (rl_result.location == kLocPhysReg) { - // Ensure res is in a core reg - rl_result = EvalLoc(rl_dest, kCoreReg, true); - OpRegReg(kOpAdd, rl_result.reg, temp); - StoreFinalValue(rl_dest, rl_result); - } else { - OpMemReg(kOpAdd, rl_result, temp.GetReg()); - } + // Handle this as arithmetic unary case. + ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - FreeTemp(temp); + // Extract to a GP register because this is integral typed. 
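The extraction into a GP register follows right after this note. The phaddw/phaddd loop that precedes it can be read as repeated pairwise folding: each pass adds adjacent lanes, so after log2(elems) passes the total sits in lane 0. A rough scalar model (it ignores the duplicated upper half that phadd with identical operands also produces):

#include <vector>
#include <cassert>

static int HorizontalAdd(std::vector<int> lanes) {
  while (lanes.size() > 1) {
    std::vector<int> next;
    for (size_t i = 0; i + 1 < lanes.size(); i += 2) {
      next.push_back(lanes[i] + lanes[i + 1]);   // one phadd pass
    }
    lanes = next;
  }
  return lanes[0];
}

int main() {
  assert(HorizontalAdd({1, 2, 3, 4}) == 10);
  return 0;
}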
+ temp = AllocTemp(); + NewLIR3(extract_opcode, temp.GetReg(), vector_src.GetReg(), 0); + } + + if (opsize != k64 && opsize != kSingle && opsize != kDouble) { + // The logic below looks very similar to the handling of ADD_INT_2ADDR + // except the rhs is not a VR but a physical register allocated above. + // No load of source VR is done because it assumes that rl_result will + // share physical register / memory location. + rl_result = UpdateLocTyped(rl_dest, kCoreReg); + if (rl_result.location == kLocPhysReg) { + // Ensure res is in a core reg. + rl_result = EvalLoc(rl_dest, kCoreReg, true); + OpRegReg(kOpAdd, rl_result.reg, temp); + StoreFinalValue(rl_dest, rl_result); + } else { + // Do the addition directly to memory. + OpMemReg(kOpAdd, rl_result, temp.GetReg()); + } + } } void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) { OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegLocation rl_dest = mir_graph_->GetDest(mir); - RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); + RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB); int extract_index = mir->dalvikInsn.arg[0]; int extr_opcode = 0; RegLocation rl_result; bool is_wide = false; - switch (opsize) { - case k32: - rl_result = UpdateLocTyped(rl_dest, kCoreReg); - extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI; - break; - case kSignedHalf: - case kUnsignedHalf: - rl_result= UpdateLocTyped(rl_dest, kCoreReg); - extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI; - break; - default: - LOG(FATAL) << "Unsupported vector add reduce " << opsize; - return; - break; - } + // There is a different path depending on type and size. + if (opsize == kSingle) { + // Handle float case. + // TODO Add support for fast math (not value safe) and do horizontal add in that case. - if (rl_result.location == kLocPhysReg) { - NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index); - if (is_wide == true) { - StoreFinalValue(rl_dest, rl_result); + rl_result = EvalLoc(rl_dest, kFPReg, true); + NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg()); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + + // Since FP must keep order of operation for value safety, we shift to low + // 32-bits and add to result. + for (int i = 0; i < 3; i++) { + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + } + + StoreValue(rl_dest, rl_result); + } else if (opsize == kDouble) { + // TODO Handle double case. + LOG(FATAL) << "Unsupported add reduce for double."; + } else if (opsize == k64) { + /* + * Handle long case: + * 1) Reduce the vector register to lower half (with addition). + * 1-1) Get an xmm temp and fill it with vector register. + * 1-2) Shift the xmm temp by 8-bytes. + * 1-3) Add the xmm temp to vector register that is being reduced. + * 2) Evaluate destination to a GP / GP pair. + * 2-1) In 64-bit case, use movq to move result to a 64-bit GP. + * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair. + * 3) Store the result to the final destination. 
+ */ + RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg()); + NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8); + NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg()); + FreeTemp(rs_tmp_vector); + + rl_result = EvalLocWide(rl_dest, kCoreReg, true); + if (cu_->target64) { + DCHECK(!rl_result.reg.IsPair()); + NewLIR2(kX86MovqrxRR, rl_result.reg.GetReg(), vector_src.GetReg()); } else { - StoreFinalValueWide(rl_dest, rl_result); + NewLIR2(kX86MovdrxRR, rl_result.reg.GetLowReg(), vector_src.GetReg()); + NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20); + NewLIR2(kX86MovdrxRR, rl_result.reg.GetHighReg(), vector_src.GetReg()); } + + StoreValueWide(rl_dest, rl_result); } else { - int displacement = SRegOffset(rl_result.s_reg_low); - LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg()); - AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */); - AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */); + // Handle the rest of integral types now. + switch (opsize) { + case k32: + rl_result = UpdateLocTyped(rl_dest, kCoreReg); + extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI; + break; + case kSignedHalf: + case kUnsignedHalf: + rl_result= UpdateLocTyped(rl_dest, kCoreReg); + extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI; + break; + default: + LOG(FATAL) << "Unsupported vector reduce " << opsize; + return; + } + + if (rl_result.location == kLocPhysReg) { + NewLIR3(extr_opcode, rl_result.reg.GetReg(), vector_src.GetReg(), extract_index); + if (is_wide == true) { + StoreFinalValue(rl_dest, rl_result); + } else { + StoreFinalValueWide(rl_dest, rl_result); + } + } else { + int displacement = SRegOffset(rl_result.s_reg_low); + LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, vector_src.GetReg()); + AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */); + AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */); + } } } @@ -2080,96 +2271,113 @@ void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); - int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR; + Clobber(rs_dest); + int op_shuffle = 0, op_shuffle_high = 0, op_mov = kX86MovdxrRR; RegisterClass reg_type = kCoreReg; + bool is_wide = false; switch (opsize) { case k32: - op_low = kX86PshufdRRI; + op_shuffle = kX86PshufdRRI; break; case kSingle: - op_low = kX86PshufdRRI; - op_mov = kX86Mova128RR; + op_shuffle = kX86PshufdRRI; + op_mov = kX86MovdqaRR; reg_type = kFPReg; break; case k64: - op_low = kX86PshufdRRI; - imm = 0x44; - break; - case kDouble: - op_low = kX86PshufdRRI; - op_mov = kX86Mova128RR; - reg_type = kFPReg; - imm = 0x44; + op_shuffle = kX86PunpcklqdqRR; + op_mov = kX86MovqrxRR; + is_wide = true; break; case kSignedByte: case kUnsignedByte: - // Shuffle 8 bit value into 16 bit word. - // We set val = val + (val << 8) below and use 16 bit shuffle. + // We will have the source loaded up in a + // double-word before we use this shuffle + op_shuffle = kX86PshufdRRI; + break; case kSignedHalf: case kUnsignedHalf: // Handles low quadword. - op_low = kX86PshuflwRRI; + op_shuffle = kX86PshuflwRRI; // Handles upper quadword. 
- op_high = kX86PshufdRRI; + op_shuffle_high = kX86PshufdRRI; break; default: LOG(FATAL) << "Unsupported vector set " << opsize; break; } - RegLocation rl_src = mir_graph_->GetSrc(mir, 0); - - // Load the value from the VR into the reg. - if (rl_src.wide == 0) { + // Load the value from the VR into a physical register. + RegLocation rl_src; + if (!is_wide) { + rl_src = mir_graph_->GetSrc(mir, 0); rl_src = LoadValue(rl_src, reg_type); } else { + rl_src = mir_graph_->GetSrcWide(mir, 0); rl_src = LoadValueWide(rl_src, reg_type); } + RegStorage reg_to_shuffle = rl_src.reg; - // If opsize is 8 bits wide then double value and use 16 bit shuffle instead. - if (opsize == kSignedByte || opsize == kUnsignedByte) { - RegStorage temp = AllocTemp(); - // val = val + (val << 8). - NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg()); - NewLIR2(kX86Sal32RI, temp.GetReg(), 8); - NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg()); - FreeTemp(temp); + // Load the value into the XMM register. + if (!cu_->target64 && opsize == k64) { + // Logic assumes that longs are loaded in GP register pairs. + NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), reg_to_shuffle.GetLowReg()); + RegStorage r_tmp = AllocTempDouble(); + NewLIR2(kX86MovdxrRR, r_tmp.GetReg(), reg_to_shuffle.GetHighReg()); + NewLIR2(kX86PunpckldqRR, rs_dest.GetReg(), r_tmp.GetReg()); + FreeTemp(r_tmp); + } else { + NewLIR2(op_mov, rs_dest.GetReg(), reg_to_shuffle.GetReg()); } - // Load the value into the XMM register. - NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg()); + if (opsize == kSignedByte || opsize == kUnsignedByte) { + // In the byte case, first duplicate it to be a word + // Then duplicate it to be a double-word + NewLIR2(kX86PunpcklbwRR, rs_dest.GetReg(), rs_dest.GetReg()); + NewLIR2(kX86PunpcklwdRR, rs_dest.GetReg(), rs_dest.GetReg()); + } // Now shuffle the value across the destination. - NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm); + if (op_shuffle == kX86PunpcklqdqRR) { + NewLIR2(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg()); + } else { + NewLIR3(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg(), 0); + } // And then repeat as needed. 
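A hedged model of the byte broadcast above: punpcklbw against itself doubles the byte into a word, punpcklwd doubles that word into a double-word, and pshufd with immediate 0 then copies the double-word into every 32-bit lane of the destination:

#include <cstdint>
#include <cassert>

static uint32_t BroadcastByteToDword(uint8_t value) {
  uint16_t w  = static_cast<uint16_t>((value << 8) | value);   // punpcklbw xmm, xmm
  uint32_t dw = (static_cast<uint32_t>(w) << 16) | w;          // punpcklwd xmm, xmm
  return dw;  // pshufd xmm, xmm, 0 replicates this into all four lanes
}

int main() {
  assert(BroadcastByteToDword(0xAB) == 0xABABABABu);
  return 0;
}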
- if (op_high != 0) { - NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm); + if (op_shuffle_high != 0) { + NewLIR3(op_shuffle_high, rs_dest.GetReg(), rs_dest.GetReg(), 0); } } -LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) { - int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg); +void X86Mir2Lir::GenPackedArrayGet(BasicBlock *bb, MIR *mir) { + UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayGet not supported."; +} + +void X86Mir2Lir::GenPackedArrayPut(BasicBlock *bb, MIR *mir) { + UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayPut not supported."; +} + +LIR* X86Mir2Lir::ScanVectorLiteral(int32_t* constants) { for (LIR *p = const_vectors_; p != nullptr; p = p->next) { - if (args[0] == p->operands[0] && args[1] == p->operands[1] && - args[2] == p->operands[2] && args[3] == p->operands[3]) { + if (constants[0] == p->operands[0] && constants[1] == p->operands[1] && + constants[2] == p->operands[2] && constants[3] == p->operands[3]) { return p; } } return nullptr; } -LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) { +LIR* X86Mir2Lir::AddVectorLiteral(int32_t* constants) { LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocData)); - int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg); - new_value->operands[0] = args[0]; - new_value->operands[1] = args[1]; - new_value->operands[2] = args[2]; - new_value->operands[3] = args[3]; + new_value->operands[0] = constants[0]; + new_value->operands[1] = constants[1]; + new_value->operands[2] = constants[2]; + new_value->operands[3] = constants[3]; new_value->next = const_vectors_; if (const_vectors_ == nullptr) { - estimated_native_code_size_ += 12; // Amount needed to align to 16 byte boundary. + estimated_native_code_size_ += 12; // Maximum needed to align to 16 byte boundary. } estimated_native_code_size_ += 16; // Space for one vector. const_vectors_ = new_value; diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc index 4f65a0f..30384ec 100644 --- a/compiler/dex/quick/x86/utility_x86.cc +++ b/compiler/dex/quick/x86/utility_x86.cc @@ -990,6 +990,17 @@ void X86Mir2Lir::AnalyzeExtendedMIR(int opcode, BasicBlock * bb, MIR *mir) { case kMirOpConstVector: store_method_addr_ = true; break; + case kMirOpPackedMultiply: + case kMirOpPackedShiftLeft: + case kMirOpPackedSignedShiftRight: + case kMirOpPackedUnsignedShiftRight: { + // Byte emulation requires constants from the literal pool. + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + if (opsize == kSignedByte || opsize == kUnsignedByte) { + store_method_addr_ = true; + } + break; + } default: // Ignore the rest. 
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 4f65a0f..30384ec 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -990,6 +990,17 @@ void X86Mir2Lir::AnalyzeExtendedMIR(int opcode, BasicBlock * bb, MIR *mir) {
     case kMirOpConstVector:
       store_method_addr_ = true;
       break;
+    case kMirOpPackedMultiply:
+    case kMirOpPackedShiftLeft:
+    case kMirOpPackedSignedShiftRight:
+    case kMirOpPackedUnsignedShiftRight: {
+      // Byte emulation requires constants from the literal pool.
+      OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+      if (opsize == kSignedByte || opsize == kUnsignedByte) {
+        store_method_addr_ = true;
+      }
+      break;
+    }
     default:
       // Ignore the rest.
       break;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index e3ef8c1..22a2f30 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -555,20 +555,27 @@ enum X86OpCode {
   Binary0fOpCode(kX86Subss),    // float subtract
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
-  Binary0fOpCode(kX86Punpckldq),  // Interleave low-order double words
+  Binary0fOpCode(kX86Punpcklbw),   // Interleave low-order bytes
+  Binary0fOpCode(kX86Punpcklwd),   // Interleave low-order single words (16-bit)
+  Binary0fOpCode(kX86Punpckldq),   // Interleave low-order double words (32-bit)
+  Binary0fOpCode(kX86Punpcklqdq),  // Interleave low-order quad word
   Binary0fOpCode(kX86Sqrtsd),   // square root
   Binary0fOpCode(kX86Pmulld),   // parallel integer multiply 32 bits x 4
   Binary0fOpCode(kX86Pmullw),   // parallel integer multiply 16 bits x 8
+  Binary0fOpCode(kX86Pmuludq),  // parallel unsigned multiply of 32-bit integers, storing 64-bit results
   Binary0fOpCode(kX86Mulps),    // parallel FP multiply 32 bits x 4
   Binary0fOpCode(kX86Mulpd),    // parallel FP multiply 64 bits x 2
   Binary0fOpCode(kX86Paddb),    // parallel integer addition 8 bits x 16
   Binary0fOpCode(kX86Paddw),    // parallel integer addition 16 bits x 8
   Binary0fOpCode(kX86Paddd),    // parallel integer addition 32 bits x 4
+  Binary0fOpCode(kX86Paddq),    // parallel integer addition 64 bits x 2
+  Binary0fOpCode(kX86Psadbw),   // computes sum of absolute differences for unsigned byte integers
   Binary0fOpCode(kX86Addps),    // parallel FP addition 32 bits x 4
   Binary0fOpCode(kX86Addpd),    // parallel FP addition 64 bits x 2
   Binary0fOpCode(kX86Psubb),    // parallel integer subtraction 8 bits x 16
   Binary0fOpCode(kX86Psubw),    // parallel integer subtraction 16 bits x 8
   Binary0fOpCode(kX86Psubd),    // parallel integer subtraction 32 bits x 4
+  Binary0fOpCode(kX86Psubq),    // parallel integer subtraction 64 bits x 2
   Binary0fOpCode(kX86Subps),    // parallel FP subtraction 32 bits x 4
   Binary0fOpCode(kX86Subpd),    // parallel FP subtraction 64 bits x 2
   Binary0fOpCode(kX86Pand),     // parallel AND 128 bits x 1
@@ -593,6 +600,7 @@ enum X86OpCode {
   kX86PsrlwRI,                  // logical right shift of floating point registers 16 bits x 8
   kX86PsrldRI,                  // logical right shift of floating point registers 32 bits x 4
   kX86PsrlqRI,                  // logical right shift of floating point registers 64 bits x 2
+  kX86PsrldqRI,                 // logical right shift of 128-bit vector register, immediate count in bytes
   kX86PsllwRI,                  // left shift of floating point registers 16 bits x 8
   kX86PslldRI,                  // left shift of floating point registers 32 bits x 4
   kX86PsllqRI,                  // left shift of floating point registers 64 bits x 2
@@ -607,8 +615,8 @@ enum X86OpCode {
   kX86Fprem,                    // remainder from dividing of two floating point values
   kX86Fucompp,                  // compare floating point values and pop x87 fp stack twice
   kX86Fstsw16R,                 // store FPU status word
-  Binary0fOpCode(kX86Mova128),  // move 128 bits aligned
-  kX86Mova128MR, kX86Mova128AR,  // store 128 bit aligned from xmm1 to m128
+  Binary0fOpCode(kX86Movdqa),   // move 128 bits aligned
+  kX86MovdqaMR, kX86MovdqaAR,   // store 128 bit aligned from xmm1 to m128
   Binary0fOpCode(kX86Movups),   // load unaligned packed single FP values from xmm2/m128 to xmm1
   kX86MovupsMR, kX86MovupsAR,   // store unaligned packed single FP values from xmm1 to m128
   Binary0fOpCode(kX86Movaps),   // load aligned packed single FP values from xmm2/m128 to xmm1
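Note on the AnalyzeExtendedMIR hunk above: packed multiplies and shifts on byte operands are flagged because SSE2 has no byte-granular shift or multiply, so the backend must fall back to wider operations combined with 128-bit constants, and those constants come from the method's literal pool (hence store_method_addr_). Below is a rough sketch of one standard emulation, assuming the shift-then-mask approach; ShiftBytesLeft is a hypothetical helper written with intrinsics purely to illustrate why a pool constant is needed, and is not the backend's actual code.

#include <emmintrin.h>  // SSE2

// Left-shift each of the 16 bytes by a compile-time count. SSE2 only offers
// psllw (16-bit shifts), so shift the words and then mask off the bits that
// were carried across byte boundaries. In the JIT that mask is a 128-bit
// literal-pool constant rather than a runtime _mm_set1_epi8.
template <int kShift>
static __m128i ShiftBytesLeft(__m128i v) {
  static_assert(kShift >= 0 && kShift < 8, "byte shift count must be 0..7");
  __m128i words = _mm_slli_epi16(v, kShift);                        // psllw
  __m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << kShift));  // pool constant
  return _mm_and_si128(words, mask);                                // pand
}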
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 7551add..1848abe 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -558,14 +558,19 @@ DISASSEMBLER_ENTRY(cmp,
       has_modrm = true;
       src_reg_file = dst_reg_file = SSE;
       break;
-    case 0x62:
+    case 0x60: case 0x61: case 0x62: case 0x6C:
       if (prefix[2] == 0x66) {
         src_reg_file = dst_reg_file = SSE;
         prefix[2] = 0;  // Clear prefix now. It has served its purpose as part of the opcode.
       } else {
         src_reg_file = dst_reg_file = MMX;
       }
-      opcode << "punpckldq";
+      switch (*instr) {
+        case 0x60: opcode << "punpcklbw"; break;
+        case 0x61: opcode << "punpcklwd"; break;
+        case 0x62: opcode << "punpckldq"; break;
+        case 0x6c: opcode << "punpcklqdq"; break;
+      }
       load = true;
       has_modrm = true;
       break;
@@ -650,7 +655,7 @@ DISASSEMBLER_ENTRY(cmp,
       } else {
         dst_reg_file = MMX;
       }
-      static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "unknown-73", "unknown-73", "unknown-73", "psllq", "unknown-73"};
+      static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "psrldq", "unknown-73", "unknown-73", "psllq", "unknown-73"};
       modrm_opcodes = x73_opcodes;
       reg_is_opcode = true;
       has_modrm = true;
@@ -800,6 +805,18 @@ DISASSEMBLER_ENTRY(cmp,
       opcode << "bswap";
       reg_in_opcode = true;
       break;
+    case 0xD4:
+      if (prefix[2] == 0x66) {
+        src_reg_file = dst_reg_file = SSE;
+        prefix[2] = 0;
+      } else {
+        src_reg_file = dst_reg_file = MMX;
+      }
+      opcode << "paddq";
+      prefix[2] = 0;
+      has_modrm = true;
+      load = true;
+      break;
     case 0xDB:
       if (prefix[2] == 0x66) {
         src_reg_file = dst_reg_file = SSE;
@@ -847,66 +864,14 @@ DISASSEMBLER_ENTRY(cmp,
       has_modrm = true;
       load = true;
       break;
+    case 0xF4:
+    case 0xF6:
     case 0xF8:
-      if (prefix[2] == 0x66) {
-        src_reg_file = dst_reg_file = SSE;
-        prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-      } else {
-        src_reg_file = dst_reg_file = MMX;
-      }
-      opcode << "psubb";
-      prefix[2] = 0;
-      has_modrm = true;
-      load = true;
-      break;
     case 0xF9:
-      if (prefix[2] == 0x66) {
-        src_reg_file = dst_reg_file = SSE;
-        prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-      } else {
-        src_reg_file = dst_reg_file = MMX;
-      }
-      opcode << "psubw";
-      prefix[2] = 0;
-      has_modrm = true;
-      load = true;
-      break;
     case 0xFA:
-      if (prefix[2] == 0x66) {
-        src_reg_file = dst_reg_file = SSE;
-        prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-      } else {
-        src_reg_file = dst_reg_file = MMX;
-      }
-      opcode << "psubd";
-      prefix[2] = 0;
-      has_modrm = true;
-      load = true;
-      break;
+    case 0xFB:
     case 0xFC:
-      if (prefix[2] == 0x66) {
-        src_reg_file = dst_reg_file = SSE;
-        prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-      } else {
-        src_reg_file = dst_reg_file = MMX;
-      }
-      opcode << "paddb";
-      prefix[2] = 0;
-      has_modrm = true;
-      load = true;
-      break;
     case 0xFD:
-      if (prefix[2] == 0x66) {
-        src_reg_file = dst_reg_file = SSE;
-        prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-      } else {
-        src_reg_file = dst_reg_file = MMX;
-      }
-      opcode << "paddw";
-      prefix[2] = 0;
-      has_modrm = true;
-      load = true;
-      break;
     case 0xFE:
       if (prefix[2] == 0x66) {
         src_reg_file = dst_reg_file = SSE;
@@ -914,7 +879,17 @@ DISASSEMBLER_ENTRY(cmp,
       } else {
         src_reg_file = dst_reg_file = MMX;
       }
-      opcode << "paddd";
+      switch (*instr) {
+        case 0xF4: opcode << "pmuludq"; break;
+        case 0xF6: opcode << "psadbw"; break;
+        case 0xF8: opcode << "psubb"; break;
+        case 0xF9: opcode << "psubw"; break;
+        case 0xFA: opcode << "psubd"; break;
+        case 0xFB: opcode << "psubq"; break;
+        case 0xFC: opcode << "paddb"; break;
"paddw"; break; + case 0xFE: opcode << "paddd"; break; + } prefix[2] = 0; has_modrm = true; load = true; |