-rw-r--r--   compiler/dex/backend.h                     7
-rw-r--r--   compiler/dex/compiler_enums.h             33
-rw-r--r--   compiler/dex/mir_analysis.cc              48
-rw-r--r--   compiler/dex/mir_dataflow.cc             124
-rw-r--r--   compiler/dex/mir_graph.cc                 14
-rw-r--r--   compiler/dex/quick/x86/assemble_x86.cc    32
-rw-r--r--   compiler/dex/quick/x86/codegen_x86.h      48
-rwxr-xr-x   compiler/dex/quick/x86/target_x86.cc     658
-rw-r--r--   compiler/dex/quick/x86/utility_x86.cc     11
-rw-r--r--   compiler/dex/quick/x86/x86_lir.h          14
-rw-r--r--   disassembler/disassembler_x86.cc          93
11 files changed, 710 insertions, 372 deletions
diff --git a/compiler/dex/backend.h b/compiler/dex/backend.h
index 1f24849..cab3427 100644
--- a/compiler/dex/backend.h
+++ b/compiler/dex/backend.h
@@ -38,14 +38,15 @@ class Backend {
/*
* Return the number of reservable vector registers supported
- * @param fp_used ‘true’ if floating point computations will be
- * executed while vector registers are reserved.
+ * @param long_or_fp ‘true’ if floating point computations will be
+ * executed, or the operations will be of long type, while vector
+ * registers are reserved.
* @return the number of vector registers that are available
* @note The backend should ensure that sufficient vector registers
* are held back to generate scalar code without exhausting vector
* registers, if scalar code also uses the vector registers.
*/
- virtual int NumReservableVectorRegisters(bool fp_used) { return 0; }
+ virtual int NumReservableVectorRegisters(bool long_or_fp) { return 0; }
protected:
explicit Backend(ArenaAllocator* arena) : arena_(arena) {}
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 9c2a8ba..e4003bf 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -256,13 +256,16 @@ enum ExtendedMIROpcode {
// vC: TypeSize
kMirOpPackedSet,
- // @brief Reserve N vector registers (named 0..N-1)
- // vA: Number of registers
+ // @brief Reserve a range of vector registers.
+ // vA: Start vector register to reserve.
+ // vB: Inclusive end vector register to reserve.
// @note: The backend may choose to map vector numbers used in vector opcodes.
// Reserved registers are removed from the list of backend temporary pool.
kMirOpReserveVectorRegisters,
- // @brief Free Reserved vector registers
+ // @brief Free a range of reserved vector registers
+ // vA: Start vector register to unreserve.
+ // vB: Inclusive end vector register to unreserve.
// @note: All currently reserved vector registers are returned to the temporary pool.
kMirOpReturnVectorRegisters,
@@ -270,6 +273,30 @@ enum ExtendedMIROpcode {
// vA: a constant defined by enum MemBarrierKind.
kMirOpMemBarrier,
+ // @brief Used to fill a vector register with array values.
+ // @details Just as with normal arrays, an access through a null object register must throw
+ // NullPointerException and an invalid index must throw ArrayIndexOutOfBoundsException. Exception
+ // behavior must match that of the aget it replaced and must occur at the same index. Therefore, it is
+ // generally recommended that, before this MIR is used, the exceptions are proven impossible to throw
+ // and the MIR is marked with MIR_IGNORE_NULL_CHECK and MIR_IGNORE_RANGE_CHECK.
+ // vA: destination vector register
+ // vB: array register
+ // vC: index register
+ // arg[0]: TypeSize (most other vector opcodes have this in vC)
+ kMirOpPackedArrayGet,
+
+ // @brief Used to store a vector register into an array.
+ // @details Just as with normal arrays, an access through a null object register must throw
+ // NullPointerException and an invalid index must throw ArrayIndexOutOfBoundsException. Exception
+ // behavior must match that of the aput it replaced and must occur at the same index. Therefore, it is
+ // generally recommended that, before this MIR is used, the exceptions are proven impossible to throw
+ // and the MIR is marked with MIR_IGNORE_NULL_CHECK and MIR_IGNORE_RANGE_CHECK.
+ // vA: source vector register
+ // vB: array register
+ // vC: index register
+ // arg[0]: TypeSize (most other vector opcodes have this in vC)
+ kMirOpPackedArrayPut,
+
kMirOpLast,
};
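
The new packed-array opcodes carry their TypeSize in arg[0], while most other vector opcodes keep it in vC; elsewhere in this change it is read as the element OpSize in the upper halfword and the vector width in bits in the lower halfword (the vC >> 16 and vC & 0xFFFF accesses). Below is a small stand-alone C++ sketch of that assumed packing, not part of the patch; kOpSizeK64 is only an illustrative stand-in for the real enum constant.

#include <cstdint>
#include <cassert>

// Assumed TypeSize packing: element OpSize in the upper 16 bits, vector width
// in bits in the lower 16 bits.
constexpr uint32_t PackTypeSize(uint32_t op_size, uint32_t vector_bits) {
  return (op_size << 16) | (vector_bits & 0xFFFF);
}

int main() {
  const uint32_t kOpSizeK64 = 3;                       // illustrative stand-in, not the real k64 value
  uint32_t type_size = PackTypeSize(kOpSizeK64, 128);  // as carried in vC, or in arg[0] for the array ops
  assert((type_size >> 16) == kOpSizeK64);             // element type; k64/kDouble means a wide value
  assert((type_size & 0xFFFF) == 128u);                // 128-bit vector, matching the DCHECKs below
  return 0;
}
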
diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc
index a8af92c..b265ee7 100644
--- a/compiler/dex/mir_analysis.cc
+++ b/compiler/dex/mir_analysis.cc
@@ -830,68 +830,74 @@ const uint32_t MIRGraph::analysis_attributes_[kMirOpLast] = {
// 109 MIR_RANGE_CHECK
AN_NONE,
- // 110 MIR_DIV_ZERO_CHECK
+ // 10A MIR_DIV_ZERO_CHECK
AN_NONE,
- // 111 MIR_CHECK
+ // 10B MIR_CHECK
AN_NONE,
- // 112 MIR_CHECKPART2
+ // 10C MIR_CHECKPART2
AN_NONE,
- // 113 MIR_SELECT
+ // 10D MIR_SELECT
AN_NONE,
- // 114 MirOpConstVector
+ // 10E MirOpConstVector
AN_NONE,
- // 115 MirOpMoveVector
+ // 10F MirOpMoveVector
AN_NONE,
- // 116 MirOpPackedMultiply
+ // 110 MirOpPackedMultiply
AN_NONE,
- // 117 MirOpPackedAddition
+ // 111 MirOpPackedAddition
AN_NONE,
- // 118 MirOpPackedSubtract
+ // 112 MirOpPackedSubtract
AN_NONE,
- // 119 MirOpPackedShiftLeft
+ // 113 MirOpPackedShiftLeft
AN_NONE,
- // 120 MirOpPackedSignedShiftRight
+ // 114 MirOpPackedSignedShiftRight
AN_NONE,
- // 121 MirOpPackedUnsignedShiftRight
+ // 115 MirOpPackedUnsignedShiftRight
AN_NONE,
- // 122 MirOpPackedAnd
+ // 116 MirOpPackedAnd
AN_NONE,
- // 123 MirOpPackedOr
+ // 117 MirOpPackedOr
AN_NONE,
- // 124 MirOpPackedXor
+ // 118 MirOpPackedXor
AN_NONE,
- // 125 MirOpPackedAddReduce
+ // 119 MirOpPackedAddReduce
AN_NONE,
- // 126 MirOpPackedReduce
+ // 11A MirOpPackedReduce
AN_NONE,
- // 127 MirOpPackedSet
+ // 11B MirOpPackedSet
AN_NONE,
- // 128 MirOpReserveVectorRegisters
+ // 11C MirOpReserveVectorRegisters
AN_NONE,
- // 129 MirOpReturnVectorRegisters
+ // 11D MirOpReturnVectorRegisters
AN_NONE,
- // 130 MirOpMemBarrier
+ // 11E MirOpMemBarrier
AN_NONE,
+
+ // 11F MirOpPackedArrayGet
+ AN_ARRAYOP,
+
+ // 120 MirOpPackedArrayPut
+ AN_ARRAYOP,
};
struct MethodStats {
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 4c906b0..d9531fb 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -829,68 +829,74 @@ const uint64_t MIRGraph::oat_data_flow_attributes_[kMirOpLast] = {
// 109 MIR_RANGE_CHECK
0,
- // 110 MIR_DIV_ZERO_CHECK
+ // 10A MIR_DIV_ZERO_CHECK
0,
- // 111 MIR_CHECK
+ // 10B MIR_CHECK
0,
- // 112 MIR_CHECKPART2
+ // 10C MIR_CHECKPART2
0,
- // 113 MIR_SELECT
+ // 10D MIR_SELECT
DF_DA | DF_UB,
- // 114 MirOpConstVector
- DF_DA,
+ // 10E MirOpConstVector
+ 0,
- // 115 MirOpMoveVector
+ // 10F MirOpMoveVector
0,
- // 116 MirOpPackedMultiply
+ // 110 MirOpPackedMultiply
0,
- // 117 MirOpPackedAddition
+ // 111 MirOpPackedAddition
0,
- // 118 MirOpPackedSubtract
+ // 112 MirOpPackedSubtract
0,
- // 119 MirOpPackedShiftLeft
+ // 113 MirOpPackedShiftLeft
0,
- // 120 MirOpPackedSignedShiftRight
+ // 114 MirOpPackedSignedShiftRight
0,
- // 121 MirOpPackedUnsignedShiftRight
+ // 115 MirOpPackedUnsignedShiftRight
0,
- // 122 MirOpPackedAnd
+ // 116 MirOpPackedAnd
0,
- // 123 MirOpPackedOr
+ // 117 MirOpPackedOr
0,
- // 124 MirOpPackedXor
+ // 118 MirOpPackedXor
0,
- // 125 MirOpPackedAddReduce
- DF_DA | DF_UA,
+ // 119 MirOpPackedAddReduce
+ DF_FORMAT_EXTENDED,
- // 126 MirOpPackedReduce
- DF_DA,
+ // 11A MirOpPackedReduce
+ DF_FORMAT_EXTENDED,
- // 127 MirOpPackedSet
- DF_UB,
+ // 11B MirOpPackedSet
+ DF_FORMAT_EXTENDED,
- // 128 MirOpReserveVectorRegisters
+ // 11C MirOpReserveVectorRegisters
0,
- // 129 MirOpReturnVectorRegisters
+ // 11D MirOpReturnVectorRegisters
0,
- // 130 MirOpMemBarrier
+ // 11E MirOpMemBarrier
0,
+
+ // 11F MirOpPackedArrayGet
+ DF_UB | DF_UC | DF_NULL_CHK_0 | DF_RANGE_CHK_1 | DF_REF_B | DF_CORE_C | DF_LVN,
+
+ // 120 MirOpPackedArrayPut
+ DF_UB | DF_UC | DF_NULL_CHK_0 | DF_RANGE_CHK_1 | DF_REF_B | DF_CORE_C | DF_LVN,
};
/* Return the base virtual register for a SSA name */
@@ -915,7 +921,36 @@ void MIRGraph::HandleDef(ArenaBitVector* def_v, int dalvik_reg_id) {
void MIRGraph::HandleExtended(ArenaBitVector* use_v, ArenaBitVector* def_v,
ArenaBitVector* live_in_v,
const MIR::DecodedInstruction& d_insn) {
+ // For vector MIRs, vC contains type information
+ bool is_vector_type_wide = false;
+ int type_size = d_insn.vC >> 16;
+ if (type_size == k64 || type_size == kDouble) {
+ is_vector_type_wide = true;
+ }
+
switch (static_cast<int>(d_insn.opcode)) {
+ case kMirOpPackedAddReduce:
+ HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vA);
+ if (is_vector_type_wide == true) {
+ HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vA + 1);
+ }
+ HandleDef(def_v, d_insn.vA);
+ if (is_vector_type_wide == true) {
+ HandleDef(def_v, d_insn.vA + 1);
+ }
+ break;
+ case kMirOpPackedReduce:
+ HandleDef(def_v, d_insn.vA);
+ if (is_vector_type_wide == true) {
+ HandleDef(def_v, d_insn.vA + 1);
+ }
+ break;
+ case kMirOpPackedSet:
+ HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vB);
+ if (is_vector_type_wide == true) {
+ HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vB + 1);
+ }
+ break;
default:
LOG(ERROR) << "Unexpected Extended Opcode " << d_insn.opcode;
break;
@@ -1064,7 +1099,46 @@ void MIRGraph::DataFlowSSAFormat3RC(MIR* mir) {
}
void MIRGraph::DataFlowSSAFormatExtended(MIR* mir) {
+ const MIR::DecodedInstruction& d_insn = mir->dalvikInsn;
+ // For vector MIRs, vC contains type information
+ bool is_vector_type_wide = false;
+ int type_size = d_insn.vC >> 16;
+ if (type_size == k64 || type_size == kDouble) {
+ is_vector_type_wide = true;
+ }
+
switch (static_cast<int>(mir->dalvikInsn.opcode)) {
+ case kMirOpPackedAddReduce:
+ // We have one use, plus one more for wide
+ AllocateSSAUseData(mir, is_vector_type_wide ? 2 : 1);
+ HandleSSAUse(mir->ssa_rep->uses, d_insn.vA, 0);
+ if (is_vector_type_wide == true) {
+ HandleSSAUse(mir->ssa_rep->uses, d_insn.vA + 1, 1);
+ }
+
+ // We have a def, plus one more for wide
+ AllocateSSADefData(mir, is_vector_type_wide ? 2 : 1);
+ HandleSSADef(mir->ssa_rep->defs, d_insn.vA, 0);
+ if (is_vector_type_wide == true) {
+ HandleSSADef(mir->ssa_rep->defs, d_insn.vA + 1, 1);
+ }
+ break;
+ case kMirOpPackedReduce:
+ // We have a def, plus one more for wide
+ AllocateSSADefData(mir, is_vector_type_wide ? 2 : 1);
+ HandleSSADef(mir->ssa_rep->defs, d_insn.vA, 0);
+ if (is_vector_type_wide == true) {
+ HandleSSADef(mir->ssa_rep->defs, d_insn.vA + 1, 1);
+ }
+ break;
+ case kMirOpPackedSet:
+ // We have one use, plus one more for wide
+ AllocateSSAUseData(mir, is_vector_type_wide ? 2 : 1);
+ HandleSSAUse(mir->ssa_rep->uses, d_insn.vB, 0);
+ if (is_vector_type_wide == true) {
+ HandleSSAUse(mir->ssa_rep->uses, d_insn.vB + 1, 1);
+ }
+ break;
default:
LOG(ERROR) << "Missing case for extended MIR: " << mir->dalvikInsn.opcode;
break;
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index e77be5d..62a8f26 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -68,6 +68,8 @@ const char* MIRGraph::extended_mir_op_names_[kMirOpLast - kMirOpFirst] = {
"ReserveVectorRegisters",
"ReturnVectorRegisters",
"MemBarrier",
+ "PackedArrayGet",
+ "PackedArrayPut",
};
MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
@@ -1386,6 +1388,18 @@ void MIRGraph::DisassembleExtendedInstr(const MIR* mir, std::string* decoded_mir
decoded_mir->append(ss.str());
break;
}
+ case kMirOpPackedArrayGet:
+ case kMirOpPackedArrayPut:
+ decoded_mir->append(StringPrintf(" vect%d", mir->dalvikInsn.vA));
+ if (ssa_rep != nullptr) {
+ decoded_mir->append(StringPrintf(", %s[%s]",
+ GetSSANameWithConst(ssa_rep->uses[0], false).c_str(),
+ GetSSANameWithConst(ssa_rep->uses[1], false).c_str()));
+ } else {
+ decoded_mir->append(StringPrintf(", v%d[v%d]", mir->dalvikInsn.vB, mir->dalvikInsn.vC));
+ }
+ FillTypeSizeString(mir->dalvikInsn.arg[0], decoded_mir);
+ break;
default:
break;
}
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 46f5dd3..9935a22 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -16,6 +16,7 @@
#include "codegen_x86.h"
#include "dex/quick/mir_to_lir-inl.h"
+#include "oat.h"
#include "x86_lir.h"
namespace art {
@@ -389,20 +390,27 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
EXT_0F_ENCODING_MAP(Subss, 0xF3, 0x5C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Divsd, 0xF2, 0x5E, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Divss, 0xF3, 0x5E, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Punpcklbw, 0x66, 0x60, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Punpcklwd, 0x66, 0x61, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Punpcklqdq, 0x66, 0x6C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Sqrtsd, 0xF2, 0x51, REG_DEF0_USE0),
EXT_0F_ENCODING2_MAP(Pmulld, 0x66, 0x38, 0x40, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Pmullw, 0x66, 0xD5, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Pmuludq, 0x66, 0xF4, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Mulps, 0x00, 0x59, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Mulpd, 0x66, 0x59, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Paddb, 0x66, 0xFC, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Paddw, 0x66, 0xFD, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Paddd, 0x66, 0xFE, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Paddq, 0x66, 0xD4, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Psadbw, 0x66, 0xF6, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Addps, 0x00, 0x58, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Addpd, 0xF2, 0x58, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Psubb, 0x66, 0xF8, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Psubw, 0x66, 0xF9, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Psubd, 0x66, 0xFA, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Psubq, 0x66, 0xFB, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Subps, 0x00, 0x5C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Subpd, 0x66, 0x5C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Pand, 0x66, 0xDB, REG_DEF0_USE0),
@@ -431,6 +439,7 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
{ kX86PsrlwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 2, 0, 1, false }, "PsrlwRI", "!0r,!1d" },
{ kX86PsrldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 2, 0, 1, false }, "PsrldRI", "!0r,!1d" },
{ kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1, false }, "PsrlqRI", "!0r,!1d" },
+ { kX86PsrldqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 3, 0, 1, false }, "PsrldqRI", "!0r,!1d" },
{ kX86PsllwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 6, 0, 1, false }, "PsllwRI", "!0r,!1d" },
{ kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1, false }, "PslldRI", "!0r,!1d" },
{ kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1, false }, "PsllqRI", "!0r,!1d" },
@@ -447,9 +456,9 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
{ kX86Fucompp, kNullary, NO_OPERAND | USE_FP_STACK, { 0xDA, 0, 0xE9, 0, 0, 0, 0, 0, false }, "Fucompp", "" },
{ kX86Fstsw16R, kNullary, NO_OPERAND | REG_DEFA | USE_FP_STACK, { 0x9B, 0xDF, 0xE0, 0, 0, 0, 0, 0, false }, "Fstsw16R", "ax" },
- EXT_0F_ENCODING_MAP(Mova128, 0x66, 0x6F, REG_DEF0),
- { kX86Mova128MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128MR", "[!0r+!1d],!2r" },
- { kX86Mova128AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128AR", "[!0r+!1r<<!2d+!3d],!4r" },
+ EXT_0F_ENCODING_MAP(Movdqa, 0x66, 0x6F, REG_DEF0),
+ { kX86MovdqaMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "MovdqaMR", "[!0r+!1d],!2r" },
+ { kX86MovdqaAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "MovdqaAR", "[!0r+!1r<<!2d+!3d],!4r" },
EXT_0F_ENCODING_MAP(Movups, 0x0, 0x10, REG_DEF0),
@@ -1956,17 +1965,12 @@ void X86Mir2Lir::AssignOffsets() {
int offset = AssignInsnOffsets();
if (const_vectors_ != nullptr) {
- /* assign offsets to vector literals */
-
- // First, get offset to 12 mod 16 to align to 16 byte boundary.
- // This will ensure that the vector is 16 byte aligned, as the procedure is
- // always aligned at at 4 mod 16.
- int align_size = (16-4) - (offset & 0xF);
- if (align_size < 0) {
- align_size += 16;
- }
-
- offset += align_size;
+ // Vector literals must be 16-byte aligned. The header that is placed
+ // in the code section causes misalignment, so we take it into account.
+ // Apart from the header, the x86 method itself is known to be 16-byte aligned.
+ DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u);
+ uint32_t bytes_to_fill = (0x10 - ((offset + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF;
+ offset += bytes_to_fill;
// Now assign each literal the right offset.
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
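
The padding computation above folds the method header into the alignment so that the vector literals land on a 16-byte boundary once the OatQuickMethodHeader is prepended to the code. A stand-alone sketch of the arithmetic, using an assumed header size rather than the real sizeof(OatQuickMethodHeader):

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  const uint32_t kHeaderSize = 24;  // assumed stand-in for sizeof(OatQuickMethodHeader)
  for (uint32_t offset : {0u, 4u, 9u, 40u}) {
    uint32_t bytes_to_fill = (0x10 - ((offset + kHeaderSize) & 0xF)) & 0xF;
    // After padding, header + code offset is a multiple of 16, so the literals are aligned.
    printf("offset=%2u -> pad %2u bytes, (header + offset + pad) %% 16 = %u\n",
           offset, bytes_to_fill, (kHeaderSize + offset + bytes_to_fill) % 16);
  }
  return 0;
}
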
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 7ad917d..a85e02f 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -125,7 +125,7 @@ class X86Mir2Lir : public Mir2Lir {
void CompilerInitializeRegAlloc() OVERRIDE;
int VectorRegisterSize() OVERRIDE;
- int NumReservableVectorRegisters(bool fp_used) OVERRIDE;
+ int NumReservableVectorRegisters(bool long_or_fp) OVERRIDE;
// Required for target - miscellaneous.
void AssembleLIR() OVERRIDE;
@@ -479,7 +479,8 @@ class X86Mir2Lir : public Mir2Lir {
void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1,
int64_t val, ConditionCode ccode);
void GenConstWide(RegLocation rl_dest, int64_t value);
- void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir);
+ void GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2);
+ void GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2);
void GenShiftByteVector(BasicBlock *bb, MIR *mir);
void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3,
uint32_t m4);
@@ -521,20 +522,18 @@ class X86Mir2Lir : public Mir2Lir {
bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
/**
- * @brief Reserve a fixed number of vector registers from the register pool
- * @details The mir->dalvikInsn.vA specifies an N such that vector registers
- * [0..N-1] are removed from the temporary pool. The caller must call
- * ReturnVectorRegisters before calling ReserveVectorRegisters again.
- * Also sets the num_reserved_vector_regs_ to the specified value
- * @param mir whose vA specifies the number of registers to reserve
+ * @brief Used to reserve a range of vector registers.
+ * @see kMirOpReserveVectorRegisters
+ * @param mir The extended MIR for reservation.
*/
void ReserveVectorRegisters(MIR* mir);
/**
- * @brief Return all the reserved vector registers to the temp pool
- * @details Returns [0..num_reserved_vector_regs_]
+ * @brief Used to return a range of vector registers.
+ * @see kMirOpReturnVectorRegisters
+ * @param mir The extended MIR for returning vector regs.
*/
- void ReturnVectorRegisters();
+ void ReturnVectorRegisters(MIR* mir);
/*
* @brief Load 128 bit constant into vector register.
@@ -684,6 +683,20 @@ class X86Mir2Lir : public Mir2Lir {
*/
void GenSetVector(BasicBlock *bb, MIR *mir);
+ /**
+ * @brief Used to generate code for kMirOpPackedArrayGet.
+ * @param bb The basic block of MIR.
+ * @param mir The mir whose opcode is kMirOpPackedArrayGet.
+ */
+ void GenPackedArrayGet(BasicBlock *bb, MIR *mir);
+
+ /**
+ * @brief Used to generate code for kMirOpPackedArrayPut.
+ * @param bb The basic block of MIR.
+ * @param mir The mir whose opcode is kMirOpPackedArrayPut.
+ */
+ void GenPackedArrayPut(BasicBlock *bb, MIR *mir);
+
/*
* @brief Generate code for a vector opcode.
* @param bb The basic block in which the MIR is from.
@@ -937,20 +950,20 @@ class X86Mir2Lir : public Mir2Lir {
LIR* stack_increment_;
// The list of const vector literals.
- LIR *const_vectors_;
+ LIR* const_vectors_;
/*
* @brief Search for a matching vector literal
- * @param mir A kMirOpConst128b MIR instruction to match.
+ * @param constants An array of size 4 which contains all of 32-bit constants.
* @returns pointer to matching LIR constant, or nullptr if not found.
*/
- LIR *ScanVectorLiteral(MIR *mir);
+ LIR* ScanVectorLiteral(int32_t* constants);
/*
* @brief Add a constant vector literal
- * @param mir A kMirOpConst128b MIR instruction to match.
+ * @param constants An array of size 4 which contains all of 32-bit constants.
*/
- LIR *AddVectorLiteral(MIR *mir);
+ LIR* AddVectorLiteral(int32_t* constants);
InToRegStorageMapping in_to_reg_storage_mapping_;
@@ -970,9 +983,6 @@ class X86Mir2Lir : public Mir2Lir {
static const X86EncodingMap EncodingMap[kX86Last];
private:
- // The number of vector registers [0..N] reserved by a call to ReserveVectorRegisters
- int num_reserved_vector_regs_;
-
void SwapBits(RegStorage result_reg, int shift, int32_t value);
void SwapBits64(RegStorage result_reg, int shift, int64_t value);
};
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 68c1633..ffe6702 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -24,6 +24,7 @@
#include "dex/reg_storage_eq.h"
#include "mirror/array.h"
#include "mirror/string.h"
+#include "oat.h"
#include "x86_lir.h"
#include "utils/dwarf_cfi.h"
@@ -454,7 +455,7 @@ RegStorage X86Mir2Lir::AllocateByteRegister() {
}
RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) {
- return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg();
+ return GetRegInfo(reg)->Master()->GetReg();
}
bool X86Mir2Lir::IsByteRegister(RegStorage reg) {
@@ -689,8 +690,11 @@ int X86Mir2Lir::VectorRegisterSize() {
return 128;
}
-int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) {
- return fp_used ? 5 : 7;
+int X86Mir2Lir::NumReservableVectorRegisters(bool long_or_fp) {
+ int num_vector_temps = cu_->target64 ? xp_temps_64.size() : xp_temps_32.size();
+
+ // Leave a few temps for use by backend as scratch.
+ return long_or_fp ? num_vector_temps - 2 : num_vector_temps - 1;
}
void X86Mir2Lir::SpillCoreRegs() {
@@ -864,9 +868,6 @@ X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator*
rX86_RET1 = rDX;
rX86_INVOKE_TGT = rAX;
rX86_COUNT = rCX;
-
- // Initialize the number of reserved vector registers
- num_reserved_vector_regs_ = -1;
}
Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph,
@@ -1022,19 +1023,18 @@ void X86Mir2Lir::InstallLiteralPools() {
DCHECK(method_literal_list_ == nullptr);
DCHECK(class_literal_list_ == nullptr);
- // Align to 16 byte boundary. We have implicit knowledge that the start of the method is
- // on a 4 byte boundary. How can I check this if it changes (other than aligned loads
- // will fail at runtime)?
- if (const_vectors_ != nullptr) {
- int align_size = (16-4) - (code_buffer_.size() & 0xF);
- if (align_size < 0) {
- align_size += 16;
- }
- while (align_size > 0) {
+ if (const_vectors_ != nullptr) {
+ // Vector literals must be 16-byte aligned. The header that is placed
+ // in the code section causes misalignment, so we take it into account.
+ // Apart from the header, the x86 method itself is known to be 16-byte aligned.
+ DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u);
+ uint32_t bytes_to_fill = (0x10 - ((code_buffer_.size() + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF;
+ while (bytes_to_fill > 0) {
code_buffer_.push_back(0);
- align_size--;
+ bytes_to_fill--;
}
+
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
PushWord(&code_buffer_, p->operands[0]);
PushWord(&code_buffer_, p->operands[1]);
@@ -1489,7 +1489,7 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
ReserveVectorRegisters(mir);
break;
case kMirOpReturnVectorRegisters:
- ReturnVectorRegisters();
+ ReturnVectorRegisters(mir);
break;
case kMirOpConstVector:
GenConst128(bb, mir);
@@ -1536,17 +1536,19 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
case kMirOpMemBarrier:
GenMemBarrier(static_cast<MemBarrierKind>(mir->dalvikInsn.vA));
break;
+ case kMirOpPackedArrayGet:
+ GenPackedArrayGet(bb, mir);
+ break;
+ case kMirOpPackedArrayPut:
+ GenPackedArrayPut(bb, mir);
+ break;
default:
break;
}
}
void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) {
- // We should not try to reserve twice without returning the registers
- DCHECK_NE(num_reserved_vector_regs_, -1);
-
- int num_vector_reg = mir->dalvikInsn.vA;
- for (int i = 0; i < num_vector_reg; i++) {
+ for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) {
RegStorage xp_reg = RegStorage::Solo128(i);
RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
Clobber(xp_reg);
@@ -1561,13 +1563,10 @@ void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) {
}
}
}
-
- num_reserved_vector_regs_ = num_vector_reg;
}
-void X86Mir2Lir::ReturnVectorRegisters() {
- // Return all the reserved registers
- for (int i = 0; i < num_reserved_vector_regs_; i++) {
+void X86Mir2Lir::ReturnVectorRegisters(MIR* mir) {
+ for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) {
RegStorage xp_reg = RegStorage::Solo128(i);
RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
@@ -1581,17 +1580,12 @@ void X86Mir2Lir::ReturnVectorRegisters() {
}
}
}
-
- // We don't have anymore reserved vector registers
- num_reserved_vector_regs_ = -1;
}
void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
- store_method_addr_used_ = true;
- int type_size = mir->dalvikInsn.vB;
- // We support 128 bit vectors.
- DCHECK_EQ(type_size & 0xFFFF, 128);
RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest);
+
uint32_t *args = mir->dalvikInsn.arg;
int reg = rs_dest.GetReg();
// Check for all 0 case.
@@ -1601,14 +1595,24 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
}
// Append the mov const vector to reg opcode.
- AppendOpcodeWithConst(kX86MovupsRM, reg, mir);
+ AppendOpcodeWithConst(kX86MovdqaRM, reg, mir);
}
void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) {
- // Okay, load it from the constant vector area.
- LIR *data_target = ScanVectorLiteral(mir);
+ // The literal pool needs position independent logic.
+ store_method_addr_used_ = true;
+
+ // To deal with correct memory ordering, reverse order of constants.
+ int32_t constants[4];
+ constants[3] = mir->dalvikInsn.arg[0];
+ constants[2] = mir->dalvikInsn.arg[1];
+ constants[1] = mir->dalvikInsn.arg[2];
+ constants[0] = mir->dalvikInsn.arg[3];
+
+ // Search if there is already a constant in pool with this value.
+ LIR *data_target = ScanVectorLiteral(constants);
if (data_target == nullptr) {
- data_target = AddVectorLiteral(mir);
+ data_target = AddVectorLiteral(constants);
}
// Address the start of the method.
@@ -1624,7 +1628,7 @@ void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) {
// 4 byte offset. We will fix this up in the assembler later to have the right
// value.
ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
- LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg());
+ LIR *load = NewLIR3(opcode, reg, rl_method.reg.GetReg(), 256 /* bogus */);
load->flags.fixup = kFixupLoad;
load->target = data_target;
}
@@ -1633,16 +1637,12 @@ void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest);
RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
- NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
+ NewLIR2(kX86MovdqaRR, rs_dest.GetReg(), rs_src.GetReg());
}
-void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) {
- const int BYTE_SIZE = 8;
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide());
-
+void X86Mir2Lir::GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2) {
/*
* Emulate the behavior of a kSignedByte by separating out the 16 values in the two XMM
* and multiplying 8 at a time before recombining back into one XMM register.
@@ -1660,29 +1660,100 @@ void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) {
*/
// Copy xmm1.
- NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg());
+ RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempDouble());
+ RegStorage rs_dest_high_tmp = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_src1_high_tmp.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86MovdqaRR, rs_dest_high_tmp.GetReg(), rs_dest_src1.GetReg());
// Multiply low bits.
+ // x7 *= x3
NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
// xmm1 now has low bits.
AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
// Prepare high bits for multiplication.
- NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE);
- AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+ NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), 0x8);
+ AndMaskVectorRegister(rs_dest_high_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
// Multiply high bits and xmm2 now has high bits.
- NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg());
+ NewLIR2(kX86PmullwRR, rs_src1_high_tmp.GetReg(), rs_dest_high_tmp.GetReg());
// Combine back into dest XMM register.
- NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src1_high_tmp.GetReg());
+}
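
As a scalar sanity check of the byte-multiply emulation above, the following stand-alone sketch models one 16-bit lane holding two packed bytes (arbitrary values); the masks and shifts mirror the pmullw/psrlw/pand/por sequence and recover each byte product modulo 256.

#include <cstdint>
#include <cassert>

int main() {
  uint8_t a_lo = 0x7B, a_hi = 0xC4, b_lo = 0x15, b_hi = 0x3E;      // arbitrary packed bytes
  uint16_t a = static_cast<uint16_t>((a_hi << 8) | a_lo);
  uint16_t b = static_cast<uint16_t>((b_hi << 8) | b_lo);
  uint16_t low  = static_cast<uint16_t>(a * b) & 0x00FF;           // pmullw, then pand 0x00FF00FF...
  uint16_t high = static_cast<uint16_t>((a >> 8) * (b & 0xFF00));  // psrlw 8 / pand 0xFF00FF00..., then pmullw
  uint16_t lane = low | (high & 0xFF00);                           // por back into the destination
  assert((lane & 0xFF) == static_cast<uint8_t>(a_lo * b_lo));      // low byte product mod 256
  assert((lane >> 8)   == static_cast<uint8_t>(a_hi * b_hi));      // high byte product mod 256
  return 0;
}
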
+
+void X86Mir2Lir::GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2) {
+ /*
+ * We need to emulate the packed long multiply.
+ * For kMirOpPackedMultiply xmm1, xmm0:
+ * - xmm1 is src/dest
+ * - xmm0 is src
+ * - Get xmm2 and xmm3 as temp
+ * - Idea is to multiply the lower 32 of each operand with the higher 32 of the other.
+ * - Then add the two results.
+ * - Move it to the upper 32 of the destination
+ * - Then multiply the lower 32-bits of the operands and add the result to the destination.
+ *
+ * (op dest src )
+ * movdqa %xmm2, %xmm1
+ * movdqa %xmm3, %xmm0
+ * psrlq %xmm3, $0x20
+ * pmuludq %xmm3, %xmm2
+ * psrlq %xmm1, $0x20
+ * pmuludq %xmm1, %xmm0
+ * paddq %xmm1, %xmm3
+ * psllq %xmm1, $0x20
+ * pmuludq %xmm2, %xmm0
+ * paddq %xmm1, %xmm2
+ *
+ * When both the operands are the same, then we need to calculate the lower-32 * higher-32
+ * calculation only once. Thus we don't need the xmm3 temp above. That sequence becomes:
+ *
+ * (op dest src )
+ * movdqa %xmm2, %xmm1
+ * psrlq %xmm1, $0x20
+ * pmuludq %xmm1, %xmm0
+ * paddq %xmm1, %xmm1
+ * psllq %xmm1, $0x20
+ * pmuludq %xmm2, %xmm0
+ * paddq %xmm1, %xmm2
+ *
+ */
+
+ bool both_operands_same = (rs_dest_src1.GetReg() == rs_src2.GetReg());
+
+ RegStorage rs_tmp_vector_1;
+ RegStorage rs_tmp_vector_2;
+ rs_tmp_vector_1 = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector_1.GetReg(), rs_dest_src1.GetReg());
+
+ if (both_operands_same == false) {
+ rs_tmp_vector_2 = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector_2.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86PsrlqRI, rs_tmp_vector_2.GetReg(), 0x20);
+ NewLIR2(kX86PmuludqRR, rs_tmp_vector_2.GetReg(), rs_tmp_vector_1.GetReg());
+ }
+
+ NewLIR2(kX86PsrlqRI, rs_dest_src1.GetReg(), 0x20);
+ NewLIR2(kX86PmuludqRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+
+ if (both_operands_same == false) {
+ NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_2.GetReg());
+ } else {
+ NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg());
+ }
+
+ NewLIR2(kX86PsllqRI, rs_dest_src1.GetReg(), 0x20);
+ NewLIR2(kX86PmuludqRR, rs_tmp_vector_1.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_1.GetReg());
}
void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
@@ -1700,7 +1771,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
break;
case kSignedByte:
// HW doesn't support 16x16 byte multiplication so emulate it.
- GenMultiplyVectorSignedByte(bb, mir);
+ GenMultiplyVectorSignedByte(rs_dest_src1, rs_src2);
+ return;
+ case k64:
+ GenMultiplyVectorLong(rs_dest_src1, rs_src2);
return;
default:
LOG(FATAL) << "Unsupported vector multiply " << opsize;
@@ -1713,12 +1787,16 @@ void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
case k32:
opcode = kX86PadddRR;
break;
+ case k64:
+ opcode = kX86PaddqRR;
+ break;
case kSignedHalf:
case kUnsignedHalf:
opcode = kX86PaddwRR;
@@ -1744,12 +1822,16 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
case k32:
opcode = kX86PsubdRR;
break;
+ case k64:
+ opcode = kX86PsubqRR;
+ break;
case kSignedHalf:
case kUnsignedHalf:
opcode = kX86PsubwRR;
@@ -1772,58 +1854,54 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
}
void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) {
+ // The destination does not need to be clobbered because it has already been
+ // clobbered as part of the general packed shift handler (the caller of this method).
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
- RegStorage rs_tmp = Get128BitRegister(AllocTempWide());
int opcode = 0;
- int imm = mir->dalvikInsn.vB;
-
switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
case kMirOpPackedShiftLeft:
opcode = kX86PsllwRI;
break;
case kMirOpPackedSignedShiftRight:
- opcode = kX86PsrawRI;
- break;
case kMirOpPackedUnsignedShiftRight:
- opcode = kX86PsrlwRI;
- break;
+ // TODO Add support for emulated byte shifts.
default:
LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode;
break;
}
- /*
- * xmm1 will have low bits
- * xmm2 will have high bits
- *
- * xmm2 = xmm1
- * xmm1 = xmm1 .<< N
- * xmm2 = xmm2 && 0xFF00FF00FF00FF00FF00FF00FF00FF00
- * xmm2 = xmm2 .<< N
- * xmm1 = xmm1 | xmm2
- */
-
- // Copy xmm1.
- NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg());
+ // Clear the xmm register and return if the shift is more than the byte length.
+ int imm = mir->dalvikInsn.vB;
+ if (imm >= 8) {
+ NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg());
+ return;
+ }
// Shift lower values.
NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
- // Mask bottom bits.
- AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
-
- // Shift higher values.
- NewLIR2(opcode, rs_tmp.GetReg(), imm);
+ /*
+ * The above shift will shift the whole word, but that means
+ * both the bytes will shift as well. To emulate a byte level
+ * shift, we can just throw away the lower (8 - N) bits of the
+ * upper byte, and we are done.
+ */
+ uint8_t byte_mask = 0xFF << imm;
+ uint32_t int_mask = byte_mask;
+ int_mask = int_mask << 8 | byte_mask;
+ int_mask = int_mask << 8 | byte_mask;
+ int_mask = int_mask << 8 | byte_mask;
- // Combine back into dest XMM register.
- NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg());
+ // And the destination with the mask
+ AndMaskVectorRegister(rs_dest_src1, int_mask, int_mask, int_mask, int_mask);
}
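
A scalar sketch of the byte-shift emulation above, using one 16-bit word that packs two bytes (arbitrary values): shifting the whole word with psllw and then masking with the replicated byte_mask matches shifting each byte independently.

#include <cstdint>
#include <cassert>

int main() {
  int imm = 3;                                   // shift distance, 0 <= imm < 8
  uint8_t byte_mask = 0xFF << imm;               // 0xF8
  uint32_t int_mask = byte_mask;
  int_mask = int_mask << 8 | byte_mask;
  int_mask = int_mask << 8 | byte_mask;
  int_mask = int_mask << 8 | byte_mask;          // 0xF8F8F8F8, replicated across the register
  assert(int_mask == 0xF8F8F8F8u);

  uint16_t word = 0x4DB5;                                       // two packed bytes {0x4D, 0xB5}
  uint16_t shifted = static_cast<uint16_t>(word << imm);        // psllw on the whole word
  uint16_t masked  = shifted & ((byte_mask << 8) | byte_mask);  // pand with the replicated mask
  assert(static_cast<uint8_t>(masked)      == static_cast<uint8_t>(0xB5 << imm));  // low byte
  assert(static_cast<uint8_t>(masked >> 8) == static_cast<uint8_t>(0x4D << imm));  // high byte
  return 0;
}
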
void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
@@ -1852,6 +1930,7 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
@@ -1866,6 +1945,8 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
case kUnsignedByte:
GenShiftByteVector(bb, mir);
return;
+ case k64:
+ // TODO Implement emulated shift algorithm.
default:
LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
break;
@@ -1877,6 +1958,7 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
@@ -1905,6 +1987,7 @@ void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
@@ -1913,6 +1996,7 @@ void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
@@ -1921,6 +2005,7 @@ void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
@@ -1945,134 +2030,240 @@ void X86Mir2Lir::MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32
void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
- RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegLocation rl_dest = mir_graph_->GetDest(mir);
- RegStorage rs_tmp;
-
- int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
- int vec_unit_size = 0;
- int opcode = 0;
- int extr_opcode = 0;
- RegLocation rl_result;
+ RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB);
+ bool is_wide = opsize == k64 || opsize == kDouble;
+
+ // Get the location of the virtual register. Since this bytecode is overloaded
+ // for different types (and sizes), we need different logic for each path.
+ // The design of the bytecode uses the same VR for both source and destination.
+ RegLocation rl_src, rl_dest, rl_result;
+ if (is_wide) {
+ rl_src = mir_graph_->GetSrcWide(mir, 0);
+ rl_dest = mir_graph_->GetDestWide(mir);
+ } else {
+ rl_src = mir_graph_->GetSrc(mir, 0);
+ rl_dest = mir_graph_->GetDest(mir);
+ }
- switch (opsize) {
- case k32:
- extr_opcode = kX86PextrdRRI;
- opcode = kX86PhadddRR;
- vec_unit_size = 4;
- break;
- case kSignedByte:
- case kUnsignedByte:
- extr_opcode = kX86PextrbRRI;
- opcode = kX86PhaddwRR;
- vec_unit_size = 2;
- break;
- case kSignedHalf:
- case kUnsignedHalf:
- extr_opcode = kX86PextrwRRI;
- opcode = kX86PhaddwRR;
- vec_unit_size = 2;
- break;
- case kSingle:
- rl_result = EvalLoc(rl_dest, kFPReg, true);
- vec_unit_size = 4;
- for (int i = 0; i < 3; i++) {
- NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
- NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39);
- }
- NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
- StoreValue(rl_dest, rl_result);
+ // We need a temp GP register for the integral (byte, short and int) cases
+ RegStorage temp;
- // For single-precision floats, we are done here
- return;
- default:
- LOG(FATAL) << "Unsupported vector add reduce " << opsize;
- break;
- }
+ // There is a different path depending on type and size.
+ if (opsize == kSingle) {
+ // Handle float case.
+ // TODO Add support for fast math (not value safe) and do horizontal add in that case.
- int elems = vec_bytes / vec_unit_size;
+ rl_src = LoadValue(rl_src, kFPReg);
+ rl_result = EvalLoc(rl_dest, kFPReg, true);
- // Emulate horizontal add instruction by reducing 2 vectors with 8 values before adding them again
- // TODO is overflow handled correctly?
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- rs_tmp = Get128BitRegister(AllocTempWide());
+ // Since we are doing an add-reduce, we move the reg holding the VR
+ // into the result so we include it in result.
+ OpRegCopy(rl_result.reg, rl_src.reg);
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
- // tmp = xmm1 .>> 8.
- NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg());
- NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8);
+ // Since FP must keep order of operation for value safety, we shift to low
+ // 32-bits and add to result.
+ for (int i = 0; i < 3; i++) {
+ NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39);
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+ }
- // Zero extend low bits in xmm1.
- AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
- }
+ StoreValue(rl_dest, rl_result);
+ } else if (opsize == kDouble) {
+ // Handle double case.
+ rl_src = LoadValueWide(rl_src, kFPReg);
+ rl_result = EvalLocWide(rl_dest, kFPReg, true);
+ LOG(FATAL) << "Unsupported vector add reduce for double.";
+ } else if (opsize == k64) {
+ /*
+ * Handle long case:
+ * 1) Reduce the vector register to lower half (with addition).
+ * 1-1) Get an xmm temp and fill it with vector register.
+ * 1-2) Shift the xmm temp by 8-bytes.
+ * 1-3) Add the xmm temp to vector register that is being reduced.
+ * 2) Allocate temp GP / GP pair.
+ * 2-1) In 64-bit case, use movq to move result to a 64-bit GP.
+ * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair.
+ * 3) Finish the add reduction by doing what add-long/2addr does,
+ * but instead of having a VR as one of the sources, we have our temp GP.
+ */
+ RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8);
+ NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg());
+ FreeTemp(rs_tmp_vector);
+
+ // We would like to be able to reuse the add-long implementation, so set up a fake
+ // register location to pass it.
+ RegLocation temp_loc = mir_graph_->GetBadLoc();
+ temp_loc.core = 1;
+ temp_loc.wide = 1;
+ temp_loc.location = kLocPhysReg;
+ temp_loc.reg = AllocTempWide();
+
+ if (cu_->target64) {
+ DCHECK(!temp_loc.reg.IsPair());
+ NewLIR2(kX86MovqrxRR, temp_loc.reg.GetReg(), vector_src.GetReg());
+ } else {
+ NewLIR2(kX86MovdrxRR, temp_loc.reg.GetLowReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20);
+ NewLIR2(kX86MovdrxRR, temp_loc.reg.GetHighReg(), vector_src.GetReg());
+ }
- while (elems > 1) {
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg());
+ GenArithOpLong(Instruction::ADD_LONG_2ADDR, rl_dest, temp_loc, temp_loc);
+ } else if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ RegStorage rs_tmp = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86PxorRR, rs_tmp.GetReg(), rs_tmp.GetReg());
+ NewLIR2(kX86PsadbwRR, vector_src.GetReg(), rs_tmp.GetReg());
+ NewLIR3(kX86PshufdRRI, rs_tmp.GetReg(), vector_src.GetReg(), 0x4e);
+ NewLIR2(kX86PaddbRR, vector_src.GetReg(), rs_tmp.GetReg());
+ // Move to a GPR
+ temp = AllocTemp();
+ NewLIR2(kX86MovdrxRR, temp.GetReg(), vector_src.GetReg());
+ } else {
+ // Handle the int and short cases together.
+
+ // Initialize as if we were handling int case. Below we update
+ // the opcode if handling byte or short.
+ int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
+ int vec_unit_size;
+ int horizontal_add_opcode;
+ int extract_opcode;
+
+ if (opsize == kSignedHalf || opsize == kUnsignedHalf) {
+ extract_opcode = kX86PextrwRRI;
+ horizontal_add_opcode = kX86PhaddwRR;
+ vec_unit_size = 2;
+ } else if (opsize == k32) {
+ vec_unit_size = 4;
+ horizontal_add_opcode = kX86PhadddRR;
+ extract_opcode = kX86PextrdRRI;
+ } else {
+ LOG(FATAL) << "Unsupported vector add reduce " << opsize;
+ return;
}
- NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg());
- elems >>= 1;
- }
- // Combine the results if we separated them.
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg());
- }
+ int elems = vec_bytes / vec_unit_size;
- // We need to extract to a GPR.
- RegStorage temp = AllocTemp();
- NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0);
+ while (elems > 1) {
+ NewLIR2(horizontal_add_opcode, vector_src.GetReg(), vector_src.GetReg());
+ elems >>= 1;
+ }
- // Can we do this directly into memory?
- rl_result = UpdateLocTyped(rl_dest, kCoreReg);
- if (rl_result.location == kLocPhysReg) {
- // Ensure res is in a core reg
- rl_result = EvalLoc(rl_dest, kCoreReg, true);
- OpRegReg(kOpAdd, rl_result.reg, temp);
- StoreFinalValue(rl_dest, rl_result);
- } else {
- OpMemReg(kOpAdd, rl_result, temp.GetReg());
- }
+ // Handle this as arithmetic unary case.
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
- FreeTemp(temp);
+ // Extract to a GP register because this is integral typed.
+ temp = AllocTemp();
+ NewLIR3(extract_opcode, temp.GetReg(), vector_src.GetReg(), 0);
+ }
+
+ if (opsize != k64 && opsize != kSingle && opsize != kDouble) {
+ // The logic below looks very similar to the handling of ADD_INT_2ADDR
+ // except the rhs is not a VR but a physical register allocated above.
+ // No load of source VR is done because it assumes that rl_result will
+ // share physical register / memory location.
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ if (rl_result.location == kLocPhysReg) {
+ // Ensure res is in a core reg.
+ rl_result = EvalLoc(rl_dest, kCoreReg, true);
+ OpRegReg(kOpAdd, rl_result.reg, temp);
+ StoreFinalValue(rl_dest, rl_result);
+ } else {
+ // Do the addition directly to memory.
+ OpMemReg(kOpAdd, rl_result, temp.GetReg());
+ }
+ }
}
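
The k64 path above (and the matching path in GenReduceVector below) folds the two 64-bit lanes into the low lane by copying the vector, shifting the copy right by 8 bytes with psrldq, and adding with paddq. A stand-alone scalar sketch of that lane fold (arbitrary lane values):

#include <cstdint>
#include <cassert>

int main() {
  uint64_t lanes[2]   = {7, 35};                // xmm viewed as two 64-bit lanes {low, high}
  uint64_t shifted[2] = {lanes[1], 0};          // psrldq tmp, 8: high lane moves into the low lane
  uint64_t low_lane   = lanes[0] + shifted[0];  // paddq: low lane now holds the full reduction
  assert(low_lane == 42);
  return 0;
}
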
void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegLocation rl_dest = mir_graph_->GetDest(mir);
- RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB);
int extract_index = mir->dalvikInsn.arg[0];
int extr_opcode = 0;
RegLocation rl_result;
bool is_wide = false;
- switch (opsize) {
- case k32:
- rl_result = UpdateLocTyped(rl_dest, kCoreReg);
- extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI;
- break;
- case kSignedHalf:
- case kUnsignedHalf:
- rl_result= UpdateLocTyped(rl_dest, kCoreReg);
- extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI;
- break;
- default:
- LOG(FATAL) << "Unsupported vector add reduce " << opsize;
- return;
- break;
- }
+ // There is a different path depending on type and size.
+ if (opsize == kSingle) {
+ // Handle float case.
+ // TODO Add support for fast math (not value safe) and do horizontal add in that case.
- if (rl_result.location == kLocPhysReg) {
- NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index);
- if (is_wide == true) {
- StoreFinalValue(rl_dest, rl_result);
+ rl_result = EvalLoc(rl_dest, kFPReg, true);
+ NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg());
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+
+ // Since FP must keep order of operation for value safety, we shift to low
+ // 32-bits and add to result.
+ for (int i = 0; i < 3; i++) {
+ NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39);
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+ }
+
+ StoreValue(rl_dest, rl_result);
+ } else if (opsize == kDouble) {
+ // TODO Handle double case.
+ LOG(FATAL) << "Unsupported add reduce for double.";
+ } else if (opsize == k64) {
+ /*
+ * Handle long case:
+ * 1) Reduce the vector register to lower half (with addition).
+ * 1-1) Get an xmm temp and fill it with vector register.
+ * 1-2) Shift the xmm temp by 8-bytes.
+ * 1-3) Add the xmm temp to vector register that is being reduced.
+ * 2) Evaluate destination to a GP / GP pair.
+ * 2-1) In 64-bit case, use movq to move result to a 64-bit GP.
+ * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair.
+ * 3) Store the result to the final destination.
+ */
+ RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8);
+ NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg());
+ FreeTemp(rs_tmp_vector);
+
+ rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+ if (cu_->target64) {
+ DCHECK(!rl_result.reg.IsPair());
+ NewLIR2(kX86MovqrxRR, rl_result.reg.GetReg(), vector_src.GetReg());
} else {
- StoreFinalValueWide(rl_dest, rl_result);
+ NewLIR2(kX86MovdrxRR, rl_result.reg.GetLowReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20);
+ NewLIR2(kX86MovdrxRR, rl_result.reg.GetHighReg(), vector_src.GetReg());
}
+
+ StoreValueWide(rl_dest, rl_result);
} else {
- int displacement = SRegOffset(rl_result.s_reg_low);
- LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg());
- AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
- AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+ // Handle the rest of integral types now.
+ switch (opsize) {
+ case k32:
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ rl_result= UpdateLocTyped(rl_dest, kCoreReg);
+ extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector reduce " << opsize;
+ return;
+ }
+
+ if (rl_result.location == kLocPhysReg) {
+ NewLIR3(extr_opcode, rl_result.reg.GetReg(), vector_src.GetReg(), extract_index);
+ if (is_wide == true) {
+ StoreFinalValue(rl_dest, rl_result);
+ } else {
+ StoreFinalValueWide(rl_dest, rl_result);
+ }
+ } else {
+ int displacement = SRegOffset(rl_result.s_reg_low);
+ LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, vector_src.GetReg());
+ AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
+ AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+ }
}
}
@@ -2080,96 +2271,113 @@ void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
- int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR;
+ Clobber(rs_dest);
+ int op_shuffle = 0, op_shuffle_high = 0, op_mov = kX86MovdxrRR;
RegisterClass reg_type = kCoreReg;
+ bool is_wide = false;
switch (opsize) {
case k32:
- op_low = kX86PshufdRRI;
+ op_shuffle = kX86PshufdRRI;
break;
case kSingle:
- op_low = kX86PshufdRRI;
- op_mov = kX86Mova128RR;
+ op_shuffle = kX86PshufdRRI;
+ op_mov = kX86MovdqaRR;
reg_type = kFPReg;
break;
case k64:
- op_low = kX86PshufdRRI;
- imm = 0x44;
- break;
- case kDouble:
- op_low = kX86PshufdRRI;
- op_mov = kX86Mova128RR;
- reg_type = kFPReg;
- imm = 0x44;
+ op_shuffle = kX86PunpcklqdqRR;
+ op_mov = kX86MovqrxRR;
+ is_wide = true;
break;
case kSignedByte:
case kUnsignedByte:
- // Shuffle 8 bit value into 16 bit word.
- // We set val = val + (val << 8) below and use 16 bit shuffle.
+ // We will have the source loaded up in a
+ // double-word before we use this shuffle
+ op_shuffle = kX86PshufdRRI;
+ break;
case kSignedHalf:
case kUnsignedHalf:
// Handles low quadword.
- op_low = kX86PshuflwRRI;
+ op_shuffle = kX86PshuflwRRI;
// Handles upper quadword.
- op_high = kX86PshufdRRI;
+ op_shuffle_high = kX86PshufdRRI;
break;
default:
LOG(FATAL) << "Unsupported vector set " << opsize;
break;
}
- RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
-
- // Load the value from the VR into the reg.
- if (rl_src.wide == 0) {
+ // Load the value from the VR into a physical register.
+ RegLocation rl_src;
+ if (!is_wide) {
+ rl_src = mir_graph_->GetSrc(mir, 0);
rl_src = LoadValue(rl_src, reg_type);
} else {
+ rl_src = mir_graph_->GetSrcWide(mir, 0);
rl_src = LoadValueWide(rl_src, reg_type);
}
+ RegStorage reg_to_shuffle = rl_src.reg;
- // If opsize is 8 bits wide then double value and use 16 bit shuffle instead.
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- RegStorage temp = AllocTemp();
- // val = val + (val << 8).
- NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg());
- NewLIR2(kX86Sal32RI, temp.GetReg(), 8);
- NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg());
- FreeTemp(temp);
+ // Load the value into the XMM register.
+ if (!cu_->target64 && opsize == k64) {
+ // Logic assumes that longs are loaded in GP register pairs.
+ NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), reg_to_shuffle.GetLowReg());
+ RegStorage r_tmp = AllocTempDouble();
+ NewLIR2(kX86MovdxrRR, r_tmp.GetReg(), reg_to_shuffle.GetHighReg());
+ NewLIR2(kX86PunpckldqRR, rs_dest.GetReg(), r_tmp.GetReg());
+ FreeTemp(r_tmp);
+ } else {
+ NewLIR2(op_mov, rs_dest.GetReg(), reg_to_shuffle.GetReg());
}
- // Load the value into the XMM register.
- NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg());
+ if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ // In the byte case, first duplicate it to be a word
+ // Then duplicate it to be a double-word
+ NewLIR2(kX86PunpcklbwRR, rs_dest.GetReg(), rs_dest.GetReg());
+ NewLIR2(kX86PunpcklwdRR, rs_dest.GetReg(), rs_dest.GetReg());
+ }
// Now shuffle the value across the destination.
- NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm);
+ if (op_shuffle == kX86PunpcklqdqRR) {
+ NewLIR2(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg());
+ } else {
+ NewLIR3(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+ }
// And then repeat as needed.
- if (op_high != 0) {
- NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm);
+ if (op_shuffle_high != 0) {
+ NewLIR3(op_shuffle_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
}
}
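
The byte broadcast above (punpcklbw, punpcklwd, then pshufd with immediate 0) doubles the replication width at each step until a whole 32-bit lane holds copies of the byte, which the shuffle then spreads to all four lanes. A scalar sketch of the equivalent arithmetic (arbitrary byte value):

#include <cstdint>
#include <cassert>

int main() {
  uint8_t  b = 0xAB;
  uint16_t word  = static_cast<uint16_t>(b) << 8 | b;          // punpcklbw xmm, xmm (low word)
  uint32_t dword = static_cast<uint32_t>(word) << 16 | word;   // punpcklwd xmm, xmm (low dword)
  uint32_t lanes[4] = {dword, dword, dword, dword};            // pshufd xmm, xmm, 0
  for (uint32_t lane : lanes) {
    assert(lane == 0xABABABABu);
  }
  return 0;
}
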
-LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
- int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
+void X86Mir2Lir::GenPackedArrayGet(BasicBlock *bb, MIR *mir) {
+ UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayGet not supported.";
+}
+
+void X86Mir2Lir::GenPackedArrayPut(BasicBlock *bb, MIR *mir) {
+ UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayPut not supported.";
+}
+
+LIR* X86Mir2Lir::ScanVectorLiteral(int32_t* constants) {
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
- if (args[0] == p->operands[0] && args[1] == p->operands[1] &&
- args[2] == p->operands[2] && args[3] == p->operands[3]) {
+ if (constants[0] == p->operands[0] && constants[1] == p->operands[1] &&
+ constants[2] == p->operands[2] && constants[3] == p->operands[3]) {
return p;
}
}
return nullptr;
}
-LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) {
+LIR* X86Mir2Lir::AddVectorLiteral(int32_t* constants) {
LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocData));
- int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
- new_value->operands[0] = args[0];
- new_value->operands[1] = args[1];
- new_value->operands[2] = args[2];
- new_value->operands[3] = args[3];
+ new_value->operands[0] = constants[0];
+ new_value->operands[1] = constants[1];
+ new_value->operands[2] = constants[2];
+ new_value->operands[3] = constants[3];
new_value->next = const_vectors_;
if (const_vectors_ == nullptr) {
- estimated_native_code_size_ += 12; // Amount needed to align to 16 byte boundary.
+ estimated_native_code_size_ += 12; // Maximum needed to align to 16 byte boundary.
}
estimated_native_code_size_ += 16; // Space for one vector.
const_vectors_ = new_value;
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 4f65a0f..30384ec 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -990,6 +990,17 @@ void X86Mir2Lir::AnalyzeExtendedMIR(int opcode, BasicBlock * bb, MIR *mir) {
case kMirOpConstVector:
store_method_addr_ = true;
break;
+ case kMirOpPackedMultiply:
+ case kMirOpPackedShiftLeft:
+ case kMirOpPackedSignedShiftRight:
+ case kMirOpPackedUnsignedShiftRight: {
+ // Byte emulation requires constants from the literal pool.
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ store_method_addr_ = true;
+ }
+ break;
+ }
default:
// Ignore the rest.
break;
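
A minimal sketch of the vC decoding assumed by the new cases (PackedOpNeedsMethodAddress is hypothetical; OpSize, kSignedByte and kUnsignedByte come from compiler_enums.h): only byte-sized element types force store_method_addr_, because their emulation loads constants from the literal pool.

  #include <cstdint>

  static bool PackedOpNeedsMethodAddress(uint32_t vC) {
    // The element type is packed into the upper 16 bits of vC.
    OpSize opsize = static_cast<OpSize>(vC >> 16);
    return opsize == kSignedByte || opsize == kUnsignedByte;
  }
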
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index e3ef8c1..22a2f30 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -555,20 +555,27 @@ enum X86OpCode {
Binary0fOpCode(kX86Subss), // float subtract
Binary0fOpCode(kX86Divsd), // double divide
Binary0fOpCode(kX86Divss), // float divide
- Binary0fOpCode(kX86Punpckldq), // Interleave low-order double words
+ Binary0fOpCode(kX86Punpcklbw), // Interleave low-order bytes
+ Binary0fOpCode(kX86Punpcklwd), // Interleave low-order words (16-bit)
+ Binary0fOpCode(kX86Punpckldq), // Interleave low-order double words (32-bit)
+ Binary0fOpCode(kX86Punpcklqdq), // Interleave low-order quad words (64-bit)
Binary0fOpCode(kX86Sqrtsd), // square root
Binary0fOpCode(kX86Pmulld), // parallel integer multiply 32 bits x 4
Binary0fOpCode(kX86Pmullw), // parallel integer multiply 16 bits x 8
+ Binary0fOpCode(kX86Pmuludq), // parallel unsigned multiply of 32-bit integers, producing 64-bit results
Binary0fOpCode(kX86Mulps), // parallel FP multiply 32 bits x 4
Binary0fOpCode(kX86Mulpd), // parallel FP multiply 64 bits x 2
Binary0fOpCode(kX86Paddb), // parallel integer addition 8 bits x 16
Binary0fOpCode(kX86Paddw), // parallel integer addition 16 bits x 8
Binary0fOpCode(kX86Paddd), // parallel integer addition 32 bits x 4
+ Binary0fOpCode(kX86Paddq), // parallel integer addition 64 bits x 2
+ Binary0fOpCode(kX86Psadbw), // computes sum of absolute differences for unsigned byte integers
Binary0fOpCode(kX86Addps), // parallel FP addition 32 bits x 4
Binary0fOpCode(kX86Addpd), // parallel FP addition 64 bits x 2
Binary0fOpCode(kX86Psubb), // parallel integer subtraction 8 bits x 16
Binary0fOpCode(kX86Psubw), // parallel integer subtraction 16 bits x 8
Binary0fOpCode(kX86Psubd), // parallel integer subtraction 32 bits x 4
+ Binary0fOpCode(kX86Psubq), // parallel integer subtraction 64 bits x 2
Binary0fOpCode(kX86Subps), // parallel FP subtraction 32 bits x 4
Binary0fOpCode(kX86Subpd), // parallel FP subtraction 64 bits x 2
Binary0fOpCode(kX86Pand), // parallel AND 128 bits x 1
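
SSE2 has no packed 64-bit low multiply, which is why kX86Pmuludq, kX86Paddq and the 64-bit shifts appear here. The sketch below is the standard emulation built from those opcodes, shown only as an illustration and not necessarily the exact sequence this backend emits:

  #include <emmintrin.h>  // SSE2

  // Packed 64-bit low multiply from pmuludq/psrlq/psllq/paddq:
  // a*b (mod 2^64) = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32)
  static inline __m128i MulLow64(__m128i a, __m128i b) {
    __m128i lo_lo = _mm_mul_epu32(a, b);                      // pmuludq
    __m128i a_hi  = _mm_srli_epi64(a, 32);                    // psrlq
    __m128i b_hi  = _mm_srli_epi64(b, 32);                    // psrlq
    __m128i cross = _mm_add_epi64(_mm_mul_epu32(a_hi, b),     // hi(a)*lo(b)
                                  _mm_mul_epu32(a, b_hi));    // lo(a)*hi(b)
    return _mm_add_epi64(lo_lo, _mm_slli_epi64(cross, 32));   // psllq + paddq
  }
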
@@ -593,6 +600,7 @@ enum X86OpCode {
kX86PsrlwRI, // logical right shift of floating point registers 16 bits x 8
kX86PsrldRI, // logical right shift of floating point registers 32 bits x 4
kX86PsrlqRI, // logical right shift of floating point registers 64 bits x 2
+ kX86PsrldqRI, // logical right shift of 128-bit vector register, immediate count in bytes
kX86PsllwRI, // left shift of floating point registers 16 bits x 8
kX86PslldRI, // left shift of floating point registers 32 bits x 4
kX86PsllqRI, // left shift of floating point registers 64 bits x 2
@@ -607,8 +615,8 @@ enum X86OpCode {
kX86Fprem, // remainder from dividing of two floating point values
kX86Fucompp, // compare floating point values and pop x87 fp stack twice
kX86Fstsw16R, // store FPU status word
- Binary0fOpCode(kX86Mova128), // move 128 bits aligned
- kX86Mova128MR, kX86Mova128AR, // store 128 bit aligned from xmm1 to m128
+ Binary0fOpCode(kX86Movdqa), // move 128 bits aligned
+ kX86MovdqaMR, kX86MovdqaAR, // store 128 bit aligned from xmm1 to m128
Binary0fOpCode(kX86Movups), // load unaligned packed single FP values from xmm2/m128 to xmm1
kX86MovupsMR, kX86MovupsAR, // store unaligned packed single FP values from xmm1 to m128
Binary0fOpCode(kX86Movaps), // load aligned packed single FP values from xmm2/m128 to xmm1
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 7551add..1848abe 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -558,14 +558,19 @@ DISASSEMBLER_ENTRY(cmp,
has_modrm = true;
src_reg_file = dst_reg_file = SSE;
break;
- case 0x62:
+ case 0x60: case 0x61: case 0x62: case 0x6C:
if (prefix[2] == 0x66) {
src_reg_file = dst_reg_file = SSE;
prefix[2] = 0; // Clear prefix now. It has served its purpose as part of the opcode.
} else {
src_reg_file = dst_reg_file = MMX;
}
- opcode << "punpckldq";
+ switch (*instr) {
+ case 0x60: opcode << "punpcklbw"; break;
+ case 0x61: opcode << "punpcklwd"; break;
+ case 0x62: opcode << "punpckldq"; break;
+ case 0x6C: opcode << "punpcklqdq"; break;
+ }
load = true;
has_modrm = true;
break;
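
For the unpack decodes above, the 0x66 operand-size prefix is what selects the xmm (SSE) form over the mm (MMX) form of the same 0F 60/61/62/6C opcode bytes. A few reference encodings (illustrative byte patterns only, not disassembler API calls):

  #include <cstdint>

  static const uint8_t kPunpcklbwXmm[]  = {0x66, 0x0F, 0x60, 0xC1};  // punpcklbw xmm0, xmm1
  static const uint8_t kPunpcklwdMmx[]  = {0x0F, 0x61, 0xC1};        // punpcklwd mm0, mm1
  static const uint8_t kPunpcklqdqXmm[] = {0x66, 0x0F, 0x6C, 0xC2};  // punpcklqdq xmm0, xmm2
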
@@ -650,7 +655,7 @@ DISASSEMBLER_ENTRY(cmp,
} else {
dst_reg_file = MMX;
}
- static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "unknown-73", "unknown-73", "unknown-73", "psllq", "unknown-73"};
+ static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "psrldq", "unknown-73", "unknown-73", "psllq", "unknown-73"};
modrm_opcodes = x73_opcodes;
reg_is_opcode = true;
has_modrm = true;
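
In the 0F 73 group the operation is chosen by the reg field of the ModRM byte (hence reg_is_opcode), and the new /3 slot is psrldq with an immediate byte count. An example encoding for reference:

  // ModRM 0xDB = mod 11, reg 011 (/3 -> psrldq), rm 011 (xmm3); immediate is 8 bytes.
  static const uint8_t kPsrldqXmm3By8[] = {0x66, 0x0F, 0x73, 0xDB, 0x08};  // psrldq xmm3, 8
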
@@ -800,6 +805,18 @@ DISASSEMBLER_ENTRY(cmp,
opcode << "bswap";
reg_in_opcode = true;
break;
+ case 0xD4:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0;
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "paddq";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
case 0xDB:
if (prefix[2] == 0x66) {
src_reg_file = dst_reg_file = SSE;
@@ -847,66 +864,14 @@ DISASSEMBLER_ENTRY(cmp,
has_modrm = true;
load = true;
break;
+ case 0xF4:
+ case 0xF6:
case 0xF8:
- if (prefix[2] == 0x66) {
- src_reg_file = dst_reg_file = SSE;
- prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
- } else {
- src_reg_file = dst_reg_file = MMX;
- }
- opcode << "psubb";
- prefix[2] = 0;
- has_modrm = true;
- load = true;
- break;
case 0xF9:
- if (prefix[2] == 0x66) {
- src_reg_file = dst_reg_file = SSE;
- prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
- } else {
- src_reg_file = dst_reg_file = MMX;
- }
- opcode << "psubw";
- prefix[2] = 0;
- has_modrm = true;
- load = true;
- break;
case 0xFA:
- if (prefix[2] == 0x66) {
- src_reg_file = dst_reg_file = SSE;
- prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
- } else {
- src_reg_file = dst_reg_file = MMX;
- }
- opcode << "psubd";
- prefix[2] = 0;
- has_modrm = true;
- load = true;
- break;
+ case 0xFB:
case 0xFC:
- if (prefix[2] == 0x66) {
- src_reg_file = dst_reg_file = SSE;
- prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
- } else {
- src_reg_file = dst_reg_file = MMX;
- }
- opcode << "paddb";
- prefix[2] = 0;
- has_modrm = true;
- load = true;
- break;
case 0xFD:
- if (prefix[2] == 0x66) {
- src_reg_file = dst_reg_file = SSE;
- prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
- } else {
- src_reg_file = dst_reg_file = MMX;
- }
- opcode << "paddw";
- prefix[2] = 0;
- has_modrm = true;
- load = true;
- break;
case 0xFE:
if (prefix[2] == 0x66) {
src_reg_file = dst_reg_file = SSE;
@@ -914,7 +879,17 @@ DISASSEMBLER_ENTRY(cmp,
} else {
src_reg_file = dst_reg_file = MMX;
}
- opcode << "paddd";
+ switch (*instr) {
+ case 0xF4: opcode << "pmuludq"; break;
+ case 0xF6: opcode << "psadbw"; break;
+ case 0xF8: opcode << "psubb"; break;
+ case 0xF9: opcode << "psubw"; break;
+ case 0xFA: opcode << "psubd"; break;
+ case 0xFB: opcode << "psubq"; break;
+ case 0xFC: opcode << "paddb"; break;
+ case 0xFD: opcode << "paddw"; break;
+ case 0xFE: opcode << "paddd"; break;
+ }
prefix[2] = 0;
has_modrm = true;
load = true;
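
With the packed add/subtract/multiply cases folded into a single switch, each mnemonic is picked from the opcode byte alone once the 0x66 prefix has chosen SSE over MMX. Reference encodings for the opcodes newly recognized here (illustrative byte patterns only):

  #include <cstdint>

  static const uint8_t kPaddqXmm[]   = {0x66, 0x0F, 0xD4, 0xC1};  // paddq   xmm0, xmm1
  static const uint8_t kPmuludqXmm[] = {0x66, 0x0F, 0xF4, 0xC1};  // pmuludq xmm0, xmm1
  static const uint8_t kPsadbwXmm[]  = {0x66, 0x0F, 0xF6, 0xC1};  // psadbw  xmm0, xmm1
  static const uint8_t kPsubqMmx[]   = {0x0F, 0xFB, 0xC1};        // psubq   mm0, mm1
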