diff options
24 files changed, 629 insertions, 52 deletions
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index bd6e943..9b1ef17 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -378,10 +378,14 @@ CodeGenerator* CodeGenerator::Create(HGraph* graph, case kMips: return nullptr; case kX86: { - return new x86::CodeGeneratorX86(graph, compiler_options); + return new x86::CodeGeneratorX86(graph, + *isa_features.AsX86InstructionSetFeatures(), + compiler_options); } case kX86_64: { - return new x86_64::CodeGeneratorX86_64(graph, compiler_options); + return new x86_64::CodeGeneratorX86_64(graph, + *isa_features.AsX86_64InstructionSetFeatures(), + compiler_options); } default: return nullptr; diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 9b7e01c..f79dbc3 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -360,7 +360,9 @@ size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32 return GetFloatingPointSpillSlotSize(); } -CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options) +CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, + const X86InstructionSetFeatures& isa_features, + const CompilerOptions& compiler_options) : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfXmmRegisters, @@ -373,7 +375,8 @@ CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compile block_labels_(graph->GetArena(), 0), location_builder_(graph, this), instruction_visitor_(graph, this), - move_resolver_(graph->GetArena(), this) { + move_resolver_(graph->GetArena(), this), + isa_features_(isa_features) { // Use a fake return address register to mimic Quick. AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister)); } @@ -1163,7 +1166,7 @@ void InstructionCodeGeneratorX86::VisitReturn(HReturn* ret) { } void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { - IntrinsicLocationsBuilderX86 intrinsic(GetGraph()->GetArena()); + IntrinsicLocationsBuilderX86 intrinsic(codegen_); if (intrinsic.TryDispatch(invoke)) { return; } diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 2a26c86..0cc3c65 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -189,7 +189,9 @@ class InstructionCodeGeneratorX86 : public HGraphVisitor { class CodeGeneratorX86 : public CodeGenerator { public: - CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options); + CodeGeneratorX86(HGraph* graph, + const X86InstructionSetFeatures& isa_features, + const CompilerOptions& compiler_options); virtual ~CodeGeneratorX86() {} void GenerateFrameEntry() OVERRIDE; @@ -275,6 +277,10 @@ class CodeGeneratorX86 : public CodeGenerator { Label* GetFrameEntryLabel() { return &frame_entry_label_; } + const X86InstructionSetFeatures& GetInstructionSetFeatures() const { + return isa_features_; + } + private: // Labels for each block that will be compiled. GrowableArray<Label> block_labels_; @@ -283,6 +289,7 @@ class CodeGeneratorX86 : public CodeGenerator { InstructionCodeGeneratorX86 instruction_visitor_; ParallelMoveResolverX86 move_resolver_; X86Assembler assembler_; + const X86InstructionSetFeatures& isa_features_; DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86); }; diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index b8940e3..9958451 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -411,7 +411,9 @@ size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uin static constexpr int kNumberOfCpuRegisterPairs = 0; // Use a fake return address register to mimic Quick. static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1); -CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options) +CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, + const X86_64InstructionSetFeatures& isa_features, + const CompilerOptions& compiler_options) : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfFloatRegisters, @@ -425,7 +427,8 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& c block_labels_(graph->GetArena(), 0), location_builder_(graph, this), instruction_visitor_(graph, this), - move_resolver_(graph->GetArena(), this) { + move_resolver_(graph->GetArena(), this), + isa_features_(isa_features) { AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister)); } @@ -1233,7 +1236,7 @@ Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type } void LocationsBuilderX86_64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { - IntrinsicLocationsBuilderX86_64 intrinsic(GetGraph()->GetArena()); + IntrinsicLocationsBuilderX86_64 intrinsic(codegen_); if (intrinsic.TryDispatch(invoke)) { return; } @@ -1294,7 +1297,7 @@ void LocationsBuilderX86_64::HandleInvoke(HInvoke* invoke) { } void LocationsBuilderX86_64::VisitInvokeVirtual(HInvokeVirtual* invoke) { - IntrinsicLocationsBuilderX86_64 intrinsic(GetGraph()->GetArena()); + IntrinsicLocationsBuilderX86_64 intrinsic(codegen_); if (intrinsic.TryDispatch(invoke)) { return; } diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 4b8f087..375c0b0 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -195,7 +195,9 @@ class InstructionCodeGeneratorX86_64 : public HGraphVisitor { class CodeGeneratorX86_64 : public CodeGenerator { public: - CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options); + CodeGeneratorX86_64(HGraph* graph, + const X86_64InstructionSetFeatures& isa_features, + const CompilerOptions& compiler_options); virtual ~CodeGeneratorX86_64() {} void GenerateFrameEntry() OVERRIDE; @@ -268,6 +270,10 @@ class CodeGeneratorX86_64 : public CodeGenerator { void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, CpuRegister temp); + const X86_64InstructionSetFeatures& GetInstructionSetFeatures() const { + return isa_features_; + } + private: // Labels for each block that will be compiled. GrowableArray<Label> block_labels_; @@ -276,6 +282,7 @@ class CodeGeneratorX86_64 : public CodeGenerator { InstructionCodeGeneratorX86_64 instruction_visitor_; ParallelMoveResolverX86_64 move_resolver_; X86_64Assembler assembler_; + const X86_64InstructionSetFeatures& isa_features_; DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64); }; diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 6053ad5..2be117b 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -19,6 +19,8 @@ #include "arch/instruction_set.h" #include "arch/arm/instruction_set_features_arm.h" #include "arch/arm64/instruction_set_features_arm64.h" +#include "arch/x86/instruction_set_features_x86.h" +#include "arch/x86_64/instruction_set_features_x86_64.h" #include "base/macros.h" #include "builder.h" #include "code_generator_arm.h" @@ -108,7 +110,9 @@ static void RunCodeBaseline(HGraph* graph, bool has_result, Expected expected) { InternalCodeAllocator allocator; CompilerOptions compiler_options; - x86::CodeGeneratorX86 codegenX86(graph, compiler_options); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); // We avoid doing a stack overflow check that requires the runtime being setup, // by making sure the compiler knows the methods we are running are leaf methods. codegenX86.CompileBaseline(&allocator, true); @@ -124,7 +128,9 @@ static void RunCodeBaseline(HGraph* graph, bool has_result, Expected expected) { Run(allocator, codegenARM, has_result, expected); } - x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options); + std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64( + X86_64InstructionSetFeatures::FromCppDefines()); + x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options); codegenX86_64.CompileBaseline(&allocator, true); if (kRuntimeISA == kX86_64) { Run(allocator, codegenX86_64, has_result, expected); @@ -175,10 +181,14 @@ static void RunCodeOptimized(HGraph* graph, compiler_options); RunCodeOptimized(&codegenARM64, graph, hook_before_codegen, has_result, expected); } else if (kRuntimeISA == kX86) { - x86::CodeGeneratorX86 codegenX86(graph, compiler_options); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); RunCodeOptimized(&codegenX86, graph, hook_before_codegen, has_result, expected); } else if (kRuntimeISA == kX86_64) { - x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options); + std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64( + X86_64InstructionSetFeatures::FromCppDefines()); + x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options); RunCodeOptimized(&codegenX86_64, graph, hook_before_codegen, has_result, expected); } } diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc index 6853d54..02ad675 100644 --- a/compiler/optimizing/constant_folding_test.cc +++ b/compiler/optimizing/constant_folding_test.cc @@ -16,6 +16,7 @@ #include <functional> +#include "arch/x86/instruction_set_features_x86.h" #include "code_generator_x86.h" #include "constant_folding.h" #include "dead_code_elimination.h" @@ -46,7 +47,9 @@ static void TestCode(const uint16_t* data, std::string actual_before = printer_before.str(); ASSERT_EQ(expected_before, actual_before); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), CompilerOptions()); HConstantFolding(graph).Run(); SSAChecker ssa_checker_cf(&allocator, graph); ssa_checker_cf.Run(); diff --git a/compiler/optimizing/dead_code_elimination_test.cc b/compiler/optimizing/dead_code_elimination_test.cc index a644719..98ae1ec 100644 --- a/compiler/optimizing/dead_code_elimination_test.cc +++ b/compiler/optimizing/dead_code_elimination_test.cc @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "arch/x86/instruction_set_features_x86.h" #include "code_generator_x86.h" #include "dead_code_elimination.h" #include "driver/compiler_options.h" @@ -40,7 +41,9 @@ static void TestCode(const uint16_t* data, std::string actual_before = printer_before.str(); ASSERT_EQ(actual_before, expected_before); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), CompilerOptions()); HDeadCodeElimination(graph).Run(); SSAChecker ssa_checker(&allocator, graph); ssa_checker.Run(); diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 384737f..0740471 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -16,6 +16,7 @@ #include "intrinsics_x86.h" +#include "arch/x86/instruction_set_features_x86.h" #include "code_generator_x86.h" #include "entrypoints/quick/quick_entrypoints.h" #include "intrinsics.h" @@ -34,6 +35,11 @@ static constexpr int kDoubleNaNHigh = 0x7FF80000; static constexpr int kDoubleNaNLow = 0x00000000; static constexpr int kFloatNaN = 0x7FC00000; +IntrinsicLocationsBuilderX86::IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen) + : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) { +} + + X86Assembler* IntrinsicCodeGeneratorX86::GetAssembler() { return reinterpret_cast<X86Assembler*>(codegen_->GetAssembler()); } @@ -719,6 +725,148 @@ void IntrinsicCodeGeneratorX86::VisitMathSqrt(HInvoke* invoke) { GetAssembler()->sqrtsd(out, in); } +static void InvokeOutOfLineIntrinsic(CodeGeneratorX86* codegen, HInvoke* invoke) { + MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen); + + DCHECK(invoke->IsInvokeStaticOrDirect()); + codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), EAX); + + // Copy the result back to the expected output. + Location out = invoke->GetLocations()->Out(); + if (out.IsValid()) { + DCHECK(out.IsRegister()); + MoveFromReturnRegister(out, invoke->GetType(), codegen); + } +} + +static void CreateSSE41FPToFPLocations(ArenaAllocator* arena, + HInvoke* invoke, + CodeGeneratorX86* codegen) { + // Do we have instruction support? + if (codegen->GetInstructionSetFeatures().HasSSE4_1()) { + CreateFPToFPLocations(arena, invoke); + return; + } + + // We have to fall back to a call to the intrinsic. + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kCall); + InvokeRuntimeCallingConvention calling_convention; + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetOut(Location::FpuRegisterLocation(XMM0)); + // Needs to be EAX for the invoke. + locations->AddTemp(Location::RegisterLocation(EAX)); +} + +static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86* codegen, + HInvoke* invoke, + X86Assembler* assembler, + int round_mode) { + LocationSummary* locations = invoke->GetLocations(); + if (locations->WillCall()) { + InvokeOutOfLineIntrinsic(codegen, invoke); + } else { + XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); + __ roundsd(out, in, Immediate(round_mode)); + } +} + +void IntrinsicLocationsBuilderX86::VisitMathCeil(HInvoke* invoke) { + CreateSSE41FPToFPLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86::VisitMathCeil(HInvoke* invoke) { + GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2); +} + +void IntrinsicLocationsBuilderX86::VisitMathFloor(HInvoke* invoke) { + CreateSSE41FPToFPLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86::VisitMathFloor(HInvoke* invoke) { + GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1); +} + +void IntrinsicLocationsBuilderX86::VisitMathRint(HInvoke* invoke) { + CreateSSE41FPToFPLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) { + GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0); +} + +// Note that 32 bit x86 doesn't have the capability to inline MathRoundDouble, +// as it needs 64 bit instructions. +void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { + // Do we have instruction support? + if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + return; + } + + // We have to fall back to a call to the intrinsic. + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kCall); + InvokeRuntimeCallingConvention calling_convention; + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetOut(Location::RegisterLocation(EAX)); + // Needs to be EAX for the invoke. + locations->AddTemp(Location::RegisterLocation(EAX)); +} + +void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) { + LocationSummary* locations = invoke->GetLocations(); + if (locations->WillCall()) { + InvokeOutOfLineIntrinsic(codegen_, invoke); + return; + } + + // Implement RoundFloat as t1 = floor(input + 0.5f); convert to int. + XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); + Register out = locations->Out().AsRegister<Register>(); + XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + Label done, nan; + X86Assembler* assembler = GetAssembler(); + + // Generate 0.5 into inPlusPointFive. + __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f))); + __ movd(inPlusPointFive, out); + + // Add in the input. + __ addss(inPlusPointFive, in); + + // And truncate to an integer. + __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1)); + + __ movl(out, Immediate(kPrimIntMax)); + // maxInt = int-to-float(out) + __ cvtsi2ss(maxInt, out); + + // if inPlusPointFive >= maxInt goto done + __ comiss(inPlusPointFive, maxInt); + __ j(kAboveEqual, &done); + + // if input == NaN goto nan + __ j(kUnordered, &nan); + + // output = float-to-int-truncate(input) + __ cvttss2si(out, inPlusPointFive); + __ jmp(&done); + __ Bind(&nan); + + // output = 0 + __ xorl(out, out); + __ Bind(&done); +} + void IntrinsicLocationsBuilderX86::VisitStringCharAt(HInvoke* invoke) { // The inputs plus one temp. LocationSummary* locations = new (arena_) LocationSummary(invoke, @@ -1191,11 +1339,7 @@ void IntrinsicCodeGeneratorX86::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) UNIMPLEMENTED_INTRINSIC(IntegerReverse) UNIMPLEMENTED_INTRINSIC(LongReverse) UNIMPLEMENTED_INTRINSIC(LongReverseBytes) -UNIMPLEMENTED_INTRINSIC(MathFloor) -UNIMPLEMENTED_INTRINSIC(MathCeil) -UNIMPLEMENTED_INTRINSIC(MathRint) UNIMPLEMENTED_INTRINSIC(MathRoundDouble) -UNIMPLEMENTED_INTRINSIC(MathRoundFloat) UNIMPLEMENTED_INTRINSIC(StringIndexOf) UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter) UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar) diff --git a/compiler/optimizing/intrinsics_x86.h b/compiler/optimizing/intrinsics_x86.h index e1e8260..4292ec7 100644 --- a/compiler/optimizing/intrinsics_x86.h +++ b/compiler/optimizing/intrinsics_x86.h @@ -32,7 +32,7 @@ class X86Assembler; class IntrinsicLocationsBuilderX86 FINAL : public IntrinsicVisitor { public: - explicit IntrinsicLocationsBuilderX86(ArenaAllocator* arena) : arena_(arena) {} + explicit IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen); // Define visitor methods. @@ -50,6 +50,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) private: ArenaAllocator* arena_; + CodeGeneratorX86* codegen_; DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderX86); }; diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 736cea8..f6fa013 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -16,6 +16,7 @@ #include "intrinsics_x86_64.h" +#include "arch/x86_64/instruction_set_features_x86_64.h" #include "code_generator_x86_64.h" #include "entrypoints/quick/quick_entrypoints.h" #include "intrinsics.h" @@ -30,6 +31,11 @@ namespace art { namespace x86_64 { +IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen) + : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) { +} + + X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() { return reinterpret_cast<X86_64Assembler*>(codegen_->GetAssembler()); } @@ -614,6 +620,203 @@ void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) { GetAssembler()->sqrtsd(out, in); } +static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) { + MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen); + + DCHECK(invoke->IsInvokeStaticOrDirect()); + codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), CpuRegister(RDI)); + codegen->RecordPcInfo(invoke, invoke->GetDexPc()); + + // Copy the result back to the expected output. + Location out = invoke->GetLocations()->Out(); + if (out.IsValid()) { + DCHECK(out.IsRegister()); + MoveFromReturnRegister(out, invoke->GetType(), codegen); + } +} + +static void CreateSSE41FPToFPLocations(ArenaAllocator* arena, + HInvoke* invoke, + CodeGeneratorX86_64* codegen) { + // Do we have instruction support? + if (codegen->GetInstructionSetFeatures().HasSSE4_1()) { + CreateFPToFPLocations(arena, invoke); + return; + } + + // We have to fall back to a call to the intrinsic. + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kCall); + InvokeRuntimeCallingConvention calling_convention; + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetOut(Location::FpuRegisterLocation(XMM0)); + // Needs to be RDI for the invoke. + locations->AddTemp(Location::RegisterLocation(RDI)); +} + +static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen, + HInvoke* invoke, + X86_64Assembler* assembler, + int round_mode) { + LocationSummary* locations = invoke->GetLocations(); + if (locations->WillCall()) { + InvokeOutOfLineIntrinsic(codegen, invoke); + } else { + XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); + __ roundsd(out, in, Immediate(round_mode)); + } +} + +void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) { + CreateSSE41FPToFPLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) { + GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2); +} + +void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) { + CreateSSE41FPToFPLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) { + GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1); +} + +void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) { + CreateSSE41FPToFPLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) { + GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0); +} + +static void CreateSSE41FPToIntLocations(ArenaAllocator* arena, + HInvoke* invoke, + CodeGeneratorX86_64* codegen) { + // Do we have instruction support? + if (codegen->GetInstructionSetFeatures().HasSSE4_1()) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + return; + } + + // We have to fall back to a call to the intrinsic. + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kCall); + InvokeRuntimeCallingConvention calling_convention; + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetOut(Location::RegisterLocation(RAX)); + // Needs to be RDI for the invoke. + locations->AddTemp(Location::RegisterLocation(RDI)); +} + +void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) { + CreateSSE41FPToIntLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) { + LocationSummary* locations = invoke->GetLocations(); + if (locations->WillCall()) { + InvokeOutOfLineIntrinsic(codegen_, invoke); + return; + } + + // Implement RoundFloat as t1 = floor(input + 0.5f); convert to int. + XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + Label done, nan; + X86_64Assembler* assembler = GetAssembler(); + + // Generate 0.5 into inPlusPointFive. + __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f))); + __ movd(inPlusPointFive, out, false); + + // Add in the input. + __ addss(inPlusPointFive, in); + + // And truncate to an integer. + __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1)); + + __ movl(out, Immediate(kPrimIntMax)); + // maxInt = int-to-float(out) + __ cvtsi2ss(maxInt, out); + + // if inPlusPointFive >= maxInt goto done + __ comiss(inPlusPointFive, maxInt); + __ j(kAboveEqual, &done); + + // if input == NaN goto nan + __ j(kUnordered, &nan); + + // output = float-to-int-truncate(input) + __ cvttss2si(out, inPlusPointFive); + __ jmp(&done); + __ Bind(&nan); + + // output = 0 + __ xorl(out, out); + __ Bind(&done); +} + +void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) { + CreateSSE41FPToIntLocations(arena_, invoke, codegen_); +} + +void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) { + LocationSummary* locations = invoke->GetLocations(); + if (locations->WillCall()) { + InvokeOutOfLineIntrinsic(codegen_, invoke); + return; + } + + // Implement RoundDouble as t1 = floor(input + 0.5); convert to long. + XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + XmmRegister maxLong = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + Label done, nan; + X86_64Assembler* assembler = GetAssembler(); + + // Generate 0.5 into inPlusPointFive. + __ movq(out, Immediate(bit_cast<int64_t, double>(0.5))); + __ movd(inPlusPointFive, out, true); + + // Add in the input. + __ addsd(inPlusPointFive, in); + + // And truncate to an integer. + __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1)); + + __ movq(out, Immediate(kPrimLongMax)); + // maxLong = long-to-double(out) + __ cvtsi2sd(maxLong, out, true); + + // if inPlusPointFive >= maxLong goto done + __ comisd(inPlusPointFive, maxLong); + __ j(kAboveEqual, &done); + + // if input == NaN goto nan + __ j(kUnordered, &nan); + + // output = double-to-long-truncate(input) + __ cvttsd2si(out, inPlusPointFive, true); + __ jmp(&done); + __ Bind(&nan); + + // output = 0 + __ xorq(out, out); + __ Bind(&done); +} + void IntrinsicLocationsBuilderX86_64::VisitStringCharAt(HInvoke* invoke) { // The inputs plus one temp. LocationSummary* locations = new (arena_) LocationSummary(invoke, @@ -1009,11 +1212,6 @@ void IntrinsicCodeGeneratorX86_64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSE UNIMPLEMENTED_INTRINSIC(IntegerReverse) UNIMPLEMENTED_INTRINSIC(LongReverse) -UNIMPLEMENTED_INTRINSIC(MathFloor) -UNIMPLEMENTED_INTRINSIC(MathCeil) -UNIMPLEMENTED_INTRINSIC(MathRint) -UNIMPLEMENTED_INTRINSIC(MathRoundDouble) -UNIMPLEMENTED_INTRINSIC(MathRoundFloat) UNIMPLEMENTED_INTRINSIC(StringIndexOf) UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter) UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar) diff --git a/compiler/optimizing/intrinsics_x86_64.h b/compiler/optimizing/intrinsics_x86_64.h index dfae7fa..0e0e72c 100644 --- a/compiler/optimizing/intrinsics_x86_64.h +++ b/compiler/optimizing/intrinsics_x86_64.h @@ -32,7 +32,7 @@ class X86_64Assembler; class IntrinsicLocationsBuilderX86_64 FINAL : public IntrinsicVisitor { public: - explicit IntrinsicLocationsBuilderX86_64(ArenaAllocator* arena) : arena_(arena) {} + explicit IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen); // Define visitor methods. @@ -50,6 +50,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) private: ArenaAllocator* arena_; + CodeGeneratorX86_64* codegen_; DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderX86_64); }; diff --git a/compiler/optimizing/linearize_test.cc b/compiler/optimizing/linearize_test.cc index f22b7a7..28c5555 100644 --- a/compiler/optimizing/linearize_test.cc +++ b/compiler/optimizing/linearize_test.cc @@ -16,6 +16,7 @@ #include <fstream> +#include "arch/x86/instruction_set_features_x86.h" #include "base/arena_allocator.h" #include "base/stringprintf.h" #include "builder.h" @@ -46,7 +47,9 @@ static void TestCode(const uint16_t* data, const int* expected_order, size_t num graph->TryBuildingSsa(); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc index c102c4f..61d6593 100644 --- a/compiler/optimizing/live_ranges_test.cc +++ b/compiler/optimizing/live_ranges_test.cc @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "arch/x86/instruction_set_features_x86.h" #include "base/arena_allocator.h" #include "builder.h" #include "code_generator.h" @@ -65,7 +66,9 @@ TEST(LiveRangesTest, CFG1) { ArenaAllocator allocator(&pool); HGraph* graph = BuildGraph(data, &allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -111,7 +114,9 @@ TEST(LiveRangesTest, CFG2) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = BuildGraph(data, &allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -160,7 +165,9 @@ TEST(LiveRangesTest, CFG3) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = BuildGraph(data, &allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -237,7 +244,9 @@ TEST(LiveRangesTest, Loop1) { ArenaAllocator allocator(&pool); HGraph* graph = BuildGraph(data, &allocator); RemoveSuspendChecks(graph); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -315,7 +324,9 @@ TEST(LiveRangesTest, Loop2) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = BuildGraph(data, &allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -391,7 +402,9 @@ TEST(LiveRangesTest, CFG4) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = BuildGraph(data, &allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); diff --git a/compiler/optimizing/liveness_test.cc b/compiler/optimizing/liveness_test.cc index 0b0cfde..81250ca 100644 --- a/compiler/optimizing/liveness_test.cc +++ b/compiler/optimizing/liveness_test.cc @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "arch/x86/instruction_set_features_x86.h" #include "base/arena_allocator.h" #include "builder.h" #include "code_generator.h" @@ -53,7 +54,9 @@ static void TestCode(const uint16_t* data, const char* expected) { graph->TryBuildingSsa(); // `Inline` conditions into ifs. PrepareForRegisterAllocation(graph).Run(); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc index 7c3a035..3951439 100644 --- a/compiler/optimizing/register_allocator_test.cc +++ b/compiler/optimizing/register_allocator_test.cc @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "arch/x86/instruction_set_features_x86.h" #include "base/arena_allocator.h" #include "builder.h" #include "code_generator.h" @@ -42,7 +43,9 @@ static bool Check(const uint16_t* data) { const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data); builder.BuildGraph(*item); graph->TryBuildingSsa(); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); RegisterAllocator register_allocator(&allocator, &codegen, liveness); @@ -58,7 +61,9 @@ TEST(RegisterAllocatorTest, ValidateIntervals) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = new (&allocator) HGraph(&allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); GrowableArray<LiveInterval*> intervals(&allocator, 0); // Test with two intervals of the same range. @@ -298,7 +303,9 @@ TEST(RegisterAllocatorTest, Loop3) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = BuildSSAGraph(data, &allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); RegisterAllocator register_allocator(&allocator, &codegen, liveness); @@ -330,7 +337,9 @@ TEST(RegisterAllocatorTest, FirstRegisterUse) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = BuildSSAGraph(data, &allocator); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -383,7 +392,9 @@ TEST(RegisterAllocatorTest, DeadPhi) { ArenaAllocator allocator(&pool); HGraph* graph = BuildSSAGraph(data, &allocator); SsaDeadPhiElimination(graph).Run(); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); RegisterAllocator register_allocator(&allocator, &codegen, liveness); @@ -405,7 +416,9 @@ TEST(RegisterAllocatorTest, FreeUntil) { ArenaAllocator allocator(&pool); HGraph* graph = BuildSSAGraph(data, &allocator); SsaDeadPhiElimination(graph).Run(); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); RegisterAllocator register_allocator(&allocator, &codegen, liveness); @@ -507,7 +520,9 @@ TEST(RegisterAllocatorTest, PhiHint) { { HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -522,7 +537,9 @@ TEST(RegisterAllocatorTest, PhiHint) { { HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -539,7 +556,9 @@ TEST(RegisterAllocatorTest, PhiHint) { { HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -556,7 +575,9 @@ TEST(RegisterAllocatorTest, PhiHint) { { HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -608,7 +629,9 @@ TEST(RegisterAllocatorTest, ExpectedInRegisterHint) { { HGraph* graph = BuildFieldReturn(&allocator, &field, &ret); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -621,7 +644,9 @@ TEST(RegisterAllocatorTest, ExpectedInRegisterHint) { { HGraph* graph = BuildFieldReturn(&allocator, &field, &ret); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -671,7 +696,9 @@ TEST(RegisterAllocatorTest, SameAsFirstInputHint) { { HGraph* graph = BuildTwoSubs(&allocator, &first_sub, &second_sub); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -685,7 +712,9 @@ TEST(RegisterAllocatorTest, SameAsFirstInputHint) { { HGraph* graph = BuildTwoSubs(&allocator, &first_sub, &second_sub); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -734,7 +763,9 @@ TEST(RegisterAllocatorTest, ExpectedExactInRegisterAndSameOutputHint) { { HGraph* graph = BuildDiv(&allocator, &div); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); liveness.Analyze(); @@ -822,7 +853,9 @@ TEST(RegisterAllocatorTest, SpillInactive) { locations = new (&allocator) LocationSummary(fourth->GetDefinedBy(), LocationSummary::kNoCall); locations->SetOut(Location::RequiresRegister()); - x86::CodeGeneratorX86 codegen(graph, CompilerOptions()); + std::unique_ptr<const X86InstructionSetFeatures> features_x86( + X86InstructionSetFeatures::FromCppDefines()); + x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(*graph, &codegen); RegisterAllocator register_allocator(&allocator, &codegen, liveness); diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h index 6f8b301..b13edb6 100644 --- a/compiler/utils/assembler_test.h +++ b/compiler/utils/assembler_test.h @@ -123,6 +123,16 @@ class AssemblerTest : public testing::Test { fmt); } + std::string RepeatFFI(void (Ass::*f)(FPReg, FPReg, const Imm&), size_t imm_bytes, std::string fmt) { + return RepeatTemplatedRegistersImm<FPReg, FPReg>(f, + GetFPRegisters(), + GetFPRegisters(), + &AssemblerTest::GetFPRegName, + &AssemblerTest::GetFPRegName, + imm_bytes, + fmt); + } + std::string RepeatFR(void (Ass::*f)(FPReg, Reg), std::string fmt) { return RepeatTemplatedRegisters<FPReg, Reg>(f, GetFPRegisters(), @@ -448,6 +458,57 @@ class AssemblerTest : public testing::Test { return str; } + template <typename Reg1, typename Reg2> + std::string RepeatTemplatedRegistersImm(void (Ass::*f)(Reg1, Reg2, const Imm&), + const std::vector<Reg1*> reg1_registers, + const std::vector<Reg2*> reg2_registers, + std::string (AssemblerTest::*GetName1)(const Reg1&), + std::string (AssemblerTest::*GetName2)(const Reg2&), + size_t imm_bytes, + std::string fmt) { + std::vector<int64_t> imms = CreateImmediateValues(imm_bytes); + WarnOnCombinations(reg1_registers.size() * reg2_registers.size() * imms.size()); + + std::string str; + for (auto reg1 : reg1_registers) { + for (auto reg2 : reg2_registers) { + for (int64_t imm : imms) { + Imm new_imm = CreateImmediate(imm); + (assembler_.get()->*f)(*reg1, *reg2, new_imm); + std::string base = fmt; + + std::string reg1_string = (this->*GetName1)(*reg1); + size_t reg1_index; + while ((reg1_index = base.find(REG1_TOKEN)) != std::string::npos) { + base.replace(reg1_index, ConstexprStrLen(REG1_TOKEN), reg1_string); + } + + std::string reg2_string = (this->*GetName2)(*reg2); + size_t reg2_index; + while ((reg2_index = base.find(REG2_TOKEN)) != std::string::npos) { + base.replace(reg2_index, ConstexprStrLen(REG2_TOKEN), reg2_string); + } + + size_t imm_index = base.find(IMM_TOKEN); + if (imm_index != std::string::npos) { + std::ostringstream sreg; + sreg << imm; + std::string imm_string = sreg.str(); + base.replace(imm_index, ConstexprStrLen(IMM_TOKEN), imm_string); + } + + if (str.size() > 0) { + str += "\n"; + } + str += base; + } + } + } + // Add a newline at the end. + str += "\n"; + return str; + } + template <RegisterView kRegView> std::string GetRegName(const Reg& reg) { std::ostringstream sreg; diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 5773459..b3a1376 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -695,6 +695,28 @@ void X86Assembler::ucomisd(XmmRegister a, XmmRegister b) { } +void X86Assembler::roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0x3A); + EmitUint8(0x0B); + EmitXmmRegisterOperand(dst, src); + EmitUint8(imm.value()); +} + + +void X86Assembler::roundss(XmmRegister dst, XmmRegister src, const Immediate& imm) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0x3A); + EmitUint8(0x0A); + EmitXmmRegisterOperand(dst, src); + EmitUint8(imm.value()); +} + + void X86Assembler::sqrtsd(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0xF2); diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index 6ccf2e3..bdf8843 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -312,6 +312,9 @@ class X86Assembler FINAL : public Assembler { void ucomiss(XmmRegister a, XmmRegister b); void ucomisd(XmmRegister a, XmmRegister b); + void roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm); + void roundss(XmmRegister dst, XmmRegister src, const Immediate& imm); + void sqrtsd(XmmRegister dst, XmmRegister src); void sqrtss(XmmRegister dst, XmmRegister src); diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index bd155ed..e82d90c 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -796,6 +796,30 @@ void X86_64Assembler::ucomisd(XmmRegister a, XmmRegister b) { } +void X86_64Assembler::roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0x3A); + EmitUint8(0x0B); + EmitXmmRegisterOperand(dst.LowBits(), src); + EmitUint8(imm.value()); +} + + +void X86_64Assembler::roundss(XmmRegister dst, XmmRegister src, const Immediate& imm) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0x3A); + EmitUint8(0x0A); + EmitXmmRegisterOperand(dst.LowBits(), src); + EmitUint8(imm.value()); +} + + void X86_64Assembler::sqrtsd(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0xF2); diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index 495f74f..39f781c 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -353,6 +353,9 @@ class X86_64Assembler FINAL : public Assembler { void ucomiss(XmmRegister a, XmmRegister b); void ucomisd(XmmRegister a, XmmRegister b); + void roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm); + void roundss(XmmRegister dst, XmmRegister src, const Immediate& imm); + void sqrtsd(XmmRegister dst, XmmRegister src); void sqrtss(XmmRegister dst, XmmRegister src); diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc index 00f508b..4402dfc 100644 --- a/compiler/utils/x86_64/assembler_x86_64_test.cc +++ b/compiler/utils/x86_64/assembler_x86_64_test.cc @@ -692,6 +692,14 @@ TEST_F(AssemblerX86_64Test, Sqrtsd) { DriverStr(RepeatFF(&x86_64::X86_64Assembler::sqrtsd, "sqrtsd %{reg2}, %{reg1}"), "sqrtsd"); } +TEST_F(AssemblerX86_64Test, Roundss) { + DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundss, 1, "roundss ${imm}, %{reg2}, %{reg1}"), "roundss"); +} + +TEST_F(AssemblerX86_64Test, Roundsd) { + DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundsd, 1, "roundsd ${imm}, %{reg2}, %{reg1}"), "roundsd"); +} + TEST_F(AssemblerX86_64Test, Xorps) { DriverStr(RepeatFF(&x86_64::X86_64Assembler::xorps, "xorps %{reg2}, %{reg1}"), "xorps"); } diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc index 203488d..a1834e1 100644 --- a/disassembler/disassembler_x86.cc +++ b/disassembler/disassembler_x86.cc @@ -561,6 +561,24 @@ DISASSEMBLER_ENTRY(cmp, instr++; if (prefix[2] == 0x66) { switch (*instr) { + case 0x0A: + opcode1 = "roundss"; + prefix[2] = 0; + has_modrm = true; + store = true; + src_reg_file = SSE; + dst_reg_file = SSE; + immediate_bytes = 1; + break; + case 0x0B: + opcode1 = "roundsd"; + prefix[2] = 0; + has_modrm = true; + store = true; + src_reg_file = SSE; + dst_reg_file = SSE; + immediate_bytes = 1; + break; case 0x14: opcode1 = "pextrb"; prefix[2] = 0; diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h index 926fabb..7b61245 100644 --- a/runtime/arch/x86/instruction_set_features_x86.h +++ b/runtime/arch/x86/instruction_set_features_x86.h @@ -58,6 +58,8 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { virtual ~X86InstructionSetFeatures() {} + bool HasSSE4_1() const { return has_SSE4_1_; } + protected: // Parse a string of the form "ssse3" adding these to a new InstructionSetFeatures. virtual const InstructionSetFeatures* |