diff options
-rw-r--r-- | lib/Target/X86/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/Target/X86/X86.h | 5 | ||||
-rw-r--r-- | lib/Target/X86/X86.td | 3 | ||||
-rw-r--r-- | lib/Target/X86/X86FixupLEAs.cpp | 251 | ||||
-rw-r--r-- | lib/Target/X86/X86Subtarget.cpp | 1 | ||||
-rw-r--r-- | lib/Target/X86/X86Subtarget.h | 4 | ||||
-rw-r--r-- | lib/Target/X86/X86TargetMachine.cpp | 5 | ||||
-rw-r--r-- | test/CodeGen/X86/atom-fixup-lea1.ll | 38 | ||||
-rw-r--r-- | test/CodeGen/X86/atom-fixup-lea2.ll | 84 | ||||
-rw-r--r-- | test/CodeGen/X86/atom-fixup-lea3.ll | 51 | ||||
-rw-r--r-- | test/CodeGen/X86/lsr-static-addr.ll | 2 |
11 files changed, 444 insertions, 1 deletions
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index d14899d..7cb71f0 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -33,6 +33,7 @@ set(sources X86TargetObjectFile.cpp X86TargetTransformInfo.cpp X86VZeroUpper.cpp + X86FixupLEAs.cpp ) if( CMAKE_CL_64 ) diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 1f9919f..947002f 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -69,6 +69,11 @@ ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM); /// createX86PadShortFunctions - Return a pass that pads short functions /// with NOOPs. This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); +/// createX86FixupLEAs - Return a pass that selectively replaces +/// certain instructions (like add, sub, inc, dec, some shifts, +/// and some multiplies) by equivalent LEA instructions, in order +/// to eliminate execution delays in some Atom processors. +FunctionPass *createX86FixupLEAs(); } // End llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 306e3ac..87bb68d 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -139,6 +139,8 @@ def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "CallRegIndirect", "true", "Call register indirect">; +def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", + "LEA instruction needs inputs at AG stage">; //===----------------------------------------------------------------------===// // X86 processors supported. 
@@ -188,6 +190,7 @@ def : ProcessorModel<"atom", AtomModel, FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, FeatureSlowDivide, FeatureCallRegIndirect, + FeatureLEAUsesAG, FeaturePadShortFunctions]>; // "Arrandale" along with corei3 and corei5 diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp new file mode 100644 index 0000000..82e6de4 --- /dev/null +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -0,0 +1,251 @@ +//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which will find instructions which +// can be re-written as LEA instructions in order to reduce pipeline +// delays for some models of the Intel Atom family. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-fixup-LEAs" +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +STATISTIC(NumLEAs, "Number of LEA instructions created"); + +namespace { + class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); + + virtual const char *getPassName() const { return "X86 Atom LEA Fixup";} + void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + 
void processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + RegUsageState usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I); + MachineBasicBlock::iterator searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + public: + FixupLEAPass() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + MachineFunction *MF; + const TargetMachine *TM; + const TargetInstrInfo *TII; // Machine instruction info. + LiveVariables *LV; + + }; + char FixupLEAPass::ID = 0; +} + +/// postRAConvertToLEA - if an instruction can be converted to an +/// equivalent LEA, insert the new instruction into the basic block +/// and return a pointer to it. Otherwise, return zero. +MachineInstr * +FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + MachineInstr* MI = MBBI; + MachineInstr* NewMI; + switch (MI->getOpcode()) { + case X86::MOV32rr: + case X86::MOV64rr: { + const MachineOperand& Src = MI->getOperand(1); + const MachineOperand& Dest = MI->getOperand(0); + NewMI = BuildMI(*MF, MI->getDebugLoc(), + TII->get( MI->getOpcode() == X86::MOV32rr ? 
X86::LEA32r : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (!MI->getOperand(2).isImm()) { + // convertToThreeAddress will call getImm() + // which requires isImm() to be true + return 0; + } + } + return TII->convertToThreeAddress(MFI, MBBI, LV); +} + +FunctionPass *llvm::createX86FixupLEAs() { + return new FixupLEAPass(); +} + +/// runOnMachineFunction - Loop over all of the basic blocks, +/// replacing instructions by equivalent LEA instructions +/// if needed and when possible. +bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + TII = Func.getTarget().getInstrInfo(); + TM = &MF->getTarget(); + LV = getAnalysisIfAvailable<LiveVariables>(); + + DEBUG(dbgs() << "Start X86FixupLEAs\n";); + // Process all basic blocks. + for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) + processBasicBlock(Func, I); + DEBUG(dbgs() << "End X86FixupLEAs\n";); + + return true; +} + +/// usesRegister - Determine if an instruction references a machine register +/// and, if so, whether it reads or writes the register. 
+FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I) { + RegUsageState RegUsage = RU_NotUsed; + MachineInstr* MI = I; + + for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand& opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()){ + if (opnd.isDef()) + return RU_Write; + RegUsage = RU_Read; + } + } + return RegUsage; +} + +/// getPreviousInstr - Given a reference to an instruction in a basic +/// block, return a reference to the previous instruction in the block, +/// wrapping around to the last instruction of the block if the block +/// branches to itself. +static inline bool getPreviousInstr(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + if (I == MFI->begin()) { + if (MFI->isPredecessor(MFI)) { + I = --MFI->end(); + return true; + } + else + return false; + } + --I; + return true; +} + +/// searchBackwards - Step backwards through a basic block, looking +/// for an instruction which writes a register within +/// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. 
+MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + int InstrDistance = 1; + MachineBasicBlock::iterator CurInst; + static const int INSTR_DISTANCE_THRESHOLD = 5; + + CurInst = I; + bool Found; + Found = getPreviousInstr(CurInst, MFI); + while( Found && I != CurInst) { + if (CurInst->isCall() || CurInst->isInlineAsm()) + break; + if (InstrDistance > INSTR_DISTANCE_THRESHOLD) + break; // too far back to make a difference + if (usesRegister(p, CurInst) == RU_Write){ + return CurInst; + } + InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); + Found = getPreviousInstr(CurInst, MFI); + } + return 0; +} + +/// processInstruction - Given a memory access or LEA instruction +/// whose address mode uses a base and/or index register, look for +/// an opportunity to replace the instruction which sets the base or index +/// register with an equivalent LEA instruction. +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + // Process a load, store, or LEA instruction. + MachineInstr *MI = I; + int opcode = MI->getOpcode(); + const MCInstrDesc& Desc = MI->getDesc(); + int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); + if (AddrOffset >= 0) { + AddrOffset += X86II::getOperandBias(Desc); + MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + if (p.isReg() && p.getReg() != X86::ESP) { + seekLEAFixup(p, I, MFI); + } + MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + if (q.isReg() && q.getReg() != X86::ESP) { + seekLEAFixup(q, I, MFI); + } + } +} + +/// seekLEAFixup - Given a machine register, look for the instruction +/// which writes it in the current basic block. If found, +/// try to replace it with an equivalent LEA instruction. +/// If replacement succeeds, then also process the newly created +/// instruction. 
+void FixupLEAPass::seekLEAFixup(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + if (MBI) { + MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI, LV); + if (NewMI) { + ++NumLEAs; + DEBUG(dbgs() << "Candidate to replace:"; MBI->dump();); + // now to replace with an equivalent LEA... + DEBUG(dbgs() << "Replaced by: "; NewMI->dump();); + MFI->erase(MBI); + MachineBasicBlock::iterator J = + static_cast<MachineBasicBlock::iterator> (NewMI); + processInstruction(J, MFI); + } + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// replacing adds and shifts with LEA instructions, where appropriate. +bool FixupLEAPass::processBasicBlock(MachineFunction &MF, + MachineFunction::iterator MFI) { + + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) + processInstruction(I, MFI); + return false; +} diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 14619b6..448d2e6 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -467,6 +467,7 @@ void X86Subtarget::initializeEnvironment() { PostRAScheduler = false; PadShortFunctions = false; CallRegIndirect = false; + LEAUsesAG = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 6fbdb1d..66832b9 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -165,6 +165,9 @@ protected: /// CallRegIndirect - True if the Calls with memory reference should be converted /// to a register-based indirect call. bool CallRegIndirect; + /// LEAUsesAG - True if the LEA instruction inputs have to be ready at + /// address generation (AG) time. 
+ bool LEAUsesAG; /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. @@ -278,6 +281,7 @@ public: bool hasSlowDivide() const { return HasSlowDivide; } bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } + bool LEAusesAG() const { return LEAUsesAG; } bool isAtom() const { return X86ProcFamily == IntelAtom; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8aa58a2..00fa47f 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -215,6 +215,11 @@ bool X86PassConfig::addPreEmitPass() { addPass(createX86PadShortFunctions()); ShouldPrint = true; } + if (getOptLevel() != CodeGenOpt::None && + getX86Subtarget().LEAusesAG()){ + addPass(createX86FixupLEAs()); + ShouldPrint = true; + } return ShouldPrint; } diff --git a/test/CodeGen/X86/atom-fixup-lea1.ll b/test/CodeGen/X86/atom-fixup-lea1.ll new file mode 100644 index 0000000..4651bf2 --- /dev/null +++ b/test/CodeGen/X86/atom-fixup-lea1.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s +; CHECK: addl +; CHECK-NEXT:leal +; CHECK-NEXT:decl +; CHECK-NEXT:jne + +; Test for the FixupLEAs pre-emit pass. An LEA should be substituted for the ADD +; that increments the array pointer because it is within 5 instructions of the +; corresponding load. The ADD precedes the load by following the loop back edge. 
+ +; Original C code +;int test(int n, int * array) +;{ +; int sum = 0; +; for(int i = 0; i < n; i++) +; sum += array[i]; +; return sum; +;} + +define i32 @test(i32 %n, i32* nocapture %array) { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %array, i32 %i.06 + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.05 + %inc = add nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} diff --git a/test/CodeGen/X86/atom-fixup-lea2.ll b/test/CodeGen/X86/atom-fixup-lea2.ll new file mode 100644 index 0000000..1855ea1 --- /dev/null +++ b/test/CodeGen/X86/atom-fixup-lea2.ll @@ -0,0 +1,84 @@ +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s +; CHECK:BB#5 +; CHECK-NEXT:leal +; CHECK-NEXT:leal +; CHECK-NEXT:leal +; CHECK-NEXT:movl + + +; Test for fixup lea pre-emit pass. LEA instructions should be substituted for +; ADD instructions which compute the address and index of the load because they +; precede the load within 5 instructions. An LEA should also be substituted for +; an ADD which computes part of the index because it precedes the index LEA +; within 5 instructions, this substitution is referred to as backwards chaining. 
+ +; Original C Code +;struct node_t +;{ +; int k, m, n, p; +; int * array; +;}; + +;extern struct node_t getnode(); + +;int test() +;{ +; int sum = 0; +; struct node_t n = getnode(); +; if(n.array != 0 && n.p > 0 && n.k > 0 && n.n > 0 && n.m > 0) { +; sum = ((int*)((int)n.array + n.p) )[ n.k + n.m + n.n ]; +; } +; return sum; +;} + +%struct.node_t = type { i32, i32, i32, i32, i32* } + +define i32 @test() { +entry: + %n = alloca %struct.node_t, align 4 + call void bitcast (void (%struct.node_t*, ...)* @getnode to void (%struct.node_t*)*)(%struct.node_t* sret %n) + %array = getelementptr inbounds %struct.node_t* %n, i32 0, i32 4 + %0 = load i32** %array, align 4 + %cmp = icmp eq i32* %0, null + br i1 %cmp, label %if.end, label %land.lhs.true + +land.lhs.true: + %p = getelementptr inbounds %struct.node_t* %n, i32 0, i32 3 + %1 = load i32* %p, align 4 + %cmp1 = icmp sgt i32 %1, 0 + br i1 %cmp1, label %land.lhs.true2, label %if.end + +land.lhs.true2: + %k = getelementptr inbounds %struct.node_t* %n, i32 0, i32 0 + %2 = load i32* %k, align 4 + %cmp3 = icmp sgt i32 %2, 0 + br i1 %cmp3, label %land.lhs.true4, label %if.end + +land.lhs.true4: + %n5 = getelementptr inbounds %struct.node_t* %n, i32 0, i32 2 + %3 = load i32* %n5, align 4 + %cmp6 = icmp sgt i32 %3, 0 + br i1 %cmp6, label %land.lhs.true7, label %if.end + +land.lhs.true7: + %m = getelementptr inbounds %struct.node_t* %n, i32 0, i32 1 + %4 = load i32* %m, align 4 + %cmp8 = icmp sgt i32 %4, 0 + br i1 %cmp8, label %if.then, label %if.end + +if.then: + %add = add i32 %3, %2 + %add12 = add i32 %add, %4 + %5 = ptrtoint i32* %0 to i32 + %add15 = add nsw i32 %1, %5 + %6 = inttoptr i32 %add15 to i32* + %arrayidx = getelementptr inbounds i32* %6, i32 %add12 + %7 = load i32* %arrayidx, align 4 + br label %if.end + +if.end: + %sum.0 = phi i32 [ %7, %if.then ], [ 0, %land.lhs.true7 ], [ 0, %land.lhs.true4 ], [ 0, %land.lhs.true2 ], [ 0, %land.lhs.true ], [ 0, %entry ] + ret i32 %sum.0 +} + +declare void 
@getnode(%struct.node_t* sret, ...) diff --git a/test/CodeGen/X86/atom-fixup-lea3.ll b/test/CodeGen/X86/atom-fixup-lea3.ll new file mode 100644 index 0000000..311b0b3 --- /dev/null +++ b/test/CodeGen/X86/atom-fixup-lea3.ll @@ -0,0 +1,51 @@ +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s +; CHECK: addl ([[reg:%[a-z]+]]) +; CHECK-NEXT: addl $4, [[reg]] + +; Test for the FixupLEAs pre-emit pass. +; An LEA should NOT be substituted for the ADD instruction +; that increments the array pointer if it is greater than 5 instructions +; away from the memory reference that uses it. + +; Original C code: clang -m32 -S -O2 +;int test(int n, int * array, int * m, int * array2) +;{ +; int i, j = 0; +; int sum = 0; +; for (i = 0, j = 0; i < n;) { +; ++i; +; *m += array2[j++]; +; sum += array[i]; +; } +; return sum; +;} + +define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %.pre = load i32* %m, align 4 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %add, %for.body ] + %sum.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add3, %for.body ] + %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ] + %inc1 = add nsw i32 %j.09, 1 + %arrayidx = getelementptr inbounds i32* %array2, i32 %j.09 + %1 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %1 + store i32 %add, i32* %m, align 4 + %arrayidx2 = getelementptr inbounds i32* %array, i32 %inc1 + %2 = load i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %sum.010 + %exitcond = icmp eq i32 %inc1, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %sum.0.lcssa +} + diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll index 6566f56..b2aea90 
100644 --- a/test/CodeGen/X86/lsr-static-addr.ll +++ b/test/CodeGen/X86/lsr-static-addr.ll @@ -17,7 +17,7 @@ ; ATOM-NEXT: movsd A(,%rax,8) ; ATOM-NEXT: mulsd ; ATOM-NEXT: movsd -; ATOM-NEXT: incq %rax +; ATOM-NEXT: leaq 1(%rax), %rax @A = external global [0 x double] |