author     Andrew Trick <atrick@apple.com>    2012-11-12 19:40:10 +0000
committer  Andrew Trick <atrick@apple.com>    2012-11-12 19:40:10 +0000
commit     9b5caaa9c452f262a52dd5ac7ebbc722da5a63de (patch)
tree       b881fab9bfd2e4ed47848171401915857c06b06e
parent     0a46bf13a3b6c412749b874b52c8234b027b7134 (diff)
misched: Target-independent support for load/store clustering.
This infrastructure is generally useful for any target that wants to
strongly prefer two instructions to be adjacent after scheduling.
A follow-up checkin will add target-specific hooks with unit
tests; this feature will then be enabled by default under misched.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167742 91177308-0d34-0410-b5e6-96231b3b80d8
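
For illustration only (not part of this patch): the two TargetInstrInfo hooks added below, getLdStBaseRegImmOfs and shouldScheduleLoadsNear, default to returning false, so load clustering stays inert until a target overrides them. A rough sketch of what such an override might look like follows — the class name MyTargetInstrInfo, the opcodes, the operand layout, and the cluster-size cap are all made up for the example; the real per-target implementations land in the follow-up checkin mentioned above.

    // Hypothetical target override; names, opcodes, and limits are illustrative only.
    bool MyTargetInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
                                                 unsigned &BaseReg,
                                                 unsigned &Offset,
                                                 const TargetRegisterInfo *TRI) const {
      switch (LdSt->getOpcode()) {
      default:
        return false;
      case MyTarget::LDRi12:   // made-up reg+imm load/store opcodes
      case MyTarget::STRi12:
        // Assume operand 1 is the base register and operand 2 the immediate.
        if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
          return false;
        BaseReg = LdSt->getOperand(1).getReg();
        Offset = LdSt->getOperand(2).getImm();
        return true;
      }
    }

    bool MyTargetInstrInfo::shouldScheduleLoadsNear(MachineInstr *FirstLdSt,
                                                    MachineInstr *SecondLdSt,
                                                    unsigned NumLoads) const {
      // NumLoads is the length of the cluster built so far; cap it so the
      // scheduler keeps some freedom. The limit of 4 is arbitrary here.
      return NumLoads < 4;
    }

With overrides like these in place, the LoadClusterMutation added in this patch can sort the loads in each store chain by (BaseReg, Offset) and add weak Cluster edges between neighbors.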
-rw-r--r--   include/llvm/CodeGen/MachineScheduler.h    27
-rw-r--r--   include/llvm/Target/TargetInstrInfo.h       13
-rw-r--r--   lib/CodeGen/MachineScheduler.cpp           188
-rw-r--r--   lib/Target/ARM/ARMBaseInstrInfo.cpp          6
4 files changed, 220 insertions, 14 deletions
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 31bd606..08f9182 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -202,6 +202,10 @@ protected:
   RegisterClassInfo *RegClassInfo;
   MachineSchedStrategy *SchedImpl;
 
+  /// Topo - A topological ordering for SUnits which permits fast IsReachable
+  /// and similar queries.
+  ScheduleDAGTopologicalSort Topo;
+
   /// Ordered list of DAG postprocessing steps.
   std::vector<ScheduleDAGMutation*> Mutations;
 
@@ -226,6 +230,10 @@ protected:
   IntervalPressure BotPressure;
   RegPressureTracker BotRPTracker;
 
+  /// Record the next node in a scheduled cluster.
+  const SUnit *NextClusterPred;
+  const SUnit *NextClusterSucc;
+
 #ifndef NDEBUG
   /// The number of instructions scheduled so far. Used to cut off the
   /// scheduler at the point determined by misched-cutoff.
@@ -236,24 +244,35 @@ public:
   ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
     ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
     AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
-    RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
-    CurrentBottom(), BotRPTracker(BotPressure) {
+    Topo(SUnits, &ExitSU), RPTracker(RegPressure), CurrentTop(),
+    TopRPTracker(TopPressure), CurrentBottom(), BotRPTracker(BotPressure),
+    NextClusterPred(NULL), NextClusterSucc(NULL) {
 #ifndef NDEBUG
     NumInstrsScheduled = 0;
 #endif
   }
 
   virtual ~ScheduleDAGMI() {
+    DeleteContainerPointers(Mutations);
     delete SchedImpl;
   }
 
   /// Add a postprocessing step to the DAG builder.
   /// Mutations are applied in the order that they are added after normal DAG
   /// building and before MachineSchedStrategy initialization.
+  ///
+  /// ScheduleDAGMI takes ownership of the Mutation object.
   void addMutation(ScheduleDAGMutation *Mutation) {
     Mutations.push_back(Mutation);
   }
 
+  /// \brief Add a DAG edge to the given SU with the given predecessor
+  /// dependence data.
+  ///
+  /// \returns true if the edge may be added without creating a cycle OR if an
+  /// equivalent edge already existed (false indicates failure).
+  bool addEdge(SUnit *SuccSU, const SDep &PredDep);
+
   MachineBasicBlock::iterator top() const { return CurrentTop; }
   MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
 
@@ -285,6 +304,10 @@ public:
     return RegionCriticalPSets;
   }
 
+  const SUnit *getNextClusterPred() const { return NextClusterPred; }
+
+  const SUnit *getNextClusterSucc() const { return NextClusterSucc; }
+
 protected:
   // Top-Level entry points for the schedule() driver...
 
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 4570813..4f8ae01 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -621,6 +621,19 @@ public:
     return false;
   }
 
+  /// \brief Get the base register and byte offset of a load/store instr.
+  virtual bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
+                                    unsigned &BaseReg, unsigned &Offset,
+                                    const TargetRegisterInfo *TRI) const {
+    return false;
+  }
+
+  virtual bool shouldScheduleLoadsNear(MachineInstr *FirstLdSt,
+                                       MachineInstr *SecondLdSt,
+                                       unsigned NumLoads) const {
+    return false;
+  }
+
   /// ReverseBranchCondition - Reverses the branch condition of the specified
   /// condition list, returning false on success and true if it cannot be
   /// reversed.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 71cc072..dbab6ba 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -58,6 +58,10 @@ static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
            "before attempting to balance ILP"),
   cl::init(10U));
 
+// Experimental heuristics
+static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
+  cl::desc("Enable load clustering."));
+
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
 //===----------------------------------------------------------------------===//
@@ -303,6 +307,17 @@ void ReadyQueue::dump() {
 // preservation.
 //===----------------------------------------------------------------------===//
 
+bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
+  // Do not use WillCreateCycle, it assumes SD scheduling.
+  // If Pred is reachable from Succ, then the edge creates a cycle.
+  if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
+    return false;
+  Topo.AddPred(SuccSU, PredDep.getSUnit());
+  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
+  // Return true regardless of whether a new edge needed to be inserted.
+  return true;
+}
+
 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
 /// NumPredsLeft reaches zero, release the successor node.
 ///
@@ -312,6 +327,8 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
 
   if (SuccEdge->isWeak()) {
     --SuccSU->WeakPredsLeft;
+    if (SuccEdge->isCluster())
+      NextClusterSucc = SuccSU;
     return;
   }
 #ifndef NDEBUG
@@ -344,6 +361,8 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
 
   if (PredEdge->isWeak()) {
     --PredSU->WeakSuccsLeft;
+    if (PredEdge->isCluster())
+      NextClusterPred = PredSU;
     return;
   }
 #ifndef NDEBUG
@@ -482,6 +501,8 @@ updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
 void ScheduleDAGMI::schedule() {
   buildDAGWithRegPressure();
 
+  Topo.InitDAGTopologicalSorting();
+
   postprocessDAG();
 
   DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
@@ -562,6 +583,8 @@ void ScheduleDAGMI::releaseRoots() {
 
 /// Identify DAG roots and setup scheduler queues.
 void ScheduleDAGMI::initQueues() {
+  NextClusterSucc = NULL;
+  NextClusterPred = NULL;
 
   // Initialize the strategy before modifying the DAG.
   SchedImpl->initialize(this);
 
@@ -664,6 +687,119 @@ void ScheduleDAGMI::dumpSchedule() const {
 }
 #endif
 
+namespace {
+/// \brief Post-process the DAG to create cluster edges between neighboring
+/// loads.
+class LoadClusterMutation : public ScheduleDAGMutation {
+  struct LoadInfo {
+    SUnit *SU;
+    unsigned BaseReg;
+    unsigned Offset;
+    LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
+      : SU(su), BaseReg(reg), Offset(ofs) {}
+  };
+  static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
+                           const LoadClusterMutation::LoadInfo &RHS);
+
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+public:
+  LoadClusterMutation(const TargetInstrInfo *tii,
+                      const TargetRegisterInfo *tri)
+    : TII(tii), TRI(tri) {}
+
+  virtual void apply(ScheduleDAGMI *DAG);
+protected:
+  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+bool LoadClusterMutation::LoadInfoLess(
+  const LoadClusterMutation::LoadInfo &LHS,
+  const LoadClusterMutation::LoadInfo &RHS) {
+  if (LHS.BaseReg != RHS.BaseReg)
+    return LHS.BaseReg < RHS.BaseReg;
+  return LHS.Offset < RHS.Offset;
+}
+
+void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
+                                                  ScheduleDAGMI *DAG) {
+  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
+  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
+    SUnit *SU = Loads[Idx];
+    unsigned BaseReg;
+    unsigned Offset;
+    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
+      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+  }
+  if (LoadRecords.size() < 2)
+    return;
+  std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
+  unsigned ClusterLength = 1;
+  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
+    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+      ClusterLength = 1;
+      continue;
+    }
+
+    SUnit *SUa = LoadRecords[Idx].SU;
+    SUnit *SUb = LoadRecords[Idx+1].SU;
+    if (TII->shouldScheduleLoadsNear(SUa->getInstr(), SUb->getInstr(),
+                                     ClusterLength)
+        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+
+      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+            << SUb->NodeNum << ")\n");
+      // Copy successor edges from SUa to SUb. Interleaving computation
+      // dependent on SUa can prevent load combining due to register reuse.
+      // Predecessor edges do not need to be copied from SUb to SUa since nearby
+      // loads should have effectively the same inputs.
+      for (SUnit::const_succ_iterator
+             SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
+        if (SI->getSUnit() == SUb)
+          continue;
+        DEBUG(dbgs() << "  Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
+        DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
+      }
+      ++ClusterLength;
+    }
+    else
+      ClusterLength = 1;
+  }
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges for loads.
+void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
+  // Map DAG NodeNum to store chain ID.
+  DenseMap<unsigned, unsigned> StoreChainIDs;
+  // Map each store chain to a set of dependent loads.
+  SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
+  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+    SUnit *SU = &DAG->SUnits[Idx];
+    if (!SU->getInstr()->mayLoad())
+      continue;
+    unsigned ChainPredID = DAG->SUnits.size();
+    for (SUnit::const_pred_iterator
+           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+      if (PI->isCtrl()) {
+        ChainPredID = PI->getSUnit()->NodeNum;
+        break;
+      }
+    }
+    // Check if this chain-like pred has been seen
+    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
+    unsigned NumChains = StoreChainDependents.size();
+    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+      StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
+    if (Result.second)
+      StoreChainDependents.resize(NumChains + 1);
+    StoreChainDependents[Result.first->second].push_back(SU);
+  }
+  // Iterate over the store chains.
+  for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
+    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+}
+
 //===----------------------------------------------------------------------===//
 // ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
 //===----------------------------------------------------------------------===//
@@ -676,9 +812,10 @@ public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
   enum CandReason {
-    NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand,
-    BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce,
-    SingleMax, MultiPressure, NextDefUse, NodeOrder};
+    NoCand, SingleExcess, SingleCritical, Cluster,
+    ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
+    TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
+    NodeOrder};
 
 #ifndef NDEBUG
   static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
@@ -1029,6 +1166,8 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
 
   for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
        I != E; ++I) {
+    if (I->isWeak())
+      continue;
     unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
     unsigned MinLatency = I->getMinLatency();
 #ifndef NDEBUG
@@ -1424,6 +1563,7 @@ static bool tryLess(unsigned TryVal, unsigned CandVal,
   }
   return false;
 }
+
 static bool tryGreater(unsigned TryVal, unsigned CandVal,
                        ConvergingScheduler::SchedCandidate &TryCand,
                        ConvergingScheduler::SchedCandidate &Cand,
@@ -1440,6 +1580,10 @@ static bool tryGreater(unsigned TryVal, unsigned CandVal,
   return false;
 }
 
+static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+  return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
+}
+
 /// Apply a set of heursitics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
@@ -1482,6 +1626,26 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
   if (Cand.Reason == SingleCritical)
     Cand.Reason = MultiPressure;
 
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  const SUnit *NextClusterSU =
+    Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
+                 TryCand, Cand, Cluster))
+    return;
+  // Currently, weak edges are for clustering, so we hard-code that reason.
+  // However, deferring the current TryCand will not change Cand's reason.
+  CandReason OrigReason = Cand.Reason;
+  if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
+              getWeakLeft(Cand.SU, Zone.isTop()),
+              TryCand, Cand, Cluster)) {
+    Cand.Reason = OrigReason;
+    return;
+  }
   // Avoid critical resource consumption and balance the schedule.
   TryCand.initResourceDelta(DAG, SchedModel);
   if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
@@ -1528,15 +1692,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
   // Prefer immediate defs/users of the last scheduled instruction. This is a
   // nice pressure avoidance strategy that also conserves the processor's
   // register renaming resources and keeps the machine code readable.
-  if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) {
-    TryCand.Reason = NextDefUse;
-    return;
-  }
-  if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) {
-    if (Cand.Reason > NextDefUse)
-      Cand.Reason = NextDefUse;
+  if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
+                 TryCand, Cand, NextDefUse))
     return;
-  }
+
   // Fall through to original instruction order.
   if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
       || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
@@ -1582,6 +1741,7 @@ const char *ConvergingScheduler::getReasonStr(
   case NoCand:         return "NOCAND    ";
   case SingleExcess:   return "REG-EXCESS";
   case SingleCritical: return "REG-CRIT  ";
+  case Cluster:        return "CLUSTER   ";
   case SingleMax:      return "REG-MAX   ";
   case MultiPressure:  return "REG-MULTI ";
   case ResourceReduce: return "RES-REDUCE";
@@ -1822,7 +1982,11 @@ void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
   assert((!ForceTopDown || !ForceBottomUp) &&
          "-misched-topdown incompatible with -misched-bottomup");
-  return new ScheduleDAGMI(C, new ConvergingScheduler());
+  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+  // Register DAG post-processors.
+  if (EnableLoadCluster)
+    DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
+  return DAG;
 }
 static MachineSchedRegistry
 ConvergingSchedRegistry("converge", "Standard converging scheduler.",
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 3c7bb24..3288a71 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1373,6 +1373,9 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
 /// only return true if the base pointers are the same and the only differences
 /// between the two addresses is the offset. It also returns the offsets by
 /// reference.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
 bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
                                                int64_t &Offset1,
                                                int64_t &Offset2) const {
@@ -1447,6 +1450,9 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
 /// from the common base address. It returns true if it decides it's desirable
 /// to schedule the two loads together. "NumLoads" is the number of loads that
 /// have already been scheduled after Load1.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
 bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                                int64_t Offset1, int64_t Offset2,
                                                unsigned NumLoads) const {
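
Usage note beyond the patch itself: a minimal sketch of how another DAG post-processor could be written against the new addMutation/addEdge API. Everything here is hypothetical — the PairClusterMutation class, the factory name, and the choice of which SUnits to pair are invented for the example; it assumes only the interfaces introduced above and would have to live where ConvergingScheduler is visible, i.e. inside MachineScheduler.cpp.

    namespace {
    // Hypothetical mutation: glue the first two DAG nodes together with a weak
    // Cluster edge, mirroring what LoadClusterMutation does for adjacent loads.
    class PairClusterMutation : public ScheduleDAGMutation {
    public:
      virtual void apply(ScheduleDAGMI *DAG) {
        if (DAG->SUnits.size() < 2)
          return;
        SUnit *SUa = &DAG->SUnits[0];
        SUnit *SUb = &DAG->SUnits[1];
        // addEdge returns false if the weak edge would create a cycle.
        if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
          DEBUG(dbgs() << "Paired SU(0) with SU(1)\n");
      }
    };
    } // anonymous

    static ScheduleDAGInstrs *createPairClusterSched(MachineSchedContext *C) {
      ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
      // ScheduleDAGMI takes ownership and deletes the mutation in its destructor.
      DAG->addMutation(new PairClusterMutation());
      return DAG;
    }

The weak Cluster edge is advisory: it biases ConvergingScheduler through the new Cluster candidate reason, but addEdge refuses any edge that would create a cycle, so correctness never depends on the cluster being honored.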