diff options
author | Preston Gurd <preston.gurd@intel.com> | 2013-04-25 20:29:37 +0000 |
---|---|---|
committer | Preston Gurd <preston.gurd@intel.com> | 2013-04-25 20:29:37 +0000 |
commit | d6ac8e9a03d8fa7115079d86192bc4529e8281aa (patch) | |
tree | 9553fbaac5e6badb3c220a49e83147e96b44c70f | |
parent | 975b1ddf60387139357c8cbbaeb613de5a39294f (diff) | |
download | external_llvm-d6ac8e9a03d8fa7115079d86192bc4529e8281aa.zip external_llvm-d6ac8e9a03d8fa7115079d86192bc4529e8281aa.tar.gz external_llvm-d6ac8e9a03d8fa7115079d86192bc4529e8281aa.tar.bz2 |
This patch adds the X86FixupLEAs pass, which will reduce instruction
latency for certain models of the Intel Atom family, by converting
instructions into their equivalent LEA instructions, when it is both
useful and possible to do so.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180573 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/X86/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/Target/X86/X86.h | 5 | ||||
-rw-r--r-- | lib/Target/X86/X86.td | 3 | ||||
-rw-r--r-- | lib/Target/X86/X86FixupLEAs.cpp | 251 | ||||
-rw-r--r-- | lib/Target/X86/X86Subtarget.cpp | 1 | ||||
-rw-r--r-- | lib/Target/X86/X86Subtarget.h | 4 | ||||
-rw-r--r-- | lib/Target/X86/X86TargetMachine.cpp | 5 | ||||
-rw-r--r-- | test/CodeGen/X86/atom-fixup-lea1.ll | 38 | ||||
-rw-r--r-- | test/CodeGen/X86/atom-fixup-lea2.ll | 84 | ||||
-rw-r--r-- | test/CodeGen/X86/atom-fixup-lea3.ll | 51 | ||||
-rw-r--r-- | test/CodeGen/X86/lsr-static-addr.ll | 2 |
11 files changed, 444 insertions, 1 deletions
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index d14899d..7cb71f0 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -33,6 +33,7 @@ set(sources X86TargetObjectFile.cpp X86TargetTransformInfo.cpp X86VZeroUpper.cpp + X86FixupLEAs.cpp ) if( CMAKE_CL_64 ) diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 1f9919f..947002f 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -69,6 +69,11 @@ ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM); /// createX86PadShortFunctions - Return a pass that pads short functions /// with NOOPs. This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); +/// createX86FixupLEAs - Return a a pass that selectively replaces +/// certain instructions (like add, sub, inc, dec, some shifts, +/// and some multiplies) by equivalent LEA instructions, in order +/// to eliminate execution delays in some Atom processors. +FunctionPass *createX86FixupLEAs(); } // End llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 306e3ac..87bb68d 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -139,6 +139,8 @@ def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "CallRegIndirect", "true", "Call register indirect">; +def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", + "LEA instruction needs inputs at AG stage">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -188,6 +190,7 @@ def : ProcessorModel<"atom", AtomModel, FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, FeatureSlowDivide, FeatureCallRegIndirect, + FeatureLEAUsesAG, FeaturePadShortFunctions]>; // "Arrandale" along with corei3 and corei5 diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp new file mode 100644 index 0000000..82e6de4 --- /dev/null +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -0,0 +1,251 @@ +//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which will find instructions which +// can be re-written as LEA instructions in order to reduce pipeline +// delays for some models of the Intel Atom family. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-fixup-LEAs" +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +STATISTIC(NumLEAs, "Number of LEA instructions created"); + +namespace { + class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); + + virtual const char *getPassName() const { return "X86 Atom LEA Fixup";} + void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + void processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + RegUsageState usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I); + MachineBasicBlock::iterator searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + public: + FixupLEAPass() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + MachineFunction *MF; + const TargetMachine *TM; + const TargetInstrInfo *TII; // Machine instruction info. + LiveVariables *LV; + + }; + char FixupLEAPass::ID = 0; +} + +/// postRAConvertToLEA - if an instruction can be converted to an +/// equivalent LEA, insert the new instruction into the basic block +/// and return a pointer to it. Otherwise, return zero. +MachineInstr * +FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + MachineInstr* MI = MBBI; + MachineInstr* NewMI; + switch (MI->getOpcode()) { + case X86::MOV32rr: + case X86::MOV64rr: { + const MachineOperand& Src = MI->getOperand(1); + const MachineOperand& Dest = MI->getOperand(0); + NewMI = BuildMI(*MF, MI->getDebugLoc(), + TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (!MI->getOperand(2).isImm()) { + // convertToThreeAddress will call getImm() + // which requires isImm() to be true + return 0; + } + } + return TII->convertToThreeAddress(MFI, MBBI, LV); +} + +FunctionPass *llvm::createX86FixupLEAs() { + return new FixupLEAPass(); +} + +/// runOnMachineFunction - Loop over all of the basic blocks, +/// replacing instructions by equivalent LEA instructions +/// if needed and when possible. +bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + TII = Func.getTarget().getInstrInfo(); + TM = &MF->getTarget(); + LV = getAnalysisIfAvailable<LiveVariables>(); + + DEBUG(dbgs() << "Start X86FixupLEAs\n";); + // Process all basic blocks. + for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) + processBasicBlock(Func, I); + DEBUG(dbgs() << "End X86FixupLEAs\n";); + + return true; +} + +/// usesRegister - Determine if an instruction references a machine register +/// and, if so, whether it reads or writes the register. +FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I) { + RegUsageState RegUsage = RU_NotUsed; + MachineInstr* MI = I; + + for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand& opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()){ + if (opnd.isDef()) + return RU_Write; + RegUsage = RU_Read; + } + } + return RegUsage; +} + +/// getPreviousInstr - Given a reference to an instruction in a basic +/// block, return a reference to the previous instruction in the block, +/// wrapping around to the last instruction of the block if the block +/// branches to itself. +static inline bool getPreviousInstr(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + if (I == MFI->begin()) { + if (MFI->isPredecessor(MFI)) { + I = --MFI->end(); + return true; + } + else + return false; + } + --I; + return true; +} + +/// searchBackwards - Step backwards through a basic block, looking +/// for an instruction which writes a register within +/// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. +MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + int InstrDistance = 1; + MachineBasicBlock::iterator CurInst; + static const int INSTR_DISTANCE_THRESHOLD = 5; + + CurInst = I; + bool Found; + Found = getPreviousInstr(CurInst, MFI); + while( Found && I != CurInst) { + if (CurInst->isCall() || CurInst->isInlineAsm()) + break; + if (InstrDistance > INSTR_DISTANCE_THRESHOLD) + break; // too far back to make a difference + if (usesRegister(p, CurInst) == RU_Write){ + return CurInst; + } + InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); + Found = getPreviousInstr(CurInst, MFI); + } + return 0; +} + +/// processInstruction - Given a memory access or LEA instruction +/// whose address mode uses a base and/or index register, look for +/// an opportunity to replace the instruction which sets the base or index +/// register with an equivalent LEA instruction. +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + // Process a load, store, or LEA instruction. + MachineInstr *MI = I; + int opcode = MI->getOpcode(); + const MCInstrDesc& Desc = MI->getDesc(); + int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); + if (AddrOffset >= 0) { + AddrOffset += X86II::getOperandBias(Desc); + MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + if (p.isReg() && p.getReg() != X86::ESP) { + seekLEAFixup(p, I, MFI); + } + MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + if (q.isReg() && q.getReg() != X86::ESP) { + seekLEAFixup(q, I, MFI); + } + } +} + +/// seekLEAFixup - Given a machine register, look for the instruction +/// which writes it in the current basic block. If found, +/// try to replace it with an equivalent LEA instruction. +/// If replacement succeeds, then also process the the newly created +/// instruction. +void FixupLEAPass::seekLEAFixup(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + if (MBI) { + MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI, LV); + if (NewMI) { + ++NumLEAs; + DEBUG(dbgs() << "Candidate to replace:"; MBI->dump();); + // now to replace with an equivalent LEA... + DEBUG(dbgs() << "Replaced by: "; NewMI->dump();); + MFI->erase(MBI); + MachineBasicBlock::iterator J = + static_cast<MachineBasicBlock::iterator> (NewMI); + processInstruction(J, MFI); + } + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// replacing adds and shifts with LEA instructions, where appropriate. +bool FixupLEAPass::processBasicBlock(MachineFunction &MF, + MachineFunction::iterator MFI) { + + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) + processInstruction(I, MFI); + return false; +} diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 14619b6..448d2e6 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -467,6 +467,7 @@ void X86Subtarget::initializeEnvironment() { PostRAScheduler = false; PadShortFunctions = false; CallRegIndirect = false; + LEAUsesAG = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 6fbdb1d..66832b9 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -165,6 +165,9 @@ protected: /// CallRegIndirect - True if the Calls with memory reference should be converted /// to a register-based indirect call. bool CallRegIndirect; + /// LEAUsesAG - True if the LEA instruction inputs have to be ready at + /// address generation (AG) time. + bool LEAUsesAG; /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. @@ -278,6 +281,7 @@ public: bool hasSlowDivide() const { return HasSlowDivide; } bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } + bool LEAusesAG() const { return LEAUsesAG; } bool isAtom() const { return X86ProcFamily == IntelAtom; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8aa58a2..00fa47f 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -215,6 +215,11 @@ bool X86PassConfig::addPreEmitPass() { addPass(createX86PadShortFunctions()); ShouldPrint = true; } + if (getOptLevel() != CodeGenOpt::None && + getX86Subtarget().LEAusesAG()){ + addPass(createX86FixupLEAs()); + ShouldPrint = true; + } return ShouldPrint; } diff --git a/test/CodeGen/X86/atom-fixup-lea1.ll b/test/CodeGen/X86/atom-fixup-lea1.ll new file mode 100644 index 0000000..4651bf2 --- /dev/null +++ b/test/CodeGen/X86/atom-fixup-lea1.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s +; CHECK: addl +; CHECK-NEXT:leal +; CHECK-NEXT:decl +; CHECK-NEXT:jne + +; Test for the FixupLEAs pre-emit pass. An LEA should be substituted for the ADD +; that increments the array pointer because it is within 5 instructions of the +; corresponding load. The ADD precedes the load by following the loop back edge. + +; Original C code +;int test(int n, int * array) +;{ +; int sum = 0; +; for(int i = 0; i < n; i++) +; sum += array[i]; +; return sum; +;} + +define i32 @test(i32 %n, i32* nocapture %array) { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %array, i32 %i.06 + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.05 + %inc = add nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} diff --git a/test/CodeGen/X86/atom-fixup-lea2.ll b/test/CodeGen/X86/atom-fixup-lea2.ll new file mode 100644 index 0000000..1855ea1 --- /dev/null +++ b/test/CodeGen/X86/atom-fixup-lea2.ll @@ -0,0 +1,84 @@ +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s +; CHECK:BB#5 +; CHECK-NEXT:leal +; CHECK-NEXT:leal +; CHECK-NEXT:leal +; CHECK-NEXT:movl + + +; Test for fixup lea pre-emit pass. LEA instructions should be substituted for +; ADD instructions which compute the address and index of the load because they +; precede the load within 5 instructions. An LEA should also be substituted for +; an ADD which computes part of the index because it precedes the index LEA +; within 5 instructions, this substitution is referred to as backwards chaining. + +; Original C Code +;struct node_t +;{ +; int k, m, n, p; +; int * array; +;}; + +;extern struct node_t getnode(); + +;int test() +;{ +; int sum = 0; +; struct node_t n = getnode(); +; if(n.array != 0 && n.p > 0 && n.k > 0 && n.n > 0 && n.m > 0) { +; sum = ((int*)((int)n.array + n.p) )[ n.k + n.m + n.n ]; +; } +; return sum; +;} + +%struct.node_t = type { i32, i32, i32, i32, i32* } + +define i32 @test() { +entry: + %n = alloca %struct.node_t, align 4 + call void bitcast (void (%struct.node_t*, ...)* @getnode to void (%struct.node_t*)*)(%struct.node_t* sret %n) + %array = getelementptr inbounds %struct.node_t* %n, i32 0, i32 4 + %0 = load i32** %array, align 4 + %cmp = icmp eq i32* %0, null + br i1 %cmp, label %if.end, label %land.lhs.true + +land.lhs.true: + %p = getelementptr inbounds %struct.node_t* %n, i32 0, i32 3 + %1 = load i32* %p, align 4 + %cmp1 = icmp sgt i32 %1, 0 + br i1 %cmp1, label %land.lhs.true2, label %if.end + +land.lhs.true2: + %k = getelementptr inbounds %struct.node_t* %n, i32 0, i32 0 + %2 = load i32* %k, align 4 + %cmp3 = icmp sgt i32 %2, 0 + br i1 %cmp3, label %land.lhs.true4, label %if.end + +land.lhs.true4: + %n5 = getelementptr inbounds %struct.node_t* %n, i32 0, i32 2 + %3 = load i32* %n5, align 4 + %cmp6 = icmp sgt i32 %3, 0 + br i1 %cmp6, label %land.lhs.true7, label %if.end + +land.lhs.true7: + %m = getelementptr inbounds %struct.node_t* %n, i32 0, i32 1 + %4 = load i32* %m, align 4 + %cmp8 = icmp sgt i32 %4, 0 + br i1 %cmp8, label %if.then, label %if.end + +if.then: + %add = add i32 %3, %2 + %add12 = add i32 %add, %4 + %5 = ptrtoint i32* %0 to i32 + %add15 = add nsw i32 %1, %5 + %6 = inttoptr i32 %add15 to i32* + %arrayidx = getelementptr inbounds i32* %6, i32 %add12 + %7 = load i32* %arrayidx, align 4 + br label %if.end + +if.end: + %sum.0 = phi i32 [ %7, %if.then ], [ 0, %land.lhs.true7 ], [ 0, %land.lhs.true4 ], [ 0, %land.lhs.true2 ], [ 0, %land.lhs.true ], [ 0, %entry ] + ret i32 %sum.0 +} + +declare void @getnode(%struct.node_t* sret, ...) diff --git a/test/CodeGen/X86/atom-fixup-lea3.ll b/test/CodeGen/X86/atom-fixup-lea3.ll new file mode 100644 index 0000000..311b0b3 --- /dev/null +++ b/test/CodeGen/X86/atom-fixup-lea3.ll @@ -0,0 +1,51 @@ +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s +; CHECK: addl ([[reg:%[a-z]+]]) +; CHECK-NEXT: addl $4, [[reg]] + +; Test for the FixupLEAs pre-emit pass. +; An LEA should NOT be substituted for the ADD instruction +; that increments the array pointer if it is greater than 5 instructions +; away from the memory reference that uses it. + +; Original C code: clang -m32 -S -O2 +;int test(int n, int * array, int * m, int * array2) +;{ +; int i, j = 0; +; int sum = 0; +; for (i = 0, j = 0; i < n;) { +; ++i; +; *m += array2[j++]; +; sum += array[i]; +; } +; return sum; +;} + +define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %.pre = load i32* %m, align 4 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %add, %for.body ] + %sum.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add3, %for.body ] + %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ] + %inc1 = add nsw i32 %j.09, 1 + %arrayidx = getelementptr inbounds i32* %array2, i32 %j.09 + %1 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %1 + store i32 %add, i32* %m, align 4 + %arrayidx2 = getelementptr inbounds i32* %array, i32 %inc1 + %2 = load i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %sum.010 + %exitcond = icmp eq i32 %inc1, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %sum.0.lcssa +} + diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll index 6566f56..b2aea90 100644 --- a/test/CodeGen/X86/lsr-static-addr.ll +++ b/test/CodeGen/X86/lsr-static-addr.ll @@ -17,7 +17,7 @@ ; ATOM-NEXT: movsd A(,%rax,8) ; ATOM-NEXT: mulsd ; ATOM-NEXT: movsd -; ATOM-NEXT: incq %rax +; ATOM-NEXT: leaq 1(%rax), %rax @A = external global [0 x double] |