lib/Target/X86/X86AtomicExpandPass.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287

//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions which
// cannot be implemented as a single instruction with cmpxchg-based loops.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

#define DEBUG_TYPE "x86-atomic-expand"

namespace {
  class X86AtomicExpandPass : public FunctionPass {
    const X86TargetMachine *TM;
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit X86AtomicExpandPass(const X86TargetMachine *TM)
      : FunctionPass(ID), TM(TM) {}

    bool runOnFunction(Function &F) override;
    bool expandAtomicInsts(Function &F);

    bool needsCmpXchgNb(Type *MemType);

    /// There are four kinds of atomic operations. Two never need expanding:
    /// cmpxchg is what we expand the others *to*, and loads are easily handled
    /// by ISelLowering. Atomicrmw and store can need expanding in some
    /// circumstances.
    bool shouldExpand(Instruction *Inst);

    /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms
    /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic.
    bool shouldExpandStore(StoreInst *SI);

    /// Only some atomicrmw instructions need expanding -- some operations
    /// (e.g. max) have absolutely no architectural support; some (e.g. or) have
    /// limited support but can't return the previous value; some (e.g. add)
    /// have complete support in the instruction set.
    ///
    /// Also, naturally, 128-bit operations always need to be expanded.
    bool shouldExpandAtomicRMW(AtomicRMWInst *AI);

    bool expandAtomicRMW(AtomicRMWInst *AI);
    bool expandAtomicStore(StoreInst *SI);
  };
}

char X86AtomicExpandPass::ID = 0;

FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) {
  return new X86AtomicExpandPass(TM);
}

bool X86AtomicExpandPass::runOnFunction(Function &F) {
  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (BasicBlock &BB : F)
    for (Instruction &Inst : BB) {
      if (isa<AtomicRMWInst>(&Inst) ||
          (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
        AtomicInsts.push_back(&Inst);
    }

  bool MadeChange = false;
  for (Instruction *Inst : AtomicInsts) {
    if (!shouldExpand(Inst))
      continue;

    if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
      MadeChange |= expandAtomicRMW(AI);
    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
      MadeChange |= expandAtomicStore(SI);

    assert(MadeChange && "Atomic inst not expanded when it should be?");
    Inst->eraseFromParent();
  }

  return MadeChange;
}

/// Returns true if operations on the given type will need to use either
/// cmpxchg8b or cmpxchg16b. This occurs if the type is 1 step up from the
/// native width, and the instructions are available (otherwise we leave them
/// alone to become __sync_fetch_and_... calls).
bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) {
  const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
  if (!Subtarget.hasCmpxchg16b())
    return false;

  unsigned CmpXchgNbWidth = Subtarget.is64Bit() ? 128 : 64;

  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
  if (OpWidth == CmpXchgNbWidth)
    return true;

  return false;
}


bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) {
  const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;

  if (needsCmpXchgNb(AI->getType()))
    return true;

  if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth)
    return false;

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return false;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty();
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86. We must
    // use a cmpxchg loop.
    return true;
  }
}

bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) {
  if (needsCmpXchgNb(SI->getValueOperand()->getType()))
    return true;

  return false;
}

bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) {
  if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
    return shouldExpandAtomicRMW(AI);
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
    return shouldExpandStore(SI);
  return false;
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return  Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    break;
  }
  llvm_unreachable("Unknown atomic op");
}

bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) {
  AtomicOrdering Order =
      AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
  Value *Addr = AI->getPointerOperand();
  BasicBlock *BB = AI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
  BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from AI.
  IRBuilder<> Builder(AI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits());
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal =
      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
  Loaded->addIncoming(NewLoaded, LoopBB);

  Value *Success = Builder.CreateExtractValue(Pair, 1, "success");
  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  AI->replaceAllUsesWith(NewLoaded);

  return true;
}

bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) {
  // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express
  // this in terms of the usual expansion to "atomicrmw xchg".
  IRBuilder<> Builder(SI);
  AtomicOrdering Order =
      SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering();
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), Order);

  // Now we have an appropriate swap instruction, lower it as usual.
  if (shouldExpandAtomicRMW(AI)) {
    expandAtomicRMW(AI);
    AI->eraseFromParent();
    return true;
  }

  return AI;
}