//=== DFG2LLVM_OpenCL.cpp ===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

10
// NOTE(review): no use of ENABLE_ASSERTS is visible in this part of the file;
// presumably it is consumed by an included header -- confirm.
#define ENABLE_ASSERTS
// NOTE(review): looks like the pointer width (in bits) of the PTX target --
// confirm against changeDataLayout / changeTargetTriple.
#define TARGET_PTX 64
// Numeric address-space identifiers.
// NOTE(review): the values appear to follow the NVPTX address-space
// numbering -- confirm.
#define GENERIC_ADDRSPACE 0
#define GLOBAL_ADDRSPACE 1
#define CONSTANT_ADDRSPACE 4
#define SHARED_ADDRSPACE 3

// Name under which this file's debug output is categorised.
#define DEBUG_TYPE "DFG2LLVM_OpenCL"
Yifan Zhao's avatar
Yifan Zhao committed
18
19
20
21
22
#include "SupportHPVM/DFG2LLVM.h"
#include "SupportHPVM/HPVMTimer.h"
#include "SupportHPVM/HPVMUtils.h"
#include "llvm-c/Core.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/UseListOrder.h"
#include "llvm/Support/ToolOutputFile.h"

#include <sstream>
#include <utility>

43
44
45
46
47
48
49
50
// LLVM_BUILD_DIR has to be provided from outside this file (e.g. on the
// compiler command line); stop the build right away when it is missing.
#ifndef LLVM_BUILD_DIR
#error LLVM_BUILD_DIR is not defined
#endif

// Two-level stringification: the extra indirection makes the preprocessor
// expand the argument before '#' turns it into a string literal.
#define STR_VALUE(X) #X
#define STRINGIFY(X) STR_VALUE(X)
// LLVM_BUILD_DIR as a string literal; used as the prefix of the hpvm-rt
// bitcode path in CGT_OpenCL::initRuntimeAPI().
#define LLVM_BUILD_DIR_STR STRINGIFY(LLVM_BUILD_DIR)

kotsifa2's avatar
kotsifa2 committed
51
52
using namespace llvm;
using namespace builddfg;
53
using namespace dfg2llvm;
Yifan Zhao's avatar
Yifan Zhao committed
54
using namespace hpvmUtils;
kotsifa2's avatar
kotsifa2 committed
55

Yifan Zhao's avatar
Yifan Zhao committed
56
// HPVM Command line option to use timer or not
57
58
// Boolean flag registered under the name "hpvm-timers-ptx". Its value is
// copied into HPVMTimer by CGT_OpenCL::init().
static cl::opt<bool> HPVMTimer_OpenCL("hpvm-timers-ptx",
                                      cl::desc("Enable hpvm timers"));
59

kotsifa2's avatar
kotsifa2 committed
60
namespace {
61
62
63
64
65
66
// Helper class declarations

// Class to maintain the tuple of host pointer, device pointer and size
// in bytes. Would have preferred to use tuple but support not yet available
class OutputPtr {
public:
Yifan Zhao's avatar
Yifan Zhao committed
67
68
  OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes)
      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
69

Yifan Zhao's avatar
Yifan Zhao committed
70
71
72
  Value *h_ptr;
  Value *d_ptr;
  Value *bytes;
73
74
75
76
77
78
};

// Class to maintain important kernel info required for generating runtime
// calls
class Kernel {
public:
Yifan Zhao's avatar
Yifan Zhao committed
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
  Kernel(
      Function *_KF, DFLeafNode *_KLeafNode,
      std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(),
      std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap =
          std::map<unsigned, std::pair<Value *, unsigned>>(),
      std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
      unsigned _gridDim = 0,
      std::vector<Value *> _globalWGSize = std::vector<Value *>(),
      unsigned _blockDim = 0,
      std::vector<Value *> _localWGSize = std::vector<Value *>())
      : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
        sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap),
        gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
        localWGSize(_localWGSize) {

    assert(gridDim == globalWGSize.size() &&
           "gridDim should be same as the size of vector globalWGSize");
    assert(blockDim == localWGSize.size() &&
           "blockDim should be same as the size of vector localWGSize");
98
99
  }

Yifan Zhao's avatar
Yifan Zhao committed
100
101
  Function *KernelFunction;
  DFLeafNode *KernelLeafNode;
102
103
  std::map<unsigned, unsigned> inArgMap;
  // Map for shared memory arguments
Yifan Zhao's avatar
Yifan Zhao committed
104
  std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap;
105
  // Fields for (potential) allocation node
Yifan Zhao's avatar
Yifan Zhao committed
106
107
  DFLeafNode *AllocationNode;
  Function *AllocationFunction;
Prakalp Srivastava's avatar
Prakalp Srivastava committed
108
  std::map<unsigned, unsigned> allocInArgMap;
109

110
  std::vector<unsigned> outArgMap;
111
  unsigned gridDim;
Yifan Zhao's avatar
Yifan Zhao committed
112
  std::vector<Value *> globalWGSize;
113
  unsigned blockDim;
Yifan Zhao's avatar
Yifan Zhao committed
114
  std::vector<Value *> localWGSize;
115
  std::vector<int> localDimMap;
116

Yifan Zhao's avatar
Yifan Zhao committed
117
118
  std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; }
  void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; }
119

Yifan Zhao's avatar
Yifan Zhao committed
120
  std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() {
121
122
    return sharedInArgMap;
  }
Yifan Zhao's avatar
Yifan Zhao committed
123
  void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) {
124
125
126
    sharedInArgMap = map;
  }

Yifan Zhao's avatar
Yifan Zhao committed
127
128
  std::vector<unsigned> &getOutArgMap() { return outArgMap; }
  void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; }
129

Yifan Zhao's avatar
Yifan Zhao committed
130
  void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; }
131

Yifan Zhao's avatar
Yifan Zhao committed
132
  bool hasLocalWG() const { return blockDim != 0; }
133
134
};

135
// Helper function declarations
Yifan Zhao's avatar
Yifan Zhao committed
136
137
138
139
140
141
142
143
static bool canBePromoted(Argument *arg, Function *F);
static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
                                 Kernel *, ValueToValueMapTy &, Instruction *);
static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
                              ValueToValueMapTy &, Instruction *,
                              const Twine &WGName = "WGSize");
static std::string getPTXFilename(const Module &);
static std::string getFilenameFromModule(const Module &M);
144
145
146
static void changeDataLayout(Module &);
static void changeTargetTriple(Module &);
static void findReturnInst(Function *, std::vector<ReturnInst *> &);
Yifan Zhao's avatar
Yifan Zhao committed
147
148
static void findIntrinsicInst(Function *, Intrinsic::ID,
                              std::vector<IntrinsicInst *> &);
149
150
static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
static std::string getAtomicOpName(Intrinsic::ID);
151

152
153
// DFG2LLVM_OpenCL - The first implementation.
struct DFG2LLVM_OpenCL : public DFG2LLVM {
154
  static char ID; // Pass identification, replacement for typeid
155
  DFG2LLVM_OpenCL() : DFG2LLVM(ID) {}
156
157
158
159
160
161
162

private:
public:
  bool runOnModule(Module &M);
};

// Visitor for Code generation traversal (tree traversal for now)
163
class CGT_OpenCL : public CodeGenTraversal {
164
165

private:
Yifan Zhao's avatar
Yifan Zhao committed
166
  // Member variables
167
  std::unique_ptr<Module> KernelM;
Yifan Zhao's avatar
Yifan Zhao committed
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
  DFNode *KernelLaunchNode = NULL;
  Kernel *kernel;

  // HPVM Runtime API
  FunctionCallee llvm_hpvm_ocl_launch;
  FunctionCallee llvm_hpvm_ocl_wait;
  FunctionCallee llvm_hpvm_ocl_initContext;
  FunctionCallee llvm_hpvm_ocl_clearContext;
  FunctionCallee llvm_hpvm_ocl_argument_shared;
  FunctionCallee llvm_hpvm_ocl_argument_scalar;
  FunctionCallee llvm_hpvm_ocl_argument_ptr;
  FunctionCallee llvm_hpvm_ocl_output_ptr;
  FunctionCallee llvm_hpvm_ocl_free;
  FunctionCallee llvm_hpvm_ocl_getOutput;
  FunctionCallee llvm_hpvm_ocl_executeNode;

  // Functions
185
  std::string getKernelsModuleName(Module &M);
Yifan Zhao's avatar
Yifan Zhao committed
186
187
188
189
190
191
192
193
  void fixValueAddrspace(Value *V, unsigned addrspace);
  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *,
                                                  Function *);
  Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags,
                               unsigned i);
  void addCLMetadata(Function *F);
  Function *transformFunctionToVoid(Function *F);
  void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName);
194

195
196
  // Virtual Functions
  void init() {
197
198
    HPVMTimer = HPVMTimer_OpenCL;
    TargetName = "OpenCL";
199
200
  }
  void initRuntimeAPI();
Yifan Zhao's avatar
Yifan Zhao committed
201
202
  void codeGen(DFInternalNode *N);
  void codeGen(DFLeafNode *N);
203
204
205

public:
  // Constructor
206
  CGT_OpenCL(Module &_M, BuildDFG &_DFG)
Yifan Zhao's avatar
Yifan Zhao committed
207
      : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
208
    init();
209
    initRuntimeAPI();
Yifan Zhao's avatar
Yifan Zhao committed
210
211
    DEBUG(errs() << "Old module pointer: " << &_M << "\n");
    DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n");
212

Yifan Zhao's avatar
Yifan Zhao committed
213
214
215
    // Copying instead of creating new, in order to preserve required info
    // (metadata) Remove functions, global variables and aliases
    std::vector<GlobalVariable *> GVVect;
216
    for (Module::global_iterator mi = KernelM->global_begin(),
Yifan Zhao's avatar
Yifan Zhao committed
217
218
219
                                 me = KernelM->global_end();
         (mi != me); ++mi) {
      GlobalVariable *GV = &*mi;
220
      GVVect.push_back(GV);
221
    }
222
223
224
    for (auto *GV : GVVect) {
      GV->replaceAllUsesWith(UndefValue::get(GV->getType()));
      GV->eraseFromParent();
225
226
    }

Yifan Zhao's avatar
Yifan Zhao committed
227
228
229
230
    std::vector<Function *> FuncVect;
    for (Module::iterator mi = KernelM->begin(), me = KernelM->end();
         (mi != me); ++mi) {
      Function *F = &*mi;
231
      FuncVect.push_back(F);
232
    }
233
234
235
    for (auto *F : FuncVect) {
      F->replaceAllUsesWith(UndefValue::get(F->getType()));
      F->eraseFromParent();
236
    }
kotsifa2's avatar
kotsifa2 committed
237

Yifan Zhao's avatar
Yifan Zhao committed
238
    std::vector<GlobalAlias *> GAVect;
239
    for (Module::alias_iterator mi = KernelM->alias_begin(),
Yifan Zhao's avatar
Yifan Zhao committed
240
241
242
                                me = KernelM->alias_end();
         (mi != me); ++mi) {
      GlobalAlias *GA = &*mi;
243
      GAVect.push_back(GA);
244
    }
245
246
247
    for (auto *GA : GAVect) {
      GA->replaceAllUsesWith(UndefValue::get(GA->getType()));
      GA->eraseFromParent();
248
    }
kotsifa2's avatar
kotsifa2 committed
249

250
251
    changeDataLayout(*KernelM);
    changeTargetTriple(*KernelM);
kotsifa2's avatar
kotsifa2 committed
252

253
    DEBUG(errs() << *KernelM);
254
  }
255

256
  void writeKernelsModule();
257
258
};

Yifan Zhao's avatar
Yifan Zhao committed
259
// Initialize the HPVM runtime API. This makes it easier to insert these calls
260
void CGT_OpenCL::initRuntimeAPI() {
261
262
263

  // Load Runtime API Module
  SMDiagnostic Err;
264

265
266
  std::string runtimeAPI = std::string(LLVM_BUILD_DIR_STR) +
                           "/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
267

268
  runtimeModule = parseIRFile(runtimeAPI, Err, M.getContext());
Yifan Zhao's avatar
Yifan Zhao committed
269
  if (runtimeModule == nullptr) {
270
271
    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
    assert(false && "couldn't parse runtime");
Yifan Zhao's avatar
Yifan Zhao committed
272
273
  } else
    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
274
275

  // Get or insert the global declarations for launch/wait functions
Yifan Zhao's avatar
Yifan Zhao committed
276
277
278
279
280
281
282
283
284
285
286
  DECLARE(llvm_hpvm_ocl_launch);
  DECLARE(llvm_hpvm_ocl_wait);
  DECLARE(llvm_hpvm_ocl_initContext);
  DECLARE(llvm_hpvm_ocl_clearContext);
  DECLARE(llvm_hpvm_ocl_argument_shared);
  DECLARE(llvm_hpvm_ocl_argument_scalar);
  DECLARE(llvm_hpvm_ocl_argument_ptr);
  DECLARE(llvm_hpvm_ocl_output_ptr);
  DECLARE(llvm_hpvm_ocl_free);
  DECLARE(llvm_hpvm_ocl_getOutput);
  DECLARE(llvm_hpvm_ocl_executeNode);
287
288
289

  // Get or insert timerAPI functions as well if you plan to use timers
  initTimerAPI();
290
291

  // Insert init context in main
292
  DEBUG(errs() << "Gen Code to initialize OpenCL Timer\n");
Yifan Zhao's avatar
Yifan Zhao committed
293
294
  Function *VI = M.getFunction("llvm.hpvm.init");
  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
295

296
  InitCall = cast<Instruction>(*VI->user_begin());
297
  initializeTimerSet(InitCall);
Yifan Zhao's avatar
Yifan Zhao committed
298
299
300
301
302
  switchToTimer(hpvm_TimerID_INIT_CTX, InitCall);
  CallInst::Create(llvm_hpvm_ocl_initContext,
                   ArrayRef<Value *>(getTargetID(M, hpvm::GPU_TARGET)), "",
                   InitCall);
  switchToTimer(hpvm_TimerID_NONE, InitCall);
303

Yifan Zhao's avatar
Yifan Zhao committed
304
  // Insert print instruction at hpvm exit
305
  DEBUG(errs() << "Gen Code to print OpenCL Timer\n");
Yifan Zhao's avatar
Yifan Zhao committed
306
  Function *VC = M.getFunction("llvm.hpvm.cleanup");
307
  DEBUG(errs() << *VC << "\n");
Yifan Zhao's avatar
Yifan Zhao committed
308
  assert(VC->getNumUses() == 1 && "__hpvm__clear should only be used once");
309

310
  CleanupCall = cast<Instruction>(*VC->user_begin());
311
  printTimerSet(CleanupCall);
312
313
314
315
316
317
318
}

// Generate Code to call the kernel
// The plan is to replace the internal node with a leaf node. This method is
// used to generate a function to associate with this leaf node. The function
// is responsible for all the memory allocation/transfer and invoking the
// kernel call on the device
319
320
void CGT_OpenCL::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
                                    const Twine &FileName) {
321
322
  // Check if clone already exists. If it does, it means we have visited this
  // function before.
Yifan Zhao's avatar
Yifan Zhao committed
323
  //  assert(N->getGenFunc() == NULL && "Code already generated for this node");
324

Yifan Zhao's avatar
Yifan Zhao committed
325
  assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) == NULL &&
326
         "Code already generated for this node");
327
328

  // Useful values
Yifan Zhao's avatar
Yifan Zhao committed
329
330
  Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
  Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
331
332

  // If kernel struct has not been initialized with kernel function, then fail
333
  assert(K != NULL && "No kernel found!!");
334
335
336

  DEBUG(errs() << "Generating kernel call code\n");

Yifan Zhao's avatar
Yifan Zhao committed
337
  Function *F = N->getFuncPointer();
338
339
340

  // Create a clone of F with no instructions. Only the type is the same as F
  // without the extra arguments.
341
  Function *F_CPU;
342
343
344
345
346
347

  // Clone the function, if we are seeing this function for the first time. We
  // only need a clone in terms of type.
  ValueToValueMapTy VMap;

  // Create new function with the same type
348
  F_CPU =
Yifan Zhao's avatar
Yifan Zhao committed
349
      Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
350
351

  // Loop over the arguments, copying the names of arguments over.
352
  Function::arg_iterator dest_iterator = F_CPU->arg_begin();
353
354
355
  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
       i != e; ++i) {
    dest_iterator->setName(i->getName()); // Copy the name over...
kotsifa2's avatar
kotsifa2 committed
356
    // Increment dest iterator
kotsifa2's avatar
kotsifa2 committed
357
    ++dest_iterator;
358
  }
359

360
  // Add a basic block to this empty function
361
  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_CPU);
Yifan Zhao's avatar
Yifan Zhao committed
362
  ReturnInst *RI = ReturnInst::Create(
363
      M.getContext(), UndefValue::get(F_CPU->getReturnType()), BB);
364

365
  // FIXME: Adding Index and Dim arguments are probably not required except
366
  // for consistency purpose (DFG2LLVM_CPU does assume that all leaf nodes do
367
  // have those arguments)
368

369
  // Add Index and Dim arguments except for the root node
Yifan Zhao's avatar
Yifan Zhao committed
370
  if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
371
    F_CPU = addIdxDimArgs(F_CPU);
kotsifa2's avatar
kotsifa2 committed
372

373
  BB = &*F_CPU->begin();
kotsifa2's avatar
kotsifa2 committed
374
375
  RI = cast<ReturnInst>(BB->getTerminator());

Yifan Zhao's avatar
Yifan Zhao committed
376
  // Add the generated function info to DFNode
377
378
379
  //  N->setGenFunc(F_CPU, hpvm::CPU_TARGET);
  N->addGenFunc(F_CPU, hpvm::GPU_TARGET, true);
  DEBUG(errs() << "Added GPUGenFunc: " << F_CPU->getName() << " for node "
Yifan Zhao's avatar
Yifan Zhao committed
380
               << N->getFuncPointer()->getName() << "\n");
kotsifa2's avatar
kotsifa2 committed
381
382

  // Loop over the arguments, to create the VMap
383
  dest_iterator = F_CPU->arg_begin();
kotsifa2's avatar
kotsifa2 committed
384
385
386
387
388
389
  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
       i != e; ++i) {
    // Add mapping to VMap and increment dest iterator
    VMap[&*i] = &*dest_iterator;
    ++dest_iterator;
  }
390

391
392
  /* TODO: Use this code to verify if this is a good pattern for PTX kernel

393
394
  // Sort children in topological order before code generation for kernel call
  N->getChildGraph()->sortChildren();
395

396
397
398
399
  // The DFNode N has the property that it has only one child (leaving Entry
  // and Exit dummy nodes). This child is the PTX kernel. This simplifies code
  // generation for kernel calls significantly. All the inputs to this child
  // node would either be constants or from the parent node N.
400

401
402
  assert(N->getChildGraph()->size() == 3
         && "Node expected to have just one non-dummy node!");
403

404
405
406
407
408
409
410
  DFNode* C;
  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
    C = *ci;
    // Skip dummy node call
    if (!C->isDummyNode())
      break;
411
412
  }

Yifan Zhao's avatar
Yifan Zhao committed
413
414
  assert(C->isDummyNode() == false && "Internal Node only contains dummy
  nodes!");
415
416

  Function* CF = C->getFuncPointer();
417
  */
Yifan Zhao's avatar
Yifan Zhao committed
418
  Function *KF = K->KernelLeafNode->getFuncPointer();
419
  // Initialize context
Yifan Zhao's avatar
Yifan Zhao committed
420
421
  // DEBUG(errs() << "Initializing context" << "\n");
  // CallInst::Create(llvm_hpvm_ocl_initContext, None, "", RI);
422

Yifan Zhao's avatar
Yifan Zhao committed
423
424
  DEBUG(errs() << "Initializing commandQ"
               << "\n");
425
  // Initialize command queue
Yifan Zhao's avatar
Yifan Zhao committed
426
427
  switchToTimer(hpvm_TimerID_SETUP, InitCall);
  Value *fileStr = getStringPointer(FileName, InitCall, "Filename");
428
  DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
Yifan Zhao's avatar
Yifan Zhao committed
429
430
431
432
433
434
435
436
437
  DEBUG(errs() << "Generating code for kernel - "
               << K->KernelFunction->getName() << "\n");
  Value *kernelStr =
      getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName");

  Value *LaunchInstArgs[] = {fileStr, kernelStr};

  DEBUG(errs() << "Inserting launch call"
               << "\n");
438
439
440
441
442
443
444
445
  CallInst *OpenCL_Ctx = CallInst::Create(llvm_hpvm_ocl_launch,
                                          ArrayRef<Value *>(LaunchInstArgs, 2),
                                          "graph" + KF->getName(), InitCall);
  DEBUG(errs() << *OpenCL_Ctx << "\n");
  GraphIDAddr = new GlobalVariable(
      M, OpenCL_Ctx->getType(), false, GlobalValue::CommonLinkage,
      Constant::getNullValue(OpenCL_Ctx->getType()),
      "graph" + KF->getName() + ".addr");
446
  DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
447
  StoreInst *SI = new StoreInst(OpenCL_Ctx, GraphIDAddr, InitCall);
448
  DEBUG(errs() << *SI << "\n");
Yifan Zhao's avatar
Yifan Zhao committed
449
450
451
  switchToTimer(hpvm_TimerID_NONE, InitCall);
  switchToTimer(hpvm_TimerID_SETUP, RI);
  Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI);
452

Yifan Zhao's avatar
Yifan Zhao committed
453
  // Iterate over the required input edges of the node and use the hpvm-rt API
454
  // to set inputs
Yifan Zhao's avatar
Yifan Zhao committed
455
  DEBUG(errs() << "Iterate over input edges of node and insert hpvm api\n");
456
  std::vector<OutputPtr> OutputPointers;
Yifan Zhao's avatar
Yifan Zhao committed
457
458
459
  // Vector to hold the device memory object that need to be cleared before we
  // release context
  std::vector<Value *> DevicePointers;
460

461
  std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap();
Prakalp Srivastava's avatar
Prakalp Srivastava committed
462
463
  /*
    for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
464

Prakalp Srivastava's avatar
Prakalp Srivastava committed
465
      // The kernel object gives us the mapping of arguments from kernel launch
466
467
      // node function (F_CPU) to kernel (kernel->KF)
      Value* inputVal = getArgumentAt(F_CPU, K->getInArgMap()[i]);
468

Prakalp Srivastava's avatar
Prakalp Srivastava committed
469
  */
470

Yifan Zhao's avatar
Yifan Zhao committed
471
  for (auto &InArgMapPair : kernelInArgMap) {
472
    unsigned i = InArgMapPair.first;
473
    Value *inputVal = getArgumentAt(F_CPU, InArgMapPair.second);
Yifan Zhao's avatar
Yifan Zhao committed
474
    DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n");
475

476
477
478
479
480
    // input value has been obtained.
    // Check if input is a scalar value or a pointer operand
    // For scalar values such as int, float, etc. the size is simply the size of
    // type on target machine, but for pointers, the size of data would be the
    // next integer argument
Yifan Zhao's avatar
Yifan Zhao committed
481
    if (inputVal->getType()->isPointerTy()) {
482

Yifan Zhao's avatar
Yifan Zhao committed
483
      switchToTimer(hpvm_TimerID_COPY_PTR, RI);
484
      // Pointer Input
485
      // CheckAttribute
Yifan Zhao's avatar
Yifan Zhao committed
486
487
488
489
490
491
492
493
      Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False;
      Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) &&
                        !(hasAttribute(KF, i, Attribute::In)))
                           ? False
                           : True;

      Argument *A = getArgumentAt(KF, i);
      if (isOutput == True) {
494
        DEBUG(errs() << *A << " is an OUTPUT argument\n");
495
      }
Yifan Zhao's avatar
Yifan Zhao committed
496
      if (isInput == True) {
497
        DEBUG(errs() << *A << " is an INPUT argument\n");
498
499
      }

Yifan Zhao's avatar
Yifan Zhao committed
500
501
502
      Value *inputValI8Ptr = CastInst::CreatePointerCast(
          inputVal, Type::getInt8PtrTy(M.getContext()),
          inputVal->getName() + ".i8ptr", RI);
503
504

      // Assert that the pointer argument size (next argument) is in the map
Yifan Zhao's avatar
Yifan Zhao committed
505
506
      assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end());

507
      Value *inputSize = getArgumentAt(F_CPU, kernelInArgMap[i + 1]);
Yifan Zhao's avatar
Yifan Zhao committed
508
509
510
511
512
513
514
515
516
517
518
519
520
      assert(
          inputSize->getType() == Type::getInt64Ty(M.getContext()) &&
          "Pointer type input must always be followed by size (integer type)");
      Value *setInputArgs[] = {
          GraphID,
          inputValI8Ptr,
          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
          inputSize,
          isInput,
          isOutput};
      Value *d_ptr =
          CallInst::Create(llvm_hpvm_ocl_argument_ptr,
                           ArrayRef<Value *>(setInputArgs, 6), "", RI);
521
      DevicePointers.push_back(d_ptr);
522
523
      // If this has out attribute, store the returned device pointer in
      // memory to read device memory later
Yifan Zhao's avatar
Yifan Zhao committed
524
525
526
527
      if (isOutput == True)
        OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
    } else {
      switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
528
      // Scalar Input
529
530
      // Store the scalar value on stack and then pass the pointer to its
      // location
Yifan Zhao's avatar
Yifan Zhao committed
531
532
533
534
535
536
537
538
539
540
541
542
543
544
      AllocaInst *inputValPtr = new AllocaInst(
          inputVal->getType(), 0, inputVal->getName() + ".ptr", RI);
      StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI);

      Value *inputValI8Ptr = CastInst::CreatePointerCast(
          inputValPtr, Type::getInt8PtrTy(M.getContext()),
          inputVal->getName() + ".i8ptr", RI);

      Value *setInputArgs[] = {
          GraphID, inputValI8Ptr,
          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
          ConstantExpr::getSizeOf(inputVal->getType())};
      CallInst::Create(llvm_hpvm_ocl_argument_scalar,
                       ArrayRef<Value *>(setInputArgs, 4), "", RI);
545
    }
546
547
  }

Yifan Zhao's avatar
Yifan Zhao committed
548
549
  DEBUG(
      errs() << "Setup shared memory arguments of node and insert hpvm api\n");
550

551
552
  // Check to see if all the allocation sizes are constant (determined
  // statically)
553
  bool constSizes = true;
Yifan Zhao's avatar
Yifan Zhao committed
554
  for (auto &e : K->getSharedInArgMap()) {
555
    constSizes &= isa<Constant>(e.second.first);
556
  }
557

558
  // If the sizes are all constant
559
  if (constSizes) {
Yifan Zhao's avatar
Yifan Zhao committed
560
    for (auto &e : K->getSharedInArgMap()) {
561
      unsigned argNum = e.first;
Yifan Zhao's avatar
Yifan Zhao committed
562
      Value *allocSize = e.second.first;
563

Yifan Zhao's avatar
Yifan Zhao committed
564
565
      DEBUG(errs() << "\tLocal Memory at " << argNum
                   << ", size = " << *allocSize << "\n");
566

567
568
      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
        // Shared memory ptr argument - scalar at size position
Yifan Zhao's avatar
Yifan Zhao committed
569
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
570

Yifan Zhao's avatar
Yifan Zhao committed
571
572
        assert(isa<Constant>(allocSize) &&
               "Constant shared memory size is expected");
573

Yifan Zhao's avatar
Yifan Zhao committed
574
575
576
577
578
579
        Value *setInputArgs[] = {
            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            allocSize};
        CallInst::Create(llvm_hpvm_ocl_argument_shared,
                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
      } else {
580
        // Shared memory size argument - scalar at address position
Yifan Zhao's avatar
Yifan Zhao committed
581
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
582
583
        // Store the scalar value on stack and then pass the pointer to its
        // location
Yifan Zhao's avatar
Yifan Zhao committed
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
        AllocaInst *allocSizePtr =
            new AllocaInst(allocSize->getType(), 0,
                           allocSize->getName() + ".sharedMem.ptr", RI);
        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);

        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
            allocSize->getName() + ".sharedMem.i8ptr", RI);

        Value *setInputArgs[] = {
            GraphID, allocSizeI8Ptr,
            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            ConstantExpr::getSizeOf(allocSize->getType())};
        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
599
600
601
      }
    }
  } else {
602

603
604
605
    Function *F_alloc = K->AllocationFunction;
    StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
    assert(FAllocRetTy && "Allocation node with no struct return type");
606

607
608
    std::vector<Value *> AllocInputArgs;
    for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
609
      AllocInputArgs.push_back(getArgumentAt(F_CPU, K->allocInArgMap.at(i)));
610
    }
611

612
613
614
615
616
617
    CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
    std::vector<ExtractValueInst *> ExtractValueInstVec;
    for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
      ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
      ExtractValueInstVec.push_back(EI);
    }
618

Yifan Zhao's avatar
Yifan Zhao committed
619
    for (auto &e : K->getSharedInArgMap()) {
620
      unsigned argNum = e.first;
Yifan Zhao's avatar
Yifan Zhao committed
621
      Value *allocSize = ExtractValueInstVec[e.second.second / 2];
622

Yifan Zhao's avatar
Yifan Zhao committed
623
624
      DEBUG(errs() << "\tLocal Memory at " << argNum
                   << ", size = " << *allocSize << "\n");
625

626
627
      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
        // Shared memory ptr argument - scalar at size position
Yifan Zhao's avatar
Yifan Zhao committed
628
629
630
631
632
633
634
635
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);

        Value *setInputArgs[] = {
            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            allocSize};
        CallInst::Create(llvm_hpvm_ocl_argument_shared,
                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
      } else {
636
        // Shared memory size argument - scalar at address position
Yifan Zhao's avatar
Yifan Zhao committed
637
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
638
639
        // Store the scalar value on stack and then pass the pointer to its
        // location
Yifan Zhao's avatar
Yifan Zhao committed
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
        AllocaInst *allocSizePtr =
            new AllocaInst(allocSize->getType(), 0,
                           allocSize->getName() + ".sharedMem.ptr", RI);
        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);

        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
            allocSize->getName() + ".sharedMem.i8ptr", RI);

        Value *setInputArgs[] = {
            GraphID, allocSizeI8Ptr,
            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            ConstantExpr::getSizeOf(allocSize->getType())};
        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
655
      }
656
657
    }
  }
658

Yifan Zhao's avatar
Yifan Zhao committed
659
  DEBUG(errs() << "Setup output edges of node and insert hpvm api\n");
660
  // Set output if struct is not an empty struct
Yifan Zhao's avatar
Yifan Zhao committed
661
662
663
664
  StructType *OutputTy = K->KernelLeafNode->getOutputType();
  std::vector<Value *> d_Outputs;
  if (!OutputTy->isEmptyTy()) {
    switchToTimer(hpvm_TimerID_COPY_PTR, RI);
665
    // Not an empty struct
666
    // Iterate over all elements of the struct, create an output-pointer call
    // for each one, and collect the results in d_Outputs
Yifan Zhao's avatar
Yifan Zhao committed
667
668
669
670
671
672
673
674
675
676
    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
      unsigned outputIndex = KF->getFunctionType()->getNumParams() + i;
      Value *setOutputArgs[] = {
          GraphID,
          ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex),
          ConstantExpr::getSizeOf(OutputTy->getElementType(i))};

      CallInst *d_Output = CallInst::Create(llvm_hpvm_ocl_output_ptr,
                                            ArrayRef<Value *>(setOutputArgs, 3),
                                            "d_output." + KF->getName(), RI);
677
678
      d_Outputs.push_back(d_Output);
    }
679
  }
680

681
682
683
684
685
  // Enqueue kernel
  // Need work dim, localworksize, globalworksize
  // Allocate size_t[numDims] space on stack. Store the work group sizes and
  // pass it as an argument to ExecNode

Yifan Zhao's avatar
Yifan Zhao committed
686
  switchToTimer(hpvm_TimerID_MISC, RI);
687
  Value *workDim, *LocalWGPtr, *GlobalWGPtr;
kotsifa2's avatar
kotsifa2 committed
688
  getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
Yifan Zhao's avatar
Yifan Zhao committed
689
690
691
692
693
  switchToTimer(hpvm_TimerID_KERNEL, RI);
  Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr};
  CallInst *Event = CallInst::Create(llvm_hpvm_ocl_executeNode,
                                     ArrayRef<Value *>(ExecNodeArgs, 4),
                                     "event." + KF->getName(), RI);
694
  DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
695

696
  // Wait for Kernel to Finish
Yifan Zhao's avatar
Yifan Zhao committed
697
  CallInst::Create(llvm_hpvm_ocl_wait, ArrayRef<Value *>(GraphID), "", RI);
698

Yifan Zhao's avatar
Yifan Zhao committed
699
  switchToTimer(hpvm_TimerID_READ_OUTPUT, RI);
700
  // Read Output Struct if not empty
Yifan Zhao's avatar
Yifan Zhao committed
701
702
703
704
705
706
707
708
709
710
  if (!OutputTy->isEmptyTy()) {
    std::vector<Value *> h_Outputs;
    Value *KernelOutput = UndefValue::get(OutputTy);
    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
      Value *GetOutputArgs[] = {
          GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
          d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
      CallInst *h_Output = CallInst::Create(
          llvm_hpvm_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4),
          "h_output." + KF->getName() + ".addr", RI);
711
712
      // Read each device pointer listed in output struct
      // Load the output struct
Yifan Zhao's avatar
Yifan Zhao committed
713
714
715
716
717
718
719
720
      CastInst *BI = BitCastInst::CreatePointerCast(
          h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr",
          RI);

      Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI);
      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement,
                                             ArrayRef<unsigned>(i),
                                             KF->getName() + "output", RI);
721
    }
722
    OutputMap[K->KernelLeafNode] = KernelOutput;
723
  }
Prakalp Srivastava's avatar
Prakalp Srivastava committed
724

725
726
  // Read all the pointer arguments which had side effects i.e., had out
  // attribute
727
  DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n");
728
729
730
  // FIXME: Not reading output pointers anymore as we read them when data is
  // actually requested
  /*for(auto output: OutputPointers) {
731
732
733
    DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
    DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
    DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
734

Yifan Zhao's avatar
Yifan Zhao committed
735
736
    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr,
  output.bytes}; CallInst* CI = CallInst::Create(llvm_hpvm_ocl_getOutput,
737
738
                                    ArrayRef<Value*>(GetOutputArgs, 4),
                                    "", RI);
739
  }*/
Yifan Zhao's avatar
Yifan Zhao committed
740
  switchToTimer(hpvm_TimerID_MEM_FREE, RI);
741
  // Clear Context and free device memory
Yifan Zhao's avatar
Yifan Zhao committed
742
743
  DEBUG(errs() << "Clearing context"
               << "\n");
744
  // Free Device Memory
Yifan Zhao's avatar
Yifan Zhao committed
745
746
  for (auto d_ptr : DevicePointers) {
    CallInst::Create(llvm_hpvm_ocl_free, ArrayRef<Value *>(d_ptr), "", RI);
747
  }
Yifan Zhao's avatar
Yifan Zhao committed
748
  switchToTimer(hpvm_TimerID_CLEAR_CTX, CleanupCall);
749
  // Clear Context
Yifan Zhao's avatar
Yifan Zhao committed
750
751
752
753
  LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall);
  CallInst::Create(llvm_hpvm_ocl_clearContext, ArrayRef<Value *>(LI), "",
                   CleanupCall);
  switchToTimer(hpvm_TimerID_NONE, CleanupCall);
754

Yifan Zhao's avatar
Yifan Zhao committed
755
  switchToTimer(hpvm_TimerID_MISC, RI);
756
757
758
  DEBUG(errs() << "*** Generating epilogue code for the function****\n");
  // Generate code for output bindings
  // Get Exit node
Yifan Zhao's avatar
Yifan Zhao committed
759
  DFNode *C = N->getChildGraph()->getExit();
760
  // Get OutputType of this node
Yifan Zhao's avatar
Yifan Zhao committed
761
  StructType *OutTy = N->getOutputType();
762
  Value *retVal = UndefValue::get(F_CPU->getReturnType());
763
764
  // Find the kernel's output arg map, to use instead of the bindings
  std::vector<unsigned> outArgMap = kernel->getOutArgMap();
765
  // Find all the input edges to exit node
Yifan Zhao's avatar
Yifan Zhao committed
766
  for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
767
    DEBUG(errs() << "Output Edge " << i << "\n");
768
    // Find the incoming edge at the requested input port
Yifan Zhao's avatar
Yifan Zhao committed
769
    DFEdge *E = C->getInDFEdgeAt(i);
770
771

    assert(E && "No Binding for output element!");
772
    // Find the Source DFNode associated with the incoming edge
Yifan Zhao's avatar
Yifan Zhao committed
773
    DFNode *SrcDF = E->getSourceDF();
774

Yifan Zhao's avatar
Yifan Zhao committed
775
776
    DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
                 << "\n");
777

778
779
    // If Source DFNode is a dummyNode, edge is from parent. Get the
    // argument from argument list of this internal node
Yifan Zhao's avatar
Yifan Zhao committed
780
781
    Value *inputVal;
    if (SrcDF->isEntryNode()) {
782
      inputVal = getArgumentAt(F_CPU, i);
Yifan Zhao's avatar
Yifan Zhao committed
783
784
      DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
    } else {
785
      // edge is from an internal node
786
      // Check - code should already be generated for this source dfnode
787
788
789
790
      // FIXME: Since the 2-level kernel code gen has a specific structure,
      // we can assume the SrcDF is the same as the Kernel Leaf node.
      // Use outArgMap to get correct mapping
      SrcDF = K->KernelLeafNode;
Yifan Zhao's avatar
Yifan Zhao committed
791
792
      assert(OutputMap.count(SrcDF) &&
             "Source node call not found. Dependency violation!");
793

794
      // Find Output Value associated with the Source DFNode using OutputMap
Yifan Zhao's avatar
Yifan Zhao committed
795
      Value *CI = OutputMap[SrcDF];
796
797
798

      // Extract element at source position from this call instruction
      std::vector<unsigned> IndexList;
799
800
      // i is the destination of DFEdge E
      // Use the mapping instead of the bindings
Yifan Zhao's avatar
Yifan Zhao committed
801
      //      IndexList.push_back(E->getSourcePosition());
802
      IndexList.push_back(outArgMap[i]);
Yifan Zhao's avatar
Yifan Zhao committed
803
804
      DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
      ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
805
806
      inputVal = EI;
    }
807
808
809
    std::vector<unsigned> IdxList;
    IdxList.push_back(i);
    retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
810
  }
811

812
  DEBUG(errs() << "Extracted all\n");
Yifan Zhao's avatar
Yifan Zhao committed
813
  switchToTimer(hpvm_TimerID_NONE, RI);
814
  retVal->setName("output");
815
  ReturnInst *newRI = ReturnInst::Create(F_CPU->getContext(), retVal);
816
817
  ReplaceInstWithInst(RI, newRI);
}
818

819
820
// Right now, only targeting the one level case. In general, device functions
// can return values so we don't need to change them
821
void CGT_OpenCL::codeGen(DFInternalNode *N) {
Yifan Zhao's avatar
Yifan Zhao committed
822
823
824
825
  DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName()
               << "\n");
  if (KernelLaunchNode == NULL)
    DEBUG(errs() << "No kernel launch node\n");
826
  else {
Yifan Zhao's avatar
Yifan Zhao committed
827
828
    DEBUG(errs() << "KernelLaunchNode: "
                 << KernelLaunchNode->getFuncPointer()->getName() << "\n");
829
  }
Prakalp Srivastava's avatar
Prakalp Srivastava committed
830

831
  if (!KernelLaunchNode) {
Yifan Zhao's avatar
Yifan Zhao committed
832
833
    DEBUG(errs()
          << "No code generated (host code for kernel launch complete).\n");
834
835
    return;
  }
836

837
838
  if (N == KernelLaunchNode) {
    DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
Yifan Zhao's avatar
Yifan Zhao committed
839
    // TODO
840

841
842
    // Now the remaining nodes to be visited should be ignored
    KernelLaunchNode = NULL;
843
    DEBUG(errs() << "Insert Runtime calls\n");
844
    insertRuntimeCalls(N, kernel, getPTXFilename(M));
845

846
847
  } else {
    DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
848
    // Keep track of the arguments order.
849
850
851
852
853
    std::map<unsigned, unsigned> inmap1 = N->getInArgMap();
    std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap();
    // TODO: Structure assumed: one thread node, one allocation node (at most),
    // TB node
    std::map<unsigned, unsigned> inmapFinal;