DFG2LLVM_OpenCL.cpp 96.6 KB
Newer Older
1
//=== DFG2LLVM_OpenCL.cpp ===//
kotsifa2's avatar
kotsifa2 committed
2
3
4
5
6
7
8
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
9
10
11
12
13
14
15
16
// 
// This pass is responsible for generating code for kernel code and code for 
// launching kernels for GPU target using HPVM dataflow graph. The kernels are
// generated into a separate file, which the C-Backend uses to generate
// OpenCL kernels.
//
//===----------------------------------------------------------------------===//

kotsifa2's avatar
kotsifa2 committed
17

18
#define ENABLE_ASSERTS
19
#define TARGET_PTX 64
kotsifa2's avatar
kotsifa2 committed
20
#define GENERIC_ADDRSPACE 0
21
#define GLOBAL_ADDRSPACE 1
22
#define CONSTANT_ADDRSPACE 4
23
#define SHARED_ADDRSPACE 3
24

25
#define DEBUG_TYPE "DFG2LLVM_OpenCL"
Yifan Zhao's avatar
Yifan Zhao committed
26
27
28
29
30
#include "SupportHPVM/DFG2LLVM.h"
#include "SupportHPVM/HPVMTimer.h"
#include "SupportHPVM/HPVMUtils.h"
#include "llvm-c/Core.h"
#include "llvm/IR/Attributes.h"
31
#include "llvm/IR/DataLayout.h"
kotsifa2's avatar
kotsifa2 committed
32
#include "llvm/IR/IRBuilder.h"
33
#include "llvm/IR/InstIterator.h"
Yifan Zhao's avatar
Yifan Zhao committed
34
#include "llvm/IR/Module.h"
35
36
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
Yifan Zhao's avatar
Yifan Zhao committed
37
#include "llvm/Pass.h"
38
#include "llvm/Support/FileSystem.h"
Yifan Zhao's avatar
Yifan Zhao committed
39
40
41
42
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
43
44
45

#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
46
#include "llvm/IR/UseListOrder.h"
Yifan Zhao's avatar
Yifan Zhao committed
47
#include "llvm/Support/ToolOutputFile.h"
kotsifa2's avatar
kotsifa2 committed
48

49
50
#include <sstream>

51
52
53
54
55
56
57
58
// LLVM_BUILD_DIR must be supplied by the build (compilation stops otherwise).
// It is turned into a string literal below and used to build the path of the
// hpvm-rt bitcode file in CGT_OpenCL::initRuntimeAPI().
#ifndef LLVM_BUILD_DIR
#error LLVM_BUILD_DIR is not defined
#endif

// Two-level stringification: STRINGIFY expands its argument first, then
// STR_VALUE turns the expanded tokens into a string literal.
#define STR_VALUE(X) #X
#define STRINGIFY(X) STR_VALUE(X)
// The value of LLVM_BUILD_DIR as a string literal.
#define LLVM_BUILD_DIR_STR STRINGIFY(LLVM_BUILD_DIR)

kotsifa2's avatar
kotsifa2 committed
59
60
using namespace llvm;
using namespace builddfg;
61
using namespace dfg2llvm;
Yifan Zhao's avatar
Yifan Zhao committed
62
using namespace hpvmUtils;
kotsifa2's avatar
kotsifa2 committed
63

Yifan Zhao's avatar
Yifan Zhao committed
64
// HPVM Command line option to use timer or not
65
66
// Boolean command line flag "hpvm-timers-ptx". CGT_OpenCL::init() copies its
// value into HPVMTimer.
static cl::opt<bool> HPVMTimer_OpenCL("hpvm-timers-ptx",
                                      cl::desc("Enable hpvm timers"));
67

kotsifa2's avatar
kotsifa2 committed
68
namespace {
69
70
71
72
73
74
// Helper class declarations

// Class to maintain the tuple of host pointer, device pointer and size
// in bytes. Would have preferred to use tuple but support not yet available
class OutputPtr {
public:
Yifan Zhao's avatar
Yifan Zhao committed
75
76
  OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes)
      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
77

Yifan Zhao's avatar
Yifan Zhao committed
78
79
80
  Value *h_ptr;
  Value *d_ptr;
  Value *bytes;
81
82
83
84
85
86
};

// Class to maintain important kernel info required for generating runtime
// calls
class Kernel {
public:
Yifan Zhao's avatar
Yifan Zhao committed
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  Kernel(
      Function *_KF, DFLeafNode *_KLeafNode,
      std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(),
      std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap =
          std::map<unsigned, std::pair<Value *, unsigned>>(),
      std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
      unsigned _gridDim = 0,
      std::vector<Value *> _globalWGSize = std::vector<Value *>(),
      unsigned _blockDim = 0,
      std::vector<Value *> _localWGSize = std::vector<Value *>())
      : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
        sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap),
        gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
        localWGSize(_localWGSize) {

    assert(gridDim == globalWGSize.size() &&
           "gridDim should be same as the size of vector globalWGSize");
    assert(blockDim == localWGSize.size() &&
           "blockDim should be same as the size of vector localWGSize");
106
107
  }

Yifan Zhao's avatar
Yifan Zhao committed
108
109
  Function *KernelFunction;
  DFLeafNode *KernelLeafNode;
110
111
  std::map<unsigned, unsigned> inArgMap;
  // Map for shared memory arguments
Yifan Zhao's avatar
Yifan Zhao committed
112
  std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap;
113
  // Fields for (potential) allocation node
Yifan Zhao's avatar
Yifan Zhao committed
114
115
  DFLeafNode *AllocationNode;
  Function *AllocationFunction;
Prakalp Srivastava's avatar
Prakalp Srivastava committed
116
  std::map<unsigned, unsigned> allocInArgMap;
117

118
  std::vector<unsigned> outArgMap;
119
  unsigned gridDim;
Yifan Zhao's avatar
Yifan Zhao committed
120
  std::vector<Value *> globalWGSize;
121
  unsigned blockDim;
Yifan Zhao's avatar
Yifan Zhao committed
122
  std::vector<Value *> localWGSize;
123
  std::vector<int> localDimMap;
124

Yifan Zhao's avatar
Yifan Zhao committed
125
126
  std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; }
  void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; }
127

Yifan Zhao's avatar
Yifan Zhao committed
128
  std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() {
129
130
    return sharedInArgMap;
  }
Yifan Zhao's avatar
Yifan Zhao committed
131
  void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) {
132
133
134
    sharedInArgMap = map;
  }

Yifan Zhao's avatar
Yifan Zhao committed
135
136
  std::vector<unsigned> &getOutArgMap() { return outArgMap; }
  void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; }
137

Yifan Zhao's avatar
Yifan Zhao committed
138
  void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; }
139

Yifan Zhao's avatar
Yifan Zhao committed
140
  bool hasLocalWG() const { return blockDim != 0; }
141
142
};

143
// Helper function declarations
Yifan Zhao's avatar
Yifan Zhao committed
144
145
146
147
148
149
150
151
static bool canBePromoted(Argument *arg, Function *F);
static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
                                 Kernel *, ValueToValueMapTy &, Instruction *);
static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
                              ValueToValueMapTy &, Instruction *,
                              const Twine &WGName = "WGSize");
static std::string getPTXFilename(const Module &);
static std::string getFilenameFromModule(const Module &M);
152
153
154
static void changeDataLayout(Module &);
static void changeTargetTriple(Module &);
static void findReturnInst(Function *, std::vector<ReturnInst *> &);
Yifan Zhao's avatar
Yifan Zhao committed
155
156
static void findIntrinsicInst(Function *, Intrinsic::ID,
                              std::vector<IntrinsicInst *> &);
157
158
static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
static std::string getAtomicOpName(Intrinsic::ID);
159

160
161
// DFG2LLVM_OpenCL - The first implementation.
struct DFG2LLVM_OpenCL : public DFG2LLVM {
162
  static char ID; // Pass identification, replacement for typeid
163
  DFG2LLVM_OpenCL() : DFG2LLVM(ID) {}
164
165
166
167
168
169
170

private:
public:
  bool runOnModule(Module &M);
};

// Visitor for Code generation traversal (tree traversal for now)
171
class CGT_OpenCL : public CodeGenTraversal {
172
173

private:
Yifan Zhao's avatar
Yifan Zhao committed
174
  // Member variables
175
  std::unique_ptr<Module> KernelM;
Yifan Zhao's avatar
Yifan Zhao committed
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
  DFNode *KernelLaunchNode = NULL;
  Kernel *kernel;

  // HPVM Runtime API
  FunctionCallee llvm_hpvm_ocl_launch;
  FunctionCallee llvm_hpvm_ocl_wait;
  FunctionCallee llvm_hpvm_ocl_initContext;
  FunctionCallee llvm_hpvm_ocl_clearContext;
  FunctionCallee llvm_hpvm_ocl_argument_shared;
  FunctionCallee llvm_hpvm_ocl_argument_scalar;
  FunctionCallee llvm_hpvm_ocl_argument_ptr;
  FunctionCallee llvm_hpvm_ocl_output_ptr;
  FunctionCallee llvm_hpvm_ocl_free;
  FunctionCallee llvm_hpvm_ocl_getOutput;
  FunctionCallee llvm_hpvm_ocl_executeNode;

  // Functions
193
  std::string getKernelsModuleName(Module &M);
Yifan Zhao's avatar
Yifan Zhao committed
194
195
196
197
198
199
200
201
  void fixValueAddrspace(Value *V, unsigned addrspace);
  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *,
                                                  Function *);
  Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags,
                               unsigned i);
  void addCLMetadata(Function *F);
  Function *transformFunctionToVoid(Function *F);
  void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName);
202

203
204
  // Virtual Functions
  void init() {
205
206
    HPVMTimer = HPVMTimer_OpenCL;
    TargetName = "OpenCL";
207
208
  }
  void initRuntimeAPI();
Yifan Zhao's avatar
Yifan Zhao committed
209
210
  void codeGen(DFInternalNode *N);
  void codeGen(DFLeafNode *N);
211
212
213

public:
  // Constructor
214
  CGT_OpenCL(Module &_M, BuildDFG &_DFG)
Yifan Zhao's avatar
Yifan Zhao committed
215
      : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
216
    init();
217
    initRuntimeAPI();
Yifan Zhao's avatar
Yifan Zhao committed
218
219
    DEBUG(errs() << "Old module pointer: " << &_M << "\n");
    DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n");
220

Yifan Zhao's avatar
Yifan Zhao committed
221
222
223
    // Copying instead of creating new, in order to preserve required info
    // (metadata) Remove functions, global variables and aliases
    std::vector<GlobalVariable *> GVVect;
224
    for (Module::global_iterator mi = KernelM->global_begin(),
Yifan Zhao's avatar
Yifan Zhao committed
225
226
227
                                 me = KernelM->global_end();
         (mi != me); ++mi) {
      GlobalVariable *GV = &*mi;
228
      GVVect.push_back(GV);
229
    }
230
231
232
    for (auto *GV : GVVect) {
      GV->replaceAllUsesWith(UndefValue::get(GV->getType()));
      GV->eraseFromParent();
233
234
    }

Yifan Zhao's avatar
Yifan Zhao committed
235
236
237
238
    std::vector<Function *> FuncVect;
    for (Module::iterator mi = KernelM->begin(), me = KernelM->end();
         (mi != me); ++mi) {
      Function *F = &*mi;
239
      FuncVect.push_back(F);
240
    }
241
242
243
    for (auto *F : FuncVect) {
      F->replaceAllUsesWith(UndefValue::get(F->getType()));
      F->eraseFromParent();
244
    }
kotsifa2's avatar
kotsifa2 committed
245

Yifan Zhao's avatar
Yifan Zhao committed
246
    std::vector<GlobalAlias *> GAVect;
247
    for (Module::alias_iterator mi = KernelM->alias_begin(),
Yifan Zhao's avatar
Yifan Zhao committed
248
249
250
                                me = KernelM->alias_end();
         (mi != me); ++mi) {
      GlobalAlias *GA = &*mi;
251
      GAVect.push_back(GA);
252
    }
253
254
255
    for (auto *GA : GAVect) {
      GA->replaceAllUsesWith(UndefValue::get(GA->getType()));
      GA->eraseFromParent();
256
    }
kotsifa2's avatar
kotsifa2 committed
257

258
259
    changeDataLayout(*KernelM);
    changeTargetTriple(*KernelM);
kotsifa2's avatar
kotsifa2 committed
260

261
    DEBUG(errs() << *KernelM);
262
  }
263

264
  void writeKernelsModule();
265
266
};

Yifan Zhao's avatar
Yifan Zhao committed
267
// Initialize the HPVM runtime API. This makes it easier to insert these calls
268
void CGT_OpenCL::initRuntimeAPI() {
269
270
271

  // Load Runtime API Module
  SMDiagnostic Err;
272

273
274
  std::string runtimeAPI = std::string(LLVM_BUILD_DIR_STR) +
                           "/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
275

276
  runtimeModule = parseIRFile(runtimeAPI, Err, M.getContext());
Yifan Zhao's avatar
Yifan Zhao committed
277
  if (runtimeModule == nullptr) {
278
279
    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
    assert(false && "couldn't parse runtime");
Yifan Zhao's avatar
Yifan Zhao committed
280
281
  } else
    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
282
283

  // Get or insert the global declarations for launch/wait functions
Yifan Zhao's avatar
Yifan Zhao committed
284
285
286
287
288
289
290
291
292
293
294
  DECLARE(llvm_hpvm_ocl_launch);
  DECLARE(llvm_hpvm_ocl_wait);
  DECLARE(llvm_hpvm_ocl_initContext);
  DECLARE(llvm_hpvm_ocl_clearContext);
  DECLARE(llvm_hpvm_ocl_argument_shared);
  DECLARE(llvm_hpvm_ocl_argument_scalar);
  DECLARE(llvm_hpvm_ocl_argument_ptr);
  DECLARE(llvm_hpvm_ocl_output_ptr);
  DECLARE(llvm_hpvm_ocl_free);
  DECLARE(llvm_hpvm_ocl_getOutput);
  DECLARE(llvm_hpvm_ocl_executeNode);
295
296
297

  // Get or insert timerAPI functions as well if you plan to use timers
  initTimerAPI();
298
299

  // Insert init context in main
300
  DEBUG(errs() << "Gen Code to initialize OpenCL Timer\n");
Yifan Zhao's avatar
Yifan Zhao committed
301
302
  Function *VI = M.getFunction("llvm.hpvm.init");
  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
303

304
  InitCall = cast<Instruction>(*VI->user_begin());
305
  initializeTimerSet(InitCall);
Yifan Zhao's avatar
Yifan Zhao committed
306
307
308
309
310
  switchToTimer(hpvm_TimerID_INIT_CTX, InitCall);
  CallInst::Create(llvm_hpvm_ocl_initContext,
                   ArrayRef<Value *>(getTargetID(M, hpvm::GPU_TARGET)), "",
                   InitCall);
  switchToTimer(hpvm_TimerID_NONE, InitCall);
311

Yifan Zhao's avatar
Yifan Zhao committed
312
  // Insert print instruction at hpvm exit
313
  DEBUG(errs() << "Gen Code to print OpenCL Timer\n");
Yifan Zhao's avatar
Yifan Zhao committed
314
  Function *VC = M.getFunction("llvm.hpvm.cleanup");
315
  DEBUG(errs() << *VC << "\n");
Yifan Zhao's avatar
Yifan Zhao committed
316
  assert(VC->getNumUses() == 1 && "__hpvm__clear should only be used once");
317

318
  CleanupCall = cast<Instruction>(*VC->user_begin());
319
  printTimerSet(CleanupCall);
320
321
322
323
324
325
326
}

// Generate Code to call the kernel
// The plan is to replace the internal node with a leaf node. This method is
// used to generate a function to associate with this leaf node. The function
// is responsible for all the memory allocation/transfer and invoking the
// kernel call on the device
327
328
void CGT_OpenCL::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
                                    const Twine &FileName) {
329
330
  // Check if clone already exists. If it does, it means we have visited this
  // function before.
Yifan Zhao's avatar
Yifan Zhao committed
331
  //  assert(N->getGenFunc() == NULL && "Code already generated for this node");
332

Yifan Zhao's avatar
Yifan Zhao committed
333
  assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) == NULL &&
334
         "Code already generated for this node");
335
336

  // Useful values
Yifan Zhao's avatar
Yifan Zhao committed
337
338
  Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
  Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
339
340

  // If kernel struct has not been initialized with kernel function, then fail
341
  assert(K != NULL && "No kernel found!!");
342
343
344

  DEBUG(errs() << "Generating kernel call code\n");

Yifan Zhao's avatar
Yifan Zhao committed
345
  Function *F = N->getFuncPointer();
346
347
348

  // Create a clone of F with no instructions. Only the type is the same as F
  // without the extra arguments.
349
  Function *F_CPU;
350
351
352
353
354
355

  // Clone the function, if we are seeing this function for the first time. We
  // only need a clone in terms of type.
  ValueToValueMapTy VMap;

  // Create new function with the same type
356
  F_CPU =
Yifan Zhao's avatar
Yifan Zhao committed
357
      Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
358
359

  // Loop over the arguments, copying the names of arguments over.
360
  Function::arg_iterator dest_iterator = F_CPU->arg_begin();
361
362
363
  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
       i != e; ++i) {
    dest_iterator->setName(i->getName()); // Copy the name over...
kotsifa2's avatar
kotsifa2 committed
364
    // Increment dest iterator
kotsifa2's avatar
kotsifa2 committed
365
    ++dest_iterator;
366
  }
367

368
  // Add a basic block to this empty function
369
  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_CPU);
Yifan Zhao's avatar
Yifan Zhao committed
370
  ReturnInst *RI = ReturnInst::Create(
371
      M.getContext(), UndefValue::get(F_CPU->getReturnType()), BB);
372

373
  // FIXME: Adding Index and Dim arguments are probably not required except
374
  // for consistency purpose (DFG2LLVM_CPU does assume that all leaf nodes do
375
  // have those arguments)
376

377
  // Add Index and Dim arguments except for the root node
Yifan Zhao's avatar
Yifan Zhao committed
378
  if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
379
    F_CPU = addIdxDimArgs(F_CPU);
kotsifa2's avatar
kotsifa2 committed
380

381
  BB = &*F_CPU->begin();
kotsifa2's avatar
kotsifa2 committed
382
383
  RI = cast<ReturnInst>(BB->getTerminator());

Yifan Zhao's avatar
Yifan Zhao committed
384
  // Add the generated function info to DFNode
385
386
387
  //  N->setGenFunc(F_CPU, hpvm::CPU_TARGET);
  N->addGenFunc(F_CPU, hpvm::GPU_TARGET, true);
  DEBUG(errs() << "Added GPUGenFunc: " << F_CPU->getName() << " for node "
Yifan Zhao's avatar
Yifan Zhao committed
388
               << N->getFuncPointer()->getName() << "\n");
kotsifa2's avatar
kotsifa2 committed
389
390

  // Loop over the arguments, to create the VMap
391
  dest_iterator = F_CPU->arg_begin();
kotsifa2's avatar
kotsifa2 committed
392
393
394
395
396
397
  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
       i != e; ++i) {
    // Add mapping to VMap and increment dest iterator
    VMap[&*i] = &*dest_iterator;
    ++dest_iterator;
  }
398

399
400
  /* TODO: Use this code to verify if this is a good pattern for PTX kernel

401
402
  // Sort children in topological order before code generation for kernel call
  N->getChildGraph()->sortChildren();
403

404
405
406
407
  // The DFNode N has the property that it has only one child (leaving Entry
  // and Exit dummy nodes). This child is the PTX kernel. This simplifies code
  // generation for kernel calls significantly. All the inputs to this child
  // node would either be constants or from the parent node N.
408

409
410
  assert(N->getChildGraph()->size() == 3
         && "Node expected to have just one non-dummy node!");
411

412
413
414
415
416
417
418
  DFNode* C;
  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
    C = *ci;
    // Skip dummy node call
    if (!C->isDummyNode())
      break;
419
420
  }

Yifan Zhao's avatar
Yifan Zhao committed
421
422
  assert(C->isDummyNode() == false && "Internal Node only contains dummy
  nodes!");
423
424

  Function* CF = C->getFuncPointer();
425
  */
Yifan Zhao's avatar
Yifan Zhao committed
426
  Function *KF = K->KernelLeafNode->getFuncPointer();
427
  // Initialize context
Yifan Zhao's avatar
Yifan Zhao committed
428
429
  // DEBUG(errs() << "Initializing context" << "\n");
  // CallInst::Create(llvm_hpvm_ocl_initContext, None, "", RI);
430

Yifan Zhao's avatar
Yifan Zhao committed
431
432
  DEBUG(errs() << "Initializing commandQ"
               << "\n");
433
  // Initialize command queue
Yifan Zhao's avatar
Yifan Zhao committed
434
435
  switchToTimer(hpvm_TimerID_SETUP, InitCall);
  Value *fileStr = getStringPointer(FileName, InitCall, "Filename");
436
  DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
Yifan Zhao's avatar
Yifan Zhao committed
437
438
439
440
441
442
443
444
445
  DEBUG(errs() << "Generating code for kernel - "
               << K->KernelFunction->getName() << "\n");
  Value *kernelStr =
      getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName");

  Value *LaunchInstArgs[] = {fileStr, kernelStr};

  DEBUG(errs() << "Inserting launch call"
               << "\n");
446
447
448
449
450
451
452
453
  CallInst *OpenCL_Ctx = CallInst::Create(llvm_hpvm_ocl_launch,
                                          ArrayRef<Value *>(LaunchInstArgs, 2),
                                          "graph" + KF->getName(), InitCall);
  DEBUG(errs() << *OpenCL_Ctx << "\n");
  GraphIDAddr = new GlobalVariable(
      M, OpenCL_Ctx->getType(), false, GlobalValue::CommonLinkage,
      Constant::getNullValue(OpenCL_Ctx->getType()),
      "graph" + KF->getName() + ".addr");
454
  DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
455
  StoreInst *SI = new StoreInst(OpenCL_Ctx, GraphIDAddr, InitCall);
456
  DEBUG(errs() << *SI << "\n");
Yifan Zhao's avatar
Yifan Zhao committed
457
458
459
  switchToTimer(hpvm_TimerID_NONE, InitCall);
  switchToTimer(hpvm_TimerID_SETUP, RI);
  Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI);
460

Yifan Zhao's avatar
Yifan Zhao committed
461
  // Iterate over the required input edges of the node and use the hpvm-rt API
462
  // to set inputs
Yifan Zhao's avatar
Yifan Zhao committed
463
  DEBUG(errs() << "Iterate over input edges of node and insert hpvm api\n");
464
  std::vector<OutputPtr> OutputPointers;
Yifan Zhao's avatar
Yifan Zhao committed
465
466
467
  // Vector to hold the device memory object that need to be cleared before we
  // release context
  std::vector<Value *> DevicePointers;
468

469
  std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap();
Prakalp Srivastava's avatar
Prakalp Srivastava committed
470
471
  /*
    for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
472

Prakalp Srivastava's avatar
Prakalp Srivastava committed
473
      // The kernel object gives us the mapping of arguments from kernel launch
474
475
      // node function (F_CPU) to kernel (kernel->KF)
      Value* inputVal = getArgumentAt(F_CPU, K->getInArgMap()[i]);
476

Prakalp Srivastava's avatar
Prakalp Srivastava committed
477
  */
478

Yifan Zhao's avatar
Yifan Zhao committed
479
  for (auto &InArgMapPair : kernelInArgMap) {
480
    unsigned i = InArgMapPair.first;
481
    Value *inputVal = getArgumentAt(F_CPU, InArgMapPair.second);
Yifan Zhao's avatar
Yifan Zhao committed
482
    DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n");
483

484
485
486
487
488
    // input value has been obtained.
    // Check if input is a scalar value or a pointer operand
    // For scalar values such as int, float, etc. the size is simply the size of
    // type on target machine, but for pointers, the size of data would be the
    // next integer argument
Yifan Zhao's avatar
Yifan Zhao committed
489
    if (inputVal->getType()->isPointerTy()) {
490

Yifan Zhao's avatar
Yifan Zhao committed
491
      switchToTimer(hpvm_TimerID_COPY_PTR, RI);
492
      // Pointer Input
493
      // CheckAttribute
Yifan Zhao's avatar
Yifan Zhao committed
494
495
496
497
498
499
500
501
      Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False;
      Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) &&
                        !(hasAttribute(KF, i, Attribute::In)))
                           ? False
                           : True;

      Argument *A = getArgumentAt(KF, i);
      if (isOutput == True) {
502
        DEBUG(errs() << *A << " is an OUTPUT argument\n");
503
      }
Yifan Zhao's avatar
Yifan Zhao committed
504
      if (isInput == True) {
505
        DEBUG(errs() << *A << " is an INPUT argument\n");
506
507
      }

Yifan Zhao's avatar
Yifan Zhao committed
508
509
510
      Value *inputValI8Ptr = CastInst::CreatePointerCast(
          inputVal, Type::getInt8PtrTy(M.getContext()),
          inputVal->getName() + ".i8ptr", RI);
511
512

      // Assert that the pointer argument size (next argument) is in the map
Yifan Zhao's avatar
Yifan Zhao committed
513
514
      assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end());

515
      Value *inputSize = getArgumentAt(F_CPU, kernelInArgMap[i + 1]);
Yifan Zhao's avatar
Yifan Zhao committed
516
517
518
519
520
521
522
523
524
525
526
527
528
      assert(
          inputSize->getType() == Type::getInt64Ty(M.getContext()) &&
          "Pointer type input must always be followed by size (integer type)");
      Value *setInputArgs[] = {
          GraphID,
          inputValI8Ptr,
          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
          inputSize,
          isInput,
          isOutput};
      Value *d_ptr =
          CallInst::Create(llvm_hpvm_ocl_argument_ptr,
                           ArrayRef<Value *>(setInputArgs, 6), "", RI);
529
      DevicePointers.push_back(d_ptr);
530
531
      // If this has out attribute, store the returned device pointer in
      // memory to read device memory later
Yifan Zhao's avatar
Yifan Zhao committed
532
533
534
535
      if (isOutput == True)
        OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
    } else {
      switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
536
      // Scalar Input
537
538
      // Store the scalar value on stack and then pass the pointer to its
      // location
Yifan Zhao's avatar
Yifan Zhao committed
539
540
541
542
543
544
545
546
547
548
549
550
551
552
      AllocaInst *inputValPtr = new AllocaInst(
          inputVal->getType(), 0, inputVal->getName() + ".ptr", RI);
      StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI);

      Value *inputValI8Ptr = CastInst::CreatePointerCast(
          inputValPtr, Type::getInt8PtrTy(M.getContext()),
          inputVal->getName() + ".i8ptr", RI);

      Value *setInputArgs[] = {
          GraphID, inputValI8Ptr,
          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
          ConstantExpr::getSizeOf(inputVal->getType())};
      CallInst::Create(llvm_hpvm_ocl_argument_scalar,
                       ArrayRef<Value *>(setInputArgs, 4), "", RI);
553
    }
554
555
  }

Yifan Zhao's avatar
Yifan Zhao committed
556
557
  DEBUG(
      errs() << "Setup shared memory arguments of node and insert hpvm api\n");
558

559
560
  // Check to see if all the allocation sizes are constant (determined
  // statically)
561
  bool constSizes = true;
Yifan Zhao's avatar
Yifan Zhao committed
562
  for (auto &e : K->getSharedInArgMap()) {
563
    constSizes &= isa<Constant>(e.second.first);
564
  }
565

566
  // If the sizes are all constant
567
  if (constSizes) {
Yifan Zhao's avatar
Yifan Zhao committed
568
    for (auto &e : K->getSharedInArgMap()) {
569
      unsigned argNum = e.first;
Yifan Zhao's avatar
Yifan Zhao committed
570
      Value *allocSize = e.second.first;
571

Yifan Zhao's avatar
Yifan Zhao committed
572
573
      DEBUG(errs() << "\tLocal Memory at " << argNum
                   << ", size = " << *allocSize << "\n");
574

575
576
      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
        // Shared memory ptr argument - scalar at size position
Yifan Zhao's avatar
Yifan Zhao committed
577
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
578

Yifan Zhao's avatar
Yifan Zhao committed
579
580
        assert(isa<Constant>(allocSize) &&
               "Constant shared memory size is expected");
581

Yifan Zhao's avatar
Yifan Zhao committed
582
583
584
585
586
587
        Value *setInputArgs[] = {
            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            allocSize};
        CallInst::Create(llvm_hpvm_ocl_argument_shared,
                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
      } else {
588
        // Shared memory size argument - scalar at address position
Yifan Zhao's avatar
Yifan Zhao committed
589
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
590
591
        // Store the scalar value on stack and then pass the pointer to its
        // location
Yifan Zhao's avatar
Yifan Zhao committed
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
        AllocaInst *allocSizePtr =
            new AllocaInst(allocSize->getType(), 0,
                           allocSize->getName() + ".sharedMem.ptr", RI);
        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);

        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
            allocSize->getName() + ".sharedMem.i8ptr", RI);

        Value *setInputArgs[] = {
            GraphID, allocSizeI8Ptr,
            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            ConstantExpr::getSizeOf(allocSize->getType())};
        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
607
608
609
      }
    }
  } else {
610

611
612
613
    Function *F_alloc = K->AllocationFunction;
    StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
    assert(FAllocRetTy && "Allocation node with no struct return type");
614

615
616
    std::vector<Value *> AllocInputArgs;
    for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
617
      AllocInputArgs.push_back(getArgumentAt(F_CPU, K->allocInArgMap.at(i)));
618
    }
619

620
621
622
623
624
625
    CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
    std::vector<ExtractValueInst *> ExtractValueInstVec;
    for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
      ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
      ExtractValueInstVec.push_back(EI);
    }
626

Yifan Zhao's avatar
Yifan Zhao committed
627
    for (auto &e : K->getSharedInArgMap()) {
628
      unsigned argNum = e.first;
Yifan Zhao's avatar
Yifan Zhao committed
629
      Value *allocSize = ExtractValueInstVec[e.second.second / 2];
630

Yifan Zhao's avatar
Yifan Zhao committed
631
632
      DEBUG(errs() << "\tLocal Memory at " << argNum
                   << ", size = " << *allocSize << "\n");
633

634
635
      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
        // Shared memory ptr argument - scalar at size position
Yifan Zhao's avatar
Yifan Zhao committed
636
637
638
639
640
641
642
643
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);

        Value *setInputArgs[] = {
            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            allocSize};
        CallInst::Create(llvm_hpvm_ocl_argument_shared,
                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
      } else {
644
        // Shared memory size argument - scalar at address position
Yifan Zhao's avatar
Yifan Zhao committed
645
        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
646
647
        // Store the scalar value on stack and then pass the pointer to its
        // location
Yifan Zhao's avatar
Yifan Zhao committed
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
        AllocaInst *allocSizePtr =
            new AllocaInst(allocSize->getType(), 0,
                           allocSize->getName() + ".sharedMem.ptr", RI);
        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);

        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
            allocSize->getName() + ".sharedMem.i8ptr", RI);

        Value *setInputArgs[] = {
            GraphID, allocSizeI8Ptr,
            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
            ConstantExpr::getSizeOf(allocSize->getType())};
        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
663
      }
664
665
    }
  }
666

Yifan Zhao's avatar
Yifan Zhao committed
667
  DEBUG(errs() << "Setup output edges of node and insert hpvm api\n");
668
  // Set output if struct is not an empty struct
Yifan Zhao's avatar
Yifan Zhao committed
669
670
671
672
  StructType *OutputTy = K->KernelLeafNode->getOutputType();
  std::vector<Value *> d_Outputs;
  if (!OutputTy->isEmptyTy()) {
    switchToTimer(hpvm_TimerID_COPY_PTR, RI);
673
    // Not an empty struct
674
    // Iterate over all elements of the struct and put them in
Yifan Zhao's avatar
Yifan Zhao committed
675
676
677
678
679
680
681
682
683
684
    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
      unsigned outputIndex = KF->getFunctionType()->getNumParams() + i;
      Value *setOutputArgs[] = {
          GraphID,
          ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex),
          ConstantExpr::getSizeOf(OutputTy->getElementType(i))};

      CallInst *d_Output = CallInst::Create(llvm_hpvm_ocl_output_ptr,
                                            ArrayRef<Value *>(setOutputArgs, 3),
                                            "d_output." + KF->getName(), RI);
685
686
      d_Outputs.push_back(d_Output);
    }
687
  }
688

689
690
691
692
693
  // Enqueue kernel
  // Need work dim, localworksize, globalworksize
  // Allocate size_t[numDims] space on stack. Store the work group sizes and
  // pass it as an argument to ExecNode

Yifan Zhao's avatar
Yifan Zhao committed
694
  switchToTimer(hpvm_TimerID_MISC, RI);
695
  Value *workDim, *LocalWGPtr, *GlobalWGPtr;
kotsifa2's avatar
kotsifa2 committed
696
  getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
Yifan Zhao's avatar
Yifan Zhao committed
697
698
699
700
701
  switchToTimer(hpvm_TimerID_KERNEL, RI);
  Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr};
  CallInst *Event = CallInst::Create(llvm_hpvm_ocl_executeNode,
                                     ArrayRef<Value *>(ExecNodeArgs, 4),
                                     "event." + KF->getName(), RI);
702
  DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
703

704
  // Wait for Kernel to Finish
Yifan Zhao's avatar
Yifan Zhao committed
705
  CallInst::Create(llvm_hpvm_ocl_wait, ArrayRef<Value *>(GraphID), "", RI);
706

Yifan Zhao's avatar
Yifan Zhao committed
707
  switchToTimer(hpvm_TimerID_READ_OUTPUT, RI);
708
  // Read Output Struct if not empty
Yifan Zhao's avatar
Yifan Zhao committed
709
710
711
712
713
714
715
716
717
718
  if (!OutputTy->isEmptyTy()) {
    std::vector<Value *> h_Outputs;
    Value *KernelOutput = UndefValue::get(OutputTy);
    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
      Value *GetOutputArgs[] = {
          GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
          d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
      CallInst *h_Output = CallInst::Create(
          llvm_hpvm_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4),
          "h_output." + KF->getName() + ".addr", RI);
719
720
      // Read each device pointer listed in output struct
      // Load the output struct
Yifan Zhao's avatar
Yifan Zhao committed
721
722
723
724
725
726
727
728
      CastInst *BI = BitCastInst::CreatePointerCast(
          h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr",
          RI);

      Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI);
      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement,
                                             ArrayRef<unsigned>(i),
                                             KF->getName() + "output", RI);
729
    }
730
    OutputMap[K->KernelLeafNode] = KernelOutput;
731
  }
Prakalp Srivastava's avatar
Prakalp Srivastava committed
732

733
734
  // Read all the pointer arguments which had side effects i.e., had out
  // attribute
735
  DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n");
736
737
738
  // FIXME: Not reading output pointers anymore as we read them when data is
  // actually requested
  /*for(auto output: OutputPointers) {
739
740
741
    DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
    DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
    DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
742

Yifan Zhao's avatar
Yifan Zhao committed
743
744
    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr,
  output.bytes}; CallInst* CI = CallInst::Create(llvm_hpvm_ocl_getOutput,
745
746
                                    ArrayRef<Value*>(GetOutputArgs, 4),
                                    "", RI);
747
  }*/
Yifan Zhao's avatar
Yifan Zhao committed
748
  switchToTimer(hpvm_TimerID_MEM_FREE, RI);
749
  // Clear Context and free device memory
Yifan Zhao's avatar
Yifan Zhao committed
750
751
  DEBUG(errs() << "Clearing context"
               << "\n");
752
  // Free Device Memory
Yifan Zhao's avatar
Yifan Zhao committed
753
754
  for (auto d_ptr : DevicePointers) {
    CallInst::Create(llvm_hpvm_ocl_free, ArrayRef<Value *>(d_ptr), "", RI);
755
  }
Yifan Zhao's avatar
Yifan Zhao committed
756
  switchToTimer(hpvm_TimerID_CLEAR_CTX, CleanupCall);
757
  // Clear Context
Yifan Zhao's avatar
Yifan Zhao committed
758
759
760
761
  LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall);
  CallInst::Create(llvm_hpvm_ocl_clearContext, ArrayRef<Value *>(LI), "",
                   CleanupCall);
  switchToTimer(hpvm_TimerID_NONE, CleanupCall);
762

Yifan Zhao's avatar
Yifan Zhao committed
763
  switchToTimer(hpvm_TimerID_MISC, RI);
764
765
766
  DEBUG(errs() << "*** Generating epilogue code for the function****\n");
  // Generate code for output bindings
  // Get Exit node
Yifan Zhao's avatar
Yifan Zhao committed
767
  DFNode *C = N->getChildGraph()->getExit();
768
  // Get OutputType of this node
Yifan Zhao's avatar
Yifan Zhao committed
769
  StructType *OutTy = N->getOutputType();
770
  Value *retVal = UndefValue::get(F_CPU->getReturnType());
771
772
  // Find the kernel's output arg map, to use instead of the bindings
  std::vector<unsigned> outArgMap = kernel->getOutArgMap();
773
  // Find all the input edges to exit node
Yifan Zhao's avatar
Yifan Zhao committed
774
  for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
775
    DEBUG(errs() << "Output Edge " << i << "\n");
776
    // Find the incoming edge at the requested input port
Yifan Zhao's avatar
Yifan Zhao committed
777
    DFEdge *E = C->getInDFEdgeAt(i);
778
779

    assert(E && "No Binding for output element!");
780
    // Find the Source DFNode associated with the incoming edge
Yifan Zhao's avatar
Yifan Zhao committed
781
    DFNode *SrcDF = E->getSourceDF();
782

Yifan Zhao's avatar
Yifan Zhao committed
783
784
    DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
                 << "\n");
785

786
787
    // If Source DFNode is a dummyNode, edge is from parent. Get the
    // argument from argument list of this internal node
Yifan Zhao's avatar
Yifan Zhao committed
788
789
    Value *inputVal;
    if (SrcDF->isEntryNode()) {
790
      inputVal = getArgumentAt(F_CPU, i);
Yifan Zhao's avatar
Yifan Zhao committed
791
792
      DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
    } else {
793
      // edge is from an internal node
794
      // Check - code should already be generated for this source dfnode
795
796
797
798
      // FIXME: Since the 2-level kernel code gen has a specific structure, we
      // can assume the SrcDF is same as Kernel Leaf node.
      // Use outArgMap to get correct mapping
      SrcDF = K->KernelLeafNode;
Yifan Zhao's avatar
Yifan Zhao committed
799
800
      assert(OutputMap.count(SrcDF) &&
             "Source node call not found. Dependency violation!");
801

802
      // Find Output Value associated with the Source DFNode using OutputMap
Yifan Zhao's avatar
Yifan Zhao committed
803
      Value *CI = OutputMap[SrcDF];
804
805
806

      // Extract element at source position from this call instruction
      std::vector<unsigned> IndexList;
807
808
      // i is the destination of DFEdge E
      // Use the mapping instead of the bindings
Yifan Zhao's avatar
Yifan Zhao committed
809
      //      IndexList.push_back(E->getSourcePosition());
810
      IndexList.push_back(outArgMap[i]);
Yifan Zhao's avatar
Yifan Zhao committed
811
812
      DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
      ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
813
814
      inputVal = EI;
    }
815
816
817
    std::vector<unsigned> IdxList;
    IdxList.push_back(i);
    retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
818
  }
819

820
  DEBUG(errs() << "Extracted all\n");
Yifan Zhao's avatar
Yifan Zhao committed
821
  switchToTimer(hpvm_TimerID_NONE, RI);
822
  retVal->setName("output");
823
  ReturnInst *newRI = ReturnInst::Create(F_CPU->getContext(), retVal);
824
825
  ReplaceInstWithInst(RI, newRI);
}
826

827
828
// Right now, only the one-level case is targeted. In general, device functions
// can return values, so we don't need to change them.
829
void CGT_OpenCL::codeGen(DFInternalNode *N) {
Yifan Zhao's avatar
Yifan Zhao committed
830
831
832
833
  DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName()
               << "\n");
  if (KernelLaunchNode == NULL)
    DEBUG(errs() << "No kernel launch node\n");
834
  else {
Yifan Zhao's avatar
Yifan Zhao committed
835
836
    DEBUG(errs() << "KernelLaunchNode: "
                 << KernelLaunchNode->getFuncPointer()->getName() << "\n");
837
  }
Prakalp Srivastava's avatar
Prakalp Srivastava committed
838

839
  if (!KernelLaunchNode) {
Yifan Zhao's avatar
Yifan Zhao committed
840
841
    DEBUG(errs()
          << "No code generated (host code for kernel launch complete).\n");
842
843
    return;
  }
844

845
846
  if (N == KernelLaunchNode) {
    DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
Yifan Zhao's avatar
Yifan Zhao committed
847
    // TODO
848

849
850
    // Now the remaining nodes to be visited should be ignored
    KernelLaunchNode = NULL;
851
    DEBUG(errs() << "Insert Runtime calls\n");
852
    insertRuntimeCalls(N, kernel, getPTXFilename(M));
853

854
855
  } else {
    DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
856
    // Keep track of the arguments order.