diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index a37d9c152504c9f4271daceff81ff5be83ec292c..48d65ec3b4113146ac4e79be681d4e1fc06c221a 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -409,17 +409,40 @@ namespace {
                      ArrayRef<Value*>(GraphID),
                      "",
                      RI);
-    // Read Output
+    // Read Output Struct
     Value* GetOutputArgs[] = {GraphID,
                               d_Output,
                               outputSize};
     CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput,
                                           ArrayRef<Value*>(GetOutputArgs, 3),
-                                          "h_output."+CF->getName(),
+                                          "h_output."+CF->getName()+".addr",
                                           RI);
-    // Prepare output
+    // Read back each device pointer listed in the output struct
+    // Load the output struct
     CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType(), "output.ptr", RI);
-    LoadInst* KernelOutput = new LoadInst(BI, "output."+CF->getName(), RI);
+    Value* KernelOutput = new LoadInst(BI, "", RI);
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      Type* elemTy = OutputTy->getElementType(i);
+      if(elemTy->isPointerTy()) {
+        // Pointer type
+        assert(OutputTy->getElementType(i+1)->isIntegerTy()
+            && "Every Pointer type must be followed by an integer");
+        ExtractValueInst* d_ptr = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i), "", RI);
+        ExtractValueInst* len = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i+1), "", RI);
+        // Call llvm_visc_ptx_getOutput to read this buffer back from the device
+        Value* GetOutputArgs[] = {GraphID,
+                                  d_ptr,
+                                  len};
+        CallInst* h_ptr = CallInst::Create(llvm_visc_ptx_getOutput,
+                                              ArrayRef<Value*>(GetOutputArgs, 3),
+                                              "",
+                                              RI);
+        KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI);
+
+      }
+    }
+    // Prepare output
+    KernelOutput->setName("output."+CF->getName());
     OutputMap[C] = KernelOutput;
 
     DEBUG(errs() << "*** Generating epilogue code for the function****\n");
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index d7e3cd530ccd032057a1fa3602562103883fa24f..0a5eab8a87d548747c0ceb9352c044b63e91e1b3 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -15,11 +15,6 @@ typedef struct {
   cl_kernel clKernel;
 } DFNodeContext_PTX;
 
-typedef struct {
-  cl_mem d_elem;
-  size_t size;
-} OutputTy;
-
 cl_context globalGPUContext;
 
 static inline void checkErr(cl_int err, cl_int success, const char * name) {
@@ -123,18 +118,6 @@ void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
   cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
                                 h_output, 0, NULL, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to read output");
-  // Assuming all output is in the format of device pointer followed by size of
-  // output size format
-  OutputTy* Output = (OutputTy*) h_output;
-  unsigned numElems = size/sizeof(OutputTy);
-  for(unsigned i = 0; i < numElems; i++) {
-    OutputTy& outputElem = Output[i];
-    void* h_outputElem = malloc(outputElem.size);
-    errcode = clEnqueueReadBuffer(Context->clCommandQue, outputElem.d_elem, CL_TRUE, 0,
-                                  outputElem.size, h_outputElem, 0, NULL, NULL);
-    checkErr(errcode, CL_SUCCESS, "Failure to read output");
-    Output[i].d_elem = (cl_mem) h_outputElem;
-  }
   return h_output;
 }