diff --git a/hpvm/test/unitTests/ThreeLevel.ll b/hpvm/test/unitTests/ThreeLevel.ll
index d8bf050234264e55be6af269e40ab5f2ef36a03b..460dd15b6b1f6dd38483a18e899f0d96b68cac08 100644
--- a/hpvm/test/unitTests/ThreeLevel.ll
+++ b/hpvm/test/unitTests/ThreeLevel.ll
@@ -1,70 +1,67 @@
-; RUN: opt - load LLVMGenHPVM.so -S -genhpvm < %s
-; ModuleID = 'TwoLevel.c'
-source_filename = "TwoLevel.c"
+; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_CPU.so -load LLVMClearDFG.so -S -dfg2llvm-cpu -clearDFG < %s | FileCheck %s
+; ModuleID = 'ThreeLevel.ll'
+source_filename = "ThreeLevel.c"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 %struct.Root = type { i32*, i64, i32*, i64, i32*, i64 }
+%struct.out.Func1 = type <{ i32* }>
+%struct.out.Func3 = type <{ i32* }>
+%struct.out.Func2 = type <{ i32* }>
+%struct.out.PipeRoot = type <{ i32* }>
 
-; Function Attrs: nounwind uwtable
-define dso_local void @Func1(i32* %In, i64 %Insize, i32* %Out, i64 %Outsize) #0 {
-entry:
-  tail call void @__hpvm__hint(i32 1) #3
-  tail call void (i32, ...) @__hpvm__attributes(i32 1, i32* %In, i32* %Out, i32 1, i32* %Out) #3
-  %0 = load i32, i32* %In, align 4, !tbaa !2
-  store i32 %0, i32* %Out, align 4, !tbaa !2
-  tail call void (i32, ...) @__hpvm__return(i32 1, i32* %Out) #3
-  ret void
-}
 
-declare dso_local void @__hpvm__hint(i32) local_unnamed_addr #1
+; CHECK-LABEL: i32 @main(
+; CHECK-NOT: call void @llvm.hpvm.init()
+; CHECK: call i8* @llvm_hpvm_cpu_launch(i8* (i8*)* @LaunchDataflowGraph, i8*
+; CHECK-NOT: call i8* @llvm.hpvm.launch(i8*
+; CHECK: call void @llvm_hpvm_cpu_wait(i8*
 
-declare dso_local void @__hpvm__attributes(i32, ...) local_unnamed_addr #1
+; CHECK-LABEL: @Func1_cloned.1_cloned_cloned_cloned_cloned_cloned_cloned(
+; CHECK: call i8* @llvm_hpvm_cpu_argument_ptr(
 
-declare dso_local void @__hpvm__return(i32, ...) local_unnamed_addr #1
+; CHECK-LABEL: @Func3_cloned.2_cloned_cloned_cloned_cloned_cloned_cloned(
+; CHECK-LABEL: for.body1:
+; CHECK: %index.y = phi i64 [ 0, %for.body ], [ %index.y.inc, %for.body1 ]
+; CHECK-NEXT: call void @llvm_hpvm_cpu_dstack_push(
+; CHECK-NEXT: @Func1_cloned.1_cloned_cloned_cloned_cloned_cloned_cloned(
+; CHECK-NEXT: call void @llvm_hpvm_cpu_dstack_pop()
 
-; Function Attrs: nounwind uwtable
-define dso_local void @Func2(i32* %In, i64 %Insize, i32* %Out, i64 %Outsize) #0 {
-entry:
-  tail call void @__hpvm__hint(i32 1) #3
-  tail call void (i32, ...) @__hpvm__attributes(i32 2, i32* %In, i32* %Out, i32 1, i32* %Out) #3
-  %call = tail call i8* (i32, ...) @__hpvm__createNodeND(i32 1, void (i32*, i64, i32*, i64)* nonnull @Func1, i64 3) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 0, i32 0, i32 0) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 1, i32 1, i32 0) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 2, i32 2, i32 0) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 3, i32 3, i32 0) #3
-  tail call void @__hpvm__bindOut(i8* %call, i32 0, i32 0, i32 0) #3
-  ret void
-}
+; CHECK-LABEL: @Func2_cloned.3_cloned_cloned_cloned_cloned_cloned_cloned(
+; CHECK-LABEL: for.body:
+; CHECK-NEXT: %index.x = phi i64 [ 0, %entry ], [ %index.x.inc, %for.body ]
+; CHECK-NEXT: call void @llvm_hpvm_cpu_dstack_push(
+; CHECK-NEXT: @Func3_cloned.2_cloned_cloned_cloned_cloned_cloned_cloned(
+; CHECK-NEXT: call void @llvm_hpvm_cpu_dstack_pop()
 
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+; CHECK-LABEL: @PipeRoot_cloned.4(
+; CHECK: call void @llvm_hpvm_cpu_dstack_push(
+; CHECK-NEXT: @Func2_cloned.3_cloned_cloned_cloned_cloned_cloned_cloned(
+; CHECK-NEXT: call void @llvm_hpvm_cpu_dstack_pop()
+
+; CHECK-LABEL: @LaunchDataflowGraph(
+; CHECK: call %struct.out.PipeRoot @PipeRoot_cloned.4(
 
-declare dso_local i8* @__hpvm__createNodeND(i32, ...) local_unnamed_addr #1
+declare dso_local void @__hpvm__hint(i32) local_unnamed_addr #0
 
-declare dso_local void @__hpvm__bindIn(i8*, i32, i32, i32) local_unnamed_addr #1
+declare dso_local void @__hpvm__attributes(i32, ...) local_unnamed_addr #0
 
-declare dso_local void @__hpvm__bindOut(i8*, i32, i32, i32) local_unnamed_addr #1
+declare dso_local void @__hpvm__return(i32, ...) local_unnamed_addr #0
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
 
-; Function Attrs: nounwind uwtable
-define dso_local void @PipeRoot(i32* %In1, i64 %Insize1, i32* %In2, i64 %InSize2, i32* %Out, i64 %Outsize) #0 {
-entry:
-  tail call void @__hpvm__hint(i32 1) #3
-  tail call void (i32, ...) @__hpvm__attributes(i32 3, i32* %In1, i32* %In2, i32* %Out, i32 1, i32* %Out) #3
-  %call = tail call i8* (i32, ...) @__hpvm__createNodeND(i32 0, void (i32*, i64, i32*, i64)* nonnull @Func2) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 0, i32 0, i32 0) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 1, i32 1, i32 0) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 2, i32 2, i32 0) #3
-  tail call void @__hpvm__bindIn(i8* %call, i32 3, i32 3, i32 0) #3
-  tail call void @__hpvm__bindOut(i8* %call, i32 0, i32 0, i32 0) #3
-  ret void
-}
+declare dso_local i8* @__hpvm__createNodeND(i32, ...) local_unnamed_addr #0
+
+declare dso_local void @__hpvm__bindIn(i8*, i32, i32, i32) local_unnamed_addr #0
+
+declare dso_local void @__hpvm__bindOut(i8*, i32, i32, i32) local_unnamed_addr #0
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
 
 ; Function Attrs: nounwind uwtable
-define dso_local i32 @main() local_unnamed_addr #0 {
+define dso_local i32 @main() local_unnamed_addr #2 {
 entry:
   %In1 = alloca i32, align 4
   %In2 = alloca i32, align 4
@@ -72,31 +69,32 @@ entry:
   %RootArgs = alloca %struct.Root, align 8
   %0 = bitcast i32* %In1 to i8*
   call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3
-  store i32 1, i32* %In1, align 4, !tbaa !2
+  store i32 1, i32* %In1, align 4, !tbaa !6
   %1 = bitcast i32* %In2 to i8*
   call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #3
-  store i32 2, i32* %In2, align 4, !tbaa !2
+  store i32 2, i32* %In2, align 4, !tbaa !6
   %2 = bitcast i32* %Out to i8*
   call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #3
-  store i32 0, i32* %Out, align 4, !tbaa !2
+  store i32 0, i32* %Out, align 4, !tbaa !6
   %3 = bitcast %struct.Root* %RootArgs to i8*
   call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %3) #3
   %input1 = getelementptr inbounds %struct.Root, %struct.Root* %RootArgs, i64 0, i32 0
-  store i32* %In1, i32** %input1, align 8, !tbaa !6
+  store i32* %In1, i32** %input1, align 8, !tbaa !10
   %Insize1 = getelementptr inbounds %struct.Root, %struct.Root* %RootArgs, i64 0, i32 1
-  store i64 32, i64* %Insize1, align 8, !tbaa !10
+  store i64 32, i64* %Insize1, align 8, !tbaa !14
   %input2 = getelementptr inbounds %struct.Root, %struct.Root* %RootArgs, i64 0, i32 2
-  store i32* %In2, i32** %input2, align 8, !tbaa !11
+  store i32* %In2, i32** %input2, align 8, !tbaa !15
   %Insize2 = getelementptr inbounds %struct.Root, %struct.Root* %RootArgs, i64 0, i32 3
-  store i64 32, i64* %Insize2, align 8, !tbaa !12
+  store i64 32, i64* %Insize2, align 8, !tbaa !16
   %output = getelementptr inbounds %struct.Root, %struct.Root* %RootArgs, i64 0, i32 4
-  store i32* %Out, i32** %output, align 8, !tbaa !13
+  store i32* %Out, i32** %output, align 8, !tbaa !17
   %Outsize = getelementptr inbounds %struct.Root, %struct.Root* %RootArgs, i64 0, i32 5
-  store i64 32, i64* %Outsize, align 8, !tbaa !14
-  call void (...) @__hpvm__init() #3
-  %call = call i8* (i32, ...) @__hpvm__launch(i32 0, void (i32*, i64, i32*, i64, i32*, i64)* nonnull @PipeRoot, %struct.Root* nonnull %RootArgs) #3
-  call void @__hpvm__wait(i8* %call) #3
-  call void (...) @__hpvm__cleanup() #3
+  store i64 32, i64* %Outsize, align 8, !tbaa !18
+  call void @llvm.hpvm.init()
+  %4 = bitcast %struct.Root* %RootArgs to i8*
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out.PipeRoot (i32*, i64, i32*, i64, i32*, i64)* @PipeRoot_cloned to i8*), i8* %4, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %3) #3
   call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #3
   call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #3
@@ -104,34 +102,124 @@ entry:
   ret i32 0
 }
 
-declare dso_local void @__hpvm__init(...) local_unnamed_addr #1
+declare dso_local void @__hpvm__init(...) local_unnamed_addr #0
+
+declare dso_local i8* @__hpvm__launch(i32, ...) local_unnamed_addr #0
+
+declare dso_local void @__hpvm__wait(i8*) local_unnamed_addr #0
+
+declare dso_local void @__hpvm__cleanup(...) local_unnamed_addr #0
+
+declare i8* @llvm_hpvm_initializeTimerSet()
+
+declare void @llvm_hpvm_switchToTimer(i8**, i32)
+
+declare void @llvm_hpvm_printTimerSet(i8**, i8*)
+
+; Func1_cloned: inserts %Out into field 0 of an undef %struct.out.Func1 and returns that struct; the body performs no memory access. Function Attrs: nounwind uwtable
+define dso_local %struct.out.Func1 @Func1_cloned(i32* in %In, i64 %Insize, i32* out %Out, i64 %Outsize) #2 {
+entry:
+  %returnStruct = insertvalue %struct.out.Func1 undef, i32* %Out, 0
+  ret %struct.out.Func1 %returnStruct
+}
+
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode2D(i8*, i64, i64) #3
+
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+
+; Func3_cloned: calls llvm.hpvm.createNode2D on @Func1_cloned with i64 3 and i64 5 (presumably the two dimension sizes - confirm against the HPVM intrinsic docs), issues bind.input for index pairs (0,0)..(3,3) and bind.output for (0,0), then returns undef. Function Attrs: nounwind uwtable
+define dso_local %struct.out.Func3 @Func3_cloned(i32* in %In, i64 %Insize, i32* out %Out, i64 %Outsize) #2 {
+; CHECK-NOT: @Func3_cloned
+entry:
+  %Func1_cloned.node = call i8* @llvm.hpvm.createNode2D(i8* bitcast (%struct.out.Func1 (i32*, i64, i32*, i64)* @Func1_cloned to i8*), i64 3, i64 5)
+  call void @llvm.hpvm.bind.input(i8* %Func1_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func1_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func1_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func1_cloned.node, i32 3, i32 3, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %Func1_cloned.node, i32 0, i32 0, i1 false)
+  ret %struct.out.Func3 undef
+}
+
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode1D(i8*, i64) #3
+
+; Func2_cloned: calls llvm.hpvm.createNode1D on @Func3_cloned with i64 3 (presumably the single dimension size - confirm against the HPVM intrinsic docs), issues bind.input for index pairs (0,0)..(3,3) and bind.output for (0,0), then returns undef. Function Attrs: nounwind uwtable
+define dso_local %struct.out.Func2 @Func2_cloned(i32* in %In, i64 %Insize, i32* out %Out, i64 %Outsize) #2 {
+; CHECK-NOT: @Func2_cloned
+entry:
+  %Func3_cloned.node = call i8* @llvm.hpvm.createNode1D(i8* bitcast (%struct.out.Func3 (i32*, i64, i32*, i64)* @Func3_cloned to i8*), i64 3)
+  call void @llvm.hpvm.bind.input(i8* %Func3_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func3_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func3_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func3_cloned.node, i32 3, i32 3, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %Func3_cloned.node, i32 0, i32 0, i1 false)
+  ret %struct.out.Func2 undef
+}
+
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+
+; PipeRoot_cloned: function passed to llvm.hpvm.launch in @main; calls llvm.hpvm.createNode on @Func2_cloned (no size operands), issues bind.input for index pairs (0,0)..(3,3) only, although this function takes six parameters, and bind.output for (0,0), then returns undef. Function Attrs: nounwind uwtable
+define dso_local %struct.out.PipeRoot @PipeRoot_cloned(i32* in %In1, i64 %Insize1, i32* in %In2, i64 %InSize2, i32* out %Out, i64 %Outsize) #2 {
+; CHECK-NOT: @PipeRoot_cloned
+entry:
+  %Func2_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out.Func2 (i32*, i64, i32*, i64)* @Func2_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %Func2_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func2_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func2_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %Func2_cloned.node, i32 3, i32 3, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %Func2_cloned.node, i32 0, i32 0, i1 false)
+  ret %struct.out.PipeRoot undef
+}
+
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
 
-declare dso_local i8* @__hpvm__launch(i32, ...) local_unnamed_addr #1
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
 
-declare dso_local void @__hpvm__wait(i8*) local_unnamed_addr #1
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
 
-declare dso_local void @__hpvm__cleanup(...) local_unnamed_addr #1
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { argmemonly nounwind }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } ; external declarations; target-cpu "x86-64" matches the x86_64 target triple
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } ; function definitions; target-cpu "x86-64" matches the x86_64 target triple
 attributes #3 = { nounwind }
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
+!hpvm_hint_cpu = !{!2, !3, !4, !5}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_spir = !{}
+!hpvm_hint_cudnn = !{}
+!hpvm_hint_promise = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cpu_spir = !{}
 
 !0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 3551132592a00cab6c966df508ab511598269f78)"}
-!2 = !{!3, !3, i64 0}
-!3 = !{!"int", !4, i64 0}
-!4 = !{!"omnipotent char", !5, i64 0}
-!5 = !{!"Simple C/C++ TBAA"}
-!6 = !{!7, !8, i64 0}
-!7 = !{!"Root", !8, i64 0, !9, i64 8, !8, i64 16, !9, i64 24, !8, i64 32, !9, i64 40}
-!8 = !{!"any pointer", !4, i64 0}
-!9 = !{!"long", !4, i64 0}
-!10 = !{!7, !9, i64 8}
-!11 = !{!7, !8, i64 16}
-!12 = !{!7, !9, i64 24}
-!13 = !{!7, !8, i64 32}
-!14 = !{!7, !9, i64 40}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 6690f9e7e8b46b96aea222d3e85315cd63545953)"}
+!2 = !{%struct.out.Func1 (i32*, i64, i32*, i64)* @Func1_cloned}
+!3 = !{%struct.out.Func3 (i32*, i64, i32*, i64)* @Func3_cloned}
+!4 = !{%struct.out.Func2 (i32*, i64, i32*, i64)* @Func2_cloned}
+!5 = !{%struct.out.PipeRoot (i32*, i64, i32*, i64, i32*, i64)* @PipeRoot_cloned}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"int", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!11, !12, i64 0}
+!11 = !{!"Root", !12, i64 0, !13, i64 8, !12, i64 16, !13, i64 24, !12, i64 32, !13, i64 40}
+!12 = !{!"any pointer", !8, i64 0}
+!13 = !{!"long", !8, i64 0}
+!14 = !{!11, !13, i64 8}
+!15 = !{!11, !12, i64 16}
+!16 = !{!11, !13, i64 24}
+!17 = !{!11, !12, i64 32}
+!18 = !{!11, !13, i64 40}