From e52b8719cf0603e79ded51cbe1c9f88eea8b56de Mon Sep 17 00:00:00 2001
From: Sandy Ryza <sandy@cloudera.com>
Date: Thu, 17 Jul 2014 23:57:08 -0700
Subject: [PATCH] SPARK-2553. CoGroupedRDD unnecessarily allocates a Tuple2 per
 dependency...

... per key

My humble opinion is that avoiding allocations in this performance-critical section is worth the extra code.

Author: Sandy Ryza <sandy@cloudera.com>

Closes #1461 from sryza/sandy-spark-2553 and squashes the following commits:

7eaf7f2 [Sandy Ryza] SPARK-2553. CoGroupedRDD unnecessarily allocates a Tuple2 per dependency per key
---
 core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
index 5951865e56..b284b636f2 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
@@ -180,7 +180,11 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part:
     }
     val mergeCombiners: (CoGroupCombiner, CoGroupCombiner) => CoGroupCombiner =
       (combiner1, combiner2) => {
-        combiner1.zip(combiner2).map { case (v1, v2) => v1 ++ v2 }
+        var depNum = 0
+        while (depNum < numRdds) {
+          combiner1(depNum) ++= combiner2(depNum)
+          depNum += 1
+        }
       }
     new ExternalAppendOnlyMap[K, CoGroupValue, CoGroupCombiner](
       createCombiner, mergeValue, mergeCombiners)
-- 
GitLab