From 16c886a581da64d791c47b93c714e0a9c62f23b8 Mon Sep 17 00:00:00 2001
From: Matei Zaharia <matei@eecs.berkeley.edu>
Date: Fri, 13 May 2011 10:41:34 -0700
Subject: [PATCH] Optimization for count()

---
 core/src/main/scala/spark/RDD.scala | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala
index 12e2f4f902..40eb7967ec 100644
--- a/core/src/main/scala/spark/RDD.scala
+++ b/core/src/main/scala/spark/RDD.scala
@@ -107,7 +107,14 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) {
   }
 
   def count(): Long = {
-    sc.runJob(this, (iter: Iterator[T]) => iter.size.toLong).sum
+    sc.runJob(this, (iter: Iterator[T]) => {
+      var result = 0L
+      while (iter.hasNext) {
+        result += 1L
+        iter.next
+      }
+      result
+    }).sum
   }
 
   def toArray(): Array[T] = collect()
-- 
GitLab