Skip to content
Snippets Groups Projects
Commit ffd1f59a authored by Davies Liu's avatar Davies Liu Committed by Patrick Wendell
Browse files

[SPARK-2887] Fix bug in countApproxDistinct() when the RDD has more than one partition

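Fix a bug in countApproxDistinct() when the RDD has more than one partition: the combine function merged h2 into h1 but then returned h2, discarding the merged result. It now returns h1.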

Author: Davies Liu <davies.liu@gmail.com>

Closes #1812 from davies/approx and squashes the following commits:

bf757ce [Davies Liu] fix bug of countApproxDistinct() when have more than one partition
parent a263a7e9
No related branches found
No related tags found
No related merge requests found
......@@ -1004,7 +1004,7 @@ abstract class RDD[T: ClassTag](
},
(h1: HyperLogLogPlus, h2: HyperLogLogPlus) => {
h1.addAll(h2)
h2
h1
}).cardinality()
}
......
......@@ -81,11 +81,11 @@ class RDDSuite extends FunSuite with SharedSparkContext {
def error(est: Long, size: Long) = math.abs(est - size) / size.toDouble
val size = 100
val uniformDistro = for (i <- 1 to 100000) yield i % size
val simpleRdd = sc.makeRDD(uniformDistro)
assert(error(simpleRdd.countApproxDistinct(4, 0), size) < 0.4)
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.1)
val size = 1000
val uniformDistro = for (i <- 1 to 5000) yield i % size
val simpleRdd = sc.makeRDD(uniformDistro, 10)
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
}
test("SparkContext.union") {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment