From 86475520f88f90c9d3b71516f65ccc0e9d244863 Mon Sep 17 00:00:00 2001
From: Xin Ren <iamshrek@126.com>
Date: Tue, 10 May 2016 15:12:47 -0700
Subject: [PATCH] [SPARK-14936][BUILD][TESTS] FlumePollingStreamSuite is slow

https://issues.apache.org/jira/browse/SPARK-14936

## What changes were proposed in this pull request?

FlumePollingStreamSuite contains two tests which run for a minute each. This is excessively slow, and this PR speeds them up.

Instead of creating a `StreamingContext` directly from `conf` in each test, an underlying `SparkContext` is created once in `beforeAll()` and reused to create each `StreamingContext`. Running time is reduced by avoiding repeated `SparkContext` creation and teardown.

## How was this patch tested?

Tested on my local machine by running `testOnly *.FlumePollingStreamSuite`.

Author: Xin Ren <iamshrek@126.com>

Closes #12845 from keypointt/SPARK-14936.
---
 .../flume/PollingFlumeTestUtils.scala         |  4 +--
 .../flume/FlumePollingStreamSuite.scala       | 26 ++++++++++++++-----
 python/pyspark/streaming/tests.py             |  2 +-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
index 6a4dafb8ed..15ff4f6025 100644
--- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
+++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
@@ -116,7 +116,7 @@ private[flume] class PollingFlumeTestUtils {
   /**
    * Send data and wait until all data has been received
    */
-  def sendDatAndEnsureAllDataHasBeenReceived(): Unit = {
+  def sendDataAndEnsureAllDataHasBeenReceived(): Unit = {
     val executor = Executors.newCachedThreadPool()
     val executorCompletion = new ExecutorCompletionService[Void](executor)
 
@@ -174,7 +174,7 @@ private[flume] class PollingFlumeTestUtils {
       val queueRemaining = channel.getClass.getDeclaredField("queueRemaining")
       queueRemaining.setAccessible(true)
       val m = queueRemaining.get(channel).getClass.getDeclaredMethod("availablePermits")
-      if (m.invoke(queueRemaining.get(channel)).asInstanceOf[Int] != 5000) {
+      if (m.invoke(queueRemaining.get(channel)).asInstanceOf[Int] != channelCapacity) {
         throw new AssertionError(s"Channel ${channel.getName} is not empty")
       }
     }
diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
index 156712483d..1c93079497 100644
--- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
+++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
@@ -24,10 +24,10 @@ import scala.collection.JavaConverters._
 import scala.concurrent.duration._
 import scala.language.postfixOps
 
-import org.scalatest.BeforeAndAfter
+import org.scalatest.BeforeAndAfterAll
 import org.scalatest.concurrent.Eventually._
 
-import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.internal.Logging
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.storage.StorageLevel
@@ -35,11 +35,13 @@ import org.apache.spark.streaming.{Seconds, StreamingContext, TestOutputStream}
 import org.apache.spark.streaming.dstream.ReceiverInputDStream
 import org.apache.spark.util.{ManualClock, Utils}
 
-class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {
+class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
 
   val maxAttempts = 5
   val batchDuration = Seconds(1)
 
+  @transient private var _sc: SparkContext = _
+
   val conf = new SparkConf()
     .setMaster("local[2]")
     .setAppName(this.getClass.getSimpleName)
@@ -47,6 +49,17 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Log
 
   val utils = new PollingFlumeTestUtils
 
+  override def beforeAll(): Unit = {
+    _sc = new SparkContext(conf)
+  }
+
+  override def afterAll(): Unit = {
+    if (_sc != null) {
+      _sc.stop()
+      _sc = null
+    }
+  }
+
   test("flume polling test") {
     testMultipleTimes(testFlumePolling)
   }
@@ -98,7 +111,7 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Log
 
   def writeAndVerify(sinkPorts: Seq[Int]): Unit = {
     // Set up the streaming context and input streams
-    val ssc = new StreamingContext(conf, batchDuration)
+    val ssc = new StreamingContext(_sc, batchDuration)
     val addresses = sinkPorts.map(port => new InetSocketAddress("localhost", port))
     val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
       FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK,
@@ -109,7 +122,7 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Log
     ssc.start()
 
     try {
-      utils.sendDatAndEnsureAllDataHasBeenReceived()
+      utils.sendDataAndEnsureAllDataHasBeenReceived()
       val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
       clock.advance(batchDuration.milliseconds)
 
@@ -123,7 +136,8 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Log
         utils.assertOutput(headers.asJava, bodies.asJava)
       }
     } finally {
-      ssc.stop()
+      // here stop ssc only, but not underlying sparkcontext
+      ssc.stop(false)
     }
   }
 
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index 148bf7e8ff..f27628c895 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -1357,7 +1357,7 @@ class FlumePollingStreamTests(PySparkStreamingTestCase):
         dstream.foreachRDD(get_output)
 
         ssc.start()
-        self._utils.sendDatAndEnsureAllDataHasBeenReceived()
+        self._utils.sendDataAndEnsureAllDataHasBeenReceived()
         self.wait_for(outputBuffer, self._utils.getTotalEvents())
 
         outputHeaders = [event[0] for event in outputBuffer]
--
GitLab
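For readers who want the pattern outside the Spark tree, here is a minimal, self-contained sketch of the optimization this patch applies: one `SparkContext` per suite (via `BeforeAndAfterAll`), one cheap `StreamingContext` per test. This is illustrative only and not part of the commit; it assumes plain ScalaTest's `FunSuite` rather than Spark's internal `SparkFunSuite`, and the suite and test names are hypothetical.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.scalatest.{BeforeAndAfterAll, FunSuite}

// Hypothetical suite name; sketches the shared-SparkContext pattern only.
class SharedContextSuiteSketch extends FunSuite with BeforeAndAfterAll {

  @transient private var _sc: SparkContext = _

  override def beforeAll(): Unit = {
    // Pay the SparkContext startup cost once for the whole suite.
    _sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("SharedContextSuiteSketch"))
  }

  override def afterAll(): Unit = {
    // Tear the shared context down exactly once, after the last test.
    if (_sc != null) {
      _sc.stop()
      _sc = null
    }
  }

  test("each test builds a cheap StreamingContext on the shared SparkContext") {
    val ssc = new StreamingContext(_sc, Seconds(1))
    try {
      // ... register input streams, start the context, assert on output ...
    } finally {
      // stopSparkContext = false keeps _sc alive for the next test.
      ssc.stop(stopSparkContext = false)
    }
  }
}
```

Stopping only the `StreamingContext` in the `finally` block is the key move: it lets the suite's retry helper (`testMultipleTimes` in the patch) spin up a fresh `StreamingContext` without rebuilding the heavyweight `SparkContext`.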
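Separately, the PollingFlumeTestUtils hunk above verifies that a Flume `MemoryChannel` has drained by reflecting on its private `queueRemaining` semaphore. Below is a hedged, standalone illustration of that two-step reflection trick; the `Holder` class is hypothetical, standing in for `MemoryChannel`.

```scala
import java.util.concurrent.Semaphore

// Hypothetical stand-in for Flume's MemoryChannel and its private
// `queueRemaining` Semaphore field.
class Holder { private val queueRemaining = new Semaphore(5000) }

object ReflectionSketch {
  def main(args: Array[String]): Unit = {
    val channel = new Holder
    // Step 1: pry open the private field.
    val field = channel.getClass.getDeclaredField("queueRemaining")
    field.setAccessible(true)
    val semaphore = field.get(channel)
    // Step 2: reflectively invoke a public method on the field's value.
    val m = semaphore.getClass.getDeclaredMethod("availablePermits")
    println(m.invoke(semaphore)) // prints 5000 when the "channel" is untouched
  }
}
```

The patch's change to this code path is simply to compare the permit count against the configurable `channelCapacity` instead of the hard-coded `5000`.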