From e34922a221738bae1195d8ace90369c9ddc3a48d Mon Sep 17 00:00:00 2001
From: Sandy Ryza <sandy@cloudera.com>
Date: Wed, 23 Jul 2014 23:09:25 -0700
Subject: [PATCH] SPARK-2310. Support arbitrary Spark properties on the
 command line with spark-submit

The PR allows invocations like:

    spark-submit --class org.MyClass --conf spark.shuffle.spill=false myjar.jar

Author: Sandy Ryza <sandy@cloudera.com>

Closes #1253 from sryza/sandy-spark-2310 and squashes the following commits:

1dc9855 [Sandy Ryza] More doc and cleanup
00edfb9 [Sandy Ryza] Review comments
91b244a [Sandy Ryza] Change format to --conf PROP=VALUE
8fabe77 [Sandy Ryza] SPARK-2310. Support arbitrary Spark properties on the command line with spark-submit
---
 .../scala/org/apache/spark/deploy/SparkSubmit.scala |  3 +++
 .../apache/spark/deploy/SparkSubmitArguments.scala  | 11 +++++++++++
 .../org/apache/spark/deploy/SparkSubmitSuite.scala  | 12 +++++++++++-
 docs/configuration.md                               |  8 +++++---
 docs/submitting-applications.md                     |  2 ++
 5 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 3d8373d817..3b5642b6ca 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -269,6 +269,9 @@ object SparkSubmit {
       sysProps.getOrElseUpdate(k, v)
     }
 
+    // Spark properties included on command line take precedence
+    sysProps ++= args.sparkProperties
+
     (childArgs, childClasspath, sysProps, childMainClass)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 57655aa4c3..3ab67a43a3 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -55,6 +55,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
   var verbose: Boolean = false
   var isPython: Boolean = false
   var pyFiles: String = null
+  val sparkProperties: HashMap[String, String] = new HashMap[String, String]()
 
   parseOpts(args.toList)
   loadDefaults()
@@ -177,6 +178,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
     |  executorCores           $executorCores
     |  totalExecutorCores      $totalExecutorCores
     |  propertiesFile          $propertiesFile
+    |  extraSparkProperties    $sparkProperties
     |  driverMemory            $driverMemory
     |  driverCores             $driverCores
     |  driverExtraClassPath    $driverExtraClassPath
@@ -290,6 +292,13 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
         jars = Utils.resolveURIs(value)
         parse(tail)
 
+      case ("--conf" | "-c") :: value :: tail =>
+        value.split("=", 2).toSeq match {
+          case Seq(k, v) => sparkProperties(k) = v
+          case _ => SparkSubmit.printErrorAndExit(s"Spark config without '=': $value")
+        }
+        parse(tail)
+
       case ("--help" | "-h") :: tail =>
        printUsageAndExit(0)
 
@@ -349,6 +358,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
        |                              on the PYTHONPATH for Python apps.
        |  --files FILES               Comma-separated list of files to be placed in the working
        |                              directory of each executor.
+       |
+       |  --conf PROP=VALUE           Arbitrary Spark configuration property.
        |  --properties-file FILE      Path to a file from which to load extra properties. If not
        |                              specified, this will look for conf/spark-defaults.conf.
        |
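The new `("--conf" | "-c")` case above splits each argument on the first `=` only, so property values may themselves contain `=` characters. Below is a minimal standalone sketch of that parsing rule; `ConfParseSketch` and `parseConf` are illustrative names for this note, not code from the patch:

    // Mirrors value.split("=", 2) from the --conf case: split on the first '=' only.
    object ConfParseSketch {
      def parseConf(value: String): Either[String, (String, String)] =
        value.split("=", 2) match {
          case Array(k, v) => Right(k -> v)                    // key and value found
          case _ => Left(s"Spark config without '=': $value")  // no '=' present: rejected
        }

      def main(args: Array[String]): Unit = {
        println(parseConf("spark.shuffle.spill=false"))              // Right((spark.shuffle.spill,false))
        println(parseConf("spark.executor.extraJavaOptions=-Da=b"))  // value keeps its inner '='
        println(parseConf("spark.shuffle.spill"))                    // Left(...): missing '='
      }
    }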
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 565c53e952..f497a5e0a1 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -120,6 +120,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
       "--archives", "archive1.txt,archive2.txt",
       "--num-executors", "6",
       "--name", "beauty",
+      "--conf", "spark.shuffle.spill=false",
       "thejar.jar",
       "arg1", "arg2")
     val appArgs = new SparkSubmitArguments(clArgs)
@@ -139,6 +140,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
     mainClass should be ("org.apache.spark.deploy.yarn.Client")
     classpath should have length (0)
     sysProps("spark.app.name") should be ("beauty")
+    sysProps("spark.shuffle.spill") should be ("false")
     sysProps("SPARK_SUBMIT") should be ("true")
   }
 
@@ -156,6 +158,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
       "--archives", "archive1.txt,archive2.txt",
       "--num-executors", "6",
       "--name", "trill",
+      "--conf", "spark.shuffle.spill=false",
       "thejar.jar",
       "arg1", "arg2")
     val appArgs = new SparkSubmitArguments(clArgs)
@@ -176,6 +179,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
     sysProps("spark.yarn.dist.archives") should include regex (".*archive1.txt,.*archive2.txt")
     sysProps("spark.jars") should include regex (".*one.jar,.*two.jar,.*three.jar,.*thejar.jar")
     sysProps("SPARK_SUBMIT") should be ("true")
+    sysProps("spark.shuffle.spill") should be ("false")
   }
 
   test("handles standalone cluster mode") {
@@ -186,6 +190,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
       "--supervise",
       "--driver-memory", "4g",
       "--driver-cores", "5",
+      "--conf", "spark.shuffle.spill=false",
       "thejar.jar",
       "arg1", "arg2")
     val appArgs = new SparkSubmitArguments(clArgs)
@@ -195,9 +200,10 @@ class SparkSubmitSuite extends FunSuite with Matchers {
     childArgsStr should include regex ("launch spark://h:p .*thejar.jar org.SomeClass arg1 arg2")
     mainClass should be ("org.apache.spark.deploy.Client")
     classpath should have size (0)
-    sysProps should have size (2)
+    sysProps should have size (3)
     sysProps.keys should contain ("spark.jars")
     sysProps.keys should contain ("SPARK_SUBMIT")
+    sysProps("spark.shuffle.spill") should be ("false")
   }
 
   test("handles standalone client mode") {
@@ -208,6 +214,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
       "--total-executor-cores", "5",
       "--class", "org.SomeClass",
       "--driver-memory", "4g",
+      "--conf", "spark.shuffle.spill=false",
       "thejar.jar",
       "arg1", "arg2")
     val appArgs = new SparkSubmitArguments(clArgs)
@@ -218,6 +225,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
     classpath(0) should endWith ("thejar.jar")
     sysProps("spark.executor.memory") should be ("5g")
     sysProps("spark.cores.max") should be ("5")
+    sysProps("spark.shuffle.spill") should be ("false")
   }
 
   test("handles mesos client mode") {
@@ -228,6 +236,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
       "--total-executor-cores", "5",
       "--class", "org.SomeClass",
       "--driver-memory", "4g",
+      "--conf", "spark.shuffle.spill=false",
       "thejar.jar",
       "arg1", "arg2")
     val appArgs = new SparkSubmitArguments(clArgs)
@@ -238,6 +247,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
     classpath(0) should endWith ("thejar.jar")
     sysProps("spark.executor.memory") should be ("5g")
     sysProps("spark.cores.max") should be ("5")
+    sysProps("spark.shuffle.spill") should be ("false")
   }
 
   test("launch simple application with spark-submit") {
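The `sysProps should have size (3)` change above follows from the precedence rule in the SparkSubmit.scala hunk: entries from the defaults file only fill missing keys via `getOrElseUpdate`, while `--conf` properties are merged last with `++=` and therefore overwrite. A standalone sketch of that interplay; the `defaultsFile` map and its values are invented for illustration:

    import scala.collection.mutable.HashMap

    object PrecedenceSketch {
      def main(args: Array[String]): Unit = {
        val sysProps = new HashMap[String, String]()

        // Defaults-file entries only fill gaps (mirrors the getOrElseUpdate loop).
        val defaultsFile = Map("spark.shuffle.spill" -> "true", "spark.app.name" -> "default")
        for ((k, v) <- defaultsFile) sysProps.getOrElseUpdate(k, v)

        // --conf properties are merged last and overwrite (mirrors ++= args.sparkProperties).
        val sparkProperties = Map("spark.shuffle.spill" -> "false")
        sysProps ++= sparkProperties

        println(sysProps("spark.shuffle.spill"))  // false: the command line wins
        println(sysProps("spark.app.name"))       // default: untouched keys survive
      }
    }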
spark-submit") { diff --git a/docs/configuration.md b/docs/configuration.md index 02af461267..cb0c65e2d2 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -42,13 +42,15 @@ val sc = new SparkContext(new SparkConf()) Then, you can supply configuration values at runtime: {% highlight bash %} -./bin/spark-submit --name "My fancy app" --master local[4] myApp.jar +./bin/spark-submit --name "My app" --master local[4] --conf spark.shuffle.spill=false + --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" myApp.jar {% endhighlight %} The Spark shell and [`spark-submit`](cluster-overview.html#launching-applications-with-spark-submit) tool support two ways to load configurations dynamically. The first are command line options, -such as `--master`, as shown above. Running `./bin/spark-submit --help` will show the entire list -of options. +such as `--master`, as shown above. `spark-submit` can accept any Spark property using the `--conf` +flag, but uses special flags for properties that play a part in launching the Spark application. +Running `./bin/spark-submit --help` will show the entire list of these options. `bin/spark-submit` will also read configuration options from `conf/spark-defaults.conf`, in which each line consists of a key and a value separated by whitespace. For example: diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index e05883072b..45b70b1a54 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -33,6 +33,7 @@ dependencies, and can support different cluster managers and deploy modes that S --class <main-class> --master <master-url> \ --deploy-mode <deploy-mode> \ + --conf <key>=<value> \ ... # other options <application-jar> \ [application-arguments] @@ -43,6 +44,7 @@ Some of the commonly used options are: * `--class`: The entry point for your application (e.g. `org.apache.spark.examples.SparkPi`) * `--master`: The [master URL](#master-urls) for the cluster (e.g. `spark://23.195.26.187:7077`) * `--deploy-mode`: Whether to deploy your driver on the worker nodes (`cluster`) or locally as an external client (`client`) (default: `client`)* +* `--conf`: Arbitrary Spark configuration property in key=value format. For values that contain spaces wrap "key=value" in quotes (as shown). * `application-jar`: Path to a bundled jar including your application and all dependencies. The URL must be globally visible inside of your cluster, for instance, an `hdfs://` path or a `file://` path that is present on all nodes. * `application-arguments`: Arguments passed to the main method of your main class, if any -- GitLab