diff --git a/.gitignore b/.gitignore
index cd9f90d55932cf874df0f1491f61ed2679999575..857e9feb953bd9a7e345173280681695ca734f4d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ conf/java-opts
 conf/spark-env.sh
 conf/streaming-env.sh
 conf/log4j.properties
+conf/spark-defaults.conf
 docs/_site
 docs/api
 target/
diff --git a/bin/spark-shell b/bin/spark-shell
index ea12d256b23a18dcc820538feba432c8d71acf1b..f1f3c18877ed4fba93f5538ecad956a33af89055 100755
--- a/bin/spark-shell
+++ b/bin/spark-shell
@@ -19,9 +19,8 @@
 #
 
 # Shell script for starting the Spark Shell REPL
-# Note that it will set MASTER to spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
-# if those two env vars are set in spark-env.sh but MASTER is not.
+args="$@"
 
 cygwin=false
 case "`uname`" in
     CYGWIN*) cygwin=true;;
@@ -30,133 +29,16 @@ esac
 # Enter posix mode for bash
 set -o posix
 
+if [[ "$@" == *--help* ]]; then
+  echo "Usage: ./bin/spark-shell [options]"
+  ./bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
+  exit 0
+fi
+
 ## Global script variables
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
-SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-""}"
-DEFAULT_MASTER="local[*]"
-MASTER=${MASTER:-""}
-
-info_log=0
-
-#CLI Color Templates
-txtund=$(tput sgr 0 1)          # Underline
-txtbld=$(tput bold)             # Bold
-bldred=${txtbld}$(tput setaf 1) # red
-bldyel=${txtbld}$(tput setaf 3) # yellow
-bldblu=${txtbld}$(tput setaf 4) # blue
-bldwht=${txtbld}$(tput setaf 7) # white
-txtrst=$(tput sgr0)             # Reset
-info=${bldwht}*${txtrst}        # Feedback
-pass=${bldblu}*${txtrst}
-warn=${bldred}*${txtrst}
-ques=${bldblu}?${txtrst}
-
-# Helper function to describe the script usage
-function usage() {
-    cat << EOF
-${txtbld}Usage${txtrst}: spark-shell [OPTIONS]
-
-${txtbld}OPTIONS${txtrst}:
-    -h  --help              : Print this help information.
-    -c  --cores             : The maximum number of cores to be used by the Spark Shell.
-    -em --executor-memory   : The memory used by each executor of the Spark Shell, the number
-                              is followed by m for megabytes or g for gigabytes, e.g. "1g".
-    -dm --driver-memory     : The memory used by the Spark Shell, the number is followed
-                              by m for megabytes or g for gigabytes, e.g. "1g".
-    -m  --master            : A full string that describes the Spark Master, defaults to "local[*]"
-                              e.g. "spark://localhost:7077".
-    --log-conf              : Enables logging of the supplied SparkConf as INFO at start of the
-                              Spark Context.
-
-e.g.
-    spark-shell -m spark://localhost:7077 -c 4 -dm 512m -em 2g
-
-EOF
-}
-
-function out_error(){
-    echo -e "${txtund}${bldred}ERROR${txtrst}: $1"
-    usage
-    exit 1
-}
-
-function log_info(){
-    [ $info_log -eq 1 ] && echo -e "${bldyel}INFO${txtrst}: $1"
-}
-
-function log_warn(){
-    echo -e "${txtund}${bldyel}WARN${txtrst}: $1"
-}
-
-# PATTERNS used to validate more than one optional arg.
-ARG_FLAG_PATTERN="^-"
-MEM_PATTERN="^[0-9]+[m|g|M|G]$"
-NUM_PATTERN="^[0-9]+$"
-PORT_PATTERN="^[0-9]+$"
-
-# Setters for optional args.
-function set_cores(){
-    CORE_PATTERN="^[0-9]+$"
-    if [[ "$1" =~ $CORE_PATTERN ]]; then
-        SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.cores.max=$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_em(){
-    if [[ $1 =~ $MEM_PATTERN ]]; then
-        SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.executor.memory=$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_dm(){
-    if [[ $1 =~ $MEM_PATTERN ]]; then
-        export SPARK_DRIVER_MEMORY=$1
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_spark_log_conf(){
-    SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.logConf=$1"
-}
-
-function set_spark_master(){
-    if ! [[ "$1" =~ $ARG_FLAG_PATTERN ]]; then
-        export MASTER="$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function resolve_spark_master(){
-    # Set MASTER from spark-env if possible
-    DEFAULT_SPARK_MASTER_PORT=7077
-    if [ -z "$MASTER" ]; then
-        . $FWDIR/bin/load-spark-env.sh
-        if [ -n "$SPARK_MASTER_IP" ]; then
-            SPARK_MASTER_PORT="${SPARK_MASTER_PORT:-"$DEFAULT_SPARK_MASTER_PORT"}"
-            export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"
-        fi
-    fi
-
-    if [ -z "$MASTER" ]; then
-        export MASTER="$DEFAULT_MASTER"
-    fi
-
-}
-
 function main(){
-    log_info "Base Directory set to $FWDIR"
-
-    resolve_spark_master
-    log_info "Spark Master is $MASTER"
-
-    log_info "Spark REPL options $SPARK_REPL_OPTS"
     if $cygwin; then
         # Workaround for issue involving JLine and Cygwin
         # (see http://sourceforge.net/p/jline/bugs/40/).
@@ -165,55 +47,14 @@ function main(){
         # (see https://github.com/sbt/sbt/issues/562).
         stty -icanon min 1 -echo > /dev/null 2>&1
         export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix"
-        $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+        $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main
         stty icanon echo > /dev/null 2>&1
     else
         export SPARK_REPL_OPTS
-        $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+        $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main
     fi
 }
 
-for option in "$@"
-do
-    case $option in
-        -h | --help )
-            usage
-            exit 1
-            ;;
-        -c | --cores)
-            shift
-            _1=$1
-            shift
-            set_cores $_1 "-c/--cores"
-            ;;
-        -em | --executor-memory)
-            shift
-            _1=$1
-            shift
-            set_em $_1 "-em/--executor-memory"
-            ;;
-        -dm | --driver-memory)
-            shift
-            _1=$1
-            shift
-            set_dm $_1 "-dm/--driver-memory"
-            ;;
-        -m | --master)
-            shift
-            _1=$1
-            shift
-            set_spark_master $_1 "-m/--master"
-            ;;
-        --log-conf)
-            shift
-            set_spark_log_conf "true"
-            info_log=1
-            ;;
-        ?)
-            ;;
-    esac
-done
-
 # Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in
 # binary distribution of Spark where Scala is not installed
 exit_status=127
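The net effect of the `bin/spark-shell` rewrite is that all option handling moves into `spark-submit`. A minimal sketch of the resulting delegation pattern, assembled from the pieces of the new script above (a sketch under the diff's assumptions, not the complete file):

```bash
#!/usr/bin/env bash
# Sketch: intercept --help locally, forward everything else to spark-submit
# with the REPL class as the entry point. "spark-internal" is the reserved
# primary-resource name introduced in SparkSubmit.scala below.
FWDIR="$(cd `dirname $0`/..; pwd)"
args="$@"

if [[ "$@" == *--help* ]]; then
  echo "Usage: ./bin/spark-shell [options]"
  "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
  exit 0
fi

"$FWDIR"/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main
```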
[[ "$1" =~ $ARG_FLAG_PATTERN ]]; then - export MASTER="$1" - else - out_error "wrong format for $2" - fi -} - -function resolve_spark_master(){ - # Set MASTER from spark-env if possible - DEFAULT_SPARK_MASTER_PORT=7077 - if [ -z "$MASTER" ]; then - . $FWDIR/bin/load-spark-env.sh - if [ -n "$SPARK_MASTER_IP" ]; then - SPARK_MASTER_PORT="${SPARK_MASTER_PORT:-"$DEFAULT_SPARK_MASTER_PORT"}" - export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}" - fi - fi - - if [ -z "$MASTER" ]; then - export MASTER="$DEFAULT_MASTER" - fi - -} - function main(){ - log_info "Base Directory set to $FWDIR" - - resolve_spark_master - log_info "Spark Master is $MASTER" - - log_info "Spark REPL options $SPARK_REPL_OPTS" if $cygwin; then # Workaround for issue involving JLine and Cygwin # (see http://sourceforge.net/p/jline/bugs/40/). @@ -165,55 +47,14 @@ function main(){ # (see https://github.com/sbt/sbt/issues/562). stty -icanon min 1 -echo > /dev/null 2>&1 export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix" - $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@" + $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main stty icanon echo > /dev/null 2>&1 else export SPARK_REPL_OPTS - $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@" + $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main fi } -for option in "$@" -do - case $option in - -h | --help ) - usage - exit 1 - ;; - -c | --cores) - shift - _1=$1 - shift - set_cores $_1 "-c/--cores" - ;; - -em | --executor-memory) - shift - _1=$1 - shift - set_em $_1 "-em/--executor-memory" - ;; - -dm | --driver-memory) - shift - _1=$1 - shift - set_dm $_1 "-dm/--driver-memory" - ;; - -m | --master) - shift - _1=$1 - shift - set_spark_master $_1 "-m/--master" - ;; - --log-conf) - shift - set_spark_log_conf "true" - info_log=1 - ;; - ?) - ;; - esac -done - # Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in # binary distribution of Spark where Scala is not installed exit_status=127 diff --git a/bin/spark-submit b/bin/spark-submit index 498d0b27bacdfcb6131bbe75614763dccc16d618..b2a1dca721dffb54795809671469a80fe51bf811 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -21,15 +21,15 @@ export SPARK_HOME="$(cd `dirname $0`/..; pwd)" ORIG_ARGS=$@ while (($#)); do - if [ $1 = "--deploy-mode" ]; then + if [ "$1" = "--deploy-mode" ]; then DEPLOY_MODE=$2 - elif [ $1 = "--driver-memory" ]; then + elif [ "$1" = "--driver-memory" ]; then DRIVER_MEMORY=$2 - elif [ $1 = "--driver-library-path" ]; then + elif [ "$1" = "--driver-library-path" ]; then export _SPARK_LIBRARY_PATH=$2 - elif [ $1 = "--driver-class-path" ]; then + elif [ "$1" = "--driver-class-path" ]; then export SPARK_CLASSPATH="$SPARK_CLASSPATH:$2" - elif [ $1 = "--driver-java-options" ]; then + elif [ "$1" = "--driver-java-options" ]; then export SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $2" fi shift diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index a2efddbfe3f69c71736072c612a44005d9e72da6..58aa6d951a2047c5ed87d8f6d2817926403a1eef 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -38,6 +38,12 @@ object SparkSubmit { private var clusterManager: Int = LOCAL + /** + * A special jar name that indicates the class being run is inside of Spark itself, + * and therefore no user jar is needed. 
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index c3e8c6b8c65a97c6c0e55e2a378c8423fdcda373..c545b093ac82e0c68b06b70eecc6746f4dbe07d2 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -107,7 +107,7 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
     deployMode = Option(deployMode).getOrElse(System.getenv("DEPLOY_MODE"))
 
     // Global defaults. These should be kept to a minimum to avoid confusing behavior.
-    master = Option(master).getOrElse("local")
+    master = Option(master).getOrElse("local[*]")
   }
 
   /** Ensure that required fields exist. Call this only once all defaults are loaded. */
diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md
index a3171709ff05b47b5b0d0572d33012926627f312..b8d89cf00ffbda7e1a1227fa6f42363381eb099d 100644
--- a/docs/scala-programming-guide.md
+++ b/docs/scala-programming-guide.md
@@ -60,17 +60,18 @@ which avoids hard-coding the master name in your application.
 
 In the Spark shell, a special interpreter-aware SparkContext is already created for you, in the
 variable called `sc`. Making your own SparkContext will not work. You can set which master the
-context connects to using the `MASTER` environment variable, and you can add JARs to the classpath
-with the `ADD_JARS` variable. For example, to run `bin/spark-shell` on exactly four cores, use
+context connects to using the `--master` argument, and you can add JARs to the classpath
+by passing a comma-separated list to the `--jars` argument. For example, to run
+`bin/spark-shell` on exactly four cores, use
 
 {% highlight bash %}
-$ MASTER=local[4] ./bin/spark-shell
+$ ./bin/spark-shell --master local[4]
 {% endhighlight %}
 
 Or, to also add `code.jar` to its classpath, use:
 
 {% highlight bash %}
-$ MASTER=local[4] ADD_JARS=code.jar ./bin/spark-shell
+$ ./bin/spark-shell --master local[4] --jars code.jar
 {% endhighlight %}
 
 ### Master URLs
diff --git a/docs/spark-debugger.md b/docs/spark-debugger.md
index 891c2bfa8943d45d2bc874043251d3f68ebe1288..35d06c51aaf0ef2163cebd8511a689e9bd3b2977 100644
--- a/docs/spark-debugger.md
+++ b/docs/spark-debugger.md
@@ -39,7 +39,7 @@ where `path/to/event-log` is where you want the event log to go relative to `$SP
 
 ### Loading the event log into the debugger
 
-1. Run a Spark shell with `MASTER=<i>host</i> ./bin/spark-shell`.
+1. Run a Spark shell with `./bin/spark-shell --master <i>host</i>`.
 2. Use `EventLogReader` to load the event log as follows:
 {% highlight scala %}
 spark> val r = new spark.EventLogReader(sc, Some("path/to/event-log"))
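The documentation edits all make the same substitution: the `MASTER` and `ADD_JARS` environment variables give way to `spark-submit`-style flags. The old and new invocations side by side:

```bash
# MASTER=local[4] ./bin/spark-shell                      # old
./bin/spark-shell --master local[4]                      # new

# MASTER=local[4] ADD_JARS=code.jar ./bin/spark-shell    # old
./bin/spark-shell --master local[4] --jars code.jar      # new
```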
diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index 7e4eea323aa63f2310507a64543c597fac0b4eb1..dc7f206e039968da69c44e5a968b8331f7697bda 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -139,12 +139,12 @@ constructor](scala-programming-guide.html#initializing-spark).
 
 To run an interactive Spark shell against the cluster, run the following command:
 
-    MASTER=spark://IP:PORT ./bin/spark-shell
+    ./bin/spark-shell --master spark://IP:PORT
 
 Note that if you are running spark-shell from one of the spark cluster machines, the `bin/spark-shell` script will
 automatically set MASTER from the `SPARK_MASTER_IP` and `SPARK_MASTER_PORT` variables in `conf/spark-env.sh`.
 
-You can also pass an option `-c <numCores>` to control the number of cores that spark-shell uses on the cluster.
+You can also pass an option `--cores <numCores>` to control the number of cores that spark-shell uses on the cluster.
 
 # Launching Compiled Spark Applications
diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md
index 946d6c4879cf3455830f43138901539b44ee43f6..7ad06427cac5c5d806c6831bcfbbc4ab81c12721 100644
--- a/docs/streaming-programming-guide.md
+++ b/docs/streaming-programming-guide.md
@@ -272,12 +272,10 @@ Time: 1357008430000 ms
 </td>
 </table>
 
-If you plan to run the Scala code for Spark Streaming-based use cases in the Spark
-shell, you should start the shell with the SparkConfiguration pre-configured to
-discard old batches periodically:
+You can also use Spark Streaming directly from the Spark shell:
 
 {% highlight bash %}
-$ SPARK_JAVA_OPTS=-Dspark.cleaner.ttl=10000 bin/spark-shell
+$ bin/spark-shell
 {% endhighlight %}
 
 ... and create your StreamingContext by wrapping the existing interactive shell
diff --git a/make-distribution.sh b/make-distribution.sh
index 62a28217740cd2f6df0cfff601d406cbe195b117..55fe6c09d0d66679bf80d2051b8b8e9f952421de 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -36,7 +36,7 @@
 # 2) cd to deploy dir; ./sbin/start-master.sh
 # 3) Verify master is up by visiting web page, ie http://master-ip:8080. Note the spark:// URL.
 # 4) ./sbin/start-slave.sh 1 <<spark:// URL>>
-# 5) MASTER="spark://my-master-ip:7077" ./bin/spark-shell
+# 5) ./bin/spark-shell --master spark://my-master-ip:7077
 #
 
 # Figure out where the Spark framework is installed
diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index beb40e87024bd269fa7ea3591ce4c8d506f2cc5e..296da740687ec52a073afdd5e3ff6301c21295a3 100644
--- a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -963,8 +963,9 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
     val master = this.master match {
       case Some(m) => m
       case None => {
-        val prop = System.getenv("MASTER")
-        if (prop != null) prop else "local[*]"
+        val envMaster = sys.env.get("MASTER")
+        val propMaster = sys.props.get("spark.master")
+        envMaster.orElse(propMaster).getOrElse("local[*]")
       }
     }
     master
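The `SparkILoop` change closes the loop: `spark-submit` is expected to surface `--master` to the REPL's JVM as the `spark.master` system property (an assumption this patch relies on rather than shows), and the REPL now consults that property after the `MASTER` environment variable. Both invocations below should therefore reach the interpreter with the same effective master:

```bash
MASTER=local[4] ./bin/spark-shell     # read via sys.env("MASTER"), takes precedence
./bin/spark-shell --master local[4]   # surfaced as the spark.master property
# With neither set, the REPL falls back to local[*].
```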