diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index cf2f663fa5df7a88bb46907d4852b2038e1dbcba..564706328832115663ad81b0bcbb90484a448609 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -39,7 +39,7 @@ <li><a href="{{HOME_PATH}}index.html">Overview</a></li> <li class="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guide<b class="caret"></b></a> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a> <ul class="dropdown-menu"> <li><a href="{{HOME_PATH}}scala-programming-guide.html">Scala</a></li> <li><a href="{{HOME_PATH}}java-programming-guide.html">Java</a></li> diff --git a/docs/api.md b/docs/api.md index 8a01023ad43bb9c7499a14c74c7f2ed8e5527744..3bdac174f9137b62e18ee70899e78a948d640390 100644 --- a/docs/api.md +++ b/docs/api.md @@ -5,7 +5,6 @@ title: Spark API documentation (Scaladoc) Here you can find links to the Scaladoc generated for the Spark sbt subprojects. If the following links don't work, try running `sbt/sbt doc` from the Spark project home directory. -- [Core](api/core/index.html) -- [Examples](api/examples/index.html) -- [Repl](api/repl/index.html) -- [Bagel](api/bagel/index.html) +- [Core]({{HOME_PATH}}api/core/index.html) +- [Examples]({{HOME_PATH}}api/examples/index.html) +- [Bagel]({{HOME_PATH}}api/bagel/index.html) diff --git a/docs/contributing-to-spark.md b/docs/contributing-to-spark.md index a99ab4153140e3a39951da4200bff79a202c6e1e..fa81b858ebfed4f14973004171fa36293e10fde9 100644 --- a/docs/contributing-to-spark.md +++ b/docs/contributing-to-spark.md @@ -12,5 +12,12 @@ The Spark team welcomes contributions in the form of GitHub pull requests. Here * Always import packages using absolute paths (e.g. `scala.collection.Map` instead of `collection.Map`). * No "infix" syntax for methods other than operators. For example, don't write `table containsKey myKey`; replace it with `table.containsKey(myKey)`. - Add unit tests to your new code. We use [ScalaTest](http://www.scalatest.org/) for testing. Just add a new Suite in `core/src/test`, or methods to an existing Suite. +- If you'd like to report a bug but don't have time to fix it, you can still post it to our [issues page](https://github.com/mesos/spark/issues), or email the [mailing list](http://www.spark-project.org/mailing-lists.html). -If you'd like to report a bug but don't have time to fix it, you can still post it to our [issues page](https://github.com/mesos/spark/issues). Also, feel free to email the [mailing list](http://www.spark-project.org/mailing-lists.html). +# Licensing of Contributions + +Contributions via GitHub pull requests are gladly accepted from their original author. Along with any pull requests, please +state that the contribution is your original work and that you license the work to the project under the project's open source +license. 
*Whether or not you state this explicitly, by submitting any copyrighted material via pull request, email, or other +means you agree to license the material under the project's open source license and warrant that you have the legal authority +to do so.* diff --git a/docs/css/main.css b/docs/css/main.css index 6f37ed9c2b84116dcbbd36cea3468574d2afa173..7feccff53671c8c03bce4fc86aa6ee2a163c5aef 100755 --- a/docs/css/main.css +++ b/docs/css/main.css @@ -3,15 +3,20 @@ ========================================================================== */ .navbar .brand { - background: url(../img/spark-logo-77x40px-hd.png) no-repeat left center; - height: 40px; + background: url(../img/spark-logo-77x50px-hd.png) no-repeat left center; + height: 50px; width: 77px; + margin-left: 1px; padding: 0 10px 0 0; } .navbar-inner { - padding-top: 6px; - padding-bottom: 6px; + margin-top: 2px; + height: 50px; +} + +.navbar-inner .nav { + margin-top: 5px; font-size: 15px; } diff --git a/docs/index.md b/docs/index.md index b9d9a8a36f9362ec0ff2b1b6514c7b93259bd023..154ad42d49165c5deb90698fb59b1647c27949c5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,7 +7,7 @@ title: Spark Overview TODO(andyk): Rewrite to make the Java API a first class part of the story. {% endcomment %} -Spark is a MapReduce-like cluster computing framework designed to support low-latency iterative jobs and interactive use from an interpreter. It exposes clean language-integrated APIs in [Scala](http://www.scala-lang.org) and Java, providing a wide array of parallel operations. Spark can run on top of the [Apache Mesos](http://incubator.apache.org/mesos/) cluster manager, Hadoop YARN, or without an independent resource manager ("standalone mode"). +Spark is a MapReduce-like cluster computing framework designed for low-latency iterative jobs and interactive use from an interpreter. It provides clean, language-integrated APIs in Scala and Java, with a rich array of parallel operators. Spark can run on top of the [Apache Mesos](http://incubator.apache.org/mesos/) cluster manager, Hadoop YARN, Amazon EC2, or without an independent resource manager ("standalone mode"). # Downloading @@ -17,7 +17,7 @@ Get Spark by checking out the master branch of the Git repository, using `git cl Spark requires [Scala 2.9.2](http://www.scala-lang.org/). You will need to have Scala's `bin` directory in your `PATH`, or you will need to set the `SCALA_HOME` environment variable to point -to where you've installed Scala. Scala must be accessible through one +to where you've installed Scala. Scala must also be accessible through one of these methods on slave nodes on your cluster. Spark uses [Simple Build Tool](https://github.com/harrah/xsbt/wiki), which is bundled with it. To compile the code, go into the top-level Spark directory and run @@ -32,9 +32,10 @@ To run one of the samples, use `./run <class> <params>` in the top-level Spark d For example, `./run spark.examples.SparkPi` will run a sample program that estimates Pi. Each of the examples prints usage help if no params are given. -Note that all of the sample programs take a `<host>` parameter that is the Mesos master -to connect to. This can be a [Mesos master URL](http://www.github.com/mesos/mesos/wiki), or `local` to run locally with one -thread, or `local[N]` to run locally with N threads. You should start by using `local` for testing. +Note that all of the sample programs take a `<master>` parameter specifying the cluster URL +to connect to. 
This can be a [URL for a distributed cluster]({{HOME_PATH}}scala-programming-guide.html#master_urls), +or `local` to run locally with one thread, or `local[N]` to run locally with N threads. You should start by using +`local` for testing. Finally, Spark can be used interactively from a modified version of the Scala interpreter that you can start through `./spark-shell`. This is a great way to learn Spark. @@ -49,34 +50,37 @@ of `project/SparkBuild.scala`, then rebuilding Spark (`sbt/sbt clean compile`). # Where to Go from Here -Programming guides: +**Programming guides:** * [Spark Programming Guide]({{HOME_PATH}}scala-programming-guide.html): how to get started using Spark, and details on the Scala API * [Java Programming Guide]({{HOME_PATH}}java-programming-guide.html): using Spark from Java -Deployment guides: +**Deployment guides:** * [Running Spark on Amazon EC2]({{HOME_PATH}}ec2-scripts.html): scripts that let you launch a cluster on EC2 in about 5 minutes * [Standalone Deploy Mode]({{HOME_PATH}}spark-standalone.html): launch a standalone cluster quickly without Mesos * [Running Spark on Mesos]({{HOME_PATH}}running-on-mesos.html): deploy a private cluster using [Apache Mesos](http://incubator.apache.org/mesos) * [Running Spark on YARN]({{HOME_PATH}}running-on-yarn.html): deploy Spark on top of Hadoop NextGen (YARN) -Miscellaneous: -* [Configuration]({{HOME_PATH}}configuration.html): customize Spark via its configuration system. +**Other documents:** +* [Configuration]({{HOME_PATH}}configuration.html): customize Spark via its configuration system +* [API docs (Scaladoc)]({{HOME_PATH}}api/core/index.html) * [Bagel]({{HOME_PATH}}bagel-programming-guide.html): an implementation of Google's Pregel on Spark * [Contributing to Spark](contributing-to-spark.html) -# Other Resources - +**External resources:** * [Spark Homepage](http://www.spark-project.org) -* [AMP Camp](http://ampcamp.berkeley.edu/) - In 2012, the AMP Lab hosted the first AMP Camp which featured talks and hands-on exercises about Spark, Shark, Mesos, and more. [Videos, slides](http://ampcamp.berkeley.edu/agenda) and the [exercises](http://ampcamp.berkeley.edu/exercises) are all available online, and provide a great introduction to Spark. -* [Paper describing the programming model](http://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf) +* [AMP Camp](http://ampcamp.berkeley.edu/): a two-day training camp at UC Berkeley that featured talks and exercises + about Spark, Shark, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/agenda-2012), + [slides](http://ampcamp.berkeley.edu/agenda-2012) and [exercises](http://ampcamp.berkeley.edu/exercises-2012) are + available online for free. * [Code Examples](http://spark-project.org/examples.html): more are also available in the [examples subfolder](https://github.com/mesos/spark/tree/master/examples/src/main/scala/spark/examples) of Spark -* [Mailing List](http://groups.google.com/group/spark-users): ask here for help +* [Paper describing the Spark system](http://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf) +* [Mailing List](http://groups.google.com/group/spark-users) # Community -To keep up with Spark development or get help, sign up for the [spark-users mailing list](http://groups.google.com/group/spark-users). +To get help using Spark or keep up with Spark development, sign up for the [spark-users mailing list](http://groups.google.com/group/spark-users). 
If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. -If you'd like to contribute code to Spark, read [how to contribute]({{HOME_PATH}}contributing-to-spark.html). +Finally, if you'd like to contribute code to Spark, read [how to contribute]({{HOME_PATH}}contributing-to-spark.html). diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md index 94d304e23a76d21350fb999d6960a3fe72f73b21..ad06c30dbfef266e8dae67b58ccb0f406149ceef 100644 --- a/docs/scala-programming-guide.md +++ b/docs/scala-programming-guide.md @@ -28,27 +28,33 @@ This is done through the following constructor: new SparkContext(master, jobName, [sparkHome], [jars]) {% endhighlight %} -The `master` parameter is a string specifying a [Mesos]({{HOME_PATH}}running-on-mesos.html) cluster to connect to, or a special "local" string to run in local mode, as described below. `jobName` is a name for your job, which will be shown in the Mesos web UI when running on a cluster. Finally, the last two parameters are needed to deploy your code to a cluster if running on Mesos, as described later. +The `master` parameter is a string specifying a [Mesos]({{HOME_PATH}}running-on-mesos.html) cluster to connect to, or a special "local" string to run in local mode, as described below. `jobName` is a name for your job, which will be shown in the Mesos web UI when running on a cluster. Finally, the last two parameters are needed to deploy your code to a cluster if running in distributed mode, as described later. In the Spark interpreter, a special interpreter-aware SparkContext is already created for you, in the variable called `sc`. Making your own SparkContext will not work. You can set which master the context connects to using the `MASTER` environment variable. For example, run `MASTER=local[4] ./spark-shell` to run locally with four cores. -### Master Names +### Master URLs -The master name can be in one of three formats: +The master URL passed to Spark can be in one of the following formats: <table class="table"> -<tr><th>Master Name</th><th>Meaning</th></tr> +<tr><th>Master URL</th><th>Meaning</th></tr> <tr><td> local </td><td> Run Spark locally with one worker thread (i.e. no parallelism at all). </td></tr> -<tr><td> local[K] </td><td> Run Spark locally with K worker threads (which should be set to the number of cores on your machine). </td></tr> -<tr><td> HOST:PORT </td><td> Connect Spark to the given (Mesos)({{HOME_PATH}}running-on-mesos.html) master to run on a cluster. The host parameter is the hostname of the Mesos master. The port must be whichever one the master is configured to use, which is 5050 by default. -<br /><br /> -<strong>NOTE:</strong> In earlier versions of Mesos (the <code>old-mesos</code> branch of Spark), you need to use master@HOST:PORT. +<tr><td> local[K] </td><td> Run Spark locally with K worker threads (ideally, set this to the number of cores on your machine). +</td></tr> +<tr><td> spark://HOST:PORT </td><td> Connect to the given <a href="{{HOME_PATH}}spark-standalone.html">Spark standalone + cluster</a> master. The port must be whichever one your master is configured to use, which is 7077 by default. +</td></tr> +<tr><td> mesos://HOST:PORT </td><td> Connect Spark to the given <a href="{{HOME_PATH}}running-on-mesos.html">Mesos</a> cluster. + The host parameter is the hostname of the Mesos master. 
The port must be whichever one the master is configured to use, + which is 5050 by default. </td></tr> </table> -### Deploying to a Cluster +For running on YARN, Spark launches an instance of the standalone deploy cluster within YARN; see [running on YARN]({{HOME_PATH}}running-on-yarn.html) for details. + +### Running on a Cluster -If you want to run your job on a cluster, you will need to specify the two optional parameters: +If you want to run your job on a cluster, you will need to specify the two optional parameters to `SparkContext` to let it find your code: * `sparkHome`: The path at which Spark is installed on your worker machines (it should be the same on all of them). * `jars`: A list of JAR files on the local machine containing your job's code and any dependencies, which Spark will deploy to all the worker nodes. You'll need to package your job into a set of JARs using your build system. For example, if you're using SBT, the [sbt-assembly](https://github.com/sbt/sbt-assembly) plugin is a good way to make a single JAR with your code and dependencies. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 55e7f4e71cc0e59359fe08ee12edd39110b07bfb..f0bebcf3d9b8f3a2ee4c7a570f0c5e340180156c 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -14,7 +14,7 @@ In addition to running on top of [Mesos](https://github.com/mesos/mesos), Spark ## Getting Started -Download and compile Spark as described in the [Getting Started Guide](index.html). You do not need to install mesos on your machine if you are using the standalone mode. +Download and compile Spark as described in the [Getting Started Guide](index.html). You do not need to install Mesos on your machine if you are using the standalone mode. ## Standalone Mode Configuration @@ -22,7 +22,7 @@ The `conf/spark_env.sh` file contains several configuration parameters for the s - SPARK\_MASTER\_IP - Use this to bind the master to a particular ip address, for example a public one. (Default: local ip address) - SPARK\_MASTER\_PORT - Start the spark master on a different port (Default: 7077) -- SPARK\_MASTER\_WEBUI\_POR - Specify a different port for the Master WebUI (Default: 8080) +- SPARK\_MASTER\_WEBUI\_PORT - Specify a different port for the Master WebUI (Default: 8080) - SPARK\_WORKER\_PORT - Start the spark worker on a specific port (Default: random) - SPARK\_WORKER\_CORES - Specify the number of cores to use (Default: all available cores) - SPARK\_WORKER\_MEMORY - Specify how much memory to use, e.g. 
1000M, 2G (Default: MAX(Available - 1024MB, 512MB)) diff --git a/examples/src/main/scala/spark/examples/BroadcastTest.scala b/examples/src/main/scala/spark/examples/BroadcastTest.scala index 4a560131e3f648b5f2e8bfb1a96b1aa35e7df4e2..08be49a41cc9ceac10fbc01118a44b4a1d5d40f2 100644 --- a/examples/src/main/scala/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/spark/examples/BroadcastTest.scala @@ -5,7 +5,7 @@ import spark.SparkContext object BroadcastTest { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: BroadcastTest <host> [<slices>] [numElem]") + System.err.println("Usage: BroadcastTest <master> [<slices>] [numElem]") System.exit(1) } diff --git a/examples/src/main/scala/spark/examples/ExceptionHandlingTest.scala b/examples/src/main/scala/spark/examples/ExceptionHandlingTest.scala index 90941ddbf085ba08830eb7a25f83e2f7e7363cab..c89f3dac0c5a396b74039b909213025a847a2c7c 100644 --- a/examples/src/main/scala/spark/examples/ExceptionHandlingTest.scala +++ b/examples/src/main/scala/spark/examples/ExceptionHandlingTest.scala @@ -5,7 +5,7 @@ import spark.SparkContext object ExceptionHandlingTest { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: ExceptionHandlingTest <host>") + System.err.println("Usage: ExceptionHandlingTest <master>") System.exit(1) } diff --git a/examples/src/main/scala/spark/examples/GroupByTest.scala b/examples/src/main/scala/spark/examples/GroupByTest.scala index 9d11692aebd226e1e801b4ffa5ffb5cf1f371336..86dfba3a404987d9aa53f8829b5e22886264cb3c 100644 --- a/examples/src/main/scala/spark/examples/GroupByTest.scala +++ b/examples/src/main/scala/spark/examples/GroupByTest.scala @@ -7,7 +7,7 @@ import java.util.Random object GroupByTest { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: GroupByTest <host> [numMappers] [numKVPairs] [KeySize] [numReducers]") + System.err.println("Usage: GroupByTest <master> [numMappers] [numKVPairs] [KeySize] [numReducers]") System.exit(1) } diff --git a/examples/src/main/scala/spark/examples/MultiBroadcastTest.scala b/examples/src/main/scala/spark/examples/MultiBroadcastTest.scala index 240376da908ab6cd33e2f80513543386c8d5f4cd..83ae014e9458063c41a15e861c1ee638b43f8ca4 100644 --- a/examples/src/main/scala/spark/examples/MultiBroadcastTest.scala +++ b/examples/src/main/scala/spark/examples/MultiBroadcastTest.scala @@ -5,7 +5,7 @@ import spark.SparkContext object MultiBroadcastTest { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: BroadcastTest <host> [<slices>] [numElem]") + System.err.println("Usage: BroadcastTest <master> [<slices>] [numElem]") System.exit(1) } diff --git a/examples/src/main/scala/spark/examples/SimpleSkewedGroupByTest.scala b/examples/src/main/scala/spark/examples/SimpleSkewedGroupByTest.scala index c44b42c5b62d64a72a1ce595fd3e85894e5f4582..50b3a263b4feb993dbf5b5783758b0d75aff10ec 100644 --- a/examples/src/main/scala/spark/examples/SimpleSkewedGroupByTest.scala +++ b/examples/src/main/scala/spark/examples/SimpleSkewedGroupByTest.scala @@ -7,7 +7,7 @@ import java.util.Random object SimpleSkewedGroupByTest { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: SimpleSkewedGroupByTest <host> " + + System.err.println("Usage: SimpleSkewedGroupByTest <master> " + "[numMappers] [numKVPairs] [valSize] [numReducers] [ratio]") System.exit(1) } diff --git a/examples/src/main/scala/spark/examples/SkewedGroupByTest.scala 
b/examples/src/main/scala/spark/examples/SkewedGroupByTest.scala index 9fad7ab2e5b18623ddff31b4507c8434a44cd2cf..d2117a263e6d8269d13619b134ed8962fd4014b7 100644 --- a/examples/src/main/scala/spark/examples/SkewedGroupByTest.scala +++ b/examples/src/main/scala/spark/examples/SkewedGroupByTest.scala @@ -7,7 +7,7 @@ import java.util.Random object SkewedGroupByTest { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: GroupByTest <host> [numMappers] [numKVPairs] [KeySize] [numReducers]") + System.err.println("Usage: GroupByTest <master> [numMappers] [numKVPairs] [KeySize] [numReducers]") System.exit(1) } diff --git a/examples/src/main/scala/spark/examples/SparkALS.scala b/examples/src/main/scala/spark/examples/SparkALS.scala index 59d5154e08947296039d8e805bea55021528f032..fb28e2c93273f74603de6e8e2e4688dec42c9c26 100644 --- a/examples/src/main/scala/spark/examples/SparkALS.scala +++ b/examples/src/main/scala/spark/examples/SparkALS.scala @@ -107,7 +107,7 @@ object SparkALS { host = host_ } case _ => { - System.err.println("Usage: SparkALS <M> <U> <F> <iters> <slices> <host>") + System.err.println("Usage: SparkALS <M> <U> <F> <iters> <slices> <master>") System.exit(1) } } diff --git a/examples/src/main/scala/spark/examples/SparkLR.scala b/examples/src/main/scala/spark/examples/SparkLR.scala index 6777b471fb17ec26e8d0d186ae5d67b0ed4e2b11..aaaf062c8f9ea79a4a67b9314ba17eeebc9cc691 100644 --- a/examples/src/main/scala/spark/examples/SparkLR.scala +++ b/examples/src/main/scala/spark/examples/SparkLR.scala @@ -25,7 +25,7 @@ object SparkLR { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: SparkLR <host> [<slices>]") + System.err.println("Usage: SparkLR <master> [<slices>]") System.exit(1) } val sc = new SparkContext(args(0), "SparkLR") diff --git a/examples/src/main/scala/spark/examples/SparkPi.scala b/examples/src/main/scala/spark/examples/SparkPi.scala index 751fef6ab09d84a5839ac01883a365ac0a710056..2f226f13805c02ef6c9f8bd6d73d3434283a022c 100644 --- a/examples/src/main/scala/spark/examples/SparkPi.scala +++ b/examples/src/main/scala/spark/examples/SparkPi.scala @@ -7,7 +7,7 @@ import SparkContext._ object SparkPi { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: SparkPi <host> [<slices>]") + System.err.println("Usage: SparkPi <master> [<slices>]") System.exit(1) } val spark = new SparkContext(args(0), "SparkPi") diff --git a/examples/src/main/scala/spark/examples/SparkTC.scala b/examples/src/main/scala/spark/examples/SparkTC.scala index a6e4de4671e2a7d978aafbb556bcac1740cb30b2..90bae011adfb6381bf8fe2409d04f8a3a5598882 100644 --- a/examples/src/main/scala/spark/examples/SparkTC.scala +++ b/examples/src/main/scala/spark/examples/SparkTC.scala @@ -26,7 +26,7 @@ object SparkTC { def main(args: Array[String]) { if (args.length == 0) { - System.err.println("Usage: SparkTC <host> [<slices>]") + System.err.println("Usage: SparkTC <master> [<slices>]") System.exit(1) } val spark = new SparkContext(args(0), "SparkTC")
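
A note on the testing guideline added in the `contributing-to-spark.md` hunk above: new tests go into `core/src/test` as ScalaTest suites. The sketch below is illustrative only — the suite name and the behaviour it checks are made up for this note and are not part of the patch — but it shows the shape such a suite usually takes.

{% highlight scala %}
package spark

import org.scalatest.FunSuite

// Hypothetical suite, shown only to illustrate the convention described in
// contributing-to-spark.md: one Suite class per feature under core/src/test.
class UnionBehaviorSuite extends FunSuite {
  test("union of two RDDs keeps the elements of both") {
    val sc = new SparkContext("local", "test")
    try {
      val left = sc.parallelize(1 to 3)
      val right = sc.parallelize(4 to 6)
      assert(left.union(right).collect().toSet === (1 to 6).toSet)
    } finally {
      sc.stop()  // release the local context so suites can run back to back
    }
  }
}
{% endhighlight %}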
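
The usage-string changes in the example programs above all follow the same convention: the first argument is now the cluster URL (`local`, `local[N]`, `spark://...`, or `mesos://...`) and is passed straight to the `SparkContext` constructor. As a rough sketch of that convention — this program is hypothetical and not part of the patch:

{% highlight scala %}
package spark.examples

import spark.SparkContext

object WordLengths {
  def main(args: Array[String]) {
    if (args.length == 0) {
      System.err.println("Usage: WordLengths <master> [<slices>]")
      System.exit(1)
    }
    val slices = if (args.length > 1) args(1).toInt else 2
    // args(0) is the master URL, exactly as in SparkPi, SparkLR, etc.
    val sc = new SparkContext(args(0), "WordLengths")
    val words = sc.parallelize(Seq("spark", "mesos", "yarn", "standalone"), slices)
    val totalChars = words.map(_.length).reduce(_ + _)
    println("Total characters: " + totalChars)
  }
}
{% endhighlight %}

Such a program would be launched the same way as the bundled examples, e.g. `./run spark.examples.WordLengths local[2]`, with `local[2]` replaced by a real cluster URL when running distributed.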
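
The master-URL table added to `scala-programming-guide.md` maps directly onto that first `SparkContext` argument. A minimal sketch of the accepted forms, with the host name as a placeholder (7077 and 5050 are the defaults quoted in the table):

{% highlight scala %}
import spark.SparkContext

// The first constructor argument is the master URL from the table above:
//   "local"                            - one worker thread, no parallelism
//   "local[4]"                         - four worker threads
//   "spark://master.example.com:7077"  - a standalone cluster (7077 is the default port)
//   "mesos://master.example.com:5050"  - a Mesos cluster (5050 is the default port)
val sc = new SparkContext("local[4]", "MyJob")
{% endhighlight %}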
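
Similarly, the "Running on a Cluster" text above explains that the two optional constructor parameters tell Spark where to find your code on the workers. A hedged sketch, with the install path and JAR name as placeholders:

{% highlight scala %}
import spark.SparkContext

// new SparkContext(master, jobName, [sparkHome], [jars]) -- the last two are only
// needed when running against a real cluster. Paths below are placeholders.
val sc = new SparkContext(
  "spark://master.example.com:7077",               // cluster URL
  "MyJob",                                         // job name shown in the web UI
  "/opt/spark",                                    // sparkHome: Spark's install path on every worker
  List("target/scala-2.9.2/my-job-assembly.jar"))  // jars: your code and dependencies, e.g. built with sbt-assembly
{% endhighlight %}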