diff --git a/assembly/pom.xml b/assembly/pom.xml index c3ab92f993d079e1b045569439f1e6248eb550d6..6c79f9189787d2bf344324d7b7b8f9a6634f3db3 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -44,11 +44,6 @@ <artifactId>spark-core_${scala.binary.version}</artifactId> <version>${project.version}</version> </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-bagel_${scala.binary.version}</artifactId> - <version>${project.version}</version> - </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-mllib_${scala.binary.version}</artifactId> diff --git a/bagel/pom.xml b/bagel/pom.xml deleted file mode 100644 index d45224cc8078d52b2247f1b9bb22c12f3598713a..0000000000000000000000000000000000000000 --- a/bagel/pom.xml +++ /dev/null @@ -1,64 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. - --> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <groupId>org.apache.spark</groupId> - <artifactId>spark-parent_2.10</artifactId> - <version>2.0.0-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> - </parent> - - <groupId>org.apache.spark</groupId> - <artifactId>spark-bagel_2.10</artifactId> - <properties> - <sbt.project.name>bagel</sbt.project.name> - </properties> - <packaging>jar</packaging> - <name>Spark Project Bagel</name> - <url>http://spark.apache.org/</url> - - <dependencies> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-core_${scala.binary.version}</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-core_${scala.binary.version}</artifactId> - <version>${project.version}</version> - <type>test-jar</type> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.scalacheck</groupId> - <artifactId>scalacheck_${scala.binary.version}</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-test-tags_${scala.binary.version}</artifactId> - </dependency> - </dependencies> - <build> - <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory> - <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory> - </build> -</project> diff --git a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala deleted file mode 100644 index 8399033ac61ec04915a264a6511af4065aab02f9..0000000000000000000000000000000000000000 --- a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.bagel - -import org.apache.spark._ -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel - -@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") -object Bagel extends Logging { - val DEFAULT_STORAGE_LEVEL = StorageLevel.MEMORY_AND_DISK - - /** - * Runs a Bagel program. - * @param sc org.apache.spark.SparkContext to use for the program. - * @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the - * Key will be the vertex id. - * @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often - * this will be an empty array, i.e. sc.parallelize(Array[K, Message]()). - * @param combiner [[org.apache.spark.bagel.Combiner]] combines multiple individual messages to a - * given vertex into one message before sending (which often involves network - * I/O). - * @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices - * after each superstep and provides the result to each vertex in the next - * superstep. - * @param partitioner org.apache.spark.Partitioner partitions values by key - * @param numPartitions number of partitions across which to split the graph. - * Default is the default parallelism of the SparkContext - * @param storageLevel org.apache.spark.storage.StorageLevel to use for caching of - * intermediate RDDs in each superstep. Defaults to caching in memory. - * @param compute function that takes a Vertex, optional set of (possibly combined) messages to - * the Vertex, optional Aggregator and the current superstep, - * and returns a set of (Vertex, outgoing Messages) pairs - * @tparam K key - * @tparam V vertex type - * @tparam M message type - * @tparam C combiner - * @tparam A aggregator - * @return an RDD of (K, V) pairs representing the graph after completion of the program - */ - def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, - C: Manifest, A: Manifest]( - sc: SparkContext, - vertices: RDD[(K, V)], - messages: RDD[(K, M)], - combiner: Combiner[M, C], - aggregator: Option[Aggregator[V, A]], - partitioner: Partitioner, - numPartitions: Int, - storageLevel: StorageLevel = DEFAULT_STORAGE_LEVEL - )( - compute: (V, Option[C], Option[A], Int) => (V, Array[M]) - ): RDD[(K, V)] = { - val splits = if (numPartitions != 0) numPartitions else sc.defaultParallelism - - var superstep = 0 - var verts = vertices - var msgs = messages - var noActivity = false - var lastRDD: RDD[(K, (V, Array[M]))] = null - do { - logInfo("Starting superstep " + superstep + ".") - val startTime = System.currentTimeMillis - - val aggregated = agg(verts, aggregator) - val combinedMsgs = msgs.combineByKeyWithClassTag( - combiner.createCombiner _, combiner.mergeMsg _, combiner.mergeCombiners _, partitioner) - val grouped = combinedMsgs.groupWith(verts) - val superstep_ = superstep // Create a read-only copy of superstep for capture in closure - val (processed, numMsgs, numActiveVerts) = - comp[K, V, M, C](sc, grouped, compute(_, _, aggregated, superstep_), storageLevel) - if (lastRDD != null) { - lastRDD.unpersist(false) - } - lastRDD = processed - - val timeTaken = System.currentTimeMillis - startTime - logInfo("Superstep %d took %d s".format(superstep, timeTaken / 1000)) - - verts = processed.mapValues { case (vert, msgs) => vert } - msgs = processed.flatMap { - case (id, (vert, msgs)) => msgs.map(m => (m.targetId, m)) - } - superstep += 1 - - noActivity = numMsgs == 0 && numActiveVerts == 0 - } while (!noActivity) - - verts - } - - /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default - * storage level */ - def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( - sc: SparkContext, - vertices: RDD[(K, V)], - messages: RDD[(K, M)], - combiner: Combiner[M, C], - partitioner: Partitioner, - numPartitions: Int - )( - compute: (V, Option[C], Int) => (V, Array[M])): RDD[(K, V)] = run(sc, vertices, messages, - combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute) - - /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] */ - def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( - sc: SparkContext, - vertices: RDD[(K, V)], - messages: RDD[(K, M)], - combiner: Combiner[M, C], - partitioner: Partitioner, - numPartitions: Int, - storageLevel: StorageLevel - )( - compute: (V, Option[C], Int) => (V, Array[M]) - ): RDD[(K, V)] = { - run[K, V, M, C, Nothing]( - sc, vertices, messages, combiner, None, partitioner, numPartitions, storageLevel)( - addAggregatorArg[K, V, M, C](compute)) - } - - /** - * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default - * org.apache.spark.HashPartitioner and default storage level - */ - def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( - sc: SparkContext, - vertices: RDD[(K, V)], - messages: RDD[(K, M)], - combiner: Combiner[M, C], - numPartitions: Int - )( - compute: (V, Option[C], Int) => (V, Array[M]) - ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions, - DEFAULT_STORAGE_LEVEL)(compute) - - /** - * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the - * default org.apache.spark.HashPartitioner - */ - def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( - sc: SparkContext, - vertices: RDD[(K, V)], - messages: RDD[(K, M)], - combiner: Combiner[M, C], - numPartitions: Int, - storageLevel: StorageLevel - )( - compute: (V, Option[C], Int) => (V, Array[M]) - ): RDD[(K, V)] = { - val part = new HashPartitioner(numPartitions) - run[K, V, M, C, Nothing]( - sc, vertices, messages, combiner, None, part, numPartitions, storageLevel)( - addAggregatorArg[K, V, M, C](compute)) - } - - /** - * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], - * default org.apache.spark.HashPartitioner, - * [[org.apache.spark.bagel.DefaultCombiner]] and the default storage level - */ - def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest]( - sc: SparkContext, - vertices: RDD[(K, V)], - messages: RDD[(K, M)], - numPartitions: Int - )( - compute: (V, Option[Array[M]], Int) => (V, Array[M]) - ): RDD[(K, V)] = run(sc, vertices, messages, numPartitions, DEFAULT_STORAGE_LEVEL)(compute) - - /** - * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], - * the default org.apache.spark.HashPartitioner - * and [[org.apache.spark.bagel.DefaultCombiner]] - */ - def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest]( - sc: SparkContext, - vertices: RDD[(K, V)], - messages: RDD[(K, M)], - numPartitions: Int, - storageLevel: StorageLevel - )( - compute: (V, Option[Array[M]], Int) => (V, Array[M]) - ): RDD[(K, V)] = { - val part = new HashPartitioner(numPartitions) - run[K, V, M, Array[M], Nothing]( - sc, vertices, messages, new DefaultCombiner(), None, part, numPartitions, storageLevel)( - addAggregatorArg[K, V, M, Array[M]](compute)) - } - - /** - * Aggregates the given vertices using the given aggregator, if it - * is specified. - */ - private def agg[K, V <: Vertex, A: Manifest]( - verts: RDD[(K, V)], - aggregator: Option[Aggregator[V, A]] - ): Option[A] = aggregator match { - case Some(a) => - Some(verts.map { - case (id, vert) => a.createAggregator(vert) - }.reduce(a.mergeAggregators(_, _))) - case None => None - } - - /** - * Processes the given vertex-message RDD using the compute - * function. Returns the processed RDD, the number of messages - * created, and the number of active vertices. - */ - private def comp[K: Manifest, V <: Vertex, M <: Message[K], C]( - sc: SparkContext, - grouped: RDD[(K, (Iterable[C], Iterable[V]))], - compute: (V, Option[C]) => (V, Array[M]), - storageLevel: StorageLevel - ): (RDD[(K, (V, Array[M]))], Int, Int) = { - var numMsgs = sc.accumulator(0) - var numActiveVerts = sc.accumulator(0) - val processed = grouped.mapValues(x => (x._1.iterator, x._2.iterator)) - .flatMapValues { - case (_, vs) if !vs.hasNext => None - case (c, vs) => { - val (newVert, newMsgs) = - compute(vs.next, - c.hasNext match { - case true => Some(c.next) - case false => None - } - ) - - numMsgs += newMsgs.size - if (newVert.active) { - numActiveVerts += 1 - } - - Some((newVert, newMsgs)) - } - }.persist(storageLevel) - - // Force evaluation of processed RDD for accurate performance measurements - processed.foreach(x => {}) - - (processed, numMsgs.value, numActiveVerts.value) - } - - /** - * Converts a compute function that doesn't take an aggregator to - * one that does, so it can be passed to Bagel.run. - */ - private def addAggregatorArg[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C]( - compute: (V, Option[C], Int) => (V, Array[M]) - ): (V, Option[C], Option[Nothing], Int) => (V, Array[M]) = { - (vert: V, msgs: Option[C], aggregated: Option[Nothing], superstep: Int) => - compute(vert, msgs, superstep) - } -} - -@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") -trait Combiner[M, C] { - def createCombiner(msg: M): C - def mergeMsg(combiner: C, msg: M): C - def mergeCombiners(a: C, b: C): C -} - -@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") -trait Aggregator[V, A] { - def createAggregator(vert: V): A - def mergeAggregators(a: A, b: A): A -} - -/** Default combiner that simply appends messages together (i.e. performs no aggregation) */ -@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") -class DefaultCombiner[M: Manifest] extends Combiner[M, Array[M]] with Serializable { - def createCombiner(msg: M): Array[M] = - Array(msg) - def mergeMsg(combiner: Array[M], msg: M): Array[M] = - combiner :+ msg - def mergeCombiners(a: Array[M], b: Array[M]): Array[M] = - a ++ b -} - -/** - * Represents a Bagel vertex. - * - * Subclasses may store state along with each vertex and must - * inherit from java.io.Serializable or scala.Serializable. - */ -@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") -trait Vertex { - def active: Boolean -} - -/** - * Represents a Bagel message to a target vertex. - * - * Subclasses may contain a payload to deliver to the target vertex - * and must inherit from java.io.Serializable or scala.Serializable. - */ -@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") -trait Message[K] { - def targetId: K -} diff --git a/bagel/src/main/scala/org/apache/spark/bagel/package-info.java b/bagel/src/main/scala/org/apache/spark/bagel/package-info.java deleted file mode 100644 index 81f26f276549f2ee5bbaadd6adcc05aaea8486e7..0000000000000000000000000000000000000000 --- a/bagel/src/main/scala/org/apache/spark/bagel/package-info.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Bagel: An implementation of Pregel in Spark. THIS IS DEPRECATED - use Spark's GraphX library. - */ -package org.apache.spark.bagel; \ No newline at end of file diff --git a/bagel/src/main/scala/org/apache/spark/bagel/package.scala b/bagel/src/main/scala/org/apache/spark/bagel/package.scala deleted file mode 100644 index 2fb1934579781e041773d7e62c00b317de300370..0000000000000000000000000000000000000000 --- a/bagel/src/main/scala/org/apache/spark/bagel/package.scala +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark - -/** - * Bagel: An implementation of Pregel in Spark. THIS IS DEPRECATED - use Spark's GraphX library. - */ -package object bagel diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 52d3ab34c1784ea70126617fccc5482f02b457dd..669b6b614e38c4c73ccb3379ceac4ed3b12e7339 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -965,7 +965,7 @@ private[spark] object SparkSubmitUtils { // We need to specify each component explicitly, otherwise we miss spark-streaming-kafka and // other spark-streaming utility components. Underscore is there to differentiate between // spark-streaming_2.1x and spark-streaming-kafka-assembly_2.1x - val components = Seq("bagel_", "catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_", + val components = Seq("catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_", "sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_") components.foreach { comp => diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala index 63c346c1b8908a3f68907bc9034d59be8372b024..4b5039b668a46a7ba8cb17ae53212d7945843018 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala @@ -171,7 +171,7 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll { } test("neglects Spark and Spark's dependencies") { - val components = Seq("bagel_", "catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_", + val components = Seq("catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_", "sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_") val coordinates = diff --git a/dev/audit-release/audit_release.py b/dev/audit-release/audit_release.py index 27d1dd784ce2e4a45859cc7af4b2060fa6543f14..972be30da1eb6add616d539d61763efae5606a0d 100755 --- a/dev/audit-release/audit_release.py +++ b/dev/audit-release/audit_release.py @@ -115,7 +115,7 @@ original_dir = os.getcwd() # maven that links against them. This will catch issues with messed up # dependencies within those projects. modules = [ - "spark-core", "spark-bagel", "spark-mllib", "spark-streaming", "spark-repl", + "spark-core", "spark-mllib", "spark-streaming", "spark-repl", "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka", "spark-streaming-mqtt", "spark-streaming-twitter", "spark-streaming-zeromq", "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl" diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index 3089474c13385f2280e009e5980cf8f911fb9b5c..62d75eff71057f3f1b0d507b736ff9801ea4d495 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -75,7 +75,6 @@ <li><a href="sql-programming-guide.html">DataFrames, Datasets and SQL</a></li> <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li> <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li> - <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li> <li><a href="sparkr.html">SparkR (R on Spark)</a></li> </ul> </li> diff --git a/docs/bagel-programming-guide.md b/docs/bagel-programming-guide.md deleted file mode 100644 index 347ca4a7af9892596a4b14d2ca35a4e0fdb81ae1..0000000000000000000000000000000000000000 --- a/docs/bagel-programming-guide.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -layout: global -displayTitle: Bagel Programming Guide -title: Bagel ---- - -**Bagel is deprecated, and superseded by [GraphX](graphx-programming-guide.html).** - -Bagel is a Spark implementation of Google's [Pregel](http://portal.acm.org/citation.cfm?id=1807184) graph processing framework. Bagel currently supports basic graph computation, combiners, and aggregators. - -In the Pregel programming model, jobs run as a sequence of iterations called _supersteps_. In each superstep, each vertex in the graph runs a user-specified function that can update state associated with the vertex and send messages to other vertices for use in the *next* iteration. - -This guide shows the programming model and features of Bagel by walking through an example implementation of PageRank on Bagel. - -# Linking with Bagel - -To use Bagel in your program, add the following SBT or Maven dependency: - - groupId = org.apache.spark - artifactId = spark-bagel_{{site.SCALA_BINARY_VERSION}} - version = {{site.SPARK_VERSION}} - -# Programming Model - -Bagel operates on a graph represented as a [distributed dataset](programming-guide.html) of (K, V) pairs, where keys are vertex IDs and values are vertices plus their associated state. In each superstep, Bagel runs a user-specified compute function on each vertex that takes as input the current vertex state and a list of messages sent to that vertex during the previous superstep, and returns the new vertex state and a list of outgoing messages. - -For example, we can use Bagel to implement PageRank. Here, vertices represent pages, edges represent links between pages, and messages represent shares of PageRank sent to the pages that a particular page links to. - -We first extend the default `Vertex` class to store a `Double` -representing the current PageRank of the vertex, and similarly extend -the `Message` and `Edge` classes. Note that these need to be marked `@serializable` to allow Spark to transfer them across machines. We also import the Bagel types and implicit conversions. - -{% highlight scala %} -import org.apache.spark.bagel._ -import org.apache.spark.bagel.Bagel._ - -@serializable class PREdge(val targetId: String) extends Edge - -@serializable class PRVertex( - val id: String, val rank: Double, val outEdges: Seq[Edge], - val active: Boolean) extends Vertex - -@serializable class PRMessage( - val targetId: String, val rankShare: Double) extends Message -{% endhighlight %} - -Next, we load a sample graph from a text file as a distributed dataset and package it into `PRVertex` objects. We also cache the distributed dataset because Bagel will use it multiple times and we'd like to avoid recomputing it. - -{% highlight scala %} -val input = sc.textFile("data/mllib/pagerank_data.txt") - -val numVerts = input.count() - -val verts = input.map(line => { - val fields = line.split('\t') - val (id, linksStr) = (fields(0), fields(1)) - val links = linksStr.split(',').map(new PREdge(_)) - (id, new PRVertex(id, 1.0 / numVerts, links, true)) -}).cache -{% endhighlight %} - -We run the Bagel job, passing in `verts`, an empty distributed dataset of messages, and a custom compute function that runs PageRank for 10 iterations. - -{% highlight scala %} -val emptyMsgs = sc.parallelize(List[(String, PRMessage)]()) - -def compute(self: PRVertex, msgs: Option[Seq[PRMessage]], superstep: Int) -: (PRVertex, Iterable[PRMessage]) = { - val msgSum = msgs.getOrElse(List()).map(_.rankShare).sum - val newRank = - if (msgSum != 0) - 0.15 / numVerts + 0.85 * msgSum - else - self.rank - val halt = superstep >= 10 - val msgsOut = - if (!halt) - self.outEdges.map(edge => - new PRMessage(edge.targetId, newRank / self.outEdges.size)) - else - List() - (new PRVertex(self.id, newRank, self.outEdges, !halt), msgsOut) -} -{% endhighlight %} - -val result = Bagel.run(sc, verts, emptyMsgs)()(compute) - -Finally, we print the results. - -{% highlight scala %} -println(result.map(v => "%s\t%s\n".format(v.id, v.rank)).collect.mkString) -{% endhighlight %} - -## Combiners - -Sending a message to another vertex generally involves expensive communication over the network. For certain algorithms, it's possible to reduce the amount of communication using _combiners_. For example, if the compute function receives integer messages and only uses their sum, it's possible for Bagel to combine multiple messages to the same vertex by summing them. - -For combiner support, Bagel can optionally take a set of combiner functions that convert messages to their combined form. - -_Example: PageRank with combiners_ - -## Aggregators - -Aggregators perform a reduce across all vertices after each superstep, and provide the result to each vertex in the next superstep. - -For aggregator support, Bagel can optionally take an aggregator function that reduces across each vertex. - -_Example_ - -## Operations - -Here are the actions and types in the Bagel API. See [Bagel.scala](https://github.com/apache/spark/blob/master/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala) for details. - -### Actions - -{% highlight scala %} -/*** Full form ***/ - -Bagel.run(sc, vertices, messages, combiner, aggregator, partitioner, numSplits)(compute) -// where compute takes (vertex: V, combinedMessages: Option[C], aggregated: Option[A], superstep: Int) -// and returns (newVertex: V, outMessages: Array[M]) - -/*** Abbreviated forms ***/ - -Bagel.run(sc, vertices, messages, combiner, partitioner, numSplits)(compute) -// where compute takes (vertex: V, combinedMessages: Option[C], superstep: Int) -// and returns (newVertex: V, outMessages: Array[M]) - -Bagel.run(sc, vertices, messages, combiner, numSplits)(compute) -// where compute takes (vertex: V, combinedMessages: Option[C], superstep: Int) -// and returns (newVertex: V, outMessages: Array[M]) - -Bagel.run(sc, vertices, messages, numSplits)(compute) -// where compute takes (vertex: V, messages: Option[Array[M]], superstep: Int) -// and returns (newVertex: V, outMessages: Array[M]) -{% endhighlight %} - -### Types - -{% highlight scala %} -trait Combiner[M, C] { - def createCombiner(msg: M): C - def mergeMsg(combiner: C, msg: M): C - def mergeCombiners(a: C, b: C): C -} - -trait Aggregator[V, A] { - def createAggregator(vert: V): A - def mergeAggregators(a: A, b: A): A -} - -trait Vertex { - def active: Boolean -} - -trait Message[K] { - def targetId: K -} -{% endhighlight %} diff --git a/examples/pom.xml b/examples/pom.xml index d27a5096a6511d7f84daebfd09c36bdb2033cfdb..1a0d5e5854642427c7750ccde8881d067a56c035 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -53,12 +53,6 @@ <version>${project.version}</version> <scope>provided</scope> </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-bagel_${scala.binary.version}</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-hive_${scala.binary.version}</artifactId> diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java index 55fe156cf665f96959da2ebe87bf625fb9fb7283..68af14397ba81671ee74afe7066fdc8fa413760b 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java @@ -146,7 +146,7 @@ abstract class AbstractCommandBuilder { boolean isTesting = "1".equals(getenv("SPARK_TESTING")); if (prependClasses || isTesting) { String scala = getScalaVersion(); - List<String> projects = Arrays.asList("core", "repl", "mllib", "bagel", "graphx", + List<String> projects = Arrays.asList("core", "repl", "mllib", "graphx", "streaming", "tools", "sql/catalyst", "sql/core", "sql/hive", "sql/hive-thriftserver", "yarn", "launcher", "network/common", "network/shuffle", "network/yarn"); if (prependClasses) { diff --git a/pom.xml b/pom.xml index 1f570727dc4ccf8d01bd6e2695e1cace80bdd4d0..32918d6a74af9a999d8445377a6ac6ecfb1bc7af 100644 --- a/pom.xml +++ b/pom.xml @@ -88,7 +88,6 @@ <modules> <module>tags</module> <module>core</module> - <module>bagel</module> <!-- Deprecated --> <module>graphx</module> <module>mllib</module> <module>tools</module> @@ -194,7 +193,7 @@ declared in the projects that build assemblies. For other projects the scope should remain as "compile", otherwise they are not available - during compilation if the dependency is transivite (e.g. "bagel/" depending on "core/" and + during compilation if the dependency is transivite (e.g. "graphx/" depending on "core/" and needing Hadoop classes in the classpath to compile). --> <flume.deps.scope>compile</flume.deps.scope> diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index b1dcaedcba75ec3c7583060d520e5d8703943132..c3d53f835f395c89ec079f783718bfe7dbf8a9d8 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -34,10 +34,10 @@ object BuildCommons { private val buildLocation = file(".").getAbsoluteFile.getParentFile - val allProjects@Seq(bagel, catalyst, core, graphx, hive, hiveThriftServer, mllib, repl, + val allProjects@Seq(catalyst, core, graphx, hive, hiveThriftServer, mllib, repl, sql, networkCommon, networkShuffle, streaming, streamingFlumeSink, streamingFlume, streamingKafka, streamingMqtt, streamingTwitter, streamingZeromq, launcher, unsafe, testTags) = - Seq("bagel", "catalyst", "core", "graphx", "hive", "hive-thriftserver", "mllib", "repl", + Seq("catalyst", "core", "graphx", "hive", "hive-thriftserver", "mllib", "repl", "sql", "network-common", "network-shuffle", "streaming", "streaming-flume-sink", "streaming-flume", "streaming-kafka", "streaming-mqtt", "streaming-twitter", "streaming-zeromq", "launcher", "unsafe", "test-tags").map(ProjectRef(buildLocation, _)) @@ -352,7 +352,7 @@ object OldDeps { scalaVersion := "2.10.5", libraryDependencies := Seq("spark-streaming-mqtt", "spark-streaming-zeromq", "spark-streaming-flume", "spark-streaming-kafka", "spark-streaming-twitter", - "spark-streaming", "spark-mllib", "spark-bagel", "spark-graphx", + "spark-streaming", "spark-mllib", "spark-graphx", "spark-core").map(versionArtifact(_).get intransitive()) ) } @@ -556,7 +556,7 @@ object Unidoc { unidocProjectFilter in(ScalaUnidoc, unidoc) := inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, testTags), unidocProjectFilter in(JavaUnidoc, unidoc) := - inAnyProject -- inProjects(OldDeps.project, repl, bagel, examples, tools, streamingFlumeSink, yarn, testTags), + inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, testTags), // Skip actual catalyst, but include the subproject. // Catalyst is not public API and contains quasiquotes which break scaladoc. diff --git a/repl/pom.xml b/repl/pom.xml index 67f9866509337b98a0b99ed3a4f91b781c781fe1..efc3dd452e329dde4391bfd9f4cc8eb15f4f5fe6 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -50,12 +50,6 @@ <type>test-jar</type> <scope>test</scope> </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-bagel_${scala.binary.version}</artifactId> - <version>${project.version}</version> - <scope>runtime</scope> - </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-mllib_${scala.binary.version}</artifactId>