From 43b24635ee45a845f2432bc13c11fcf2eb02f2f3 Mon Sep 17 00:00:00 2001 From: Matei Zaharia <matei@eecs.berkeley.edu> Date: Fri, 5 Jul 2013 11:38:53 -0700 Subject: [PATCH] Renamed ML package to MLlib and added it to classpath --- bin/compute-classpath.cmd | 2 ++ bin/compute-classpath.sh | 2 ++ {ml => mllib}/data/als/test.data | 0 {ml => mllib}/data/lr-data/random.data | 0 {ml => mllib}/data/ridge-data/lpsa.data | 0 .../src/main/scala/spark/ml/clustering/KMeans.scala | 9 +++++---- .../src/main/scala/spark/ml/clustering/KMeansModel.scala | 4 ++-- .../src/main/scala/spark/ml/clustering/LocalKMeans.scala | 4 ++-- .../src/main/scala/spark/ml/optimization/Gradient.scala | 2 +- .../scala/spark/ml/optimization/GradientDescent.scala | 2 +- .../src/main/scala/spark/ml/optimization/Updater.scala | 2 +- .../src/main/scala/spark/ml/recommendation/ALS.scala | 2 +- .../ml/recommendation/MatrixFactorizationModel.scala | 2 +- .../scala/spark/ml/regression/LogisticRegression.scala | 6 +++--- .../ml/regression/LogisticRegressionGenerator.scala | 4 ++-- .../src/main/scala/spark/ml/regression/Regression.scala | 2 +- .../main/scala/spark/ml/regression/RidgeRegression.scala | 4 ++-- .../spark/ml/regression/RidgeRegressionGenerator.scala | 4 ++-- {ml => mllib}/src/main/scala/spark/ml/util/MLUtils.scala | 2 +- {ml => mllib}/src/test/resources/log4j.properties | 0 .../src/test/scala/spark/ml/clustering/KMeansSuite.scala | 2 +- .../test/scala/spark/ml/recommendation/ALSSuite.scala | 2 +- .../spark/ml/regression/LogisticRegressionSuite.scala | 2 +- .../scala/spark/ml/regression/RidgeRegressionSuite.scala | 2 +- project/SparkBuild.scala | 8 ++++---- 25 files changed, 37 insertions(+), 32 deletions(-) rename {ml => mllib}/data/als/test.data (100%) rename {ml => mllib}/data/lr-data/random.data (100%) rename {ml => mllib}/data/ridge-data/lpsa.data (100%) rename {ml => mllib}/src/main/scala/spark/ml/clustering/KMeans.scala (97%) rename {ml => mllib}/src/main/scala/spark/ml/clustering/KMeansModel.scala (92%) rename {ml => mllib}/src/main/scala/spark/ml/clustering/LocalKMeans.scala (97%) rename {ml => mllib}/src/main/scala/spark/ml/optimization/Gradient.scala (96%) rename {ml => mllib}/src/main/scala/spark/ml/optimization/GradientDescent.scala (98%) rename {ml => mllib}/src/main/scala/spark/ml/optimization/Updater.scala (96%) rename {ml => mllib}/src/main/scala/spark/ml/recommendation/ALS.scala (99%) rename {ml => mllib}/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala (95%) rename {ml => mllib}/src/main/scala/spark/ml/regression/LogisticRegression.scala (98%) rename {ml => mllib}/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala (94%) rename {ml => mllib}/src/main/scala/spark/ml/regression/Regression.scala (94%) rename {ml => mllib}/src/main/scala/spark/ml/regression/RidgeRegression.scala (98%) rename {ml => mllib}/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala (96%) rename {ml => mllib}/src/main/scala/spark/ml/util/MLUtils.scala (99%) rename {ml => mllib}/src/test/resources/log4j.properties (100%) rename {ml => mllib}/src/test/scala/spark/ml/clustering/KMeansSuite.scala (99%) rename {ml => mllib}/src/test/scala/spark/ml/recommendation/ALSSuite.scala (98%) rename {ml => mllib}/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala (98%) rename {ml => mllib}/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala (97%) diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd index 6e7efbd334..44826f339c 100644 --- a/bin/compute-classpath.cmd +++ b/bin/compute-classpath.cmd @@ -15,6 +15,7 @@ set CORE_DIR=%FWDIR%core set REPL_DIR=%FWDIR%repl set EXAMPLES_DIR=%FWDIR%examples set BAGEL_DIR=%FWDIR%bagel +set MLLIB_DIR=%FWDIR%mllib set STREAMING_DIR=%FWDIR%streaming set PYSPARK_DIR=%FWDIR%python @@ -29,6 +30,7 @@ set CLASSPATH=%CLASSPATH%;%FWDIR%lib_managed\bundles\* set CLASSPATH=%CLASSPATH%;%FWDIR%repl\lib\* set CLASSPATH=%CLASSPATH%;%FWDIR%python\lib\* set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes +set CLASSPATH=%CLASSPATH%;%MLLIB_DIR%\target\scala-%SCALA_VERSION%\classes rem Add hadoop conf dir - else FileSystem.*, etc fail rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index 3a78880290..75c58d1181 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -18,6 +18,7 @@ REPL_DIR="$FWDIR/repl" REPL_BIN_DIR="$FWDIR/repl-bin" EXAMPLES_DIR="$FWDIR/examples" BAGEL_DIR="$FWDIR/bagel" +MLLIB_DIR="$FWDIR/mllib" STREAMING_DIR="$FWDIR/streaming" PYSPARK_DIR="$FWDIR/python" @@ -49,6 +50,7 @@ if [ -e $REPL_BIN_DIR/target ]; then CLASSPATH+=":$EXAMPLES_JAR" fi CLASSPATH="$CLASSPATH:$BAGEL_DIR/target/scala-$SCALA_VERSION/classes" +CLASSPATH="$CLASSPATH:$MLLIB_DIR/target/scala-$SCALA_VERSION/classes" for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do CLASSPATH="$CLASSPATH:$jar" done diff --git a/ml/data/als/test.data b/mllib/data/als/test.data similarity index 100% rename from ml/data/als/test.data rename to mllib/data/als/test.data diff --git a/ml/data/lr-data/random.data b/mllib/data/lr-data/random.data similarity index 100% rename from ml/data/lr-data/random.data rename to mllib/data/lr-data/random.data diff --git a/ml/data/ridge-data/lpsa.data b/mllib/data/ridge-data/lpsa.data similarity index 100% rename from ml/data/ridge-data/lpsa.data rename to mllib/data/ridge-data/lpsa.data diff --git a/ml/src/main/scala/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/spark/ml/clustering/KMeans.scala similarity index 97% rename from ml/src/main/scala/spark/ml/clustering/KMeans.scala rename to mllib/src/main/scala/spark/ml/clustering/KMeans.scala index d35f942c01..6d78f926c2 100644 --- a/ml/src/main/scala/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/spark/ml/clustering/KMeans.scala @@ -1,4 +1,4 @@ -package spark.ml.clustering +package spark.mllib.clustering import scala.collection.mutable.ArrayBuffer import scala.util.Random @@ -6,7 +6,7 @@ import scala.util.Random import spark.{SparkContext, RDD} import spark.SparkContext._ import spark.Logging -import spark.ml.util.MLUtils +import spark.mllib.util.MLUtils import org.jblas.DoubleMatrix @@ -270,7 +270,8 @@ object KMeans { /** * Return the index of the closest point in `centers` to `point`, as well as its distance. */ - private[ml] def findClosest(centers: Array[Array[Double]], point: Array[Double]): (Int, Double) = + private[mllib] def findClosest(centers: Array[Array[Double]], point: Array[Double]) + : (Int, Double) = { var bestDistance = Double.PositiveInfinity var bestIndex = 0 @@ -287,7 +288,7 @@ object KMeans { /** * Return the K-means cost of a given point against the given cluster centers. */ - private[ml] def pointCost(centers: Array[Array[Double]], point: Array[Double]): Double = { + private[mllib] def pointCost(centers: Array[Array[Double]], point: Array[Double]): Double = { var bestDistance = Double.PositiveInfinity for (i <- 0 until centers.length) { val distance = MLUtils.squaredDistance(point, centers(i)) diff --git a/ml/src/main/scala/spark/ml/clustering/KMeansModel.scala b/mllib/src/main/scala/spark/ml/clustering/KMeansModel.scala similarity index 92% rename from ml/src/main/scala/spark/ml/clustering/KMeansModel.scala rename to mllib/src/main/scala/spark/ml/clustering/KMeansModel.scala index 8244ccc55b..4fd0646160 100644 --- a/ml/src/main/scala/spark/ml/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/spark/ml/clustering/KMeansModel.scala @@ -1,8 +1,8 @@ -package spark.ml.clustering +package spark.mllib.clustering import spark.RDD import spark.SparkContext._ -import spark.ml.util.MLUtils +import spark.mllib.util.MLUtils /** diff --git a/ml/src/main/scala/spark/ml/clustering/LocalKMeans.scala b/mllib/src/main/scala/spark/ml/clustering/LocalKMeans.scala similarity index 97% rename from ml/src/main/scala/spark/ml/clustering/LocalKMeans.scala rename to mllib/src/main/scala/spark/ml/clustering/LocalKMeans.scala index 03129ef552..e12b3be251 100644 --- a/ml/src/main/scala/spark/ml/clustering/LocalKMeans.scala +++ b/mllib/src/main/scala/spark/ml/clustering/LocalKMeans.scala @@ -1,4 +1,4 @@ -package spark.ml.clustering +package spark.mllib.clustering import scala.util.Random @@ -8,7 +8,7 @@ import org.jblas.{DoubleMatrix, SimpleBlas} * An utility object to run K-means locally. This is private to the ML package because it's used * in the initialization of KMeans but not meant to be publicly exposed. */ -private[ml] object LocalKMeans { +private[mllib] object LocalKMeans { /** * Run K-means++ on the weighted point set `points`. This first does the K-means++ * initialization procedure and then roudns of Lloyd's algorithm. diff --git a/ml/src/main/scala/spark/ml/optimization/Gradient.scala b/mllib/src/main/scala/spark/ml/optimization/Gradient.scala similarity index 96% rename from ml/src/main/scala/spark/ml/optimization/Gradient.scala rename to mllib/src/main/scala/spark/ml/optimization/Gradient.scala index 6d062ebddf..90b0999a5e 100644 --- a/ml/src/main/scala/spark/ml/optimization/Gradient.scala +++ b/mllib/src/main/scala/spark/ml/optimization/Gradient.scala @@ -1,4 +1,4 @@ -package spark.ml.optimization +package spark.mllib.optimization import org.jblas.DoubleMatrix diff --git a/ml/src/main/scala/spark/ml/optimization/GradientDescent.scala b/mllib/src/main/scala/spark/ml/optimization/GradientDescent.scala similarity index 98% rename from ml/src/main/scala/spark/ml/optimization/GradientDescent.scala rename to mllib/src/main/scala/spark/ml/optimization/GradientDescent.scala index d959ebf71c..eff853f379 100644 --- a/ml/src/main/scala/spark/ml/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/spark/ml/optimization/GradientDescent.scala @@ -1,4 +1,4 @@ -package spark.ml.optimization +package spark.mllib.optimization import spark.{Logging, RDD, SparkContext} import spark.SparkContext._ diff --git a/ml/src/main/scala/spark/ml/optimization/Updater.scala b/mllib/src/main/scala/spark/ml/optimization/Updater.scala similarity index 96% rename from ml/src/main/scala/spark/ml/optimization/Updater.scala rename to mllib/src/main/scala/spark/ml/optimization/Updater.scala index dfc7bf2025..ea80bfcbfd 100644 --- a/ml/src/main/scala/spark/ml/optimization/Updater.scala +++ b/mllib/src/main/scala/spark/ml/optimization/Updater.scala @@ -1,4 +1,4 @@ -package spark.ml.optimization +package spark.mllib.optimization import org.jblas.DoubleMatrix diff --git a/ml/src/main/scala/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/spark/ml/recommendation/ALS.scala similarity index 99% rename from ml/src/main/scala/spark/ml/recommendation/ALS.scala rename to mllib/src/main/scala/spark/ml/recommendation/ALS.scala index 8d5c16847a..0c6fa6f741 100644 --- a/ml/src/main/scala/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/spark/ml/recommendation/ALS.scala @@ -1,4 +1,4 @@ -package spark.ml.recommendation +package spark.mllib.recommendation import scala.collection.mutable.{ArrayBuffer, BitSet} import scala.util.Random diff --git a/ml/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala similarity index 95% rename from ml/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala rename to mllib/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala index cfdf2ba523..fb812a6dbe 100644 --- a/ml/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala @@ -1,4 +1,4 @@ -package spark.ml.recommendation +package spark.mllib.recommendation import spark.RDD import spark.SparkContext._ diff --git a/ml/src/main/scala/spark/ml/regression/LogisticRegression.scala b/mllib/src/main/scala/spark/ml/regression/LogisticRegression.scala similarity index 98% rename from ml/src/main/scala/spark/ml/regression/LogisticRegression.scala rename to mllib/src/main/scala/spark/ml/regression/LogisticRegression.scala index 3c471ab652..448ab9dce9 100644 --- a/ml/src/main/scala/spark/ml/regression/LogisticRegression.scala +++ b/mllib/src/main/scala/spark/ml/regression/LogisticRegression.scala @@ -1,8 +1,8 @@ -package spark.ml.regression +package spark.mllib.regression import spark.{Logging, RDD, SparkContext} -import spark.ml.optimization._ -import spark.ml.util.MLUtils +import spark.mllib.optimization._ +import spark.mllib.util.MLUtils import org.jblas.DoubleMatrix diff --git a/ml/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala b/mllib/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala similarity index 94% rename from ml/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala rename to mllib/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala index 6d37aad047..9f6abab70b 100644 --- a/ml/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala +++ b/mllib/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala @@ -1,11 +1,11 @@ -package spark.ml.regression +package spark.mllib.regression import scala.util.Random import org.jblas.DoubleMatrix import spark.{RDD, SparkContext} -import spark.ml.util.MLUtils +import spark.mllib.util.MLUtils object LogisticRegressionGenerator { diff --git a/ml/src/main/scala/spark/ml/regression/Regression.scala b/mllib/src/main/scala/spark/ml/regression/Regression.scala similarity index 94% rename from ml/src/main/scala/spark/ml/regression/Regression.scala rename to mllib/src/main/scala/spark/ml/regression/Regression.scala index 4a20f513b7..f79974c191 100644 --- a/ml/src/main/scala/spark/ml/regression/Regression.scala +++ b/mllib/src/main/scala/spark/ml/regression/Regression.scala @@ -1,4 +1,4 @@ -package spark.ml.regression +package spark.mllib.regression import spark.RDD diff --git a/ml/src/main/scala/spark/ml/regression/RidgeRegression.scala b/mllib/src/main/scala/spark/ml/regression/RidgeRegression.scala similarity index 98% rename from ml/src/main/scala/spark/ml/regression/RidgeRegression.scala rename to mllib/src/main/scala/spark/ml/regression/RidgeRegression.scala index dae224144e..2d07c77141 100644 --- a/ml/src/main/scala/spark/ml/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/spark/ml/regression/RidgeRegression.scala @@ -1,8 +1,8 @@ -package spark.ml.regression +package spark.mllib.regression import spark.{Logging, RDD, SparkContext} import spark.SparkContext._ -import spark.ml.util.MLUtils +import spark.mllib.util.MLUtils import org.jblas.DoubleMatrix import org.jblas.Solve diff --git a/ml/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala b/mllib/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala similarity index 96% rename from ml/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala rename to mllib/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala index 75854fe1de..c9ac4a8b07 100644 --- a/ml/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala +++ b/mllib/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala @@ -1,11 +1,11 @@ -package spark.ml.regression +package spark.mllib.regression import scala.util.Random import org.jblas.DoubleMatrix import spark.{RDD, SparkContext} -import spark.ml.util.MLUtils +import spark.mllib.util.MLUtils object RidgeRegressionGenerator { diff --git a/ml/src/main/scala/spark/ml/util/MLUtils.scala b/mllib/src/main/scala/spark/ml/util/MLUtils.scala similarity index 99% rename from ml/src/main/scala/spark/ml/util/MLUtils.scala rename to mllib/src/main/scala/spark/ml/util/MLUtils.scala index 6efa7c81ad..0a4a037c71 100644 --- a/ml/src/main/scala/spark/ml/util/MLUtils.scala +++ b/mllib/src/main/scala/spark/ml/util/MLUtils.scala @@ -1,4 +1,4 @@ -package spark.ml.util +package spark.mllib.util import spark.{RDD, SparkContext} import spark.SparkContext._ diff --git a/ml/src/test/resources/log4j.properties b/mllib/src/test/resources/log4j.properties similarity index 100% rename from ml/src/test/resources/log4j.properties rename to mllib/src/test/resources/log4j.properties diff --git a/ml/src/test/scala/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/spark/ml/clustering/KMeansSuite.scala similarity index 99% rename from ml/src/test/scala/spark/ml/clustering/KMeansSuite.scala rename to mllib/src/test/scala/spark/ml/clustering/KMeansSuite.scala index f3bd1d599f..ae7cf57c42 100644 --- a/ml/src/test/scala/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/spark/ml/clustering/KMeansSuite.scala @@ -1,4 +1,4 @@ -package spark.ml.clustering +package spark.mllib.clustering import scala.util.Random diff --git a/ml/src/test/scala/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/spark/ml/recommendation/ALSSuite.scala similarity index 98% rename from ml/src/test/scala/spark/ml/recommendation/ALSSuite.scala rename to mllib/src/test/scala/spark/ml/recommendation/ALSSuite.scala index f3f56c4357..2ada9ae76b 100644 --- a/ml/src/test/scala/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/spark/ml/recommendation/ALSSuite.scala @@ -1,4 +1,4 @@ -package spark.ml.recommendation +package spark.mllib.recommendation import scala.util.Random diff --git a/ml/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala b/mllib/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala similarity index 98% rename from ml/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala rename to mllib/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala index 55f2c5c18e..04d3400cb4 100644 --- a/ml/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala @@ -1,4 +1,4 @@ -package spark.ml.regression +package spark.mllib.regression import scala.util.Random diff --git a/ml/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala b/mllib/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala similarity index 97% rename from ml/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala rename to mllib/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala index aed5cbec24..df41dbbdff 100644 --- a/ml/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala +++ b/mllib/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala @@ -1,4 +1,4 @@ -package spark.ml.regression +package spark.mllib.regression import scala.util.Random diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 5dbb5d4a65..c487f34d4a 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -25,7 +25,7 @@ object SparkBuild extends Build { //val HADOOP_MAJOR_VERSION = "2" //val HADOOP_YARN = true - lazy val root = Project("root", file("."), settings = rootSettings) aggregate(core, repl, examples, bagel, streaming, ml) + lazy val root = Project("root", file("."), settings = rootSettings) aggregate(core, repl, examples, bagel, streaming, mllib) lazy val core = Project("core", file("core"), settings = coreSettings) @@ -37,7 +37,7 @@ object SparkBuild extends Build { lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn (core) - lazy val ml = Project("ml", file("ml"), settings = mlSettings) dependsOn (core) + lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn (core) // A configuration to set an alternative publishLocalConfiguration lazy val MavenCompile = config("m2r") extend(Compile) @@ -221,8 +221,8 @@ object SparkBuild extends Build { def bagelSettings = sharedSettings ++ Seq(name := "spark-bagel") - def mlSettings = sharedSettings ++ Seq( - name := "spark-ml", + def mllibSettings = sharedSettings ++ Seq( + name := "spark-mllib", libraryDependencies ++= Seq( "org.jblas" % "jblas" % "1.2.3" ) -- GitLab