diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd
index 6e7efbd3349870bfc47daf99b578d2261b1f3a36..44826f339c1da69893ab8820a869c456ec973721 100644
--- a/bin/compute-classpath.cmd
+++ b/bin/compute-classpath.cmd
@@ -15,6 +15,7 @@ set CORE_DIR=%FWDIR%core
 set REPL_DIR=%FWDIR%repl
 set EXAMPLES_DIR=%FWDIR%examples
 set BAGEL_DIR=%FWDIR%bagel
+set MLLIB_DIR=%FWDIR%mllib
 set STREAMING_DIR=%FWDIR%streaming
 set PYSPARK_DIR=%FWDIR%python
 
@@ -29,6 +30,7 @@ set CLASSPATH=%CLASSPATH%;%FWDIR%lib_managed\bundles\*
 set CLASSPATH=%CLASSPATH%;%FWDIR%repl\lib\*
 set CLASSPATH=%CLASSPATH%;%FWDIR%python\lib\*
 set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes
+set CLASSPATH=%CLASSPATH%;%MLLIB_DIR%\target\scala-%SCALA_VERSION%\classes
 
 rem Add hadoop conf dir - else FileSystem.*, etc fail
 rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index 3a7888029045537761a8e27cbdbbe160eea5ba0f..75c58d11813f57a8a829dc3aadca517fd857c969 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -18,6 +18,7 @@ REPL_DIR="$FWDIR/repl"
 REPL_BIN_DIR="$FWDIR/repl-bin"
 EXAMPLES_DIR="$FWDIR/examples"
 BAGEL_DIR="$FWDIR/bagel"
+MLLIB_DIR="$FWDIR/mllib"
 STREAMING_DIR="$FWDIR/streaming"
 PYSPARK_DIR="$FWDIR/python"
 
@@ -49,6 +50,7 @@ if [ -e $REPL_BIN_DIR/target ]; then
   CLASSPATH+=":$EXAMPLES_JAR"
 fi
 CLASSPATH="$CLASSPATH:$BAGEL_DIR/target/scala-$SCALA_VERSION/classes"
+CLASSPATH="$CLASSPATH:$MLLIB_DIR/target/scala-$SCALA_VERSION/classes"
 for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do
   CLASSPATH="$CLASSPATH:$jar"
 done
diff --git a/ml/data/als/test.data b/mllib/data/als/test.data
similarity index 100%
rename from ml/data/als/test.data
rename to mllib/data/als/test.data
diff --git a/ml/data/lr-data/random.data b/mllib/data/lr-data/random.data
similarity index 100%
rename from ml/data/lr-data/random.data
rename to mllib/data/lr-data/random.data
diff --git a/ml/data/ridge-data/lpsa.data b/mllib/data/ridge-data/lpsa.data
similarity index 100%
rename from ml/data/ridge-data/lpsa.data
rename to mllib/data/ridge-data/lpsa.data
diff --git a/ml/src/main/scala/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/spark/ml/clustering/KMeans.scala
similarity index 97%
rename from ml/src/main/scala/spark/ml/clustering/KMeans.scala
rename to mllib/src/main/scala/spark/ml/clustering/KMeans.scala
index d35f942c01c6b8b5fb9b305f293fb660c108552a..6d78f926c2c5d894f13c250d6b1c001629995f88 100644
--- a/ml/src/main/scala/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/spark/ml/clustering/KMeans.scala
@@ -1,4 +1,4 @@
-package spark.ml.clustering
+package spark.mllib.clustering
 
 import scala.collection.mutable.ArrayBuffer
 import scala.util.Random
@@ -6,7 +6,7 @@ import scala.util.Random
 import spark.{SparkContext, RDD}
 import spark.SparkContext._
 import spark.Logging
-import spark.ml.util.MLUtils
+import spark.mllib.util.MLUtils
 
 import org.jblas.DoubleMatrix
 
@@ -270,7 +270,8 @@ object KMeans {
   /**
    * Return the index of the closest point in `centers` to `point`, as well as its distance.
    */
-  private[ml] def findClosest(centers: Array[Array[Double]], point: Array[Double]): (Int, Double) =
+  private[mllib] def findClosest(centers: Array[Array[Double]], point: Array[Double])
+    : (Int, Double) =
   {
     var bestDistance = Double.PositiveInfinity
     var bestIndex = 0
@@ -287,7 +288,7 @@ object KMeans {
   /**
    * Return the K-means cost of a given point against the given cluster centers.
    */
-  private[ml] def pointCost(centers: Array[Array[Double]], point: Array[Double]): Double = {
+  private[mllib] def pointCost(centers: Array[Array[Double]], point: Array[Double]): Double = {
     var bestDistance = Double.PositiveInfinity
     for (i <- 0 until centers.length) {
       val distance = MLUtils.squaredDistance(point, centers(i))
diff --git a/ml/src/main/scala/spark/ml/clustering/KMeansModel.scala b/mllib/src/main/scala/spark/ml/clustering/KMeansModel.scala
similarity index 92%
rename from ml/src/main/scala/spark/ml/clustering/KMeansModel.scala
rename to mllib/src/main/scala/spark/ml/clustering/KMeansModel.scala
index 8244ccc55be1e4adb95ff3666b1a426fdded7c3b..4fd0646160e08e05f7a0bdcaef31ebde1833c8ae 100644
--- a/ml/src/main/scala/spark/ml/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/spark/ml/clustering/KMeansModel.scala
@@ -1,8 +1,8 @@
-package spark.ml.clustering
+package spark.mllib.clustering
 
 import spark.RDD
 import spark.SparkContext._
-import spark.ml.util.MLUtils
+import spark.mllib.util.MLUtils
 
 
 /**
diff --git a/ml/src/main/scala/spark/ml/clustering/LocalKMeans.scala b/mllib/src/main/scala/spark/ml/clustering/LocalKMeans.scala
similarity index 97%
rename from ml/src/main/scala/spark/ml/clustering/LocalKMeans.scala
rename to mllib/src/main/scala/spark/ml/clustering/LocalKMeans.scala
index 03129ef552db7c6de0560434c44d50e0939e6bb0..e12b3be251246d643a9bbefa4c96e09a55ac4e0d 100644
--- a/ml/src/main/scala/spark/ml/clustering/LocalKMeans.scala
+++ b/mllib/src/main/scala/spark/ml/clustering/LocalKMeans.scala
@@ -1,4 +1,4 @@
-package spark.ml.clustering
+package spark.mllib.clustering
 
 import scala.util.Random
 
@@ -8,7 +8,7 @@ import org.jblas.{DoubleMatrix, SimpleBlas}
  * An utility object to run K-means locally. This is private to the ML package because it's used
  * in the initialization of KMeans but not meant to be publicly exposed.
  */
-private[ml] object LocalKMeans {
+private[mllib] object LocalKMeans {
   /**
    * Run K-means++ on the weighted point set `points`. This first does the K-means++
    * initialization procedure and then roudns of Lloyd's algorithm.
diff --git a/ml/src/main/scala/spark/ml/optimization/Gradient.scala b/mllib/src/main/scala/spark/ml/optimization/Gradient.scala
similarity index 96%
rename from ml/src/main/scala/spark/ml/optimization/Gradient.scala
rename to mllib/src/main/scala/spark/ml/optimization/Gradient.scala
index 6d062ebddfe90221271f9fa815df1af2e75166bc..90b0999a5ec40c7af2ddba908de89f1ec7ee4b89 100644
--- a/ml/src/main/scala/spark/ml/optimization/Gradient.scala
+++ b/mllib/src/main/scala/spark/ml/optimization/Gradient.scala
@@ -1,4 +1,4 @@
-package spark.ml.optimization
+package spark.mllib.optimization
 
 import org.jblas.DoubleMatrix
 
diff --git a/ml/src/main/scala/spark/ml/optimization/GradientDescent.scala b/mllib/src/main/scala/spark/ml/optimization/GradientDescent.scala
similarity index 98%
rename from ml/src/main/scala/spark/ml/optimization/GradientDescent.scala
rename to mllib/src/main/scala/spark/ml/optimization/GradientDescent.scala
index d959ebf71c4f721b6d7725d55ac1587b3c5557f5..eff853f379b8aac6f8dab410f5091123e5388ad4 100644
--- a/ml/src/main/scala/spark/ml/optimization/GradientDescent.scala
+++ b/mllib/src/main/scala/spark/ml/optimization/GradientDescent.scala
@@ -1,4 +1,4 @@
-package spark.ml.optimization
+package spark.mllib.optimization
 
 import spark.{Logging, RDD, SparkContext}
 import spark.SparkContext._
diff --git a/ml/src/main/scala/spark/ml/optimization/Updater.scala b/mllib/src/main/scala/spark/ml/optimization/Updater.scala
similarity index 96%
rename from ml/src/main/scala/spark/ml/optimization/Updater.scala
rename to mllib/src/main/scala/spark/ml/optimization/Updater.scala
index dfc7bf20259032dbd82aa9ec9d7b8cc4e3bae9a7..ea80bfcbfd40500b9e795db30cc351f21c533f8c 100644
--- a/ml/src/main/scala/spark/ml/optimization/Updater.scala
+++ b/mllib/src/main/scala/spark/ml/optimization/Updater.scala
@@ -1,4 +1,4 @@
-package spark.ml.optimization
+package spark.mllib.optimization
 
 import org.jblas.DoubleMatrix
 
diff --git a/ml/src/main/scala/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/spark/ml/recommendation/ALS.scala
similarity index 99%
rename from ml/src/main/scala/spark/ml/recommendation/ALS.scala
rename to mllib/src/main/scala/spark/ml/recommendation/ALS.scala
index 8d5c16847a66d6c3eadd4e808711fae4429d911a..0c6fa6f741dc0e21c5c4760f116f60795ed5b558 100644
--- a/ml/src/main/scala/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/spark/ml/recommendation/ALS.scala
@@ -1,4 +1,4 @@
-package spark.ml.recommendation
+package spark.mllib.recommendation
 
 import scala.collection.mutable.{ArrayBuffer, BitSet}
 import scala.util.Random
diff --git a/ml/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala
similarity index 95%
rename from ml/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala
rename to mllib/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala
index cfdf2ba5231082498873997e6ddc320bf0e10c77..fb812a6dbeb5322f8b8c31565c8205e0b3f10628 100644
--- a/ml/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/spark/ml/recommendation/MatrixFactorizationModel.scala
@@ -1,4 +1,4 @@
-package spark.ml.recommendation
+package spark.mllib.recommendation
 
 import spark.RDD
 import spark.SparkContext._
diff --git a/ml/src/main/scala/spark/ml/regression/LogisticRegression.scala b/mllib/src/main/scala/spark/ml/regression/LogisticRegression.scala
similarity index 98%
rename from ml/src/main/scala/spark/ml/regression/LogisticRegression.scala
rename to mllib/src/main/scala/spark/ml/regression/LogisticRegression.scala
index 3c471ab652b4432878be12acb1940dc05d99b96f..448ab9dce9017c298c78f014e2c3475563407e5d 100644
--- a/ml/src/main/scala/spark/ml/regression/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/ml/regression/LogisticRegression.scala
@@ -1,8 +1,8 @@
-package spark.ml.regression
+package spark.mllib.regression
 
 import spark.{Logging, RDD, SparkContext}
-import spark.ml.optimization._
-import spark.ml.util.MLUtils
+import spark.mllib.optimization._
+import spark.mllib.util.MLUtils
 
 import org.jblas.DoubleMatrix
 
diff --git a/ml/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala b/mllib/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala
similarity index 94%
rename from ml/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala
rename to mllib/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala
index 6d37aad047c1a82ad1c972f39f14d17b25c06a3f..9f6abab70b99e8cea51c865f282bed5adf48109c 100644
--- a/ml/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala
+++ b/mllib/src/main/scala/spark/ml/regression/LogisticRegressionGenerator.scala
@@ -1,11 +1,11 @@
-package spark.ml.regression
+package spark.mllib.regression
 
 import scala.util.Random
 
 import org.jblas.DoubleMatrix
 
 import spark.{RDD, SparkContext}
-import spark.ml.util.MLUtils
+import spark.mllib.util.MLUtils
 
 object LogisticRegressionGenerator {
 
diff --git a/ml/src/main/scala/spark/ml/regression/Regression.scala b/mllib/src/main/scala/spark/ml/regression/Regression.scala
similarity index 94%
rename from ml/src/main/scala/spark/ml/regression/Regression.scala
rename to mllib/src/main/scala/spark/ml/regression/Regression.scala
index 4a20f513b7bf803f40895830018b5aad5a32d092..f79974c1915f9d593df0f868becdc3b9c2a8dae8 100644
--- a/ml/src/main/scala/spark/ml/regression/Regression.scala
+++ b/mllib/src/main/scala/spark/ml/regression/Regression.scala
@@ -1,4 +1,4 @@
-package spark.ml.regression
+package spark.mllib.regression
 
 import spark.RDD
 
diff --git a/ml/src/main/scala/spark/ml/regression/RidgeRegression.scala b/mllib/src/main/scala/spark/ml/regression/RidgeRegression.scala
similarity index 98%
rename from ml/src/main/scala/spark/ml/regression/RidgeRegression.scala
rename to mllib/src/main/scala/spark/ml/regression/RidgeRegression.scala
index dae224144e7ea1489b7e4b2e444a151d324a32a0..2d07c771410985d39054f501835935bc354fb57a 100644
--- a/ml/src/main/scala/spark/ml/regression/RidgeRegression.scala
+++ b/mllib/src/main/scala/spark/ml/regression/RidgeRegression.scala
@@ -1,8 +1,8 @@
-package spark.ml.regression
+package spark.mllib.regression
 
 import spark.{Logging, RDD, SparkContext}
 import spark.SparkContext._
-import spark.ml.util.MLUtils
+import spark.mllib.util.MLUtils
 
 import org.jblas.DoubleMatrix
 import org.jblas.Solve
diff --git a/ml/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala b/mllib/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala
similarity index 96%
rename from ml/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala
rename to mllib/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala
index 75854fe1de8439d424fcbc7a0458fb66f2efcc8d..c9ac4a8b07c733881aefc1a2a0d2b023d05e0fcb 100644
--- a/ml/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala
+++ b/mllib/src/main/scala/spark/ml/regression/RidgeRegressionGenerator.scala
@@ -1,11 +1,11 @@
-package spark.ml.regression
+package spark.mllib.regression
 
 import scala.util.Random
 
 import org.jblas.DoubleMatrix
 
 import spark.{RDD, SparkContext}
-import spark.ml.util.MLUtils
+import spark.mllib.util.MLUtils
 
 object RidgeRegressionGenerator {
 
diff --git a/ml/src/main/scala/spark/ml/util/MLUtils.scala b/mllib/src/main/scala/spark/ml/util/MLUtils.scala
similarity index 99%
rename from ml/src/main/scala/spark/ml/util/MLUtils.scala
rename to mllib/src/main/scala/spark/ml/util/MLUtils.scala
index 6efa7c81ad2dc5043e58aec93df5ba141394c282..0a4a037c7139aa6bc03e398e4b2e562a687d5d42 100644
--- a/ml/src/main/scala/spark/ml/util/MLUtils.scala
+++ b/mllib/src/main/scala/spark/ml/util/MLUtils.scala
@@ -1,4 +1,4 @@
-package spark.ml.util
+package spark.mllib.util
 
 import spark.{RDD, SparkContext}
 import spark.SparkContext._
diff --git a/ml/src/test/resources/log4j.properties b/mllib/src/test/resources/log4j.properties
similarity index 100%
rename from ml/src/test/resources/log4j.properties
rename to mllib/src/test/resources/log4j.properties
diff --git a/ml/src/test/scala/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/spark/ml/clustering/KMeansSuite.scala
similarity index 99%
rename from ml/src/test/scala/spark/ml/clustering/KMeansSuite.scala
rename to mllib/src/test/scala/spark/ml/clustering/KMeansSuite.scala
index f3bd1d599f51a02e39682cd6d255f16a1da10b45..ae7cf57c42cea8631759d9001694512095ec15ea 100644
--- a/ml/src/test/scala/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/spark/ml/clustering/KMeansSuite.scala
@@ -1,4 +1,4 @@
-package spark.ml.clustering
+package spark.mllib.clustering
 
 import scala.util.Random
 
diff --git a/ml/src/test/scala/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/spark/ml/recommendation/ALSSuite.scala
similarity index 98%
rename from ml/src/test/scala/spark/ml/recommendation/ALSSuite.scala
rename to mllib/src/test/scala/spark/ml/recommendation/ALSSuite.scala
index f3f56c435743c6d9e83ca309c27997204897bf3f..2ada9ae76b52639fbd1cb62fd9bbab781a17dc64 100644
--- a/ml/src/test/scala/spark/ml/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/spark/ml/recommendation/ALSSuite.scala
@@ -1,4 +1,4 @@
-package spark.ml.recommendation
+package spark.mllib.recommendation
 
 import scala.util.Random
 
diff --git a/ml/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala b/mllib/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala
similarity index 98%
rename from ml/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala
rename to mllib/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala
index 55f2c5c18e7e40f5d11445a64eeb5499e7a5ba05..04d3400cb4260f90094c3de6fd20f19d8c8d3af2 100644
--- a/ml/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/spark/ml/regression/LogisticRegressionSuite.scala
@@ -1,4 +1,4 @@
-package spark.ml.regression
+package spark.mllib.regression
 
 import scala.util.Random
 
diff --git a/ml/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala b/mllib/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala
similarity index 97%
rename from ml/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala
rename to mllib/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala
index aed5cbec24639ff997acff7645922a9436db20b5..df41dbbdff88fc137d173207f64646677d447006 100644
--- a/ml/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala
+++ b/mllib/src/test/scala/spark/ml/regression/RidgeRegressionSuite.scala
@@ -1,4 +1,4 @@
-package spark.ml.regression
+package spark.mllib.regression
 
 import scala.util.Random
 
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 5dbb5d4a65ecfd7f3b0d55b4cb87ab5d29c97428..c487f34d4a08e41f55633fc1f4c54b9659460bde 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -25,7 +25,7 @@ object SparkBuild extends Build {
   //val HADOOP_MAJOR_VERSION = "2"
   //val HADOOP_YARN = true
 
-  lazy val root = Project("root", file("."), settings = rootSettings) aggregate(core, repl, examples, bagel, streaming, ml)
+  lazy val root = Project("root", file("."), settings = rootSettings) aggregate(core, repl, examples, bagel, streaming, mllib)
 
   lazy val core = Project("core", file("core"), settings = coreSettings)
 
@@ -37,7 +37,7 @@ object SparkBuild extends Build {
 
   lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn (core)
 
-  lazy val ml = Project("ml", file("ml"), settings = mlSettings) dependsOn (core)
+  lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn (core)
 
   // A configuration to set an alternative publishLocalConfiguration
   lazy val MavenCompile = config("m2r") extend(Compile)
@@ -221,8 +221,8 @@ object SparkBuild extends Build {
 
   def bagelSettings = sharedSettings ++ Seq(name := "spark-bagel")
 
-  def mlSettings = sharedSettings ++ Seq(
-    name := "spark-ml",
+  def mllibSettings = sharedSettings ++ Seq(
+    name := "spark-mllib",
     libraryDependencies ++= Seq(
      "org.jblas" % "jblas" % "1.2.3"
    )
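
For illustration only, not part of the patch: after this rename, downstream code imports the `spark.mllib` packages instead of `spark.ml`. A minimal sketch of calling the relocated KMeans; the input path, the `train(data, k, maxIterations)` overload, and the local-mode `SparkContext` constructor are assumptions based on the Spark APIs of this era rather than anything shown in the diff.

```scala
import spark.SparkContext
import spark.mllib.clustering.KMeans  // was spark.ml.clustering.KMeans before this patch

object KMeansRenameExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "KMeansRenameExample")

    // Hypothetical input file: one whitespace-separated feature vector per line.
    val points = sc.textFile("mllib/data/kmeans_data.txt")
      .map(line => line.split(' ').map(_.toDouble))

    // Assumed overload: train(data, k, maxIterations); returns a KMeansModel
    // whose clusterCenters field is an Array[Array[Double]].
    val model = KMeans.train(points, 2, 10)
    model.clusterCenters.foreach(center => println(center.mkString("[", ", ", "]")))

    sc.stop()
  }
}
```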