From dbe81633a766c4dc68a0a27063e5dfde0f5690af Mon Sep 17 00:00:00 2001 From: Yanbo Liang <ybliang8@gmail.com> Date: Mon, 15 May 2017 21:21:54 -0700 Subject: [PATCH] [SPARK-20501][ML] ML 2.2 QA: New Scala APIs, docs ## What changes were proposed in this pull request? Review new Scala APIs introduced in 2.2. ## How was this patch tested? Existing tests. Author: Yanbo Liang <ybliang8@gmail.com> Closes #17934 from yanboliang/spark-20501. --- .../apache/spark/ml/classification/LinearSVC.scala | 5 +++-- .../ml/classification/LogisticRegression.scala | 8 ++++++-- .../org/apache/spark/ml/feature/Imputer.scala | 14 ++++++++------ .../scala/org/apache/spark/ml/fpm/FPGrowth.scala | 2 +- .../org/apache/spark/ml/stat/Correlation.scala | 2 +- python/pyspark/ml/classification.py | 1 + 6 files changed, 20 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index 7507c7539d..9900fbc9ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -51,6 +51,7 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR * Linear SVM Classifier</a> * * This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. + * Only supports L2 regularization currently. * */ @Since("2.2.0") @@ -148,7 +149,7 @@ class LinearSVC @Since("2.2.0") ( @Since("2.2.0") override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra) - override protected[classification] def train(dataset: Dataset[_]): LinearSVCModel = { + override protected def train(dataset: Dataset[_]): LinearSVCModel = { val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol)) val instances: RDD[Instance] = dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map { @@ -264,7 +265,7 @@ object LinearSVC extends DefaultParamsReadable[LinearSVC] { /** * :: Experimental :: - * SVM Model trained by [[LinearSVC]] + * Linear SVM Model trained by [[LinearSVC]] */ @Since("2.2.0") @Experimental diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 053487242e..567af0488e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -267,8 +267,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas } /** - * Logistic regression. Supports multinomial logistic (softmax) regression and binomial logistic - * regression. + * Logistic regression. Supports: + * - Multinomial logistic (softmax) regression. + * - Binomial logistic regression. + * + * This class supports fitting traditional logistic regression model by LBFGS/OWLQN and + * bound (box) constrained logistic regression model by LBFGSB. */ @Since("1.2.0") class LogisticRegression @Since("1.2.0") ( diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index a41bd8e689..9e023b9dd4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -102,7 +102,8 @@ private[feature] trait ImputerParams extends Params with HasInputCols { * computing median, DataFrameStatFunctions.approxQuantile is used with a relative error of 0.001. */ @Experimental -class Imputer @Since("2.2.0")(override val uid: String) +@Since("2.2.0") +class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String) extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { @Since("2.2.0") @@ -165,8 +166,8 @@ class Imputer @Since("2.2.0")(override val uid: String) object Imputer extends DefaultParamsReadable[Imputer] { /** strategy names that Imputer currently supports. */ - private[ml] val mean = "mean" - private[ml] val median = "median" + private[feature] val mean = "mean" + private[feature] val median = "median" @Since("2.2.0") override def load(path: String): Imputer = super.load(path) @@ -180,9 +181,10 @@ object Imputer extends DefaultParamsReadable[Imputer] { * which are used to replace the missing values in the input DataFrame. */ @Experimental -class ImputerModel private[ml]( - override val uid: String, - val surrogateDF: DataFrame) +@Since("2.2.0") +class ImputerModel private[ml] ( + @Since("2.2.0") override val uid: String, + @Since("2.2.0") val surrogateDF: DataFrame) extends Model[ImputerModel] with ImputerParams with MLWritable { import ImputerModel._ diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala index 12804d08a4..aa7871d6ff 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala @@ -200,7 +200,7 @@ object FPGrowth extends DefaultParamsReadable[FPGrowth] { @Experimental class FPGrowthModel private[ml] ( @Since("2.2.0") override val uid: String, - @transient val freqItemsets: DataFrame) + @Since("2.2.0") @transient val freqItemsets: DataFrame) extends Model[FPGrowthModel] with FPGrowthParams with MLWritable { /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala index e185bc8a6f..6e885d7c8a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.{StructField, StructType} /** - * API for correlation functions in MLlib, compatible with Dataframes and Datasets. + * API for correlation functions in MLlib, compatible with DataFrames and Datasets. * * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]] * to spark.ml's Vector types. diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index dcc12d93e9..60bdeedd6a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -70,6 +70,7 @@ class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, Ha `Linear SVM Classifier <https://en.wikipedia.org/wiki/Support_vector_machine#Linear_SVM>`_ This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. + Only supports L2 regularization currently. >>> from pyspark.sql import Row >>> from pyspark.ml.linalg import Vectors -- GitLab