From dbe81633a766c4dc68a0a27063e5dfde0f5690af Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Mon, 15 May 2017 21:21:54 -0700
Subject: [PATCH] [SPARK-20501][ML] ML 2.2 QA: New Scala APIs, docs

## What changes were proposed in this pull request?
Review new Scala APIs introduced in 2.2: tighten member visibility, add missing `@Since` annotations, and clarify the Scala and Python API docs.

## How was this patch tested?
Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #17934 from yanboliang/spark-20501.
---
 .../apache/spark/ml/classification/LinearSVC.scala |  5 +++--
 .../ml/classification/LogisticRegression.scala     |  8 ++++++--
 .../org/apache/spark/ml/feature/Imputer.scala      | 14 ++++++++------
 .../scala/org/apache/spark/ml/fpm/FPGrowth.scala   |  2 +-
 .../org/apache/spark/ml/stat/Correlation.scala     |  2 +-
 python/pyspark/ml/classification.py                |  1 +
 6 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
index 7507c7539d..9900fbc9ed 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -51,6 +51,7 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR
  *   Linear SVM Classifier</a>
  *
  * This binary classifier optimizes the Hinge Loss using the OWLQN optimizer.
+ * Only supports L2 regularization currently.
  *
  */
 @Since("2.2.0")
@@ -148,7 +149,7 @@ class LinearSVC @Since("2.2.0") (
   @Since("2.2.0")
   override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
 
-  override protected[classification] def train(dataset: Dataset[_]): LinearSVCModel = {
+  override protected def train(dataset: Dataset[_]): LinearSVCModel = {
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
     val instances: RDD[Instance] =
       dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
@@ -264,7 +265,7 @@ object LinearSVC extends DefaultParamsReadable[LinearSVC] {
 
 /**
  * :: Experimental ::
- * SVM Model trained by [[LinearSVC]]
+ * Linear SVM Model trained by [[LinearSVC]]
  */
 @Since("2.2.0")
 @Experimental
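
A minimal usage sketch of the LinearSVC API documented above, assuming a `training` DataFrame with "label" and "features" columns; only `regParam` (L2) applies, since L1/elastic-net is not supported:

    import org.apache.spark.ml.classification.LinearSVC

    val lsvc = new LinearSVC()
      .setRegParam(0.1)   // L2 regularization strength; no L1 term is available
      .setMaxIter(100)
    val model = lsvc.fit(training)   // optimizes the hinge loss with OWLQN
    println(s"Coefficients: ${model.coefficients} Intercept: ${model.intercept}")
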
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 053487242e..567af0488e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -267,8 +267,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
 }
 
 /**
- * Logistic regression. Supports multinomial logistic (softmax) regression and binomial logistic
- * regression.
+ * Logistic regression. Supports:
+ *  - Multinomial logistic (softmax) regression.
+ *  - Binomial logistic regression.
+ *
+ * This class supports fitting traditional logistic regression models via LBFGS/OWLQN and
+ * bound (box) constrained logistic regression models via LBFGS-B.
  */
 @Since("1.2.0")
 class LogisticRegression @Since("1.2.0") (
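
To illustrate the bound (box) constrained mode mentioned in the new doc text, a sketch assuming a binomial problem with 3 features and a `training` DataFrame; the bound setters shown are the ones added to LogisticRegression in 2.2, and when any bound is set the model is solved with LBFGS-B instead of LBFGS/OWLQN:

    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.linalg.{Matrices, Vectors}

    // Coefficient bounds: one row per coefficient set (1 for binomial), one column per feature.
    val blr = new LogisticRegression()
      .setLowerBoundsOnCoefficients(Matrices.dense(1, 3, Array(0.0, 0.0, 0.0)))
      .setUpperBoundsOnCoefficients(Matrices.dense(1, 3, Array(1.0, 1.0, 1.0)))
      .setLowerBoundsOnIntercepts(Vectors.dense(0.0))
    val model = blr.fit(training)
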
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
index a41bd8e689..9e023b9dd4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
@@ -102,7 +102,8 @@ private[feature] trait ImputerParams extends Params with HasInputCols {
  * computing median, DataFrameStatFunctions.approxQuantile is used with a relative error of 0.001.
  */
 @Experimental
-class Imputer @Since("2.2.0")(override val uid: String)
+@Since("2.2.0")
+class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String)
   extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable {
 
   @Since("2.2.0")
@@ -165,8 +166,8 @@ class Imputer @Since("2.2.0")(override val uid: String)
 object Imputer extends DefaultParamsReadable[Imputer] {
 
   /** strategy names that Imputer currently supports. */
-  private[ml] val mean = "mean"
-  private[ml] val median = "median"
+  private[feature] val mean = "mean"
+  private[feature] val median = "median"
 
   @Since("2.2.0")
   override def load(path: String): Imputer = super.load(path)
@@ -180,9 +181,10 @@ object Imputer extends DefaultParamsReadable[Imputer] {
  *                    which are used to replace the missing values in the input DataFrame.
  */
 @Experimental
-class ImputerModel private[ml](
-    override val uid: String,
-    val surrogateDF: DataFrame)
+@Since("2.2.0")
+class ImputerModel private[ml] (
+    @Since("2.2.0") override val uid: String,
+    @Since("2.2.0") val surrogateDF: DataFrame)
   extends Model[ImputerModel] with ImputerParams with MLWritable {
 
   import ImputerModel._
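
A short usage sketch of the Imputer API whose annotations are adjusted above, assuming a DataFrame `df` with numeric columns "a" and "b" that contain missing values:

    import org.apache.spark.ml.feature.Imputer

    val imputer = new Imputer()
      .setInputCols(Array("a", "b"))
      .setOutputCols(Array("a_imputed", "b_imputed"))
      .setStrategy("median")        // "mean" (default) or "median"
    val model = imputer.fit(df)     // ImputerModel; medians come from approxQuantile
    model.surrogateDF.show()        // one surrogate value per input column
    model.transform(df).show()
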
diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
index 12804d08a4..aa7871d6ff 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
@@ -200,7 +200,7 @@ object FPGrowth extends DefaultParamsReadable[FPGrowth] {
 @Experimental
 class FPGrowthModel private[ml] (
     @Since("2.2.0") override val uid: String,
-    @transient val freqItemsets: DataFrame)
+    @Since("2.2.0") @transient val freqItemsets: DataFrame)
   extends Model[FPGrowthModel] with FPGrowthParams with MLWritable {
 
   /** @group setParam */
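
For the `freqItemsets` member annotated above, a brief sketch of how an FPGrowthModel is obtained and queried, assuming a DataFrame `transactions` with an array column "items":

    import org.apache.spark.ml.fpm.FPGrowth

    val fpGrowth = new FPGrowth()
      .setItemsCol("items")
      .setMinSupport(0.3)
      .setMinConfidence(0.6)
    val model = fpGrowth.fit(transactions)
    model.freqItemsets.show()        // frequent itemsets with their frequencies
    model.associationRules.show()    // rules derived from the frequent itemsets
    model.transform(transactions).show()
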
diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala
index e185bc8a6f..6e885d7c8a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.types.{StructField, StructType}
 
 /**
- * API for correlation functions in MLlib, compatible with Dataframes and Datasets.
+ * API for correlation functions in MLlib, compatible with DataFrames and Datasets.
  *
  * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]]
  * to spark.ml's Vector types.
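
A sketch of the Correlation API documented above, assuming a DataFrame `df` with a Vector column "features":

    import org.apache.spark.ml.linalg.Matrix
    import org.apache.spark.ml.stat.Correlation
    import org.apache.spark.sql.Row

    val Row(pearson: Matrix) = Correlation.corr(df, "features").head
    val Row(spearman: Matrix) = Correlation.corr(df, "features", "spearman").head
    println(s"Pearson correlation matrix:\n$pearson")
    println(s"Spearman correlation matrix:\n$spearman")
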
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index dcc12d93e9..60bdeedd6a 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -70,6 +70,7 @@ class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, Ha
     `Linear SVM Classifier <https://en.wikipedia.org/wiki/Support_vector_machine#Linear_SVM>`_
 
     This binary classifier optimizes the Hinge Loss using the OWLQN optimizer.
+    Only supports L2 regularization currently.
 
     >>> from pyspark.sql import Row
     >>> from pyspark.ml.linalg import Vectors
-- 
GitLab