From 07a98ca4ce4e715ce32b4be75010e28764da459b Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 1 Jun 2016 10:49:51 -0700
Subject: [PATCH] [SPARK-15587][ML] ML 2.0 QA: Scala APIs audit for ml.feature

## What changes were proposed in this pull request?
ML 2.0 QA: Scala APIs audit for ml.feature. The main changes include:
* Remove ```seed``` from ```QuantileDiscretizer```, since we use ```approxQuantile``` to produce the bins and ```seed``` has no effect (a usage sketch of the updated API follows the patch).
* Scala API docs update.
* Sync Scala and Python API docs for these changes.

## How was this patch tested?
Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13410 from yanboliang/spark-15587.
---
 .../apache/spark/ml/feature/Bucketizer.scala  |  6 ++--
 .../spark/ml/feature/CountVectorizer.scala    | 10 ++-----
 .../ml/feature/QuantileDiscretizer.scala      |  7 ++---
 .../apache/spark/ml/feature/Word2Vec.scala    |  3 +-
 python/pyspark/ml/feature.py                  | 29 +++++++++----------
 5 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 10e622ace6..ff988cc815 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -43,7 +43,7 @@ final class Bucketizer(override val uid: String)
   /**
    * Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
    * A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which
-   * also includes y. Splits should be strictly increasing.
+   * also includes y. Splits should be of length >= 3 and strictly increasing.
    * Values at -inf, inf must be explicitly provided to cover all Double values;
    * otherwise, values outside the splits specified will be treated as errors.
    * @group param
@@ -51,8 +51,8 @@ final class Bucketizer(override val uid: String)
   val splits: DoubleArrayParam = new DoubleArrayParam(this, "splits",
     "Split points for mapping continuous features into buckets. With n+1 splits, there are n " +
       "buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last " +
-      "bucket, which also includes y. The splits should be strictly increasing. " +
-      "Values at -inf, inf must be explicitly provided to cover all Double values; " +
+      "bucket, which also includes y. The splits should be of length >= 3 and strictly " +
+      "increasing. Values at -inf, inf must be explicitly provided to cover all Double values; " +
       "otherwise, values outside the splits specified will be treated as errors.",
     Bucketizer.checkSplits)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index fc4885bf4b..272567d09c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -56,7 +56,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * If this is an integer >= 1, this specifies the number of documents the term must appear in;
    * if this is a double in [0,1), then this specifies the fraction of documents.
    *
-   * Default: 1
+   * Default: 1.0
    * @group param
    */
   val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" +
@@ -86,7 +86,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * Note that the parameter is only used in transform of [[CountVectorizerModel]] and does not
    * affect fitting.
    *
-   * Default: 1
+   * Default: 1.0
    * @group param
    */
   val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" +
@@ -96,8 +96,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
     " of the document's token count). Note that the parameter is only used in transform of" +
     " CountVectorizerModel and does not affect fitting.", ParamValidators.gtEq(0.0))

-  setDefault(minTF -> 1)
-
   /** @group getParam */
   def getMinTF: Double = $(minTF)

@@ -114,7 +112,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
   /** @group getParam */
   def getBinary: Boolean = $(binary)

-  setDefault(binary -> false)
+  setDefault(vocabSize -> (1 << 18), minDF -> 1.0, minTF -> 1.0, binary -> false)
 }

 /**
@@ -145,8 +143,6 @@ class CountVectorizer(override val uid: String)
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)

-  setDefault(vocabSize -> (1 << 18), minDF -> 1)
-
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): CountVectorizerModel = {
     transformSchema(dataset.schema, logging = true)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index 61483590cd..1fefaa1fdd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.ml._
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.Dataset
 import org.apache.spark.sql.types.{DoubleType, StructType}
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.{DoubleType, StructType}
  * Params for [[QuantileDiscretizer]].
  */
 private[feature] trait QuantileDiscretizerBase extends Params
-  with HasInputCol with HasOutputCol with HasSeed {
+  with HasInputCol with HasOutputCol {

   /**
    * Number of buckets (quantiles, or categories) into which data points are grouped. Must
@@ -91,9 +91,6 @@ final class QuantileDiscretizer(override val uid: String)
   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)

-  /** @group setParam */
-  def setSeed(value: Long): this.type = set(seed, value)
-
   override def transformSchema(schema: StructType): StructType = {
     SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
     val inputFields = schema.fields

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 1b929cdfff..2d89eb05a5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -51,7 +51,8 @@ private[feature] trait Word2VecBase extends Params
   def getVectorSize: Int = $(vectorSize)

   /**
-   * The window size (context words from [-window, window]) default 5.
+   * The window size (context words from [-window, window]).
+   * Default: 5
    * @group expertParam
    */
   final val windowSize = new IntParam(

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index eb555cb940..1aff2e550f 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -19,8 +19,6 @@ import sys
 if sys.version > '3':
     basestring = str

-from py4j.java_collections import JavaArray
-
 from pyspark import since, keyword_only
 from pyspark.rdd import ignore_unicode_prefix
 from pyspark.ml.linalg import _convert_to_vector
@@ -159,9 +157,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
               "Split points for mapping continuous features into buckets. With n+1 splits, " +
               "there are n buckets. A bucket defined by splits x,y holds values in the " +
               "range [x,y) except the last bucket, which also includes y. The splits " +
-              "should be strictly increasing. Values at -inf, inf must be explicitly " +
-              "provided to cover all Double values; otherwise, values outside the splits " +
-              "specified will be treated as errors.",
+              "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " +
+              "explicitly provided to cover all Double values; otherwise, values outside the " +
+              "splits specified will be treated as errors.",
               typeConverter=TypeConverters.toListFloat)

     @keyword_only
@@ -1171,8 +1169,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead


 @inherit_doc
-class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable,
-                          JavaMLWritable):
+class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental

@@ -1186,9 +1183,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav

     >>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
     >>> qds = QuantileDiscretizer(numBuckets=2,
-    ...     inputCol="values", outputCol="buckets", seed=123, relativeError=0.01)
-    >>> qds.getSeed()
-    123
+    ...     inputCol="values", outputCol="buckets", relativeError=0.01)
     >>> qds.getRelativeError()
     0.01
     >>> bucketizer = qds.fit(df)
@@ -1220,9 +1215,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
                           typeConverter=TypeConverters.toFloat)

     @keyword_only
-    def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001):
+    def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
         """
-        __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001)
+        __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
         """
         super(QuantileDiscretizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
@@ -1233,11 +1228,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav

     @keyword_only
     @since("2.0.0")
-    def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None,
-                  relativeError=0.001):
+    def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
         """
-        setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \
-                  relativeError=0.001)
+        setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
         Set the params for the QuantileDiscretizer
         """
         kwargs = self.setParams._input_kwargs
@@ -1481,6 +1474,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
     Standardizes features by removing the mean and scaling to unit variance using column summary
     statistics on the samples in the training set.

+    The "unit std" is computed using the `corrected sample standard deviation \
+    <https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation>`_,
+    which is computed as the square root of the unbiased sample variance.
+
     >>> from pyspark.ml.linalg import Vectors
     >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
--
GitLab