From e9b275b7697e7ad3b52b157d3274acc17ca8d828 Mon Sep 17 00:00:00 2001
From: Sean Owen <srowen@gmail.com>
Date: Wed, 30 Jul 2014 17:34:32 -0700
Subject: [PATCH] SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression
 datasets

Per discussion at https://issues.apache.org/jira/browse/SPARK-2341 , this is a look at deprecating the multiclass parameter. Thoughts welcome of course.

Author: Sean Owen <srowen@gmail.com>

Closes #1663 from srowen/SPARK-2341 and squashes the following commits:

8a3abd7 [Sean Owen] Suppress MIMA error for removed package private classes
18a8c8e [Sean Owen] Updates from review
83d0092 [Sean Owen] Deprecated methods with multiclass, and instead always parse target as a double (ie. multiclass = true)
---
 .../examples/mllib/LinearRegression.scala     |  2 +-
 .../examples/mllib/SparseNaiveBayes.scala     |  4 +-
 .../spark/mllib/util/LabelParsers.scala       | 56 -------------------
 .../org/apache/spark/mllib/util/MLUtils.scala | 52 ++++++-----------
 .../spark/mllib/util/LabelParsersSuite.scala  | 41 --------------
 .../spark/mllib/util/MLUtilsSuite.scala       | 14 ++---
 project/MimaExcludes.scala                    |  8 +++
 python/pyspark/mllib/util.py                  | 23 ++++----
 8 files changed, 46 insertions(+), 154 deletions(-)
 delete mode 100644 mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
 delete mode 100644 mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala

diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
index 4811bb70e4..05b7d66f8d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
@@ -91,7 +91,7 @@ object LinearRegression extends App {
 
     Logger.getRootLogger.setLevel(Level.WARN)
 
-    val examples = MLUtils.loadLibSVMFile(sc, params.input, multiclass = true).cache()
+    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()
 
     val splits = examples.randomSplit(Array(0.8, 0.2))
     val training = splits(0).cache()
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
index 537e68a099..88acd9dbb0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
@@ -22,7 +22,7 @@ import scopt.OptionParser
 
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.mllib.classification.NaiveBayes
-import org.apache.spark.mllib.util.{MLUtils, MulticlassLabelParser}
+import org.apache.spark.mllib.util.MLUtils
 
 /**
  * An example naive Bayes app. Run with
@@ -76,7 +76,7 @@ object SparseNaiveBayes {
       if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions
 
     val examples =
-      MLUtils.loadLibSVMFile(sc, params.input, multiclass = true, params.numFeatures, minPartitions)
+      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
     // Cache examples because it will be used in both training and evaluation.
     examples.cache()
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
deleted file mode 100644
index e25bf18b78..0000000000
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.util
-
-/** Trait for label parsers. */
-private trait LabelParser extends Serializable {
-  /** Parses a string label into a double label. */
-  def parse(labelString: String): Double
-}
-
-/** Factory methods for label parsers. */
-private object LabelParser {
-  def getInstance(multiclass: Boolean): LabelParser = {
-    if (multiclass) MulticlassLabelParser else BinaryLabelParser
-  }
-}
-
-/**
- * Label parser for binary labels, which outputs 1.0 (positive) if the value is greater than 0.5,
- * or 0.0 (negative) otherwise. So it works with +1/-1 labeling and +1/0 labeling.
- */
-private object BinaryLabelParser extends LabelParser {
-  /** Gets the default instance of BinaryLabelParser. */
-  def getInstance(): LabelParser = this
-
-  /**
-   * Parses the input label into positive (1.0) if the value is greater than 0.5,
-   * or negative (0.0) otherwise.
-   */
-  override def parse(labelString: String): Double = if (labelString.toDouble > 0.5) 1.0 else 0.0
-}
-
-/**
- * Label parser for multiclass labels, which converts the input label to double.
- */
-private object MulticlassLabelParser extends LabelParser {
-  /** Gets the default instance of MulticlassLabelParser. */
-  def getInstance(): LabelParser = this
-
-  override def parse(labelString: String): Double =  labelString.toDouble
-}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 30de24ad89..dc10a19478 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -55,7 +55,6 @@ object MLUtils {
    *
    * @param sc Spark context
    * @param path file or directory path in any Hadoop-supported file system URI
-   * @param labelParser parser for labels
    * @param numFeatures number of features, which will be determined from the input data if a
    *                    nonpositive value is given. This is useful when the dataset is already split
    *                    into multiple files and you want to load them separately, because some
@@ -64,10 +63,9 @@ object MLUtils {
    * @param minPartitions min number of partitions
    * @return labeled data stored as an RDD[LabeledPoint]
    */
-  private def loadLibSVMFile(
+  def loadLibSVMFile(
       sc: SparkContext,
       path: String,
-      labelParser: LabelParser,
       numFeatures: Int,
       minPartitions: Int): RDD[LabeledPoint] = {
     val parsed = sc.textFile(path, minPartitions)
@@ -75,7 +73,7 @@ object MLUtils {
       .filter(line => !(line.isEmpty || line.startsWith("#")))
       .map { line =>
         val items = line.split(' ')
-        val label = labelParser.parse(items.head)
+        val label = items.head.toDouble
         val (indices, values) = items.tail.map { item =>
           val indexAndValue = item.split(':')
           val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
@@ -102,64 +100,46 @@ object MLUtils {
 
   // Convenient methods for `loadLibSVMFile`.
 
-  /**
-   * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint].
-   * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
-   * Each line represents a labeled sparse feature vector using the following format:
-   * {{{label index1:value1 index2:value2 ...}}}
-   * where the indices are one-based and in ascending order.
-   * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]],
-   * where the feature indices are converted to zero-based.
-   *
-   * @param sc Spark context
-   * @param path file or directory path in any Hadoop-supported file system URI
-   * @param multiclass whether the input labels contain more than two classes. If false, any label
-   *                   with value greater than 0.5 will be mapped to 1.0, or 0.0 otherwise. So it
-   *                   works for both +1/-1 and 1/0 cases. If true, the double value parsed directly
-   *                   from the label string will be used as the label value.
-   * @param numFeatures number of features, which will be determined from the input data if a
-   *                    nonpositive value is given. This is useful when the dataset is already split
-   *                    into multiple files and you want to load them separately, because some
-   *                    features may not present in certain files, which leads to inconsistent
-   *                    feature dimensions.
-   * @param minPartitions min number of partitions
-   * @return labeled data stored as an RDD[LabeledPoint]
-   */
-   def loadLibSVMFile(
+  @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
+  def loadLibSVMFile(
       sc: SparkContext,
       path: String,
       multiclass: Boolean,
       numFeatures: Int,
       minPartitions: Int): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, LabelParser.getInstance(multiclass), numFeatures, minPartitions)
+    loadLibSVMFile(sc, path, numFeatures, minPartitions)
 
   /**
    * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of
    * partitions.
    */
+  def loadLibSVMFile(
+      sc: SparkContext,
+      path: String,
+      numFeatures: Int): RDD[LabeledPoint] =
+    loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions)
+
+  @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
   def loadLibSVMFile(
       sc: SparkContext,
       path: String,
       multiclass: Boolean,
       numFeatures: Int): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, multiclass, numFeatures, sc.defaultMinPartitions)
+    loadLibSVMFile(sc, path, numFeatures)
 
-  /**
-   * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the number of features
-   * determined automatically and the default number of partitions.
-   */
+  @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
   def loadLibSVMFile(
       sc: SparkContext,
       path: String,
       multiclass: Boolean): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, multiclass, -1, sc.defaultMinPartitions)
+    loadLibSVMFile(sc, path)
 
   /**
    * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of
    * features determined automatically and the default number of partitions.
    */
   def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, multiclass = false, -1, sc.defaultMinPartitions)
+    loadLibSVMFile(sc, path, -1)
 
   /**
    * Save labeled data in LIBSVM format.
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala
deleted file mode 100644
index ac85677f2f..0000000000
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.util
-
-import org.scalatest.FunSuite
-
-class LabelParsersSuite extends FunSuite {
-  test("binary label parser") {
-    for (parser <- Seq(BinaryLabelParser, BinaryLabelParser.getInstance())) {
-      assert(parser.parse("+1") === 1.0)
-      assert(parser.parse("1") === 1.0)
-      assert(parser.parse("0") === 0.0)
-      assert(parser.parse("-1") === 0.0)
-    }
-  }
-
-  test("multiclass label parser") {
-    for (parser <- Seq(MulticlassLabelParser, MulticlassLabelParser.getInstance())) {
-      assert(parser.parse("0") == 0.0)
-      assert(parser.parse("+1") === 1.0)
-      assert(parser.parse("1") === 1.0)
-      assert(parser.parse("2") === 2.0)
-      assert(parser.parse("3") === 3.0)
-    }
-  }
-}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index c14870fb96..8ef2bb1bf6 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -63,9 +63,9 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
   test("loadLibSVMFile") {
     val lines =
       """
-        |+1 1:1.0 3:2.0 5:3.0
-        |-1
-        |-1 2:4.0 4:5.0 6:6.0
+        |1 1:1.0 3:2.0 5:3.0
+        |0
+        |0 2:4.0 4:5.0 6:6.0
       """.stripMargin
     val tempDir = Files.createTempDir()
     tempDir.deleteOnExit()
@@ -73,7 +73,7 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
     Files.write(lines, file, Charsets.US_ASCII)
     val path = tempDir.toURI.toString
 
-    val pointsWithNumFeatures = loadLibSVMFile(sc, path, multiclass = false, 6).collect()
+    val pointsWithNumFeatures = loadLibSVMFile(sc, path, 6).collect()
     val pointsWithoutNumFeatures = loadLibSVMFile(sc, path).collect()
 
     for (points <- Seq(pointsWithNumFeatures, pointsWithoutNumFeatures)) {
@@ -86,11 +86,11 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
       assert(points(2).features === Vectors.sparse(6, Seq((1, 4.0), (3, 5.0), (5, 6.0))))
     }
 
-    val multiclassPoints = loadLibSVMFile(sc, path, multiclass = true).collect()
+    val multiclassPoints = loadLibSVMFile(sc, path).collect()
     assert(multiclassPoints.length === 3)
     assert(multiclassPoints(0).label === 1.0)
-    assert(multiclassPoints(1).label === -1.0)
-    assert(multiclassPoints(2).label === -1.0)
+    assert(multiclassPoints(1).label === 0.0)
+    assert(multiclassPoints(2).label === 0.0)
 
     Utils.deleteRecursively(tempDir)
   }
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 5ff88f0dd1..5a835f5820 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -97,6 +97,14 @@ object MimaExcludes {
               "org.apache.spark.mllib.tree.impurity.Entropy.calculate"),
             ProblemFilters.exclude[IncompatibleMethTypeProblem](
               "org.apache.spark.mllib.tree.impurity.Variance.calculate")
+          ) ++
+          Seq ( // Package-private classes removed in SPARK-2341
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser$")
           )
         case v if v.startsWith("1.0") =>
           Seq(
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index a707a9dcd5..d94900cefd 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -29,15 +29,18 @@ class MLUtils:
     Helper methods to load, save and pre-process data used in MLlib.
     """
 
+    @deprecated
     @staticmethod
     def _parse_libsvm_line(line, multiclass):
+        return _parse_libsvm_line(line)
+
+    @staticmethod
+    def _parse_libsvm_line(line):
         """
         Parses a line in LIBSVM format into (label, indices, values).
         """
         items = line.split(None)
         label = float(items[0])
-        if not multiclass:
-            label = 1.0 if label > 0.5 else 0.0
         nnz = len(items) - 1
         indices = np.zeros(nnz, dtype=np.int32)
         values = np.zeros(nnz)
@@ -64,8 +67,13 @@ class MLUtils:
                             " but got " % type(v))
         return " ".join(items)
 
+    @deprecated
     @staticmethod
     def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None):
+        return loadLibSVMFile(sc, path, numFeatures, minPartitions)
+
+    @staticmethod
+    def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
         """
         Loads labeled data in the LIBSVM format into an RDD of
         LabeledPoint. The LIBSVM format is a text-based format used by
@@ -81,13 +89,6 @@ class MLUtils:
         @param sc: Spark context
         @param path: file or directory path in any Hadoop-supported file
                      system URI
-        @param multiclass: whether the input labels contain more than
-                           two classes. If false, any label with value
-                           greater than 0.5 will be mapped to 1.0, or
-                           0.0 otherwise. So it works for both +1/-1 and
-                           1/0 cases. If true, the double value parsed
-                           directly from the label string will be used
-                           as the label value.
         @param numFeatures: number of features, which will be determined
                             from the input data if a nonpositive value
                             is given. This is useful when the dataset is
@@ -105,7 +106,7 @@ class MLUtils:
         >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
         >>> tempFile.flush()
         >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
-        >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect()
+        >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
         >>> tempFile.close()
         >>> type(examples[0]) == LabeledPoint
         True
@@ -124,7 +125,7 @@ class MLUtils:
         """
 
         lines = sc.textFile(path, minPartitions)
-        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l, multiclass))
+        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
         if numFeatures <= 0:
             parsed.cache()
             numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
-- 
GitLab