Skip to content
Snippets Groups Projects
Commit 43adbd56 authored by Yuhao Yang's avatar Yuhao Yang Committed by Xiangrui Meng
Browse files

[SPARK-8043] [MLLIB] [DOC] update NaiveBayes and SVM examples in doc

jira: https://issues.apache.org/jira/browse/SPARK-8043

I found some issues while testing the save/load examples in the Markdown documentation, as part of the 1.4 QA plan.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #6584 from hhbyyh/naiveDocExample and squashes the following commits:

a01a206 [Yuhao Yang] fix for Gaussian mixture
2fb8b96 [Yuhao Yang] update NaiveBayes and SVM examples in doc
parent ccaa8232
No related branches found
No related tags found
No related merge requests found
...@@ -249,11 +249,11 @@ public class GaussianMixtureExample { ...@@ -249,11 +249,11 @@ public class GaussianMixtureExample {
GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd()); GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());
// Save and load GaussianMixtureModel // Save and load GaussianMixtureModel
gmm.save(sc, "myGMMModel") gmm.save(sc.sc(), "myGMMModel");
GaussianMixtureModel sameModel = GaussianMixtureModel.load(sc, "myGMMModel") GaussianMixtureModel sameModel = GaussianMixtureModel.load(sc.sc(), "myGMMModel");
// Output the parameters of the mixture model // Output the parameters of the mixture model
for(int j=0; j<gmm.k(); j++) { for(int j=0; j<gmm.k(); j++) {
System.out.println("weight=%f\nmu=%s\nsigma=\n%s\n", System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma()); gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
} }
} }
......
...@@ -163,11 +163,8 @@ object, and make predictions with the resulting model to compute the training ...@@ -163,11 +163,8 @@ object, and make predictions with the resulting model to compute the training
error. error.
{% highlight scala %} {% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.util.MLUtils
// Load training data in LIBSVM format. // Load training data in LIBSVM format.
...@@ -231,15 +228,13 @@ calling `.rdd()` on your `JavaRDD` object. A self-contained application example ...@@ -231,15 +228,13 @@ calling `.rdd()` on your `JavaRDD` object. A self-contained application example
that is equivalent to the provided example in Scala is given below: that is equivalent to the provided example in Scala is given below:
{% highlight java %} {% highlight java %}
import java.util.Random;
import scala.Tuple2; import scala.Tuple2;
import org.apache.spark.api.java.*; import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.*; import org.apache.spark.mllib.classification.*;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
...@@ -282,8 +277,8 @@ public class SVMClassifier { ...@@ -282,8 +277,8 @@ public class SVMClassifier {
System.out.println("Area under ROC = " + auROC); System.out.println("Area under ROC = " + auROC);
// Save and load model // Save and load model
model.save(sc.sc(), "myModelPath"); model.save(sc, "myModelPath");
SVMModel sameModel = SVMModel.load(sc.sc(), "myModelPath"); SVMModel sameModel = SVMModel.load(sc, "myModelPath");
} }
} }
{% endhighlight %} {% endhighlight %}
...@@ -315,15 +310,12 @@ a dependency. ...@@ -315,15 +310,12 @@ a dependency.
</div> </div>
<div data-lang="python" markdown="1"> <div data-lang="python" markdown="1">
The following example shows how to load a sample dataset, build Logistic Regression model, The following example shows how to load a sample dataset, build SVM model,
and make predictions with the resulting model to compute the training error. and make predictions with the resulting model to compute the training error.
Note that the Python API does not yet support model save/load but will in the future.
{% highlight python %} {% highlight python %}
from pyspark.mllib.classification import LogisticRegressionWithSGD from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.regression import LabeledPoint
from numpy import array
# Load and parse the data # Load and parse the data
def parsePoint(line): def parsePoint(line):
...@@ -334,12 +326,16 @@ data = sc.textFile("data/mllib/sample_svm_data.txt") ...@@ -334,12 +326,16 @@ data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint) parsedData = data.map(parsePoint)
# Build the model # Build the model
model = LogisticRegressionWithSGD.train(parsedData) model = SVMWithSGD.train(parsedData, iterations=100)
# Evaluating the model on training data # Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr)) print("Training Error = " + str(trainErr))
# Save and load model
model.save(sc, "myModelPath")
sameModel = SVMModel.load(sc, "myModelPath")
{% endhighlight %} {% endhighlight %}
</div> </div>
</div> </div>
......
...@@ -53,7 +53,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L) ...@@ -53,7 +53,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0) val training = splits(0)
val test = splits(1) val test = splits(1)
val model = NaiveBayes.train(training, lambda = 1.0, model = "multinomial") val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")
val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment