diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java index 9a43189c91463c79ef0de8739bf8f3e529ca3fd1..4ccd8f6ce265047f9423305a97582e0c62e644c6 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java @@ -60,7 +60,7 @@ public class JavaPipelineExample { .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) - .setRegParam(0.01); + .setRegParam(0.001); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); @@ -71,7 +71,7 @@ public class JavaPipelineExample { Dataset<Row> test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), - new JavaDocument(6L, "mapreduce spark"), + new JavaDocument(6L, "spark hadoop spark"), new JavaDocument(7L, "apache hadoop") ), JavaDocument.class); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java deleted file mode 100644 index ca80d0d8bba5796d2e4b4bea96edb95d1da44c44..0000000000000000000000000000000000000000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.util.List; - -import com.google.common.collect.Lists; - -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; - -/** - * A simple example demonstrating ways to specify parameters for Estimators and Transformers. - * Run with - * {{{ - * bin/run-example ml.JavaSimpleParamsExample - * }}} - */ -public class JavaSimpleParamsExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaSimpleParamsExample") - .getOrCreate(); - - // Prepare training data. - // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans - // into DataFrames, where it uses the bean metadata to infer the schema. 
- List<LabeledPoint> localTraining = Lists.newArrayList( - new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), - new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), - new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - Dataset<Row> training = - spark.createDataFrame(localTraining, LabeledPoint.class); - - // Create a LogisticRegression instance. This instance is an Estimator. - LogisticRegression lr = new LogisticRegression(); - // Print out the parameters, documentation, and any default values. - System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n"); - - // We may set parameters using setter methods. - lr.setMaxIter(10) - .setRegParam(0.01); - - // Learn a LogisticRegression model. This uses the parameters stored in lr. - LogisticRegressionModel model1 = lr.fit(training); - // Since model1 is a Model (i.e., a Transformer produced by an Estimator), - // we can view the parameters it used during fit(). - // This prints the parameter (name: value) pairs, where names are unique IDs for this - // LogisticRegression instance. - System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap()); - - // We may alternatively specify parameters using a ParamMap. - ParamMap paramMap = new ParamMap(); - paramMap.put(lr.maxIter().w(20)); // Specify 1 Param. - paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter. - double[] thresholds = {0.5, 0.5}; - paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params. - - // One can also combine ParamMaps. - ParamMap paramMap2 = new ParamMap(); - paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name. - ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); - - // Now learn a new model using the paramMapCombined parameters. - // paramMapCombined overrides all parameters set earlier via lr.set* methods. - LogisticRegressionModel model2 = lr.fit(training, paramMapCombined); - System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); - - // Prepare test documents. - List<LabeledPoint> localTest = Lists.newArrayList( - new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class); - - // Make predictions on test documents using the Transformer.transform() method. - // LogisticRegressionModel.transform will only use the 'features' column. - // Note that model2.transform() outputs a 'myProbability' column instead of the usual - // 'probability' column since we renamed the lr.probabilityCol parameter previously. 
- Dataset<Row> results = model2.transform(test); - Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction"); - for (Row r: rows.collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - - spark.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java deleted file mode 100644 index 7c24c46d2e287773e54986d1075bcc03cfa97c6e..0000000000000000000000000000000000000000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.util.List; - -import com.google.common.collect.Lists; - -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.feature.HashingTF; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; - -/** - * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java - * bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of - * this example {@link SimpleTextClassificationPipeline}. Run with - * <pre> - * bin/run-example ml.JavaSimpleTextClassificationPipeline - * </pre> - */ -public class JavaSimpleTextClassificationPipeline { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaSimpleTextClassificationPipeline") - .getOrCreate(); - - // Prepare training documents, which are labeled. - List<LabeledDocument> localTraining = Lists.newArrayList( - new LabeledDocument(0L, "a b c d e spark", 1.0), - new LabeledDocument(1L, "b d", 0.0), - new LabeledDocument(2L, "spark f g h", 1.0), - new LabeledDocument(3L, "hadoop mapreduce", 0.0)); - Dataset<Row> training = - spark.createDataFrame(localTraining, LabeledDocument.class); - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. 
- Tokenizer tokenizer = new Tokenizer() - .setInputCol("text") - .setOutputCol("words"); - HashingTF hashingTF = new HashingTF() - .setNumFeatures(1000) - .setInputCol(tokenizer.getOutputCol()) - .setOutputCol("features"); - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.001); - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); - - // Fit the pipeline to training documents. - PipelineModel model = pipeline.fit(training); - - // Prepare test documents, which are unlabeled. - List<Document> localTest = Lists.newArrayList( - new Document(4L, "spark i j k"), - new Document(5L, "l m n"), - new Document(6L, "spark hadoop spark"), - new Document(7L, "apache hadoop")); - Dataset<Row> test = spark.createDataFrame(localTest, Document.class); - - // Make predictions on test documents. - Dataset<Row> predictions = model.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - - spark.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java index def59944291241731c0df9101ba85063f055c543..278cce084218a1b7677d6ad8c9e8cef9ec7db1af 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java @@ -47,7 +47,7 @@ public class JavaStopWordsRemoverExample { .setOutputCol("filtered"); List<Row> data = Arrays.asList( - RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) ); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java index 1cc16bb60d172ad304639e2b28eb66dc17afa440..a206cef4c23296819c55417e97fe1f5a4e8203c0 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java @@ -57,17 +57,24 @@ public class JavaTokenizerExample { Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); - Dataset<Row> wordsDataFrame = tokenizer.transform(sentenceDataFrame); - for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) { + RegexTokenizer regexTokenizer = new RegexTokenizer() + .setInputCol("sentence") + .setOutputCol("words") + .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); + + Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame); + for (Row r : tokenized.select("words", "label").takeAsList(3)) { java.util.List<String> words = r.getList(0); for (String word : words) System.out.print(word + " "); System.out.println(); } - RegexTokenizer regexTokenizer = new RegexTokenizer() - .setInputCol("sentence") - .setOutputCol("words") - .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); + Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame); + for (Row r : regexTokenized.select("words", "label").takeAsList(3)) { + java.util.List<String> words = r.getList(0); + for (String word : words) System.out.print(word + 
" "); + System.out.println(); + } // $example off$ spark.stop(); } diff --git a/examples/src/main/python/ml/aft_survival_regression.py b/examples/src/main/python/ml/aft_survival_regression.py index 060f0171ffdb5b158c0c4526939ceccf9b63fec7..2f0ca995e55c74c6d7ff2a64603bb2bf77f72623 100644 --- a/examples/src/main/python/ml/aft_survival_regression.py +++ b/examples/src/main/python/ml/aft_survival_regression.py @@ -32,7 +32,7 @@ Run with: if __name__ == "__main__": spark = SparkSession \ .builder \ - .appName("PythonAFTSurvivalRegressionExample") \ + .appName("AFTSurvivalRegressionExample") \ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index ee0399ac5eb20cf5f87a0a726e6690eb287570fd..1263cb5d177a81f7321ed502a462171a3a59db71 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -31,7 +31,7 @@ Run with: if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("PythonBisectingKMeansExample")\ + .appName("BisectingKMeansExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py index a41df6cf946fb47a44d5c04503dffced63b09f60..907eec67a0eb5843d7897d5314f9a918419a0f87 100644 --- a/examples/src/main/python/ml/cross_validator.py +++ b/examples/src/main/python/ml/cross_validator.py @@ -24,7 +24,7 @@ from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.tuning import CrossValidator, ParamGridBuilder # $example off$ -from pyspark.sql import Row, SparkSession +from pyspark.sql import SparkSession """ A simple example demonstrating model selection using CrossValidator. @@ -39,6 +39,7 @@ if __name__ == "__main__": .builder\ .appName("CrossValidatorExample")\ .getOrCreate() + # $example on$ # Prepare training documents, which are labeled. 
training = spark.createDataFrame([ diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index c1818d72fe467b941a32492e19cd3b773ac5ebbb..109f901012c9c2ad14365e35a4f92f57deeaf5bd 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -34,15 +34,16 @@ if __name__ == "__main__": if len(sys.argv) > 2: print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr) exit(-1) - spark = SparkSession\ - .builder\ - .appName("DataFrameExample")\ - .getOrCreate() - if len(sys.argv) == 2: + elif len(sys.argv) == 2: input = sys.argv[1] else: input = "data/mllib/sample_libsvm_data.txt" + spark = SparkSession \ + .builder \ + .appName("DataFrameExample") \ + .getOrCreate() + # Load input data print("Loading LIBSVM file with UDT from " + input + ".") df = spark.read.format("libsvm").load(input).cache() diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py index 708f1af6cc6eb2059b6e2bfb0d10e1e59152146b..d6e2977de00823710c8c38c5cdff577bea11af1c 100644 --- a/examples/src/main/python/ml/decision_tree_classification_example.py +++ b/examples/src/main/python/ml/decision_tree_classification_example.py @@ -31,7 +31,7 @@ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("decision_tree_classification_example")\ + .appName("DecisionTreeClassificationExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py index 3bd3fd30f8e5787f693025e0cc3c58dcf4a52212..eb210514353934e1004b2d3db4bcc48a8b2c18d1 100644 --- a/examples/src/main/python/ml/estimator_transformer_param_example.py +++ b/examples/src/main/python/ml/estimator_transformer_param_example.py @@ -18,6 +18,7 @@ """ Estimator Transformer Param Example. """ +from __future__ import print_function # $example on$ from pyspark.ml.linalg import Vectors @@ -42,7 +43,7 @@ if __name__ == "__main__": # Create a LogisticRegression instance. This instance is an Estimator. lr = LogisticRegression(maxIter=10, regParam=0.01) # Print out the parameters, documentation, and any default values. - print "LogisticRegression parameters:\n" + lr.explainParams() + "\n" + print("LogisticRegression parameters:\n" + lr.explainParams() + "\n") # Learn a LogisticRegression model. This uses the parameters stored in lr. model1 = lr.fit(training) @@ -51,8 +52,8 @@ if __name__ == "__main__": # we can view the parameters it used during fit(). # This prints the parameter (name: value) pairs, where names are unique IDs for this # LogisticRegression instance. - print "Model 1 was fit using parameters: " - print model1.extractParamMap() + print("Model 1 was fit using parameters: ") + print(model1.extractParamMap()) # We may alternatively specify parameters using a Python dictionary as a paramMap paramMap = {lr.maxIter: 20} @@ -67,8 +68,8 @@ if __name__ == "__main__": # Now learn a new model using the paramMapCombined parameters. # paramMapCombined overrides all parameters set earlier via lr.set* methods. 
model2 = lr.fit(training, paramMapCombined) - print "Model 2 was fit using parameters: " - print model2.extractParamMap() + print("Model 2 was fit using parameters: ") + print(model2.extractParamMap()) # Prepare test data test = spark.createDataFrame([ @@ -81,9 +82,12 @@ if __name__ == "__main__": # Note that model2.transform() outputs a "myProbability" column instead of the usual # 'probability' column since we renamed the lr.probabilityCol parameter previously. prediction = model2.transform(test) - selected = prediction.select("features", "label", "myProbability", "prediction") - for row in selected.collect(): - print row + result = prediction.select("features", "label", "myProbability", "prediction") \ + .collect() + + for row in result: + print("features=%s, label=%s -> prob=%s, prediction=%s" + % (row.features, row.label, row.myProbability, row.prediction)) # $example off$ spark.stop() diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py index 2ca13d68f689042e57dadc6d2ff58fa74aec4598..edc258de05687b26805d8bec6ad76b8a70d8fc06 100644 --- a/examples/src/main/python/ml/gaussian_mixture_example.py +++ b/examples/src/main/python/ml/gaussian_mixture_example.py @@ -31,7 +31,7 @@ Run with: if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("PythonGuassianMixtureExample")\ + .appName("GaussianMixtureExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py index 6c2d7e7b810df3b475e31b565a598627dd12c999..c2042fd7b7b0718decc2b014ad429090ba4f9b14 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py @@ -31,7 +31,7 @@ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("gradient_boosted_tree_classifier_example")\ + .appName("GradientBoostedTreeClassifierExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py index 5dd2272748d7093b9c0d6f06d0f620de1f636971..cc96c973e4b2376eef629658a889f1b09f551b53 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py @@ -31,7 +31,7 @@ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("gradient_boosted_tree_regressor_example")\ + .appName("GradientBoostedTreeRegressorExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py index 1e61bd8eff1433a714c93a487b37ad79bd04697d..a41b8ffacbde7140812f493e9e2238fbe1b91704 100644 --- a/examples/src/main/python/ml/isotonic_regression_example.py +++ b/examples/src/main/python/ml/isotonic_regression_example.py @@ -21,7 +21,7 @@ Isotonic Regression Example. from __future__ import print_function # $example on$ -from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel +from pyspark.ml.regression import IsotonicRegression # $example off$ from pyspark.sql import SparkSession @@ -30,11 +30,11 @@ An example demonstrating isotonic regression. 
Run with: bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py """ -if __name__ == "__main__": +if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("PythonIsotonicRegressionExample")\ + .appName("IsotonicRegressionExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 4b8b7291f9188aacab004d965c5ec2d1ea4772b6..6846ec45997140eb284847b32c53c70a513692b7 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -31,12 +31,10 @@ Run with: This example requires NumPy (http://www.numpy.org/). """ - if __name__ == "__main__": - spark = SparkSession\ .builder\ - .appName("PythonKMeansExample")\ + .appName("KMeansExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py index 5ce810fccc6fb8859d00e1ea69474f206f08e17d..2dc1742ff7a0b54bd03df9ea5cf3fddefaaef32a 100644 --- a/examples/src/main/python/ml/lda_example.py +++ b/examples/src/main/python/ml/lda_example.py @@ -23,16 +23,13 @@ from pyspark.ml.clustering import LDA # $example off$ from pyspark.sql import SparkSession - """ An example demonstrating LDA. Run with: bin/spark-submit examples/src/main/python/ml/lda_example.py """ - if __name__ == "__main__": - # Creates a SparkSession spark = SparkSession \ .builder \ .appName("LDAExample") \ diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py index aa33bef5a3dddf71df54d8a8a431ebee088308c2..2cc38c285579715eaf9f7da9c02dcc1fea48e470 100644 --- a/examples/src/main/python/ml/multilayer_perceptron_classification.py +++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py @@ -31,18 +31,23 @@ if __name__ == "__main__": # Load training data data = spark.read.format("libsvm")\ .load("data/mllib/sample_multiclass_classification_data.txt") + # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) train = splits[0] test = splits[1] + # specify layers for the neural network: # input layer of size 4 (features), two intermediate of size 5 and 4 # and output of size 3 (classes) layers = [4, 5, 4, 3] + # create the trainer and set its parameters trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234) + # train the model model = trainer.fit(train) + # compute accuracy on the test set result = model.transform(test) predictionAndLabels = result.select("prediction", "label") diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py index 9ac07f2c8ee2076c16e822b13ca3f89bb200453c..55263adb4666526cbf9086d08889e3530d2417e0 100644 --- a/examples/src/main/python/ml/n_gram_example.py +++ b/examples/src/main/python/ml/n_gram_example.py @@ -34,8 +34,10 @@ if __name__ == "__main__": (1, ["I", "wish", "Java", "could", "use", "case", "classes"]), (2, ["Logistic", "regression", "models", "are", "neat"]) ], ["label", "words"]) + ngram = NGram(inputCol="words", outputCol="ngrams") ngramDataFrame = ngram.transform(wordDataFrame) + for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3): print(ngrams_label) # $example off$ diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py index 8bc32222fe32bedb842da1388f1eadd785058dc5..aa23f298c8c58c8f6302425fcf942fbf27494231 
100644 --- a/examples/src/main/python/ml/naive_bayes_example.py +++ b/examples/src/main/python/ml/naive_bayes_example.py @@ -26,13 +26,14 @@ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("naive_bayes_example")\ + .appName("NaiveBayesExample")\ .getOrCreate() # $example on$ # Load training data data = spark.read.format("libsvm") \ .load("data/mllib/sample_libsvm_data.txt") + # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) train = splits[0] @@ -43,6 +44,7 @@ if __name__ == "__main__": # train the model model = nb.fit(train) + # compute accuracy on the test set result = model.transform(test) predictionAndLabels = result.select("prediction", "label") diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index b82087bebae8a4f0c087474a4871378b1d2907c7..8e00c25d9342e11f580ad07a99bc378a9c679512 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -30,11 +30,10 @@ Run with: bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ - if __name__ == "__main__": spark = SparkSession \ .builder \ - .appName("PythonOneVsRestExample") \ + .appName("OneVsRestExample") \ .getOrCreate() # $example on$ @@ -62,7 +61,7 @@ if __name__ == "__main__": # compute the classification error on test data. accuracy = evaluator.evaluate(predictions) - print("Test Error : " + str(1 - accuracy)) + print("Test Error = %g" % (1.0 - accuracy)) # $example off$ spark.stop() diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py index b9fceef68e7034adc4e6aaac461df018e9179bec..47faf8d2026f9a1d9e6ced6068e10b86f926f09e 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_example.py @@ -41,6 +41,7 @@ if __name__ == "__main__": stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex") model = stringIndexer.fit(df) indexed = model.transform(df) + encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec") encoded = encoder.transform(indexed) encoded.select("id", "categoryVec").show() diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py index 414629ff88bf9f2e3355894c6f5563b85cf2a323..38746aced096af18e7c814f1510225003425831d 100644 --- a/examples/src/main/python/ml/pca_example.py +++ b/examples/src/main/python/ml/pca_example.py @@ -34,8 +34,10 @@ if __name__ == "__main__": (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] df = spark.createDataFrame(data, ["features"]) + pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") model = pca.fit(df) + result = model.transform(df).select("pcaFeatures") result.show(truncate=False) # $example off$ diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py index bd10cfd7a252b75d93b27a382cd6ce91170fa733..2d0865578ac7572e1bf85c6f5a413a367cc27910 100644 --- a/examples/src/main/python/ml/pipeline_example.py +++ b/examples/src/main/python/ml/pipeline_example.py @@ -38,12 +38,13 @@ if __name__ == "__main__": (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), - (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"]) + (3L, "hadoop mapreduce", 0.0) + ], ["id", "text", "label"]) # Configure an ML pipeline, which consists 
of three stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") - lr = LogisticRegression(maxIter=10, regParam=0.01) + lr = LogisticRegression(maxIter=10, regParam=0.001) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. @@ -53,8 +54,9 @@ if __name__ == "__main__": test = spark.createDataFrame([ (4L, "spark i j k"), (5L, "l m n"), - (6L, "mapreduce spark"), - (7L, "apache hadoop")], ["id", "text"]) + (6L, "spark hadoop spark"), + (7L, "apache hadoop") + ], ["id", "text"]) # Make predictions on test documents and print columns of interest. prediction = model.transform(test) diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py index b46c1ba2f4391d19f7b908b56ac64f2f20ca5dc1..b464ee86b6e808fc416dc80c59c447516a04f72b 100644 --- a/examples/src/main/python/ml/polynomial_expansion_example.py +++ b/examples/src/main/python/ml/polynomial_expansion_example.py @@ -30,13 +30,15 @@ if __name__ == "__main__": .getOrCreate() # $example on$ - df = spark\ - .createDataFrame([(Vectors.dense([-2.0, 2.3]),), - (Vectors.dense([0.0, 0.0]),), - (Vectors.dense([0.6, -1.1]),)], - ["features"]) + df = spark.createDataFrame([ + (Vectors.dense([-2.0, 2.3]),), + (Vectors.dense([0.0, 0.0]),), + (Vectors.dense([0.6, -1.1]),) + ], ["features"]) + px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures") polyDF = px.transform(df) + for expanded in polyDF.select("polyFeatures").take(3): print(expanded) # $example off$ diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py index 6f422f840ad282849e2d7970bafc77bdd6f14bd8..788a0baffebb4c3d7118fc22a616c9465cdfc911 100644 --- a/examples/src/main/python/ml/quantile_discretizer_example.py +++ b/examples/src/main/python/ml/quantile_discretizer_example.py @@ -22,18 +22,22 @@ from pyspark.ml.feature import QuantileDiscretizer # $example off$ from pyspark.sql import SparkSession - if __name__ == "__main__": - spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate() + spark = SparkSession\ + .builder\ + .appName("QuantileDiscretizerExample")\ + .getOrCreate() # $example on$ data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)] df = spark.createDataFrame(data, ["id", "hour"]) # $example off$ + # Output of QuantileDiscretizer for such small datasets can depend on the number of # partitions. Here we force a single partition to ensure consistent results. 
# Note this is not necessary for normal use cases df = df.repartition(1) + # $example on$ discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result") diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py index eb9ded9af555ed82ca5b19269f8fa2c4db1f8a6b..4eaa94dd7f489ae721814805beab203ed048a34b 100644 --- a/examples/src/main/python/ml/random_forest_classifier_example.py +++ b/examples/src/main/python/ml/random_forest_classifier_example.py @@ -23,7 +23,7 @@ from __future__ import print_function # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier -from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ from pyspark.sql import SparkSession @@ -31,7 +31,7 @@ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("random_forest_classifier_example")\ + .appName("RandomForestClassifierExample")\ .getOrCreate() # $example on$ @@ -41,6 +41,7 @@ if __name__ == "__main__": # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) + # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer =\ @@ -52,8 +53,12 @@ if __name__ == "__main__": # Train a RandomForest model. rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10) + # Convert indexed labels back to original labels. + labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", + labels=labelIndexer.labels) + # Chain indexers and forest in a Pipeline - pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) + pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter]) # Train model. This also runs the indexers. model = pipeline.fit(trainingData) @@ -62,7 +67,7 @@ if __name__ == "__main__": predictions = model.transform(testData) # Select example rows to display. 
- predictions.select("prediction", "indexedLabel", "features").show(5) + predictions.select("predictedLabel", "label", "features").show(5) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator( diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py index 3a793737dba893eda0e8e9610fb2ca80b2125439..a34edff2ecaa223730e9c2ec864d385f38f55463 100644 --- a/examples/src/main/python/ml/random_forest_regressor_example.py +++ b/examples/src/main/python/ml/random_forest_regressor_example.py @@ -31,7 +31,7 @@ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("random_forest_regressor_example")\ + .appName("RandomForestRegressorExample")\ .getOrCreate() # $example on$ diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py index d5df3ce4f59159ac97855df17c2b1e20795c884c..6629239db29ece9a6d2ef175e9583fa3c3c586fe 100644 --- a/examples/src/main/python/ml/rformula_example.py +++ b/examples/src/main/python/ml/rformula_example.py @@ -34,10 +34,12 @@ if __name__ == "__main__": (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0)], ["id", "country", "hour", "clicked"]) + formula = RFormula( formula="clicked ~ country + hour", featuresCol="features", labelCol="label") + output = formula.fit(dataset).transform(dataset) output.select("features", "label").show() # $example off$ diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py deleted file mode 100644 index 2f1eaa6f947f071461e90013f49b7922552a1278..0000000000000000000000000000000000000000 --- a/examples/src/main/python/ml/simple_params_example.py +++ /dev/null @@ -1,95 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import pprint -import sys - -from pyspark.ml.classification import LogisticRegression -from pyspark.ml.linalg import DenseVector -from pyspark.sql import Row, SparkSession - -""" -A simple example demonstrating ways to specify parameters for Estimators and Transformers. -Run with: - bin/spark-submit examples/src/main/python/ml/simple_params_example.py -""" - -if __name__ == "__main__": - spark = SparkSession \ - .builder \ - .appName("SimpleParamsExample") \ - .getOrCreate() - - # prepare training data. - # We create an RDD of LabeledPoints and convert them into a DataFrame. - # A LabeledPoint is an Object with two fields named label and features - # and Spark SQL identifies these fields and creates the schema appropriately. 
- training = spark.createDataFrame([ - Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])), - Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])), - Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])), - Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))]) - - # Create a LogisticRegression instance with maxIter = 10. - # This instance is an Estimator. - lr = LogisticRegression(maxIter=10) - # Print out the parameters, documentation, and any default values. - print("LogisticRegression parameters:\n" + lr.explainParams() + "\n") - - # We may also set parameters using setter methods. - lr.setRegParam(0.01) - - # Learn a LogisticRegression model. This uses the parameters stored in lr. - model1 = lr.fit(training) - - # Since model1 is a Model (i.e., a Transformer produced by an Estimator), - # we can view the parameters it used during fit(). - # This prints the parameter (name: value) pairs, where names are unique IDs for this - # LogisticRegression instance. - print("Model 1 was fit using parameters:\n") - pprint.pprint(model1.extractParamMap()) - - # We may alternatively specify parameters using a parameter map. - # paramMap overrides all lr parameters set earlier. - paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"} - - # Now learn a new model using the new parameters. - model2 = lr.fit(training, paramMap) - print("Model 2 was fit using parameters:\n") - pprint.pprint(model2.extractParamMap()) - - # prepare test data. - test = spark.createDataFrame([ - Row(label=1.0, features=DenseVector([-1.0, 1.5, 1.3])), - Row(label=0.0, features=DenseVector([3.0, 2.0, -0.1])), - Row(label=0.0, features=DenseVector([0.0, 2.2, -1.5]))]) - - # Make predictions on test data using the Transformer.transform() method. - # LogisticRegressionModel.transform will only use the 'features' column. - # Note that model2.transform() outputs a 'myProbability' column instead of the usual - # 'probability' column since we renamed the lr.probabilityCol parameter previously. - result = model2.transform(test) \ - .select("features", "label", "myProbability", "prediction") \ - .collect() - - for row in result: - print("features=%s,label=%s -> prob=%s, prediction=%s" - % (row.features, row.label, row.myProbability, row.prediction)) - - spark.stop() diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py deleted file mode 100644 index b528b59be9621238b5fa57a50700222f95dd6c51..0000000000000000000000000000000000000000 --- a/examples/src/main/python/ml/simple_text_classification_pipeline.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function - -from pyspark.ml import Pipeline -from pyspark.ml.classification import LogisticRegression -from pyspark.ml.feature import HashingTF, Tokenizer -from pyspark.sql import Row, SparkSession - - -""" -A simple text classification pipeline that recognizes "spark" from -input text. This is to show how to create and configure a Spark ML -pipeline in Python. Run with: - - bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py -""" - - -if __name__ == "__main__": - spark = SparkSession\ - .builder\ - .appName("SimpleTextClassificationPipeline")\ - .getOrCreate() - - # Prepare training documents, which are labeled. - training = spark.createDataFrame([ - (0, "a b c d e spark", 1.0), - (1, "b d", 0.0), - (2, "spark f g h", 1.0), - (3, "hadoop mapreduce", 0.0) - ], ["id", "text", "label"]) - - # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. - tokenizer = Tokenizer(inputCol="text", outputCol="words") - hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features") - lr = LogisticRegression(maxIter=10, regParam=0.001) - pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) - - # Fit the pipeline to training documents. - model = pipeline.fit(training) - - # Prepare test documents, which are unlabeled. - test = spark.createDataFrame([ - (4, "spark i j k"), - (5, "l m n"), - (6, "spark hadoop spark"), - (7, "apache hadoop") - ], ["id", "text"]) - - # Make predictions on test documents and print columns of interest. - prediction = model.transform(test) - selected = prediction.select("id", "text", "prediction") - for row in selected.collect(): - print(row) - - spark.stop() diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py index 395fdeffc537987be93d247da42a62ec85755b64..8a8392cc1fca6130130722bbff805b33e0172c08 100644 --- a/examples/src/main/python/ml/stopwords_remover_example.py +++ b/examples/src/main/python/ml/stopwords_remover_example.py @@ -30,7 +30,7 @@ if __name__ == "__main__": # $example on$ sentenceData = spark.createDataFrame([ - (0, ["I", "saw", "the", "red", "baloon"]), + (0, ["I", "saw", "the", "red", "balloon"]), (1, ["Mary", "had", "a", "little", "lamb"]) ], ["label", "raw"]) diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py index a328e040f56365e7d923155a3de2753833051598..2255bfb9c1a60866bee494106d0b77ee7907dffb 100644 --- a/examples/src/main/python/ml/string_indexer_example.py +++ b/examples/src/main/python/ml/string_indexer_example.py @@ -32,6 +32,7 @@ if __name__ == "__main__": df = spark.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) + indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") indexed = indexer.fit(df).transform(df) indexed.show() diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py index fb4ad992fb809cfa1cc936a7418227d3c5ab365d..4ab7eb6964997302736ffe4328ac5527b68c680c 100644 --- a/examples/src/main/python/ml/tf_idf_example.py +++ b/examples/src/main/python/ml/tf_idf_example.py @@ -34,8 +34,10 @@ if __name__ == "__main__": (0, "I wish Java could use case classes"), (1, "Logistic regression models are neat") ], ["label", "sentence"]) + tokenizer = Tokenizer(inputCol="sentence", outputCol="words") wordsData = tokenizer.transform(sentenceData) + hashingTF = 
HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20) featurizedData = hashingTF.transform(wordsData) # alternatively, CountVectorizer can also be used to get term frequency vectors @@ -43,6 +45,7 @@ if __name__ == "__main__": idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) + for features_label in rescaledData.select("features", "label").take(3): print(features_label) # $example off$ diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py index e61ec920d22810302daa0ca2fd2e4152aae9c7d4..89f506070533dbd8d4da8ca88ec0c9ab4e244bef 100644 --- a/examples/src/main/python/ml/tokenizer_example.py +++ b/examples/src/main/python/ml/tokenizer_example.py @@ -34,12 +34,19 @@ if __name__ == "__main__": (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") ], ["label", "sentence"]) + tokenizer = Tokenizer(inputCol="sentence", outputCol="words") - wordsDataFrame = tokenizer.transform(sentenceDataFrame) - for words_label in wordsDataFrame.select("words", "label").take(3): - print(words_label) + regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W") # alternatively, pattern="\\w+", gaps(False) + + tokenized = tokenizer.transform(sentenceDataFrame) + for words_label in tokenized.select("words", "label").take(3): + print(words_label) + + regexTokenized = regexTokenizer.transform(sentenceDataFrame) + for words_label in regexTokenized.select("words", "label").take(3): + print(words_label) # $example off$ spark.stop() diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py index 5f5c52aca8c42f64b61d82e04313ea14582491c3..a92b861f8317bf8e27a9c9d57f7994e7c207c1db 100644 --- a/examples/src/main/python/ml/train_validation_split.py +++ b/examples/src/main/python/ml/train_validation_split.py @@ -35,18 +35,21 @@ if __name__ == "__main__": .builder\ .appName("TrainValidationSplit")\ .getOrCreate() + # $example on$ # Prepare training and test data. data = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") - train, test = data.randomSplit([0.7, 0.3]) - lr = LinearRegression(maxIter=10, regParam=0.1) + train, test = data.randomSplit([0.9, 0.1], seed=12345) + + lr = LinearRegression(maxIter=10) # We use a ParamGridBuilder to construct a grid of parameters to search over. # TrainValidationSplit will try all combinations of values and determine best model using # the evaluator. paramGrid = ParamGridBuilder()\ .addGrid(lr.regParam, [0.1, 0.01]) \ + .addGrid(lr.fitIntercept, [False, True])\ .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\ .build() @@ -60,6 +63,7 @@ if __name__ == "__main__": # Run TrainValidationSplit, and choose the best set of parameters. model = tvs.fit(train) + # Make predictions on test data. model is the model with combination of parameters # that performed best. 
prediction = model.transform(test) diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py index bbfc316ff2d3334f8f53afab4bd9261d9576d854..eac33711adaa3e6b0393e670de97cede0e9c0b9f 100644 --- a/examples/src/main/python/ml/vector_assembler_example.py +++ b/examples/src/main/python/ml/vector_assembler_example.py @@ -33,9 +33,11 @@ if __name__ == "__main__": dataset = spark.createDataFrame( [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)], ["id", "hour", "mobile", "userFeatures", "clicked"]) + assembler = VectorAssembler( inputCols=["hour", "mobile", "userFeatures"], outputCol="features") + output = assembler.transform(dataset) print(output.select("features", "clicked").first()) # $example off$ diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py index 9b00e0f84136c2e771d92c975cf54653e1cda3db..3912c135be2e218383fb0dc3d0118613b2f646e1 100644 --- a/examples/src/main/python/ml/vector_indexer_example.py +++ b/examples/src/main/python/ml/vector_indexer_example.py @@ -30,6 +30,7 @@ if __name__ == "__main__": # $example on$ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10) indexerModel = indexer.fit(data) diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py index 66500bee152f7adaca8e6c0c32fddd044db2d89a..78a91c92fc5306f85b36226d820f2bf035fc701d 100644 --- a/examples/src/main/python/ml/word2vec_example.py +++ b/examples/src/main/python/ml/word2vec_example.py @@ -35,9 +35,11 @@ if __name__ == "__main__": ("I wish Java could use case classes".split(" "), ), ("Logistic regression models are neat".split(" "), ) ], ["text"]) + # Learn a mapping from words to Vectors. 
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result") model = word2Vec.fit(documentDF) + result = model.transform(documentDF) for feature in result.select("result").take(3): print(feature) diff --git a/examples/src/main/python/streaming/queue_stream.py b/examples/src/main/python/streaming/queue_stream.py index b3808907f74a675a40817180784a15260c4a16b7..bdd2d48519494dc65c90388f3eae2819e2dd5e90 100644 --- a/examples/src/main/python/streaming/queue_stream.py +++ b/examples/src/main/python/streaming/queue_stream.py @@ -22,7 +22,6 @@ To run this example use `$ bin/spark-submit examples/src/main/python/streaming/queue_stream.py """ -import sys import time from pyspark import SparkContext diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index a68fd0285f567a69c92f840a7702b5d54528e39c..86eed3867c5398d61e5f00516f1ad1764422f5b1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -18,7 +18,6 @@ // scalastyle:off println package org.apache.spark.examples -import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession /** diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala index 2c2bf421bc5d3d492c86934aa52686ba474c1310..26095b46f58e1a0b599f5b6242d9210d70df4acc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala @@ -33,8 +33,10 @@ import org.apache.spark.sql.SparkSession */ object GaussianMixtureExample { def main(args: Array[String]): Unit = { - // Creates a SparkSession - val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate() + val spark = SparkSession + .builder + .appName(s"${this.getClass.getSimpleName}") + .getOrCreate() // $example on$ // Loads data diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala index 7c5d3f23411f01916a150378b97901ad417a7ce5..a840559d24456f8846486334d2df2ef4c340efeb 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala @@ -33,8 +33,6 @@ import org.apache.spark.sql.SparkSession object IsotonicRegressionExample { def main(args: Array[String]): Unit = { - - // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index 2341b36db2400c86e0ba1cb2bb7ad35f3ea16cdb..a1d19e138dedbe4090b9b59cb50df20627bd5975 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.SparkSession object KMeansExample { def main(args: Array[String]): Unit = { - // Creates a SparkSession. 
val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala index 75fef2922adbdf89519fee48e10e8cce611565bd..1cd2641f9a8d0a382fbad29143eb8799ccdde1e1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala @@ -46,6 +46,7 @@ object ModelSelectionViaTrainValidationSplitExample { val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) val lr = new LinearRegression() + .setMaxIter(10) // We use a ParamGridBuilder to construct a grid of parameters to search over. // TrainValidationSplit will try all combinations of values and determine best model using diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index e8a9b32da966448eec89ffce577bc64a0c2dd1aa..a39e3202ba190ab32614a5675665c86e4d954dab 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -39,27 +39,33 @@ object MultilayerPerceptronClassifierExample { // Load the data stored in LIBSVM format as a DataFrame. val data = spark.read.format("libsvm") .load("data/mllib/sample_multiclass_classification_data.txt") + // Split the data into train and test val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L) val train = splits(0) val test = splits(1) + // specify layers for the neural network: // input layer of size 4 (features), two intermediate of size 5 and 4 // and output of size 3 (classes) val layers = Array[Int](4, 5, 4, 3) + // create the trainer and set its parameters val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setMaxIter(100) + // train the model val model = trainer.fit(train) + // compute accuracy on the test set val result = model.transform(test) val predictionAndLabels = result.select("prediction", "label") val evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy") + println("Accuracy: " + evaluator.evaluate(predictionAndLabels)) // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index 7089a4bc87aaa0788da33533ad2549da40193ad7..3ae0623c4c0fd8efef1354b52cda2a3bb49120de 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -30,6 +30,7 @@ object NaiveBayesExample { .builder .appName("NaiveBayesExample") .getOrCreate() + // $example on$ // Load the data stored in LIBSVM format as a DataFrame. 
val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala index b16692b1fa36fe912d6eeca52aafd9cfdfc0aca2..12f8663b9ce55a0a25f409957d3790981d22a2ad 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala @@ -54,7 +54,7 @@ object PipelineExample { .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) - .setRegParam(0.01) + .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) @@ -74,7 +74,7 @@ object PipelineExample { val test = spark.createDataFrame(Seq( (4L, "spark i j k"), (5L, "l m n"), - (6L, "mapreduce spark"), + (6L, "spark hadoop spark"), (7L, "apache hadoop") )).toDF("id", "text") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala index 2f7e217b8fe2d7711441e18a88de2e4e9684c33f..aedb9e7d3bb70b2c1f374e4022fdb703d1fca833 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala @@ -28,16 +28,16 @@ object QuantileDiscretizerExample { .builder .appName("QuantileDiscretizerExample") .getOrCreate() - import spark.implicits._ // $example on$ val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)) - var df = spark.createDataFrame(data).toDF("id", "hour") + val df = spark.createDataFrame(data).toDF("id", "hour") // $example off$ // Output of QuantileDiscretizer for such small datasets can depend on the number of // partitions. Here we force a single partition to ensure consistent results. // Note this is not necessary for normal use cases .repartition(1) + // $example on$ val discretizer = new QuantileDiscretizer() .setInputCol("hour") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala index 9ea4920146448140945f2c0bdee622839fd135a9..3498fa8a50c69b4b926d1329000ec403e2e5367d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala @@ -36,10 +36,12 @@ object RFormulaExample { (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0) )).toDF("id", "country", "hour", "clicked") + val formula = new RFormula() .setFormula("clicked ~ country + hour") .setFeaturesCol("features") .setLabelCol("label") + val output = formula.fit(dataset).transform(dataset) output.select("features", "label").show() // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala deleted file mode 100644 index 29f1f509608a726cbe58a92bab1d42b3a8b0a042..0000000000000000000000000000000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.ml
-
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.feature.LabeledPoint
-import org.apache.spark.ml.linalg.{Vector, Vectors}
-import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.sql.{Row, SparkSession}
-
-/**
- * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
- * Run with
- * {{{
- * bin/run-example ml.SimpleParamsExample
- * }}}
- */
-object SimpleParamsExample {
-
-  def main(args: Array[String]) {
-    val spark = SparkSession
-      .builder
-      .appName("SimpleParamsExample")
-      .getOrCreate()
-    import spark.implicits._
-
-    // Prepare training data.
-    // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
-    // into DataFrames, where it uses the case class metadata to infer the schema.
-    val training = spark.createDataFrame(Seq(
-      LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
-      LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
-      LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
-      LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
-
-    // Create a LogisticRegression instance. This instance is an Estimator.
-    val lr = new LogisticRegression()
-    // Print out the parameters, documentation, and any default values.
-    println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
-
-    // We may set parameters using setter methods.
-    lr.setMaxIter(10)
-      .setRegParam(0.01)
-
-    // Learn a LogisticRegression model. This uses the parameters stored in lr.
-    val model1 = lr.fit(training)
-    // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
-    // we can view the parameters it used during fit().
-    // This prints the parameter (name: value) pairs, where names are unique IDs for this
-    // LogisticRegression instance.
-    println("Model 1 was fit using parameters: " + model1.parent.extractParamMap())
-
-    // We may alternatively specify parameters using a ParamMap,
-    // which supports several methods for specifying parameters.
-    val paramMap = ParamMap(lr.maxIter -> 20)
-    paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
-    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.5, 0.5)) // Specify multiple Params.
-
-    // One can also combine ParamMaps.
-    val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
-    val paramMapCombined = paramMap ++ paramMap2
-
-    // Now learn a new model using the paramMapCombined parameters.
-    // paramMapCombined overrides all parameters set earlier via lr.set* methods.
-    val model2 = lr.fit(training.toDF(), paramMapCombined)
-    println("Model 2 was fit using parameters: " + model2.parent.extractParamMap())
-
-    // Prepare test data.
-    val test = spark.createDataFrame(Seq(
-      LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
-      LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
-      LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))))
-
-    // Make predictions on test data using the Transformer.transform() method.
-    // LogisticRegressionModel.transform will only use the 'features' column.
-    // Note that model2.transform() outputs a 'myProbability' column instead of the usual
-    // 'probability' column since we renamed the lr.probabilityCol parameter previously.
-    model2.transform(test)
-      .select("features", "label", "myProbability", "prediction")
-      .collect()
-      .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
-        println(s"($features, $label) -> prob=$prob, prediction=$prediction")
-      }
-
-    spark.stop()
-  }
-}
-// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
deleted file mode 100644
index 0b2a058bb61aa266bc366ff2a40dc65a35165404..0000000000000000000000000000000000000000
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.ml
-
-import scala.beans.BeanInfo
-
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
-import org.apache.spark.ml.linalg.Vector
-import org.apache.spark.sql.{Row, SparkSession}
-
-@BeanInfo
-case class LabeledDocument(id: Long, text: String, label: Double)
-
-@BeanInfo
-case class Document(id: Long, text: String)
-
-/**
- * A simple text classification pipeline that recognizes "spark" from input text. This is to show
- * how to create and configure an ML pipeline. Run with
- * {{{
- * bin/run-example ml.SimpleTextClassificationPipeline
- * }}}
- */
-object SimpleTextClassificationPipeline {
-
-  def main(args: Array[String]) {
-    val spark = SparkSession
-      .builder
-      .appName("SimpleTextClassificationPipeline")
-      .getOrCreate()
-    import spark.implicits._
-
-    // Prepare training documents, which are labeled.
-    val training = spark.createDataFrame(Seq(
-      LabeledDocument(0L, "a b c d e spark", 1.0),
-      LabeledDocument(1L, "b d", 0.0),
-      LabeledDocument(2L, "spark f g h", 1.0),
-      LabeledDocument(3L, "hadoop mapreduce", 0.0)))
-
-    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
-    val tokenizer = new Tokenizer()
-      .setInputCol("text")
-      .setOutputCol("words")
-    val hashingTF = new HashingTF()
-      .setNumFeatures(1000)
-      .setInputCol(tokenizer.getOutputCol)
-      .setOutputCol("features")
-    val lr = new LogisticRegression()
-      .setMaxIter(10)
-      .setRegParam(0.001)
-    val pipeline = new Pipeline()
-      .setStages(Array(tokenizer, hashingTF, lr))
-
-    // Fit the pipeline to training documents.
-    val model = pipeline.fit(training.toDF())
-
-    // Prepare test documents, which are unlabeled.
-    val test = spark.createDataFrame(Seq(
-      Document(4L, "spark i j k"),
-      Document(5L, "l m n"),
-      Document(6L, "spark hadoop spark"),
-      Document(7L, "apache hadoop")))
-
-    // Make predictions on test documents.
-    model.transform(test.toDF())
-      .select("id", "text", "probability", "prediction")
-      .collect()
-      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
-        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
-      }
-
-    spark.stop()
-  }
-}
-// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
index fb1a43e962cd513821d4807c3aa5186c1436493f..a56de0856dbb898156a8aea91ad6e6e9703758a8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -36,7 +36,7 @@ object StopWordsRemoverExample {
       .setOutputCol("filtered")
 
     val dataSet = spark.createDataFrame(Seq(
-      (0, Seq("I", "saw", "the", "red", "baloon")),
+      (0, Seq("I", "saw", "the", "red", "balloon")),
       (1, Seq("Mary", "had", "a", "little", "lamb"))
     )).toDF("id", "raw")
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
index 33b5daec597838341f4952492d15af4f320ec235..97f6fcce15e368af6418edb2878ff2c9bedc99d5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -40,13 +40,16 @@ object TfIdfExample {
     val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
     val wordsData = tokenizer.transform(sentenceData)
+
     val hashingTF = new HashingTF()
       .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
+
     val featurizedData = hashingTF.transform(wordsData)
     // alternatively, CountVectorizer can also be used to get term frequency vectors
 
     val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
     val idfModel = idf.fit(featurizedData)
+
     val rescaledData = idfModel.transform(featurizedData)
     rescaledData.select("features", "label").take(3).foreach(println)
     // $example off$
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
index 1c70dc700b91c5bd4475e259902ecc431b45644e..90d0faaf47506e83eef763164c5df19603295fbe 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -45,6 +45,7 @@ object TokenizerExample {
     val tokenized = tokenizer.transform(sentenceDataFrame)
     tokenized.select("words", "label").take(3).foreach(println)
+
     val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
     regexTokenized.select("words", "label").take(3).foreach(println)
     // $example off$
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
index 9ac5623607296ca9223a0415bb9c5dfeeb3ef733..5c8bd19f20a8ca5644148d0607b4a42400d71448 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
@@ -45,6 +45,7 @@ object Word2VecExample {
       .setVectorSize(3)
       .setMinCount(0)
     val model = word2Vec.fit(documentDF)
+
     val result = model.transform(documentDF)
     result.select("result").take(3).foreach(println)
     // $example off$
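
The two files deleted in this patch (SimpleParamsExample.scala and SimpleTextClassificationPipeline.scala) were the examples that walked through ParamMap-based parameter overrides; the pattern itself is unchanged in the spark.ml API. Below is a minimal standalone sketch of that usage for reference, assuming a Spark 2.x application with spark-mllib on the classpath; the object name ParamMapSketch is illustrative only, and the toy data mirrors the deleted SimpleParamsExample.

// Minimal sketch of ParamMap-based parameter overrides (not part of the patch above).
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.SparkSession

object ParamMapSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("ParamMapSketch").getOrCreate()

    // Same toy training set as the deleted SimpleParamsExample.
    val training = spark.createDataFrame(Seq(
      LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
      LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
      LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
      LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))

    // Parameters set via setters are used by fit() unless overridden by a ParamMap.
    val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)

    // A ParamMap passed to fit() takes precedence over the lr.set* calls above.
    val paramMap = ParamMap(lr.maxIter -> 20)
    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.5, 0.5))
    val combined = paramMap ++ ParamMap(lr.probabilityCol -> "myProbability")

    val model = lr.fit(training, combined)
    println("Fit used parameters: " + model.parent.extractParamMap())

    spark.stop()
  }
}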