Commit 4c28a2ba authored by Matei Zaharia

Update some Python MLlib parameters to use camelCase, and tweak docs

We've used camel case in other Spark methods so it felt reasonable to
keep using it here and make the code match Scala/Java as much as
possible. Note that parameter names matter in Python because callers can
pass optional parameters by name.
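
For illustration (not part of this commit), a keyword-argument call in the
PySpark shell might look like the sketch below; the module path
pyspark.mllib.classification, the shell's built-in `sc`, and the toy data are
assumptions for the example:

    from numpy import array
    from pyspark.mllib.classification import LogisticRegressionWithSGD  # assumed module path

    # Illustrative data: each row is [label, feature], labels are 0/1.
    points = [array([0.0, 0.0]), array([1.0, 1.0]),
              array([0.0, 2.0]), array([1.0, 3.0])]

    # Optional parameters are passed by their new camelCase keyword names,
    # matching the Scala/Java API.
    model = LogisticRegressionWithSGD.train(sc.parallelize(points),
                                            iterations=50,
                                            miniBatchFraction=1.0,
                                            initialWeights=array([1.0]))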
parent 9a0dfdf8
@@ -21,6 +21,8 @@ depends on native Fortran routines. You may need to install the
if it is not already present on your nodes. MLlib will throw a linking error if it cannot
detect these libraries automatically.
To use MLlib in Python, you will also need [NumPy](http://www.numpy.org) version 1.7 or newer.
# Binary Classification
Binary classification is a supervised learning problem in which we want to
@@ -316,6 +318,13 @@ other signals), you can use the trainImplicit method to get better results.
val model = ALS.trainImplicit(ratings, 1, 20, 0.01)
{% endhighlight %}
# Using MLLib in Java
All of MLlib's methods use Java-friendly types, so you can import and call them there the same
way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
calling `.rdd()` on your `JavaRDD` object.
# Using MLLib in Python
Following examples can be tested in the PySpark shell.
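
(Editor's illustration, not part of the diff: a minimal PySpark-shell sketch
using the camelCase names this commit introduces and the sample data from the
regression doctests below; the module path pyspark.mllib.regression and the
shell's built-in `sc` are assumed.)

    >>> from numpy import array
    >>> from pyspark.mllib.regression import LinearRegressionWithSGD  # assumed module path
    >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4, 2)
    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
    ...                                     miniBatchFraction=1.0, initialWeights=array([1.0]))
    >>> lrm.predict(array([2.0]))  # predict from a single feature value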
@@ -44,13 +44,13 @@ class LogisticRegressionModel(LinearModel):
class LogisticRegressionWithSGD(object):
    @classmethod
    def train(cls, data, iterations=100, step=1.0,
-             mini_batch_fraction=1.0, initial_weights=None):
+             miniBatchFraction=1.0, initialWeights=None):
        """Train a logistic regression model on the given data."""
        sc = data.context
        return _regression_train_wrapper(sc, lambda d, i:
                sc._jvm.PythonMLLibAPI().trainLogisticRegressionModelWithSGD(d._jrdd,
-                       iterations, step, mini_batch_fraction, i),
-               LogisticRegressionModel, data, initial_weights)
+                       iterations, step, miniBatchFraction, i),
+               LogisticRegressionModel, data, initialWeights)

class SVMModel(LinearModel):
    """A support vector machine.
@@ -67,14 +67,14 @@ class SVMModel(LinearModel):
class SVMWithSGD(object):
    @classmethod
-   def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
-             mini_batch_fraction=1.0, initial_weights=None):
+   def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+             miniBatchFraction=1.0, initialWeights=None):
        """Train a support vector machine on the given data."""
        sc = data.context
        return _regression_train_wrapper(sc, lambda d, i:
                sc._jvm.PythonMLLibAPI().trainSVMModelWithSGD(d._jrdd,
-                       iterations, step, reg_param, mini_batch_fraction, i),
-               SVMModel, data, initial_weights)
+                       iterations, step, regParam, miniBatchFraction, i),
+               SVMModel, data, initialWeights)

class NaiveBayesModel(object):
    """
@@ -47,57 +47,57 @@ class LinearRegressionModel(LinearRegressionModelBase):
"""A linear regression model derived from a least-squares fit.
>>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
"""
class LinearRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0,
mini_batch_fraction=1.0, initial_weights=None):
miniBatchFraction=1.0, initialWeights=None):
"""Train a linear regression model on the given data."""
sc = data.context
return _regression_train_wrapper(sc, lambda d, i:
sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD(
d._jrdd, iterations, step, mini_batch_fraction, i),
LinearRegressionModel, data, initial_weights)
d._jrdd, iterations, step, miniBatchFraction, i),
LinearRegressionModel, data, initialWeights)
class LassoModel(LinearRegressionModelBase):
    """A linear regression model derived from a least-squares fit with an
    l_1 penalty term.
    >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
-   >>> lrm = LassoWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+   >>> lrm = LassoWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
    """

class LassoWithSGD(object):
    @classmethod
-   def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
-             mini_batch_fraction=1.0, initial_weights=None):
+   def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+             miniBatchFraction=1.0, initialWeights=None):
        """Train a Lasso regression model on the given data."""
        sc = data.context
        return _regression_train_wrapper(sc, lambda d, i:
                sc._jvm.PythonMLLibAPI().trainLassoModelWithSGD(d._jrdd,
-                       iterations, step, reg_param, mini_batch_fraction, i),
-               LassoModel, data, initial_weights)
+                       iterations, step, regParam, miniBatchFraction, i),
+               LassoModel, data, initialWeights)
class RidgeRegressionModel(LinearRegressionModelBase):
    """A linear regression model derived from a least-squares fit with an
    l_2 penalty term.
    >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
-   >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+   >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
    """

class RidgeRegressionWithSGD(object):
    @classmethod
-   def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
-             mini_batch_fraction=1.0, initial_weights=None):
+   def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+             miniBatchFraction=1.0, initialWeights=None):
        """Train a ridge regression model on the given data."""
        sc = data.context
        return _regression_train_wrapper(sc, lambda d, i:
                sc._jvm.PythonMLLibAPI().trainRidgeModelWithSGD(d._jrdd,
-                       iterations, step, reg_param, mini_batch_fraction, i),
-               RidgeRegressionModel, data, initial_weights)
+                       iterations, step, regParam, miniBatchFraction, i),
+               RidgeRegressionModel, data, initialWeights)
def _test():
    import doctest