diff --git a/pyspark/README b/pyspark/README
index 461176de7de615d1e5f380c637381fc9a170d2e7..d8d521c72cb77148274698f1f1c025f8c982c9a9 100644
--- a/pyspark/README
+++ b/pyspark/README
@@ -32,30 +32,11 @@ The `pyspark/pyspark/examples` directory contains a few complete examples.
 
 ## Installing PySpark
 
-
-PySpark requires a development version of Py4J, a Python library for
-interacting with Java processes. It can be installed from
-https://github.com/bartdag/py4j; make sure to install a version that
-contains at least the commits through b7924aabe9.
-
-PySpark requires the `argparse` module, which is included in Python 2.7
-and is is available for Python 2.6 through `pip` or `easy_install`.
-
-PySpark uses the `PYTHONPATH` environment variable to search for Python
-classes; Py4J should be on this path, along with any libraries used by
-PySpark programs. `PYTHONPATH` will be automatically shipped to worker
-machines, but the files that it points to must be present on each
-machine.
-
-PySpark requires the Spark assembly JAR, which can be created by running
-`sbt/sbt assembly` in the Spark directory.
-
-Additionally, `SPARK_HOME` should be set to the location of the Spark
+
+To use PySpark, `SPARK_HOME` should be set to the location of the Spark
 package.
 
 ## Running PySpark
 
 The easiest way to run PySpark is to use the `run-pyspark` and
 `pyspark-shell` scripts, which are included in the `pyspark` directory.
-These scripts automatically load the `spark-conf.sh` file, set
-`SPARK_HOME`, and add the `pyspark` package to the `PYTHONPATH`.
diff --git a/pyspark/lib/PY4J_LICENSE.txt b/pyspark/lib/PY4J_LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a70279ca14ae3cdfd3f166871eb888f4d6e112ac
--- /dev/null
+++ b/pyspark/lib/PY4J_LICENSE.txt
@@ -0,0 +1,27 @@
+
+Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+- The name of the author may not be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
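With the installation instructions gone, the README's only remaining requirement is that `SPARK_HOME` point at the Spark package, since the bundled Py4J egg and the `run` script are now resolved relative to it. A minimal pre-flight check along these lines (an illustrative sketch, not part of this patch) can catch a missing variable before any Py4J import fails:

```python
import os
import sys

# Illustrative sketch, not part of this patch: PySpark now locates the
# bundled Py4J egg and Spark's run script relative to SPARK_HOME, so fail
# fast if it is unset or does not point at a Spark package.
spark_home = os.environ.get("SPARK_HOME")
if not spark_home or not os.path.exists(os.path.join(spark_home, "run")):
    sys.exit("SPARK_HOME must be set to the location of the Spark package")
print "Using Spark at %s" % spark_home
```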
diff --git a/pyspark/lib/PY4J_VERSION.txt b/pyspark/lib/PY4J_VERSION.txt
new file mode 100644
index 0000000000000000000000000000000000000000..04a0cd52a8d9c0bff32eb2d907f09b41969232ed
--- /dev/null
+++ b/pyspark/lib/PY4J_VERSION.txt
@@ -0,0 +1 @@
+b7924aabe9c5e63f0a4d8bbd17019534c7ec014e
diff --git a/pyspark/lib/py4j0.7.egg b/pyspark/lib/py4j0.7.egg
new file mode 100644
index 0000000000000000000000000000000000000000..f8a339d8eef591afce7e7e0e5beb73355bf0a4bf
Binary files /dev/null and b/pyspark/lib/py4j0.7.egg differ
diff --git a/pyspark/lib/py4j0.7.jar b/pyspark/lib/py4j0.7.jar
new file mode 100644
index 0000000000000000000000000000000000000000..73b7ddb7d115be2fb1ba4cf8003fe90c662feef6
Binary files /dev/null and b/pyspark/lib/py4j0.7.jar differ
diff --git a/pyspark/pyspark-shell b/pyspark/pyspark-shell
index 4ed3e6010c3d1f48d318c7d58f25d9cc7613b3aa..e3736826e893efd643df01760199fae4b28cf1d5 100755
--- a/pyspark/pyspark-shell
+++ b/pyspark/pyspark-shell
@@ -1,3 +1,3 @@
-#!/bin/sh
+#!/usr/bin/env bash
 FWDIR="`dirname $0`"
 exec $FWDIR/run-pyspark $FWDIR/pyspark/shell.py "$@"
diff --git a/pyspark/pyspark/__init__.py b/pyspark/pyspark/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..549c2d271149e3f24763658bc9fb3597ff56b365 100644
--- a/pyspark/pyspark/__init__.py
+++ b/pyspark/pyspark/__init__.py
@@ -0,0 +1,3 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
diff --git a/pyspark/pyspark/java_gateway.py b/pyspark/pyspark/java_gateway.py
index 3726bcbf1780afd2f116e61c4d9d3eaef0bcadda..d4a4434c0556c283dacf79f572d060d9b9c3921a 100644
--- a/pyspark/pyspark/java_gateway.py
+++ b/pyspark/pyspark/java_gateway.py
@@ -1,19 +1,36 @@
-import glob
 import os
-from py4j.java_gateway import java_import, JavaGateway
+from subprocess import Popen, PIPE
+from threading import Thread
+from py4j.java_gateway import java_import, JavaGateway, GatewayClient
 
 SPARK_HOME = os.environ["SPARK_HOME"]
 
-assembly_jar = glob.glob(os.path.join(SPARK_HOME, "core/target") + \
-    "/spark-core-assembly-*.jar")[0]
-
-# TODO: what if multiple assembly jars are found?
-
-
 def launch_gateway():
-    gateway = JavaGateway.launch_gateway(classpath=assembly_jar,
-        javaopts=["-Xmx256m"], die_on_exit=True)
+    # Launch the Py4j gateway using Spark's run command so that we pick up the
+    # proper classpath and SPARK_MEM settings from spark-env.sh
+    command = [os.path.join(SPARK_HOME, "run"), "py4j.GatewayServer",
+               "--die-on-broken-pipe", "0"]
+    proc = Popen(command, stdout=PIPE, stdin=PIPE)
+    # Determine which ephemeral port the server started on:
+    port = int(proc.stdout.readline())
+    # Create a thread to echo output from the GatewayServer, which is required
+    # for Java log output to show up:
+    class EchoOutputThread(Thread):
+
+        def __init__(self, stream):
+            Thread.__init__(self)
+            self.daemon = True
+            self.stream = stream
+
+        def run(self):
+            while True:
+                line = self.stream.readline()
+                print line,
+    EchoOutputThread(proc.stdout).start()
+    # Connect to the gateway
+    gateway = JavaGateway(GatewayClient(port=port))
+    # Import the classes used by PySpark
     java_import(gateway.jvm, "spark.api.java.*")
     java_import(gateway.jvm, "spark.api.python.*")
     java_import(gateway.jvm, "scala.Tuple2")
 
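The rewritten `launch_gateway()` starts `py4j.GatewayServer` through Spark's `run` script with port `0` (so the OS picks an ephemeral port, which the server prints on stdout) and `--die-on-broken-pipe` (so the JVM exits when the Python side goes away), then connects a `GatewayClient` to that port. A short consumer sketch, assuming `launch_gateway()` returns the connected gateway (the driver code below is illustrative; only the `java_gateway` module comes from this patch):

```python
from pyspark.java_gateway import launch_gateway

# Starts the JVM-side GatewayServer via $SPARK_HOME/run and returns a
# connected JavaGateway; requires SPARK_HOME to be set.
gateway = launch_gateway()

# The java_import() calls in launch_gateway() make these classes visible
# directly on gateway.jvm (scala.Tuple2 used here as a harmless smoke test):
pair = gateway.jvm.Tuple2(1, "one")
print pair._1(), pair._2()
```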
""" -import argparse # argparse is avaiable for Python < 2.7 through easy_install. +import optparse # I prefer argparse, but it's not included with Python < 2.7 import code import sys @@ -21,10 +21,13 @@ def main(master='local', ipython=False): if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("master", help="Spark master host (default='local')", - nargs='?', type=str, default="local") - parser.add_argument("-i", "--ipython", help="Run IPython shell", - action="store_true") - args = parser.parse_args() - main(args.master, args.ipython) + usage = "usage: %prog [options] master" + parser = optparse.OptionParser(usage=usage) + parser.add_option("-i", "--ipython", help="Run IPython shell", + action="store_true") + (options, args) = parser.parse_args() + if len(sys.argv) > 1: + master = args[0] + else: + master = 'local' + main(master, options.ipython) diff --git a/pyspark/requirements.txt b/pyspark/requirements.txt deleted file mode 100644 index 2464ca0074ff298cdf15752e8da9617c2f3a7754..0000000000000000000000000000000000000000 --- a/pyspark/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -# The Python API relies on some new features from the Py4J development branch. -# pip can't install Py4J from git because the setup.py file for the Python -# package is not at the root of the git repository. It may be possible to -# install Py4J from git once https://github.com/pypa/pip/pull/526 is merged. - -# git+git://github.com/bartdag/py4j.git@b7924aabe9c5e63f0a4d8bbd17019534c7ec014e -argparse diff --git a/pyspark/run-pyspark b/pyspark/run-pyspark index 9c5e027962ebad91e6716af3f523fa238089b961..f8039b80383577ed8849eedbdcc6ec059f1a6356 100755 --- a/pyspark/run-pyspark +++ b/pyspark/run-pyspark @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Figure out where the Scala framework is installed FWDIR="$(cd `dirname $0`; cd ../; pwd)" diff --git a/run b/run index 15db23bbe0c6b9b135b3208ffb000232038b4cff..8fa61b086fe6513dfd856fd90cdea3853c606ded 100755 --- a/run +++ b/run @@ -40,6 +40,7 @@ CORE_DIR="$FWDIR/core" REPL_DIR="$FWDIR/repl" EXAMPLES_DIR="$FWDIR/examples" BAGEL_DIR="$FWDIR/bagel" +PYSPARK_DIR="$FWDIR/pyspark" # Build up classpath CLASSPATH="$SPARK_CLASSPATH" @@ -61,6 +62,9 @@ for jar in `find $REPL_DIR/lib -name '*jar'`; do CLASSPATH+=":$jar" done CLASSPATH+=":$BAGEL_DIR/target/scala-$SCALA_VERSION/classes" +for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do + CLASSPATH+=":$jar" +done export CLASSPATH # Needed for spark-shell # Figure out whether to run our class with java or with the scala launcher. 
diff --git a/run2.cmd b/run2.cmd
index 097718b526a7a9bbac3d4fd5d381b1f831bf6e8d..60247407269e4009a6b4775f86daa3ae7aa4aa4f 100644
--- a/run2.cmd
+++ b/run2.cmd
@@ -34,6 +34,7 @@ set CORE_DIR=%FWDIR%core
 set REPL_DIR=%FWDIR%repl
 set EXAMPLES_DIR=%FWDIR%examples
 set BAGEL_DIR=%FWDIR%bagel
+set PYSPARK_DIR=%FWDIR%pyspark
 
 rem Build up classpath
 set CLASSPATH=%SPARK_CLASSPATH%;%MESOS_CLASSPATH%;%FWDIR%conf;%CORE_DIR%\target\scala-%SCALA_VERSION%\classes
@@ -42,6 +43,7 @@ set CLASSPATH=%CLASSPATH%;%REPL_DIR%\target\scala-%SCALA_VERSION%\classes;%EXAMPLES_DIR%\target\scala-%SCALA_VERSION%\classes
 for /R "%FWDIR%\lib_managed\jars" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 for /R "%FWDIR%\lib_managed\bundles" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 for /R "%REPL_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
+for /R "%PYSPARK_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes
 
 rem Figure out whether to run our class with java or with the scala launcher.
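Both launchers now do the same thing on their respective platforms: every jar under `pyspark/lib` (currently just the bundled `py4j0.7.jar`) is appended to the classpath before the JVM starts, which is what lets `launch_gateway()` find `py4j.GatewayServer`. In effect (an illustrative Python sketch of the shell logic, not part of the patch):

```python
import glob
import os

# Equivalent of the new classpath loops in `run` and `run2.cmd`: pick up
# every jar shipped under pyspark/lib, i.e. the bundled Py4J jar.
spark_home = os.environ["SPARK_HOME"]
jars = glob.glob(os.path.join(spark_home, "pyspark", "lib", "*.jar"))
# os.pathsep matches the ':' used by `run` and the ';' used by `run2.cmd`.
classpath = os.pathsep.join([os.environ.get("CLASSPATH", "")] + jars)
print classpath
```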