Skip to content
Snippets Groups Projects
Commit 835a79d7 authored by Sun Rui's avatar Sun Rui Committed by Shivaram Venkataraman
Browse files

[SPARK-10500][SPARKR] sparkr.zip cannot be created if /R/lib is unwritable

The basic idea is that:
The archive of the SparkR package itself, that is sparkr.zip, is created during build process and is contained in the Spark binary distribution. No change to it after the distribution is installed as the directory it resides ($SPARK_HOME/R/lib) may not be writable.

When there is R source code contained in jars or Spark packages specified with "--jars" or "--packages" command line option, a temporary directory is created by calling Utils.createTempDir() where the R packages built from the R source code will be installed. The temporary directory is writable, and won't interfere with each other when there are multiple SparkR sessions, and will be deleted when this SparkR session ends. The R binary packages installed in the temporary directory then are packed into an archive named rpkg.zip.

sparkr.zip and rpkg.zip are distributed to the cluster in YARN modes.

The distribution of rpkg.zip in Standalone modes is not supported in this PR, and will be address in another PR.

Various R files are updated to accept multiple lib paths (one is for SparkR package, the other is for other R packages)  so that these package can be accessed in R.

Author: Sun Rui <rui.sun@intel.com>

Closes #9390 from sun-rui/SPARK-10500.
parent d7d9fa0b
No related branches found
No related tags found
No related merge requests found
Showing
with 121 additions and 36 deletions
......@@ -25,3 +25,9 @@ set SPARK_HOME=%~dp0..
MKDIR %SPARK_HOME%\R\lib
R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" %SPARK_HOME%\R\pkg\
rem Zip the SparkR package so that it can be distributed to worker nodes on YARN
pushd %SPARK_HOME%\R\lib
%JAVA_HOME%\bin\jar.exe cfM "%SPARK_HOME%\R\lib\sparkr.zip" SparkR
popd
......@@ -42,4 +42,8 @@ Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtoo
# Install SparkR to $LIB_DIR
R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
# Zip the SparkR package so that it can be distributed to worker nodes on YARN
cd $LIB_DIR
jar cfM "$LIB_DIR/sparkr.zip" SparkR
popd > /dev/null
......@@ -48,6 +48,12 @@ sparkR.stop <- function() {
}
}
# Remove the R package lib path from .libPaths()
if (exists(".libPath", envir = env)) {
libPath <- get(".libPath", envir = env)
.libPaths(.libPaths()[.libPaths() != libPath])
}
if (exists(".backendLaunched", envir = env)) {
callJStatic("SparkRHandler", "stopBackend")
}
......@@ -155,14 +161,20 @@ sparkR.init <- function(
f <- file(path, open="rb")
backendPort <- readInt(f)
monitorPort <- readInt(f)
rLibPath <- readString(f)
close(f)
file.remove(path)
if (length(backendPort) == 0 || backendPort == 0 ||
length(monitorPort) == 0 || monitorPort == 0) {
length(monitorPort) == 0 || monitorPort == 0 ||
length(rLibPath) != 1) {
stop("JVM failed to launch")
}
assign(".monitorConn", socketConnection(port = monitorPort), envir = .sparkREnv)
assign(".backendLaunched", 1, envir = .sparkREnv)
if (rLibPath != "") {
assign(".libPath", rLibPath, envir = .sparkREnv)
.libPaths(c(rLibPath, .libPaths()))
}
}
.sparkREnv$backendPort <- backendPort
......
......@@ -17,6 +17,7 @@
.First <- function() {
packageDir <- Sys.getenv("SPARKR_PACKAGE_DIR")
.libPaths(c(packageDir, .libPaths()))
dirs <- strsplit(packageDir, ",")[[1]]
.libPaths(c(dirs, .libPaths()))
Sys.setenv(NOAWT=1)
}
......@@ -18,10 +18,11 @@
# Worker daemon
rLibDir <- Sys.getenv("SPARKR_RLIBDIR")
script <- paste(rLibDir, "SparkR/worker/worker.R", sep = "/")
dirs <- strsplit(rLibDir, ",")[[1]]
script <- file.path(dirs[[1]], "SparkR", "worker", "worker.R")
# preload SparkR package, speedup worker
.libPaths(c(rLibDir, .libPaths()))
.libPaths(c(dirs, .libPaths()))
suppressPackageStartupMessages(library(SparkR))
port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT"))
......
......@@ -35,10 +35,11 @@ bootTime <- currentTimeSecs()
bootElap <- elapsedSecs()
rLibDir <- Sys.getenv("SPARKR_RLIBDIR")
dirs <- strsplit(rLibDir, ",")[[1]]
# Set libPaths to include SparkR package as loadNamespace needs this
# TODO: Figure out if we can avoid this by not loading any objects that require
# SparkR namespace
.libPaths(c(rLibDir, .libPaths()))
.libPaths(c(dirs, .libPaths()))
suppressPackageStartupMessages(library(SparkR))
port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT"))
......
......@@ -113,6 +113,7 @@ private[spark] object RBackend extends Logging {
val dos = new DataOutputStream(new FileOutputStream(f))
dos.writeInt(boundPort)
dos.writeInt(listenPort)
SerDe.writeString(dos, RUtils.rPackages.getOrElse(""))
dos.close()
f.renameTo(new File(path))
......
......@@ -400,14 +400,14 @@ private[r] object RRDD {
val rOptions = "--vanilla"
val rLibDir = RUtils.sparkRPackagePath(isDriver = false)
val rExecScript = rLibDir + "/SparkR/worker/" + script
val rExecScript = rLibDir(0) + "/SparkR/worker/" + script
val pb = new ProcessBuilder(Arrays.asList(rCommand, rOptions, rExecScript))
// Unset the R_TESTS environment variable for workers.
// This is set by R CMD check as startup.Rs
// (http://svn.r-project.org/R/trunk/src/library/tools/R/testing.R)
// and confuses worker script which tries to load a non-existent file
pb.environment().put("R_TESTS", "")
pb.environment().put("SPARKR_RLIBDIR", rLibDir)
pb.environment().put("SPARKR_RLIBDIR", rLibDir.mkString(","))
pb.environment().put("SPARKR_WORKER_PORT", port.toString)
pb.redirectErrorStream(true) // redirect stderr into stdout
val proc = pb.start()
......
......@@ -23,6 +23,10 @@ import java.util.Arrays
import org.apache.spark.{SparkEnv, SparkException}
private[spark] object RUtils {
// Local path where R binary packages built from R source code contained in the spark
// packages specified with "--packages" or "--jars" command line option reside.
var rPackages: Option[String] = None
/**
* Get the SparkR package path in the local spark distribution.
*/
......@@ -34,11 +38,15 @@ private[spark] object RUtils {
}
/**
* Get the SparkR package path in various deployment modes.
* Get the list of paths for R packages in various deployment modes, of which the first
* path is for the SparkR package itself. The second path is for R packages built as
* part of Spark Packages, if any exist. Spark Packages can be provided through the
* "--packages" or "--jars" command line options.
*
* This assumes that Spark properties `spark.master` and `spark.submit.deployMode`
* and environment variable `SPARK_HOME` are set.
*/
def sparkRPackagePath(isDriver: Boolean): String = {
def sparkRPackagePath(isDriver: Boolean): Seq[String] = {
val (master, deployMode) =
if (isDriver) {
(sys.props("spark.master"), sys.props("spark.submit.deployMode"))
......@@ -51,15 +59,30 @@ private[spark] object RUtils {
val isYarnClient = master != null && master.contains("yarn") && deployMode == "client"
// In YARN mode, the SparkR package is distributed as an archive symbolically
// linked to the "sparkr" file in the current directory. Note that this does not apply
// to the driver in client mode because it is run outside of the cluster.
// linked to the "sparkr" file in the current directory and additional R packages
// are distributed as an archive symbolically linked to the "rpkg" file in the
// current directory.
//
// Note that this does not apply to the driver in client mode because it is run
// outside of the cluster.
if (isYarnCluster || (isYarnClient && !isDriver)) {
new File("sparkr").getAbsolutePath
val sparkRPkgPath = new File("sparkr").getAbsolutePath
val rPkgPath = new File("rpkg")
if (rPkgPath.exists()) {
Seq(sparkRPkgPath, rPkgPath.getAbsolutePath)
} else {
Seq(sparkRPkgPath)
}
} else {
// Otherwise, assume the package is local
// TODO: support this for Mesos
localSparkRPackagePath.getOrElse {
throw new SparkException("SPARK_HOME not set. Can't locate SparkR package.")
val sparkRPkgPath = localSparkRPackagePath.getOrElse {
throw new SparkException("SPARK_HOME not set. Can't locate SparkR package.")
}
if (!rPackages.isEmpty) {
Seq(sparkRPkgPath, rPackages.get)
} else {
Seq(sparkRPkgPath)
}
}
}
......
......@@ -100,20 +100,29 @@ private[deploy] object RPackageUtils extends Logging {
* Runs the standard R package installation code to build the R package from source.
* Multiple runs don't cause problems.
*/
private def rPackageBuilder(dir: File, printStream: PrintStream, verbose: Boolean): Boolean = {
private def rPackageBuilder(
dir: File,
printStream: PrintStream,
verbose: Boolean,
libDir: String): Boolean = {
// this code should be always running on the driver.
val pathToSparkR = RUtils.localSparkRPackagePath.getOrElse(
throw new SparkException("SPARK_HOME not set. Can't locate SparkR package."))
val pathToPkg = Seq(dir, "R", "pkg").mkString(File.separator)
val installCmd = baseInstallCmd ++ Seq(pathToSparkR, pathToPkg)
val installCmd = baseInstallCmd ++ Seq(libDir, pathToPkg)
if (verbose) {
print(s"Building R package with the command: $installCmd", printStream)
}
try {
val builder = new ProcessBuilder(installCmd.asJava)
builder.redirectErrorStream(true)
// Put the SparkR package directory into R library search paths in case this R package
// may depend on SparkR.
val env = builder.environment()
env.clear()
val rPackageDir = RUtils.sparkRPackagePath(isDriver = true)
env.put("SPARKR_PACKAGE_DIR", rPackageDir.mkString(","))
env.put("R_PROFILE_USER",
Seq(rPackageDir(0), "SparkR", "profile", "general.R").mkString(File.separator))
val process = builder.start()
new RedirectThread(process.getInputStream, printStream, "redirect R packaging").start()
process.waitFor() == 0
......@@ -170,8 +179,11 @@ private[deploy] object RPackageUtils extends Logging {
if (checkManifestForR(jar)) {
print(s"$file contains R source code. Now installing package.", printStream, Level.INFO)
val rSource = extractRFolder(jar, printStream, verbose)
if (RUtils.rPackages.isEmpty) {
RUtils.rPackages = Some(Utils.createTempDir().getAbsolutePath)
}
try {
if (!rPackageBuilder(rSource, printStream, verbose)) {
if (!rPackageBuilder(rSource, printStream, verbose, RUtils.rPackages.get)) {
print(s"ERROR: Failed to build R package in $file.", printStream)
print(RJarDoc, printStream)
}
......@@ -208,7 +220,7 @@ private[deploy] object RPackageUtils extends Logging {
}
}
/** Zips all the libraries found with SparkR in the R/lib directory for distribution with Yarn. */
/** Zips all the R libraries built for distribution to the cluster. */
private[deploy] def zipRLibraries(dir: File, name: String): File = {
val filesToBundle = listFilesRecursively(dir, Seq(".zip"))
// create a zip file from scratch, do not append to existing file.
......
......@@ -82,9 +82,10 @@ object RRunner {
val env = builder.environment()
env.put("EXISTING_SPARKR_BACKEND_PORT", sparkRBackendPort.toString)
val rPackageDir = RUtils.sparkRPackagePath(isDriver = true)
env.put("SPARKR_PACKAGE_DIR", rPackageDir)
// Put the R package directories into an env variable of comma-separated paths
env.put("SPARKR_PACKAGE_DIR", rPackageDir.mkString(","))
env.put("R_PROFILE_USER",
Seq(rPackageDir, "SparkR", "profile", "general.R").mkString(File.separator))
Seq(rPackageDir(0), "SparkR", "profile", "general.R").mkString(File.separator))
builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize
val process = builder.start()
......
......@@ -83,6 +83,7 @@ object SparkSubmit {
private val PYSPARK_SHELL = "pyspark-shell"
private val SPARKR_SHELL = "sparkr-shell"
private val SPARKR_PACKAGE_ARCHIVE = "sparkr.zip"
private val R_PACKAGE_ARCHIVE = "rpkg.zip"
private val CLASS_NOT_FOUND_EXIT_STATUS = 101
......@@ -362,22 +363,46 @@ object SparkSubmit {
}
}
// In YARN mode for an R app, add the SparkR package archive to archives
// that can be distributed with the job
// In YARN mode for an R app, add the SparkR package archive and the R package
// archive containing all of the built R libraries to archives so that they can
// be distributed with the job
if (args.isR && clusterManager == YARN) {
val rPackagePath = RUtils.localSparkRPackagePath
if (rPackagePath.isEmpty) {
val sparkRPackagePath = RUtils.localSparkRPackagePath
if (sparkRPackagePath.isEmpty) {
printErrorAndExit("SPARK_HOME does not exist for R application in YARN mode.")
}
val rPackageFile =
RPackageUtils.zipRLibraries(new File(rPackagePath.get), SPARKR_PACKAGE_ARCHIVE)
if (!rPackageFile.exists()) {
val sparkRPackageFile = new File(sparkRPackagePath.get, SPARKR_PACKAGE_ARCHIVE)
if (!sparkRPackageFile.exists()) {
printErrorAndExit(s"$SPARKR_PACKAGE_ARCHIVE does not exist for R application in YARN mode.")
}
val localURI = Utils.resolveURI(rPackageFile.getAbsolutePath)
val sparkRPackageURI = Utils.resolveURI(sparkRPackageFile.getAbsolutePath).toString
// Distribute the SparkR package.
// Assigns a symbol link name "sparkr" to the shipped package.
args.archives = mergeFileLists(args.archives, localURI.toString + "#sparkr")
args.archives = mergeFileLists(args.archives, sparkRPackageURI + "#sparkr")
// Distribute the R package archive containing all the built R packages.
if (!RUtils.rPackages.isEmpty) {
val rPackageFile =
RPackageUtils.zipRLibraries(new File(RUtils.rPackages.get), R_PACKAGE_ARCHIVE)
if (!rPackageFile.exists()) {
printErrorAndExit("Failed to zip all the built R packages.")
}
val rPackageURI = Utils.resolveURI(rPackageFile.getAbsolutePath).toString
// Assigns a symbol link name "rpkg" to the shipped package.
args.archives = mergeFileLists(args.archives, rPackageURI + "#rpkg")
}
}
// TODO: Support distributing R packages with standalone cluster
if (args.isR && clusterManager == STANDALONE && !RUtils.rPackages.isEmpty) {
printErrorAndExit("Distributing R packages with standalone cluster is not supported.")
}
// TODO: Support SparkR with mesos cluster
if (args.isR && clusterManager == MESOS) {
printErrorAndExit("SparkR is not supported for Mesos cluster.")
}
// If we're running a R app, set the main class to our specific R runner
......
......@@ -28,6 +28,7 @@ import org.scalatest.concurrent.Timeouts
import org.scalatest.time.SpanSugar._
import org.apache.spark._
import org.apache.spark.api.r.RUtils
import org.apache.spark.deploy.SparkSubmit._
import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate
import org.apache.spark.util.{ResetSystemProperties, Utils}
......@@ -369,9 +370,6 @@ class SparkSubmitSuite
}
test("correctly builds R packages included in a jar with --packages") {
// TODO(SPARK-9603): Building a package to $SPARK_HOME/R/lib is unavailable on Jenkins.
// It's hard to write the test in SparkR (because we can't create the repository dynamically)
/*
assume(RUtils.isRInstalled, "R isn't installed on this machine.")
val main = MavenCoordinate("my.great.lib", "mylib", "0.1")
val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
......@@ -389,7 +387,6 @@ class SparkSubmitSuite
rScriptDir)
runSparkSubmit(args)
}
*/
}
test("resolves command line argument paths correctly") {
......
......@@ -220,6 +220,7 @@ cp -r "$SPARK_HOME/ec2" "$DISTDIR"
if [ -d "$SPARK_HOME"/R/lib/SparkR ]; then
mkdir -p "$DISTDIR"/R/lib
cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR"/R/lib
cp "$SPARK_HOME/R/lib/sparkr.zip" "$DISTDIR"/R/lib
fi
# Download and copy in tachyon, if requested
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment