Commit fa9df4a4 authored by Charles Reiss

Normalize executor exit statuses and report them to the user.

parent 1d8e2e6c
@@ -50,9 +50,14 @@ private[spark] class Executor extends Logging {
       override def uncaughtException(thread: Thread, exception: Throwable) {
         try {
           logError("Uncaught exception in thread " + thread, exception)
-          System.exit(1)
+          if (exception.isInstanceOf[OutOfMemoryError]) {
+            System.exit(ExecutorExitCode.OOM)
+          } else {
+            System.exit(ExecutorExitCode.UNCAUGHT_EXCEPTION)
+          }
         } catch {
-          case t: Throwable => System.exit(2)
+          case oom: OutOfMemoryError => System.exit(ExecutorExitCode.OOM)
+          case t: Throwable => System.exit(ExecutorExitCode.UNCAUGHT_EXCEPTION_TWICE)
         }
       }
     }
package spark.executor

/**
 * These are exit codes that executors should use to provide the master with information about
 * executor failures, assuming that the cluster management framework can capture the exit codes
 * (but perhaps not log files). The exit code constants here are chosen to be unlikely to conflict
 * with "natural" exit statuses that may be caused by the JVM or user code. In particular,
 * exit codes 128+ arise on some Unix-likes as a result of signals, and it appears that the
 * OpenJDK JVM may use exit code 1 in some of its own "last chance" code.
 */
private[spark]
object ExecutorExitCode {
  /** The default uncaught exception handler was reached. */
  val UNCAUGHT_EXCEPTION = 50

  /** The default uncaught exception handler was called and an exception was encountered while
      logging the exception. */
  val UNCAUGHT_EXCEPTION_TWICE = 51

  /** The default uncaught exception handler was reached, and the uncaught exception was an
      OutOfMemoryError. */
  val OOM = 52

  /** DiskStore failed to create a local temporary directory after many attempts. */
  val DISK_STORE_FAILED_TO_CREATE_DIR = 53

  def explainExitCode(exitCode: Int): String = {
    exitCode match {
      case UNCAUGHT_EXCEPTION => "Uncaught exception"
      case UNCAUGHT_EXCEPTION_TWICE => "Uncaught exception, and logging the exception failed"
      case OOM => "OutOfMemoryError"
      case DISK_STORE_FAILED_TO_CREATE_DIR =>
        "Failed to create local directory (bad spark.local.dir?)"
      case _ =>
        "Unknown executor exit code (" + exitCode + ")" + (
          if (exitCode > 128)
            " (died from signal " + (exitCode - 128) + "?)"
          else
            ""
        )
    }
  }
}
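For a rough sense of how these constants decode, here is a small usage sketch; it is not part of the commit, and the literal status 137 is just an example of a signal-derived exit status:

    // Illustrative only; not code from the commit.
    import spark.executor.ExecutorExitCode

    ExecutorExitCode.explainExitCode(ExecutorExitCode.OOM)
    // => "OutOfMemoryError"
    ExecutorExitCode.explainExitCode(ExecutorExitCode.DISK_STORE_FAILED_TO_CREATE_DIR)
    // => "Failed to create local directory (bad spark.local.dir?)"
    ExecutorExitCode.explainExitCode(137)
    // => "Unknown executor exit code (137) (died from signal 9?)"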
@@ -249,7 +249,7 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
     }
   }
 
-  def slaveLost(slaveId: String) {
+  def slaveLost(slaveId: String, reason: ExecutorLostReason) {
     var failedHost: Option[String] = None
     synchronized {
       val host = slaveIdToHost(slaveId)
@@ -261,6 +261,7 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
       }
     }
     if (failedHost != None) {
+      logError("Lost an executor on " + failedHost.get + ": " + reason)
       listener.hostLost(failedHost.get)
       backend.reviveOffers()
     }
package spark.scheduler.cluster

import spark.executor.ExecutorExitCode

/**
 * Represents an explanation for an executor or whole slave failing or exiting.
 */
private[spark]
class ExecutorLostReason(val message: String) {
  override def toString: String = message
}

private[spark]
case class ExecutorExited(val exitCode: Int)
  extends ExecutorLostReason(ExecutorExitCode.explainExitCode(exitCode)) {
}

private[spark]
case class SlaveLost(_message: String = "Slave lost")
  extends ExecutorLostReason(_message) {
}
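As a sketch of how these reasons read once logged, the lines below mirror the logError call added to ClusterScheduler; the host names are made up, and the messages come from the toString defined above (ExecutorExited fills its message in via ExecutorExitCode.explainExitCode):

    // Illustrative only; not code from the commit.
    import spark.scheduler.cluster.{ExecutorExited, SlaveLost}

    println("Lost an executor on host-1: " + ExecutorExited(52))  // ...: OutOfMemoryError
    println("Lost an executor on host-2: " + SlaveLost())         // ...: Slave lost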
@@ -72,11 +72,19 @@ private[spark] class SparkDeploySchedulerBackend(
   }
 
   def executorRemoved(id: String, message: String) {
+    var reason: ExecutorLostReason = SlaveLost(message)
+    if (message.startsWith("Command exited with code ")) {
+      try {
+        reason = ExecutorExited(message.substring("Command exited with code ".length).toInt)
+      } catch {
+        case nfe: NumberFormatException => {}
+      }
+    }
     logInfo("Executor %s removed: %s".format(id, message))
     executorIdToSlaveId.get(id) match {
       case Some(slaveId) =>
         executorIdToSlaveId.remove(id)
-        scheduler.slaveLost(slaveId)
+        scheduler.slaveLost(slaveId, reason)
       case None =>
         logInfo("No slave ID known for executor %s".format(id))
     }
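For illustration, the inline parse of the master's removal message could also be written as a standalone helper; this is a hypothetical refactoring, not code from the commit:

    // Hypothetical helper equivalent to the inline parse above.
    def parseExitCode(message: String): Option[Int] = {
      val prefix = "Command exited with code "
      if (message.startsWith(prefix)) {
        try {
          Some(message.substring(prefix.length).toInt)
        } catch {
          case nfe: NumberFormatException => None
        }
      } else {
        None
      }
    }

    // parseExitCode("Command exited with code 52") == Some(52) -> report ExecutorExited(52)
    // any other message                            == None     -> fall back to SlaveLost(message)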
@@ -109,7 +109,7 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
       freeCores -= slaveId
       slaveHost -= slaveId
       totalCoreCount.addAndGet(-numCores)
-      scheduler.slaveLost(slaveId)
+      scheduler.slaveLost(slaveId, SlaveLost())
     }
   }
@@ -267,17 +267,23 @@ private[spark] class MesosSchedulerBackend(
   override def frameworkMessage(d: SchedulerDriver, e: ExecutorID, s: SlaveID, b: Array[Byte]) {}
 
-  override def slaveLost(d: SchedulerDriver, slaveId: SlaveID) {
+  private def recordSlaveLost(d: SchedulerDriver, slaveId: SlaveID, reason: ExecutorLostReason) {
     logInfo("Mesos slave lost: " + slaveId.getValue)
     synchronized {
       slaveIdsWithExecutors -= slaveId.getValue
     }
-    scheduler.slaveLost(slaveId.getValue)
+    scheduler.slaveLost(slaveId.getValue, reason)
+  }
+
+  override def slaveLost(d: SchedulerDriver, slaveId: SlaveID) {
+    recordSlaveLost(d, slaveId, SlaveLost())
   }
 
-  override def executorLost(d: SchedulerDriver, e: ExecutorID, s: SlaveID, status: Int) {
-    logInfo("Executor lost: %s, marking slave %s as lost".format(e.getValue, s.getValue))
-    slaveLost(d, s)
+  override def executorLost(d: SchedulerDriver, executorId: ExecutorID,
+                            slaveId: SlaveID, status: Int) {
+    logInfo("Executor lost: %s, marking slave %s as lost".format(executorId.getValue,
+                                                                 slaveId.getValue))
+    recordSlaveLost(d, slaveId, ExecutorExited(status))
   }
 
   // TODO: query Mesos for number of cores
@@ -10,6 +10,8 @@ import it.unimi.dsi.fastutil.io.FastBufferedOutputStream
 import scala.collection.mutable.ArrayBuffer
 
+import spark.executor.ExecutorExitCode
+
 import spark.Utils
 
 /**
@@ -162,7 +164,7 @@ private class DiskStore(blockManager: BlockManager, rootDirs: String)
     if (!foundLocalDir) {
       logError("Failed " + MAX_DIR_CREATION_ATTEMPTS +
         " attempts to create local dir in " + rootDir)
-      System.exit(1)
+      System.exit(ExecutorExitCode.DISK_STORE_FAILED_TO_CREATE_DIR)
     }
     logInfo("Created local directory at " + localDir)
     localDir