Commit 49c78446 authored by Matei Zaharia

Merge pull request #402 from stephenh/handleslavelosttoosoon

Handle slaveLost before slaveIdToHost knows about it.
parents d1de9d7d 27b3f3f0
@@ -252,19 +252,24 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
   def slaveLost(slaveId: String, reason: ExecutorLossReason) {
     var failedHost: Option[String] = None
     synchronized {
-      val host = slaveIdToHost(slaveId)
-      if (hostsAlive.contains(host)) {
-        logError("Lost an executor on " + host + ": " + reason)
-        slaveIdsWithExecutors -= slaveId
-        hostsAlive -= host
-        activeTaskSetsQueue.foreach(_.hostLost(host))
-        failedHost = Some(host)
-      } else {
-        // We may get multiple slaveLost() calls with different loss reasons. For example, one
-        // may be triggered by a dropped connection from the slave while another may be a report
-        // of executor termination from Mesos. We produce log messages for both so we eventually
-        // report the termination reason.
-        logError("Lost an executor on " + host + " (already removed): " + reason)
+      slaveIdToHost.get(slaveId) match {
+        case Some(host) =>
+          if (hostsAlive.contains(host)) {
+            logError("Lost an executor on " + host + ": " + reason)
+            slaveIdsWithExecutors -= slaveId
+            hostsAlive -= host
+            activeTaskSetsQueue.foreach(_.hostLost(host))
+            failedHost = Some(host)
+          } else {
+            // We may get multiple slaveLost() calls with different loss reasons. For example, one
+            // may be triggered by a dropped connection from the slave while another may be a report
+            // of executor termination from Mesos. We produce log messages for both so we eventually
+            // report the termination reason.
+            logError("Lost an executor on " + host + " (already removed): " + reason)
+          }
+        case None =>
+          // We were told about a slave being lost before we could even allocate work to it
+          logError("Lost slave " + slaveId + " (no work assigned yet)")
       }
     }
     if (failedHost != None) {
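The gist of the fix: on a Scala mutable Map, apply (slaveIdToHost(slaveId)) throws NoSuchElementException for an unknown key, so a slaveLost() call arriving before the scheduler had registered the slave would crash; get returns an Option, letting the None case be logged and ignored instead. A minimal standalone sketch of that pattern, using hypothetical names (SlaveLostSketch, "slave-1", "host-a") rather than the real Spark classes:

import scala.collection.mutable.HashMap

// Sketch of the commit's pattern: prefer get + match over apply so an
// unknown slave id is handled instead of throwing NoSuchElementException.
object SlaveLostSketch {
  val slaveIdToHost = new HashMap[String, String]

  def slaveLost(slaveId: String): Unit = {
    // Old approach: slaveIdToHost(slaveId) throws for an unregistered id.
    slaveIdToHost.get(slaveId) match {
      case Some(host) => println("Lost an executor on " + host)
      case None       => println("Lost slave " + slaveId + " (no work assigned yet)")
    }
  }

  def main(args: Array[String]): Unit = {
    slaveIdToHost("slave-1") = "host-a"
    slaveLost("slave-1")  // known id: prints the executor-lost message
    slaveLost("slave-2")  // unknown id: handled gracefully, no exception
  }
}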