Skip to content
Snippets Groups Projects
Commit a4041dd8 authored by Charles Reiss's avatar Charles Reiss
Browse files

Log duplicate slaveLost() calls in ClusterScheduler.

parent fa9df4a4
No related branches found
No related tags found
No related merge requests found
...@@ -254,14 +254,20 @@ private[spark] class ClusterScheduler(val sc: SparkContext) ...@@ -254,14 +254,20 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
synchronized { synchronized {
val host = slaveIdToHost(slaveId) val host = slaveIdToHost(slaveId)
if (hostsAlive.contains(host)) { if (hostsAlive.contains(host)) {
logError("Lost an executor on " + host + ": " + reason)
slaveIdsWithExecutors -= slaveId slaveIdsWithExecutors -= slaveId
hostsAlive -= host hostsAlive -= host
activeTaskSetsQueue.foreach(_.hostLost(host)) activeTaskSetsQueue.foreach(_.hostLost(host))
failedHost = Some(host) failedHost = Some(host)
} else {
// We may get multiple slaveLost() calls with different loss reasons. For example, one
// may be triggered by a dropped connection from the slave while another may be a report
// of executor termination from Mesos. We produce log messages for both so we eventually
// report the termination reason.
logError("Lost an executor on " + host + " (already removed): " + reason)
} }
} }
if (failedHost != None) { if (failedHost != None) {
logError("Lost an executor on " + failedHost.get + ": " + reason)
listener.hostLost(failedHost.get) listener.hostLost(failedHost.get)
backend.reviveOffers() backend.reviveOffers()
} }
......
0% Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment