Skip to content
Snippets Groups Projects
Commit 28dcbb53 authored by Davies Liu's avatar Davies Liu Committed by Josh Rosen
Browse files

[SPARK-2898] [PySpark] fix bugs in daemon.py

1. do not use a signal handler for SIGCHLD; it can easily cause deadlock
2. handle EINTR during accept()
3. pass errno into JVM
4. handle EAGAIN during fork()

Now, it can pass 50k tasks tests in 180 seconds.

Author: Davies Liu <davies.liu@gmail.com>

Closes #1842 from davies/qa and squashes the following commits:

f0ea451 [Davies Liu] fix lint
03a2e8c [Davies Liu] clean up dead children every second
32cb829 [Davies Liu] fix lint
0cd0817 [Davies Liu] fix bugs in daemon.py
parent 1d03a26a
No related branches found
No related tags found
No related merge requests found
...@@ -68,7 +68,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String ...@@ -68,7 +68,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
val socket = new Socket(daemonHost, daemonPort) val socket = new Socket(daemonHost, daemonPort)
val pid = new DataInputStream(socket.getInputStream).readInt() val pid = new DataInputStream(socket.getInputStream).readInt()
if (pid < 0) { if (pid < 0) {
throw new IllegalStateException("Python daemon failed to launch worker") throw new IllegalStateException("Python daemon failed to launch worker with code " + pid)
} }
daemonWorkers.put(socket, pid) daemonWorkers.put(socket, pid)
socket socket
......
...@@ -22,7 +22,8 @@ import select ...@@ -22,7 +22,8 @@ import select
import socket import socket
import sys import sys
import traceback import traceback
from errno import EINTR, ECHILD import time
from errno import EINTR, ECHILD, EAGAIN
from socket import AF_INET, SOCK_STREAM, SOMAXCONN from socket import AF_INET, SOCK_STREAM, SOMAXCONN
from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN
from pyspark.worker import main as worker_main from pyspark.worker import main as worker_main
...@@ -80,6 +81,17 @@ def worker(sock): ...@@ -80,6 +81,17 @@ def worker(sock):
os._exit(compute_real_exit_code(exit_code)) os._exit(compute_real_exit_code(exit_code))
# Cleanup zombie children
def cleanup_dead_children():
    """Reap all dead worker processes (zombies) without blocking.

    Called periodically from the manager's select loop rather than from a
    SIGCHLD handler, because reaping inside a signal handler can deadlock.
    Best-effort: returns silently once there is nothing left to reap.
    """
    try:
        while True:
            # waitpid(0, WNOHANG) polls any child in our process group;
            # it returns pid 0 when no child has exited yet.
            pid, _ = os.waitpid(0, os.WNOHANG)
            if not pid:
                break
    except OSError:
        # ECHILD (no children at all) or EINTR -- nothing to do.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; os.waitpid only raises OSError.
        pass
def manager(): def manager():
# Create a new process group to corral our children # Create a new process group to corral our children
os.setpgid(0, 0) os.setpgid(0, 0)
...@@ -102,29 +114,21 @@ def manager(): ...@@ -102,29 +114,21 @@ def manager():
signal.signal(SIGTERM, handle_sigterm) # Gracefully exit on SIGTERM signal.signal(SIGTERM, handle_sigterm) # Gracefully exit on SIGTERM
signal.signal(SIGHUP, SIG_IGN) # Don't die on SIGHUP signal.signal(SIGHUP, SIG_IGN) # Don't die on SIGHUP
# Cleanup zombie children
def handle_sigchld(*args):
try:
pid, status = os.waitpid(0, os.WNOHANG)
if status != 0:
msg = "worker %s crashed abruptly with exit status %s" % (pid, status)
print >> sys.stderr, msg
except EnvironmentError as err:
if err.errno not in (ECHILD, EINTR):
raise
signal.signal(SIGCHLD, handle_sigchld)
# Initialization complete # Initialization complete
sys.stdout.close() sys.stdout.close()
try: try:
while True: while True:
try: try:
ready_fds = select.select([0, listen_sock], [], [])[0] ready_fds = select.select([0, listen_sock], [], [], 1)[0]
except select.error as ex: except select.error as ex:
if ex[0] == EINTR: if ex[0] == EINTR:
continue continue
else: else:
raise raise
# cleanup in signal handler will cause deadlock
cleanup_dead_children()
if 0 in ready_fds: if 0 in ready_fds:
try: try:
worker_pid = read_int(sys.stdin) worker_pid = read_int(sys.stdin)
...@@ -137,29 +141,41 @@ def manager(): ...@@ -137,29 +141,41 @@ def manager():
pass # process already died pass # process already died
if listen_sock in ready_fds: if listen_sock in ready_fds:
sock, addr = listen_sock.accept() try:
sock, _ = listen_sock.accept()
except OSError as e:
if e.errno == EINTR:
continue
raise
# Launch a worker process # Launch a worker process
try: try:
pid = os.fork() pid = os.fork()
if pid == 0: except OSError as e:
listen_sock.close() if e.errno in (EAGAIN, EINTR):
try: time.sleep(1)
worker(sock) pid = os.fork() # error here will shutdown daemon
except:
traceback.print_exc()
os._exit(1)
else:
os._exit(0)
else: else:
outfile = sock.makefile('w')
write_int(e.errno, outfile) # Signal that the fork failed
outfile.flush()
outfile.close()
sock.close() sock.close()
continue
except OSError as e:
print >> sys.stderr, "Daemon failed to fork PySpark worker: %s" % e if pid == 0:
outfile = os.fdopen(os.dup(sock.fileno()), "a+", 65536) # in child process
write_int(-1, outfile) # Signal that the fork failed listen_sock.close()
outfile.flush() try:
outfile.close() worker(sock)
except:
traceback.print_exc()
os._exit(1)
else:
os._exit(0)
else:
sock.close() sock.close()
finally: finally:
shutdown(1) shutdown(1)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment