diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6b7b8d8fa52aa6f9690bacd6b855e0476a1a3c95..8fb2694c7a72bb8aeeaef99036e12382bfaa8e64 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -112,7 +112,7 @@ mysql-39: diffcov: script: - - git config --global --add safe.directory /builds/mailman/mailman + - git config --global --add safe.directory "/builds/${CI_PROJECT_NAMESPACE}/mailman" - git config safe.directory - tox -e py37-diffcov only: diff --git a/src/mailman/bin/master.py b/src/mailman/bin/master.py index aceeb23cc4bb391a22b3009d6e7ecd9c051599f8..5a4816d3edc285ff2e521cfadc9b188958bd463d 100644 --- a/src/mailman/bin/master.py +++ b/src/mailman/bin/master.py @@ -257,6 +257,7 @@ class Loop: """Main control loop class.""" def __init__(self, lock=None, restartable=None, config_file=None): + self._log = logging.getLogger('mailman.runner') self._lock = lock self._restartable = restartable self._config_file = config_file @@ -264,7 +265,6 @@ class Loop: def install_signal_handlers(self): """Install various signals handlers for control from the master.""" - log = logging.getLogger('mailman.runner') # Set up our signal handlers. Also set up a SIGALRM handler to # refresh the lock once per day. The lock lifetime is 1 day + 6 hours # so this should be plenty. @@ -273,32 +273,64 @@ class Loop: signal.alarm(SECONDS_IN_A_DAY) signal.signal(signal.SIGALRM, sigalrm_handler) signal.alarm(SECONDS_IN_A_DAY) - # SIGHUP tells the runners to close and reopen their log files. - def sighup_handler(signum, frame): # noqa: E306 - reopen() - for pid in self._kids: + signal.signal(signal.SIGHUP, self._sighup_handler) + signal.signal(signal.SIGUSR1, self._sigusr1_handler) + signal.signal(signal.SIGTERM, self._sigterm_handler) + signal.signal(signal.SIGINT, self._sigint_handler) + + def _sighup_handler(self, signum, frame): + """Handles SIGHUP. + + SIGHUP tells the runners to close and reopen their log files. + """ + reopen() + for pid in self._kids: + try: os.kill(pid, signal.SIGHUP) - log.info('Master watcher caught SIGHUP. Re-opening log files.') - signal.signal(signal.SIGHUP, sighup_handler) - # SIGUSR1 is used by 'mailman restart'. - def sigusr1_handler(signum, frame): # noqa: E306 - for pid in self._kids: + except ProcessLookupError: # pragma: nocover + pass + self._log.info('Master watcher caught SIGHUP. Re-opening log files.') + + def _sigusr1_handler(self, signum, frame): + """Handles SIGUSR1. + + SIGUSR1 is used by 'mailman restart'. + """ + for pid in self._kids: + try: os.kill(pid, signal.SIGUSR1) - log.info('Master watcher caught SIGUSR1. Exiting.') - signal.signal(signal.SIGUSR1, sigusr1_handler) - # SIGTERM is what init will kill this process with when changing run - # levels. It's also the signal 'mailman stop' uses. - def sigterm_handler(signum, frame): # noqa: E306 - for pid in self._kids: + except ProcessLookupError: # pragma: nocover + pass + self._log.info('Master watcher caught SIGUSR1. Restarting.') + + def _sigterm_handler(self, signum, frame): + """Handles SIGTERM. + + SIGTERM is what init will kill this process with when changing + run levels. It's also the signal 'mailman stop' uses. + """ + # Clear the flag so that we won't try to restart the runners. + self._restartable = False + for pid in self._kids: + try: os.kill(pid, signal.SIGTERM) - log.info('Master watcher caught SIGTERM. Exiting.') - signal.signal(signal.SIGTERM, sigterm_handler) - # SIGINT is what control-C gives. - def sigint_handler(signum, frame): # noqa: E306 - for pid in self._kids: + except ProcessLookupError: # pragma: nocover + pass + self._log.info('Master watcher caught SIGTERM. Exiting.') + + def _sigint_handler(self, signum, frame): + """Handles SIGINT. + + SIGINT is what control-C gives. + """ + # Clear the flag so that we won't try to restart the runners. + self._restartable = False + for pid in self._kids: + try: os.kill(pid, signal.SIGINT) - log.info('Master watcher caught SIGINT. Restarting.') - signal.signal(signal.SIGINT, sigint_handler) + except ProcessLookupError: # pragma: nocover + pass + self._log.info('Master watcher caught SIGINT. Exiting.') def _start_runner(self, spec): """Start a runner. @@ -392,8 +424,14 @@ class Loop: """Sleep until a signal is received.""" # Sleep until a signal is received. This prevents the master from # exiting immediately even if there are no runners (as happens in the - # test suite). + # test suite). We install a handler for SIGCHLD so that pause(2) + # will return control to us. + signal.signal(signal.SIGCHLD, + lambda sig, stack: None) # pragma: nocover signal.pause() + # Note: We cannot set the signal handler to SIG_IGN again, + # because since POSIX.1-2001 this changes the semantics of + # wait(2) to not return until all children have exited. def loop(self): """Main loop. @@ -434,11 +472,20 @@ class Loop: # runaway restarts (e.g. if the subprocess had a syntax error!) rname, slice_number, count, restarts = self._kids.pop(pid) config_name = 'runner.' + rname - restart = False - if why == signal.SIGUSR1 and self._restartable: # pragma: nocover - restart = True + restart = self._restartable + # Don't restart if the runner explicitly asks not to be + # restarted. + if why == signal.SIGTERM: + # Note: See bin/runner.py and core/runner.py where + # this signaling is used. + restart = False # Have we hit the maximum number of restarts? - restarts += 1 + if not why == signal.SIGUSR1: + # Note: Explicit restarts using SIGUSR1 should not be + # counted here, because max_restarts only limits the + # number of automated restarts (see + # mailman/config/schema.cfg). + restarts += 1 max_restarts = int(getattr(config, config_name).max_restarts) if restarts > max_restarts: restart = False diff --git a/src/mailman/bin/tests/test_master.py b/src/mailman/bin/tests/test_master.py index e5e9b203ce4e0e295ce2915f11cbe202e48a0483..092c337c9c398ee8f9fe6b023755c383c5e3ebb4 100644 --- a/src/mailman/bin/tests/test_master.py +++ b/src/mailman/bin/tests/test_master.py @@ -18,6 +18,8 @@ """Test master watcher utilities.""" import os +import time +import signal import tempfile import unittest @@ -29,6 +31,7 @@ from importlib_resources import path from io import StringIO from mailman.bin import master from mailman.config import config +from mailman.testing.helpers import LogFileMark, TestableMaster from mailman.testing.layers import ConfigLayer from unittest.mock import patch @@ -135,3 +138,216 @@ Exiting. # We created a non-restartable loop. start_mock.assert_called_once_with([('in', 1, 1)]) loop_mock.assert_called_once_with() + + def test_sighup_handler(self): + """Invokes the SIGHUP handler. + + This sends SIGHUPs to the runners. Unfortunately, there is no + easy way to test whether that actually happens. We'd need a + specialized runner for that. + + """ + m = TestableMaster() + mark = LogFileMark('mailman.runner') + m.start('command') + self.assertEqual(len(list(m._kids)), 1) + + # We need to give the runner some time to install the signal + # handler. If we send SIGHUP before it does, the default + # action is to terminate the process. We wait until we see + # that the runner has done so by inspecting the log file. + # This is race free, and bounded in time. + start = time.time() + while ("runner started." not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + + mark = LogFileMark('mailman.runner') + m._sighup_handler(None, None) + + # Check if the runner reopened the log. + start = time.time() + needle = "command runner caught SIGHUP. Reopening logs." + while (needle not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + self.assertIn(needle, mark.read()) + + # Just to make sure it didn't die. + self.assertEqual(len(list(m._kids)), 1) + m.stop() + + def test_sigusr1_handler(self): + """Invokes the SIGUSR1 handler. + + We then check whether the runners restart. + """ + m = TestableMaster() + m._restartable = True + mark = LogFileMark('mailman.runner') + m.start('command') + kids = list(m._kids) + self.assertEqual(len(kids), 1) + old_kid = kids[0] + + # We need to give the runner some time to install the signal + # handler. If we send SIGUSR1 before it does, the default + # action is to terminate the process. We wait until we see + # that the runner has done so by inspecting the log file. + # This is race free, and bounded in time. + start = time.time() + while ("runner started." not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + + # Invoke the handler in a loop. This is race free, and + # bounded in time. + start = time.time() + mark = LogFileMark('mailman.runner') + while old_kid in set(m._kids) and time.time() - start < 10: + # We must not send signals in rapid succession, because + # the behavior of signals arriving while the process is in + # the signal handler varies. Linux implements System V + # semantics, which means the default signal action is + # restored for the duration of the signal handler. In + # this case it means to terminate the process. + time.sleep(1) + m._sigusr1_handler(None, None) + + # Check if the runner got the signal. + start = time.time() + needle = "command runner caught SIGUSR1. Stopping." + while (needle not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + self.assertIn(needle, mark.read()) + + new_kids = list(m._kids) + self.assertEqual(len(new_kids), 1) + self.assertTrue(kids[0] != new_kids[0]) + m._restartable = False + for pid in new_kids: + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass + m.thread.join() + m.cleanup() + + def test_sigterm_handler(self): + """Invokes the SIGTERM handler. + + We then check whether the runners are actually stopped. + """ + m = TestableMaster() + mark = LogFileMark('mailman.runner') + m.start('command') + kids = list(m._kids) + self.assertEqual(len(kids), 1) + old_kid = kids[0] + + # We need to give the runner some time to install the signal + # handler. If we send SIGTERM before it does, the default + # action is to terminate the process and will return a + # slightly different status code. We wait until we see that + # the runner has done so by inspecting the log file. This is + # race free, and bounded in time. + start = time.time() + while ("runner started." not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + + # Invoke the handler in a loop. This is race free, and + # bounded in time. + start = time.time() + mark = LogFileMark('mailman.runner') + while old_kid in set(m._kids) and time.time() - start < 10: + time.sleep(0.1) + m._sigterm_handler(None, None) + + # Check if the runner got the signal. + start = time.time() + needle = "command runner caught SIGTERM. Stopping." + while (needle not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + self.assertIn(needle, mark.read()) + + m.thread.join() + self.assertEqual(len(list(m._kids)), 0) + m.cleanup() + + def test_sigint_handler(self): + """Invokes the SIGINT handler. + + We then check whether the runners are actually stopped. + """ + m = TestableMaster() + mark = LogFileMark('mailman.runner') + m.start('command') + self.assertEqual(len(list(m._kids)), 1) + + kids = list(m._kids) + self.assertEqual(len(kids), 1) + old_kid = kids[0] + + # We need to give the runner some time to install the signal + # handler. If we send SIGINT before it does, the default + # action is to terminate the process and will return a + # slightly different status code. We wait until we see that + # the runner has done so by inspecting the log file. This is + # race free, and bounded in time. + start = time.time() + while ("runner started." not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + + # Invoke the handler in a loop. This is race free, and + # bounded in time. + start = time.time() + mark = LogFileMark('mailman.runner') + while old_kid in set(m._kids) and time.time() - start < 10: + time.sleep(0.1) + m._sigint_handler(None, None) + + # Check if the runner got the signal. + start = time.time() + needle = "command runner caught SIGINT. Stopping." + while (needle not in mark.read() + and time.time() - start < 10): + time.sleep(0.1) + self.assertIn(needle, mark.read()) + + m.thread.join() + self.assertEqual(len(list(m._kids)), 0) + m.cleanup() + + def test_runner_restart_on_sigkill(self): + """Kills a runner with SIGKILL and see if it restarted.""" + m = TestableMaster() + m._restartable = True + m.start('command') + kids = list(m._kids) + self.assertEqual(len(kids), 1) + old_kid = kids[0] + + # Kill it. No need to wait for anything as this cannot be + # caught anyway. + os.kill(old_kid, signal.SIGKILL) + + # But, we need to wait for the master to collect it. + start = time.time() + while old_kid in set(m._kids) and time.time() - start < 10: + time.sleep(0.1) + + new_kids = list(m._kids) + self.assertEqual(len(new_kids), 1) + self.assertTrue(old_kid != new_kids[0]) + m._restartable = False + for pid in new_kids: + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass + m.thread.join() + m.cleanup() diff --git a/src/mailman/docs/NEWS.rst b/src/mailman/docs/NEWS.rst index 98bb1e05f8c023806b03587c1f7553356c5ff69f..ac218c0938b66bbe8d3ff4bc32e9bb5d422a994d 100644 --- a/src/mailman/docs/NEWS.rst +++ b/src/mailman/docs/NEWS.rst @@ -24,6 +24,8 @@ Bugs fixed ---------- * ``config.mta.remove_dkim_headers`` now applies to messages to -owner. (Closes #1053) +* The master process now robustly detects when runner processes + terminate and restarts them (Closes #898). .. _news-3.3.8: diff --git a/src/mailman/testing/layers.py b/src/mailman/testing/layers.py index 84e5735ec27966147afa1596a6cbf306dabe9691..6ea9f67f9e62ca7c0a34a6bb4b2a19580a5bbccc 100644 --- a/src/mailman/testing/layers.py +++ b/src/mailman/testing/layers.py @@ -162,12 +162,16 @@ class ConfigLayer(MockAndMonkeyLayer): # debugging because it's just too verbose. Unfortunately, if you # do want that level of debugging you currently have to manually # modify this conditional. - if cls.stderr and sub_name != 'database': + if sub_name != 'database': test_config += expand(dedent(""" [logging.$name] - propagate: yes - level: debug - """), None, dict(name=sub_name, path=path)) + path: $path + $extra_verbose + """), None, dict(name=sub_name, path=path, + extra_verbose=dedent(""" + propagate: yes + level: debug + """) if cls.stderr else "")) # The root logger will already have a handler, but it's not the right # handler. Remove that and set our own. if cls.stderr: