Fix infinite hang in wait_for_process_to_die() on Windows

The ninja generator uses wait_for_process_to_die() to wait for a previous scons daemon process to terminate. It previously had 3 separate implemementations: one using psutil if that module was not available, plus a fallback implementation for Windows, and one for non-Windows platforms. I was encountering problems with the fallback implementation on Windows hanging forever, and not being interruptible even with Ctrl-C. Apparently the win32 OpenProcess() function can return a valid process handle even for already terminated processes. This change adds an extra call to GetExitCodeProcess() to avoid infinitely looping if the process has in fact already exited. I also added a timeout so that this function will eventually fail rather than hanging forever if a process does not exit. I also removed the psutil implementation: it seemed simpler to me to avoid having multiple separate implementations with different behaviors. This appeared to be the only place in scons that depends on psutil outside of tests. Also note that this wait_for_process_to_die() function is only used by the ninja tool. I have added unit tests checking the behavior of wait_for_process_to_die(). Without the `GetExitCodeProcess()` check added by this PR, the simple code in `test_wait_for_process_to_die_success()` would hang forever.
author: Adam Simpkins <adam@adamsimpkins.net> 2025-02-23 19:42:04 (GMT)
committer: Adam Simpkins <adam@adamsimpkins.net> 2025-02-23 22:15:39 (GMT)
commit: a6af8994c21c9b499f438574ac74d0749bfdac61 (patch)
tree: bd41693355879b13cd5cacbe1c2a2bd571ea1466
parent: a6abb0fff2e8b836dfd58efec752def52faf16b8 (diff)
download: SCons-a6af8994c21c9b499f438574ac74d0749bfdac61.zip
SCons-a6af8994c21c9b499f438574ac74d0749bfdac61.tar.gz
SCons-a6af8994c21c9b499f438574ac74d0749bfdac61.tar.bz2
3 files changed, 73 insertions, 31 deletions
diff --git a/SCons/Tool/ninja_tool/NinjaState.py b/SCons/Tool/ninja_tool/NinjaState.py
index ef11b50..be64a13 100644
--- a/SCons/Tool/ninja_tool/NinjaState.py
+++ b/SCons/Tool/ninja_tool/NinjaState.py
@@ -694,9 +694,8 @@ class NinjaState:
                         pass
 
                 # wait for the server process to fully killed
-                # TODO: update wait_for_process_to_die() to handle timeout and then catch exception
-                #       here and do something smart.
-                wait_for_process_to_die(pid)
+                # TODO: catch TimeoutException here and possibly do something smart.
+                wait_for_process_to_die(pid, timeout=10)
 
             if os.path.exists(scons_daemon_dirty):
                 os.unlink(scons_daemon_dirty)
diff --git a/SCons/Util/UtilTests.py b/SCons/Util/UtilTests.py
index 0f457c3..57ea2c2 100644
--- a/SCons/Util/UtilTests.py
+++ b/SCons/Util/UtilTests.py
@@ -27,6 +27,7 @@ import functools
 import hashlib
 import io
 import os
+import subprocess
 import sys
 import unittest
 import unittest.mock
@@ -73,6 +74,7 @@ from SCons.Util import (
     to_String,
     to_bytes,
     to_str,
+    wait_for_process_to_die,
 )
 from SCons.Util.envs import is_valid_construction_var
 from SCons.Util.hashes import (
@@ -847,6 +849,27 @@ bling
         s4 = silent_intern("spam")
         assert id(s1) == id(s4)
 
+    def test_wait_for_process_to_die_success(self) -> None:
+        cmd = [sys.executable, "-c", ""]
+        p = subprocess.Popen(cmd)
+        p.wait()
+        wait_for_process_to_die(p.pid, timeout=10.0)
+
+    def test_wait_for_process_to_die_timeout(self) -> None:
+        # Run a python script that will keep running until we close it's stdin
+        cmd = [sys.executable, "-c", "import sys; data = sys.stdin.read()"]
+        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+
+        # wait_for_process_to_die() should time out while the process is running
+        with self.assertRaises(TimeoutError):
+            wait_for_process_to_die(p.pid, timeout=0.2)
+
+        p.stdin.close()
+        p.wait()
+
+        # wait_for_process_to_die() should complete normally now
+        wait_for_process_to_die(p.pid, timeout=10.0)
+
 
 class HashTestCase(unittest.TestCase):
 
diff --git a/SCons/Util/__init__.py b/SCons/Util/__init__.py
index 0398c1f..5bd84dd 100644
--- a/SCons/Util/__init__.py
+++ b/SCons/Util/__init__.py
@@ -1297,36 +1297,56 @@ def print_time():
     return print_time
 
 
-def wait_for_process_to_die(pid) -> None:
+def wait_for_process_to_die(pid: int, timeout: float = 60.0) -> None:
     """
-    Wait for specified process to die, or alternatively kill it
-    NOTE: This function operates best with psutil pypi package
-    TODO: Add timeout which raises exception
+    Wait for the specified process to die.
+
+    A TimeoutError will be thrown if the timeout is hit before the process terminates.
+    Specifying a negative timeout will cause wait_for_process_to_die() to wait
+    indefinitely, and never throw TimeoutError.
     """
-    # wait for the process to fully killed
-    try:
-        import psutil  # pylint: disable=import-outside-toplevel
-        while True:
-            if pid not in [proc.pid for proc in psutil.process_iter()]:
-                break
-            time.sleep(0.1)
-    except ImportError:
-        # if psutil is not installed we can do this the hard way
-        while True:
-            if sys.platform == 'win32':
-                import ctypes  # pylint: disable=import-outside-toplevel
-                PROCESS_QUERY_INFORMATION = 0x1000
-                processHandle = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION, 0, pid)
-                if processHandle == 0:
-                    break
-                ctypes.windll.kernel32.CloseHandle(processHandle)
-                time.sleep(0.1)
-            else:
-                try:
-                    os.kill(pid, 0)
-                except OSError:
-                    break
-                time.sleep(0.1)
+    start_time = time.time()
+    while True:
+        if not _is_process_alive(pid):
+            break
+        if timeout >= 0.0 and time.time() - start_time > timeout:
+            raise TimeoutError(f"timed out waiting for process {pid}")
+        time.sleep(0.1)
+
+
+if sys.platform == 'win32':
+    def _is_process_alive(pid: int) -> bool:
+        import ctypes  # pylint: disable=import-outside-toplevel
+        PROCESS_QUERY_INFORMATION = 0x1000
+        STILL_ACTIVE = 259
+
+        processHandle = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION, 0, pid)
+        if processHandle == 0:
+            return False
+
+        # OpenProcess() may successfully return a handle even for terminated
+        # processes when something else in the system is still holding a
+        # reference to their handle.  Call GetExitCodeProcess() to check if the
+        # process has already exited.
+        try:
+            exit_code = ctypes.c_ulong()
+            success = ctypes.windll.kernel32.GetExitCodeProcess(
+                    processHandle, ctypes.byref(exit_code))
+            if success:
+                return exit_code.value == STILL_ACTIVE
+        finally:
+            ctypes.windll.kernel32.CloseHandle(processHandle)
+
+        return True
+
+else:
+    def _is_process_alive(pid: int) -> bool:
+        try:
+            os.kill(pid, 0)
+            return True
+        except OSError:
+            return False
+
 
 # From: https://stackoverflow.com/questions/1741972/how-to-use-different-formatters-with-the-same-logging-handler-in-python
 class DispatchingFormatter(Formatter):
author	Adam Simpkins <adam@adamsimpkins.net>	2025-02-23 19:42:04 (GMT)
committer	Adam Simpkins <adam@adamsimpkins.net>	2025-02-23 22:15:39 (GMT)
commit	a6af8994c21c9b499f438574ac74d0749bfdac61 (patch)
tree	bd41693355879b13cd5cacbe1c2a2bd571ea1466
parent	a6abb0fff2e8b836dfd58efec752def52faf16b8 (diff)
download	SCons-a6af8994c21c9b499f438574ac74d0749bfdac61.zip SCons-a6af8994c21c9b499f438574ac74d0749bfdac61.tar.gz SCons-a6af8994c21c9b499f438574ac74d0749bfdac61.tar.bz2