Skip to content

Commit 4379fc6

Browse files
scottmeyerostk.ai
authored and committed
ignore: fix parallel walker hang when visitor panics
When a visitor callback panics during parallel directory traversal, the panic unwinds past Worker::run() without signaling other workers to stop. Those workers then spin indefinitely in their work-stealing sleep loop because:

1. active_workers is never decremented for the panicked thread
2. quit_now is never set
3. The sleep loop in get_work() never checks quit_now

This manifests as the process hanging with threads stuck in nanosleep(2), as reported in #3009.

The fix has three parts:

- Add a WorkerPanicGuard (Drop guard) to Worker::run() that sets quit_now on unwind, ensuring all sibling workers see the signal.
- Add a quit_now check to the inner sleep loop in get_work(), so workers blocked waiting for work break out immediately when a sibling signals termination.
- Replace handle.join().unwrap() with graceful panic collection: join all threads first, then resume_unwind with the first panic payload. This prevents the join loop from aborting before all threads have exited.

This also partially mitigates #2761 (nanosleep spin when a thread is blocked on a FIFO), since the quit_now check in the sleep loop provides an escape hatch that was previously missing.

Closes #3009
1 parent 4519153 commit 4379fc6

File tree

1 file changed

+90
-1
lines changed

1 file changed

+90
-1
lines changed

crates/ignore/src/walk.rs

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1424,8 +1424,18 @@ impl WalkParallel {
14241424
})
14251425
.map(|worker| s.spawn(|| worker.run()))
14261426
.collect();
1427+
let mut panic_payload = None;
14271428
for handle in handles {
1428-
handle.join().unwrap();
1429+
if let Err(payload) = handle.join() {
1430+
// Capture the first panic but keep joining the rest,
1431+
// so all threads get a chance to clean up.
1432+
if panic_payload.is_none() {
1433+
panic_payload = Some(payload);
1434+
}
1435+
}
1436+
}
1437+
if let Some(payload) = panic_payload {
1438+
std::panic::resume_unwind(payload);
14291439
}
14301440
});
14311441
}
@@ -1622,17 +1632,52 @@ struct Worker<'s> {
16221632
filter: Option<Filter>,
16231633
}
16241634

1635+
/// Drop guard that makes the parallel walker shut down cleanly even when a
/// worker thread panics.
///
/// Without this guard, a panic raised by a visitor callback would unwind out
/// of `Worker::run` without ever telling the sibling workers to stop, leaving
/// them spinning indefinitely in their work-stealing loop.
///
/// Known issues addressed by this guard:
/// - Workers hanging when a visitor panics (gh-3009)
/// - Workers spinning in nanosleep when a sibling is stuck (gh-2761)
struct WorkerPanicGuard {
    quit_now: Arc<AtomicBool>,
    /// Flipped to `true` just before `Worker::run()` returns normally.
    /// If this is still `false` when the guard drops, the owning thread
    /// is unwinding from a panic.
    defused: bool,
}

impl Drop for WorkerPanicGuard {
    fn drop(&mut self) {
        // A defused guard means run() completed normally; nothing to do.
        if self.defused {
            return;
        }
        // We are unwinding from a panic: broadcast termination to every
        // worker. We intentionally do NOT touch active_workers here: the
        // termination protocol in get_work() handles its own accounting,
        // and the quit_now flag takes priority over the active worker
        // count (see the check at the top of the get_work loop). Raising
        // the flag alone is sufficient to break all workers out of both
        // their work loop and their sleep loop.
        self.quit_now.store(true, AtomicOrdering::SeqCst);
    }
}
1664+
16251665
impl<'s> Worker<'s> {
16261666
/// Runs this worker until there is no more work left to do.
16271667
///
16281668
/// The worker will call the caller's callback for all entries that aren't
16291669
/// skipped by the ignore matcher.
16301670
fn run(mut self) {
1671+
let mut guard = WorkerPanicGuard {
1672+
quit_now: self.quit_now.clone(),
1673+
defused: false,
1674+
};
16311675
while let Some(work) = self.get_work() {
16321676
if let WalkState::Quit = self.run_one(work) {
16331677
self.quit_now();
16341678
}
16351679
}
1680+
guard.defused = true;
16361681
}
16371682

16381683
fn run_one(&mut self, mut work: Work) -> WalkState {
@@ -1836,6 +1881,12 @@ impl<'s> Worker<'s> {
18361881
}
18371882
// Wait for next `Work` or `Quit` message.
18381883
loop {
1884+
if self.is_quit_now() {
1885+
// A sibling worker has signaled termination
1886+
// (possibly via panic). Break out immediately.
1887+
self.send_quit();
1888+
return None;
1889+
}
18391890
if let Some(v) = self.recv() {
18401891
self.activate_worker();
18411892
value = Some(v);
@@ -2491,4 +2542,42 @@ mod tests {
24912542
&["x", "x/y", "x/y/foo"],
24922543
);
24932544
}
2545+
2546+
/// Test that the parallel walker terminates cleanly when a visitor
2547+
/// panics, instead of hanging indefinitely. This is a regression test
2548+
/// for <https://github.com/BurntSushi/ripgrep/issues/3009>.
2549+
#[test]
2550+
fn parallel_visitor_panic_does_not_hang() {
2551+
use std::sync::atomic::{AtomicUsize, Ordering};
2552+
2553+
let td = tmpdir();
2554+
// Create enough entries to ensure multiple workers get work.
2555+
for i in 0..100 {
2556+
wfile(td.path().join(format!("file{i}")), "");
2557+
}
2558+
2559+
let visited = Arc::new(AtomicUsize::new(0));
2560+
let result = std::panic::catch_unwind(|| {
2561+
let visited = visited.clone();
2562+
let mut builder = WalkBuilder::new(td.path());
2563+
builder.threads(4);
2564+
builder.build_parallel().run(|| {
2565+
let visited = visited.clone();
2566+
Box::new(move |_| {
2567+
let n = visited.fetch_add(1, Ordering::SeqCst);
2568+
if n == 5 {
2569+
panic!("intentional panic in visitor");
2570+
}
2571+
WalkState::Continue
2572+
})
2573+
});
2574+
});
2575+
2576+
// The walker should have terminated (not hung), and the panic
2577+
// should have propagated.
2578+
assert!(result.is_err(), "expected panic to propagate");
2579+
// We should have visited at least a few entries before the panic
2580+
// killed everything.
2581+
assert!(visited.load(Ordering::SeqCst) >= 5);
2582+
}
24942583
}

0 commit comments

Comments
 (0)