laforge submitted this change.
bankd: Avoid osmocom logging mutex deadlock in signal handling
The main thread communicates slotmap add + delete via POSIX signals
to the worker threads. As those signals interrupt the normal
processing of the worker thread, they might get delivered while the
thread is already logging something, causing a deadlock. This has
been observed in the real world in the following stack trace (where it's
actually two nested signals):
Thread 45 (Thread 0x7fa014ff96c0 (LWP 620753) "osmo-remsim-ban"):
#0 futex_wait (private=0, expected=2, futex_word=0x7fa0a992b360 <osmo_log_tgt_mutex>) at ../sysdeps/nptl/futex-internal.h:146
#1 __GI___lll_lock_wait (futex=futex@entry=0x7fa0a992b360 <osmo_log_tgt_mutex>, private=0) at ./nptl/lowlevellock.c:49
#2 0x00007fa0a9730482 in lll_mutex_lock_optimized (mutex=0x7fa0a992b360 <osmo_log_tgt_mutex>) at ./nptl/pthread_mutex_lock.c:48
#3 ___pthread_mutex_lock (mutex=0x7fa0a992b360 <osmo_log_tgt_mutex>) at ./nptl/pthread_mutex_lock.c:93
#4 0x00007fa0a98d878d in log_tgt_mutex_lock_impl () from /usr/local/lib/libosmocore.so.22
#5 0x00007fa0a98db9a1 in log_check_level () from /usr/local/lib/libosmocore.so.22
#6 0x0000559db6278e31 in handle_sig_mapadd (sig=<optimized out>) at bankd_main.c:558
#7 <signal handler called>
#8 0x00007fa0a9701548 in __vfprintf_internal (s=s@entry=0x7fa014fe3a50, format=format@entry=0x7fa0a991905a "%ld ", ap=ap@entry=0x7fa014fe3bd0, mode_flags=mode_flags@entry=0) at ./stdio-common/vfprintf-internal.c:983
#9 0x00007fa0a9722758 in __vsnprintf_internal (string=0x7fa014fe3eb0 "", maxlen=<optimized out>, format=0x7fa0a991905a "%ld ", args=args@entry=0x7fa014fe3bd0, mode_flags=mode_flags@entry=0) at ./libio/vsnprintf.c:114
#10 0x00007fa0a96fcca2 in __GI___snprintf (s=<optimized out>, maxlen=<optimized out>, format=<optimized out>) at ./stdio-common/snprintf.c:31
#11 0x00007fa0a98d91b4 in _output_buf () from /usr/local/lib/libosmocore.so.22
#12 0x00007fa0a98d9def in _output () from /usr/local/lib/libosmocore.so.22
#13 0x00007fa0a98da0ef in osmo_vlogp () from /usr/local/lib/libosmocore.so.22
#14 0x00007fa0a98da2bb in logp2 () from /usr/local/lib/libosmocore.so.22
#15 0x0000559db62790ba in handle_sig_mapdel (sig=35) at bankd_main.c:546
#16 <signal handler called>
#17 0x00007fa0a97a025f in __GI___poll (fds=0x7fa014fe5650, nfds=1, timeout=-1) at ../sysdeps/unix/sysv/linux/poll.c:29
#18 0x00007fa0a9892d53 in ?? () from /lib/x86_64-linux-gnu/libpcsclite.so.1
#19 0x00007fa0a988fd9e in SCardTransmit () from /lib/x86_64-linux-gnu/libpcsclite.so.1
#20 0x0000559db627c9ef in pcsc_transceive (worker=0x559dbd5c5750, out=<optimized out>, out_len=<optimized out>, in=<optimized out>, in_len=0x7fa014fe57b8) at bankd_pcsc.c:319
#21 0x0000559db627a479 in worker_handle_tpduModemToCard (pdu=0x7fa050003a50, worker=0x559dbd5c5750) at bankd_main.c:826
#22 worker_handle_rspro (pdu=0x7fa050003a50, worker=0x559dbd5c5750) at bankd_main.c:892
#23 worker_transceive_loop (worker=0x559dbd5c5750) at bankd_main.c:1005
#24 0x0000559db627aff2 in worker_main (arg=<optimized out>) at bankd_main.c:1079
#25 0x00007fa0a972d1f5 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#26 0x00007fa0a97ad8dc in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
As a hot-fix, let's avoid logging from the handle_sig_map{del,add}()
functions at all, making them safe against a deadlock around this mutex.
We should decide how to proceed in general with potentially some
architectural changes later on; any such changes are not suitable as a
hot fix due to their potential of introducing other regressions.
Change-Id: I5ea32886dfaf624b4dc5ad7924941c7b904c1d36
Related: SYS#7930
---
M src/bankd/bankd_main.c
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/src/bankd/bankd_main.c b/src/bankd/bankd_main.c
index 1adc34c..5e33b41 100644
--- a/src/bankd/bankd_main.c
+++ b/src/bankd/bankd_main.c
@@ -521,9 +521,10 @@
static int worker_send_rspro(struct bankd_worker *worker, RsproPDU_t *pdu);
-static void worker_set_state(struct bankd_worker *worker, enum bankd_worker_state new_state)
+static void worker_set_state(struct bankd_worker *worker, enum bankd_worker_state new_state, bool quiet)
{
- LOGW(worker, "Changing state to %s\n", get_value_string(worker_state_names, new_state));
+ if (!quiet)
+ LOGW(worker, "Changing state to %s\n", get_value_string(worker_state_names, new_state));
worker->state = new_state;
worker->timeout = 0;
}
@@ -540,25 +541,26 @@
/* signal handler for receiving SIGMAPDEL from main thread */
static void handle_sig_mapdel(int sig)
{
- LOGW(g_worker, "SIGMAPDEL received: Main thread informs us our map is gone\n");
+ /* DO NOT LOG ANYTHING HERE, IT WILL DEADLOCK WITH THE osmo_log_tgt_mutex */
OSMO_ASSERT(sig == SIGMAPDEL);
if (g_worker->state >= BW_ST_CONN_CLIENT_MAPPED) {
g_worker->slot.bank_id = 0xffff;
g_worker->slot.slot_nr = 0xffff;
- worker_set_state(g_worker, BW_ST_CONN_CLIENT_UNMAPPED);
+ worker_set_state(g_worker, BW_ST_CONN_CLIENT_UNMAPPED, true);
}
}
/* signal handler for receiving SIGMAPADD from main thread */
static void handle_sig_mapadd(int sig)
{
- LOGW(g_worker, "SIGMAPADD received\n");
+ /* DO NOT LOG ANYTHING HERE, IT WILL DEADLOCK WITH THE osmo_log_tgt_mutex */
/* do nothing */
}
static void handle_sig_usr1(int sig)
{
OSMO_ASSERT(sig == SIGUSR1);
+ /* FIXME: WE SHOULD NOT LOG ANYTHING HERE, IT WILL DEADLOCK WITH THE osmo_log_tgt_mutex */
if (pthread_equal(g_bankd->main, pthread_self())) {
struct bankd_worker *worker;
@@ -612,7 +614,7 @@
if (rc < 0)
return rc;
- worker_set_state(worker, BW_ST_CONN_CLIENT_MAPPED_CARD);
+ worker_set_state(worker, BW_ST_CONN_CLIENT_MAPPED_CARD, false);
/* FIXME: notify client about this state change */
return 0;
@@ -763,7 +765,7 @@
}
worker->client.clslot.client_id = pdu->msg.choice.connectClientReq.clientSlot->clientId;
worker->client.clslot.slot_nr = pdu->msg.choice.connectClientReq.clientSlot->slotNr;
- worker_set_state(worker, BW_ST_CONN_CLIENT);
+ worker_set_state(worker, BW_ST_CONN_CLIENT, false);
if (worker_try_slotmap(worker) >= 0)
res = ResultCode_ok;
@@ -1041,7 +1043,7 @@
g_worker = (struct bankd_worker *) arg;
- worker_set_state(g_worker, BW_ST_INIT);
+ worker_set_state(g_worker, BW_ST_INIT, false);
/* not permitted in multithreaded environment */
talloc_disable_null_tracking();
@@ -1065,7 +1067,7 @@
g_worker->client.peer_addr_len = sizeof(g_worker->client.peer_addr);
- worker_set_state(g_worker, BW_ST_ACCEPTING);
+ worker_set_state(g_worker, BW_ST_ACCEPTING, false);
/* first wait for an incoming TCP connection */
rc = accept(g_worker->bankd->accept_fd, (struct sockaddr *) &g_worker->client.peer_addr,
&g_worker->client.peer_addr_len);
@@ -1075,7 +1077,7 @@
g_worker->client.fd = rc;
worker_client_addrstr(buf, sizeof(buf), g_worker);
LOGW(g_worker, "Accepted connection from %s\n", buf);
- worker_set_state(g_worker, BW_ST_CONN_WAIT_ID);
+ worker_set_state(g_worker, BW_ST_CONN_WAIT_ID, false);
/* run the main worker transceive loop body until there was some error */
while (1) {
To view, visit change 42300. To unsubscribe, or for help writing mail filters, visit settings.