dexter has uploaded this change for review. (
https://gerrit.osmocom.org/c/libosmo-abis/+/32374 )
Change subject: e1d: reconnect to osmo-e1d after connection loss
......................................................................
e1d: reconnect to osmo-e1d after connection loss
When osmo-e1d crashes while a line is in use, the client process not only
loses its connection, it may also get stuck in an endless loop that spams
the logfile. The reason is that the e1d driver in libosmo-abis lacks a
mechanism to detect when the connection to osmo-e1d is lost.
When osmo-e1d goes down, the effects should be similar to those of a
normal connection loss of an E1 line (cable pulled). So let's add an FSM
that takes care of recovering the connection when it is lost. Also make
sure that no havoc ensues when the connection is lost.
Related: OS#5983
Change-Id: Iaf4d42c2f009b1d7666e319fabdeb2598aa0b338
---
M src/input/e1d.c
1 file changed, 256 insertions(+), 13 deletions(-)
git pull ssh://gerrit.osmocom.org:29418/libosmo-abis refs/changes/74/32374/1
diff --git a/src/input/e1d.c b/src/input/e1d.c
index 15169f1..9c9b4ab 100644
--- a/src/input/e1d.c
+++ b/src/input/e1d.c
@@ -29,6 +29,7 @@
#include <osmocom/core/bits.h>
#include <osmocom/core/logging.h>
+#include <osmocom/core/fsm.h>
#include <osmocom/vty/vty.h>
@@ -41,15 +42,198 @@
#define TS_SIGN_ALLOC_SIZE 300
+/* Bit mask helper for FSM event/state masks; unsigned so the shift never
+ * operates on a signed value */
+#define S(x) (1U << (x))
struct osmo_e1dp_client *g_e1d = NULL;
+/* FSM instance supervising the connection towards osmo-e1d; allocated
+ * lazily the first time it is found to be NULL */
+struct osmo_fsm_inst *g_e1d_fsm_inst = NULL;
static int invertbits = 1;
/* pre-declaration */
extern struct e1inp_driver e1d_driver;
static int e1d_want_write(struct e1inp_ts *e1i_ts);
+static int e1d_fd_cb(struct osmo_fd *bfd, unsigned int what);
+static int e1d_line_update(struct e1inp_line *line);
+/* Flag array, indexed by e1inp line number, to remember which lines are
+ * handled by osmo-e1d. File-local: only this driver reads or writes it. */
+static bool lines[256];
+
+/* States of the osmo-e1d client connection FSM */
+enum fsm_e1d_client_states {
+	ST_DISCONNECTED,	/* no connection to osmo-e1d */
+	ST_CONNECTED,		/* connection to osmo-e1d is established */
+};
+
+/* Events understood by the osmo-e1d client connection FSM */
+enum fsm_e1d_client_evt {
+	EV_CONN_LOST,	/* the connection to osmo-e1d was found to be broken */
+	EV_CONNECT,	/* (re)try to connect to osmo-e1d */
+};
+
+/* Event number to name mapping, wired into fsm_e1d_client.event_names */
+static const struct value_string fsm_e1d_client_evt_names[] = {
+	{ EV_CONN_LOST, "EV_CONN_LOST" },
+	{ EV_CONNECT, "EV_CONNECT" },
+	{ 0, NULL }
+};
+
+/* Timer callback of the e1d client FSM: a state timer has expired, so
+ * trigger another connection attempt by dispatching EV_CONNECT.
+ * Always returns 0 (per the libosmocore FSM API this keeps the instance
+ * alive; a non-zero value would request its termination). */
+static int fsm_e1_client_timer_cb(struct osmo_fsm_inst *fi)
+{
+	/* Act on the instance whose timer fired rather than on the global pointer */
+	osmo_fsm_inst_dispatch(fi, EV_CONNECT, NULL);
+	return 0;
+}
+
+/* Action callback for ST_DISCONNECTED.
+ *
+ * On EV_CONNECT the osmo-e1d client is created unless g_e1d already exists.
+ * If that fails, the state is re-entered with a timeout of 1 so that the
+ * FSM timer callback can trigger a retry; on success the FSM moves on to
+ * ST_CONNECTED. Any other event is a programming error and asserts.
+ *
+ * \param[in] fi FSM instance the event was dispatched to.
+ * \param[in] event event number (EV_CONNECT expected).
+ * \param[in] data unused. */
+static void fsm_e1d_client_disconnected_cb(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+	switch (event) {
+	case EV_CONNECT:
+		if (!g_e1d) {
+			g_e1d = osmo_e1dp_client_create(NULL, "/tmp/osmo-e1d.ctl");
+			if (!g_e1d) {
+				LOGPFSML(fi, LOGL_ERROR, "Unable to (re)connect to osmo-e1d daemon, retrying...\n");
+				/* Re-enter the same state only to (re)arm the retry timer */
+				osmo_fsm_inst_state_chg(fi, ST_DISCONNECTED, 1, 0);
+				return;
+			}
+		}
+
+		LOGPFSML(fi, LOGL_NOTICE, "Sucessfully (re)connected to osmo-e1d daemon!\n");
+		osmo_fsm_inst_state_chg(fi, ST_CONNECTED, 0, 0);
+		break;
+	default:
+		OSMO_ASSERT(false);
+		break;
+	}
+}
+
+/* Tear down the timeslot file descriptors of one e1d line: every timeslot
+ * fd that is currently in use gets unregistered (if registered), closed and
+ * reset to 0. */
+static void loose_line(struct e1inp_line *line)
+{
+	int ts;
+
+	/* Only lines owned by the e1d driver may ever end up in our private memory */
+	OSMO_ASSERT(line->driver == &e1d_driver);
+
+	for (ts = 1; ts < line->num_ts; ts++) {
+		struct e1inp_ts *e1i_ts = &line->ts[ts - 1];
+		struct osmo_fd *bfd = &e1i_ts->driver.e1d.fd;
+
+		/* Timeslots without an open file descriptor need no cleanup */
+		if (bfd->fd <= 0)
+			continue;
+
+		LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "Loosing osmo-e1d connection to timeslot: %d\n", ts);
+
+		/* Unregister only if the fd is actually linked into a list */
+		if (bfd->list.next != NULL && bfd->list.next != LLIST_POISON1)
+			osmo_fd_unregister(bfd);
+
+		close(bfd->fd);
+		bfd->fd = 0;
+	}
+}
+
+/* onenter callback of ST_CONNECTED: run a line update on every line that is
+ * flagged in lines[] in order to re-establish lost connections. Failures
+ * are logged, the remaining lines are still processed. */
+static void fsm_e1d_client_connected_onenter_cb(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+	unsigned int line_nr;
+
+	for (line_nr = 0; line_nr < ARRAY_SIZE(lines); line_nr++) {
+		struct e1inp_line *line;
+
+		if (!lines[line_nr])
+			continue;
+
+		line = e1inp_line_find(line_nr);
+		if (line == NULL) {
+			/* The line is gone (this should not happen); stop tracking it */
+			lines[line_nr] = false;
+			continue;
+		}
+
+		if (e1d_line_update(line) < 0)
+			LOGPFSML(fi, LOGL_ERROR, "Line update failed after (re)connecting to osmo-e1d daemon!\n");
+	}
+}
+
+/* Action callback for ST_CONNECTED: on EV_CONN_LOST destroy the e1d client,
+ * tear down the timeslots of every line flagged in lines[] and fall back to
+ * ST_DISCONNECTED with a timeout of 1. Any other event asserts. */
+static void fsm_e1d_client_connected_cb(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+	unsigned int line_nr;
+
+	/* EV_CONN_LOST is the only event handled in this state */
+	if (event != EV_CONN_LOST) {
+		OSMO_ASSERT(false);
+		return;
+	}
+
+	LOGPFSML(fi, LOGL_ERROR, "Lost connection to osmo-e1d daemon!\n");
+
+	/* Destroy the e1d client */
+	if (g_e1d != NULL) {
+		osmo_e1dp_client_destroy(g_e1d);
+		g_e1d = NULL;
+	}
+
+	/* Lose all lines at once */
+	for (line_nr = 0; line_nr < ARRAY_SIZE(lines); line_nr++) {
+		struct e1inp_line *line;
+
+		if (!lines[line_nr])
+			continue;
+
+		line = e1inp_line_find(line_nr);
+		if (line == NULL) {
+			/* The line is gone (this should not happen); stop tracking it */
+			lines[line_nr] = false;
+			continue;
+		}
+
+		loose_line(line);
+	}
+
+	osmo_fsm_inst_state_chg(fi, ST_DISCONNECTED, 1, 0);
+}
+
+/* Tell whether the connection FSM exists and is in ST_CONNECTED.
+ * Logs an error and returns false otherwise. */
+static bool e1d_connected(void)
+{
+	if (g_e1d_fsm_inst == NULL || g_e1d_fsm_inst->state != ST_CONNECTED) {
+		LOGPFSML(g_e1d_fsm_inst, LOGL_ERROR, "No connection to osmo-e1d daemon!\n");
+		return false;
+	}
+
+	return true;
+}
+
+/* State table of the osmo-e1d client connection FSM */
+static const struct osmo_fsm_state fsm_e1d_client_states[] = {
+
+	/* No connection to osmo-e1d. EV_CONNECT triggers a connection
+	 * attempt; the state may be re-entered (see out_state_mask) in
+	 * order to arm a retry timer after a failed attempt. */
+	[ST_DISCONNECTED] = {
+		.in_event_mask = S(EV_CONNECT),
+		.out_state_mask = S(ST_CONNECTED) | S(ST_DISCONNECTED),
+		.name = OSMO_STRINGIFY(ST_DISCONNECTED),
+		.action = fsm_e1d_client_disconnected_cb,
+	},
+
+	/* The connection to osmo-e1d is up. On entering this state the line
+	 * updates are (re)run; EV_CONN_LOST leads back to ST_DISCONNECTED. */
+	[ST_CONNECTED] = {
+		.in_event_mask = S(EV_CONN_LOST),
+		.out_state_mask = S(ST_DISCONNECTED),
+		.name = OSMO_STRINGIFY(ST_CONNECTED),
+		.action = fsm_e1d_client_connected_cb,
+		.onenter = fsm_e1d_client_connected_onenter_cb,
+	},
+
+};
+
+/* FSM definition tying together the states, event names and timer callback
+ * of the osmo-e1d client connection handling */
+static struct osmo_fsm fsm_e1d_client = {
+	.name = "e1d_client",
+	.log_subsys = DLINP,
+	.event_names = fsm_e1d_client_evt_names,
+	.num_states = ARRAY_SIZE(fsm_e1d_client_states),
+	.states = fsm_e1d_client_states,
+	.timer_cb = fsm_e1_client_timer_cb,
+};
static int handle_ts_sign_read(struct osmo_fd *bfd)
{
@@ -66,6 +250,7 @@
if (ret < 1) {
LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "%s read error: %d %s\n", __func__, ret,
ret < 0 ? strerror(errno) : "bytes read");
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, line);
return ret;
}
@@ -158,6 +343,7 @@
if (ret < 0 || ret != D_TSX_ALLOC_SIZE) {
LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "%s read error: %d %s\n", __func__, ret,
ret < 0 ? strerror(errno) : "bytes read");
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, line);
return ret;
}
@@ -206,8 +392,10 @@
osmo_revbytebits_buf(msg->data, msg->len);
ret = write(bfd->fd, msg->data, msg->len);
- if (ret < msg->len)
+ if (ret < msg->len) {
LOGPITS(e1i_ts, DLINP, LOGL_DEBUG, "send returns %d instead of %d\n", ret,
msg->len);
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, line);
+ }
msgb_free(msg);
return ret;
@@ -229,6 +417,7 @@
LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "%s read error: %d %s\n", __func__, ret,
ret < 0 ? strerror(errno) : "bytes read");
msgb_free(msg);
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, line);
return ret;
}
@@ -263,8 +452,10 @@
LOGPITS(e1i_ts, DLMIB, LOGL_DEBUG, "HDLC CHAN TX: %s\n",
osmo_hexdump(msg->data, msg->len));
ret = write(bfd->fd, msg->data, msg->len);
- if (ret < msg->len)
+ if (ret < msg->len) {
LOGPITS(e1i_ts, DLINP, LOGL_NOTICE, "send returns %d instead of %d\n", ret,
msg->len);
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, line);
+ }
msgb_free(msg);
return ret;
@@ -288,6 +479,7 @@
if (ret < 0) {
LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "%s read error: %d %s\n", __func__, ret,
strerror(errno));
msgb_free(msg);
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, line);
return ret;
}
@@ -308,10 +500,17 @@
struct e1inp_ts *e1i_ts = &line->ts[ts_nr-1];
int ret;
+ if (!e1d_connected()) {
+ msgb_free(msg);
+ return;
+ }
+
ret = write(bfd->fd, msg->data, msg->len);
msgb_free(msg);
- if (ret < 0)
+ if (ret < 0) {
LOGPITS(e1i_ts, DLMI, LOGL_NOTICE, "%s write failed %d\n", __func__, ret);
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, line);
+ }
}
static int e1d_fd_cb(struct osmo_fd *bfd, unsigned int what)
@@ -322,6 +521,9 @@
struct e1inp_ts *e1i_ts = &line->ts[idx];
int ret = 0;
+ if (!e1d_connected())
+ return -EIO;
+
switch (e1i_ts->type) {
case E1INP_TS_TYPE_SIGN:
if (what & OSMO_FD_READ)
@@ -359,6 +561,9 @@
static int e1d_want_write(struct e1inp_ts *e1i_ts)
{
+ if (!e1d_connected())
+ return -EIO;
+
/* We never include the DAHDI B-Channel FD into the writeset */
if (e1i_ts->type == E1INP_TS_TYPE_TRAU ||
e1i_ts->type == E1INP_TS_TYPE_I460) {
@@ -386,22 +591,33 @@
if (line->driver != &e1d_driver)
return -EINVAL;
- if (!g_e1d) {
- /* Connect to daemon */
- g_e1d = osmo_e1dp_client_create(NULL, "/tmp/osmo-e1d.ctl");
- if (!g_e1d) {
- LOGPIL(line, DLINP, LOGL_ERROR, "Unable to connect to osmo-e1d daemon\n");
- return -EPIPE;
- }
+ /* Memorize that osmo-e1d is responsible for this line */
+ lines[line->num] = true;
+
+ /* Spawn FSM to handle connection towards osmo-e1d */
+ if (!g_e1d_fsm_inst) {
+ g_e1d_fsm_inst = osmo_fsm_inst_alloc(&fsm_e1d_client, NULL, NULL, LOGL_DEBUG,
"fsm_e1d_client");
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONNECT, NULL);
+ OSMO_ASSERT(g_e1d_fsm_inst);
}
- LOGPIL(line, DLINP, LOGL_NOTICE, "Line update %d %d=E1D(%d:%d) %d\n",
line->num, line->port_nr,
+ /* In case no connection to osmo-e1d is available, we may postpone the line update until
the connection is
+ * available (again) */
+ if (!e1d_connected()) {
+ LOGPIL(line, DLINP, LOGL_NOTICE, "No connection to osmo-e1d daemon, postponing
Line update: %d %d=E1D(%d:%d) %d\n", line->num, line->port_nr,
+ e1d_intf, e1d_line, line->num_ts);
+ return 0;
+ }
+ LOGPIL(line, DLINP, LOGL_NOTICE, "Line update: %d %d=E1D(%d:%d) %d\n",
line->num, line->port_nr,
e1d_intf, e1d_line, line->num_ts);
ret = osmo_e1dp_client_ts_query(g_e1d, &ts_info, &num_ts_info, e1d_intf,
e1d_line, E1DP_INVALID);
if (ret < 0) {
- LOGPIL(line, DLINP, LOGL_ERROR, "Cannot query E1D for timeslot information:
%d\n", ret);
- return -EIO;
+ LOGPIL(line, DLINP, LOGL_ERROR, "Cannot query E1D for timeslot information: %d,
postpoining line update\n", ret);
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, NULL);
+ /* Since we have mechanisms in place that allow us to postpone the line update until
the connection
+ * to osmo-e1d is up again, we may pretend that the line update went ok. */
+ return 0;
}
for (ts=1; ts<line->num_ts; ts++)
@@ -445,6 +661,7 @@
LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "Could not open timeslot %d\n", ts);
talloc_free(ts_info);
bfd->fd = 0;
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, NULL);
return -EIO;
}
bfd->when = OSMO_FD_READ;
@@ -476,6 +693,7 @@
LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "Could not open timeslot %d\n", ts);
talloc_free(ts_info);
bfd->fd = 0;
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, NULL);
return -EIO;
}
bfd->when = OSMO_FD_READ;
@@ -501,6 +719,7 @@
LOGPITS(e1i_ts, DLINP, LOGL_ERROR, "Could not open timeslot %d\n", ts);
talloc_free(ts_info);
bfd->fd = 0;
+ osmo_fsm_inst_dispatch(g_e1d_fsm_inst, EV_CONN_LOST, NULL);
return -EIO;
}
bfd->when = OSMO_FD_READ;
@@ -530,6 +749,9 @@
int e1inp_e1d_init(void)
{
+	int rc;
+
+	/* The connection FSM must be registered before any instance is allocated */
+	rc = osmo_fsm_register(&fsm_e1d_client);
+	OSMO_ASSERT(rc == 0);
+
+	/* Start out with no line being tracked as handled by osmo-e1d */
+	memset(lines, 0, sizeof(lines));
+
/* register the driver with the core */
return e1inp_driver_register(&e1d_driver);
}
--
To view, visit
https://gerrit.osmocom.org/c/libosmo-abis/+/32374
To unsubscribe, or for help writing mail filters, visit
https://gerrit.osmocom.org/settings
Gerrit-Project: libosmo-abis
Gerrit-Branch: master
Gerrit-Change-Id: Iaf4d42c2f009b1d7666e319fabdeb2598aa0b338
Gerrit-Change-Number: 32374
Gerrit-PatchSet: 1
Gerrit-Owner: dexter <pmaier(a)sysmocom.de>
Gerrit-MessageType: newchange