jtavares has uploaded this change for review. ( https://gerrit.osmocom.org/c/osmo-remsim/+/30138 )
Change subject: rspro_client: implement re-establish delay ......................................................................
rspro_client: implement re-establish delay
- Add a new SRVC_ST_REESTABLISH_DELAY state whose delay is taken from the table k_reestablish_delay_s[], which implements a simple exponential-like back-off with an upper bound.
- Add a new function srvc_do_reestablish(), which is used to initiate a re-establish and to apply the appropriate delay, if any.
- Take external delays (such as the TCP connect() delay) into account, so that a re-establish attempt is not penalized twice.
- Reset the delay to the shortest possible value if no re-establish has been initiated for a long time (more than 2x our longest delay). This allows fast reconnects even if a delay was used for the previous connect.
- Addresses issues https://osmocom.org/issues/5348 and https://osmocom.org/issues/5610
Change-Id: I86cdc3ba37482e6577b429194d273a2399f32208 --- M src/rspro_client_fsm.c M src/rspro_client_fsm.h 2 files changed, 118 insertions(+), 12 deletions(-)
git pull ssh://gerrit.osmocom.org:29418/osmo-remsim refs/changes/38/30138/1
diff --git a/src/rspro_client_fsm.c b/src/rspro_client_fsm.c index bd267ca..da39a09 100644 --- a/src/rspro_client_fsm.c +++ b/src/rspro_client_fsm.c @@ -20,6 +20,7 @@ #include <stdint.h> #include <string.h> #include <errno.h> +#include <unistd.h>
#include <talloc.h>
@@ -40,6 +41,25 @@ #define T1_WAIT_CLIENT_CONN_RES 10 #define T2_RECONNECT 10
+static const int k_reestablish_delay_s[] = { + 0, 0, 0, // 3 immediate retries + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 Hz for 30 seconds + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 1/2 hz for 1 minute + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 1/4 Hz for 2 minutes + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 1/8 Hz for 4 minutes + 16, // 1/16 Hz thereafter +}; + +#define REESTABLISH_DELAY_COUNT sizeof(k_reestablish_delay_s)/sizeof(k_reestablish_delay_s[0]) + /*********************************************************************** * client-side FSM for a RSPRO connection to remsim-server * @@ -101,7 +121,9 @@ SRVC_ST_ESTABLISHED, /* server connection established, ClientConnect succeeded */ SRVC_ST_CONNECTED, - /* connection lost, we're waiting for a re-establish */ + /* connection lost, 1st step: delaying until we re-establish */ + SRVC_ST_REESTABLISH_DELAY, + /* connection lost, 2nd step: wait for a re-establish */ SRVC_ST_REESTABLISH, };
@@ -191,6 +213,46 @@ .wait_for_resp = 10, };
+static int64_t get_monotonic_ms() +{ + struct timespec t; + clock_gettime(CLOCK_BOOTTIME, &t); + return ((1000LL * t.tv_sec) + (t.tv_nsec / 1000000)); +} + +static void srvc_do_reestablish(struct osmo_fsm_inst *fi) +{ + struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv; + + const int64_t since_last_ms = get_monotonic_ms() - srvc->reestablish_last_ms; + + /* reset delay loop if it has been > 2x the longest timeout since our last attempt; + * this lets us revert to rapid reconnect behavior for a good connection */ + const int64_t reset_ms = 2*1000*(OSMO_MAX(OSMO_MAX(T1_WAIT_CLIENT_CONN_RES, T2_RECONNECT), + k_reestablish_delay_s[REESTABLISH_DELAY_COUNT-1])); + + if (since_last_ms > reset_ms) { + srvc->reestablish_delay_idx = 0; + LOGPFSML(fi, LOGL_NOTICE, "->REESTABLISH_DELAY reset; %" PRId64 "ms since last attempt\n", + since_last_ms); + } + + /* determine if we need to delay reestablishment */ + const int64_t need_ms = k_reestablish_delay_s[srvc->reestablish_delay_idx] * 1000; + int64_t delay_ms = need_ms - since_last_ms; + + if (delay_ms > 0) { + LOGPFSML(fi, LOGL_NOTICE, "->REESTABLISH_DELAY delay %" PRId64 "ms; %" PRId64 "ms since last attempt [step %zu/%zu@%ds]\n", + delay_ms, since_last_ms, srvc->reestablish_delay_idx, (REESTABLISH_DELAY_COUNT-1), + k_reestablish_delay_s[srvc->reestablish_delay_idx]); + } else { + /* cheat and always use a minimum delay of 1ms to ensure a fsm timeout is triggered */ + delay_ms = 1; + } + + osmo_fsm_inst_state_chg_ms(fi, SRVC_ST_REESTABLISH_DELAY, delay_ms, 3); +} + static void srvc_st_init(struct osmo_fsm_inst *fi, uint32_t event, void *data) { switch (event) { @@ -224,7 +286,7 @@ switch (event) { case SRVC_E_TCP_DOWN: case SRVC_E_KA_TIMEOUT: - osmo_fsm_inst_state_chg(fi, SRVC_ST_REESTABLISH, T2_RECONNECT, 2); + srvc_do_reestablish(fi); break; case SRVC_E_CLIENT_CONN_RES: pdu = data; @@ -260,7 +322,7 @@ switch (event) { case SRVC_E_TCP_DOWN: case SRVC_E_KA_TIMEOUT: - osmo_fsm_inst_state_chg(fi, 
SRVC_ST_REESTABLISH, T2_RECONNECT, 2); + srvc_do_reestablish(fi); break; case SRVC_E_RSPRO_TX: pdu = data; @@ -286,10 +348,9 @@ return 0; /* we will explicitly terminate it */ }
-static void srvc_st_reestablish_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state) +static void srvc_st_reestablish_delay_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state) { struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv; - int rc;
if (srvc->keepalive_fi) { ipa_keepalive_fsm_stop(srvc->keepalive_fi); @@ -303,6 +364,27 @@ ipa_client_conn_destroy(srvc->conn); srvc->conn = NULL; } + + /* saturate timeout at last (longest) entry */ + if (srvc->reestablish_delay_idx < REESTABLISH_DELAY_COUNT-1) { + srvc->reestablish_delay_idx++; + } +} + +static void srvc_st_reestablish_delay(struct osmo_fsm_inst *fi, uint32_t event, void *data) +{ + switch (event) { + default: + OSMO_ASSERT(0); + } +} +static void srvc_st_reestablish_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state) +{ + struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv; + int rc; + + srvc->reestablish_last_ms = get_monotonic_ms(); + LOGPFSML(fi, LOGL_INFO, "Creating TCP connection to server at %s:%u\n", srvc->server_host, srvc->server_port); srvc->conn = ipa_client_conn_create2(fi, NULL, 0, NULL, 0, srvc->server_host, srvc->server_port, @@ -351,7 +433,10 @@
switch (event) { case SRVC_E_ESTABLISH: - osmo_fsm_inst_state_chg(fi, SRVC_ST_REESTABLISH, T2_RECONNECT, 2); + /* reset delay connect immediately on our first connection */ + srvc->reestablish_delay_idx = 0; + srvc->reestablish_last_ms = 0; + srvc_do_reestablish(fi); break; case SRVC_E_DISCONNECT: if (srvc->keepalive_fi) { @@ -377,10 +462,14 @@ struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv;
switch (fi->T) { - case 2: - /* TCP reconnect failed: retry */ + case 3: + /* delay has expired; let's re-establish */ osmo_fsm_inst_state_chg(fi, SRVC_ST_REESTABLISH, T2_RECONNECT, 2); break; + case 2: + /* TCP reconnect failed: retry after wait */ + srvc_do_reestablish(fi); + break; case 1: /* no ClientConnectRes received: disconnect + reconnect */ ipa_client_conn_close(srvc->conn); @@ -397,28 +486,35 @@ [SRVC_ST_INIT] = { .name = "INIT", .in_event_mask = 0, /* S(SRVC_E_ESTABLISH) via allstate */ - .out_state_mask = S(SRVC_ST_INIT) | S(SRVC_ST_REESTABLISH), + .out_state_mask = S(SRVC_ST_INIT) | S(SRVC_ST_REESTABLISH_DELAY), .action = srvc_st_init, }, [SRVC_ST_ESTABLISHED] = { .name = "ESTABLISHED", .in_event_mask = S(SRVC_E_TCP_DOWN) | S(SRVC_E_KA_TIMEOUT) | S(SRVC_E_CLIENT_CONN_RES), - .out_state_mask = S(SRVC_ST_CONNECTED) | S(SRVC_ST_REESTABLISH) | S(SRVC_ST_INIT), + .out_state_mask = S(SRVC_ST_CONNECTED) | S(SRVC_ST_REESTABLISH_DELAY) | S(SRVC_ST_INIT), .action = srvc_st_established, .onenter = srvc_st_established_onenter, }, [SRVC_ST_CONNECTED] = { .name = "CONNECTED", .in_event_mask = S(SRVC_E_TCP_DOWN) | S(SRVC_E_KA_TIMEOUT) | S(SRVC_E_RSPRO_TX), - .out_state_mask = S(SRVC_ST_REESTABLISH) | S(SRVC_ST_INIT), + .out_state_mask = S(SRVC_ST_REESTABLISH_DELAY) | S(SRVC_ST_INIT), .action = srvc_st_connected, .onenter = srvc_st_connected_onenter, .onleave = srvc_st_connected_onleave, }, + [SRVC_ST_REESTABLISH_DELAY] = { + .name = "REESTABLISH_DELAY", + .in_event_mask = 0, + .out_state_mask = S(SRVC_ST_REESTABLISH) | S(SRVC_ST_INIT), + .action = srvc_st_reestablish_delay, + .onenter = srvc_st_reestablish_delay_onenter, + }, [SRVC_ST_REESTABLISH] = { .name = "REESTABLISH", .in_event_mask = S(SRVC_E_TCP_UP) | S(SRVC_E_TCP_DOWN), - .out_state_mask = S(SRVC_ST_ESTABLISHED) | S(SRVC_ST_REESTABLISH) | S(SRVC_ST_INIT), + .out_state_mask = S(SRVC_ST_ESTABLISHED) | S(SRVC_ST_REESTABLISH_DELAY) | S(SRVC_ST_INIT), .action = srvc_st_reestablish, .onenter = 
srvc_st_reestablish_onenter, }, @@ -444,6 +540,9 @@ return -1;
srvc->fi = fi; + srvc->reestablish_delay_idx = 0; + srvc->reestablish_last_ms = 0; + return 0; }
diff --git a/src/rspro_client_fsm.h b/src/rspro_client_fsm.h index 029fcd9..55fe4a4 100644 --- a/src/rspro_client_fsm.h +++ b/src/rspro_client_fsm.h @@ -26,6 +26,13 @@ struct osmo_fsm_inst *fi; struct osmo_fsm_inst *keepalive_fi; int (*handle_rx)(struct rspro_server_conn *conn, const RsproPDU_t *pdu); + + /* index into k_reestablish_delay_s[] for this connection; saturates at the last (longest) entry */ + size_t reestablish_delay_idx; + + /* CLOCK_BOOTTIME timestamp of the last re-establish attempt, in milliseconds */ + int64_t reestablish_last_ms; + /* IPA protocol identity */ struct ipaccess_unit ipa_dev;