osmith has uploaded this change for review. ( https://gerrit.osmocom.org/c/osmo-bsc/+/30252 )
Change subject: lchan: try to recover from ST_BORKEN after timeout
......................................................................
lchan: try to recover from ST_BORKEN after timeout
Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
---
M include/osmocom/bsc/lchan.h
M include/osmocom/bsc/lchan_fsm.h
M src/osmo-bsc/lchan_fsm.c
M src/osmo-bsc/net_init.c
M tests/timer.vty
5 files changed, 110 insertions(+), 0 deletions(-)
git pull ssh://gerrit.osmocom.org:29418/osmo-bsc refs/changes/52/30252/1
diff --git a/include/osmocom/bsc/lchan.h b/include/osmocom/bsc/lchan.h
index 4fcfa20..7f25acf 100644
--- a/include/osmocom/bsc/lchan.h
+++ b/include/osmocom/bsc/lchan.h
@@ -359,6 +359,8 @@
/* Timestamps and markers to track active state duration. */
struct timespec active_start;
struct timespec active_stored;
+ /* How many times we tried to recover from ST_BORKEN, gets reset to 0 on success */
+ uint8_t borken_recovery_attempts;
};
#define GSM_LCHAN_SI(lchan, i) (void *)((lchan)->si.buf[i][0])
diff --git a/include/osmocom/bsc/lchan_fsm.h b/include/osmocom/bsc/lchan_fsm.h
index cc231dc..f47c417 100644
--- a/include/osmocom/bsc/lchan_fsm.h
+++ b/include/osmocom/bsc/lchan_fsm.h
@@ -33,6 +33,8 @@
LCHAN_ST_WAIT_RF_RELEASE_ACK,
LCHAN_ST_WAIT_AFTER_ERROR,
LCHAN_ST_BORKEN,
+ LCHAN_ST_RECOVER_WAIT_ACTIV_ACK, /*!< Attempt to recover from BORKEN: first try to activate the lchan */
+ LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK, /*!< Attempt to recover from BORKEN: then try to release it */
};
enum lchan_fsm_event {
diff --git a/src/osmo-bsc/lchan_fsm.c b/src/osmo-bsc/lchan_fsm.c
index b1cef5d..f7ceec2 100644
--- a/src/osmo-bsc/lchan_fsm.c
+++ b/src/osmo-bsc/lchan_fsm.c
@@ -43,6 +43,8 @@
#include <osmocom/bsc/bsc_stats.h>
#include <osmocom/bsc/lchan.h>
+#define BORKEN_RECOVERY_ATTEMPTS_MAX 5
+
static struct osmo_fsm lchan_fsm;
struct gsm_lchan *lchan_fi_lchan(struct osmo_fsm_inst *fi)
@@ -291,6 +293,9 @@
[LCHAN_ST_WAIT_AFTER_ERROR] = { .T=-3111 },
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = { .T=-13 },
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = { .T=-14 },
+ [LCHAN_ST_BORKEN] = { .T=-15 },
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = { .T=-6 },
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = { .T=3111 },
};
/* Transition to a state, using the T timer defined in lchan_fsm_timeouts.
@@ -337,6 +342,8 @@
[LCHAN_ST_BORKEN] = LCHAN_ST_BORKEN,
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = LCHAN_ST_BORKEN,
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = LCHAN_ST_BORKEN,
};
#define lchan_fail(fmt, args...) lchan_fail_to(lchan_fsm_on_error[fi->state], fmt, ##
args)
@@ -1494,6 +1501,10 @@
if (prev_state != LCHAN_ST_BORKEN)
osmo_stat_item_inc(osmo_stat_item_group_get_item(bts->bts_statg,
BTS_STAT_LCHAN_BORKEN), 1);
+ if (lchan->borken_recovery_attempts == BORKEN_RECOVERY_ATTEMPTS_MAX)
+ LOG_LCHAN(lchan, LOGL_ERROR, "Reached BORKEN_RECOVERY_ATTEMPTS_MAX=%d, giving
up\n",
+ BORKEN_RECOVERY_ATTEMPTS_MAX);
+
/* The actual action besides all the beancounting above */
lchan_reset(lchan);
chan_counts_ts_update(lchan->ts);
@@ -1552,6 +1563,69 @@
}
}
+/* onenter handler of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK: begin one recovery attempt for a BORKEN lchan.
+ * Increments lchan->borken_recovery_attempts, logs the attempt number against
+ * BORKEN_RECOVERY_ATTEMPTS_MAX, then calls rsl_tx_chan_activ() for the lchan.
+ * If rsl_tx_chan_activ() returns non-zero, lchan_fail() is invoked; for this state
+ * lchan_fsm_on_error[] selects LCHAN_ST_BORKEN as the failure target.
+ * \param[in] fi  FSM instance of the lchan.
+ * \param[in] prev_state  State the FSM came from; not evaluated here. */
+static void lchan_fsm_recover_wait_activ_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+	int rc;
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+	/* Count the attempt up front, so that a failed Tx below is counted as well. */
+	lchan->borken_recovery_attempts++;
+	LOG_LCHAN(lchan, LOGL_NOTICE, "attempting to recover from BORKEN lchan (%d/%d)\n",
+		  lchan->borken_recovery_attempts, BORKEN_RECOVERY_ATTEMPTS_MAX);
+
+	rc = rsl_tx_chan_activ(lchan, RSL_ACT_INTRA_NORM_ASS, 0);
+	if (rc)
+		lchan_fail("Tx Chan Activ failed: %s (%d)", strerror(-rc), rc);
+}
+
+/* Event handler of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK.
+ * Both LCHAN_EV_RSL_CHAN_ACTIV_ACK and LCHAN_EV_RSL_CHAN_ACTIV_NACK lead on to
+ * LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK; the NACK case is additionally logged.
+ * Any other event hits OSMO_ASSERT(false).
+ * \param[in] fi  FSM instance of the lchan.
+ * \param[in] event  The lchan FSM event being dispatched.
+ * \param[in] data  Event data; not evaluated here. */
+static void lchan_fsm_recover_wait_activ_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+	switch (event) {
+
+	case LCHAN_EV_RSL_CHAN_ACTIV_ACK:
+		lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+		break;
+
+	case LCHAN_EV_RSL_CHAN_ACTIV_NACK:
+		/* If an earlier lchan activ got through to the BTS, but the
+		 * ACK did not get back to the BSC, it may still be active on
+		 * the BTS side. Proceed to release it. */
+		LOG_LCHAN(lchan, LOGL_NOTICE, "received NACK for activation of BORKEN lchan, assuming still active\n");
+		lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+		break;
+
+	default:
+		OSMO_ASSERT(false);
+	}
+}
+
+/* onenter handler of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK: second step of the BORKEN recovery.
+ * Calls rsl_tx_rf_chan_release() for the lchan. If that returns non-zero, lchan_fail() is
+ * invoked; for this state lchan_fsm_on_error[] selects LCHAN_ST_BORKEN as the failure target.
+ * \param[in] fi  FSM instance of the lchan.
+ * \param[in] prev_state  State the FSM came from; not evaluated here. */
+static void lchan_fsm_recover_wait_rf_release_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+	int rc;
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+	rc = rsl_tx_rf_chan_release(lchan);
+	if (rc) {
+		/* No trailing newline, same as the lchan_fail() call for a failed Tx Chan Activ.
+		 * NOTE(review): presumably lchan_fail() terminates the log line itself -- confirm. */
+		lchan_fail("Tx RSL RF Channel Release failed: %s (%d)", strerror(-rc), rc);
+	}
+}
+
+/* Event handler of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK.
+ * On LCHAN_EV_RSL_RF_CHAN_REL_ACK the recovery is considered successful: this is logged,
+ * lchan->borken_recovery_attempts is reset to 0 and the FSM changes to LCHAN_ST_UNUSED.
+ * Any other event hits OSMO_ASSERT(false).
+ * \param[in] fi  FSM instance of the lchan.
+ * \param[in] event  The lchan FSM event being dispatched.
+ * \param[in] data  Event data; not evaluated here. */
+static void lchan_fsm_recover_wait_rf_release_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+	switch (event) {
+
+	case LCHAN_EV_RSL_RF_CHAN_REL_ACK:
+		LOG_LCHAN(lchan, LOGL_NOTICE, "successfully recovered BORKEN lchan\n");
+		/* Start counting from zero again should this lchan become BORKEN another time. */
+		lchan->borken_recovery_attempts = 0;
+		lchan_fsm_state_chg(LCHAN_ST_UNUSED);
+		break;
+
+	default:
+		OSMO_ASSERT(false);
+	}
+}
+
#define S(x) (1 << (x))
static const struct osmo_fsm_state lchan_fsm_states[] = {
@@ -1743,6 +1817,30 @@
| S(LCHAN_ST_WAIT_AFTER_ERROR)
,
},
+ /* BORKEN recovery, step 1: the onenter handler calls rsl_tx_chan_activ(); both ACK and NACK
+  * continue to RECOVER_WAIT_RF_RELEASE_ACK. LCHAN_ST_BORKEN is the lchan_fsm_on_error[] target
+  * of this state and hence must be a permitted out state. */
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] {
+ .name = "RECOVER_WAIT_ACTIV_ACK",
+ .onenter = lchan_fsm_recover_wait_activ_ack_onenter,
+ .action = lchan_fsm_recover_wait_activ_ack,
+ .in_event_mask = 0
+ | S(LCHAN_EV_RSL_CHAN_ACTIV_ACK)
+ | S(LCHAN_EV_RSL_CHAN_ACTIV_NACK)
+ ,
+ .out_state_mask = 0
+ | S(LCHAN_ST_BORKEN)
+ | S(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK)
+ ,
+ },
+	/* BORKEN recovery, step 2: the onenter handler calls rsl_tx_rf_chan_release(); an
+	 * RF CHAN REL ACK completes the recovery and leads to LCHAN_ST_UNUSED. */
+	[LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] {
+		.name = "RECOVER_WAIT_RF_RELEASE_ACK",
+		.onenter = lchan_fsm_recover_wait_rf_release_ack_onenter,
+		.action = lchan_fsm_recover_wait_rf_release_ack,
+		.in_event_mask = 0
+			| S(LCHAN_EV_RSL_RF_CHAN_REL_ACK)
+			,
+		.out_state_mask = 0
+			| S(LCHAN_ST_UNUSED)
+			/* lchan_fsm_on_error[] maps this state to LCHAN_ST_BORKEN, and the onenter
+			 * handler calls lchan_fail() when the release cannot be sent. Without this
+			 * bit that failure transition would not be permitted. */
+			| S(LCHAN_ST_BORKEN)
+			,
+	},
};
static const struct value_string lchan_fsm_event_names[] = {
@@ -1814,6 +1912,11 @@
lchan_fsm_state_chg(LCHAN_ST_UNUSED);
return 0;
+ case LCHAN_ST_BORKEN:
+ if (lchan->borken_recovery_attempts < BORKEN_RECOVERY_ATTEMPTS_MAX)
+ lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK);
+ return 0;
+
default:
lchan->release.in_error = true;
lchan->release.rsl_error_cause = RSL_ERR_INTERWORKING;
diff --git a/src/osmo-bsc/net_init.c b/src/osmo-bsc/net_init.c
index 8f2b26c..28ce175 100644
--- a/src/osmo-bsc/net_init.c
+++ b/src/osmo-bsc/net_init.c
@@ -58,6 +58,7 @@
{ .T=-12, .default_val=5, .desc="Timeout for obtaining TA after BSSLAP TA
Request" },
{ .T=-13, .default_val=5, .desc="Timeout for RR Channel Mode Modify ACK (BSC
<-> MS)" },
{ .T=-14, .default_val=5, .desc="Timeout for RSL Channel Mode Modify ACK (BSC
<-> BTS)" },
+ { .T=-15, .default_val=30, .desc="Wait time before trying to use a BORKEN timeslot
again" },
{ .T = -16, .default_val = 1000, .unit = OSMO_TDEF_MS,
.desc = "Granularity for all_allocated:* rate counters: amount of milliseconds
that one counter increment"
" represents. See also X17, X18" },
diff --git a/tests/timer.vty b/tests/timer.vty
index 04c9872..16baaf2 100644
--- a/tests/timer.vty
+++ b/tests/timer.vty
@@ -30,6 +30,7 @@
net: X12 = 5 s Timeout for obtaining TA after BSSLAP TA Request (default: 5 s)
net: X13 = 5 s Timeout for RR Channel Mode Modify ACK (BSC <-> MS) (default: 5 s)
net: X14 = 5 s Timeout for RSL Channel Mode Modify ACK (BSC <-> BTS) (default: 5
s)
+net: X15 = 30 s Wait time before trying to use a BORKEN timeslot again (default: 30 s)
net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of milliseconds
that one counter increment represents. See also X17, X18 (default: 1000 ms)
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the
next counter increment after this many milliseconds. If set to half of X16 (or 0), employ
the usual round() behavior: round up after half of a granularity period. If set to 1,
behave like ceil(): already increment the counter immediately when all channels are
allocated. If set >= X16, behave like floor(): only increment after a full X16 period
of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this
amount of idle time, forget internally cumulated time remainders. Zero to always keep
remainders. See also X16, X17. (default: 60000 ms)
@@ -84,6 +85,7 @@
net: X12 = 5 s Timeout for obtaining TA after BSSLAP TA Request (default: 5 s)
net: X13 = 5 s Timeout for RR Channel Mode Modify ACK (BSC <-> MS) (default: 5 s)
net: X14 = 5 s Timeout for RSL Channel Mode Modify ACK (BSC <-> BTS) (default: 5
s)
+net: X15 = 30 s Wait time before trying to use a BORKEN timeslot again (default: 30 s)
net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of milliseconds
that one counter increment represents. See also X17, X18 (default: 1000 ms)
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the
next counter increment after this many milliseconds. If set to half of X16 (or 0), employ
the usual round() behavior: round up after half of a granularity period. If set to 1,
behave like ceil(): already increment the counter immediately when all channels are
allocated. If set >= X16, behave like floor(): only increment after a full X16 period
of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this
amount of idle time, forget internally cumulated time remainders. Zero to always keep
remainders. See also X16, X17. (default: 60000 ms)
--
To view, visit
https://gerrit.osmocom.org/c/osmo-bsc/+/30252
To unsubscribe, or for help writing mail filters, visit
https://gerrit.osmocom.org/settings
Gerrit-Project: osmo-bsc
Gerrit-Branch: master
Gerrit-Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
Gerrit-Change-Number: 30252
Gerrit-PatchSet: 1
Gerrit-Owner: osmith <osmith(a)sysmocom.de>
Gerrit-MessageType: newchange