laforge submitted this change.

View Change


Approvals: osmith: Looks good to me, but someone else must approve Jenkins Builder: Verified laforge: Looks good to me, approved daniel: Looks good to me, but someone else must approve
recover BORKEN lchans for missing ACK scenarios

We already recover broken lchans where an ACTIV ACK or REL ACK arrives
late. Now add a recovery path for lchans that are broken because no
ACTIV ACK or REL ACK arrives at all.

Add a timeout of X28 = 30s to the lchan BORKEN state.
On timeout, attempt both a Channel Activation and a Channel Release. If
any of them is ACKed, we have successfully synced BTS and BSC's state.

After successful recovery, place the lchan back in the UNUSED state,
available for servicing subscribers.

If recovery is unsuccessful, just continue to attempt recovery every
further X28 seconds.

Patch-by: osmith, nhofmeyr
Related: osmo-ttcn3-hacks I9b4ddfc4a337808d9d5ec538c25fd390b1b2530f
Related: OS#5106
Related: SYS#6655
Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
---
M doc/lchan-fsm.dot
M include/osmocom/bsc/lchan_fsm.h
M src/osmo-bsc/lchan_fsm.c
M src/osmo-bsc/net_init.c
M tests/timer.vty
5 files changed, 142 insertions(+), 0 deletions(-)

diff --git a/doc/lchan-fsm.dot b/doc/lchan-fsm.dot
index fe35903..82ef922 100644
--- a/doc/lchan-fsm.dot
+++ b/doc/lchan-fsm.dot
@@ -32,6 +32,7 @@
WAIT_TS_READY -> UNUSED [label="error/timeout",style=dashed,constraint=false]
{WAIT_ACTIV_ACK,WAIT_RF_RELEASE_ACK} -> BORKEN [label="error/timeout",style=dashed]
BORKEN -> WAIT_AFTER_ERROR [label="late RF Release ACK"]
+ BORKEN -> WAIT_RF_RELEASE_ACK [label="late Activation ACK"]
WAIT_RLL_RTP_ESTABLISH -> WAIT_RLL_RTP_RELEASED [label=error,style=dashed]

WAIT_ACTIV_ACK -> rtp [label="LCHAN_RTP_EV_LCHAN_READY",style=dotted]
@@ -44,4 +45,13 @@
WAIT_RSL_CHAN_MODE_MODIFY_ACK -> ESTABLISHED [label="LCHAN_EV_RSL_CHAN_MODE_MODIFY_ACK\nno change to RTP"]
WAIT_RR_CHAN_MODE_MODIFY_ACK -> BORKEN [label="error/timeout",style=dashed]
WAIT_RSL_CHAN_MODE_MODIFY_ACK -> BORKEN [label="error/timeout",style=dashed]
+
+ BORKEN -> RECOVER_WAIT_ACTIV_ACK [label="X28"]
+ RECOVER_WAIT_ACTIV_ACK -> BORKEN [label="error/timeout",style=dashed]
+
+ RECOVER_WAIT_ACTIV_ACK -> UNUSED [label="rx ACK"]
+ RECOVER_WAIT_ACTIV_ACK -> RECOVER_WAIT_RF_RELEASE_ACK [label="rx NACK"]
+
+ RECOVER_WAIT_RF_RELEASE_ACK -> UNUSED [label="rx ACK"]
+ RECOVER_WAIT_RF_RELEASE_ACK -> BORKEN [label="error/timeout",style=dashed]
}
diff --git a/include/osmocom/bsc/lchan_fsm.h b/include/osmocom/bsc/lchan_fsm.h
index cf9f20f..3c7bbc1 100644
--- a/include/osmocom/bsc/lchan_fsm.h
+++ b/include/osmocom/bsc/lchan_fsm.h
@@ -33,6 +33,8 @@
LCHAN_ST_WAIT_RF_RELEASE_ACK,
LCHAN_ST_WAIT_AFTER_ERROR,
LCHAN_ST_BORKEN,
+ LCHAN_ST_RECOVER_WAIT_ACTIV_ACK, /*< Attempt to recover from BORKEN: first try to activate the lchan */
+ LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK, /*< Attempt to recover from BORKEN: then try to release it */
};

enum lchan_fsm_event {
diff --git a/src/osmo-bsc/lchan_fsm.c b/src/osmo-bsc/lchan_fsm.c
index d6dfe3a..a663326 100644
--- a/src/osmo-bsc/lchan_fsm.c
+++ b/src/osmo-bsc/lchan_fsm.c
@@ -334,6 +334,9 @@
[LCHAN_ST_WAIT_AFTER_ERROR] = { .T = -3111 },
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = { .T = -13 },
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = { .T = -14 },
+ [LCHAN_ST_BORKEN] = { .T = -28 },
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = { .T = -6 },
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = { .T = -6 },
};

/* Transition to a state, using the T timer defined in lchan_fsm_timeouts.
@@ -380,6 +383,8 @@
[LCHAN_ST_BORKEN] = LCHAN_ST_BORKEN,
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = LCHAN_ST_BORKEN,
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = LCHAN_ST_BORKEN,
};

#define lchan_fail(fmt, args...) lchan_fail_to(lchan_fsm_on_error[fi->state], fmt, ## args)
@@ -1631,6 +1636,71 @@
}
}

+static void lchan_fsm_recover_wait_activ_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+ int rc;
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+ LOG_LCHAN(lchan, LOGL_INFO, "attempting to recover from BORKEN lchan\n");
+
+ lchan->type = GSM_LCHAN_SDCCH;
+ lchan->activate.info.ta_known = true;
+
+ chan_counts_ts_update(lchan->ts);
+
+ rc = rsl_tx_chan_activ(lchan, RSL_ACT_INTRA_NORM_ASS, 0);
+ if (rc)
+ lchan_fail("Tx Chan Activ failed: %s (%d)", strerror(-rc), rc);
+}
+
+static void lchan_fsm_recover_wait_activ_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+ switch (event) {
+
+ case LCHAN_EV_RSL_CHAN_ACTIV_ACK:
+ lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+ break;
+
+ case LCHAN_EV_RSL_CHAN_ACTIV_NACK:
+ /* If an earlier lchan activ got through to the BTS, but the
+ * ACK did not get back to the BSC, it may still be active on
+ * the BTS side. Proceed to release it. */
+ LOG_LCHAN(lchan, LOGL_NOTICE, "received NACK for activation of BORKEN lchan, assuming still active\n");
+ lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+ break;
+
+ default:
+ OSMO_ASSERT(false);
+ }
+}
+
+static void lchan_fsm_recover_wait_rf_release_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+ int rc;
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+ rc = rsl_tx_rf_chan_release(lchan);
+ if (rc)
+ lchan_fail("Tx RSL RF Channel Release failed: %s (%d)\n", strerror(-rc), rc);
+}
+
+static void lchan_fsm_recover_wait_rf_release_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+ switch (event) {
+
+ case LCHAN_EV_RSL_RF_CHAN_REL_ACK:
+ LOG_LCHAN(lchan, LOGL_NOTICE, "successfully recovered BORKEN lchan\n");
+ lchan_fsm_state_chg(LCHAN_ST_UNUSED);
+ break;
+
+ default:
+ OSMO_ASSERT(false);
+ }
+}
+
#define S(x) (1 << (x))

static const struct osmo_fsm_state lchan_fsm_states[] = {
@@ -1820,6 +1890,32 @@
| S(LCHAN_ST_WAIT_RF_RELEASE_ACK)
| S(LCHAN_ST_UNUSED)
| S(LCHAN_ST_WAIT_AFTER_ERROR)
+ | S(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK)
+ ,
+ },
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] {
+ .name = "RECOVER_WAIT_ACTIV_ACK",
+ .onenter = lchan_fsm_recover_wait_activ_ack_onenter,
+ .action = lchan_fsm_recover_wait_activ_ack,
+ .in_event_mask = 0
+ | S(LCHAN_EV_RSL_CHAN_ACTIV_ACK)
+ | S(LCHAN_EV_RSL_CHAN_ACTIV_NACK)
+ ,
+ .out_state_mask = 0
+ | S(LCHAN_ST_BORKEN)
+ | S(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK)
+ ,
+ },
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] {
+ .name = "RECOVER_WAIT_RF_RELEASE_ACK",
+ .onenter = lchan_fsm_recover_wait_rf_release_ack_onenter,
+ .action = lchan_fsm_recover_wait_rf_release_ack,
+ .in_event_mask = 0
+ | S(LCHAN_EV_RSL_RF_CHAN_REL_ACK)
+ ,
+ .out_state_mask = 0
+ | S(LCHAN_ST_BORKEN)
+ | S(LCHAN_ST_UNUSED)
,
},
};
@@ -1893,6 +1989,10 @@
lchan_fsm_state_chg(LCHAN_ST_UNUSED);
return 0;

+ case LCHAN_ST_BORKEN:
+ lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK);
+ return 0;
+
default:
lchan->release.in_error = true;
lchan->release.rsl_error_cause = RSL_ERR_INTERWORKING;
diff --git a/src/osmo-bsc/net_init.c b/src/osmo-bsc/net_init.c
index d97fd01..e5d3fad 100644
--- a/src/osmo-bsc/net_init.c
+++ b/src/osmo-bsc/net_init.c
@@ -74,6 +74,7 @@
" after this amount of idle time, forget internally cumulated time remainders. Zero to always"
" keep remainders. See also X16, X17." },
{ .T = -25, .default_val = 5, .desc = "Timeout for initial user data after an MSC initiated an SCCP connection to the BSS" },
+ { .T = -28, .default_val = 30, .desc = "Interval at which to try to recover a BORKEN lchan" },
{ .T = -3105, .default_val = GSM_NY1_DEFAULT, .unit = OSMO_TDEF_CUSTOM,
.desc = "Ny1: Maximum number of Physical Information (re)transmissions" },
{ .T = -3111, .default_val = 4, .desc = "Wait time after lchan was released in error (should be T3111 + 2s)" },
diff --git a/tests/timer.vty b/tests/timer.vty
index e418054..9b18439 100644
--- a/tests/timer.vty
+++ b/tests/timer.vty
@@ -34,6 +34,7 @@
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
net: X25 = 5 s Timeout for initial user data after an MSC initiated an SCCP connection to the BSS (default: 5 s)
+net: X28 = 30 s Interval at which to try to recover a BORKEN lchan (default: 30 s)
net: X3105 = 17 Ny1: Maximum number of Physical Information (re)transmissions (default: 17)
net: X3111 = 4 s Wait time after lchan was released in error (should be T3111 + 2s) (default: 4 s)
net: X3113 = 60 s Maximum Paging Request Transmit Delay Threshold: If the estimated transmit delay of the messages in the paging queue surpasses this threshold, then new incoming paging requests will if possible replace a request in retransmission state from the queue or otherwise be discarded, hence limiting the size of the queue and maximum delay of its scheduled requests. X3113 also serves as the upper boundary for dynamic T3113 when estimating the expected maximum delay to get a response (default: 60 s)
@@ -90,6 +91,7 @@
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
net: X25 = 5 s Timeout for initial user data after an MSC initiated an SCCP connection to the BSS (default: 5 s)
+net: X28 = 30 s Interval at which to try to recover a BORKEN lchan (default: 30 s)
net: X3105 = 17 Ny1: Maximum number of Physical Information (re)transmissions (default: 17)
net: X3111 = 4 s Wait time after lchan was released in error (should be T3111 + 2s) (default: 4 s)
net: X3113 = 60 s Maximum Paging Request Transmit Delay Threshold: If the estimated transmit delay of the messages in the paging queue surpasses this threshold, then new incoming paging requests will if possible replace a request in retransmission state from the queue or otherwise be discarded, hence limiting the size of the queue and maximum delay of its scheduled requests. X3113 also serves as the upper boundary for dynamic T3113 when estimating the expected maximum delay to get a response (default: 60 s)

To view, visit change 30252. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: osmo-bsc
Gerrit-Branch: master
Gerrit-Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
Gerrit-Change-Number: 30252
Gerrit-PatchSet: 14
Gerrit-Owner: osmith <osmith@sysmocom.de>
Gerrit-Reviewer: Jenkins Builder
Gerrit-Reviewer: daniel <dwillmann@sysmocom.de>
Gerrit-Reviewer: fixeria <vyanitskiy@sysmocom.de>
Gerrit-Reviewer: laforge <laforge@osmocom.org>
Gerrit-Reviewer: neels <nhofmeyr@sysmocom.de>
Gerrit-Reviewer: osmith <osmith@sysmocom.de>
Gerrit-CC: pespin <pespin@sysmocom.de>
Gerrit-MessageType: merged