fixeria has submitted this change. ( https://gerrit.osmocom.org/c/erlang/osmo-s1gw/+/41624?usp=email )
Change subject: enb_proxy: add initial MME pooling support
......................................................................
enb_proxy: add initial MME pooling support
Rework the CONNECTING state to dynamically select an MME from the pool via mme_registry:mme_select/1, passing the eNB's Tracking Area Codes (from the ?'id-SupportedTAs' IE of the S1 SETUP REQUEST) and a list of already-tried MMEs, so successive attempts pick a different candidate.
On connection failure (SCTP establishment timeout or error), or when the selected MME rejects the S1 SETUP REQUEST or fails to respond in time, the FSM re-enters the CONNECTING state rather than terminating. This triggers another mme_select/1 call with the failed MME added to the tried_mmes list. S1 SETUP FAILURE PDUs from the MME are intentionally not forwarded to the eNB, so the retry is fully transparent.
Once mme_select/1 exhausts all candidates it returns 'error'; at that point the FSM builds and sends an S1 SETUP FAILURE PDU to the eNB and terminates.
Other changes:
* add close_sock/1, close_conn/1 helpers; simplify terminate/3
* add ?S1GW_CTR_ENB_PROXY_MME_SELECT_ERROR counter
Change-Id: I83dc4a78c78a7b87e87f5ca9a941a168d6c1dc36
Related: SYS#7052
---
M include/s1gw_metrics.hrl
M src/enb_proxy.erl
M src/s1ap_utils.erl
M src/s1gw_metrics.erl
4 files changed, 84 insertions(+), 33 deletions(-)
Approvals:
  pespin: Looks good to me, but someone else must approve
  osmith: Looks good to me, but someone else must approve
  Jenkins Builder: Verified
  fixeria: Looks good to me, approved
diff --git a/include/s1gw_metrics.hrl b/include/s1gw_metrics.hrl index ecdf315..249bc6c 100644 --- a/include/s1gw_metrics.hrl +++ b/include/s1gw_metrics.hrl @@ -49,6 +49,7 @@ -define(S1GW_CTR_ENB_PROXY_CONN_EST_FAILURE, [ctr, enb_proxy, conn_est, failure]). -define(S1GW_CTR_ENB_PROXY_UNEXPECTED_PDU, [ctr, enb_proxy, unexpected_pdu]). -define(S1GW_CTR_ENB_PROXY_MALFORMED_PDU, [ctr, enb_proxy, malformed_pdu]). +-define(S1GW_CTR_ENB_PROXY_MME_SELECT_ERROR, [ctr, enb_proxy, mme_select, error]).
%% SCTP related metrics -define(S1GW_CTR_SCTP_ERROR_ALL, [ctr, sctp, error, all]). diff --git a/src/enb_proxy.erl b/src/enb_proxy.erl index 0aa00db..5ab79d1 100644 --- a/src/enb_proxy.erl +++ b/src/enb_proxy.erl @@ -71,6 +71,8 @@ enb_conn_info :: sctp_server:conn_info(), mme_conn_cfg :: sctp_client:cfg(), s1setup_req :: undefined | binary(), + tried_mmes :: [mme_registry:mme_name()], + enb_tacs :: undefined | [s1ap_utils:tac()], sock :: undefined | gen_sctp:sctp_socket(), enb_handle :: enb_registry:enb_handle(), genb_id_str :: undefined | string(), @@ -121,6 +123,7 @@ enb_conn_info = EnbConnInfo, mme_conn_cfg = MmeConnCfg, enb_handle = EnbHandle, + tried_mmes = [], handler = Pid}}.
@@ -161,9 +164,13 @@ s1ap_proxy:set_genb_id(S#state.handler, GlobalENBId), enb_registry:notify_genb_id(S#state.enb_handle, GENBId), gtpu_kpi_enb_register(S#state{genb_id_str = GlobalENBId}), + %% fetch the TAC (Tracking Area Code) list + TACs = proplists:get_value(?'id-SupportedTAs', IEs), + ?LOG_DEBUG("Broadcast TACs: ~p", [TACs]), {next_state, connecting, S#state{s1setup_req = Data, - genb_id_str = GlobalENBId}}; + genb_id_str = GlobalENBId, + enb_tacs = TACs}}; {{Proc, Type}, IEs} -> ?LOG_ERROR("Rx unexpected S1AP PDU from eNB: ~p/~p, ~p", [Proc, Type, IEs]), ctr_inc(?S1GW_CTR_ENB_PROXY_UNEXPECTED_PDU, S), @@ -179,19 +186,47 @@
%% CONNECTING state connecting(enter, OldState, - #state{mme_conn_cfg = MmeConnCfg} = S) -> + #state{tried_mmes = TriedMMEs, + enb_tacs = EnbTACs} = S) -> ?LOG_INFO("State change: ~p -> ~p", [OldState, ?FUNCTION_NAME]), - enb_registry:notify_mme_connecting(S#state.enb_handle), - %% Initiate connection establishment with the MME - {ok, Sock} = sctp_client:connect(MmeConnCfg), - %% loop transition to enable state_timeout - {next_state, ?FUNCTION_NAME, S#state{sock = Sock}, - [{state_timeout, 2_000, conn_est_timeout}]}; + %% Select an MME from the pool + case mme_registry:mme_select(#{tried_mmes => TriedMMEs, + enb_tacs => EnbTACs}) of + {ok, MmeInfo} -> + MmeName = maps:get(name, MmeInfo), + ?LOG_INFO("MME selection: trying ~p", [MmeName]), + %% Close the old connection, if any + close_sock(S), + %% Initiate connection establishment with the MME + MmeSockOpts = maps:get(sockopts, S#state.mme_conn_cfg), + MmeConnCfg = #{laddr => maps:get(laddr, MmeInfo), + raddr => maps:get(raddr, MmeInfo), + rport => maps:get(rport, MmeInfo), + sockopts => MmeSockOpts}, + {ok, Sock} = sctp_client:connect(MmeConnCfg), + enb_registry:notify_mme_connecting(S#state.enb_handle), + {next_state, ?FUNCTION_NAME, %% loop transition to enable state_timeout + S#state{sock = Sock, + mme_aid = undefined, + mme_conn_cfg = MmeConnCfg, + tried_mmes = [MmeName | TriedMMEs]}, + [{state_timeout, 2_000, conn_est_timeout}]}; + error -> + ?LOG_ERROR("Failed to select an MME"), + ctr_inc(?S1GW_CTR_ENB_PROXY_MME_SELECT_ERROR, S), + %% Build and send an S1 SETUP FAILURE PDU + {ok, PDU} = s1ap_utils:build_s1setup_fail_pdu([]), + sctp_send_from_mme(PDU, S), + {stop, {shutdown, mme_select_error}} + end;
%% Handle connection establishment timeout connecting(state_timeout, conn_est_timeout, S) -> + ?LOG_ERROR("MME ~p: timeout establishing connection", + [hd(S#state.tried_mmes)]), ctr_inc(?S1GW_CTR_ENB_PROXY_CONN_EST_TIMEOUT, S), - {stop, {shutdown, conn_est_timeout}}; + %% re-enter the state to try again (or another MME) + repeat_state_and_data;
%% Handle PDUs coming from the eNB connecting(cast, {send_data, Data}, S) -> @@ -203,18 +238,21 @@ connecting(info, {sctp, _Socket, MmeAddr, MmePort, {[], #sctp_assoc_change{state = ConnState, assoc_id = Aid}}}, S) -> + MmeName = hd(S#state.tried_mmes), case ConnState of comm_up -> - ?LOG_NOTICE("MME connection (id=~p, ~p:~p) established", - [Aid, MmeAddr, MmePort]), + ?LOG_NOTICE("MME ~p: connection (id=~p, ~p:~p) established", + [MmeName, Aid, MmeAddr, MmePort]), %% send the S1 SETUP REQUEST PDU to the MME sctp_send_from_enb(S#state.s1setup_req, S#state{mme_aid = Aid}), {next_state, wait_s1setup_rsp, S#state{mme_aid = Aid}}; _ -> - ?LOG_NOTICE("MME connection establishment failed: ~p", [ConnState]), + ?LOG_NOTICE("MME ~p: connection establishment failed: ~p", + [MmeName, ConnState]), ctr_inc(?S1GW_CTR_ENB_PROXY_CONN_EST_FAILURE, S), - {stop, {shutdown, conn_est_fail}} + %% re-enter the state to try again (or another MME) + repeat_state_and_data end;
connecting(Event, EventData, S) -> @@ -232,7 +270,8 @@ wait_s1setup_rsp(state_timeout, s1setup_rsp_timeout, S) -> ?LOG_ERROR("Timeout waiting for S1 SETUP RESPONSE from MME"), ctr_inc(?S1GW_CTR_ENB_PROXY_S1_SETUP_RSP_TIMEOUT, S), - {stop, {shutdown, s1setup_rsp_timeout}}; + %% re-enter state 'connecting' to try again (or another MME) + {next_state, connecting, S};
%% Handle PDUs coming from the eNB wait_s1setup_rsp(cast, {send_data, Data}, S) -> @@ -260,15 +299,18 @@ {{?'id-S1Setup', unsuccessfulOutcome}, _IEs} -> ?LOG_NOTICE("Rx S1 SETUP FAILURE from MME"), ctr_inc(?S1GW_CTR_ENB_PROXY_S1_SETUP_FAILURE, S), - sctp_send_from_mme(Data, S), - {stop, {shutdown, s1setup_error}}; + %% do *not* forward the FAILURE to the eNB + %% re-enter state 'connecting' to try again (or another MME) + {next_state, connecting, S}; {{Proc, Type}, IEs} -> ?LOG_ERROR("Rx unexpected S1AP PDU from MME: ~p/~p, ~p", [Proc, Type, IEs]), ctr_inc(?S1GW_CTR_ENB_PROXY_UNEXPECTED_PDU, S), - {stop, {shutdown, s1setup_error}}; + %% re-enter state 'connecting' to try again (or another MME) + {next_state, connecting, S}; {error, _Error} -> ctr_inc(?S1GW_CTR_ENB_PROXY_MALFORMED_PDU, S), - {stop, {shutdown, s1setup_error}} + %% re-enter state 'connecting' to try again (or another MME) + {next_state, connecting, S} end;
%% Handle an #sctp_assoc_change event (MME connection state) @@ -283,7 +325,8 @@ {keep_state, S}; _ -> ?LOG_NOTICE("MME connection state: ~p", [ConnState]), - {stop, {shutdown, conn_fail}} + %% re-enter state 'connecting' to try again (or another MME) + {next_state, connecting, S} end;
wait_s1setup_rsp(Event, EventData, S) -> @@ -369,22 +412,11 @@
terminate(Reason, State, #state{handler = Pid, - enb_handle = Handle, - sock = Sock, - mme_aid = MmeAid}) -> + enb_handle = Handle} = S) -> ?LOG_NOTICE("Terminating in state ~p, reason ~p", [State, Reason]), enb_registry:enb_unregister(Handle), s1ap_proxy:shutdown(Pid), - case Sock of - undefined -> ok; - _ -> - case MmeAid of - undefined -> ok; - _ -> - sctp_common:shutdown({Sock, MmeAid}) - end, - gen_sctp:close(Sock) - end. + close_sock(S).
%% ------------------------------------------------------------------ @@ -507,4 +539,19 @@ s1gw_metrics:ctr_inc(C1).
+-spec close_sock(state()) -> ok | {error, term()}. +close_sock(#state{sock = undefined}) -> ok; + +close_sock(#state{sock = Sock} = S) -> + close_conn(S), %% terminate the MME connection, if needed + gen_sctp:close(Sock). + + +-spec close_conn(state()) -> ok | {error, term()}. +close_conn(#state{mme_aid = undefined}) -> ok; + +close_conn(#state{sock = Sock, mme_aid = MmeAid}) -> + sctp_common:shutdown({Sock, MmeAid}). + + %% vim:set ts=4 sw=4 et: diff --git a/src/s1ap_utils.erl b/src/s1ap_utils.erl index 5cded8b..ea1e463 100644 --- a/src/s1ap_utils.erl +++ b/src/s1ap_utils.erl @@ -70,13 +70,15 @@ s1ap_pdu_info/0]).
+-type tac() :: 0..16#ffff. -type enb_id() :: 0..16#fffffff. -type plmn_id() :: {MCC :: nonempty_string(), MNC :: nonempty_string()}. -type genb_id() :: #{enb_id => enb_id(), plmn_id => plmn_id()}.
--export_type([enb_id/0, +-export_type([tac/0, + enb_id/0, plmn_id/0, genb_id/0]).
diff --git a/src/s1gw_metrics.erl b/src/s1gw_metrics.erl index 52a5a10..c505a8b 100644 --- a/src/s1gw_metrics.erl +++ b/src/s1gw_metrics.erl @@ -106,6 +106,7 @@ ?S1GW_CTR_ENB_PROXY_CONN_EST_FAILURE, %% MME connection establishment failure ?S1GW_CTR_ENB_PROXY_UNEXPECTED_PDU, %% unexpected PDUs received from eNB/MME ?S1GW_CTR_ENB_PROXY_MALFORMED_PDU, %% malformed PDUs received from eNB/MME + ?S1GW_CTR_ENB_PROXY_MME_SELECT_ERROR, %% failed to select an MME (pool exhaustion)
%% SCTP related counters ?S1GW_CTR_SCTP_ERROR_ALL, %% total number of SCTP errors