fixeria has uploaded this change for review. ( https://gerrit.osmocom.org/c/erlang/osmo-s1gw/+/42490?usp=email )
Change subject: pfcp_peer: detect UPF restart via RTS mismatch in Heartbeat Response ......................................................................
pfcp_peer: detect UPF restart via RTS mismatch in Heartbeat Response
Check the Recovery Timestamp (RTS) in each Heartbeat Response against the value stored during Association Setup (`rem_rts`). A mismatch means the UPF has restarted; log a warning, increment the new `pfcp.peer_restart counter`, and reset the PFCP association so it is re-established with the restarted peer.
The `is_integer(ExpRTS)` guard prevents a spurious reset when a stale Heartbeat Response arrives in the connecting state (where `rem_rts` is still undefined).
Change-Id: I00a8384db1ea9c38d0ad9ff90b3d45ad86c3a020 --- M doc/manuals/chapters/metrics.adoc M include/s1gw_metrics.hrl M src/pfcp_peer.erl M src/s1gw_metrics.erl 4 files changed, 32 insertions(+), 16 deletions(-)
git pull ssh://gerrit.osmocom.org:29418/erlang/osmo-s1gw refs/changes/90/42490/1
diff --git a/doc/manuals/chapters/metrics.adoc b/doc/manuals/chapters/metrics.adoc index 4ea8a53..42a3d54 100644 --- a/doc/manuals/chapters/metrics.adoc +++ b/doc/manuals/chapters/metrics.adoc @@ -49,6 +49,7 @@ | `pfcp.assoc_setup_resp.rx_ack` | PFCP Association Setup Responses with success cause | `pfcp.assoc_setup_resp.rx_nack` | PFCP Association Setup Responses with failure cause | `pfcp.unexpected_pdu` | Unexpected or unrecognised PFCP PDUs received +| `pfcp.peer_restart` | UPF peer restarts detected via Recovery Timestamp mismatch in Heartbeat Responses |===
[[metrics_s1ap_counters]] diff --git a/include/s1gw_metrics.hrl b/include/s1gw_metrics.hrl index 06a9719..9f3450d 100644 --- a/include/s1gw_metrics.hrl +++ b/include/s1gw_metrics.hrl @@ -9,6 +9,7 @@ -define(S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_ACK, [ctr, pfcp, assoc_setup_resp, rx_ack]). -define(S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_NACK, [ctr, pfcp, assoc_setup_resp, rx_nack]). -define(S1GW_CTR_PFCP_UNEXPECTED_PDU, [ctr, pfcp, unexpected_pdu]). +-define(S1GW_CTR_PFCP_PEER_RESTART, [ctr, pfcp, peer_restart]). -define(S1GW_CTR_S1AP_ENB_ALL_RX, [ctr, s1ap, enb, all, rx]). -define(S1GW_CTR_S1AP_ENB_ALL_RX_UNKNOWN_ENB, [ctr, s1ap, enb, all, rx_unknown_enb]). -define(S1GW_CTR_S1AP_PROXY_EXCEPTION, [ctr, s1ap, proxy, exception]). diff --git a/src/pfcp_peer.erl b/src/pfcp_peer.erl index 34c570c..145eca0 100644 --- a/src/pfcp_peer.erl +++ b/src/pfcp_peer.erl @@ -319,8 +319,13 @@ {_, S1} = recv_heartbeat_request(PDU, {FromIp, FromPort}, S0), {keep_state, S1}; #pfcp{type = heartbeat_response} -> - {_, S1} = recv_heartbeat_response(PDU, {FromIp, FromPort}, S0), - {keep_state, S1}; + case recv_heartbeat_response(PDU, {FromIp, FromPort}, S0) of + {restart, S1} -> + %% RTS mismatch detected, reset the association + {next_state, connecting, S1#peer_state{rem_rts = undefined}}; + {_, S1} -> + {keep_state, S1} + end; #pfcp{seid = SEID} when is_integer(SEID) -> route_pdu(PDU, S0), {keep_state, S0}; @@ -384,12 +389,7 @@ tref = TRef} -> ?LOG_NOTICE("Heartbeat Request (SeqNr=~p) timeout", [SeqNr]), s1gw_metrics:ctr_inc(?S1GW_CTR_PFCP_HEARTBEAT_REQ_TIMEOUT), - if - From =/= undefined -> - gen_statem:reply(From, {error, timeout}); - true -> - ok - end, + maybe_reply(From, {error, timeout}), S1 = S#peer_state{heartbeat = undefined}, MaxMiss = maps:get(heartbeat_miss_count, Cfg, ?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT), @@ -537,6 +537,11 @@ end.
+-spec maybe_reply(gen_statem:from() | undefined, term()) -> ok. +maybe_reply(undefined, _Reply) -> ok; +maybe_reply(From, Reply) -> gen_statem:reply(From, Reply). + + -spec get_node_id(peer_state()) -> binary(). get_node_id(#peer_state{loc_addr = LocAddr}) -> list_to_binary(tuple_to_list(LocAddr)). @@ -635,7 +640,8 @@ seq_no = SeqNr, ie = RspIEs}, {_FromIp, _FromPort}, - #peer_state{heartbeat = HB} = S) -> + #peer_state{heartbeat = HB, + rem_rts = ExpRTS} = S) -> ?LOG_DEBUG("Rx Heartbeat Response (SeqNr=~p): ~p", [SeqNr, RspIEs]), s1gw_metrics:ctr_inc(?S1GW_CTR_PFCP_HEARTBEAT_RESP_RX), case HB of @@ -643,13 +649,20 @@ seq_nr = SeqNr, tref = TRef} -> erlang:cancel_timer(TRef), - if - From =/= undefined -> - gen_statem:reply(From, ok); - true -> - ok - end, - {ok, S#peer_state{heartbeat = undefined, hb_missed = 0}}; + S1 = S#peer_state{heartbeat = undefined, hb_missed = 0}, + case RspIEs of + #{recovery_time_stamp := #recovery_time_stamp{time = RecvRTS}} + when is_integer(ExpRTS), RecvRTS =/= ExpRTS -> + ?LOG_WARNING("UPF restart detected via Heartbeat Response " + "(old RTS=~p =/= new RTS=~p), resetting association", + [ExpRTS, RecvRTS]), + s1gw_metrics:ctr_inc(?S1GW_CTR_PFCP_PEER_RESTART), + maybe_reply(From, {error, peer_restarted}), + {restart, S1}; + _ -> + maybe_reply(From, ok), + {ok, S1} + end; _ -> ?LOG_NOTICE("Heartbeat Response (SeqNr=~p) was not expected", [SeqNr]), {{error, unexpected}, S} diff --git a/src/s1gw_metrics.erl b/src/s1gw_metrics.erl index 551b821..306b418 100644 --- a/src/s1gw_metrics.erl +++ b/src/s1gw_metrics.erl @@ -63,6 +63,7 @@ ?S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_ACK, ?S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_NACK, ?S1GW_CTR_PFCP_UNEXPECTED_PDU, + ?S1GW_CTR_PFCP_PEER_RESTART, ?S1GW_CTR_S1AP_ENB_ALL_RX, ?S1GW_CTR_S1AP_ENB_ALL_RX_UNKNOWN_ENB, ?S1GW_CTR_S1AP_PROXY_EXCEPTION, %% exception(s) occurred