fixeria has uploaded this change for review.

View Change

pfcp_peer: detect UPF restart via RTS mismatch in Heartbeat Response

Check the Recovery Timestamp (RTS) in each Heartbeat Response against
the value stored during Association Setup (`rem_rts`). A mismatch
means the UPF has restarted; log a warning, increment the new
`pfcp.peer_restart counter`, and reset the PFCP association so it is
re-established with the restarted peer.

The `is_integer(ExpRTS)` guard prevents a spurious reset when a stale
Heartbeat Response arrives in the connecting state (where `rem_rts`
is still undefined).

Change-Id: I00a8384db1ea9c38d0ad9ff90b3d45ad86c3a020
---
M doc/manuals/chapters/metrics.adoc
M include/s1gw_metrics.hrl
M src/pfcp_peer.erl
M src/s1gw_metrics.erl
4 files changed, 32 insertions(+), 16 deletions(-)

git pull ssh://gerrit.osmocom.org:29418/erlang/osmo-s1gw refs/changes/90/42490/1
diff --git a/doc/manuals/chapters/metrics.adoc b/doc/manuals/chapters/metrics.adoc
index 4ea8a53..42a3d54 100644
--- a/doc/manuals/chapters/metrics.adoc
+++ b/doc/manuals/chapters/metrics.adoc
@@ -49,6 +49,7 @@
| `pfcp.assoc_setup_resp.rx_ack` | PFCP Association Setup Responses with success cause
| `pfcp.assoc_setup_resp.rx_nack` | PFCP Association Setup Responses with failure cause
| `pfcp.unexpected_pdu` | Unexpected or unrecognised PFCP PDUs received
+| `pfcp.peer_restart` | UPF peer restarts detected via Recovery Timestamp mismatch in Heartbeat Responses
|===

[[metrics_s1ap_counters]]
diff --git a/include/s1gw_metrics.hrl b/include/s1gw_metrics.hrl
index 06a9719..9f3450d 100644
--- a/include/s1gw_metrics.hrl
+++ b/include/s1gw_metrics.hrl
@@ -9,6 +9,7 @@
-define(S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_ACK, [ctr, pfcp, assoc_setup_resp, rx_ack]).
-define(S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_NACK, [ctr, pfcp, assoc_setup_resp, rx_nack]).
-define(S1GW_CTR_PFCP_UNEXPECTED_PDU, [ctr, pfcp, unexpected_pdu]).
+-define(S1GW_CTR_PFCP_PEER_RESTART, [ctr, pfcp, peer_restart]).
-define(S1GW_CTR_S1AP_ENB_ALL_RX, [ctr, s1ap, enb, all, rx]).
-define(S1GW_CTR_S1AP_ENB_ALL_RX_UNKNOWN_ENB, [ctr, s1ap, enb, all, rx_unknown_enb]).
-define(S1GW_CTR_S1AP_PROXY_EXCEPTION, [ctr, s1ap, proxy, exception]).
diff --git a/src/pfcp_peer.erl b/src/pfcp_peer.erl
index 34c570c..145eca0 100644
--- a/src/pfcp_peer.erl
+++ b/src/pfcp_peer.erl
@@ -319,8 +319,13 @@
{_, S1} = recv_heartbeat_request(PDU, {FromIp, FromPort}, S0),
{keep_state, S1};
#pfcp{type = heartbeat_response} ->
- {_, S1} = recv_heartbeat_response(PDU, {FromIp, FromPort}, S0),
- {keep_state, S1};
+ case recv_heartbeat_response(PDU, {FromIp, FromPort}, S0) of
+ {restart, S1} ->
+ %% RTS mismatch detected, reset the association
+ {next_state, connecting, S1#peer_state{rem_rts = undefined}};
+ {_, S1} ->
+ {keep_state, S1}
+ end;
#pfcp{seid = SEID} when is_integer(SEID) ->
route_pdu(PDU, S0),
{keep_state, S0};
@@ -384,12 +389,7 @@
tref = TRef} ->
?LOG_NOTICE("Heartbeat Request (SeqNr=~p) timeout", [SeqNr]),
s1gw_metrics:ctr_inc(?S1GW_CTR_PFCP_HEARTBEAT_REQ_TIMEOUT),
- if
- From =/= undefined ->
- gen_statem:reply(From, {error, timeout});
- true ->
- ok
- end,
+ maybe_reply(From, {error, timeout}),
S1 = S#peer_state{heartbeat = undefined},
MaxMiss = maps:get(heartbeat_miss_count, Cfg,
?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT),
@@ -537,6 +537,11 @@
end.


+-spec maybe_reply(gen_statem:from() | undefined, term()) -> ok.
+maybe_reply(undefined, _Reply) -> ok;
+maybe_reply(From, Reply) -> gen_statem:reply(From, Reply).
+
+
-spec get_node_id(peer_state()) -> binary().
get_node_id(#peer_state{loc_addr = LocAddr}) ->
list_to_binary(tuple_to_list(LocAddr)).
@@ -635,7 +640,8 @@
seq_no = SeqNr,
ie = RspIEs},
{_FromIp, _FromPort},
- #peer_state{heartbeat = HB} = S) ->
+ #peer_state{heartbeat = HB,
+ rem_rts = ExpRTS} = S) ->
?LOG_DEBUG("Rx Heartbeat Response (SeqNr=~p): ~p", [SeqNr, RspIEs]),
s1gw_metrics:ctr_inc(?S1GW_CTR_PFCP_HEARTBEAT_RESP_RX),
case HB of
@@ -643,13 +649,20 @@
seq_nr = SeqNr,
tref = TRef} ->
erlang:cancel_timer(TRef),
- if
- From =/= undefined ->
- gen_statem:reply(From, ok);
- true ->
- ok
- end,
- {ok, S#peer_state{heartbeat = undefined, hb_missed = 0}};
+ S1 = S#peer_state{heartbeat = undefined, hb_missed = 0},
+ case RspIEs of
+ #{recovery_time_stamp := #recovery_time_stamp{time = RecvRTS}}
+ when is_integer(ExpRTS), RecvRTS =/= ExpRTS ->
+ ?LOG_WARNING("UPF restart detected via Heartbeat Response "
+ "(old RTS=~p =/= new RTS=~p), resetting association",
+ [ExpRTS, RecvRTS]),
+ s1gw_metrics:ctr_inc(?S1GW_CTR_PFCP_PEER_RESTART),
+ maybe_reply(From, {error, peer_restarted}),
+ {restart, S1};
+ _ ->
+ maybe_reply(From, ok),
+ {ok, S1}
+ end;
_ ->
?LOG_NOTICE("Heartbeat Response (SeqNr=~p) was not expected", [SeqNr]),
{{error, unexpected}, S}
diff --git a/src/s1gw_metrics.erl b/src/s1gw_metrics.erl
index 551b821..306b418 100644
--- a/src/s1gw_metrics.erl
+++ b/src/s1gw_metrics.erl
@@ -63,6 +63,7 @@
?S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_ACK,
?S1GW_CTR_PFCP_ASSOC_SETUP_RESP_RX_NACK,
?S1GW_CTR_PFCP_UNEXPECTED_PDU,
+ ?S1GW_CTR_PFCP_PEER_RESTART,
?S1GW_CTR_S1AP_ENB_ALL_RX,
?S1GW_CTR_S1AP_ENB_ALL_RX_UNKNOWN_ENB,
?S1GW_CTR_S1AP_PROXY_EXCEPTION, %% exception(s) occurred

To view, visit change 42490. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-MessageType: newchange
Gerrit-Project: erlang/osmo-s1gw
Gerrit-Branch: master
Gerrit-Change-Id: I00a8384db1ea9c38d0ad9ff90b3d45ad86c3a020
Gerrit-Change-Number: 42490
Gerrit-PatchSet: 1
Gerrit-Owner: fixeria <vyanitskiy@sysmocom.de>