fixeria has uploaded this change for review. ( https://gerrit.osmocom.org/c/erlang/osmo-s1gw/+/42489?usp=email )
Change subject: pfcp_peer: add configurable heartbeat miss count + association reset ......................................................................
pfcp_peer: add configurable heartbeat miss count + association reset
Introduce a heartbeat_miss_count configuration parameter (default: 3): the number of consecutive unanswered PFCP Heartbeat Requests after which the association is reset. A per-peer counter of missed heartbeats is incremented on every Heartbeat Request timeout, for both periodic heartbeats and those triggered via the REST API, and is reset to zero on any successful Heartbeat Response.
Change-Id: I8cb0fb23468aba4dead9865a90d893c78c6ae074 --- M config/sys.config M contrib/openapi.yaml M doc/manuals/chapters/configuration.adoc M doc/manuals/chapters/rest.adoc M include/osmo_s1gw.hrl M priv/openapi.json M src/osmo_s1gw_sup.erl M src/pfcp_peer.erl M src/rest_server.erl 9 files changed, 53 insertions(+), 13 deletions(-)
git pull ssh://gerrit.osmocom.org:29418/erlang/osmo-s1gw refs/changes/89/42489/1
diff --git a/config/sys.config b/config/sys.config index ac4a428..aa784b4 100644 --- a/config/sys.config +++ b/config/sys.config @@ -37,7 +37,9 @@ %% optional: PFCP Heartbeat Request timeout in milliseconds (default: 2000) %% heartbeat_req_timeout => 2000, %% optional: periodic PFCP heartbeat interval in milliseconds (default: 0, disabled) - %% heartbeat_interval => 0 + %% heartbeat_interval => 0, + %% optional: missed heartbeat responses before resetting the association (default: 3) + %% heartbeat_miss_count => 3 }} %% Optional PFCP Network Instance IEs (omitted if not configured) %% {pfcp_net_inst_core, "core-side"}, %% PFCP Network Instance IE value (to core) diff --git a/contrib/openapi.yaml b/contrib/openapi.yaml index c9d668a..523c143 100644 --- a/contrib/openapi.yaml +++ b/contrib/openapi.yaml @@ -538,6 +538,9 @@ heartbeat_interval: type: integer description: Periodic PFCP heartbeat interval in milliseconds (0 = disabled) + heartbeat_miss_count: + type: integer + description: Missed heartbeat responses before resetting the PFCP association
GtpuKpiCfg: type: object diff --git a/doc/manuals/chapters/configuration.adoc b/doc/manuals/chapters/configuration.adoc index a8ea114..9c31985 100644 --- a/doc/manuals/chapters/configuration.adoc +++ b/doc/manuals/chapters/configuration.adoc @@ -167,6 +167,12 @@ by OsmoS1GW while associated with the UPF. Set to `0` to disable periodic heartbeats entirely. Default: `0`.
+`heartbeat_miss_count`:: + Number of consecutive unanswered Heartbeat Requests after which OsmoS1GW + considers the UPF unreachable and resets the PFCP association (transitions + back to the connecting state). Applies to both periodic heartbeats and + those triggered via the REST API. Default: `3`. + NOTE: The legacy flat keys `pfcp_loc_addr` and `pfcp_rem_addr` are still accepted for backwards compatibility. The `pfcp_peer` map takes priority if both are present. diff --git a/doc/manuals/chapters/rest.adoc b/doc/manuals/chapters/rest.adoc index 2a5dcbf..a20cf28 100644 --- a/doc/manuals/chapters/rest.adoc +++ b/doc/manuals/chapters/rest.adoc @@ -81,7 +81,8 @@ "raddr": "127.0.1.2", "assoc_setup_timeout": 2000, "heartbeat_req_timeout": 2000, - "heartbeat_interval": 0 + "heartbeat_interval": 0, + "heartbeat_miss_count": 3 }, "gtpu_kpi": { "enable": false, @@ -113,6 +114,7 @@ `assoc_setup_timeout`::: PFCP Association Setup response timeout in milliseconds. `heartbeat_req_timeout`::: PFCP Heartbeat Request response timeout in milliseconds. `heartbeat_interval`::: Periodic PFCP heartbeat interval in milliseconds (0 = disabled). + `heartbeat_miss_count`::: Consecutive unanswered heartbeats before resetting the association.
`gtpu_kpi`:: `enable`::: Whether GTP-U KPI reporting via nftables counters is active. diff --git a/include/osmo_s1gw.hrl b/include/osmo_s1gw.hrl index 95dabfc..3b3f290 100644 --- a/include/osmo_s1gw.hrl +++ b/include/osmo_s1gw.hrl @@ -44,6 +44,7 @@ -define(ENV_DEFAULT_PFCP_ASSOC_SETUP_TIMEOUT, 2000). -define(ENV_DEFAULT_PFCP_HEARTBEAT_REQ_TIMEOUT, 2000). -define(ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL, 0). %% disabled +-define(ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT, 3). -define(ENV_DEFAULT_GTPU_KPI_ENABLE, false). -define(ENV_DEFAULT_GTPU_KPI_TABLE_NAME, "osmo-s1gw"). -define(ENV_DEFAULT_GTPU_KPI_INTERVAL, 3000). diff --git a/priv/openapi.json b/priv/openapi.json index bf5aaf2..67a83db 100644 --- a/priv/openapi.json +++ b/priv/openapi.json @@ -775,6 +775,10 @@ "heartbeat_interval": { "type": "integer", "description": "Periodic PFCP heartbeat interval in milliseconds (0 = disabled)" + }, + "heartbeat_miss_count": { + "type": "integer", + "description": "Missed heartbeat responses before resetting the PFCP association" } } }, diff --git a/src/osmo_s1gw_sup.erl b/src/osmo_s1gw_sup.erl index 832d30f..fc1c054 100644 --- a/src/osmo_s1gw_sup.erl +++ b/src/osmo_s1gw_sup.erl @@ -165,7 +165,9 @@ heartbeat_req_timeout => maps:get(heartbeat_req_timeout, Cfg, ?ENV_DEFAULT_PFCP_HEARTBEAT_REQ_TIMEOUT), heartbeat_interval => maps:get(heartbeat_interval, Cfg, - ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL)}. + ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL), + heartbeat_miss_count => maps:get(heartbeat_miss_count, Cfg, + ?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT)}.
-spec gtpu_kpi_cfg() -> gtpu_kpi:cfg(). diff --git a/src/pfcp_peer.erl b/src/pfcp_peer.erl index 9dd25ea..34c570c 100644 --- a/src/pfcp_peer.erl +++ b/src/pfcp_peer.erl @@ -78,7 +78,8 @@ raddr := string() | inet:ip_address(), assoc_setup_timeout => pos_integer(), heartbeat_req_timeout => pos_integer(), - heartbeat_interval => non_neg_integer()}. + heartbeat_interval => non_neg_integer(), + heartbeat_miss_count => pos_integer()}.
-type peer_info() :: #{state := atom(), laddr := inet:ip_address(), @@ -110,7 +111,8 @@ rem_rts :: undefined | pos_integer(), seq_nr :: pfcp_seq_nr(), registry :: dict:dict(), - heartbeat :: undefined | #heartbeat_state{} + heartbeat :: undefined | #heartbeat_state{}, + hb_missed :: non_neg_integer() }).
-type peer_state() :: #peer_state{}. @@ -200,7 +202,8 @@ rem_addr = RemAddr, seq_nr = 0, loc_rts = get_recovery_timestamp(), - registry = dict:new()}}. + registry = dict:new(), + hb_missed = 0}}.
callback_mode() -> @@ -276,10 +279,10 @@ ?LOG_INFO("State change: ~p -> ~p", [OldState, ?FUNCTION_NAME]), s1gw_metrics:gauge_set(?S1GW_GAUGE_PFCP_ASSOCIATED, 1), case maps:get(heartbeat_interval, Cfg, ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL) of - 0 -> {keep_state, S}; %% periodic heartbeat is disabled + 0 -> {keep_state, S#peer_state{hb_missed = 0}}; %% periodic heartbeat is disabled Interval -> ?LOG_INFO("Starting periodic PFCP heartbeat (interval=~p ms)", [Interval]), - {keep_state, S, [{{timeout, hb_timer}, Interval, heartbeat}]} + {keep_state, S#peer_state{hb_missed = 0}, [{{timeout, hb_timer}, Interval, heartbeat}]} end;
%% Periodic heartbeat timer @@ -370,9 +373,11 @@ end;
%% Heartbeat Req (timeout) -handle_event(_State, +handle_event(State, info, {timeout, TRef, heartbeat_request_watchdog}, - #peer_state{heartbeat = HB} = S) -> + #peer_state{cfg = Cfg, + heartbeat = HB, + hb_missed = Missed} = S) -> case HB of #heartbeat_state{from = From, seq_nr = SeqNr, @@ -385,7 +390,20 @@ true -> ok end, - {keep_state, S#peer_state{heartbeat = undefined}}; + S1 = S#peer_state{heartbeat = undefined}, + MaxMiss = maps:get(heartbeat_miss_count, Cfg, + ?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT), + if + State =:= connected, Missed + 1 >= MaxMiss -> + ?LOG_ERROR("UPF not responding to heartbeats (~p/~p), " + "resetting association", [Missed + 1, MaxMiss]), + {next_state, connecting, + S1#peer_state{rem_rts = undefined, hb_missed = 0}}; + State =:= connected -> + {keep_state, S1#peer_state{hb_missed = Missed + 1}}; + true -> + {keep_state, S1} + end; _ -> {keep_state, S} end; @@ -631,7 +649,7 @@ true -> ok end, - {ok, S#peer_state{heartbeat = undefined}}; + {ok, S#peer_state{heartbeat = undefined, hb_missed = 0}}; _ -> ?LOG_NOTICE("Heartbeat Response (SeqNr=~p) was not expected", [SeqNr]), {{error, unexpected}, S} diff --git a/src/rest_server.erl b/src/rest_server.erl index bff37fc..4b051e0 100644 --- a/src/rest_server.erl +++ b/src/rest_server.erl @@ -287,7 +287,9 @@ <<"heartbeat_req_timeout">> => maps:get(heartbeat_req_timeout, Cfg, ?ENV_DEFAULT_PFCP_HEARTBEAT_REQ_TIMEOUT), <<"heartbeat_interval">> => maps:get(heartbeat_interval, Cfg, - ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL)}. + ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL), + <<"heartbeat_miss_count">> => maps:get(heartbeat_miss_count, Cfg, + ?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT)}.
-spec config_gtpu_kpi() -> map().