fixeria has uploaded this change for review.

View Change

pfcp_peer: add configurable heartbeat miss count + association reset

Introduce a heartbeat_miss_count configuration parameter (default: 3):
the number of consecutive unanswered PFCP Heartbeat Requests after
which the association is reset. Missed heartbeats are counted for both
periodic heartbeats and those triggered via the REST API, and the
counter resets to zero on any successful Heartbeat Response.

Change-Id: I8cb0fb23468aba4dead9865a90d893c78c6ae074
---
M config/sys.config
M contrib/openapi.yaml
M doc/manuals/chapters/configuration.adoc
M doc/manuals/chapters/rest.adoc
M include/osmo_s1gw.hrl
M priv/openapi.json
M src/osmo_s1gw_sup.erl
M src/pfcp_peer.erl
M src/rest_server.erl
9 files changed, 53 insertions(+), 13 deletions(-)

git pull ssh://gerrit.osmocom.org:29418/erlang/osmo-s1gw refs/changes/89/42489/1
diff --git a/config/sys.config b/config/sys.config
index ac4a428..aa784b4 100644
--- a/config/sys.config
+++ b/config/sys.config
@@ -37,7 +37,9 @@
%% optional: PFCP Heartbeat Request timeout in milliseconds (default: 2000)
%% heartbeat_req_timeout => 2000,
%% optional: periodic PFCP heartbeat interval in milliseconds (default: 0, disabled)
- %% heartbeat_interval => 0
+ %% heartbeat_interval => 0,
+ %% optional: number of consecutive missed heartbeat responses before resetting the association (default: 3)
+ %% heartbeat_miss_count => 3
}}
%% Optional PFCP Network Instance IEs (omitted if not configured)
%% {pfcp_net_inst_core, "core-side"}, %% PFCP Network Instance IE value (to core)
diff --git a/contrib/openapi.yaml b/contrib/openapi.yaml
index c9d668a..523c143 100644
--- a/contrib/openapi.yaml
+++ b/contrib/openapi.yaml
@@ -538,6 +538,9 @@
heartbeat_interval:
type: integer
description: Periodic PFCP heartbeat interval in milliseconds (0 = disabled)
+ heartbeat_miss_count:
+ type: integer
+ description: Missed heartbeat responses before resetting the PFCP association

GtpuKpiCfg:
type: object
diff --git a/doc/manuals/chapters/configuration.adoc b/doc/manuals/chapters/configuration.adoc
index a8ea114..9c31985 100644
--- a/doc/manuals/chapters/configuration.adoc
+++ b/doc/manuals/chapters/configuration.adoc
@@ -167,6 +167,12 @@
by OsmoS1GW while associated with the UPF. Set to `0` to disable periodic
heartbeats entirely. Default: `0`.

+`heartbeat_miss_count`::
+ Number of consecutive unanswered Heartbeat Requests after which OsmoS1GW
+ considers the UPF unreachable and resets the PFCP association (transitions
+ back to the connecting state). Applies to both periodic heartbeats and
+ those triggered via the REST API. Default: `3`.
+
NOTE: The legacy flat keys `pfcp_loc_addr` and `pfcp_rem_addr` are still
accepted for backwards compatibility. The `pfcp_peer` map takes priority
if both are present.
diff --git a/doc/manuals/chapters/rest.adoc b/doc/manuals/chapters/rest.adoc
index 2a5dcbf..a20cf28 100644
--- a/doc/manuals/chapters/rest.adoc
+++ b/doc/manuals/chapters/rest.adoc
@@ -81,7 +81,8 @@
"raddr": "127.0.1.2",
"assoc_setup_timeout": 2000,
"heartbeat_req_timeout": 2000,
- "heartbeat_interval": 0
+ "heartbeat_interval": 0,
+ "heartbeat_miss_count": 3
},
"gtpu_kpi": {
"enable": false,
@@ -113,6 +114,7 @@
`assoc_setup_timeout`::: PFCP Association Setup response timeout in milliseconds.
`heartbeat_req_timeout`::: PFCP Heartbeat Request response timeout in milliseconds.
`heartbeat_interval`::: Periodic PFCP heartbeat interval in milliseconds (0 = disabled).
+ `heartbeat_miss_count`::: Consecutive unanswered heartbeats before resetting the association.

`gtpu_kpi`::
`enable`::: Whether GTP-U KPI reporting via nftables counters is active.
diff --git a/include/osmo_s1gw.hrl b/include/osmo_s1gw.hrl
index 95dabfc..3b3f290 100644
--- a/include/osmo_s1gw.hrl
+++ b/include/osmo_s1gw.hrl
@@ -44,6 +44,7 @@
-define(ENV_DEFAULT_PFCP_ASSOC_SETUP_TIMEOUT, 2000).
-define(ENV_DEFAULT_PFCP_HEARTBEAT_REQ_TIMEOUT, 2000).
-define(ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL, 0). %% disabled
+-define(ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT, 3).
-define(ENV_DEFAULT_GTPU_KPI_ENABLE, false).
-define(ENV_DEFAULT_GTPU_KPI_TABLE_NAME, "osmo-s1gw").
-define(ENV_DEFAULT_GTPU_KPI_INTERVAL, 3000).
diff --git a/priv/openapi.json b/priv/openapi.json
index bf5aaf2..67a83db 100644
--- a/priv/openapi.json
+++ b/priv/openapi.json
@@ -775,6 +775,10 @@
"heartbeat_interval": {
"type": "integer",
"description": "Periodic PFCP heartbeat interval in milliseconds (0 = disabled)"
+ },
+ "heartbeat_miss_count": {
+ "type": "integer",
+ "description": "Missed heartbeat responses before resetting the PFCP association"
}
}
},
diff --git a/src/osmo_s1gw_sup.erl b/src/osmo_s1gw_sup.erl
index 832d30f..fc1c054 100644
--- a/src/osmo_s1gw_sup.erl
+++ b/src/osmo_s1gw_sup.erl
@@ -165,7 +165,9 @@
heartbeat_req_timeout => maps:get(heartbeat_req_timeout, Cfg,
?ENV_DEFAULT_PFCP_HEARTBEAT_REQ_TIMEOUT),
heartbeat_interval => maps:get(heartbeat_interval, Cfg,
- ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL)}.
+ ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL),
+ heartbeat_miss_count => maps:get(heartbeat_miss_count, Cfg,
+ ?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT)}.


-spec gtpu_kpi_cfg() -> gtpu_kpi:cfg().
diff --git a/src/pfcp_peer.erl b/src/pfcp_peer.erl
index 9dd25ea..34c570c 100644
--- a/src/pfcp_peer.erl
+++ b/src/pfcp_peer.erl
@@ -78,7 +78,8 @@
raddr := string() | inet:ip_address(),
assoc_setup_timeout => pos_integer(),
heartbeat_req_timeout => pos_integer(),
- heartbeat_interval => non_neg_integer()}.
+ heartbeat_interval => non_neg_integer(),
+ heartbeat_miss_count => pos_integer()}.

-type peer_info() :: #{state := atom(),
laddr := inet:ip_address(),
@@ -110,7 +111,8 @@
rem_rts :: undefined | pos_integer(),
seq_nr :: pfcp_seq_nr(),
registry :: dict:dict(),
- heartbeat :: undefined | #heartbeat_state{}
+ heartbeat :: undefined | #heartbeat_state{},
+ hb_missed :: non_neg_integer()
}).

-type peer_state() :: #peer_state{}.
@@ -200,7 +202,8 @@
rem_addr = RemAddr,
seq_nr = 0,
loc_rts = get_recovery_timestamp(),
- registry = dict:new()}}.
+ registry = dict:new(),
+ hb_missed = 0}}.


callback_mode() ->
@@ -276,10 +279,10 @@
?LOG_INFO("State change: ~p -> ~p", [OldState, ?FUNCTION_NAME]),
s1gw_metrics:gauge_set(?S1GW_GAUGE_PFCP_ASSOCIATED, 1),
case maps:get(heartbeat_interval, Cfg, ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL) of
- 0 -> {keep_state, S}; %% periodic heartbeat is disabled
+ 0 -> {keep_state, S#peer_state{hb_missed = 0}}; %% periodic heartbeat is disabled
Interval ->
?LOG_INFO("Starting periodic PFCP heartbeat (interval=~p ms)", [Interval]),
- {keep_state, S, [{{timeout, hb_timer}, Interval, heartbeat}]}
+ {keep_state, S#peer_state{hb_missed = 0}, [{{timeout, hb_timer}, Interval, heartbeat}]}
end;

%% Periodic heartbeat timer
@@ -370,9 +373,11 @@
end;

%% Heartbeat Req (timeout)
-handle_event(_State,
+handle_event(State,
info, {timeout, TRef, heartbeat_request_watchdog},
- #peer_state{heartbeat = HB} = S) ->
+ #peer_state{cfg = Cfg,
+ heartbeat = HB,
+ hb_missed = Missed} = S) ->
case HB of
#heartbeat_state{from = From,
seq_nr = SeqNr,
@@ -385,7 +390,20 @@
true ->
ok
end,
- {keep_state, S#peer_state{heartbeat = undefined}};
+ S1 = S#peer_state{heartbeat = undefined},
+ MaxMiss = maps:get(heartbeat_miss_count, Cfg,
+ ?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT),
+ if
+ State =:= connected, Missed + 1 >= MaxMiss ->
+ ?LOG_ERROR("UPF not responding to heartbeats (~p/~p), "
+ "resetting association", [Missed + 1, MaxMiss]),
+ {next_state, connecting,
+ S1#peer_state{rem_rts = undefined, hb_missed = 0}};
+ State =:= connected ->
+ {keep_state, S1#peer_state{hb_missed = Missed + 1}};
+ true ->
+ {keep_state, S1}
+ end;
_ ->
{keep_state, S}
end;
@@ -631,7 +649,7 @@
true ->
ok
end,
- {ok, S#peer_state{heartbeat = undefined}};
+ {ok, S#peer_state{heartbeat = undefined, hb_missed = 0}};
_ ->
?LOG_NOTICE("Heartbeat Response (SeqNr=~p) was not expected", [SeqNr]),
{{error, unexpected}, S}
diff --git a/src/rest_server.erl b/src/rest_server.erl
index bff37fc..4b051e0 100644
--- a/src/rest_server.erl
+++ b/src/rest_server.erl
@@ -287,7 +287,9 @@
<<"heartbeat_req_timeout">> => maps:get(heartbeat_req_timeout, Cfg,
?ENV_DEFAULT_PFCP_HEARTBEAT_REQ_TIMEOUT),
<<"heartbeat_interval">> => maps:get(heartbeat_interval, Cfg,
- ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL)}.
+ ?ENV_DEFAULT_PFCP_HEARTBEAT_INTERVAL),
+ <<"heartbeat_miss_count">> => maps:get(heartbeat_miss_count, Cfg,
+ ?ENV_DEFAULT_PFCP_HEARTBEAT_MISS_COUNT)}.


-spec config_gtpu_kpi() -> map().

To view, visit change 42489. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-MessageType: newchange
Gerrit-Project: erlang/osmo-s1gw
Gerrit-Branch: master
Gerrit-Change-Id: I8cb0fb23468aba4dead9865a90d893c78c6ae074
Gerrit-Change-Number: 42489
Gerrit-PatchSet: 1
Gerrit-Owner: fixeria <vyanitskiy@sysmocom.de>