neels has uploaded this change for review. (
https://gerrit.osmocom.org/c/osmo-hnbgw/+/36540?usp=email )
Change subject: nft_kpi: retrieve counters in a separate thread
......................................................................
nft_kpi: retrieve counters in a separate thread
Introduce an NFT thread which does:
- periodically run nftables command to read all counters
- parse the response
- update rate_ctr values.
The main thread still runs the rule addition / removal when a HNB
registers or deregisters. See the comment added in nft_kpi.c, starting
with "A more scalable solution...".
This patch requires the new osmo_stats_report_lock(), see 'Depends'.
Related: SYS#6773
Depends: libosmocore Ib335bea7d2a440ca284e6c439066f96456bf2c2d
Change-Id: I9dc54e6bc94c553f45adfa71ae8ad70be4afbc8f
---
M src/osmo-hnbgw/hnbgw.c
M src/osmo-hnbgw/nft_kpi.c
M src/osmo-hnbgw/osmo_hnbgw_main.c
M src/osmo-hnbgw/tdefs.c
4 files changed, 171 insertions(+), 34 deletions(-)
git pull ssh://gerrit.osmocom.org:29418/osmo-hnbgw refs/changes/40/36540/1
diff --git a/src/osmo-hnbgw/hnbgw.c b/src/osmo-hnbgw/hnbgw.c
index 6340d10..fa516ea 100644
--- a/src/osmo-hnbgw/hnbgw.c
+++ b/src/osmo-hnbgw/hnbgw.c
@@ -513,6 +513,12 @@
if (!hnbp)
return NULL;
+ /* nft_kpi.c updates hnb stats, and it locks osmo_stats_report_lock() while it does so.
This here should run
+ * in the same thread as stats reporting, so there should be no conflict with stats. But
to avoid a hnb in flux
+ * while nft_kpi.c looks up hnb, also obtain the osmo_stats_report_lock() while
manipulating hnb records. */
+ osmo_stats_report_lock(true);
+ /* { */
+
hnbp->id = *id;
hnbp->id_str = talloc_strdup(hnbp, umts_cell_id_name(id));
hnbp->ctrs = rate_ctr_group_alloc(hnbp, &hnb_ctrg_desc, 0);
@@ -525,14 +531,21 @@
osmo_stat_item_group_set_name(hnbp->statg, hnbp->id_str);
llist_add(&hnbp->list, &g_hnbgw->hnb_persistent_list);
+ /* success */
+ goto out_unlock;
- return hnbp;
-
+ /* failure */
out_free_ctrs:
rate_ctr_group_free(hnbp->ctrs);
out_free:
talloc_free(hnbp);
- return NULL;
+ hnbp = NULL;
+
+ /* for both success and failure: */
+out_unlock:
+ /* } */
+ osmo_stats_report_lock(false);
+ return hnbp;
}
struct hnb_persistent *hnb_persistent_find_by_id(const struct umts_cell_id *id)
diff --git a/src/osmo-hnbgw/nft_kpi.c b/src/osmo-hnbgw/nft_kpi.c
index f585415..039c891 100644
--- a/src/osmo-hnbgw/nft_kpi.c
+++ b/src/osmo-hnbgw/nft_kpi.c
@@ -15,7 +15,11 @@
* GNU Affero General Public License for more details.
*/
+#include <pthread.h>
+#include <unistd.h>
+
#include <osmocom/core/logging.h>
+#include <osmocom/core/stats.h>
#include <osmocom/hnbgw/hnbgw.h>
#include "config.h"
@@ -46,13 +50,68 @@
#include <osmocom/hnbgw/nft_kpi.h>
#include <osmocom/hnbgw/tdefs.h>
+/* Threading and locks in this implementation:
+ *
+ * - osmo_stats_report_lock() held while updating rate_ctr from nft results.
+ * - g_nft_kpi_state.lock held while running an nftables command buffer.
+ *
+ * contrived example:
+ *
+ * Main Thread
+ * |
+ * osmo_stats_report_use_lock(true)
+ * |
+ * nft_kpi_init()
+ * create nft ctx, create table
+ * |
+ * +---------------------------> NFT thread
+ * | |
+ * | sleep(X34)
+ * | |
+ * | LOCK(g_nft_kpi_state.lock)
+ * | |
+ * osmo_stats_report() query all nft counters
+ * LOCK(osmo_stats_report_lock) |
+ * | LOCK(osmo_stats_report_lock)
+ * collect stats : wait because libosmocore stats
reporting is busy
+ * | :
+ * UNLOCK(osmo_stats_report_lock) LOCK------|
+ * send out stats for all hnbp:
rate_ctr_add2()
+ * | |
+ * | UNLOCK(osmo_stats_report_lock)
+ * | |
+ * | UNLOCK(g_nft_kpi_state.lock)
+ * | |
+ * hnbgw_rx_hnb_register_req() sleep(X34)
+ * hnb_nft_kpi_start() |
+ * LOCK(g_nft_kpi_state.lock) ...
+ * |
+ * nftables: add new rule
+ * |
+ * UNLOCK(g_nft_kpi_state.lock)
+ * |
+ * ...
+ *
+ * So the NFT thread only retrieves counters. The main thread adds and removes NFT rules
for counters. It is possible
+ * that a HNBAP HNB Register or HNB De-Register occurrs while the NFT thread holds the
g_nft_kpi_state.lock, so that the
+ * main thread blocks until the NFT thread is done reading counters. Note, this happens
only for HNB attach or detach.
+ *
+ * A more scalable solution is to move all NFT interaction to the thread. Instead of
submitting rules from the main
+ * thread, we could submit instructions to an inter-thread queue that the NFT thread
works off. This would add
+ * considerable complexity -- for now we accept the possible but rarely occurring short
delay for HNB de/registration.
+ */
+
struct nft_kpi_state {
+ /* lock this while modifying g_nft_kpi_state */
+ pthread_mutex_t lock;
+
struct {
struct nft_ctx *nft_ctx;
char *table_name;
bool table_created;
} nft;
- struct osmo_timer_list period;
+
+ pthread_t thread;
};
static struct nft_kpi_state g_nft_kpi_state = {};
@@ -98,15 +157,22 @@
return 0;
}
-static void nft_kpi_period_cb(void *data);
+static void nft_kpi_period_cb(void);
-static void nft_kpi_period_schedule(void)
+void *nft_kpi_thread(void *arg)
{
- unsigned long period = osmo_tdef_get(hnbgw_T_defs, -34, OSMO_TDEF_S, 10);
- if (period < 1)
- period = 1;
- osmo_timer_setup(&g_nft_kpi_state.period, nft_kpi_period_cb, NULL);
- osmo_timer_schedule(&g_nft_kpi_state.period, period, 0);
+ OSMO_ASSERT(osmo_ctx_init(__func__) == 0);
+ while (1) {
+ /* Let's just hope that the unsigned long in the hnbgw_T_defs is not modified
non-atomically while
+ * reading the timeout value. */
+ unsigned long period = osmo_tdef_get(hnbgw_T_defs, -34, OSMO_TDEF_US, 1000000);
+ if (period < 1)
+ period = 1;
+ usleep(period);
+
+ nft_kpi_period_cb();
+ }
+ return NULL;
}
int nft_kpi_init(const char *table_name)
@@ -145,7 +211,11 @@
return -EIO;
s->nft.table_created = true;
- nft_kpi_period_schedule();
+
+ /* Up to here, it was fine to dispatch nft without locking, because this is the
initialization from the main
+ * thread. From now on, whoever wants to use the g_nft_ctx must lock this mutex first.
*/
+ pthread_mutex_init(&s->lock, NULL);
+ pthread_create(&s->thread, NULL, nft_kpi_thread, NULL);
return 0;
}
@@ -163,14 +233,19 @@
return -EINVAL;
}
+ /* Manipulating nft state, obtain lock */
+ pthread_mutex_lock(&s->lock);
+ /* { */
+
if (!osmo_sockaddr_str_cmp(gtpu_remote, &hnbp->nft_kpi.addr_remote)) {
/* The remote address is unchanged, no need to update the nft probe */
- return 0;
+ rc = 0;
+ goto unlock_return;
}
/* When there is no table created, it means nft is disabled. Do not attempt to set up
counters. */
if (!s->nft.table_created)
- return 0;
+ goto unlock_return;
/* The remote address has changed. Cancel previous probe, if any, and start a new one.
*/
if (osmo_sockaddr_str_is_nonzero(&hnbp->nft_kpi.addr_remote))
@@ -196,6 +271,10 @@
/* There was an error running the rule, clear addr_remote to indicate that no rule
exists. */
hnbp->nft_kpi.addr_remote = (struct osmo_sockaddr_str){};
}
+
+unlock_return:
+ /* } */
+ pthread_mutex_unlock(&s->lock);
return rc;
}
@@ -206,15 +285,19 @@
{
struct nft_kpi_state *s = &g_nft_kpi_state;
char *cmd;
+ int rc = 0;
/* When there is no table created, neither can there be any rules to be deleted.
* The rules get removed, but the table remains present for as long as osmo-hnbgw runs.
*/
if (!s->nft.table_created)
return 0;
+ pthread_mutex_lock(&s->lock);
+ /* { */
+
/* presence of addr_remote indicates whether an nft rule has been submitted and still
needs to be removed */
if (!osmo_sockaddr_str_is_nonzero(&hnbp->nft_kpi.addr_remote))
- return 0;
+ goto unlock_return;
if (!hnbp->nft_kpi.last.ul.handle_present
|| !hnbp->nft_kpi.last.dl.handle_present) {
@@ -235,7 +318,12 @@
hnbp->nft_kpi.last.ul.handle,
s->nft.table_name,
hnbp->nft_kpi.last.dl.handle);
- return nft_run_now(cmd);
+ rc = nft_run_now(cmd);
+
+unlock_return:
+ /* } */
+ pthread_mutex_unlock(&s->lock);
+ return rc;
}
static void update_ctr(struct rate_ctr_group *cg, int cgidx, uint64_t *last_val, uint64_t
new_val)
@@ -383,6 +471,7 @@
LOGP(DNFT, LOGL_DEBUG, "read %d counters from nft table %s\n", count,
s->nft.table_name);
}
+/* The caller must hold the g_nft_kpi_state.lock! */
static void nft_kpi_read_counters(void)
{
int rc;
@@ -399,6 +488,12 @@
OSMO_STRBUF_PRINTF(sb, "list table inet %s", s->nft.table_name);
OSMO_ASSERT(sb.chars_needed < sizeof(cmd));
+ size_t l = strlen(cmd);
+ LOGP(DNFT, LOGL_DEBUG, "running nft request, %zu chars:
\"%s%s\"\n",
+ l,
+ osmo_escape_cstr_c(OTC_SELECT, cmd, OSMO_MIN(logmax, l)),
+ l > logmax ? "..." : "");
+
rc = nft_ctx_buffer_output(nft);
if (rc) {
LOGP(DNFT, LOGL_ERROR, "error: nft_ctx_buffer_output() returned failure: rc=%d
cmd=%s\n",
@@ -413,29 +508,29 @@
}
output = nft_ctx_get_output_buffer(nft);
- if (log_check_level(DNFT, LOGL_DEBUG)) {
- size_t l = strlen(cmd);
- LOGP(DNFT, LOGL_DEBUG, "ran nft request, %zu chars: \"%s%s\"\n",
- l,
- osmo_escape_cstr_c(OTC_SELECT, cmd, OSMO_MIN(logmax, l)),
- l > logmax ? "..." : "");
- l = strlen(output);
- LOGP(DNFT, LOGL_DEBUG, "got nft response, %zu chars: \"%s%s\"\n",
- l,
- osmo_escape_cstr_c(OTC_SELECT, output, OSMO_MIN(logmax, l)),
- l > logmax ? "..." : "");
- }
+ l = strlen(output);
+ LOGP(DNFT, LOGL_DEBUG, "got nft response, %zu chars: \"%s%s\"\n",
+ l,
+ osmo_escape_cstr_c(OTC_SELECT, output, OSMO_MIN(logmax, l)),
+ l > logmax ? "..." : "");
+ osmo_stats_report_lock(true);
+ /* { */
decode_nft_response(output);
+ /* } */
+ osmo_stats_report_lock(false);
unbuffer_and_exit:
nft_ctx_unbuffer_output(nft);
}
-static void nft_kpi_period_cb(void *data)
+static void nft_kpi_period_cb(void)
{
+ pthread_mutex_lock(&g_nft_kpi_state.lock);
+ /* { */
nft_kpi_read_counters();
- nft_kpi_period_schedule();
+ /* } */
+ pthread_mutex_unlock(&g_nft_kpi_state.lock);
}
#endif // ENABLE_NFTABLES
diff --git a/src/osmo-hnbgw/osmo_hnbgw_main.c b/src/osmo-hnbgw/osmo_hnbgw_main.c
index 1052d18..36180ef 100644
--- a/src/osmo-hnbgw/osmo_hnbgw_main.c
+++ b/src/osmo-hnbgw/osmo_hnbgw_main.c
@@ -328,16 +328,23 @@
/* If UPF is configured, set up PFCP socket and send Association Setup Request to UPF
*/
hnbgw_pfcp_init();
#endif
-#if ENABLE_NFTABLES
+
/* If nftables is enabled, initialize the nft table now or fail startup. This is
important to immediately let
* the user know if cap_net_admin privileges are missing, and not only when the first
hNodeB connects. */
if (g_hnbgw->config.nft_kpi.enable) {
+#if ENABLE_NFTABLES
if (nft_kpi_init(g_hnbgw->config.nft_kpi.table_name)) {
- perror("Failed to initialize nft KPI, probably missing cap_net_admin");
+ perror("ERROR: Failed to initialize nft KPI, probably missing
cap_net_admin");
exit(1);
}
- }
+ /* nft_kpi.c manipulates rate_ctr state directly. Enable the mutex lock around stats
reporting, so
+ * nft_kpi.c can make use of it. */
+ osmo_stats_report_use_lock(true);
+#else
+ fprintf(stderr, "ERROR: Cannot enable nft KPI, this binary was built without
nftables support\n");
+ exit(1);
#endif
+ }
hnbgw_cnpool_start(&g_hnbgw->sccp.cnpool_iucs);
hnbgw_cnpool_start(&g_hnbgw->sccp.cnpool_iups);
diff --git a/src/osmo-hnbgw/tdefs.c b/src/osmo-hnbgw/tdefs.c
index 3a7f0bc..cfcd4c2 100644
--- a/src/osmo-hnbgw/tdefs.c
+++ b/src/osmo-hnbgw/tdefs.c
@@ -35,8 +35,8 @@
{.T = 3113, .default_val = 15, .desc = "Time to keep Paging record, for CN pools
with more than one link" },
{.T = 4, .default_val = 5, .desc = "Timeout to receive RANAP RESET ACKNOWLEDGE from
an MSC/SGSN" },
{.T = -31, .default_val = 15, .desc = "Timeout for establishing and releasing
context maps (RUA <-> SCCP)" },
+ {.T = -34, .default_val = 1000, .unit = OSMO_TDEF_MS, .desc = "Period to query
network traffic stats from netfilter" },
{.T = -1002, .default_val = 10, .desc = "Timeout for the HNB to respond to PS RAB
Assignment Request" },
- {.T = -34, .default_val = 1, .desc = "Period to query network traffic stats from
netfilter" },
{ }
};
--
To view, visit
https://gerrit.osmocom.org/c/osmo-hnbgw/+/36540?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gerrit.osmocom.org/settings
Gerrit-Project: osmo-hnbgw
Gerrit-Branch: master
Gerrit-Change-Id: I9dc54e6bc94c553f45adfa71ae8ad70be4afbc8f
Gerrit-Change-Number: 36540
Gerrit-PatchSet: 1
Gerrit-Owner: neels <nhofmeyr(a)sysmocom.de>
Gerrit-MessageType: newchange