osmith has submitted this change. ( https://gerrit.osmocom.org/c/osmo-ttcn3-hacks/+/38185?usp=email )
Change subject: testenv: get coredump + backtrace on crash
......................................................................
testenv: get coredump + backtrace on crash
If the SUT or another test component crashes, check if a matching coredump was registered in coredumpctl. If that is the case, then copy it into the testdir and print + store the backtrace.
This solves the problem that it is especially tricky to get a good backtrace when a component crashes inside a container. One needs to grab the coredump from the host (where it is usually handled by systemd-coredump; we cannot override /proc/sys/kernel/core_pattern for containers, so the coredump can't be handled inside the container), then copy the coredump into the container, and finally run gdb inside the container (where the proper libraries and debug symbols are) to get the backtrace. This patch automates all of these steps.
Pau requested this feature.
Related: OS#6494
Change-Id: I743c20968bda9b6d6fb9c2d23bef70ee11950761
---
M _testenv/data/podman/Dockerfile
A _testenv/testenv/coredump.py
M _testenv/testenv/daemons.py
M _testenv/testenv/podman.py
4 files changed, 100 insertions(+), 0 deletions(-)
Approvals:
  Jenkins Builder: Verified
  pespin: Looks good to me, approved
diff --git a/_testenv/data/podman/Dockerfile b/_testenv/data/podman/Dockerfile index 95a8c66..42b0635 100644 --- a/_testenv/data/podman/Dockerfile +++ b/_testenv/data/podman/Dockerfile @@ -22,6 +22,7 @@ ccache \ cmake \ flex \ + gdb \ git \ iproute2 \ iputils-ping \ diff --git a/_testenv/testenv/coredump.py b/_testenv/testenv/coredump.py new file mode 100644 index 0000000..0d56328 --- /dev/null +++ b/_testenv/testenv/coredump.py @@ -0,0 +1,89 @@ +# Copyright 2024 sysmocom - s.f.m.c. GmbH +# SPDX-License-Identifier: GPL-3.0-or-later +import datetime +import fnmatch +import json +import logging +import os +import shlex +import shutil +import subprocess +import testenv +import testenv.daemons +import testenv.testdir + +executable_path = None + + +def executable_is_relevant(exe): + if testenv.args.binary_repo: + patterns = [ + "*/usr/bin/open5gs-*", + "*/usr/bin/osmo-*", + ] + + for pattern in patterns: + if fnmatch.fnmatch(exe, pattern): + return True + else: + if exe.startswith(testenv.args.cache): + return True + + return False + + +def get_from_coredumpctl(): + global executable_path + + logging.info("Looking for a coredump") + + if not shutil.which("coredumpctl"): + logging.debug("coredumpctl is not available, won't try to get coredump") + return + + # Check for any coredump within last 3 seconds + since = (datetime.datetime.now() - datetime.timedelta(seconds=3)).strftime("%Y-%m-%d %H:%M:%S") + cmd = ["coredumpctl", "-q", "-S", since, "--json=short", "-n1"] + logging.debug(f"+ {cmd}") + + p = subprocess.run(cmd, capture_output=True, text=True) + if p.returncode != 0: + logging.debug("No coredump found") + return + + # Check if the coredump executable is from osmo-*, open5gs-*, etc. 
+ coredump = json.loads(p.stdout)[0] + if not executable_is_relevant(coredump["exe"]): + logging.debug("Found an unrelated coredump, ignoring") + return + + logging.debug("Coredump found, copying to log dir") + core_path = f"{testenv.testdir.testdir}/core" + testenv.cmd.run( + ["coredumpctl", "dump", "-q", "-S", since, "-o", core_path, str(coredump["pid"]), coredump["exe"]], + stdout=subprocess.DEVNULL, + no_podman=True, + ) + + executable_path = coredump["exe"] + + +def get_backtrace(): + global executable_path + + core_path = f"{testenv.testdir.testdir}/core" + if not executable_path or not os.path.exists(core_path): + return + + logging.info("Running gdb to get a backtrace") + + cmd = "gdb" + cmd += " --batch" + cmd += f" {shlex.quote(executable_path)}" + cmd += f" {shlex.quote(core_path)}" + cmd += " -ex bt" + cmd += f" | tee {shlex.quote(core_path)}.backtrace" + + testenv.cmd.run(cmd) + + executable_path = None diff --git a/_testenv/testenv/daemons.py b/_testenv/testenv/daemons.py index c7e8722..0c9c0dd 100644 --- a/_testenv/testenv/daemons.py +++ b/_testenv/testenv/daemons.py @@ -7,6 +7,7 @@ import shlex import subprocess import testenv +import testenv.coredump import testenv.testdir import time import sys @@ -18,6 +19,8 @@ def init(): global run_shell_on_stop
+ atexit.register(testenv.coredump.get_backtrace) + if not testenv.args.podman: atexit.register(stop) if testenv.args.shell: @@ -111,6 +114,8 @@ return
current_test = testenv.testsuite.get_current_test() + testenv.coredump.get_from_coredumpctl() + if current_test: logging.error(f"{daemon_name} crashed during {current_test}!") testenv.testsuite.wait_until_test_stopped() diff --git a/_testenv/testenv/podman.py b/_testenv/testenv/podman.py index 2668b0d..6d801fa 100644 --- a/_testenv/testenv/podman.py +++ b/_testenv/testenv/podman.py @@ -10,6 +10,7 @@ import subprocess import testenv.cmd import testenv.testdir +import testenv.coredump import time
image_name = None @@ -276,6 +277,10 @@ if not is_running(): return
+ # If we have a coredump, we must get the backtrace by running gdb inside + # the container. So do it before stopping the container. + testenv.coredump.get_backtrace() + if not restart and run_shell_on_stop: logging.info("Running interactive shell before stopping container (--shell)")