Change in libosmocore[master]: add contrib/struct_endianess.py

historical

Neels Hofmeyr has uploaded this change for review. ( https://gerrit.osmocom.org/11786 )


Change subject: add contrib/struct_endianess.py
......................................................................

add contrib/struct_endianess.py

In libosmocore (and likely elsewhere) we have scores of packed structs with
sub-byte integer members that lack the necessary member reversal shims to be
able to work on big endian architectures.

Instead of manually editing each one of them and probably introducing errors in
the process, this script handles the change automatically, and in the future
allows us to verify correctness in gerrit verifications.

Change-Id: I8e75b17d8071c7b3a2a171ba776fb76854b28a53
---
A contrib/struct_endianess.py
1 file changed, 337 insertions(+), 0 deletions(-)



  git pull ssh://gerrit.osmocom.org:29418/libosmocore refs/changes/86/11786/1

diff --git a/contrib/struct_endianess.py b/contrib/struct_endianess.py
new file mode 100755
index 0000000..fbff10f
--- /dev/null
+++ b/contrib/struct_endianess.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python3
+
+'''Using mad regexes, automatically make sure that all structs with sub-byte
+integers have matching big-endian definitions. The idea is to save a lot of
+manual effort, and to automatically verify that there are no errors.
+This script most certainly has numerous holes and shortcomings, but actually,
+if you hit problems with it, rather adjust your coding style so that this
+script can deal with it...'''
+
+import re
+import sys
+import codecs
+import os.path
+
+# Opening and closing lines of a top-level 'struct foo {' and of an anonymous nested 'struct {'.
+re_struct_start = re.compile(r'^struct\s*[a-zA-Z_][a-zA-Z_0-9]*\s*{\s*$')
+re_struct_end = re.compile(r'^}[^;]*;\s*$')
+re_substruct_start = re.compile(r'^\s+struct\s*{\s*$')
+re_substruct_end = re.compile(r'^\s+}\s*[a-zA-Z_][a-zA-Z_0-9]*\s*;\s*$')
+# One integer definition (type words, then the members up to ';') and its individual members.
+re_int_def = re.compile(r'(^\s*((const|unsigned|signed|char|int|long|int[0-9]+_t|uint[0-9]+_t)\s+)+\s*)([^;]*;)',
+                        re.DOTALL | re.MULTILINE)
+re_int_members = re.compile(r'([a-zA-Z_][a-zA-Z_0-9]*|[a-zA-Z_][a-zA-Z_0-9]*\s*:\s*[0-9]+)\s*[,;]\s*', re.DOTALL | re.MULTILINE)
+# Preprocessor lines that delimit existing little/big endian sections.
+re_little_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_LITTLE_ENDIAN\s*(==\s*1\s*|)')
+re_big_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_BIG_ENDIAN\s*')
+re_else = re.compile(r'#\s*else\s*')
+re_endif = re.compile(r'#\s*endif\s*')
+# C comments: '/* ... */' (may span lines and contain '*') and '//' up to the end of the line.
+re_c_comment = re.compile(r'(/\*.*?\*/|//[^\n]*)', re.DOTALL)
+
+errors_found = 0
+
+def remove_c_comments(code_str):
+    return re_c_comment.sub('', code_str)  # drop every span that re_c_comment matches
+
+def struct_body_to_big_endian(body_str):
+    '''Derive the big-endian variant of a packed struct body.
+
+    body_str is the text between the struct's braces: member definitions
+    separated by ';'. C comments are stripped first, so they are missing from
+    the result. Consecutive sub-byte (bit-field) integer members are collected
+    until they add up to exactly 8 bits, and each such byte is emitted with its
+    members in reversed order; all other definitions pass through unchanged.
+    E.g. 'uint8_t foo:3, bar:5;' becomes 'uint8_t bar:5, foo:3;'.
+
+    Return the big-endian body as a string, or None if the body has no
+    sub-byte integer members at all.
+
+    Raise Exception if bit-fields do not add up to clean byte boundaries, or if
+    the integer type changes in the middle of an incomplete byte.'''
+
+    # kick comments out of the code analysis. They will end up being stripped
+    # from big-endian only.
+    body_str = remove_c_comments(body_str)
+
+    def_strs = ('%s;' % def_str for def_str in body_str.split(';') if def_str.strip())
+
+    # classify defs as containing sub-byte members or not
+    # defs = [ (True, 'uint8_t foo:3, bar:5;', 'uint8_t ', ['foo:3', 'bar:5']),
+    #          (False, 'int baz;'),...]
+    defs = []
+    for one_def in def_strs:
+        int_def = re_int_def.fullmatch(one_def)
+        if not int_def:
+            # not even a number, same for big and little endian
+            defs.append((False, one_def))
+            continue
+
+        int_type = int_def.group(1)
+        # the last group holds everything after the type, e.g. 'foo:3, bar:5;'
+        members = [m.group(1) for m in re_int_members.finditer(int_def.groups()[-1])]
+
+        # does it have sub-byte integers?
+        if any(':' in member for member in members):
+            defs.append((True, one_def, int_type, members))
+        else:
+            defs.append((False, one_def))
+
+    if not any(classified_def[0] for classified_def in defs):
+        return None
+
+    # now the interesting part, go over the defs, and reverse the sub-byte ints
+    # at byte boundaries.
+    got_bits = 0
+    byte_type = None
+    members_within_a_byte = []
+    big_endian_defs = []
+
+    for classified_def in defs:
+        if not classified_def[0]:
+            # plain definitions are only allowed at a byte boundary
+            if got_bits:
+                raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)
+            big_endian_defs.append(classified_def[1])
+            continue
+
+        _, one_def, int_type, members = classified_def
+
+        # an incomplete byte may only be continued by a definition of the same type
+        if byte_type and byte_type.strip() != int_type.strip():
+            raise Exception('mismatching type continuation after incomplete byte: %r %r to %r'
+                            % (byte_type, members_within_a_byte, int_type))
+
+        for member in members:
+            # (re)set the type per member: one definition may span several
+            # bytes, and byte_type is cleared whenever a byte is complete.
+            byte_type = int_type
+
+            member_name, bits_str = member.split(':')
+            bits = int(bits_str)
+            member = '%s:%d' % (member_name.strip(), bits)
+            members_within_a_byte.append(member)
+            got_bits += bits
+
+            if got_bits > 8:
+                raise Exception('sub-byte int breaks clean byte bounds: %s -- %d + %d = %d bits'
+                                % (member, got_bits - bits, bits, got_bits))
+
+            if got_bits == 8:
+                # a full byte: emit its members in reversed order.
+                big_endian_defs.append('%s%s;' % (byte_type, ', '.join(reversed(members_within_a_byte))))
+                members_within_a_byte = []
+                byte_type = None
+                got_bits = 0
+
+    if got_bits:
+        # the body ended in the middle of a byte; do not silently drop the
+        # collected members from the big-endian part.
+        raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)
+
+    # strip empty lines
+    lines = [l for l in ''.join(big_endian_defs).split('\n') if l.strip()]
+    # clean lines' whitespace errors we might have taken in with the type names
+    return '\n'.join(l.rstrip(' \t') for l in lines)
+
+def handle_struct_body(body_str):
+    '''Return body_str wrapped in little/big endian #if sections, or body_str
+    unchanged when struct_body_to_big_endian() yields nothing for it.'''
+    big_endian_body_str = struct_body_to_big_endian(body_str)
+    if not big_endian_body_str:
+        return body_str
+
+    return ''.join((
+        '#if OSMO_IS_LITTLE_ENDIAN\n',
+        body_str,
+        '#elif OSMO_IS_BIG_ENDIAN\n'
+        '/* auto-generated from the little endian part above (libosmocore/contrib/struct_endianess.py) */\n',
+        big_endian_body_str,
+        '\n#endif\n'))
+
+def _check_file(f):
+    '''Rewrite source file f in place: in each top-level struct whose closing
+    line contains 'packed', regenerate the big endian section from the little
+    endian members, then add the osmocom/core/endian.h include if needed.
+    Files not ending in .h, .c or .cpp are skipped; errors raise Exception.'''
+    if not f.endswith(('.h', '.c', '.cpp')):
+        return
+
+    # section the file into
+    # [ ["no struct def"], ["struct {...};"], ["no struct def"], ... ]
+    sections = []
+    in_struct = False
+    buf = []
+    with codecs.open(f, "r", "utf-8") as fd:
+        file_lines = fd.readlines()
+    for line in file_lines:
+        if not in_struct and re_struct_start.fullmatch(line):
+            sections.append(buf)
+            buf = [line]
+            in_struct = True
+        elif in_struct and re_struct_end.fullmatch(line):
+            buf.append(line)
+            sections.append(buf)
+            in_struct = False
+            buf = []
+        else:
+            buf.append(line)
+    if buf:
+        sections.append(buf)
+
+    # examine each struct, i.e. every second item in 'sections'
+    for i in range(len(sections)):
+        if not (i & 1):
+            continue
+
+        struct = sections[i]
+
+        # if the struct isn't packed, we need not bother
+        if 'packed' not in struct[-1]:
+            continue
+
+        try:
+
+            # assume the 'struct foo {' is on the first line, the closing brace
+            # '};' on the last, and the rest are individual definitions split by
+            # ';'.
+            struct_body_lines = struct[1:-1]
+
+            # divide in struct body sections of
+            # ['arbitrary string', ['body;\n', 'lines;\n'], 'arbitrary string']
+            # Aim: handle each sub-struct on its own, and
+            # if there already are ifdefs for little and big endian, keep just the
+            # little endian bit and derive big endian from it.
+            struct_body_parts = []
+            arbitrary_part = []
+            def_part = []
+
+            def end_def():
+                # these two are rebound below, so they must refer to _check_file()'s lists
+                nonlocal arbitrary_part, def_part
+
+                if def_part:
+                    struct_body_parts.append(arbitrary_part)
+                    arbitrary_part = []
+                    struct_body_parts.append(def_part)
+                    def_part = []
+
+            j = 0
+            while j < len(struct_body_lines):
+                line = struct_body_lines[j]
+
+                if (re_substruct_start.fullmatch(line)
+                    or re_substruct_end.fullmatch(line)):
+                    end_def()
+                    arbitrary_part.append(line)
+                    j += 1
+                    continue
+
+                if re_big_endian_ifdef.fullmatch(line):
+                    end_def()
+                    # discard big endian section
+                    j += 1
+                    while j < len(struct_body_lines):
+                        line = struct_body_lines[j]
+                        if re_endif.fullmatch(line):
+                            end_def()
+                            j += 1
+                            break
+                        if re_little_endian_ifdef.fullmatch(line):
+                            end_def()
+                            # keep that start of little endian section, not j++
+                            break
+                        if re_else.fullmatch(line):
+                            # there's an '#else' after big-endian. Shim a little-endian header in just for the loop.
+                            struct_body_lines[j] = '#if OSMO_IS_LITTLE_ENDIAN\n'
+                            break
+                        j += 1
+                    continue
+
+                if re_little_endian_ifdef.fullmatch(line):
+                    end_def()
+                    j += 1
+                    while j < len(struct_body_lines):
+                        line = struct_body_lines[j]
+                        if re_endif.fullmatch(line):
+                            end_def()
+                            j += 1
+                            break
+                        if re_big_endian_ifdef.fullmatch(line):
+                            end_def()
+                            # keep that start of big endian section, not j++
+                            break
+                        if re_else.fullmatch(line):
+                            # there's an '#else' after little-endian. Shim a big-endian header in just for the loop.
+                            struct_body_lines[j] = '#if OSMO_IS_BIG_ENDIAN\n'
+                            break
+                        def_part.append(line)
+                        j += 1
+
+                    continue
+
+                def_part.append(line)
+                j += 1
+
+            end_def()
+            if arbitrary_part:
+                struct_body_parts.append(arbitrary_part)
+
+            new_struct_body_parts = []
+            for j in range(len(struct_body_parts)):
+                part = ''.join(struct_body_parts[j])
+                if not (j & 1):
+                    new_struct_body_parts.append(part)
+                else:
+                    new_struct_body_parts.append(handle_struct_body(part))
+
+            new_struct = [struct[0], ''.join(new_struct_body_parts), struct[-1]]
+            sections[i] = new_struct
+        except Exception as e:
+            raise Exception('ERROR in struct %r' % struct[0]) from e
+
+    # phew. result.
+    result = ''.join((''.join(s) for s in sections))
+
+    # see if osmocom/core/endian.h is needed and included.
+    if (not f.endswith('endian.h')
+        and 'OSMO_IS_LITTLE_ENDIAN' in result
+        and '#include <osmocom/core/endian.h>' not in result):
+        # add the include after the last 'osmocom/core' include
+        last_include_start = result.rfind('#include <osmocom/core/')
+        if last_include_start < 0:
+            last_include_start = result.rfind('#include <osmocom/')
+        if last_include_start < 0:
+            last_include_start = result.rfind('#include')
+
+        if last_include_start < 0:
+            raise Exception('do not know where to include osmocom/core/endian.h in %r' % f)
+
+        insert_at = result.find('\n', last_include_start)
+
+        result = result[:insert_at] + '\n#include <osmocom/core/endian.h>' + result[insert_at:]
+
+    with codecs.open(f, "w", "utf-8") as fd:
+        fd.write(result)
+
+def check_file(f):  # run _check_file(f), labelling any failure with the file name
+    try:
+        _check_file(f)
+    except Exception as e:
+        raise Exception('ERROR IN FILE %r' % f) from e
+
+args = sys.argv[1:] or ['.']
+
+# plain files are checked directly, directories are walked (topdown=None, i.e. bottom-up)
+for f in args:
+    if not os.path.isdir(f):
+        check_file(f)
+        continue
+    for parent_path, subdirs, files in os.walk(f, None, None):
+        for ff in files:
+            check_file(os.path.join(parent_path, ff))
+
+# nothing in this script ever increments errors_found, so the exit status is 0 here
+sys.exit(errors_found)
+
+# vim: tabstop=4 shiftwidth=4 expandtab

-- 
To view, visit https://gerrit.osmocom.org/11786
To unsubscribe, or for help writing mail filters, visit https://gerrit.osmocom.org/settings

Gerrit-Project: libosmocore
Gerrit-Branch: master
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8e75b17d8071c7b3a2a171ba776fb76854b28a53
Gerrit-Change-Number: 11786
Gerrit-PatchSet: 1
Gerrit-Owner: Neels Hofmeyr <nhofmeyr at sysmocom.de>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.osmocom.org/pipermail/gerrit-log/attachments/20181116/7c84cbee/attachment.htm>