c_struct_clean: use pygments to exclude comments & strings in search

2025-07-23 05:19:35 +00:00 · 2023-01-30 11:31:18 +11:00
parent 63ca625e43
commit d2ade021f5
1 changed files with 26 additions and 9 deletions
--- a/utils_maintenance/c_struct_clean.py
+++ b/utils_maintenance/c_struct_clean.py
@ -5,7 +5,8 @@
 When a source file declares a struct which isn't used anywhere else in the file.
 Remove it.

-There may be times this is needed, however there can typically be removed.
+There may be times this is needed, however such declarations can typically be removed
+and any resulting errors resolved by adding forward declarations to the headers which require them.
 """

 import os
@ -24,10 +25,10 @@ from batch_edit_text import run

 SOURCE_DIR = os.path.normpath(os.path.abspath(os.path.normpath(os.path.join(PWD, "..", "..", ".."))))

-# TODO, move to config file
+# TODO: move to configuration file.
 SOURCE_DIRS = (
    "source",
-    "intern/ghost",
+    os.path.join("intern", "ghost"),
 )

 SOURCE_EXT = (
@ -44,13 +45,29 @@ re_match_struct = re.compile(r"struct\s+([A-Za-z_][A-Za-z_0-9]*)\s*;")
 def clean_structs(fn: str, data_src: str) -> Optional[str]:
    import re

+    from pygments.token import Token
+    from pygments import lexers
+
    word_occurance: Dict[str, int] = {}
-    for w_match in re_words.finditer(data_src):
-        w = w_match.group(0)
-        try:
-            word_occurance[w] += 1
-        except KeyError:
-            word_occurance[w] = 1
+
+    lex = lexers.get_lexer_by_name("c++")
+    lex.get_tokens(data_src)
+
+    ty_exact = (Token.Comment.Preproc, Token.Comment.PreprocFile)
+
+    for ty, text in lex.get_tokens(data_src):
+        if ty not in ty_exact:
+            if ty in Token.String:  # type: ignore
+                continue
+            if ty in Token.Comment:  # type: ignore
+                continue
+
+        for w_match in re_words.finditer(data_src):
+            w = w_match.group(0)
+            try:
+                word_occurance[w] += 1
+            except KeyError:
+                word_occurance[w] = 1

    lines = data_src.splitlines(keepends=True)