feat(audit): add semantic category audit and report

2025-12-27 01:16:25 +01:00 · 2025-12-26 18:13:19 +01:00
parent ba2f0108d3
commit f1822a1482
3 changed files with 15571 additions and 0 deletions
--- a/frontend/public/json/semantic_audit_report.json
+++ b/frontend/public/json/semantic_audit_report.json
--- a/frontend/public/json/semantic_audit_report.md
+++ b/frontend/public/json/semantic_audit_report.md
--- a/scripts/audit_json_category_semantic.py
+++ b/scripts/audit_json_category_semantic.py
@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+import json
+from pathlib import Path
+import re
+from collections import Counter
+
+# Repository root: two levels up from this script's resolved path.
+ROOT = Path(__file__).resolve().parent.parent
+# Directory holding the per-app JSON definitions that get audited.
+JSON_DIR = ROOT.joinpath('frontend', 'public', 'json')
+# Category definitions (ids, names, descriptions) live here.
+METADATA_FILE = JSON_DIR.joinpath('metadata.json')
+# Output locations for the machine-readable and human-readable reports.
+REPORT_JSON = JSON_DIR.joinpath('semantic_audit_report.json')
+REPORT_MD = JSON_DIR.joinpath('semantic_audit_report.md')
+
+# Words ignored by tokens(): common English glue words plus generic terms
+# listed here so they never count as category keywords.
+STOPWORDS = {
+    "the", "and", "of", "in", "a", "to", "with", "for", "on", "is", "an",
+    "by", "as", "or", "all",
+    "tools", "solutions", "manage", "management",
+    "system", "systems", "service", "services",
+}
+
+
+def tokens(text):
+    """Split *text* into lowercase keyword tokens.
+
+    The text is lowercased, every run of characters outside ``a-z0-9`` is
+    treated as a separator, and words that are in ``STOPWORDS`` or only one
+    character long are discarded. Falsy input yields an empty list.
+    """
+    if not text:
+        return []
+    normalized = re.sub(r"[^a-z0-9]+", " ", text.lower())
+    # str.split() never yields empty strings, so the length test is enough.
+    return [
+        word
+        for word in normalized.split()
+        if len(word) > 1 and word not in STOPWORDS
+    ]
+
+
+def load_categories():
+    """Load the category definitions from ``METADATA_FILE``.
+
+    Returns:
+        A dict mapping each integer category id to a dict with the keys
+        ``id``, ``name``, ``desc`` and ``keywords``, where ``keywords`` is
+        the set of tokens found in the category's name and description.
+
+    Entries that are not JSON objects, or whose ``id`` cannot be converted
+    to an integer, are skipped. Errors while reading or parsing the metadata
+    file itself are not caught here.
+    """
+    # JSON is UTF-8 by specification; do not depend on the locale encoding.
+    md = json.loads(METADATA_FILE.read_text(encoding='utf-8'))
+    cats = {}
+    # ``or []`` also covers an explicit ``"categories": null``.
+    for c in md.get('categories') or []:
+        if not isinstance(c, dict):
+            continue
+        try:
+            cid = int(c.get('id'))
+        except (TypeError, ValueError, OverflowError):
+            continue
+        # A JSON null (or other falsy value) becomes '' so later code can
+        # always call string methods on the name and sort by it.
+        name = str(c.get('name') or '')
+        desc = str(c.get('description') or '')
+        cats[cid] = {
+            'id': cid,
+            'name': name,
+            'desc': desc,
+            'keywords': set(tokens(name) + tokens(desc)),
+        }
+    return cats
+
+
+def score_text_against_category(text_tokens, cat_keywords):
+    """Count how many entries of *text_tokens* occur in *cat_keywords*.
+
+    Repeated tokens are counted every time they appear. The score is 0 when
+    either argument is empty.
+    """
+    if not (text_tokens and cat_keywords):
+        return 0
+    return sum(1 for tok in text_tokens if tok in cat_keywords)
+
+
+def _collect_text(obj):
+    """Join the descriptive fields of one parsed JSON object into one string."""
+    parts = []
+    # Plain descriptive fields: list entries are added one by one and empty
+    # entries are dropped.
+    for key in ('name', 'description', 'slug', 'type', 'documentation', 'website'):
+        value = obj.get(key)
+        if isinstance(value, list):
+            parts.extend(str(item) for item in value if item)
+        elif value:
+            parts.append(str(value))
+    # Structured fields: nested dicts/lists are serialised with json.dumps so
+    # that their keys and values end up in the text as well.
+    for key in ('script', 'install_methods', 'notes', 'tags'):
+        value = obj.get(key)
+        if not value:
+            continue
+        if isinstance(value, list):
+            parts.extend(
+                json.dumps(item) if isinstance(item, (dict, list)) else str(item)
+                for item in value
+            )
+        elif isinstance(value, dict):
+            parts.append(json.dumps(value))
+        else:
+            parts.append(str(value))
+    return " ".join(parts)
+
+
+def _resolve_category_ids(raw, cats):
+    """Convert a raw ``categories``/``category`` value into a list of ids."""
+    if isinstance(raw, list):
+        values = raw
+    elif raw is not None:
+        values = [raw]
+    else:
+        values = []
+
+    resolved = []
+    for value in values:
+        try:
+            resolved.append(int(value))
+        except (TypeError, ValueError, OverflowError):
+            # Not numeric: fall back to a case-insensitive match on the
+            # category name; values that match nothing are dropped.
+            if not isinstance(value, str):
+                continue
+            wanted = value.strip().lower()
+            for cid, cat in cats.items():
+                # ``or ''`` keeps a missing (None) name from crashing .lower().
+                if str(cat['name'] or '').lower() == wanted:
+                    resolved.append(cid)
+                    break
+    return resolved
+
+
+def analyze_file(p, cats):
+    """Audit one JSON file against the known categories.
+
+    Args:
+        p: ``pathlib.Path`` of the JSON file to analyse.
+        cats: mapping of category id to a dict with at least ``name`` and
+            ``keywords`` entries.
+
+    Returns:
+        A dict that always contains ``file`` (the file name) plus one of:
+
+        * ``error`` - the file could not be read or parsed;
+        * ``found``/``questionable``/``reasons`` - the top-level JSON value
+          is not an object, so no category can be determined;
+        * ``found``/``notes`` - the object contains no analysable text;
+        * ``current``/``suggestions``/``questionable``/``reasons`` - the
+          regular result: resolved category ids, up to five best-scoring
+          categories, and whether the assignment looks doubtful.
+    """
+    try:
+        # JSON is UTF-8 by specification; do not depend on the locale encoding.
+        obj = json.loads(p.read_text(encoding='utf-8'))
+    except Exception as e:
+        # Per-file boundary: any failure is recorded in the report instead of
+        # aborting the whole audit.
+        return {'file': p.name, 'error': f'parse_error: {e}'}
+
+    # if the JSON is not an object (e.g., array of versions), we cannot determine category
+    if not isinstance(obj, dict):
+        return {'file': p.name, 'found': [], 'questionable': True, 'reasons': ['no_category_field']}
+
+    tks = tokens(_collect_text(obj))
+    if not tks:
+        return {'file': p.name, 'found': [], 'notes': ['no_text_to_analyze']}
+
+    scores = []
+    for cid, c in cats.items():
+        sc = score_text_against_category(tks, c['keywords'])
+        if sc > 0:
+            scores.append({'id': cid, 'name': c['name'], 'score': sc})
+    # Highest score first; ties are ordered by name. The name is coerced to a
+    # string so a missing (None) name cannot break the comparison.
+    scores.sort(key=lambda s: (-s['score'], str(s['name'] or '')))
+
+    normalized_current = _resolve_category_ids(
+        obj.get('categories') or obj.get('category'), cats
+    )
+
+    reasons = []
+    if not normalized_current:
+        reasons.append('no_category_assigned')
+    elif scores:
+        # Flag the file when none of its categories is among the three
+        # best-scoring suggestions.
+        top_ids = {s['id'] for s in scores[:3]}
+        if top_ids.isdisjoint(normalized_current):
+            reasons.append('assigned_not_in_top_suggestions')
+
+    return {
+        'file': p.name,
+        'current': normalized_current,
+        'suggestions': scores[:5],
+        'questionable': bool(reasons),
+        'reasons': reasons,
+    }
+
+
+def main():
+    """Audit every JSON file in ``JSON_DIR`` and write both reports.
+
+    Each ``*.json`` file is passed to ``analyze_file``; the results are
+    written to ``REPORT_JSON`` and a Markdown listing of the flagged or
+    unreadable files is written to ``REPORT_MD``. A short summary is
+    printed at the end.
+    """
+    cats = load_categories()
+    report = {'summary': {'total': 0, 'questionable': 0, 'errors': 0}, 'files': []}
+    summary = report['summary']
+
+    # The metadata file holds the category definitions, and the JSON report
+    # is this script's own output written into the same directory. Skipping
+    # the report keeps a re-run from auditing its own previous output.
+    skipped = {METADATA_FILE.name, REPORT_JSON.name}
+    for p in sorted(JSON_DIR.glob('*.json')):
+        if p.name in skipped:
+            continue
+        summary['total'] += 1
+        res = analyze_file(p, cats)
+        if 'error' in res:
+            summary['errors'] += 1
+        if res.get('questionable'):
+            summary['questionable'] += 1
+        report['files'].append(res)
+
+    # Explicit UTF-8 so the output does not depend on the locale encoding.
+    REPORT_JSON.write_text(json.dumps(report, indent=2), encoding='utf-8')
+
+    lines = [
+        '# Semantic Audit Report: Category Suggestions',
+        '',
+        f"- Total files scanned: {summary['total']}",
+        f"- Files with parse errors: {summary['errors']}",
+        f"- Files flagged as questionable: {summary['questionable']}",
+        '',
+        '## Flagged files and suggestions',
+        '',
+    ]
+    for f in report['files']:
+        # Only flagged or unreadable files are listed in the Markdown report.
+        if not (f.get('questionable') or f.get('error')):
+            continue
+        lines.append(f"- **{f['file']}**")
+        if f.get('error'):
+            lines.append(f"  - Error: {f['error']}")
+        if f.get('current'):
+            lines.append(f"  - Current categories: {f['current']}")
+        for s in f.get('suggestions') or []:
+            lines.append(f"  - Suggestion: {s['id']} {s['name']} (score={s['score']})")
+        for r in f.get('reasons') or []:
+            lines.append(f"  - Reason: {r}")
+        lines.append('')
+
+    REPORT_MD.write_text('\n'.join(lines), encoding='utf-8')
+    print('Semantic audit complete:')
+    print(f"  Total: {summary['total']}")
+    print(f"  Questionable: {summary['questionable']}")
+    print(f"  Errors: {summary['errors']}")
+    print(f"Wrote: {REPORT_JSON} and {REPORT_MD}")
+
+
+# Script entry point: run the audit only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == '__main__':
+    main()