#!/usr/bin/env python3 import json from pathlib import Path import re from collections import Counter ROOT = Path(__file__).resolve().parents[1] JSON_DIR = ROOT / 'frontend' / 'public' / 'json' METADATA_FILE = JSON_DIR / 'metadata.json' REPORT_JSON = JSON_DIR / 'semantic_audit_report.json' REPORT_MD = JSON_DIR / 'semantic_audit_report.md' STOPWORDS = set(["the","and","of","in","a","to","with","for","on","is","an","by","as","or","all","tools","solutions","manage","management","system","systems","service","services"]) def tokens(text): if not text: return [] text = text.lower() text = re.sub(r"[^a-z0-9]+", " ", text) toks = [t for t in text.split() if t and t not in STOPWORDS and len(t) > 1] return toks def load_categories(): md = json.loads(METADATA_FILE.read_text()) cats = {} for c in md.get('categories', []): try: cid = int(c.get('id')) except Exception: continue name = c.get('name','') desc = c.get('description','') kt = set(tokens(name) + tokens(desc)) # also add raw name token cats[cid] = {'id': cid, 'name': name, 'desc': desc, 'keywords': kt} return cats def score_text_against_category(text_tokens, cat_keywords): if not text_tokens or not cat_keywords: return 0 cnt = 0 for t in text_tokens: if t in cat_keywords: cnt += 1 # simple score: count return cnt def analyze_file(p, cats): try: obj = json.loads(p.read_text()) except Exception as e: return {'file': p.name, 'error': f'parse_error: {e}'} # if the JSON is not an object (e.g., array of versions), we cannot determine category if not isinstance(obj, dict): return {'file': p.name, 'found': [], 'questionable': True, 'reasons': ['no_category_field']} # gather text parts = [] for k in ['name','description','slug','type','documentation','website']: v = obj.get(k) if isinstance(v, list): parts.extend([str(x) for x in v if x]) elif v: parts.append(str(v)) # include install script path and notes for k in ['script','install_methods','notes','tags']: v = obj.get(k) if not v: continue if isinstance(v, list): for item in v: parts.append(json.dumps(item) if isinstance(item, (dict,list)) else str(item)) elif isinstance(v, dict): parts.append(json.dumps(v)) else: parts.append(str(v)) text = " ".join(parts) tks = tokens(text) if not tks: return {'file': p.name, 'found': [], 'notes': ['no_text_to_analyze']} scores = [] for cid, c in cats.items(): sc = score_text_against_category(tks, c['keywords']) if sc > 0: scores.append({'id': cid, 'name': c['name'], 'score': sc}) scores = sorted(scores, key=lambda x: (-x['score'], x['name'])) # determine current categories current = [] raw = obj.get('categories') or obj.get('category') if isinstance(raw, list): current = raw elif raw is not None: current = [raw] # normalize to ints where possible normalized_current = [] for v in current: try: normalized_current.append(int(v)) except Exception: # maybe it's a name; try to match by name for cid,c in cats.items(): if isinstance(v,str) and v.strip().lower() == c['name'].lower(): normalized_current.append(cid) break # decide if questionable questionable = False reasons = [] if not normalized_current: questionable = True reasons.append('no_category_assigned') else: # if none of current in top 3 suggestions and top suggestion has score>0 top_ids = [s['id'] for s in scores[:3]] if scores and all(cid not in top_ids for cid in normalized_current): questionable = True reasons.append('assigned_not_in_top_suggestions') return {'file': p.name, 'current': normalized_current, 'suggestions': scores[:5], 'questionable': questionable, 'reasons': reasons} def main(): cats = load_categories() report = {'summary': {'total': 0, 'questionable': 0, 'errors': 0}, 'files': []} for p in sorted(JSON_DIR.glob('*.json')): if p.name == METADATA_FILE.name: continue report['summary']['total'] += 1 res = analyze_file(p, cats) if 'error' in res: report['summary']['errors'] += 1 if res.get('questionable'): report['summary']['questionable'] += 1 report['files'].append(res) REPORT_JSON.write_text(json.dumps(report, indent=2)) lines = [] lines.append('# Semantic Audit Report: Category Suggestions') lines.append('') lines.append(f"- Total files scanned: {report['summary']['total']}") lines.append(f"- Files with parse errors: {report['summary']['errors']}") lines.append(f"- Files flagged as questionable: {report['summary']['questionable']}") lines.append('') lines.append('## Flagged files and suggestions') lines.append('') for f in report['files']: if f.get('questionable') or f.get('error'): lines.append(f"- **{f['file']}**") if f.get('error'): lines.append(f" - Error: {f['error']}") if f.get('current'): lines.append(f" - Current categories: {f['current']}") if f.get('suggestions'): for s in f['suggestions']: lines.append(f" - Suggestion: {s['id']} {s['name']} (score={s['score']})") if f.get('reasons'): for r in f['reasons']: lines.append(f" - Reason: {r}") lines.append('') REPORT_MD.write_text('\n'.join(lines)) print('Semantic audit complete:') print(f" Total: {report['summary']['total']}") print(f" Questionable: {report['summary']['questionable']}") print(f" Errors: {report['summary']['errors']}") print(f"Wrote: {REPORT_JSON} and {REPORT_MD}") if __name__ == '__main__': main()