Dateien nach "tests" hochladen

2025-11-11 17:30:36 +01:00 · 2025-11-11 17:30:36 +01:00 · c501f8d6e6
commit c501f8d6e6
parent b4287cbfda
2 changed files with 135 additions and 50 deletions
--- a/tests/test_edges_smoke.py
+++ b/tests/test_edges_smoke.py
@ -1,36 +1,36 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-scripts/test_edges_smoke.py
+tests/test_edges_smoke.py  — Progressive Ausgabe

-Integritäts-Check für mindnet-Edges in Qdrant.
-Prüft pro Note:
- Chunk-Anzahl (mindnet_chunks) = belongs_to-Kanten
- next/prev-Kanten: jeweils (#Chunks - 1)
- Dedupe: kein Duplikat (key=(kind,source_id,target_id,scope))
- references (chunk-scope): vorhanden, wenn Wikilinks erwartet werden (nur Zählreport)
- optional note-scope references/backlink: vorhanden, wenn --note-scope-refs genutzt wurde
+Prüft edge-Integrität je Note mit Live-Ausgabe:
+- belongs_to == #Chunks
+- next == prev == max(#Chunks-1,0)
+- Duplikat-Edges (Key: (kind,source_id,target_id,scope)) == 0
+- Zählt references (chunk/note), backlink

-Ausgabe: JSON pro Note + Gesamtsummary.
+Optionen:
+  --max-notes N     : prüft nur die ersten N Notizen
+  --limit L         : Scroll-Limit pro Anfrage (Default 256)
+  --flush           : jede Zeile sofort flushen
 """

 from __future__ import annotations
-import json, os, sys
+import os, sys, json, argparse
 from typing import Dict, Any, List, Tuple, Set
 from qdrant_client.http import models as rest
-
 from app.core.qdrant import QdrantConfig, get_client

 def collections(prefix: str) -> Tuple[str, str, str]:
+    """Return the notes, chunks and edges collection names derived from *prefix*."""
     return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"

-def scroll_ids(client, collection: str, filt: rest.Filter | None = None, payload=False, limit=256):
+def scroll_iter(client, collection: str, filt: rest.Filter | None, with_payload: bool, limit: int):
    next_page = None
    while True:
        pts, next_page = client.scroll(
            collection_name=collection,
            scroll_filter=filt,
-            with_payload=payload,
+            with_payload=with_payload,
            with_vectors=False,
            limit=limit,
            offset=next_page,
@ -39,48 +39,58 @@ def scroll_ids(client, collection: str, filt: rest.Filter | None = None, payload
            break
        for p in pts:
            yield p
+        if next_page is None:
+            break

-def list_notes(client, prefix: str) -> List[Dict[str, Any]]:
+def list_notes(client, prefix: str, limit: int, max_notes: int | None) -> List[Dict[str, Any]]:
+    """Collect note_id/title/type records from the notes collection.
+
+    Points whose payload has neither ``note_id`` nor ``id`` are skipped.
+    ``limit`` is handed to ``scroll_iter``, which uses it as the ``limit`` of
+    each scroll request. Collection stops as soon as ``max_notes`` records
+    have been gathered; ``None`` disables the cap.
+    """
     notes_col, _, _ = collections(prefix)
-    out = []
-    for p in scroll_ids(client, notes_col, None, payload=True):
+    out: List[Dict[str, Any]] = []
+    for p in scroll_iter(client, notes_col, None, True, limit):
         pl = p.payload or {}
         nid = pl.get("note_id") or pl.get("id")
         if nid:
-            out.append({
-                "note_id": nid,
-                "title": pl.get("title"),
-                "type": pl.get("type"),
-            })
+            out.append({"note_id": nid, "title": pl.get("title"), "type": pl.get("type")})
+            # The cap is checked after the append, so even max_notes=0 yields one record.
+            if max_notes is not None and len(out) >= max_notes:
+                break
     return out

-def count_chunks_for_note(client, prefix: str, note_id: str) -> int:
+def count_chunks_for_note(client, prefix: str, note_id: str, limit: int) -> int:
+    """Count the points in the chunks collection whose ``note_id`` equals *note_id*.
+
+    The count is obtained by iterating ``scroll_iter`` over the matching
+    points (payload excluded), with *limit* passed through to it.
+    """
     _, chunks_col, _ = collections(prefix)
-    filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
-    return sum(1 for _ in scroll_ids(client, chunks_col, filt, payload=False))
+    f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
+    return sum(1 for _ in scroll_iter(client, chunks_col, f, False, limit))

-def fetch_edges_for_note(client, prefix: str, note_id: str) -> List[Dict[str, Any]]:
+def fetch_edges_for_note(client, prefix: str, note_id: str, limit: int) -> List[Dict[str, Any]]:
+    """Return the payloads of all edge points whose ``note_id`` equals *note_id*.
+
+    A point without payload contributes an empty dict. *limit* is passed
+    through to ``scroll_iter``.
+    """
     _, _, edges_col = collections(prefix)
-    filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
-    return [p.payload or {} for p in scroll_ids(client, edges_col, filt, payload=True)]
+    f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
+    return [p.payload or {} for p in scroll_iter(client, edges_col, f, True, limit)]

 def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--max-notes", type=int)
+    ap.add_argument("--limit", type=int, default=256)
+    ap.add_argument("--flush", action="store_true")
+    args = ap.parse_args()
+
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
-    notes = list_notes(client, cfg.prefix)

-    report = []
+    notes = list_notes(client, cfg.prefix, args.limit, args.max_notes)
    total = {"notes": 0, "chunks": 0, "belongs_to": 0, "next": 0, "prev": 0, "refs_chunk": 0, "refs_note": 0, "backlink": 0, "dup_edges": 0}
+
    for n in notes:
        nid = n["note_id"]
        total["notes"] += 1
-        chunk_count = count_chunks_for_note(client, cfg.prefix, nid)
+        chunk_count = count_chunks_for_note(client, cfg.prefix, nid, args.limit)
        total["chunks"] += chunk_count

-        edges = fetch_edges_for_note(client, cfg.prefix, nid)
+        edges = fetch_edges_for_note(client, cfg.prefix, nid, args.limit)
        by_kind = {}
        keys: Set[tuple] = set()
        dup_count = 0
+        refs_chunk = 0
+        refs_note = 0
+        backlink = 0
+
        for e in edges:
            k = e.get("kind")
            by_kind[k] = by_kind.get(k, 0) + 1
@ -89,27 +99,26 @@ def main():
                dup_count += 1
            else:
                keys.add(t)
+            if k == "references" and e.get("scope") == "chunk":
+                refs_chunk += 1
+            if k == "references" and e.get("scope") == "note":
+                refs_note += 1
+            if k == "backlink":
+                backlink += 1

-        bt = by_kind.get("belongs_to", 0)
-        nx = by_kind.get("next", 0)
-        pv = by_kind.get("prev", 0)
-        rc = by_kind.get("references", 0) if any(e.get("scope") == "chunk" and e.get("kind") == "references" for e in edges) else 0
-        rn = sum(1 for e in edges if e.get("scope") == "note" and e.get("kind") == "references")
-        bl = by_kind.get("backlink", 0)
-
-        total["belongs_to"] += bt
-        total["next"] += nx
-        total["prev"] += pv
-        total["refs_chunk"] += rc
-        total["refs_note"] += rn
-        total["backlink"] += bl
+        total["belongs_to"] += by_kind.get("belongs_to", 0)
+        total["next"] += by_kind.get("next", 0)
+        total["prev"] += by_kind.get("prev", 0)
+        total["refs_chunk"] += refs_chunk
+        total["refs_note"] += refs_note
+        total["backlink"] += backlink
        total["dup_edges"] += dup_count

-        ok_bt = (bt == chunk_count)
-        ok_seq = (nx == max(chunk_count - 1, 0) and pv == max(chunk_count - 1, 0))
+        ok_bt = (by_kind.get("belongs_to", 0) == chunk_count)
+        ok_seq = (by_kind.get("next", 0) == max(chunk_count - 1, 0) and by_kind.get("prev", 0) == max(chunk_count - 1, 0))
        ok_dup = (dup_count == 0)

-        report.append({
+        line = {
            "note_id": nid,
            "title": n.get("title"),
            "type": n.get("type"),
@ -120,10 +129,12 @@ def main():
                "next_prev_match": ok_seq,
                "no_duplicate_edges": ok_dup,
            }
-        })
+        }
+        print(json.dumps(line, ensure_ascii=False))
+        if args.flush:
+            sys.stdout.flush()

-    out = {"prefix": cfg.prefix, "summary": total, "notes": report}
-    print(json.dumps(out, ensure_ascii=False, indent=2))
+    print(json.dumps({"prefix": cfg.prefix, "summary": total}, ensure_ascii=False))

 if __name__ == "__main__":
    main()
--- a/tests/test_edges_smoke_fast.py
+++ b/tests/test_edges_smoke_fast.py
@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+tests/test_edges_smoke_fast.py — Zählung über /count (sehr schnell)
+
+Verwendet Qdrant 'count' API je Note/Kind (anstatt scroll), dadurch sehr schnelle Ausführung.
+Optionen:
+  --max-notes N : prüft nur die ersten N Notizen
+"""
+
+from __future__ import annotations
+import argparse, json
+from typing import Dict, Any, List, Tuple
+from qdrant_client.http import models as rest
+
+from app.core.qdrant import QdrantConfig, get_client
+
+KINDS = ["belongs_to", "next", "prev", "references", "backlink"]
+
+def collections(prefix: str) -> Tuple[str, str, str]:
+    """Return the notes, chunks and edges collection names derived from *prefix*."""
+    notes_col = f"{prefix}_notes"
+    chunks_col = f"{prefix}_chunks"
+    edges_col = f"{prefix}_edges"
+    return notes_col, chunks_col, edges_col
+
+def list_note_ids(client, notes_col: str, max_notes: int | None) -> List[Dict[str, Any]]:
+    """Return ``note_id``/``title``/``type`` records for the notes in *notes_col*.
+
+    Pages through the collection with ``client.scroll`` until it is exhausted
+    or *max_notes* records have been collected, so collections larger than a
+    single page are not silently truncated.
+
+    Args:
+        client: Object exposing a ``scroll`` method that returns
+            ``(points, next_offset)``.
+        notes_col: Name of the notes collection.
+        max_notes: Upper bound on the number of returned records; ``None``
+            or ``0`` means no bound.
+
+    Returns:
+        One dict per point whose payload carries ``note_id`` (or ``id``).
+    """
+    cap = max_notes or None  # 0 is treated like "not set"
+    page_size = 1024 if cap is None else min(cap, 1024)
+    out: List[Dict[str, Any]] = []
+    offset = None
+    while True:
+        pts, offset = client.scroll(
+            collection_name=notes_col,
+            with_payload=True,
+            with_vectors=False,
+            limit=page_size,
+            offset=offset,
+        )
+        for p in pts or []:
+            pl = p.payload or {}
+            nid = pl.get("note_id") or pl.get("id")
+            if not nid:
+                # Points without an id do not count against the cap.
+                continue
+            out.append({"note_id": nid, "title": pl.get("title"), "type": pl.get("type")})
+            if cap is not None and len(out) >= cap:
+                return out
+        # An empty page or a missing continuation offset ends the scroll.
+        if not pts or offset is None:
+            return out
+
+def count_points(client, col: str, filt: rest.Filter) -> int:
+    """Return the exact number of points in *col* that match *filt*.
+
+    Falls back to 0 when the response object has no ``count`` attribute.
+    """
+    response = client.count(collection_name=col, count_filter=filt, exact=True)
+    matched = getattr(response, "count", 0)
+    return int(matched)
+
+def main():
+    """Report chunk and per-kind edge counts for each note as JSON lines.
+
+    Reads ``--max-notes`` from the command line, derives the collection names
+    from the configuration returned by ``QdrantConfig.from_env()``, prints one
+    JSON object per note and finally one JSON object holding the prefix and
+    the accumulated totals.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--max-notes", type=int)
+    options = parser.parse_args()
+
+    config = QdrantConfig.from_env()
+    qdrant = get_client(config)
+    notes_col, chunks_col, edges_col = collections(config.prefix)
+    selected = list_note_ids(qdrant, notes_col, options.max_notes)
+
+    totals = {"notes": 0, "chunks": 0, "edges": 0}
+    for note in selected:
+        note_id = note["note_id"]
+        totals["notes"] += 1
+
+        chunks_filter = rest.Filter(
+            must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
+        )
+        chunk_cnt = count_points(qdrant, chunks_col, chunks_filter)
+        totals["chunks"] += chunk_cnt
+
+        # One count request per edge kind; kinds with a zero count are left out.
+        edges_by_kind: Dict[str, int] = {}
+        for kind in KINDS:
+            conditions = [
+                rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id)),
+                rest.FieldCondition(key="kind", match=rest.MatchValue(value=kind)),
+            ]
+            found = count_points(qdrant, edges_col, rest.Filter(must=conditions))
+            if found:
+                edges_by_kind[kind] = found
+
+        totals["edges"] += sum(edges_by_kind.values())
+        record = {
+            "note_id": note_id,
+            "title": note.get("title"),
+            "type": note.get("type"),
+            "chunks": chunk_cnt,
+            "edges_by_kind": edges_by_kind,
+        }
+        print(json.dumps(record, ensure_ascii=False))
+
+    print(json.dumps({"prefix": config.prefix, "summary": totals}, ensure_ascii=False))
+
+if __name__ == "__main__":
+    main()