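Update graph_derive_edges.py and graph_utils.py to version 4.1.0: Edge ID generation now incorporates target_section into the ID calculation, so links to different sections of the same note produce distinct edges; links without a section keep the previous 4-parameter ID (kind, source, target, scope). Update the documentation to reflect the new ID structure and to clarify edge handling during de-duplication: only edges pointing to the identical section are consolidated by confidence.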

2026-01-10 15:45:26 +01:00 · 2026-01-10 15:45:26 +01:00 · 2da98e8e37
commit 2da98e8e37
parent a852975811
2 changed files with 20 additions and 20 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -3,7 +3,7 @@ FILE: app/core/graph/graph_derive_edges.py
 DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
             WP-15b/c Audit: 
             - Präzises Sektions-Splitting via parse_link_target.
-             - Eindeutige ID-Generierung pro Sektions-Variante (Multigraph).
+             - v4.1.0: Eindeutige ID-Generierung pro Sektions-Variante (Multigraph).
             - Ermöglicht dem Retriever die Super-Edge-Aggregation.
 """
 from typing import List, Optional, Dict, Tuple
@ -56,7 +56,6 @@ def build_edges_for_note(
                    "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
                }))
                edges.append(_edge("prev", "chunk", next_id, cid, note_id, {
                    "chunk_id": next_id, 
                    "edge_id": _mk_edge_id("prev", next_id, cid, "chunk"),
                    "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
                }))
@ -79,8 +78,8 @@ def build_edges_for_note(
            payload = {
                "chunk_id": cid, 
-                # WP-24c v4.0.0: variant wird nur im Payload gespeichert (target_section), fließt nicht in die ID ein
+                # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
-                "edge_id": _mk_edge_id(k, cid, t, "chunk"),
+                "edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
                "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
            }
            if sec: payload["target_section"] = sec
@ -92,10 +91,10 @@ def build_edges_for_note(
            raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
            t, sec = parse_link_target(raw_t, note_id)
            if t:
-                # WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
+                # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
                payload = {
                    "chunk_id": cid, 
-                    "edge_id": _mk_edge_id(k, cid, t, "chunk"),
+                    "edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
                    "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
                }
                if sec: payload["target_section"] = sec
@ -107,10 +106,10 @@ def build_edges_for_note(
            t, sec = parse_link_target(raw_t, note_id)
            if not t: continue
-            # WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
+            # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
            payload = {
                "chunk_id": cid, 
-                "edge_id": _mk_edge_id(k, cid, t, "chunk"),
+                "edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
                "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
            }
            if sec: payload["target_section"] = sec
@ -122,10 +121,10 @@ def build_edges_for_note(
            r, sec = parse_link_target(raw_r, note_id)
            if not r: continue
-            # WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
+            # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
            payload = {
                "chunk_id": cid, "ref_text": raw_r, 
-                "edge_id": _mk_edge_id("references", cid, r, "chunk"),
+                "edge_id": _mk_edge_id("references", cid, r, "chunk", target_section=sec),
                "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
            }
            if sec: payload["target_section"] = sec
@ -134,10 +133,10 @@ def build_edges_for_note(
            # Automatische Kanten-Vererbung aus types.yaml
            for rel in defaults:
                if rel != "references":
-                    # WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
+                    # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
                    def_payload = {
                        "chunk_id": cid, 
-                        "edge_id": _mk_edge_id(rel, cid, r, "chunk"),
+                        "edge_id": _mk_edge_id(rel, cid, r, "chunk", target_section=sec),
                        "provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"]
                    }
                    if sec: def_payload["target_section"] = sec
@ -164,9 +163,9 @@ def build_edges_for_note(
            }))
    # 4) De-Duplizierung (In-Place)
-    # WP-24c v4.0.0: Da die EDGE-ID nur auf 4 Parametern basiert (kind, source, target, scope),
+    # WP-24c v4.1.0: Da die EDGE-ID nun auf 5 Parametern basiert (inkl. target_section),
-    # werden Links auf unterschiedliche Abschnitte derselben Note durch die De-Duplizierung
+    # bleiben Links auf unterschiedliche Abschnitte derselben Note als eigenständige 
-    # konsolidiert. Die Sektion-Information bleibt im Payload (target_section) erhalten.
+    # Kanten erhalten. Nur identische Sektions-Links werden nach Confidence konsolidiert.
    unique_map: Dict[str, dict] = {}
    for e in edges:
        eid = e["edge_id"]
--- a/app/core/graph/graph_utils.py
+++ b/app/core/graph/graph_utils.py
@ -86,7 +86,7 @@ def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[
    return target, section
-def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None, variant: Optional[str] = None) -> str:
+def _mk_edge_id(kind: str, s: str, t: str, scope: str, target_section: Optional[str] = None) -> str:
    """
    WP-24c v4.0.0: DER GLOBALE STANDARD für Kanten-IDs.
    Erzeugt eine deterministische UUIDv5. Dies stellt sicher, dass manuelle Links
@ -109,12 +109,13 @@ def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] =
    if not all([kind, s, t]):
        raise ValueError(f"Incomplete data for edge ID: kind={kind}, src={s}, tgt={t}")
-    # GOLD-STANDARD v4.0.0: STRICT 4-Parameter-ID
+    # Der String enthält nun alle distinkten semantischen Merkmale
    # Keine Suffixe für rule_id oder variant im Hash-String!
    # Jede manuelle Änderung an diesem String-Format führt zu doppelten Kanten in der DB!
    base = f"edge:{kind}:{s}:{t}:{scope}"
-    # Nutzt den URL-Namespace für deterministische Reproduzierbarkeit
+    # Wenn ein Link auf eine spezifische Sektion zeigt, ist es eine andere Relation
    if target_section:
        base += f":{target_section}"
    return str(uuid.uuid5(uuid.NAMESPACE_URL, base))
 def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: