diff --git a/app/core/graph/graph_derive_edges.py b/app/core/graph/graph_derive_edges.py index d12c5e8..1f880ff 100644 --- a/app/core/graph/graph_derive_edges.py +++ b/app/core/graph/graph_derive_edges.py @@ -1,7 +1,10 @@ """ FILE: app/core/graph/graph_derive_edges.py DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung. - AUDIT: Integriert parse_link_target für saubere Graphen-Topologie. + AUDIT: + - Nutzt parse_link_target + - Übergibt Section als 'variant' an ID-Gen + - Dedup basiert jetzt auf Edge-ID (erlaubt Multigraph für Sections) """ from typing import List, Optional, Dict, Tuple from .graph_utils import ( @@ -18,11 +21,11 @@ def build_edges_for_note( note_level_references: Optional[List[str]] = None, include_note_scope_refs: bool = False, ) -> List[dict]: - """Erzeugt und aggregiert alle Kanten für eine Note (WP-15b).""" + """Erzeugt und aggregiert alle Kanten für eine Note.""" edges: List[dict] = [] note_type = _get(chunks[0], "type") if chunks else "concept" - # 1) Struktur-Kanten (belongs_to, next/prev) + # 1) Struktur-Kanten for idx, ch in enumerate(chunks): cid = _get(ch, "chunk_id", "id") if not cid: continue @@ -52,76 +55,78 @@ def build_edges_for_note( if not cid: continue raw = _get(ch, "window") or _get(ch, "text") or "" - # Typed & Candidate Pool (WP-15b Integration) + # Typed typed, rem = extract_typed_relations(raw) for k, raw_t in typed: t, sec = parse_link_target(raw_t, note_id) if not t: continue - payload = { - "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel"), + "chunk_id": cid, + # Variant=sec sorgt für eindeutige ID pro Abschnitt + "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec), "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"] } if sec: payload["target_section"] = sec - edges.append(_edge(k, "chunk", cid, t, note_id, payload)) + # Semantic AI Candidates pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] for cand in pool: raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai") t, sec = parse_link_target(raw_t, note_id) if t: payload = { - "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}"), + "chunk_id": cid, + "edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}", variant=sec), "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90) } if sec: payload["target_section"] = sec - edges.append(_edge(k, "chunk", cid, t, note_id, payload)) - # Callouts & Wikilinks + # Callouts call_pairs, rem2 = extract_callout_relations(rem) for k, raw_t in call_pairs: t, sec = parse_link_target(raw_t, note_id) if not t: continue - payload = { - "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge"), + "chunk_id": cid, + "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge", variant=sec), "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"] } if sec: payload["target_section"] = sec - edges.append(_edge(k, "chunk", cid, t, note_id, payload)) + # Wikilinks & Defaults refs = extract_wikilinks(rem2) for raw_r in refs: r, sec = parse_link_target(raw_r, note_id) if not r: continue + # Explicit Reference payload = { - "chunk_id": cid, "ref_text": raw_r, "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"), + "chunk_id": cid, "ref_text": raw_r, + "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink", variant=sec), "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"] } if sec: payload["target_section"] = sec - edges.append(_edge("references", "chunk", cid, r, note_id, payload)) + # Defaults (nur einmal pro Target, Section hier irrelevant für Typ-Logik, oder?) + # Wir erzeugen Defaults auch pro Section, um Konsistenz zu wahren. for rel in defaults: if rel != "references": def_payload = { - "chunk_id": cid, "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}"), + "chunk_id": cid, + "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}", variant=sec), "provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"] } if sec: def_payload["target_section"] = sec edges.append(_edge(rel, "chunk", cid, r, note_id, def_payload)) - # Für Note-Scope Sammlung nutzen wir den Original-String zur Dedup, aber gesäubert refs_all.extend([parse_link_target(r, note_id)[0] for r in refs]) # 3) Note-Scope & De-Duplizierung if include_note_scope_refs: - # refs_all ist jetzt schon gesäubert (nur Targets) - # note_level_references müssen auch gesäubert werden cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])] refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs) @@ -136,10 +141,13 @@ def build_edges_for_note( "provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"] })) - unique_map: Dict[Tuple[str, str, str], dict] = {} + # Deduplizierung: Wir nutzen jetzt die EDGE-ID als Schlüssel. + # Da die Edge-ID nun 'variant' (Section) enthält, bleiben unterschiedliche Sections erhalten. + unique_map: Dict[str, dict] = {} for e in edges: - key = (str(e.get("source_id")), str(e.get("target_id")), str(e.get("kind"))) - if key not in unique_map or e.get("confidence", 0) > unique_map[key].get("confidence", 0): - unique_map[key] = e + eid = e["edge_id"] + # Bei Konflikt (gleiche ID = exakt gleiche Kante und Section) gewinnt die höhere Confidence + if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0): + unique_map[eid] = e return list(unique_map.values()) \ No newline at end of file diff --git a/app/core/graph/graph_utils.py b/app/core/graph/graph_utils.py index d814ad7..fbdc51f 100644 --- a/app/core/graph/graph_utils.py +++ b/app/core/graph/graph_utils.py @@ -41,10 +41,19 @@ def _dedupe_seq(seq: Iterable[str]) -> List[str]: seen.add(s); out.append(s) return out -def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str: - """Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s.""" +def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None, variant: Optional[str] = None) -> str: + """ + Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s. + + WP-Fix: 'variant' (z.B. Section) fließt in den Hash ein, um mehrere Kanten + zum gleichen Target-Node (aber unterschiedlichen Abschnitten) zu unterscheiden. + """ base = f"{kind}:{s}->{t}#{scope}" - if rule_id: base += f"|{rule_id}" + if rule_id: + base += f"|{rule_id}" + if variant: + base += f"|{variant}" # <--- Hier entsteht die Eindeutigkeit für verschiedene Sections + return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest() def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: