From 38a61d7b509bbffc678108ab33048e75207faf03 Mon Sep 17 00:00:00 2001 From: Lars Date: Mon, 29 Dec 2025 12:21:57 +0100 Subject: [PATCH] Fix: Semantische Deduplizierung in graph_derive_edges.py --- app/core/graph/graph_derive_edges.py | 48 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/app/core/graph/graph_derive_edges.py b/app/core/graph/graph_derive_edges.py index 1f880ff..2d20530 100644 --- a/app/core/graph/graph_derive_edges.py +++ b/app/core/graph/graph_derive_edges.py @@ -4,7 +4,7 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung. AUDIT: - Nutzt parse_link_target - Übergibt Section als 'variant' an ID-Gen - - Dedup basiert jetzt auf Edge-ID (erlaubt Multigraph für Sections) + - FIXED: Semantische De-Duplizierung (ignoriert rule_id bei Konflikten) """ from typing import List, Optional, Dict, Tuple from .graph_utils import ( @@ -21,11 +21,11 @@ def build_edges_for_note( note_level_references: Optional[List[str]] = None, include_note_scope_refs: bool = False, ) -> List[dict]: - """Erzeugt und aggregiert alle Kanten für eine Note.""" + """Erzeugt und aggregiert alle Kanten für eine Note (WP-15b).""" edges: List[dict] = [] note_type = _get(chunks[0], "type") if chunks else "concept" - # 1) Struktur-Kanten + # 1) Struktur-Kanten (belongs_to, next/prev) for idx, ch in enumerate(chunks): cid = _get(ch, "chunk_id", "id") if not cid: continue @@ -55,21 +55,21 @@ def build_edges_for_note( if not cid: continue raw = _get(ch, "window") or _get(ch, "text") or "" - # Typed + # Typed & Candidate Pool (WP-15b Integration) typed, rem = extract_typed_relations(raw) for k, raw_t in typed: t, sec = parse_link_target(raw_t, note_id) if not t: continue + payload = { "chunk_id": cid, - # Variant=sec sorgt für eindeutige ID pro Abschnitt "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec), "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"] } if sec: payload["target_section"] = sec + edges.append(_edge(k, "chunk", cid, t, note_id, payload)) - # Semantic AI Candidates pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] for cand in pool: raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai") @@ -81,38 +81,38 @@ def build_edges_for_note( "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90) } if sec: payload["target_section"] = sec + edges.append(_edge(k, "chunk", cid, t, note_id, payload)) - # Callouts + # Callouts & Wikilinks call_pairs, rem2 = extract_callout_relations(rem) for k, raw_t in call_pairs: t, sec = parse_link_target(raw_t, note_id) if not t: continue + payload = { "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge", variant=sec), "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"] } if sec: payload["target_section"] = sec + edges.append(_edge(k, "chunk", cid, t, note_id, payload)) - # Wikilinks & Defaults refs = extract_wikilinks(rem2) for raw_r in refs: r, sec = parse_link_target(raw_r, note_id) if not r: continue - # Explicit Reference payload = { "chunk_id": cid, "ref_text": raw_r, "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink", variant=sec), "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"] } if sec: payload["target_section"] = sec + edges.append(_edge("references", "chunk", cid, r, note_id, payload)) - # Defaults (nur einmal pro Target, Section hier irrelevant für Typ-Logik, oder?) - # Wir erzeugen Defaults auch pro Section, um Konsistenz zu wahren. for rel in defaults: if rel != "references": def_payload = { @@ -141,13 +141,27 @@ def build_edges_for_note( "provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"] })) - # Deduplizierung: Wir nutzen jetzt die EDGE-ID als Schlüssel. - # Da die Edge-ID nun 'variant' (Section) enthält, bleiben unterschiedliche Sections erhalten. + # FIX: Semantische Deduplizierung + # Wir nutzen einen Key aus (Source, Target, Kind, Section), um Duplikate + # aus verschiedenen Regeln (z.B. callout vs. wikilink) zusammenzuführen. unique_map: Dict[str, dict] = {} + for e in edges: - eid = e["edge_id"] - # Bei Konflikt (gleiche ID = exakt gleiche Kante und Section) gewinnt die höhere Confidence - if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0): - unique_map[eid] = e + # Semantischer Schlüssel: Unabhängig von rule_id oder edge_id + src = e.get("source_id", "") + tgt = e.get("target_id", "") + kind = e.get("kind", "") + sec = e.get("target_section", "") + + sem_key = f"{src}->{tgt}:{kind}@{sec}" + + if sem_key not in unique_map: + unique_map[sem_key] = e + else: + # Konfliktlösung: Die Kante mit der höheren Confidence gewinnt + curr_conf = unique_map[sem_key].get("confidence", 0.0) + new_conf = e.get("confidence", 0.0) + if new_conf > curr_conf: + unique_map[sem_key] = e return list(unique_map.values()) \ No newline at end of file